├── local ├── files │ ├── cvlist │ │ ├── IND_cv_spk │ │ ├── KR_cv_spk │ │ ├── RU_cv_spk │ │ ├── CHN_cv_spk │ │ ├── JPN_cv_spk │ │ ├── PT_cv_spk │ │ ├── US_cv_spk │ │ └── UK_cv_spk │ ├── ar.dict │ └── asr.dict ├── tools │ ├── train_kenlm.sh │ ├── preprocess.py │ ├── apply_lexicon.py │ ├── parse_track1_jsons.py │ ├── dump_spk_yzl23.sh │ ├── word_frequency.py │ ├── dump.sh │ ├── data2json.sh │ └── merge_scp2json.py ├── prepare_LG.fst ├── track2_kaldi_gmm_train.sh ├── track1_espnet_transformer_train.sh ├── track2_espnet_transformer_train.sh ├── prepare_data.sh └── track2_kaldi_chain_train.sh ├── conf ├── espnet_decode.yaml ├── fbank.conf ├── espnet_lm.yaml ├── espnet_specaug.yaml ├── track1_accent_transformer.yaml ├── espnet_train.yaml └── xconfig ├── README.md ├── README_en.md ├── module ├── track1_accent_transformer.py └── track2_asr_transformer.py └── LICENSE /local/files/cvlist/IND_cv_spk: -------------------------------------------------------------------------------- 1 | IND-G00892 2 | IND-G01006 3 | IND-G01501 4 | IND-G0760 -------------------------------------------------------------------------------- /local/files/cvlist/KR_cv_spk: -------------------------------------------------------------------------------- 1 | KR-G00022 2 | KR-G00276 3 | KR-G10029 4 | KR-G10122 -------------------------------------------------------------------------------- /local/files/cvlist/RU_cv_spk: -------------------------------------------------------------------------------- 1 | RU-G00163 2 | RU-G00196 3 | RU-G00439 4 | RU-G10416 -------------------------------------------------------------------------------- /local/files/cvlist/CHN_cv_spk: -------------------------------------------------------------------------------- 1 | CHN-G00190 2 | CHN-G00992 3 | CHN-G61365 4 | CHN-G01372 -------------------------------------------------------------------------------- /local/files/cvlist/JPN_cv_spk: -------------------------------------------------------------------------------- 1 | JPN-G00040 2 | JPN-G00125 3 | JPN-G00354 4 | JPN-G20194 -------------------------------------------------------------------------------- /local/files/cvlist/PT_cv_spk: -------------------------------------------------------------------------------- 1 | PT-G00600 2 | PT-G00643 3 | PT-G00963 4 | PT-G10618 5 | PT-G20539 -------------------------------------------------------------------------------- /local/files/cvlist/US_cv_spk: -------------------------------------------------------------------------------- 1 | US-G00007 2 | US-G01459 3 | US-G10948 4 | US-G20537 5 | US-G20939 6 | US-G30201 -------------------------------------------------------------------------------- /local/files/ar.dict: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | -------------------------------------------------------------------------------- /local/files/cvlist/UK_cv_spk: -------------------------------------------------------------------------------- 1 | UK-G00025 2 | UK-G00808 3 | UK-G01337 4 | UK-G01807 5 | UK-G10261 6 | UK-G11032 7 | UK-G11739 8 | UK-G40517 -------------------------------------------------------------------------------- /conf/espnet_decode.yaml: -------------------------------------------------------------------------------- 1 | batchsize: 0 2 | beam-size: 10 3 | penalty: 0.0 4 | maxlenratio: 0.0 5 | minlenratio: 0.0 6 | ctc-weight: 0.3 7 | lm-weight: 0.3 8 | -------------------------------------------------------------------------------- /conf/fbank.conf: 
-------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --sample-frequency=16000 3 | --num-mel-bins=71 # 8kHz so we use 36 bins (@ 8 filters/octave to get closer to 40 filters/16Khz used by IBM) 4 | --allow_downsample=true 5 | -------------------------------------------------------------------------------- /conf/espnet_lm.yaml: -------------------------------------------------------------------------------- 1 | # rnnlm related 2 | layer: 2 3 | unit: 1024 4 | opt: sgd # or adam 5 | batchsize: 64 # batch size in LM training 6 | epoch: 30 # if the data size is large, we can reduce this 7 | patience: 3 8 | maxlen: 100 # if sentence length > lm_maxlen, lm_batchsize is automatically reduced 9 | -------------------------------------------------------------------------------- /conf/espnet_specaug.yaml: -------------------------------------------------------------------------------- 1 | process: 2 | # these three processes are a.k.a. SpecAugument 3 | # - type: "time_warp" 4 | # max_time_warp: 5 5 | # inplace: true 6 | # mode: "PIL" 7 | - type: "freq_mask" 8 | F: 20 9 | n_mask: 2 10 | inplace: true 11 | replace_with_zero: false 12 | - type: "time_mask" 13 | T: 40 14 | n_mask: 2 15 | inplace: true 16 | replace_with_zero: false 17 | -------------------------------------------------------------------------------- /local/tools/train_kenlm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | order=3 7 | prune="0 1 1" 8 | mem_rate=40% 9 | output_dir= 10 | arpa_name= 11 | fallback="0.5 1 1.5" 12 | 13 | input=$1 14 | 15 | cat $input | lmplz \ 16 | -o $order \ 17 | -S $mem_rate \ 18 | --prune $prune \ 19 | --discount_fallback $fallback \ 20 | --arpa $2 21 | 22 | echo "local/train_kenlm.sh succeeded" 23 | exit 0; 24 | -------------------------------------------------------------------------------- /local/tools/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 2 | # Apache 2.0 3 | 4 | import sys 5 | 6 | fin=open(sys.argv[1], 'r') 7 | fout_text = open(sys.argv[2], 'w') 8 | fout_utt2spk = open(sys.argv[3], 'w') 9 | 10 | for line in fin.readlines(): 11 | uttid, path = line.strip('\n').split('\t') 12 | text_path = path.replace('.wav', '.txt') 13 | text_ori = open(text_path, 'r').readlines()[0].strip('\n') 14 | feild = path.split('/') 15 | accid = feild[-3] 16 | spkid = accid + '-' + feild[-2] 17 | fout_utt2spk.write(uttid + '\t' + spkid + '\n') 18 | fout_text.write(text_ori + '\n') -------------------------------------------------------------------------------- /conf/track1_accent_transformer.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | elayers: 6 4 | eunits: 2048 5 | # attention related 6 | adim: 256 7 | aheads: 4 8 | 9 | # label smoothing 10 | lsm-weight: 0.0 11 | 12 | # minibatch related 13 | batch-size: 32 14 | maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced 15 | maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced 16 | 17 | # optimization related 18 | sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled 
for 'other' epochs 19 | opt: noam 20 | accum-grad: 2 21 | grad-clip: 5 22 | patience: 0 23 | epochs: 40 24 | dropout-rate: 0.1 25 | 26 | # transformer specific setting 27 | backend: pytorch 28 | model-module: "espnet.nets.pytorch_backend.track1_accent_transformer:E2E" 29 | transformer-input-layer: conv2d # encoder architecture type 30 | transformer-lr: 5.0 31 | transformer-warmup-steps: 25000 32 | transformer-attn-dropout-rate: 0.0 33 | transformer-length-normalized-loss: false 34 | transformer-init: pytorch 35 | 36 | -------------------------------------------------------------------------------- /local/prepare_LG.fst: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | stage=1 7 | 8 | . ./cmd.sh 9 | . ./path.sh 10 | . ./utils/parse_options.sh 11 | 12 | if [ $# -ne 2 ]; then 13 | echo "prepare_all.sh " 14 | echo " e.g prepare_all.sh data data/train/trans the data/ contains the dir of data and mfcc." 15 | exit 1; 16 | fi 17 | 18 | data_set=$1 19 | train_text=$2 20 | 21 | # L 22 | if [ $stage -le 1 ]; then 23 | ./utils/prepare_lang.sh --position-dependent-phones false \ 24 | $data_set/local/dict "" $data_set/local/lang $data_set/lang || exit 1; 25 | fi 26 | 27 | # arpa LM 28 | if [ $stage -le 2 ]; then 29 | local/train_kenlm.sh $train_text \ 30 | $data_set/local/lm.arpa || exit 1; 31 | fi 32 | 33 | # G compilation, check LG composition 34 | if [ $stage -le 3 ]; then 35 | ./local/format_lm.sh $data_set/lang $data_set/local/lm.arpa \ 36 | $data_set/local/dict/lexicon.txt $data_set/lang_test || exit 1; 37 | fi 38 | 39 | echo "local/prepare_lang.sh succeeded" 40 | exit 0; 41 | 42 | -------------------------------------------------------------------------------- /conf/espnet_train.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | elayers: 12 4 | eunits: 2048 5 | # decoder related 6 | dlayers: 6 7 | dunits: 2048 8 | # attention related 9 | adim: 256 10 | aheads: 4 11 | 12 | # hybrid CTC/attention 13 | mtlalpha: 0.3 14 | 15 | # label smoothing 16 | lsm-weight: 0.1 17 | 18 | # minibatch related 19 | batch-size: 32 20 | maxlen-in: 450 # if input length > maxlen-in, batchsize is automatically reduced 21 | maxlen-out: 18 # if output length > maxlen-out, batchsize is automatically reduced 22 | 23 | # batch-count: frame 24 | # batch-frames-in 3200 25 | # batch-frames-out 100 26 | # batch-frames-inout 900 27 | 28 | # optimization related 29 | sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 30 | opt: noam 31 | accum-grad: 2 32 | grad-clip: 5 33 | patience: 0 34 | epochs: 50 35 | dropout-rate: 0.1 36 | 37 | # transformer specific setting 38 | backend: pytorch 39 | model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E" 40 | transformer-input-layer: conv2d # encoder architecture type 41 | transformer-lr: 1.0 42 | transformer-warmup-steps: 25000 43 | transformer-attn-dropout-rate: 0.0 44 | transformer-length-normalized-loss: false 45 | transformer-init: pytorch 46 | -------------------------------------------------------------------------------- /local/tools/apply_lexicon.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 2 | # Apache 2.0 3 | 
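# Usage sketch (mirroring the call made from prepare_data.sh):
#   python apply_lexicon.py LEXICON INPUT_TEXT OUTPUT_TOKENS UNK_SYMBOL WARNING_FILE UNITS_FILE
# Every word in INPUT_TEXT is mapped to its token sequence from LEXICON; words missing from the
# lexicon are written to WARNING_FILE and replaced by UNK_SYMBOL, and the sorted token inventory
# is dumped to UNITS_FILE with 1-based indices.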
4 | import codecs 5 | import sys 6 | 7 | dict = sys.argv[1] 8 | input = sys.argv[2] 9 | output = sys.argv[3] 10 | unk = sys.argv[4] 11 | warning = sys.argv[5] 12 | unit_name = sys.argv[6] 13 | 14 | map = {} 15 | units = [] 16 | 17 | with codecs.open(dict, 'r', encoding='utf-8') as f1: 18 | for line in f1: 19 | word = line.split('\t')[0] 20 | tokens = line.rstrip('\n').split('\t')[1] 21 | map[word] = tokens 22 | 23 | with codecs.open(input, 'r', encoding='utf-8') as f2: 24 | with codecs.open(output, 'w', encoding='utf-8') as f3 ,codecs.open(warning, 'w', encoding='utf-8') as f4: 25 | for line in f2: 26 | if len(line.split('\t')) > 1: 27 | head = line.split('\t')[0] 28 | sentence = line.rstrip('\n').split('\t')[1].split(' ') 29 | else: 30 | head = line.split(' ')[0] 31 | sentence = line.rstrip('\n').split(' ')[1:] 32 | result = head + '\t' 33 | for word in sentence: 34 | if len(word): 35 | if word in map: 36 | result += map[word] + ' ' 37 | for unit in map[word].split(' '): 38 | if unit not in units: 39 | units.append(unit) 40 | else: 41 | f4.write(word + '\n') 42 | result += unk + ' ' 43 | f3.write(result.rstrip(' ').lstrip(' ') + '\n') 44 | 45 | list.sort(units) 46 | units.insert(0, '') 47 | with codecs.open(unit_name, 'w', encoding='utf-8') as f5: 48 | for i in range(len(units)): 49 | f5.write(str(units[i]) + ' ' + str(i+1)+'\n') 50 | -------------------------------------------------------------------------------- /local/tools/parse_track1_jsons.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: luyizhou4 3 | # @Date: 2019-10-08 15:36:36 4 | # @Function: 5 | # @Last Modified time: 2020-09-13 19:17:44 6 | 7 | import sys 8 | import json 9 | 10 | def parse_result(result_label): 11 | ACCENT_LIST = ["US", "UK", "CHN", "IND", "JPN", "KR", "PT", "RU"] 12 | ACCENT_NUM = len(ACCENT_LIST) 13 | utt_nums = [0] * ACCENT_NUM 14 | correct_nums = [0] * ACCENT_NUM 15 | 16 | with open(result_label, 'r') as fd: 17 | for line in fd.readlines(): 18 | if not line.strip(): 19 | continue 20 | uttid, hyp = line.split()[:] 21 | hyp = int(hyp) 22 | ref = ACCENT_LIST.index(uttid.split('-')[0]) 23 | utt_nums[ref] += 1 24 | 25 | if ref == hyp: 26 | correct_nums[ref] += 1 27 | 28 | acc_per_accent = [100.0 * correct_nums[i] / utt_nums[i] for i in range(ACCENT_NUM)] 29 | for i in range(ACCENT_NUM): 30 | print('{} Accent Accuracy: {:.1f}'.format(ACCENT_LIST[i], acc_per_accent[i])) 31 | print('Average ACC: {} / {} = {:.1f}'.format(sum(correct_nums), sum(utt_nums), 100.0 * sum(correct_nums) / sum(utt_nums))) 32 | 33 | def main(): 34 | json_file = sys.argv[1] 35 | result_label = sys.argv[2] 36 | 37 | with open(json_file, 'r') as fd, open(result_label, 'w+') as w_fd: 38 | data = json.load(fd) 39 | uttid_list = list (data["utts"].keys()) 40 | uttid_list.sort() 41 | print('There are totally %s utts'%(len(uttid_list))) 42 | for uttid in uttid_list: 43 | rec_tokenid_list = data['utts'][uttid]["output"][0]["rec_tokenid"].split() 44 | rec_tokenid = ' '.join(rec_tokenid_list) 45 | w_fd.write(uttid + ' ' + rec_tokenid + '\n') 46 | 47 | parse_result(result_label) 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /local/track2_kaldi_gmm_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | set -e 7 | 8 | 
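# Example invocation (a sketch; the kdata/ and kaldi-exp/ directories below are this script's defaults):
#   ./local/track2_kaldi_gmm_train.sh --stage 2 --nj 50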
nj=50 9 | stage=2 10 | 11 | . ./cmd.sh 12 | [ -f ./path.sh ] && . ./path.sh; 13 | . ./utils/parse_options.sh 14 | 15 | data=kdata 16 | exp=kaldi-exp 17 | 18 | test_sets="CHN IND KR PT RU UK US JPN" 19 | 20 | # mono 21 | if [ $stage -le 2 ]; then 22 | # training 23 | steps/train_mono.sh --cmd "$decode_cmd" --nj $nj \ 24 | $data/train $data/lang $exp/mono || exit 1; 25 | 26 | # alignment 27 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 28 | $data/train $data/lang $exp/mono $exp/mono_ali || exit 1; 29 | fi 30 | 31 | # tri1 32 | if [ $stage -le 3 ]; then 33 | # training 34 | steps/train_deltas.sh --cmd "$decode_cmd" \ 35 | 4000 32000 $data/train $data/lang $exp/mono_ali $exp/tri1 || exit 1; 36 | 37 | # alignment 38 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 39 | $data/train $data/lang $exp/tri1 $exp/tri1_ali || exit 1; 40 | fi 41 | 42 | # tri2 43 | if [ $stage -le 4 ]; then 44 | # training 45 | steps/train_deltas.sh --cmd "$decode_cmd" \ 46 | 7000 56000 $data/train $data/lang $exp/tri1_ali $exp/tri2 || exit 1; 47 | 48 | # alignment 49 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 50 | $data/train $data/lang $exp/tri2 $exp/tri2_ali || exit 1; 51 | fi 52 | 53 | # tri3 54 | if [ $stage -le 5 ]; then 55 | # training [LDA+MLLT] 56 | steps/train_lda_mllt.sh --cmd "$decode_cmd" \ 57 | 10000 80000 $data/train $data/lang $exp/tri2_ali $exp/tri3 || exit 1; 58 | 59 | # decoding 60 | utils/mkgraph.sh $data/lang_test $exp/tri3 $exp/tri3/graph || exit 1; 61 | 62 | for test_set in $test_sets;do 63 | steps/decode.sh --cmd "$decode_cmd" --nj 30 --config conf/decode.conf \ 64 | $exp/tri3/graph $data/cv/$test_set $exp/tri3/decode_test_$test_set 65 | done 66 | 67 | # alignment 68 | steps/align_si.sh --cmd "$decode_cmd" --nj $nj \ 69 | $data/train $data/lang $exp/tri3 $exp/tri3_ali || exit 1; 70 | fi 71 | 72 | echo "local/track2_kaldi_gmm_train.sh succeeded" 73 | exit 0; 74 | 75 | -------------------------------------------------------------------------------- /local/tools/dump_spk_yzl23.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagoya University (Tomoki Hayashi) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | echo "$0 $*" # Print the command line for logging 7 | . ./path.sh 8 | 9 | cmd=run.pl 10 | nj=1 11 | verbose=0 12 | compress=true 13 | write_utt2num_frames=true 14 | filetype='mat' # mat or hdf5 15 | 16 | . utils/parse_options.sh 17 | 18 | scp=$1 19 | cvmnark=$2 20 | logdir=$3 21 | dumpdir=$4 22 | utt2spk=$5 23 | 24 | if [ $# != 5 ]; then 25 | echo "Usage: $0 " 26 | exit 1; 27 | fi 28 | 29 | set -euo pipefail 30 | 31 | mkdir -p ${logdir} 32 | mkdir -p ${dumpdir} 33 | 34 | dumpdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${dumpdir} ${PWD}) 35 | 36 | for n in $(seq ${nj}); do 37 | # the next command does nothing unless $dumpdir/storage/ exists, see 38 | # utils/create_data_link.pl for more info. 
39 | utils/create_data_link.pl ${dumpdir}/feats.${n}.ark 40 | done 41 | 42 | if ${write_utt2num_frames}; then 43 | write_num_frames_opt="--write-num-frames=ark,t:$dumpdir/utt2num_frames.JOB" 44 | else 45 | write_num_frames_opt= 46 | fi 47 | 48 | # split scp file 49 | split_scps="" 50 | for n in $(seq ${nj}); do 51 | split_scps="$split_scps $logdir/feats.$n.scp" 52 | done 53 | 54 | utils/split_scp.pl ${scp} ${split_scps} || exit 1; 55 | 56 | # dump features 57 | ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \ 58 | apply-cmvn --norm-vars=true --utt2spk=ark:${utt2spk} scp:${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \ 59 | copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \ 60 | --compress=${compress} --compression-method=2 ${write_num_frames_opt} \ 61 | ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \ 62 | || exit 1 63 | 64 | # concatenate scp files 65 | for n in $(seq ${nj}); do 66 | cat ${dumpdir}/feats.${n}.scp || exit 1; 67 | done > ${dumpdir}/feats.scp || exit 1 68 | 69 | if ${write_utt2num_frames}; then 70 | for n in $(seq ${nj}); do 71 | cat ${dumpdir}/utt2num_frames.${n} || exit 1; 72 | done > ${dumpdir}/utt2num_frames || exit 1 73 | rm ${dumpdir}/utt2num_frames.* 2>/dev/null 74 | fi 75 | 76 | # Write the filetype, this will be used for data2json.sh 77 | echo ${filetype} > ${dumpdir}/filetype 78 | 79 | 80 | # remove temp scps 81 | rm ${logdir}/feats.*.scp 2>/dev/null 82 | if [ ${verbose} -eq 1 ]; then 83 | echo "Succeeded dumping features for training" 84 | fi 85 | -------------------------------------------------------------------------------- /local/tools/word_frequency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # 6 | # Copyright ASLP@NPU. All Rights Reserved 7 | # 8 | # Licensed under the Apache License, Veresion 2.0(the "License"); 9 | # You may not use the file except in compliance with the Licese. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/license/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing,software 15 | # distributed under the License is distributed on an "AS IS" BASIS 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
19 | # 20 | # Author shixian(npu) 21 | # Date 2019/10/09 14:25:50 22 | # 23 | ###################################################################### 24 | import codecs 25 | import sys 26 | import operator 27 | 28 | if __name__ == '__main__': 29 | filename = sys.argv[1] 30 | top_nums = int(sys.argv[2]) 31 | prefix = sys.argv[3] 32 | dict_cn = {} 33 | dict_en = {} 34 | f2 = codecs.open("enwords.txt", "w", encoding='utf-8') 35 | with codecs.open(filename, "r", encoding='utf-8') as f: 36 | for line in f.readlines(): 37 | if len(line.split('\t')) > 1: 38 | line = line.split('\t')[1] 39 | start = 0 40 | else: 41 | start = 1 42 | for char in line.rstrip('\n').split(' ')[start:]: 43 | if char >= u'\u4e00' and char <= u'\u9fa5': 44 | if char not in dict_cn: 45 | dict_cn[char] = 1 46 | else: 47 | dict_cn[char] += 1 48 | else: 49 | f2.write(char + ' ') 50 | if char not in dict_en: 51 | dict_en[char] = 1 52 | else: 53 | dict_en[char] += 1 54 | f2.write('\n') 55 | dict_cn = sorted(dict_cn.items(),key=operator.itemgetter(1),reverse=True) 56 | dict_en = sorted(dict_en.items(),key=operator.itemgetter(1),reverse=True) 57 | fout_cn = codecs.open(prefix + '.cnwf', 'w', encoding='utf-8') 58 | fout_en = codecs.open(prefix + '.enwf', 'w', encoding='utf-8') 59 | if len(dict_cn): 60 | if top_nums == 0: 61 | for i in range(len(dict_cn)): 62 | fout_cn.write(dict_cn[i][0] + ' ' + str(dict_cn[i][1]) + '\n') 63 | else: 64 | for i in range(top_nums): 65 | fout_cn.write(dict_cn[i][0] + ' ' + str(dict_cn[i][1]) + '\n') 66 | if len(dict_en): 67 | for i in range(len(dict_en)): 68 | fout_en.write(dict_en[i][0] + ' ' + str(dict_en[i][1]) + '\n') 69 | fout_cn.close() 70 | fout_en.close() 71 | f2.close() 72 | -------------------------------------------------------------------------------- /local/tools/dump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagoya University (Tomoki Hayashi) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | echo "$0 $*" # Print the command line for logging 7 | . ./path.sh 8 | 9 | cmd=run.pl 10 | do_delta=false 11 | nj=1 12 | verbose=0 13 | compress=true 14 | write_utt2num_frames=true 15 | filetype='mat' # mat or hdf5 16 | help_message="Usage: $0 " 17 | 18 | . utils/parse_options.sh 19 | 20 | scp=$1 21 | cvmnark=$2 22 | logdir=$3 23 | dumpdir=$4 24 | 25 | if [ $# != 4 ]; then 26 | echo "${help_message}" 27 | exit 1; 28 | fi 29 | 30 | set -euo pipefail 31 | 32 | mkdir -p ${logdir} 33 | mkdir -p ${dumpdir} 34 | 35 | dumpdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${dumpdir} ${PWD}) 36 | 37 | for n in $(seq ${nj}); do 38 | # the next command does nothing unless $dumpdir/storage/ exists, see 39 | # utils/create_data_link.pl for more info. 
40 | utils/create_data_link.pl ${dumpdir}/feats.${n}.ark 41 | done 42 | 43 | if ${write_utt2num_frames}; then 44 | write_num_frames_opt="--write-num-frames=ark,t:$dumpdir/utt2num_frames.JOB" 45 | else 46 | write_num_frames_opt= 47 | fi 48 | 49 | # split scp file 50 | split_scps="" 51 | for n in $(seq ${nj}); do 52 | split_scps="$split_scps $logdir/feats.$n.scp" 53 | done 54 | 55 | utils/split_scp.pl ${scp} ${split_scps} || exit 1; 56 | 57 | # dump features 58 | if ${do_delta}; then 59 | ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \ 60 | apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \ 61 | add-deltas ark:- ark:- \| \ 62 | copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \ 63 | --compress=${compress} --compression-method=2 ${write_num_frames_opt} \ 64 | ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \ 65 | || exit 1 66 | else 67 | ${cmd} JOB=1:${nj} ${logdir}/dump_feature.JOB.log \ 68 | apply-cmvn --norm-vars=true ${cvmnark} scp:${logdir}/feats.JOB.scp ark:- \| \ 69 | copy-feats.py --verbose ${verbose} --out-filetype ${filetype} \ 70 | --compress=${compress} --compression-method=2 ${write_num_frames_opt} \ 71 | ark:- ark,scp:${dumpdir}/feats.JOB.ark,${dumpdir}/feats.JOB.scp \ 72 | || exit 1 73 | fi 74 | 75 | # concatenate scp files 76 | for n in $(seq ${nj}); do 77 | cat ${dumpdir}/feats.${n}.scp || exit 1; 78 | done > ${dumpdir}/feats.scp || exit 1 79 | 80 | if ${write_utt2num_frames}; then 81 | for n in $(seq ${nj}); do 82 | cat ${dumpdir}/utt2num_frames.${n} || exit 1; 83 | done > ${dumpdir}/utt2num_frames || exit 1 84 | rm ${dumpdir}/utt2num_frames.* 2>/dev/null 85 | fi 86 | 87 | # Write the filetype, this will be used for data2json.sh 88 | echo ${filetype} > ${dumpdir}/filetype 89 | 90 | 91 | # remove temp scps 92 | # rm ${logdir}/feats.*.scp 2>/dev/null 93 | if [ ${verbose} -eq 1 ]; then 94 | echo "Succeeded dumping features for training" 95 | fi 96 | -------------------------------------------------------------------------------- /conf/xconfig: -------------------------------------------------------------------------------- 1 | # This file was created by the command: 2 | # steps/nnet3/xconfig_to_configs.py --xconfig-file exp2/chain/tdnn_a_all_sp/configs/network.xconfig --config-dir exp2/chain/tdnn_a_all_sp/configs/ 3 | # It is a copy of the source from which the config files in # this directory were generated. 
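# Orientation note: this is the CNN + TDNN-F chain topology referred to in the README's Track2
# baseline -- one convolutional front-end followed by 18 factored TDNN (tdnnf) layers, a 512-dim
# prefinal linear layer, and separate chain / cross-entropy output branches.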
4 | 5 | input dim=71 name=input 6 | conv-relu-batchnorm-layer name=cnn1 l2-regularize=0.005 height-in=71 height-out=71 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=32 7 | linear-component name=cnn2 dim=284 orthonormal-constraint=1.0 8 | # the first splicing is moved before the lda layer, so no splicing here 9 | relu-batchnorm-dropout-layer name=tdnn1 l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true dim=1280 10 | tdnnf-layer name=tdnnf2 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=1 11 | tdnnf-layer name=tdnnf3 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=1 12 | tdnnf-layer name=tdnnf4 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=1 13 | tdnnf-layer name=tdnnf5 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=0 14 | tdnnf-layer name=tdnnf6 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 15 | tdnnf-layer name=tdnnf7 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 16 | tdnnf-layer name=tdnnf8 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 17 | tdnnf-layer name=tdnnf9 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 18 | tdnnf-layer name=tdnnf10 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 19 | tdnnf-layer name=tdnnf11 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 20 | tdnnf-layer name=tdnnf12 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 21 | tdnnf-layer name=tdnnf13 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 22 | tdnnf-layer name=tdnnf14 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 23 | tdnnf-layer name=tdnnf15 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 24 | tdnnf-layer name=tdnnf16 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 25 | tdnnf-layer name=tdnnf17 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 26 | tdnnf-layer name=tdnnf18 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 27 | tdnnf-layer name=tdnnf19 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 dim=1280 bottleneck-dim=256 time-stride=3 28 | linear-component name=prefinal-l dim=512 orthonormal-constraint=1.0 29 | 30 | ## adding the layers for chain branch 31 | prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.03 small-dim=512 big-dim=1280 32 | output-layer name=output include-log-softmax=false dim=2170 l2-regularize=0.015 33 | 34 | # adding the layers for xent branch 35 | prefinal-layer name=prefinal-xent input=prefinal-l l2-regularize=0.03 small-dim=512 big-dim=1280 36 | output-layer name=output-xent dim=2170 learning-rate-factor=5.0 l2-regularize=0.015 37 | -------------------------------------------------------------------------------- /local/track1_espnet_transformer_train.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Speechlab @ SJTU (Author: Yizhou Lu) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh || exit 1; 7 | . ./cmd.sh || exit 1; 8 | 9 | # general configuration 10 | backend=pytorch 11 | stage=1 12 | stop_stage=2 13 | ngpu=4 # number of gpus ("0" uses cpu, otherwise use gpu) 14 | debugmode=1 15 | N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. 16 | verbose=0 # verbose option 17 | resume= # Resume the training from snapshot 18 | log=100 19 | 20 | preprocess_config=conf/specaug.yaml 21 | train_config=conf/track1_accent_transformer.yaml 22 | 23 | # others 24 | accum_grad=2 25 | n_iter_processes=2 26 | lsm_weight=0.0 27 | epochs=40 28 | elayers=12 29 | batch_size=20 30 | pretrained_model= 31 | 32 | # decoding parameter 33 | recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best' 34 | 35 | . utils/parse_options.sh || exit 1; 36 | 37 | # Set bash to 'debug' mode, it will exit on : 38 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 39 | set -e 40 | set -u 41 | set -o pipefail 42 | 43 | train_json=kdata/train/ar.json 44 | valid_json=kdata/cv_all/ar.json 45 | 46 | expdir=exp/track1_accent_classification_transformer_elayers${elayers} 47 | mkdir -p ${expdir} 48 | 49 | 50 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 51 | echo "stage 1: Network Training" 52 | ${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \ 53 | asr_train.py \ 54 | --config ${train_config} \ 55 | --preprocess-conf ${preprocess_config} \ 56 | --ngpu ${ngpu} \ 57 | --backend ${backend} \ 58 | --outdir ${expdir}/results \ 59 | --debugmode ${debugmode} \ 60 | --debugdir ${expdir} \ 61 | --minibatches ${N} \ 62 | --verbose ${verbose} \ 63 | --resume ${resume} \ 64 | --report-interval-iters ${log} \ 65 | --accum-grad ${accum_grad} \ 66 | --n-iter-processes ${n_iter_processes} \ 67 | --elayers ${elayers} \ 68 | --lsm-weight ${lsm_weight} \ 69 | --epochs ${epochs} \ 70 | --batch-size ${batch_size} \ 71 | ${pretrained_model:+--pretrained-model $pretrained_model} \ 72 | --train-json ${train_json} \ 73 | --valid-json ${valid_json} 74 | fi 75 | 76 | decode_dir=decode_track1 77 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 78 | echo "stage 2: Decoding" 79 | nj=10 80 | 81 | # split data 82 | dev_root=kdata/cv_all 83 | splitjson.py --parts ${nj} ${dev_root}/ar.json 84 | #### use CPU for decoding 85 | ngpu=0 86 | 87 | slurm.pl JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \ 88 | asr_recog.py \ 89 | --ngpu ${ngpu} \ 90 | --backend ${backend} \ 91 | --batchsize 0 \ 92 | --recog-json ${dev_root}/split${nj}utt/ar.JOB.json \ 93 | --result-label ${expdir}/${decode_dir}/ar.JOB.json \ 94 | --model ${expdir}/results/${recog_model} 95 | 96 | concatjson.py ${expdir}/${decode_dir}/ar.*.json > ${expdir}/${decode_dir}/ar.json 97 | echo "Decoding finished" 98 | fi 99 | 100 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 101 | echo "stage 3: Analyze decoding results" 102 | python ./local/tools/parse_track1_jsons.py ${expdir}/${decode_dir}/ar.json ${expdir}/${decode_dir}/result.txt 103 | 104 | fi 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AESRC2020 2 | 3 | 4 | #### 介绍 5 | 6 | Interspeech 2020 
口音英语识别挑战赛数据准备相关脚本、训练流程代码与基线实验结果。 7 | 8 | Data preparation scripts and training pipeline for the Interspeech 2020 Accented English Speech Recognition Challenge (AESRC). 9 | 10 | #### 依赖环境 11 | 12 | 1. 安装Kaldi (数据准备有关功能脚本、Track2传统模型训练) 13 | [Github链接](https://github.com/kaldi-asr/kaldi) 14 | 2. 安装ESPnet(Track1 E2E AR Model训练、Track2 E2E ASR Transformer训练) 15 | [Github链接](https://github.com/espnet/espnet) 16 | 3. (可选)安装Google SentencePiece (Track2 E2E ASR 词表缩减、建模单元构建) 17 | [Github链接](https://github.com/google/sentencepiece) 18 | 4. (可选)安装KenLM (N-gram语言模型训练) 19 | [Github链接](http://https://github.com/kpu/kenlm) 20 | 21 | #### 使用说明 22 | 23 | **数据准备 Data Preparation** 24 | 25 | 1. 下载评测数据 26 | 2. 准备数据,划分开发集,特征准备以及训练BPE模型 `./local/prepare_data.sh` 27 | 28 | **口音识别赛道 AR Track** 29 | 30 | 训练Track1 ESPnet AR模型 `./local/track1_espnet_transformer_train.sh` 31 | 32 | **语音识别赛道 ASR Track** 33 | 34 | 1. 训练Track2 Kaldi GMM对齐模型 `./local/track2_kaldi_gmm_train.sh` 35 | 2. 生成Lattice,决策树,训练Track2 Kaldi Chain Model `./local/track2_kaldi_chain_train.sh` 36 | 3. 训练Track2 ESPnet Transformer模型(Track2 ESPnet RNN语言模型) `./local/track2_espnet_transformer_train.sh` 37 | 38 | **注意** 39 | 1. 官方不提供Kaldi模型所需的英文的发音词典 40 | 2. 训练脚本中不包括数据扩充、添加Librispeech数据等,参赛者可按需添加 41 | 3. 正确安装并激活Kaldi与ESPnet的环境之后才能运行相关脚本 42 | 4. ASR Track中Baseline提供了多种数据的组合、Librispeech全量数据预训练等试验结果 43 | 5. 参赛者应严格按照评测中关于数据使用的相关规则训练模型,以确保结果的公平可比性 44 | 45 | #### 基线实验结果 46 | 47 | **Track1基线实验结果** 48 | 49 | | Model | RU | KR | US | PT | JPN | UK | CHN | IND | AVE | 50 | | -------- | -- |---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 51 | | Transformer-3L | 30.0 | 45.0 | 45.7 | 57.2 | 48.5 | 70.0 | 56.2 | 83.5 | 54.1 | 52 | | Transformer-6L | 34.0 | 43.7 | 30.6 | 65.7 | 44.0 | 74.5 | 50.9 | 75.2 | 52.2 | 53 | | Transformer-12L | 49.6 | 26.0 | 21.2 | 51.8 | 42.7 | 85.0 | 38.2 | 66.1 | 47.8 | 54 | | + ASR-init | 75.7 | 55.6 | 60.2 | 85.5 | 73.2 | 93.9 | 67.0 | 97.0 | 76.1 | 55 | 56 | Transformer-3L、Transformer-6L、Transformer-12L均使用`./local/track1_espnet_transformer_train.sh`训练(elayers分别为3、6、12),ASR-init实验使用Track2中Joint CTC/Attention模型进行初始化 57 | 58 | *在cv集的结果上发现了某个语种的acc与说话人强相关的现象,由于cv集说话人较少,所以上述结果的绝对数值并不具备统计意义,测试集将包含更多的说话人 59 | 60 | **Track2基线实验结果** 61 | 62 | Kaldi Hybrid Chain Model: CNN + 18 TDNN 63 | *基于内部的非开源英文发音词典 64 | *随后会公布基于CMU词典的结果 65 | 66 | ESPnet Transformer Model: 12 Encoder + 6 Decoder (simple self-attention, CTC joint training used, 1k sub-word BPE) 67 | 68 | 详细超参数见`./local/files/conf/`目录中模型配置与相关脚本中的设置 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 |
WER (%) on cv set:

| Data | Decode Related | RU | KR | US | PT | JPN | UK | CHN | IND | AVE |
| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| **Kaldi** | | | | | | | | | | |
| Accent160 | - | 6.67 | 11.46 | 15.95 | 10.27 | 9.78 | 16.88 | 20.97 | 17.48 | 13.68 |
| Libri960 ~ Accent160 | - | 6.61 | 10.95 | 15.33 | 9.79 | 9.75 | 16.03 | 19.68 | 16.93 | 13.13 |
| Accent160 + Libri160 | - | 6.95 | 11.76 | 13.05 | 9.96 | 10.15 | 14.21 | 20.76 | 18.26 | 13.14 |
| **ESPnet** | | | | | | | | | | |
| Accent160 | +0.3RNNLM | 5.26 | 7.69 | 9.96 | 7.45 | 6.79 | 10.06 | 11.77 | 10.05 | 8.63 |
| Libri960 ~ Accent160 | +0.3RNNLM | 4.6 | 6.4 | 7.42 | 5.9 | 5.71 | 7.64 | 9.87 | 7.85 | 6.92 |
| Accent160 + Libri160 | - | 5.35 | 9.07 | 8.52 | 7.13 | 7.29 | 8.6 | 12.03 | 9.05 | 8.38 |
| Accent160 + Libri160 | +0.3RNNLM | 4.68 | 7.59 | 7.7 | 6.42 | 6.37 | 7.76 | 10.88 | 8.41 | 7.48 |
| Accent160 + Libri160 | +0.3RNNLM+0.3CTC | 4.76 | 7.81 | 7.71 | 6.36 | 6.4 | 7.23 | 10.77 | 8.01 | 7.38 |
194 | * Data A ~ Data B指使用Data B fine-tune Data A训练的模型 195 | -------------------------------------------------------------------------------- /local/track2_espnet_transformer_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh || exit 1; 7 | . ./cmd.sh || exit 1; 8 | 9 | # general configuration 10 | backend=pytorch 11 | stage=1 # start from 0 if you need to start from data preparation 12 | stop_stage=1 13 | ngpu=4 # number of gpus ("0" uses cpu, otherwise use gpu) 14 | debugmode=1 15 | dumpdir=dump # directory to dump full features 16 | N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. 17 | verbose=0 # verbose option 18 | resume= # Resume the training from snapshot 19 | 20 | # feature configuration 21 | do_delta=false 22 | 23 | train_config=conf/espnet_train.conf 24 | lm_config=conf/espnet_lm.yaml 25 | decode_config=conf/espnet_decode.yaml 26 | preprocess_config=conf/espnet_specaug.yaml 27 | 28 | # rnnlm related 29 | lm_resume= # specify a snapshot file to resume LM training 30 | lmtag=0 # tag for managing LMs 31 | 32 | # decoding parameter 33 | recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best' 34 | n_average=5 35 | 36 | # exp tag 37 | tag="base" # tag for managing experiments. 38 | 39 | . utils/parse_options.sh || exit 1; 40 | 41 | # Set bash to 'debug' mode, it will exit on : 42 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 43 | set -e 44 | set -u 45 | set -o pipefail 46 | 47 | data=kdata 48 | exp=exp-espnet 49 | 50 | train_set=train 51 | train_dev=cv_all 52 | #recog_set="dev test" 53 | recog_set="cv/UK cv/US cv/CHN cv/JPN cv/KR cv/RU cv/IND cv/PT" 54 | 55 | 56 | lexi=$data/lang/lexicon.txt 57 | dict=$data/lang/units.txt 58 | echo "dictionary: ${dict}" 59 | 60 | # you can skip this and remove --rnnlm option in the recognition (stage 5) 61 | if [ -z ${lmtag} ]; then 62 | lmtag=$(basename ${lm_config%.*}) 63 | fi 64 | lmexpname=train_rnnlm_${backend}_${lmtag} 65 | lmexpdir=$exp/${lmexpname} 66 | mkdir -p ${lmexpdir} 67 | 68 | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 69 | echo "stage 1: LM Preparation" 70 | lmdatadir=$data/local/lm_train 71 | mkdir -p ${lmdatadir} 72 | text2token.py -s 1 -n 1 $data/train/text | cut -f 2- -d" " \ 73 | > ${lmdatadir}/train.txt 74 | text2token.py -s 1 -n 1 $data/${train_dev}/text | cut -f 2- -d" " \ 75 | > ${lmdatadir}/valid.txt 76 | 77 | ${cuda_cmd} --gpu 1 ${lmexpdir}/train.log \ 78 | lm_train.py \ 79 | --config ${lm_config} \ 80 | --ngpu $ngpu \ 81 | --backend ${backend} \ 82 | --batchsize 1000 \ 83 | --verbose 1 \ 84 | --outdir ${lmexpdir} \ 85 | --tensorboard-dir tensorboard/${lmexpname} \ 86 | --train-label ${lmdatadir}/train.txt \ 87 | --valid-label ${lmdatadir}/valid.txt \ 88 | --resume ${lm_resume} \ 89 | --dict ${dict} 90 | fi 91 | 92 | if [ -z ${tag} ]; then 93 | expname=${train_set}_${backend}_$(basename ${train_config%.*}) 94 | if ${do_delta}; then 95 | expname=${expname}_delta 96 | fi 97 | else 98 | expname=${train_set}_${backend}_${tag} 99 | fi 100 | expdir=$exp/${expname} 101 | mkdir -p ${expdir} 102 | 103 | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 104 | echo "stage 2: Network Training" 105 | ${cuda_cmd} --gpu $ngpu ${expdir}/train.log \ 106 | asr_train.py \ 107 | --config 
${train_config} \ 108 | --ngpu $ngpu \ 109 | --backend ${backend} \ 110 | --preprocess-conf $preprocess_config \ 111 | --outdir ${expdir}/results \ 112 | --tensorboard-dir tensorboard/${expname} \ 113 | --debugmode ${debugmode} \ 114 | --dict ${dict} \ 115 | --debugdir ${expdir} \ 116 | --minibatches ${N} \ 117 | --verbose ${verbose} \ 118 | --resume ${resume} \ 119 | --train-json $data/$train_set/asr.json \ 120 | --valid-json $data/$train_dev/asr.json \ 121 | --n-iter-processes $ngpu 122 | fi 123 | 124 | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 125 | echo "stage 3: Decoding" 126 | nj=30 127 | if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]]; then 128 | recog_model=model.last${n_average}.avg.best 129 | average_checkpoints.py --backend ${backend} \ 130 | --snapshots ${expdir}/results/snapshot.ep.* \ 131 | --out ${expdir}/results/${recog_model} \ 132 | --num ${n_average} 133 | fi 134 | pids=() # initialize pids 135 | for rtask in ${recog_set}; do 136 | ( 137 | decode_dir=decode_${rtask}_$(basename ${decode_config%.*})_${lmtag} 138 | feat_recog_dir=$data/$rtask 139 | echo $feat_recog_dir 140 | # split data 141 | splitjson.py --parts ${nj} ${feat_recog_dir}/asr.json 142 | 143 | #### use CPU for decoding 144 | ngpu=0 145 | 146 | ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \ 147 | asr_recog.py \ 148 | --config ${decode_config} \ 149 | --ngpu ${ngpu} \ 150 | --backend ${backend} \ 151 | --batchsize 0 \ 152 | --recog-json ${feat_recog_dir}/split${nj}utt/asr.JOB.json \ 153 | --result-label ${expdir}/${decode_dir}/asr.JOB.json \ 154 | --model ${expdir}/results/${recog_model} 155 | 156 | score_sclite.sh ${expdir}/${decode_dir} ${dict} 157 | ) & 158 | pids+=($!) # store background pids 159 | done 160 | i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done 161 | [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false 162 | echo "Finished" 163 | fi 164 | -------------------------------------------------------------------------------- /local/tools/data2json.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | echo "$0 $*" >&2 # Print the command line for logging 7 | . ./path.sh 8 | 9 | nj=20 10 | cmd="queue.pl -q all.q" 11 | nlsyms="" 12 | lang="" 13 | feat="" # feat.scp 14 | oov="" 15 | bpecode="" 16 | allow_one_column=false 17 | verbose=0 18 | trans_type=phn 19 | filetype="" 20 | preprocess_conf="" 21 | category="" 22 | out="" # If omitted, write in stdout 23 | 24 | text="" 25 | multilingual=false 26 | 27 | help_message=$(cat << EOF 28 | Usage: $0 29 | e.g. $0 data/train data/lang_1char/train_units.txt 30 | Options: 31 | --nj # number of parallel jobs 32 | --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. 33 | --feat # feat.scp or feat1.scp,feat2.scp,... 34 | --oov # Default: 35 | --out # If omitted, write in stdout 36 | --filetype # Specify the format of feats file 37 | --preprocess-conf # Apply preprocess to feats when creating shape.scp 38 | --verbose # Default: 0 39 | --text text_file # uttid to label of each utt 40 | EOF 41 | ) 42 | . 
utils/parse_options.sh 43 | 44 | if [ $# != 2 ]; then 45 | echo "${help_message}" 1>&2 46 | exit 1; 47 | fi 48 | 49 | set -euo pipefail 50 | 51 | dir=$1 52 | dic=$2 53 | tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) 54 | trap 'rm -rf ${tmpdir}' EXIT 55 | 56 | if [ -z ${text} ]; then 57 | text=${dir}/text 58 | fi 59 | 60 | # 1. Create scp files for inputs 61 | # These are not necessary for decoding mode, and make it as an option 62 | input= 63 | if [ -n "${feat}" ]; then 64 | _feat_scps=$(echo "${feat}" | tr ',' ' ' ) 65 | read -r -a feat_scps <<< $_feat_scps 66 | num_feats=${#feat_scps[@]} 67 | 68 | for (( i=1; i<=num_feats; i++ )); do 69 | feat=${feat_scps[$((i-1))]} 70 | mkdir -p ${tmpdir}/input_${i} 71 | input+="input_${i} " 72 | cat ${feat} > ${tmpdir}/input_${i}/feat.scp 73 | 74 | # Dump in the "legacy" style JSON format 75 | if [ -n "${filetype}" ]; then 76 | awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \ 77 | > ${tmpdir}/input_${i}/filetype.scp 78 | fi 79 | 80 | feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \ 81 | --filetype "${filetype}" \ 82 | --preprocess-conf "${preprocess_conf}" \ 83 | --verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp 84 | done 85 | fi 86 | 87 | # 2. Create scp files for outputs 88 | mkdir -p ${tmpdir}/output 89 | if [ -n "${bpecode}" ]; then 90 | if [ ${multilingual} = true ]; then 91 | # remove a space before the language ID 92 | paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ 93 | | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \ 94 | > ${tmpdir}/output/token.scp 95 | else 96 | paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ 97 | | spm_encode --model=${bpecode} --output_format=piece) \ 98 | > ${tmpdir}/output/token.scp 99 | fi 100 | elif [ -n "${nlsyms}" ]; then 101 | text2token.py -s 1 -n 1 -l ${nlsyms} ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp 102 | else 103 | text2token.py -s 1 -n 1 ${text} --trans_type ${trans_type} > ${tmpdir}/output/token.scp 104 | fi 105 | < ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp 106 | # +2 comes from CTC blank and EOS 107 | vocsize=$(tail -n 1 ${dic} | awk '{print $2}') 108 | odim=$(echo "$vocsize + 2" | bc) 109 | < ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp 110 | 111 | cat ${text} > ${tmpdir}/output/text.scp 112 | 113 | 114 | # 3. Create scp files for the others 115 | mkdir -p ${tmpdir}/other 116 | if [ ${multilingual} == true ]; then 117 | awk '{ 118 | n = split($1,S,"[-]"); 119 | lang=S[n]; 120 | print $1 " " lang 121 | }' ${text} > ${tmpdir}/other/lang.scp 122 | elif [ -n "${lang}" ]; then 123 | awk -v lang=${lang} '{print $1 " " lang}' ${text} > ${tmpdir}/other/lang.scp 124 | fi 125 | 126 | if [ -n "${category}" ]; then 127 | awk -v category=${category} '{print $1 " " category}' ${dir}/text \ 128 | > ${tmpdir}/other/category.scp 129 | fi 130 | cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp 131 | 132 | 133 | # 4. 
Merge scp files into a JSON file 134 | opts="" 135 | if [ -n "${feat}" ]; then 136 | intypes="${input} output other" 137 | else 138 | intypes="output other" 139 | fi 140 | for intype in ${intypes}; do 141 | if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then 142 | continue 143 | fi 144 | 145 | if [ ${intype} != other ]; then 146 | opts+="--${intype%_*}-scps " 147 | else 148 | opts+="--scps " 149 | fi 150 | 151 | for x in "${tmpdir}/${intype}"/*.scp; do 152 | k=$(basename ${x} .scp) 153 | if [ ${k} = shape ]; then 154 | opts+="shape:${x}:shape " 155 | else 156 | opts+="${k}:${x} " 157 | fi 158 | done 159 | done 160 | 161 | if ${allow_one_column}; then 162 | opts+="--allow-one-column true " 163 | else 164 | opts+="--allow-one-column false " 165 | fi 166 | 167 | if [ -n "${out}" ]; then 168 | opts+="-O ${out}" 169 | fi 170 | 171 | local/tools/merge_scp2json.py --verbose ${verbose} ${opts} 172 | 173 | rm -fr ${tmpdir} 174 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # AESRC2020 2 | 3 | 4 | #### Introduction 5 | 6 | Data preparation scripts and training pipeline for the Interspeech 2020 Accented English Speech Recognition Challenge (AESRC). 7 | 8 | #### Dependent Environment 9 | 10 | 1. Install Kaldi (Data preparation scripts, Track2 traditional ASR model training) 11 | [Github Link](https://github.com/kaldi-asr/kaldi) 12 | 2. Install ESPnet(Track1 E2E AR Model training, Track2 E2E ASR Transformer training) 13 | [Github Link](https://github.com/espnet/espnet) 14 | 3. (Optional) Install Google SentencePiece (Track2 E2E ASR modeling units building) 15 | [Github Link](https://github.com/google/sentencepiece) 16 | 4. (Optional) Install KenLM (N-gram language model training) 17 | [Github Link](http://https://github.com/kpu/kenlm) 18 | 19 | #### Usage 20 | 21 | **Data Preparation** 22 | 23 | 1. Download challenge data 24 | 2. Data preparation, divide cv set, feature extraction and bpe model training `./local/prepare_data.sh` 25 | 26 | **AR Track** 27 | 28 | Train Track1 ESPnet AR model `./local/track1_espnet_transformer_train.sh` 29 | 30 | **ASR Track** 31 | 32 | 1. Train Track2 Kaldi GMM alignment model `./local/track2_kaldi_gmm_train.sh` 33 | 2. Generate Lattice, decision tree, Train Track2 Kaldi Chain Model `./local/track2_kaldi_chain_train.sh` 34 | 3. Train Track2 ESPnet Transformer Model (Track2 ESPnet RNN Language Model) `./local/track2_espnet_transformer_train.sh` 35 | 36 | **Notice** 37 | 1. There's no lexicon provided, please prepare it by yourself. 38 | 2. Data augment methods are not included in scirpts. 39 | 3. Install Kaldi and ESPnet and activate their envrionment then you can run the scripts. 40 | 4. Baseline experiments in Track2 include several data using methods. 41 | 5. Participants should obey the rules about data strictly. 
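Putting the usage steps above together, a minimal end-to-end run might look like the sketch below (assuming the unzipped challenge audio sits under `raw_data/` and `kdata/` is used as the Kaldi-format data directory, which is the default in the training scripts):

```
./local/prepare_data.sh raw_data kdata
./local/track1_espnet_transformer_train.sh      # AR track
./local/track2_kaldi_gmm_train.sh               # ASR track: GMM alignment model
./local/track2_kaldi_chain_train.sh             # ASR track: chain model
./local/track2_espnet_transformer_train.sh      # ASR track: E2E transformer + RNNLM
```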
42 | 43 | #### Baseline Experiments Results 44 | 45 | **Track1** 46 | 47 | | Model | RU | KR | US | PT | JPN | UK | CHN | IND | AVE | 48 | | -------- | -- |---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | 49 | | Transformer-3L | 30.0 | 45.0 | 45.7 | 57.2 | 48.5 | 70.0 | 56.2 | 83.5 | 54.1 | 50 | | Transformer-6L | 34.0 | 43.7 | 30.6 | 65.7 | 44.0 | 74.5 | 50.9 | 75.2 | 52.2 | 51 | | Transformer-12L | 49.6 | 26.0 | 21.2 | 51.8 | 42.7 | 85.0 | 38.2 | 66.1 | 47.8 | 52 | | + ASR-init | 75.7 | 55.6 | 60.2 | 85.5 | 73.2 | 93.9 | 67.0 | 97.0 | 76.1 | 53 | 54 | Transformer-3L, Transformer-6L, Transformer-12L all use`./local/track1_espnet_transformer_train.sh` (elayers: 3, 6, 12) 55 | 56 | ASR-init uses encoder in Track2 to initialize self-attention parameters 57 | 58 | *In cv sets, we found that the acc of some accent is strongly related with speaker. As there are few speakers in cv sets, the absolute value above is not statistically significant, and the test set will contain more speakers 59 | 60 | **Track2** 61 | 62 | Kaldi Hybrid Chain Model: CNN + 18 TDNN 63 | *Based on internal non open source dictionary 64 | *Results on CMU dict comes up soon 65 | 66 | ESPnet Transformer Model: 12 Encoder + 6 Decoder (simple self-attention, CTC joint training used, 1k sub-word BPE) 67 | 68 | You can find detailed hyperparameters settings in `./local/files/conf/` and training scripts 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 |
WER (%) on cv set:

| Data | Decode Related | RU | KR | US | PT | JPN | UK | CHN | IND | AVE |
| -------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| **Kaldi** | | | | | | | | | | |
| Accent160 | - | 6.67 | 11.46 | 15.95 | 10.27 | 9.78 | 16.88 | 20.97 | 17.48 | 13.68 |
| Libri960 ~ Accent160 | - | 6.61 | 10.95 | 15.33 | 9.79 | 9.75 | 16.03 | 19.68 | 16.93 | 13.13 |
| Accent160 + Libri160 | - | 6.95 | 11.76 | 13.05 | 9.96 | 10.15 | 14.21 | 20.76 | 18.26 | 13.14 |
| **ESPnet** | | | | | | | | | | |
| Accent160 | +0.3RNNLM | 5.26 | 7.69 | 9.96 | 7.45 | 6.79 | 10.06 | 11.77 | 10.05 | 8.63 |
| Libri960 ~ Accent160 | +0.3RNNLM | 4.6 | 6.4 | 7.42 | 5.9 | 5.71 | 7.64 | 9.87 | 7.85 | 6.92 |
| Accent160 + Libri160 | - | 5.35 | 9.07 | 8.52 | 7.13 | 7.29 | 8.6 | 12.03 | 9.05 | 8.38 |
| Accent160 + Libri160 | +0.3RNNLM | 4.68 | 7.59 | 7.7 | 6.42 | 6.37 | 7.76 | 10.88 | 8.41 | 7.48 |
| Accent160 + Libri160 | +0.3RNNLM+0.3CTC | 4.76 | 7.81 | 7.71 | 6.36 | 6.4 | 7.23 | 10.77 | 8.01 | 7.38 |
194 | * Data A ~ Data B means fine-tune Data A model with Data B 195 | -------------------------------------------------------------------------------- /local/prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | raw_data=$1 # raw data with metadata, txt and wav 7 | data=$2 # data transformed into kaldi format 8 | zipped_data=$raw_data/AESRC2020.zip 9 | 10 | stage=2 11 | feature_cmd="run.pl" 12 | nj=50 13 | 14 | vocab_size=1000 15 | 16 | 17 | # unzip and rename each accent 18 | if [ $stage -le 1 ];then 19 | # unzip $zipped_data 20 | mv $raw_data/American\ English\ Speech\ Data $raw_data/US 21 | mv $raw_data/British\ English\ Speech\ Data $raw_data/UK 22 | mv $raw_data/Chinese\ Speaking\ English\ Speech\ Data $raw_data/CHN 23 | mv $raw_data/Indian\ English\ Speech\ Data $raw_data/IND 24 | mv $raw_data/Portuguese\ Speaking\ English\ Speech\ Data $raw_data/PT 25 | mv $raw_data/Russian\ Speaking\ English\ Speech\ Data $raw_data/RU 26 | mv $raw_data/Japanese\ Speaking\ English\ Speech\ Data $raw_data/JPN 27 | mv $raw_data/Korean\ Speaking\ English\ Speech\ Data $raw_data/KR 28 | fi 29 | 30 | 31 | # generate kaldi format data for all 32 | if [ $stage -le 2 ];then 33 | echo "Generating kaldi format data." 34 | mkdir -p $data/data_all 35 | find `pwd`/ -name '*.wav' > $data/data_all/wavpath 36 | awk -F'/' '{print $(NF-2)"-"$(NF-1)"-"$NF}' $data/data_all/wavpath | sed 's:\.wav::g' > $data/data_all/uttlist 37 | paste $data/data_all/uttlist $data/data_all/wavpath > $data/data_all/wav.scp 38 | python local/tools/preprocess.py $data/data_all/wav.scp $data/data_all/trans $data/data_all/utt2spk # faster than for in shell 39 | ./utils/utt2spk_to_spk2utt.pl $data/data_all/utt2spk > $data/data_all/spk2utt 40 | fi 41 | 42 | 43 | # clean transcription 44 | if [ $stage -le 3 ];then 45 | echo "Cleaning transcription." 46 | tr '[a-z]' '[A-Z]' < $data/data_all/trans > $data/data_all/trans_upper 47 | # turn "." 
in specific abbreviations into "" tag 48 | sed -i -e 's: MR\.: MR:g' -e 's: MRS\.: MRS:g' -e 's: MS\.: MS:g' \ 49 | -e 's:^MR\.:MR:g' -e 's:^MRS\.:MRS:g' -e 's:^MS\.:MS:g' $data/data_all/trans_upper 50 | # fix bug 51 | sed -i 's:^ST\.:STREET:g' $data/data_all/trans_upper 52 | sed -i 's: ST\.: STREET:g' $data/data_all/trans_upper 53 | # punctuation marks 54 | sed -i "s%,\|\.\|?\|!\|;\|-\|:\|,'\|\.'\|?'\|!'\| '% %g" $data/data_all/trans_upper 55 | sed -i 's::.:g' $data/data_all/trans_upper 56 | # blank 57 | sed -i 's:[ ][ ]*: :g' $data/data_all/trans_upper 58 | paste $data/data_all/uttlist $data/data_all/trans_upper > $data/data_all/text 59 | fi 60 | 61 | exit 1; 62 | # extracting filter-bank features and cmvn 63 | if [ $stage -le 4 ];then 64 | ./utils/fix_data_dir.sh $data/data_all 65 | ./steps/make_fbank.sh --cmd $feature_cmd --nj $nj --fbank-config conf/fbank.conf $data/data_all $data/feats/log $data/feats/ark 66 | ./steps/compute_cmvn_stats.sh $data/data_all $data/feats/log $data/feats/ark # for kaldi 67 | fi 68 | 69 | exit 1; 70 | # divide development set for cross validation 71 | if [ $stage -le 5 ];then 72 | for i in US UK IND CHN JPN PT RU KR;do 73 | ./utils/subset_data_dir.sh --spk-list local/files/cvlist/${i}_cv_spk $data/data_all $data/cv/$i 74 | cat $data/cv/$i/feats.scp >> $data/cv.scp 75 | done 76 | ./utils/filter_scp.pl --exclude $data/cv.scp $data/data_all/feats.scp > $data/train.scp 77 | ./utils/subset_data_dir.sh --utt-list $data/train.scp $data/data_all $data/train 78 | ./utils/subset_data_dir.sh --utt-list $data/cv.scp $data/data_all $data/cv_all 79 | compute-cmvn-stats scp:$data/train/feats.scp `pwd`/$data/train/dump_cmvn.ark # for espnet 80 | rm $data/cv.scp $data/train.scp 81 | fi 82 | 83 | 84 | # generate label file and dump features for track2:E2E 85 | if [ $stage -le 6 ];then 86 | for i in US UK IND CHN JPN PT RU KR;do 87 | local/tools/dump.sh --cmd $feature_cmd --nj 3 --do_delta false \ 88 | $data/cv/$i/feats.scp $data/train/dump_cmvn.ark $data/cv/$i/dump/log $data/cv/$i/dump # for track2 e2e testing 89 | done 90 | local/tools/dump.sh --cmd $feature_cmd --nj $nj --do_delta false \ 91 | $data/train/feats.scp $data/train/dump_cmvn.ark $data/train/dump/log $data/train/dump # for track2 e2e training 92 | # for track1, utterance-level CMVN is applied 93 | for data_set in train cv_all; do 94 | set_dir=$data/$data_set 95 | # hack to set utterance-level spk2utt & utt2spk 96 | awk '{printf "%s %s\n", $1, $1 }' $set_dir/text > $set_dir/spk2utt.utt 97 | cp $set_dir/spk2utt.utt $set_dir/utt2spk.utt 98 | compute-cmvn-stats --spk2utt=ark:$set_dir/spk2utt.utt scp:$set_dir/feats.scp \ 99 | ark,scp:`pwd`/$set_dir/cmvn_utt.ark,$set_dir/cmvn_utt.scp 100 | local/tools/dump_spk_yzl23.sh --cmd slurm.pl --nj 48 \ 101 | $set_dir/feats.scp $set_dir/cmvn_utt.scp \ 102 | exp/dump_feats/$data_set $set_dir/dump_utt $set_dir/utt2spk.utt 103 | done 104 | fi 105 | 106 | 107 | # generate label file for track1 108 | if [ $stage -le 7 ];then 109 | for i in train cv_all;do 110 | cut -f 1 $data/$i/text > $data/$i/uttlist 111 | cut -d '-' -f 1 $data/$i/text | sed -e "s:^:<:g" -e "s:$:>:g" > $data/$i/accentlist 112 | paste $data/$i/uttlist $data/$i/accentlist > $data/$i/utt2accent 113 | rm $data/$i/uttlist 114 | local/tools/data2json.sh --nj 20 --feat $data/$i/dump_utt/feats.scp --text $data/$i/utt2accent --oov 8 $data/$i local/files/ar.dict > $data/$i/ar.json 115 | done 116 | fi 117 | 118 | 119 | # generate label file for track2 e2e 120 | if [ $stage -le 8 ];then 121 | # goolgle sentence piece toolkit 
is used to train a bpe model and decode 122 | mkdir -p $data/bpe 123 | mkdir -p $data/lang 124 | # male sure you have installed sentencepiece successfully 125 | spm_train \ 126 | --input=$data/train/trans_upper \ 127 | --model_prefix=$data/bpe/bpe_${vocab_size} \ 128 | --vocab_size=$vocab_size \ 129 | --character_coverage=1.0 \ 130 | --model_type=unigram 131 | python local/tools/word_frequency.py $data/train/trans_upper 0 $data/bpe/train 132 | cut -d ' ' -f 1 $data/bpe/train.enwf | awk '{if(NF==1)print $0}' > $data/bpe/wordlist.txt 133 | spm_encode \ 134 | --model=$data/bpe/bpe_${vocab_size}.model \ 135 | --output_format=piece < $data/bpe/wordlist.txt > $data/bpe/bpelist.txt 136 | paste $data/bpe/wordlist.txt $data/bpe/bpelist.txt > $data/lang/lexicon.txt 137 | sed -i 's:▁ :▁:g' $data/lang/lexicon.txt 138 | python local/tools/apply_lexicon.py $data/lang/lexicon.txt $data/train/text $data/train/utt2tokens "" $data/train/.warning $data/lang/units.txt 139 | local/tools/data2json.sh --nj 20 --feat $data/train/dump/feats.scp --text $data/train/utt2tokens --oov 0 $data/train $data/lang/units.txt > $data/train/asr.json || exit 1; 140 | for i in US UK IND CHN JPN PT RU KR; do 141 | # units.txt generate form cv set aborted 142 | python local/tools/apply_lexicon.py $data/lang/lexicon.txt $data/cv/$i/text $data/cv/$i/utt2tokens "" $data/cv/$i/.warning $data/cv/${i}/.units.txt || exit 1; 143 | local/tools/data2json.sh --nj 20 --feat $data/cv/$i/dump/feats.scp --text $data/cv/$i/utt2tokens --oov 0 $data/cv/$i $data/lang/units.txt > $data/cv/$i/asr.json 144 | done 145 | 146 | fi 147 | 148 | echo "local/prepare_data.sh succeeded" 149 | exit 0; 150 | -------------------------------------------------------------------------------- /local/track2_kaldi_chain_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) 4 | # Apache 2.0 5 | 6 | set -e 7 | 8 | exp=exp-kaldi 9 | data=kdata 10 | # configs for 'chain' 11 | affix=all 12 | stage=1 13 | train_stage=-6 14 | get_egs_stage=0 15 | dir=$exp/chain/tdnn # Note: _sp will get added to this 16 | decode_iter= 17 | 18 | # training options 19 | num_epochs=5 20 | initial_effective_lrate=0.001 21 | final_effective_lrate=0.0001 22 | max_param_change=2.0 23 | final_layer_normalize_target=0.5 24 | num_jobs_initial=2 25 | num_jobs_final=2 26 | nj=50 27 | minibatch_size=128 28 | dropout_schedule='0,0@0.20,0.3@0.50,0' 29 | frames_per_eg=150,110,90 30 | remove_egs=false 31 | common_egs_dir= 32 | common_egs_dir= 33 | xent_regularize=0.1 34 | graph=$exp/chain/graph 35 | 36 | # End configuration section. 37 | echo "$0 $@" # Print the command line for logging 38 | 39 | . ./cmd.sh 40 | . ./path.sh 41 | . ./utils/parse_options.sh 42 | 43 | if ! cuda-compiled; then 44 | cat <$lang/topo 78 | fi 79 | 80 | if [ $stage -le 3 ]; then 81 | # Build a tree using our new topology. This is the critically different 82 | # step compared with other recipes. 
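# (Sketch of what this stage produces: the tree is built on the GMM alignments in $ali_dir with a
#  frame-subsampling factor of 3 and 11500 leaves; the resulting $treedir/tree supplies num-pdfs
#  for the xconfig network generated in the next stage.)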
83 | steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ 84 | --context-opts "--context-width=2 --central-position=1" \ 85 | --cmd "$train_cmd" 11500 $data/$train_set $lang $ali_dir $treedir 86 | fi 87 | 88 | if [ $stage -le 4 ]; then 89 | echo "$0: creating neural net configs using the xconfig parser"; 90 | 91 | num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') 92 | 93 | learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) 94 | output_opts="l2-regularize=0.015" 95 | tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" 96 | tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" 97 | prefinal_opts="l2-regularize=0.03" 98 | ivector_affine_opts="l2-regularize=0.005" 99 | cnn_opts="l2-regularize=0.005" 100 | linear_opts="orthonormal-constraint=1.0" 101 | echo "$feat_dim" 102 | mkdir -p $dir/configs 103 | cat < $dir/configs/network.xconfig 104 | input dim=71 name=input 105 | conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=71 height-out=71 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=32 106 | linear-component name=cnn2 dim=284 $linear_opts 107 | # the first splicing is moved before the lda layer, so no splicing here 108 | relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1280 109 | tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=1 110 | tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=1 111 | tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=1 112 | tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=0 113 | tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 114 | tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 115 | tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 116 | tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 117 | tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 118 | tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 119 | tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 120 | tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 121 | tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 122 | tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 123 | tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 124 | tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 125 | tdnnf-layer name=tdnnf18 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 126 | tdnnf-layer name=tdnnf19 $tdnnf_opts dim=1280 bottleneck-dim=256 time-stride=3 127 | linear-component name=prefinal-l dim=512 $linear_opts 128 | 129 | ## adding the layers for chain branch 130 | prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=512 big-dim=1280 131 | output-layer name=output include-log-softmax=false dim=$num_targets $output_opts 132 | 133 | # adding the layers for xent branch 134 | prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=512 big-dim=1280 135 | output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts 136 | EOF 137 | steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ 138 | fi 139 | 140 | if [ $stage -le 5 ]; 
then 141 | steps/nnet3/chain/train.py --stage $train_stage \ 142 | --cmd "run.pl" \ 143 | --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ 144 | --chain.xent-regularize $xent_regularize \ 145 | --chain.leaky-hmm-coefficient 0.1 \ 146 | --chain.l2-regularize 0.00005 \ 147 | --chain.apply-deriv-weights false \ 148 | --chain.lm-opts="--num-extra-lm-states=2000" \ 149 | --egs.dir "$common_egs_dir" \ 150 | --egs.stage $get_egs_stage \ 151 | --egs.opts "--frames-overlap-per-eg 0" \ 152 | --egs.chunk-width $frames_per_eg \ 153 | --trainer.dropout-schedule $dropout_schedule \ 154 | --trainer.num-chunk-per-minibatch $minibatch_size \ 155 | --trainer.frames-per-iter 1500000 \ 156 | --trainer.num-epochs $num_epochs \ 157 | --trainer.optimization.num-jobs-initial $num_jobs_initial \ 158 | --trainer.optimization.num-jobs-final $num_jobs_final \ 159 | --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ 160 | --trainer.optimization.final-effective-lrate $final_effective_lrate \ 161 | --trainer.max-param-change $max_param_change \ 162 | --cleanup.remove-egs $remove_egs \ 163 | --feat-dir $data/${train_set} \ 164 | --tree-dir $treedir \ 165 | --lat-dir $exp/tri4_sp_lats \ 166 | --dir $dir || exit 1; 167 | fi 168 | 169 | 170 | if [ $stage -le 6 ]; then 171 | ./local/mkgraph.sh $lang $dir/final.mdl $graph 172 | fi 173 | 174 | if [ $stage -le 7 ]; then 175 | for test_set in $test_sets; do 176 | steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ 177 | --nj 20 --cmd "$decode_cmd" \ 178 | $graph $data/cv/${test_set} $dir/decode_${test_set} || exit 1; 179 | done 180 | fi 181 | 182 | echo "local/track2_kaldi_chain_train.sh succeeded" 183 | exit 0; 184 | -------------------------------------------------------------------------------- /module/track1_accent_transformer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | 4 | # Copyright 2020 SpeechLab @ SJTU (Author: Yizhou Lu) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """ Transformer-based accent recognition model (pytorch), 8 | Codes mainly borrowed from espnet (https://github.com/espnet/espnet) 9 | """ 10 | 11 | from argparse import Namespace 12 | from distutils.util import strtobool 13 | 14 | import logging 15 | import math 16 | 17 | import torch 18 | import chainer 19 | from chainer import reporter 20 | 21 | from espnet.nets.asr_interface import ASRInterface 22 | from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD 23 | from espnet.nets.pytorch_backend.nets_utils import make_pad_mask 24 | from espnet.nets.pytorch_backend.nets_utils import th_accuracy 25 | from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention 26 | from espnet.nets.pytorch_backend.transformer.encoder import Encoder 27 | from espnet.nets.pytorch_backend.transformer.initializer import initialize 28 | from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import LabelSmoothingLoss 29 | from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport 30 | 31 | class Reporter(chainer.Chain): 32 | """A chainer reporter wrapper.""" 33 | 34 | def report(self, acc, loss): 35 | """Report at every step.""" 36 | reporter.report({'acc': acc}, self) 37 | reporter.report({'loss': loss}, self) 38 | 39 | class E2E(ASRInterface, torch.nn.Module): 40 | """E2E module. 
41 | 42 | :param int idim: dimension of inputs 43 | :param int odim: dimension of outputs 44 | :param Namespace args: argument Namespace containing options 45 | 46 | """ 47 | 48 | @staticmethod 49 | def add_arguments(parser): 50 | """Add arguments.""" 51 | group = parser.add_argument_group("transformer model setting") 52 | 53 | group.add_argument("--transformer-init", type=str, default="pytorch", 54 | choices=["pytorch", "xavier_uniform", "xavier_normal", 55 | "kaiming_uniform", "kaiming_normal"], 56 | help='how to initialize transformer parameters') 57 | group.add_argument("--transformer-input-layer", type=str, default="conv2d", 58 | choices=["conv2d", "linear", "embed"], 59 | help='transformer input layer type') 60 | group.add_argument('--transformer-attn-dropout-rate', default=None, type=float, 61 | help='dropout in transformer attention. use --dropout-rate if None is set') 62 | group.add_argument('--transformer-lr', default=10.0, type=float, 63 | help='Initial value of learning rate') 64 | group.add_argument('--transformer-warmup-steps', default=25000, type=int, 65 | help='optimizer warmup steps') 66 | group.add_argument('--transformer-length-normalized-loss', default=True, type=strtobool, 67 | help='normalize loss by length') 68 | 69 | group.add_argument('--dropout-rate', default=0.0, type=float, 70 | help='Dropout rate for the encoder') 71 | # Encoder 72 | group.add_argument('--elayers', default=4, type=int, 73 | help='Number of encoder layers (for shared recognition part in multi-speaker asr mode)') 74 | group.add_argument('--eunits', '-u', default=300, type=int, 75 | help='Number of encoder hidden units') 76 | # Attention 77 | group.add_argument('--adim', default=320, type=int, 78 | help='Number of attention transformation dimensions') 79 | group.add_argument('--aheads', default=4, type=int, 80 | help='Number of heads for multi head attention') 81 | group.add_argument('--pretrained-model', default="", type=str, 82 | help='pretrained ASR model for initialization') 83 | return parser 84 | 85 | @property 86 | def attention_plot_class(self): 87 | """Return PlotAttentionReport.""" 88 | return PlotAttentionReport 89 | 90 | def __init__(self, idim, odim, args, ignore_id=-1): 91 | """Construct an E2E object. 
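For orientation, here is a hypothetical sketch of how the options registered in add_arguments above, together with ESPnet's common training options, reach the constructor below. Every value is a placeholder; in the real recipe ESPnet's training entry point fills args from the YAML config and takes idim/odim from the data json.

# All values below are illustrative placeholders, not the recipe's settings.
from argparse import Namespace
from module.track1_accent_transformer import E2E   # requires espnet to be installed

args = Namespace(
    transformer_init="pytorch",
    transformer_input_layer="conv2d",
    transformer_attn_dropout_rate=None,       # None -> falls back to dropout_rate in __init__
    transformer_length_normalized_loss=True,
    dropout_rate=0.1,
    elayers=12, eunits=2048, adim=256, aheads=4,
    lsm_weight=0.1,                           # label-smoothing weight, supplied by espnet's common options
    pretrained_model="",                      # optionally a trained track2 ASR snapshot used for initialization
)
model = E2E(idim=80, odim=9, args=args)       # odim here counts the extra dimension added by data2json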
92 | 93 | :param int idim: dimension of inputs 94 | :param int odim: dimension of outputs 95 | :param Namespace args: argument Namespace containing options 96 | """ 97 | torch.nn.Module.__init__(self) 98 | if args.transformer_attn_dropout_rate is None: 99 | args.transformer_attn_dropout_rate = args.dropout_rate 100 | self.encoder = Encoder( 101 | idim=idim, 102 | attention_dim=args.adim, 103 | attention_heads=args.aheads, 104 | linear_units=args.eunits, 105 | num_blocks=args.elayers, 106 | input_layer=args.transformer_input_layer, 107 | dropout_rate=args.dropout_rate, 108 | positional_dropout_rate=args.dropout_rate, 109 | attention_dropout_rate=args.transformer_attn_dropout_rate 110 | ) 111 | odim = odim - 1 # ignore additional dim added by data2json 112 | self.odim = odim 113 | self.ignore_id = ignore_id 114 | self.subsample = [1] 115 | self.reporter = Reporter() 116 | self.criterion = LabelSmoothingLoss(self.odim, self.ignore_id, args.lsm_weight, 117 | args.transformer_length_normalized_loss) 118 | self.output = torch.nn.Linear(2 * args.adim, self.odim) # mean + std pooling 119 | # reset parameters 120 | self.reset_parameters(args) 121 | logging.warning(self) 122 | 123 | def reset_parameters(self, args): 124 | """Initialize parameters.""" 125 | # initialize parameters 126 | if args.pretrained_model: 127 | path = args.pretrained_model 128 | logging.warning("load pretrained asr model from {}".format(path)) 129 | if 'snapshot' in path: 130 | model_state_dict = torch.load(path, map_location=lambda storage, loc: storage)['model'] 131 | else: 132 | model_state_dict = torch.load(path, map_location=lambda storage, loc: storage) 133 | self.load_state_dict(model_state_dict, strict=False) 134 | del model_state_dict 135 | else: 136 | initialize(self, args.transformer_init) 137 | 138 | def forward(self, xs_pad, ilens, ys_pad): 139 | """E2E forward. 140 | 141 | :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim) 142 | :param torch.Tensor ilens: batch of lengths of source sequences (B) 143 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 144 | :return: label smoothing loss value 145 | :rtype: torch.Tensor 146 | """ 147 | # forward encoder 148 | xs_pad = xs_pad[:, :max(ilens)] # for data parallel 149 | src_mask = (~make_pad_mask(ilens.tolist())).to(xs_pad.device).unsqueeze(-2) 150 | hs_pad, hs_mask = self.encoder(xs_pad, src_mask) 151 | mean = torch.mean(hs_pad, dim=1).unsqueeze(1) 152 | std = torch.std(hs_pad, dim=1).unsqueeze(1) 153 | hs_pad = torch.cat((mean, std), dim=-1) # (B, 1, D) 154 | # output layer 155 | pred_pad = self.output(hs_pad) 156 | 157 | # compute loss 158 | self.loss = self.criterion(pred_pad, ys_pad) 159 | self.acc = th_accuracy(pred_pad.view(-1, self.odim), ys_pad, 160 | ignore_label=self.ignore_id) 161 | 162 | loss_data = float(self.loss) 163 | if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data): 164 | self.reporter.report(self.acc, loss_data) 165 | else: 166 | logging.warning('loss (=%f) is not correct', loss_data) 167 | return self.loss 168 | 169 | def encode(self, x): 170 | """Encode acoustic features. 
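The forward pass above collapses the variable-length encoder output into a fixed-size utterance vector by statistics pooling: the per-dimension mean and standard deviation over time are concatenated and fed to a single linear layer that yields accent posteriors. Below is a self-contained re-implementation of just that pooling-and-classify step, with plain cross-entropy standing in for the label-smoothing loss used here and all sizes made up.

import torch
import torch.nn.functional as F

def stats_pooling_logits(hs, proj):
    """hs: (B, T, D) encoder outputs; proj: Linear(2*D, n_classes)."""
    mean = hs.mean(dim=1)                        # (B, D)
    std = hs.std(dim=1)                          # (B, D)
    # the model above keeps a singleton time axis for its label-smoothing loss; dropped here for brevity
    return proj(torch.cat((mean, std), dim=-1))  # (B, n_classes)

B, T, D, n_accents = 4, 120, 256, 8              # illustrative sizes only
proj = torch.nn.Linear(2 * D, n_accents)
hs = torch.randn(B, T, D)
labels = torch.randint(0, n_accents, (B,))
loss = F.cross_entropy(stats_pooling_logits(hs, proj), labels)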
171 | 172 | :param ndarray x: source acoustic feature (T, D) 173 | :return: encoder outputs 174 | :rtype: torch.Tensor 175 | """ 176 | self.eval() 177 | x = torch.as_tensor(x).unsqueeze(0) # (B, T, D) with #B=1 178 | enc_output, _ = self.encoder(x, None) 179 | return enc_output.squeeze(0) # returns tensor(T, D) 180 | 181 | # todo: batch decoding 182 | def recognize(self, x, recog_args, char_list=None, rnnlm=None, use_jit=False): 183 | """Recognize input speech. 184 | 185 | """ 186 | enc_output = self.encode(x).unsqueeze(0) # (1, T, D) 187 | mean = torch.mean(enc_output, dim=1).unsqueeze(1) # (1, 1, D) 188 | std = torch.std(enc_output, dim=1).unsqueeze(1) 189 | enc_output = torch.cat((mean, std), dim=-1) 190 | lpz = self.output(enc_output) 191 | lpz = lpz.squeeze(0) # shape of (T, D) 192 | idx = lpz.argmax(-1).cpu().numpy().tolist() 193 | hyp = {} 194 | # [-1] is added here to be compatible with ASR decoding, see espnet/asr/asr_utils/parse_hypothesis 195 | hyp['yseq'] = [-1] + idx 196 | hyp['score'] = -1 197 | logging.info(hyp['yseq']) 198 | return [hyp] 199 | 200 | def calculate_all_attentions(self, xs_pad, ilens, ys_pad): 201 | """E2E attention calculation. 202 | 203 | :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim) 204 | :param torch.Tensor ilens: batch of lengths of input sequences (B) 205 | :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax) 206 | :return: attention weights with the following shape, 207 | 1) multi-head case => attention weights (B, H, Lmax, Tmax), 208 | 2) other case => attention weights (B, Lmax, Tmax). 209 | :rtype: float ndarray 210 | """ 211 | with torch.no_grad(): 212 | self.forward(xs_pad, ilens, ys_pad) 213 | ret = dict() 214 | for name, m in self.named_modules(): 215 | if isinstance(m, MultiHeadedAttention): 216 | ret[name] = m.attn.cpu().numpy() 217 | return ret 218 | 219 | # fix calculate_all_ctc_probs method not implemented bug 220 | def calculate_all_ctc_probs(self, xs_pad, ilens, ys_pad): 221 | return None 222 | 223 | -------------------------------------------------------------------------------- /local/tools/merge_scp2json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # encoding: utf-8 3 | 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import argparse 8 | import codecs 9 | from distutils.util import strtobool 10 | from io import open 11 | import json 12 | import logging 13 | import sys 14 | 15 | from espnet.utils.cli_utils import get_commandline_args 16 | 17 | PY2 = sys.version_info[0] == 2 18 | sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else sys.stdin.buffer) 19 | sys.stdout = codecs.getwriter('utf-8')( 20 | sys.stdout if PY2 else sys.stdout.buffer) 21 | 22 | 23 | # Special types: 24 | def shape(x): 25 | """Change str to List[int] 26 | 27 | >>> shape('3,5') 28 | [3, 5] 29 | >>> shape(' [3, 5] ') 30 | [3, 5] 31 | 32 | """ 33 | 34 | # x: ' [3, 5] ' -> '3, 5' 35 | x = x.strip() 36 | if x[0] == '[': 37 | x = x[1:] 38 | if x[-1] == ']': 39 | x = x[:-1] 40 | 41 | return list(map(int, x.split(','))) 42 | 43 | 44 | def get_parser(): 45 | parser = argparse.ArgumentParser( 46 | description='Given each file paths with such format as ' 47 | '::. type> can be omitted and the default ' 48 | 'is "str". e.g. 
{} ' 49 | '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape ' 50 | '--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape ' 51 | '--output-scps text:data/text shape:data/utt2text_shape:shape ' 52 | '--scps utt2spk:data/utt2spk'.format(sys.argv[0]), 53 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 54 | parser.add_argument('--input-scps', type=str, nargs='*', action='append', 55 | default=[], help='Json files for the inputs') 56 | parser.add_argument('--output-scps', type=str, nargs='*', action='append', 57 | default=[], help='Json files for the outputs') 58 | parser.add_argument('--scps', type=str, nargs='+', default=[], 59 | help='The json files except for the input and outputs') 60 | parser.add_argument('--verbose', '-V', default=1, type=int, 61 | help='Verbose option') 62 | parser.add_argument('--allow-one-column', type=strtobool, default=False, 63 | help='Allow one column in input scp files. ' 64 | 'In this case, the value will be empty string.') 65 | parser.add_argument('--out', '-O', type=str, 66 | help='The output filename. ' 67 | 'If omitted, then output to sys.stdout') 68 | return parser 69 | 70 | 71 | if __name__ == '__main__': 72 | parser = get_parser() 73 | args = parser.parse_args() 74 | args.scps = [args.scps] 75 | 76 | # logging info 77 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 78 | if args.verbose > 0: 79 | logging.basicConfig(level=logging.INFO, format=logfmt) 80 | else: 81 | logging.basicConfig(level=logging.WARN, format=logfmt) 82 | logging.info(get_commandline_args()) 83 | 84 | # List[List[Tuple[str, str, Callable[[str], Any], str, str]]] 85 | input_infos = [] 86 | output_infos = [] 87 | infos = [] 88 | for lis_list, key_scps_list in [(input_infos, args.input_scps), 89 | (output_infos, args.output_scps), 90 | (infos, args.scps)]: 91 | for key_scps in key_scps_list: 92 | lis = [] 93 | for key_scp in key_scps: 94 | sps = key_scp.split(':') 95 | if len(sps) == 2: 96 | key, scp = sps 97 | type_func = None 98 | type_func_str = 'none' 99 | elif len(sps) == 3: 100 | key, scp, type_func_str = sps 101 | fail = False 102 | 103 | try: 104 | # type_func: Callable[[str], Any] 105 | # e.g. type_func_str = "int" -> type_func = int 106 | type_func = eval(type_func_str) 107 | except Exception: 108 | raise RuntimeError( 109 | 'Unknown type: {}'.format(type_func_str)) 110 | 111 | if not callable(type_func): 112 | raise RuntimeError( 113 | 'Unknown type: {}'.format(type_func_str)) 114 | 115 | else: 116 | raise RuntimeError( 117 | 'Format : ' 118 | 'or :: ' 119 | 'e.g. feat:data/feat.scp ' 120 | 'or shape:data/feat.scp:shape: {}'.format(key_scp)) 121 | 122 | for item in lis: 123 | if key == item[0]: 124 | raise RuntimeError('The key "{}" is duplicated: {} {}' 125 | .format(key, item[3], key_scp)) 126 | 127 | lis.append((key, scp, type_func, key_scp, type_func_str)) 128 | lis_list.append(lis) 129 | 130 | # Open scp files 131 | input_fscps = [[open(i[1], 'r', encoding='utf-8') 132 | for i in il] for il in input_infos] 133 | output_fscps = [[open(i[1], 'r', encoding='utf-8') for i in il] 134 | for il in output_infos] 135 | fscps = [[open(i[1], 'r', encoding='utf-8') for i in il] for il in infos] 136 | 137 | # Note(kamo): What is done here? 138 | # The final goal is creating a JSON file such as. 139 | # { 140 | # "utts": { 141 | # "sample_id1": {(omitted)}, 142 | # "sample_id2": {(omitted)}, 143 | # .... 
144 | # } 145 | # } 146 | # 147 | # To reduce memory usage, reading the input text files for each lines 148 | # and writing JSON elements per samples. 149 | if args.out is None: 150 | out = sys.stdout 151 | else: 152 | out = open(args.out, 'w', encoding='utf-8') 153 | out.write('{\n "utts": {\n') 154 | nutt = 0 155 | while True: 156 | nutt += 1 157 | # List[List[str]] 158 | input_lines = [[f.readline() for f in fl] for fl in input_fscps] 159 | output_lines = [[f.readline() for f in fl] for fl in output_fscps] 160 | lines = [[f.readline() for f in fl] for fl in fscps] 161 | 162 | # Get the first line 163 | concat = sum(input_lines + output_lines + lines, []) 164 | if len(concat) == 0: 165 | break 166 | first = concat[0] 167 | 168 | # Sanity check: Must be sorted by the first column and have same keys 169 | count = 0 170 | for ls_list in (input_lines, output_lines, lines): 171 | for ls in ls_list: 172 | for line in ls: 173 | if line == '' or first == '': 174 | if line != first: 175 | concat = sum( 176 | input_infos + output_infos + infos, []) 177 | raise RuntimeError( 178 | 'The number of lines mismatch ' 179 | 'between: "{}" and "{}"' 180 | .format(concat[0][1], concat[count][1])) 181 | 182 | elif line.split()[0] != first.split()[0]: 183 | concat = sum(input_infos + output_infos + infos, []) 184 | raise RuntimeError( 185 | 'The keys are mismatch at {}th line ' 186 | 'between "{}" and "{}":\n>>> {}\n>>> {}' 187 | .format(nutt, concat[0][1], concat[count][1], 188 | first.rstrip(), line.rstrip())) 189 | count += 1 190 | 191 | # The end of file 192 | if first == '': 193 | if nutt != 1: 194 | out.write('\n') 195 | break 196 | if nutt != 1: 197 | out.write(',\n') 198 | 199 | entry = {} 200 | for inout, _lines, _infos in [('input', input_lines, input_infos), 201 | ('output', output_lines, output_infos), 202 | ('other', lines, infos)]: 203 | 204 | lis = [] 205 | for idx, (line_list, info_list) \ 206 | in enumerate(zip(_lines, _infos), 1): 207 | if inout == 'input': 208 | d = {'name': 'input{}'.format(idx)} 209 | elif inout == 'output': 210 | d = {'name': 'target{}'.format(idx)} 211 | else: 212 | d = {} 213 | 214 | # info_list: List[Tuple[str, str, Callable]] 215 | # line_list: List[str] 216 | for line, info in zip(line_list, info_list): 217 | sps = line.split(None, 1) 218 | if len(sps) < 2: 219 | if not args.allow_one_column: 220 | raise RuntimeError( 221 | 'Format error {}th line in {}: ' 222 | ' Expecting " ":\n>>> {}' 223 | .format(nutt, info[1], line)) 224 | uttid = sps[0] 225 | value = '' 226 | else: 227 | uttid, value = sps 228 | 229 | key = info[0] 230 | type_func = info[2] 231 | value = value.rstrip() 232 | 233 | if type_func is not None: 234 | try: 235 | # type_func: Callable[[str], Any] 236 | value = type_func(value) 237 | except Exception: 238 | logging.error('"{}" is an invalid function ' 239 | 'for the {} th line in {}: \n>>> {}' 240 | .format(info[4], nutt, info[1], line)) 241 | raise 242 | 243 | d[key] = value 244 | lis.append(d) 245 | 246 | if inout != 'other': 247 | entry[inout] = lis 248 | else: 249 | # If key == 'other'. 
only has the first item 250 | entry.update(lis[0]) 251 | 252 | entry = json.dumps(entry, indent=None, ensure_ascii=False, 253 | sort_keys=True, separators=(',', ': ')) 254 | # Add indent 255 | indent = ' ' * 2 256 | entry = ('\n' + indent).join(entry.split('\n')) 257 | 258 | uttid = first.split()[0] 259 | out.write(' "{}": {}'.format(uttid, entry)) 260 | 261 | out.write(' }\n}\n') 262 | 263 | logging.info('{} entries in {}'.format(nutt, out.name)) 264 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /local/files/asr.dict: -------------------------------------------------------------------------------- 1 | 1 2 | " 2 3 | ' 3 4 | . 
4 5 | A 5 6 | ABLE 6 7 | AC 7 8 | AD 8 9 | AGE 9 10 | AK 10 11 | AL 11 12 | AM 12 13 | AN 13 14 | ANCE 14 15 | ANT 15 16 | AR 16 17 | ARD 17 18 | ARY 18 19 | AS 19 20 | AT 20 21 | ATE 21 22 | ATION 22 23 | B 23 24 | BLE 24 25 | BO 25 26 | C 26 27 | CAME 27 28 | CE 28 29 | CH 29 30 | CK 30 31 | CLOCK 31 32 | CO 32 33 | D 33 34 | DA 34 35 | E 35 36 | ED 36 37 | EL 37 38 | EN 38 39 | ENT 39 40 | ER 40 41 | ERS 41 42 | ES 42 43 | EST 43 44 | EVER 44 45 | F 45 46 | FORD 46 47 | FUL 47 48 | G 48 49 | GE 49 50 | H 50 51 | HA 51 52 | HE 52 53 | I 53 54 | IA 54 55 | IC 55 56 | ICAL 56 57 | ID 57 58 | IE 58 59 | IES 59 60 | IGH 60 61 | IL 61 62 | IN 62 63 | INE 63 64 | ING 64 65 | ION 65 66 | IP 66 67 | IR 67 68 | IS 68 69 | ISE 69 70 | IST 70 71 | IT 71 72 | ITY 72 73 | IVE 73 74 | IZE 74 75 | J 75 76 | K 76 77 | KE 77 78 | L 78 79 | LA 79 80 | LAND 80 81 | LE 81 82 | LESS 82 83 | LI 83 84 | LIGHT 84 85 | LL 85 86 | LO 86 87 | LY 87 88 | M 88 89 | MA 89 90 | MAN 90 91 | ME 91 92 | MENT 92 93 | MP 93 94 | N 94 95 | NA 95 96 | NCE 96 97 | NE 97 98 | NESS 98 99 | O 99 100 | OL 100 101 | ON 101 102 | OOK 102 103 | OR 103 104 | OUND 104 105 | OW 105 106 | P 106 107 | PE 107 108 | PER 108 109 | Q 109 110 | QUE 110 111 | QUI 111 112 | R 112 113 | RA 113 114 | RE 114 115 | REET 115 116 | RI 116 117 | RO 117 118 | RROW 118 119 | RY 119 120 | S 120 121 | SE 121 122 | SH 122 123 | SIDE 123 124 | STREET 124 125 | T 125 126 | TE 126 127 | TED 127 128 | TER 128 129 | TH 129 130 | THER 130 131 | TING 131 132 | TION 132 133 | TURE 133 134 | TY 134 135 | U 135 136 | UGH 136 137 | UM 137 138 | UN 138 139 | UR 139 140 | US 140 141 | V 141 142 | VE 142 143 | VER 143 144 | W 144 145 | WARD 145 146 | WAY 146 147 | X 147 148 | Y 148 149 | Z 149 150 | ▁" 150 151 | ▁A 151 152 | ▁ABOUT 152 153 | ▁AC 153 154 | ▁ACCOUNT 154 155 | ▁ACROSS 155 156 | ▁ACTRESS 156 157 | ▁ACTUALLY 157 158 | ▁ADD 158 159 | ▁ADJUST 159 160 | ▁AFRAID 160 161 | ▁AFTER 161 162 | ▁AGAIN 162 163 | ▁AGAINST 163 164 | ▁AGO 164 165 | ▁AGREE 165 166 | ▁AIR 166 167 | ▁ALBUM 167 168 | ▁ALL 168 169 | ▁ALMOST 169 170 | ▁ALONE 170 171 | ▁ALONG 171 172 | ▁ALREADY 172 173 | ▁ALSO 173 174 | ▁ALWAYS 174 175 | ▁AM 175 176 | ▁AMERICA 176 177 | ▁AN 177 178 | ▁AND 178 179 | ▁ANGEL 179 180 | ▁ANIMAL 180 181 | ▁ANOTHER 181 182 | ▁ANSWER 182 183 | ▁ANY 183 184 | ▁ANYMORE 184 185 | ▁ANYONE 185 186 | ▁ANYTHING 186 187 | ▁APP 187 188 | ▁ARD 188 189 | ▁ARE 189 190 | ▁ARM 190 191 | ▁AROUND 191 192 | ▁ARRIVE 192 193 | ▁ART 193 194 | ▁ARTICLE 194 195 | ▁ARTIST 195 196 | ▁AS 196 197 | ▁ASK 197 198 | ▁AT 198 199 | ▁ATTACK 199 200 | ▁AVERAGE 200 201 | ▁AWARDS 201 202 | ▁AWAY 202 203 | ▁B 203 204 | ▁BA 204 205 | ▁BABY 205 206 | ▁BACK 206 207 | ▁BAD 207 208 | ▁BANK 208 209 | ▁BAR 209 210 | ▁BE 210 211 | ▁BEACH 211 212 | ▁BECAUSE 212 213 | ▁BECOME 213 214 | ▁BEEN 214 215 | ▁BEFORE 215 216 | ▁BEGAN 216 217 | ▁BEGIN 217 218 | ▁BEHIND 218 219 | ▁BEING 219 220 | ▁BELIEVE 220 221 | ▁BEST 221 222 | ▁BETTER 222 223 | ▁BETWEEN 223 224 | ▁BIG 224 225 | ▁BIT 225 226 | ▁BLACK 226 227 | ▁BLOOD 227 228 | ▁BLOW 228 229 | ▁BLUE 229 230 | ▁BO 230 231 | ▁BODY 231 232 | ▁BOOK 232 233 | ▁BOTH 233 234 | ▁BOTTLE 234 235 | ▁BOUGHT 235 236 | ▁BOX 236 237 | ▁BOY 237 238 | ▁BR 238 239 | ▁BRANCH 239 240 | ▁BREAK 240 241 | ▁BREATH 241 242 | ▁BRING 242 243 | ▁BROKE 243 244 | ▁BROTHER 244 245 | ▁BROUGHT 245 246 | ▁BUILD 246 247 | ▁BURN 247 248 | ▁BUS 248 249 | ▁BUSINESS 249 250 | ▁BUT 250 251 | ▁BUY 251 252 | ▁BY 252 253 | ▁C 253 254 | ▁CA 254 255 | ▁CAFE 255 256 | ▁CALL 256 257 | ▁CAME 257 258 | ▁CAN 258 259 | 
▁CAR 259 260 | ▁CARE 260 261 | ▁CASE 261 262 | ▁CAST 262 263 | ▁CATCH 263 264 | ▁CAUSE 264 265 | ▁CERTAIN 265 266 | ▁CH 266 267 | ▁CHA 267 268 | ▁CHANCE 268 269 | ▁CHANGE 269 270 | ▁CHANNEL 270 271 | ▁CHARGE 271 272 | ▁CHEAP 272 273 | ▁CHECK 273 274 | ▁CHILD 274 275 | ▁CHILDREN 275 276 | ▁CHINA 276 277 | ▁CHOICE 277 278 | ▁CINEMAS 278 279 | ▁CITY 279 280 | ▁CLASS 280 281 | ▁CLEAN 281 282 | ▁CLEAR 282 283 | ▁CLOCK 283 284 | ▁CLOSE 284 285 | ▁CLOUD 285 286 | ▁CLUB 286 287 | ▁CO 287 288 | ▁COFFEE 288 289 | ▁COLD 289 290 | ▁COLLEGE 290 291 | ▁COME 291 292 | ▁COMING 292 293 | ▁COMMON 293 294 | ▁COMPANY 294 295 | ▁COMPLETE 295 296 | ▁COMPUTER 296 297 | ▁CON 297 298 | ▁CONCERT 298 299 | ▁CONTROL 299 300 | ▁COOK 300 301 | ▁CORNER 301 302 | ▁COST 302 303 | ▁COULD 303 304 | ▁COUNT 304 305 | ▁COUNTRY 305 306 | ▁COUPLE 306 307 | ▁COURSE 307 308 | ▁CR 308 309 | ▁CRAZY 309 310 | ▁CREATE 310 311 | ▁CROSS 311 312 | ▁CURRENT 312 313 | ▁CUT 313 314 | ▁D 314 315 | ▁DA 315 316 | ▁DARK 316 317 | ▁DAUGHTER 317 318 | ▁DAY 318 319 | ▁DAYS 319 320 | ▁DE 320 321 | ▁DEAD 321 322 | ▁DEAL 322 323 | ▁DEATH 323 324 | ▁DECIDE 324 325 | ▁DECISION 325 326 | ▁DEEP 326 327 | ▁DELIVERY 327 328 | ▁DESIGN 328 329 | ▁DEVICE 329 330 | ▁DI 330 331 | ▁DID 331 332 | ▁DIDN 332 333 | ▁DIFFERENT 333 334 | ▁DINNER 334 335 | ▁DIRECT 335 336 | ▁DIS 336 337 | ▁DISCOUNT 337 338 | ▁DISHES 338 339 | ▁DO 339 340 | ▁DOCTOR 340 341 | ▁DOES 341 342 | ▁DOING 342 343 | ▁DON 343 344 | ▁DONE 344 345 | ▁DOUBT 345 346 | ▁DOWN 346 347 | ▁DRAW 347 348 | ▁DREAM 348 349 | ▁DRESS 349 350 | ▁DRINK 350 351 | ▁DRIVE 351 352 | ▁DROP 352 353 | ▁DU 353 354 | ▁E 354 355 | ▁EACH 355 356 | ▁EASY 356 357 | ▁EAT 357 358 | ▁ED 358 359 | ▁EFFECT 359 360 | ▁EIGHT 360 361 | ▁EIGHTEEN 361 362 | ▁EIGHTY 362 363 | ▁EITHER 363 364 | ▁ELEVEN 364 365 | ▁ELSE 365 366 | ▁EN 366 367 | ▁END 367 368 | ▁ENGLISH 368 369 | ▁ENJOY 369 370 | ▁ENOUGH 370 371 | ▁EPISODE 371 372 | ▁ER 372 373 | ▁EST 373 374 | ▁EVEN 374 375 | ▁EVER 375 376 | ▁EVERY 376 377 | ▁EVERYONE 377 378 | ▁EVERYTHING 378 379 | ▁EX 379 380 | ▁EXACTLY 380 381 | ▁EXAMPLE 381 382 | ▁EXCUSE 382 383 | ▁EXPECT 383 384 | ▁EXPLAIN 384 385 | ▁F 385 386 | ▁FACE 386 387 | ▁FACT 387 388 | ▁FAIL 388 389 | ▁FAIR 389 390 | ▁FALL 390 391 | ▁FAMILY 391 392 | ▁FAMOUS 392 393 | ▁FAN 393 394 | ▁FAR 394 395 | ▁FAST 395 396 | ▁FATHER 396 397 | ▁FEAR 397 398 | ▁FEATURE 398 399 | ▁FEEL 399 400 | ▁FELL 400 401 | ▁FELT 401 402 | ▁FEW 402 403 | ▁FIELD 403 404 | ▁FIFTEEN 404 405 | ▁FIFTY 405 406 | ▁FIGHT 406 407 | ▁FIGURE 407 408 | ▁FILM 408 409 | ▁FINAL 409 410 | ▁FIND 410 411 | ▁FINE 411 412 | ▁FINISH 412 413 | ▁FIRE 413 414 | ▁FIRST 414 415 | ▁FISH 415 416 | ▁FIVE 416 417 | ▁FIX 417 418 | ▁FLOOR 418 419 | ▁FLOW 419 420 | ▁FOLLOW 420 421 | ▁FOOD 421 422 | ▁FOOL 422 423 | ▁FOR 423 424 | ▁FORTY 424 425 | ▁FOUND 425 426 | ▁FOUR 426 427 | ▁FOURTEEN 427 428 | ▁FREE 428 429 | ▁FRESH 429 430 | ▁FRIEND 430 431 | ▁FRIENDS 431 432 | ▁FROM 432 433 | ▁FRONT 433 434 | ▁FUL 434 435 | ▁FULL 435 436 | ▁FUN 436 437 | ▁FUTURE 437 438 | ▁G 438 439 | ▁GA 439 440 | ▁GAME 440 441 | ▁GARDEN 441 442 | ▁GAVE 442 443 | ▁GENERAL 443 444 | ▁GET 444 445 | ▁GETTING 445 446 | ▁GIRL 446 447 | ▁GIVE 447 448 | ▁GLASS 448 449 | ▁GO 449 450 | ▁GOD 450 451 | ▁GOING 451 452 | ▁GOLD 452 453 | ▁GONNA 453 454 | ▁GOOD 454 455 | ▁GOT 455 456 | ▁GR 456 457 | ▁GRAND 457 458 | ▁GREAT 458 459 | ▁GREEN 459 460 | ▁GROUP 460 461 | ▁GROW 461 462 | ▁GUESS 462 463 | ▁GUY 463 464 | ▁H 464 465 | ▁HA 465 466 | ▁HAD 466 467 | ▁HALF 467 468 | ▁HAND 468 469 | ▁HAPPEN 469 470 | ▁HAPPY 470 471 | ▁HARD 471 
472 | ▁HAS 472 473 | ▁HAVE 473 474 | ▁HAVING 474 475 | ▁HE 475 476 | ▁HEAD 476 477 | ▁HEALTH 477 478 | ▁HEAR 478 479 | ▁HEART 479 480 | ▁HELP 480 481 | ▁HER 481 482 | ▁HERE 482 483 | ▁HI 483 484 | ▁HIGH 484 485 | ▁HIM 485 486 | ▁HIMSELF 486 487 | ▁HIS 487 488 | ▁HISTORY 488 489 | ▁HO 489 490 | ▁HOLD 490 491 | ▁HOME 491 492 | ▁HOPE 492 493 | ▁HORSE 493 494 | ▁HOSPITAL 494 495 | ▁HOTEL 495 496 | ▁HOUSE 496 497 | ▁HOW 497 498 | ▁HUMAN 498 499 | ▁HUNDRED 499 500 | ▁HURT 500 501 | ▁HUSBAND 501 502 | ▁I 502 503 | ▁IDEA 503 504 | ▁IF 504 505 | ▁IMAGINE 505 506 | ▁IMPORTANT 506 507 | ▁IN 507 508 | ▁INDIA 508 509 | ▁INTEREST 509 510 | ▁INTO 510 511 | ▁IS 511 512 | ▁ISSUE 512 513 | ▁IT 513 514 | ▁ITSELF 514 515 | ▁J 515 516 | ▁JA 516 517 | ▁JAPAN 517 518 | ▁JO 518 519 | ▁JOB 519 520 | ▁JOHN 520 521 | ▁JOURNEY 521 522 | ▁JU 522 523 | ▁JUST 523 524 | ▁K 524 525 | ▁KEEP 525 526 | ▁KEPT 526 527 | ▁KEY 527 528 | ▁KID 528 529 | ▁KILL 529 530 | ▁KIND 530 531 | ▁KITCHEN 531 532 | ▁KNEW 532 533 | ▁KNOW 533 534 | ▁L 534 535 | ▁LA 535 536 | ▁LADY 536 537 | ▁LAND 537 538 | ▁LANGUAGE 538 539 | ▁LARGE 539 540 | ▁LAST 540 541 | ▁LATE 541 542 | ▁LAW 542 543 | ▁LE 543 544 | ▁LEAD 544 545 | ▁LEARN 545 546 | ▁LEAST 546 547 | ▁LEAVE 547 548 | ▁LEFT 548 549 | ▁LEG 549 550 | ▁LESS 550 551 | ▁LET 551 552 | ▁LEVEL 552 553 | ▁LI 553 554 | ▁LIFE 554 555 | ▁LIGHT 555 556 | ▁LIKE 556 557 | ▁LINE 557 558 | ▁LISTEN 558 559 | ▁LITTLE 559 560 | ▁LIVE 560 561 | ▁LIVING 561 562 | ▁LL 562 563 | ▁LO 563 564 | ▁LOCAL 564 565 | ▁LOCATION 565 566 | ▁LONDON 566 567 | ▁LONG 567 568 | ▁LOOK 568 569 | ▁LOST 569 570 | ▁LOT 570 571 | ▁LOVE 571 572 | ▁LOW 572 573 | ▁LU 573 574 | ▁LY 574 575 | ▁LYRICS 575 576 | ▁M 576 577 | ▁MA 577 578 | ▁MACHINE 578 579 | ▁MADE 579 580 | ▁MAIN 580 581 | ▁MAKE 581 582 | ▁MAKING 582 583 | ▁MAN 583 584 | ▁MANY 584 585 | ▁MAR 585 586 | ▁MARKET 586 587 | ▁MATTER 587 588 | ▁MAY 588 589 | ▁MAYBE 589 590 | ▁ME 590 591 | ▁MEAN 591 592 | ▁MEET 592 593 | ▁MEMORY 593 594 | ▁MEN 594 595 | ▁MESSAGE 595 596 | ▁METHOD 596 597 | ▁MI 597 598 | ▁MIDDLE 598 599 | ▁MIGHT 599 600 | ▁MILLION 600 601 | ▁MIND 601 602 | ▁MINUTE 602 603 | ▁MIRROR 603 604 | ▁MISS 604 605 | ▁MISTAKE 605 606 | ▁MO 606 607 | ▁MODE 607 608 | ▁MOMENT 608 609 | ▁MONEY 609 610 | ▁MONTH 610 611 | ▁MORE 611 612 | ▁MORNING 612 613 | ▁MOST 613 614 | ▁MOTHER 614 615 | ▁MOVE 615 616 | ▁MOVIE 616 617 | ▁MP 617 618 | ▁MR 618 619 | ▁MU 619 620 | ▁MUCH 620 621 | ▁MUSIC 621 622 | ▁MUST 622 623 | ▁MY 623 624 | ▁MYSELF 624 625 | ▁N 625 626 | ▁NA 626 627 | ▁NAME 627 628 | ▁NE 628 629 | ▁NEAR 629 630 | ▁NEED 630 631 | ▁NESS 631 632 | ▁NEVER 632 633 | ▁NEW 633 634 | ▁NEWS 634 635 | ▁NEXT 635 636 | ▁NICE 636 637 | ▁NIGHT 637 638 | ▁NINE 638 639 | ▁NINETEEN 639 640 | ▁NINETY 640 641 | ▁NO 641 642 | ▁NOBODY 642 643 | ▁NORMAL 643 644 | ▁NORTH 644 645 | ▁NOT 645 646 | ▁NOTHING 646 647 | ▁NOW 647 648 | ▁NUMBER 648 649 | ▁O 649 650 | ▁OF 650 651 | ▁OFF 651 652 | ▁OFFICE 652 653 | ▁OFTEN 653 654 | ▁OH 654 655 | ▁OKAY 655 656 | ▁OLD 656 657 | ▁ON 657 658 | ▁ONCE 658 659 | ▁ONE 659 660 | ▁ONLINE 660 661 | ▁ONLY 661 662 | ▁OPEN 662 663 | ▁OPERA 663 664 | ▁OPTION 664 665 | ▁OR 665 666 | ▁ORDER 666 667 | ▁ORGAN 667 668 | ▁ORIGIN 668 669 | ▁ORIGINAL 669 670 | ▁OTHER 670 671 | ▁OUR 671 672 | ▁OUT 672 673 | ▁OVER 673 674 | ▁OW 674 675 | ▁OWN 675 676 | ▁P 676 677 | ▁PA 677 678 | ▁PARENTS 678 679 | ▁PARK 679 680 | ▁PART 680 681 | ▁PASS 681 682 | ▁PAST 682 683 | ▁PATIENT 683 684 | ▁PAY 684 685 | ▁PE 685 686 | ▁PEOPLE 686 687 | ▁PERSON 687 688 | ▁PHONE 688 689 | ▁PHOTO 689 690 | ▁PHRASE 690 691 | 
▁PICK 691 692 | ▁PICTURE 692 693 | ▁PIECE 693 694 | ▁PLACE 694 695 | ▁PLACES 695 696 | ▁PLAN 696 697 | ▁PLAY 697 698 | ▁PLAYLIST 698 699 | ▁PLEASE 699 700 | ▁PM 700 701 | ▁PO 701 702 | ▁POCKET 702 703 | ▁POINT 703 704 | ▁POLICE 704 705 | ▁POPULAR 705 706 | ▁POSITION 706 707 | ▁POWER 707 708 | ▁PRE 708 709 | ▁PREPARE 709 710 | ▁PRESSURE 710 711 | ▁PRETTY 711 712 | ▁PRICE 712 713 | ▁PRO 713 714 | ▁PROBABLY 714 715 | ▁PROBLEM 715 716 | ▁PRODUCE 716 717 | ▁PRODUCT 717 718 | ▁PROJECT 718 719 | ▁PROMISE 719 720 | ▁PUBLIC 720 721 | ▁PULL 721 722 | ▁PUT 722 723 | ▁Q 723 724 | ▁QUA 724 725 | ▁QUALITY 725 726 | ▁QUE 726 727 | ▁QUESTION 727 728 | ▁QUEUE 728 729 | ▁QUI 729 730 | ▁QUICK 730 731 | ▁QUIET 731 732 | ▁QUITE 732 733 | ▁R 733 734 | ▁RA 734 735 | ▁RADIO 735 736 | ▁RAIN 736 737 | ▁RATING 737 738 | ▁RE 738 739 | ▁REACH 739 740 | ▁READ 740 741 | ▁REAL 741 742 | ▁REALLY 742 743 | ▁REASON 743 744 | ▁RECENT 744 745 | ▁RECORD 745 746 | ▁RELEASED 746 747 | ▁REMEMBER 747 748 | ▁REMIND 748 749 | ▁REMOVE 749 750 | ▁REPEAT 750 751 | ▁REPLAY 751 752 | ▁REPORT 752 753 | ▁REPUBLIC 753 754 | ▁REST 754 755 | ▁RESTART 755 756 | ▁RESTAURANT 756 757 | ▁RESULT 757 758 | ▁RETURN 758 759 | ▁REVIEWS 759 760 | ▁RI 760 761 | ▁RICH 761 762 | ▁RID 762 763 | ▁RIGHT 763 764 | ▁RISE 764 765 | ▁RISK 765 766 | ▁RIVER 766 767 | ▁RO 767 768 | ▁ROAD 768 769 | ▁ROBOT 769 770 | ▁ROCK 770 771 | ▁ROOM 771 772 | ▁ROUND 772 773 | ▁RU 773 774 | ▁RUN 774 775 | ▁RY 775 776 | ▁S 776 777 | ▁SA 777 778 | ▁SAFE 778 779 | ▁SAID 779 780 | ▁SAME 780 781 | ▁SAW 781 782 | ▁SAY 782 783 | ▁SCHOOL 783 784 | ▁SCIENCE 784 785 | ▁SCREEN 785 786 | ▁SE 786 787 | ▁SECOND 787 788 | ▁SECRET 788 789 | ▁SEE 789 790 | ▁SEND 790 791 | ▁SENSE 791 792 | ▁SENTENCE 792 793 | ▁SERIES 793 794 | ▁SERIOUS 794 795 | ▁SERVICE 795 796 | ▁SET 796 797 | ▁SEVEN 797 798 | ▁SEVENTY 798 799 | ▁SH 799 800 | ▁SHALL 800 801 | ▁SHARE 801 802 | ▁SHE 802 803 | ▁SHOP 803 804 | ▁SHORT 804 805 | ▁SHOULD 805 806 | ▁SHOW 806 807 | ▁SHOWS 807 808 | ▁SHUT 808 809 | ▁SICK 809 810 | ▁SIDE 810 811 | ▁SIGN 811 812 | ▁SIMILAR 812 813 | ▁SIMPLE 813 814 | ▁SINCE 814 815 | ▁SINGLE 815 816 | ▁SISTER 816 817 | ▁SIT 817 818 | ▁SIX 818 819 | ▁SIXTEEN 819 820 | ▁SIXTY 820 821 | ▁SKIP 821 822 | ▁SLEEP 822 823 | ▁SLOW 823 824 | ▁SMALL 824 825 | ▁SMART 825 826 | ▁SNOW 826 827 | ▁SO 827 828 | ▁SOME 828 829 | ▁SOMEBODY 829 830 | ▁SOMEONE 830 831 | ▁SOMETHING 831 832 | ▁SONG 832 833 | ▁SONGS 833 834 | ▁SOON 834 835 | ▁SORRY 835 836 | ▁SORT 836 837 | ▁SOUL 837 838 | ▁SOUND 838 839 | ▁SOUTH 839 840 | ▁SP 840 841 | ▁SPACE 841 842 | ▁SPEAK 842 843 | ▁SPECIAL 843 844 | ▁SPEED 844 845 | ▁SPEND 845 846 | ▁ST 846 847 | ▁STAND 847 848 | ▁STAR 848 849 | ▁START 849 850 | ▁STATE 850 851 | ▁STAY 851 852 | ▁STEP 852 853 | ▁STEREO 853 854 | ▁STILL 854 855 | ▁STOP 855 856 | ▁STORIES 856 857 | ▁STORY 857 858 | ▁STRAIGHT 858 859 | ▁STRANGE 859 860 | ▁STREET 860 861 | ▁STRONG 861 862 | ▁STUDENT 862 863 | ▁STUDY 863 864 | ▁STUFF 864 865 | ▁SU 865 866 | ▁SUCH 866 867 | ▁SUDDEN 867 868 | ▁SUMMER 868 869 | ▁SUN 869 870 | ▁SUPPORT 870 871 | ▁SUPPOSE 871 872 | ▁SURE 872 873 | ▁SW 873 874 | ▁SWEET 874 875 | ▁SWITCH 875 876 | ▁SYSTEM 876 877 | ▁T 877 878 | ▁TA 878 879 | ▁TABLE 879 880 | ▁TAKE 880 881 | ▁TAKING 881 882 | ▁TALK 882 883 | ▁TALKING 883 884 | ▁TE 884 885 | ▁TEA 885 886 | ▁TEACHER 886 887 | ▁TELL 887 888 | ▁TEMPERATURE 888 889 | ▁TEN 889 890 | ▁TH 890 891 | ▁THAN 891 892 | ▁THANK 892 893 | ▁THAT 893 894 | ▁THE 894 895 | ▁THEIR 895 896 | ▁THEM 896 897 | ▁THEN 897 898 | ▁THERE 898 899 | ▁THESE 899 900 | ▁THEY 900 901 | ▁THING 
901 902 | ▁THINGS 902 903 | ▁THINK 903 904 | ▁THIRD 904 905 | ▁THIRTEEN 905 906 | ▁THIRTY 906 907 | ▁THIS 907 908 | ▁THOSE 908 909 | ▁THOUGH 909 910 | ▁THOUGHT 910 911 | ▁THOUSAND 911 912 | ▁THREE 912 913 | ▁THROUGH 913 914 | ▁THROW 914 915 | ▁TI 915 916 | ▁TICKET 916 917 | ▁TIME 917 918 | ▁TO 918 919 | ▁TODAY 919 920 | ▁TOGETHER 920 921 | ▁TOLD 921 922 | ▁TOMORROW 922 923 | ▁TONIGHT 923 924 | ▁TOO 924 925 | ▁TOOK 925 926 | ▁TOWN 926 927 | ▁TRA 927 928 | ▁TRAIN 928 929 | ▁TRAVEL 929 930 | ▁TREAT 930 931 | ▁TREE 931 932 | ▁TRIED 932 933 | ▁TRIP 933 934 | ▁TROUBLE 934 935 | ▁TRU 935 936 | ▁TRUE 936 937 | ▁TRY 937 938 | ▁TRYING 938 939 | ▁TURE 939 940 | ▁TURN 940 941 | ▁TWELVE 941 942 | ▁TWENTY 942 943 | ▁TWO 943 944 | ▁TYPE 944 945 | ▁U 945 946 | ▁UN 946 947 | ▁UNDER 947 948 | ▁UNDERSTAND 948 949 | ▁UNITED 949 950 | ▁UNTI 950 951 | ▁UP 951 952 | ▁UR 952 953 | ▁US 953 954 | ▁USE 954 955 | ▁USED 955 956 | ▁V 956 957 | ▁VA 957 958 | ▁VALUE 958 959 | ▁VE 959 960 | ▁VER 960 961 | ▁VERSION 961 962 | ▁VERY 962 963 | ▁VI 963 964 | ▁VIDEO 964 965 | ▁VIEW 965 966 | ▁VISIT 966 967 | ▁VOICE 967 968 | ▁W 968 969 | ▁WAIT 969 970 | ▁WALK 970 971 | ▁WALL 971 972 | ▁WANT 972 973 | ▁WAR 973 974 | ▁WAS 974 975 | ▁WATCH 975 976 | ▁WATER 976 977 | ▁WAY 977 978 | ▁WE 978 979 | ▁WEATHER 979 980 | ▁WEEK 980 981 | ▁WELL 981 982 | ▁WENT 982 983 | ▁WERE 983 984 | ▁WHAT 984 985 | ▁WHEN 985 986 | ▁WHERE 986 987 | ▁WHICH 987 988 | ▁WHILE 988 989 | ▁WHITE 989 990 | ▁WHO 990 991 | ▁WHY 991 992 | ▁WIFE 992 993 | ▁WILL 993 994 | ▁WIN 994 995 | ▁WIND 995 996 | ▁WINDOW 996 997 | ▁WISH 997 998 | ▁WITH 998 999 | ▁WITHOUT 999 1000 | ▁WOMAN 1000 1001 | ▁WOMEN 1001 1002 | ▁WON 1002 1003 | ▁WONDER 1003 1004 | ▁WOOD 1004 1005 | ▁WORD 1005 1006 | ▁WORDS 1006 1007 | ▁WORK 1007 1008 | ▁WORLD 1008 1009 | ▁WORRY 1009 1010 | ▁WORTH 1010 1011 | ▁WOULD 1011 1012 | ▁WRITE 1012 1013 | ▁WRONG 1013 1014 | ▁X 1014 1015 | ▁Y 1015 1016 | ▁YEAH 1016 1017 | ▁YEAR 1017 1018 | ▁YEARS 1018 1019 | ▁YES 1019 1020 | ▁YET 1020 1021 | ▁YORK 1021 1022 | ▁YOU 1022 1023 | ▁YOUNG 1023 1024 | ▁YOUR 1024 1025 | ▁YOURSELF 1025 1026 | ▁Z 1026 1027 | ▁ZERO 1027 1028 | -------------------------------------------------------------------------------- /module/track2_asr_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Transformer speech recognition model (pytorch).""" 5 | 6 | from argparse import Namespace 7 | from distutils.util import strtobool 8 | 9 | import logging 10 | import math 11 | 12 | import torch 13 | import pdb 14 | 15 | from espnet.nets.asr_interface import ASRInterface 16 | from espnet.nets.pytorch_backend.ctc import CTC 17 | from espnet.nets.pytorch_backend.e2e_asr import CTC_LOSS_THRESHOLD 18 | from espnet.nets.pytorch_backend.e2e_asr import Reporter 19 | from espnet.nets.pytorch_backend.nets_utils import get_subsample 20 | from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask 21 | from espnet.nets.pytorch_backend.nets_utils import th_accuracy 22 | from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos 23 | from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention 24 | from espnet.nets.pytorch_backend.transformer.decoder import Decoder 25 | from espnet.nets.pytorch_backend.transformer.encoder import Encoder 26 | from espnet.nets.pytorch_backend.transformer.initializer import initialize 27 | from 
espnet.nets.pytorch_backend.transformer.label_smoothing_loss import LabelSmoothingLoss 28 | from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask 29 | from espnet.nets.pytorch_backend.transformer.mask import target_mask 30 | from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport 31 | from espnet.nets.scorers.ctc import CTCPrefixScorer 32 | 33 | 34 | class E2E(ASRInterface, torch.nn.Module): 35 | """E2E module. 36 | 37 | :param int idim: dimension of inputs 38 | :param int odim: dimension of outputs 39 | :param Namespace args: argument Namespace containing options 40 | 41 | """ 42 | 43 | @staticmethod 44 | def add_arguments(parser): 45 | """Add arguments.""" 46 | group = parser.add_argument_group("transformer model setting") 47 | 48 | group.add_argument("--transformer-init", type=str, default="pytorch", 49 | choices=["pytorch", "xavier_uniform", "xavier_normal", 50 | "kaiming_uniform", "kaiming_normal"], 51 | help='how to initialize transformer parameters') 52 | group.add_argument("--transformer-input-layer", type=str, default="conv2d", 53 | choices=["conv2d", "linear", "embed"], 54 | help='transformer input layer type') 55 | group.add_argument('--transformer-attn-dropout-rate', default=None, type=float, 56 | help='dropout in transformer attention. use --dropout-rate if None is set') 57 | group.add_argument('--transformer-lr', default=10.0, type=float, 58 | help='Initial value of learning rate') 59 | group.add_argument('--transformer-warmup-steps', default=25000, type=int, 60 | help='optimizer warmup steps') 61 | group.add_argument('--transformer-length-normalized-loss', default=True, type=strtobool, 62 | help='normalize loss by length') 63 | 64 | group.add_argument('--dropout-rate', default=0.0, type=float, 65 | help='Dropout rate for the encoder') 66 | # Encoder 67 | group.add_argument('--elayers', default=4, type=int, 68 | help='Number of encoder layers (for shared recognition part in multi-speaker asr mode)') 69 | group.add_argument('--eunits', '-u', default=300, type=int, 70 | help='Number of encoder hidden units') 71 | # Attention 72 | group.add_argument('--adim', default=320, type=int, 73 | help='Number of attention transformation dimensions') 74 | group.add_argument('--aheads', default=4, type=int, 75 | help='Number of heads for multi head attention') 76 | # Decoder 77 | group.add_argument('--dlayers', default=1, type=int, 78 | help='Number of decoder layers') 79 | group.add_argument('--dunits', default=320, type=int, 80 | help='Number of decoder hidden units') 81 | return parser 82 | 83 | @property 84 | def attention_plot_class(self): 85 | """Return PlotAttentionReport.""" 86 | return PlotAttentionReport 87 | 88 | def __init__(self, idim, odim, args, ignore_id=-1): 89 | """Construct an E2E object. 
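The constructor below builds both the attention decoder and, when mtlalpha is positive, a CTC head; forward() later mixes the two objectives as loss = mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att. The following is a toy, self-contained sketch of that interpolation using torch.nn.functional losses in place of ESPnet's CTC and LabelSmoothingLoss wrappers; all shapes, the vocabulary size and alpha are made up.

import torch
import torch.nn.functional as F

B, T, L, V = 2, 50, 8, 30        # batch, encoder frames, target length, vocab size (illustrative)
alpha = 0.3                      # stands in for mtlalpha

enc_logits = torch.randn(B, T, V)            # stand-in for the CTC projection of encoder outputs
dec_logits = torch.randn(B, L, V)            # stand-in for the attention decoder outputs
targets = torch.randint(1, V, (B, L))        # id 0 reserved as the CTC blank

loss_ctc = F.ctc_loss(enc_logits.log_softmax(-1).transpose(0, 1),   # (T, B, V) as ctc_loss expects
                      targets,
                      input_lengths=torch.full((B,), T, dtype=torch.long),
                      target_lengths=torch.full((B,), L, dtype=torch.long),
                      blank=0)
loss_att = F.cross_entropy(dec_logits.reshape(-1, V), targets.reshape(-1))
loss = alpha * loss_ctc + (1 - alpha) * loss_att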
90 | 91 | :param int idim: dimension of inputs 92 | :param int odim: dimension of outputs 93 | :param Namespace args: argument Namespace containing options 94 | """ 95 | torch.nn.Module.__init__(self) 96 | if args.transformer_attn_dropout_rate is None: 97 | args.transformer_attn_dropout_rate = args.dropout_rate 98 | self.encoder = Encoder( 99 | idim=idim, 100 | attention_dim=args.adim, 101 | attention_heads=args.aheads, 102 | linear_units=args.eunits, 103 | num_blocks=args.elayers, 104 | input_layer=args.transformer_input_layer, 105 | dropout_rate=args.dropout_rate, 106 | positional_dropout_rate=args.dropout_rate, 107 | attention_dropout_rate=args.transformer_attn_dropout_rate 108 | ) 109 | self.decoder = Decoder( 110 | odim=odim, 111 | attention_dim=args.adim, 112 | attention_heads=args.aheads, 113 | linear_units=args.dunits, 114 | num_blocks=args.dlayers, 115 | dropout_rate=args.dropout_rate, 116 | positional_dropout_rate=args.dropout_rate, 117 | self_attention_dropout_rate=args.transformer_attn_dropout_rate, 118 | src_attention_dropout_rate=args.transformer_attn_dropout_rate 119 | ) 120 | self.sos = odim - 1 121 | self.eos = odim - 1 122 | self.odim = odim 123 | self.ignore_id = ignore_id 124 | self.subsample = get_subsample(args, mode='asr', arch='transformer') 125 | self.reporter = Reporter() 126 | 127 | # self.lsm_weight = a 128 | self.criterion = LabelSmoothingLoss(self.odim, self.ignore_id, args.lsm_weight, 129 | args.transformer_length_normalized_loss) 130 | # self.verbose = args.verbose 131 | self.reset_parameters(args) 132 | self.adim = args.adim 133 | self.mtlalpha = args.mtlalpha 134 | if args.mtlalpha > 0.0: 135 | self.ctc = CTC(odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True) 136 | else: 137 | self.ctc = None 138 | 139 | if args.report_cer or args.report_wer: 140 | from espnet.nets.e2e_asr_common import ErrorCalculator 141 | self.error_calculator = ErrorCalculator(args.char_list, 142 | args.sym_space, args.sym_blank, 143 | args.report_cer, args.report_wer) 144 | else: 145 | self.error_calculator = None 146 | self.rnnlm = None 147 | 148 | def reset_parameters(self, args): 149 | """Initialize parameters.""" 150 | # initialize parameters 151 | initialize(self, args.transformer_init) 152 | 153 | def forward(self, xs_pad, ilens, ys_pad): 154 | """E2E forward. 155 | 156 | :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim) 157 | :param torch.Tensor ilens: batch of lengths of source sequences (B) 158 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 159 | :return: ctc loass value 160 | :rtype: torch.Tensor 161 | :return: attention loss value 162 | :rtype: torch.Tensor 163 | :return: accuracy in attention decoder 164 | :rtype: float 165 | """ 166 | # 1. forward encoder 167 | xs_pad = xs_pad[:, :max(ilens)] # for data parallel 168 | src_mask = make_non_pad_mask(ilens.tolist()).to(xs_pad.device).unsqueeze(-2) 169 | hs_pad, hs_mask = self.encoder(xs_pad, src_mask) 170 | self.hs_pad = hs_pad 171 | 172 | # 2. forward decoder 173 | ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id) 174 | ys_mask = target_mask(ys_in_pad, self.ignore_id) 175 | pred_pad, pred_mask, _, _ = self.decoder(ys_in_pad, ys_mask, hs_pad, hs_mask) 176 | self.pred_pad = pred_pad 177 | 178 | # 3. 
179 |         loss_att = self.criterion(pred_pad, ys_out_pad)
180 |         self.acc = th_accuracy(pred_pad.view(-1, self.odim), ys_out_pad,
181 |                                ignore_label=self.ignore_id)
182 |
183 |         # TODO(karita) show predicted text
184 |         # TODO(karita) calculate these stats
185 |         cer_ctc = None
186 |         if self.mtlalpha == 0.0:
187 |             loss_ctc = None
188 |         else:
189 |             batch_size = xs_pad.size(0)
190 |             hs_len = hs_mask.view(batch_size, -1).sum(1)
191 |             loss_ctc = self.ctc(hs_pad.view(batch_size, -1, self.adim), hs_len, ys_pad)
192 |             if self.error_calculator is not None:
193 |                 ys_hat = self.ctc.argmax(hs_pad.view(batch_size, -1, self.adim)).data
194 |                 cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
195 |
196 |         # 5. compute cer/wer
197 |         if self.training or self.error_calculator is None:
198 |             cer, wer = None, None
199 |         else:
200 |             ys_hat = pred_pad.argmax(dim=-1)
201 |             cer, wer = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
202 |
203 |         # copied from e2e_asr
204 |         alpha = self.mtlalpha
205 |         if alpha == 0:
206 |             self.loss = loss_att
207 |             loss_att_data = float(loss_att)
208 |             loss_ctc_data = None
209 |         elif alpha == 1:
210 |             self.loss = loss_ctc
211 |             loss_att_data = None
212 |             loss_ctc_data = float(loss_ctc)
213 |         else:
214 |             self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
215 |             loss_att_data = float(loss_att)
216 |             loss_ctc_data = float(loss_ctc)
217 |
218 |         loss_data = float(self.loss)
219 |         if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
220 |             self.reporter.report(loss_ctc_data, loss_att_data, self.acc, cer_ctc, cer, wer, loss_data)
221 |         else:
222 |             logging.warning('loss (=%f) is not correct', loss_data)
223 |         return self.loss
224 |
225 |     def scorers(self):
226 |         """Scorers."""
227 |         return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
228 |
229 |     def encode(self, x):
230 |         """Encode acoustic features.
231 |
232 |         :param ndarray x: source acoustic feature (T, D)
233 |         :return: encoder outputs
234 |         :rtype: torch.Tensor
235 |         """
236 |         self.eval()
237 |         x = torch.as_tensor(x).unsqueeze(0)
238 |         enc_output, _ = self.encoder(x, None)
239 |         return enc_output.squeeze(0)
240 |
241 |     def recognize(self, x, recog_args, char_list=None, rnnlm=None, use_jit=False):
242 |         """Recognize input speech.
243 |
244 |         :param ndarray x: input acoustic feature (B, T, D) or (T, D)
245 |         :param Namespace recog_args: argument Namespace containing options
246 |         :param list char_list: list of characters
247 |         :param torch.nn.Module rnnlm: language model module
248 |         :return: N-best decoding results
249 |         :rtype: list
250 |         """
251 |         enc_output = self.encode(x).unsqueeze(0)
252 |         if recog_args.ctc_weight > 0.0:
253 |             lpz = self.ctc.log_softmax(enc_output)
254 |             lpz = lpz.squeeze(0)
255 |         else:
256 |             lpz = None
257 |
258 |         h = enc_output.squeeze(0)
259 |
260 |         logging.info('input lengths: ' + str(h.size(0)))
261 |         # search params
262 |         beam = recog_args.beam_size
263 |         penalty = recog_args.penalty
264 |         ctc_weight = recog_args.ctc_weight
265 |
266 |         # prepare sos
267 |         y = self.sos
268 |         vy = h.new_zeros(1).long()
269 |
270 |         if recog_args.maxlenratio == 0:
271 |             maxlen = h.shape[0]
272 |         else:
273 |             # maxlen >= 1
274 |             maxlen = max(1, int(recog_args.maxlenratio * h.size(0)))
275 |         minlen = int(recog_args.minlenratio * h.size(0))
276 |         logging.info('max output length: ' + str(maxlen))
277 |         logging.info('min output length: ' + str(minlen))
278 |
279 |         # initialize hypothesis
280 |         if rnnlm:
281 |             hyp = {'score': 0.0, 'yseq': [y], 'rnnlm_prev': None}
282 |         else:
283 |             hyp = {'score': 0.0, 'yseq': [y]}
284 |         if lpz is not None:
285 |             import numpy
286 |
287 |             from espnet.nets.ctc_prefix_score import CTCPrefixScore
288 |
289 |             ctc_prefix_score = CTCPrefixScore(lpz.detach().numpy(), 0, self.eos, numpy)
290 |             hyp['ctc_state_prev'] = ctc_prefix_score.initial_state()
291 |             hyp['ctc_score_prev'] = 0.0
292 |             if ctc_weight != 1.0:
293 |                 # pre-pruning based on attention scores
294 |                 from espnet.nets.pytorch_backend.rnn.decoders import CTC_SCORING_RATIO
295 |                 ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
296 |             else:
297 |                 ctc_beam = lpz.shape[-1]
298 |         hyps = [hyp]
299 |         ended_hyps = []
300 |
301 |         import six
302 |         traced_decoder = None
303 |         for i in six.moves.range(maxlen):
304 |             logging.debug('position ' + str(i))
305 |
306 |             hyps_best_kept = []
307 |             for hyp in hyps:
308 |                 vy.unsqueeze(1)
309 |                 vy[0] = hyp['yseq'][i]
310 |
311 |                 # get nbest local scores and their ids
312 |                 ys_mask = subsequent_mask(i + 1).unsqueeze(0)
313 |                 ys = torch.tensor(hyp['yseq']).unsqueeze(0)
314 |                 # FIXME: jit does not match non-jit result
315 |                 if use_jit:
316 |                     if traced_decoder is None:
317 |                         traced_decoder = torch.jit.trace(self.decoder.forward_one_step,
318 |                                                          (ys, ys_mask, enc_output))
319 |                     local_att_scores = traced_decoder(ys, ys_mask, enc_output)[0]
320 |                 else:
321 |                     local_att_scores = self.decoder.forward_one_step(ys, ys_mask, enc_output)[0]
322 |
323 |                 if rnnlm:
324 |                     rnnlm_state, local_lm_scores = rnnlm.predict(hyp['rnnlm_prev'], vy)
325 |                     local_scores = local_att_scores + recog_args.lm_weight * local_lm_scores
326 |                 else:
327 |                     local_scores = local_att_scores
328 |
329 |                 if lpz is not None:
330 |                     local_best_scores, local_best_ids = torch.topk(
331 |                         local_att_scores, ctc_beam, dim=1)
332 |                     ctc_scores, ctc_states = ctc_prefix_score(
333 |                         hyp['yseq'], local_best_ids[0], hyp['ctc_state_prev'])
334 |                     local_scores = \
335 |                         (1.0 - ctc_weight) * local_att_scores[:, local_best_ids[0]] \
336 |                         + ctc_weight * torch.from_numpy(ctc_scores - hyp['ctc_score_prev'])
337 |                     if rnnlm:
338 |                         local_scores += recog_args.lm_weight * local_lm_scores[:, local_best_ids[0]]
339 |                     local_best_scores, joint_best_ids = torch.topk(local_scores, beam, dim=1)
340 |                     local_best_ids = local_best_ids[:, joint_best_ids[0]]
341 |                 else:
342 |                     local_best_scores, local_best_ids = torch.topk(local_scores, beam, dim=1)
343 |
344 |                 for j in six.moves.range(beam):
345 |                     new_hyp = {}
346 |                     new_hyp['score'] = hyp['score'] + float(local_best_scores[0, j])
347 |                     new_hyp['yseq'] = [0] * (1 + len(hyp['yseq']))
348 |                     new_hyp['yseq'][:len(hyp['yseq'])] = hyp['yseq']
349 |                     new_hyp['yseq'][len(hyp['yseq'])] = int(local_best_ids[0, j])
350 |                     if rnnlm:
351 |                         new_hyp['rnnlm_prev'] = rnnlm_state
352 |                     if lpz is not None:
353 |                         new_hyp['ctc_state_prev'] = ctc_states[joint_best_ids[0, j]]
354 |                         new_hyp['ctc_score_prev'] = ctc_scores[joint_best_ids[0, j]]
355 |                     # will be (2 x beam) hyps at most
356 |                     hyps_best_kept.append(new_hyp)
357 |
358 |                 hyps_best_kept = sorted(
359 |                     hyps_best_kept, key=lambda x: x['score'], reverse=True)[:beam]
360 |
361 |             # sort and get nbest
362 |             hyps = hyps_best_kept
363 |             logging.debug('number of pruned hypotheses: ' + str(len(hyps)))
364 |             if char_list is not None:
365 |                 logging.debug(
366 |                     'best hypo: ' + ''.join([char_list[int(x)] for x in hyps[0]['yseq'][1:]]))
367 |
368 |             # add eos in the final loop to avoid that there are no ended hyps
369 |             if i == maxlen - 1:
370 |                 logging.info('adding <eos> in the last position in the loop')
371 |                 for hyp in hyps:
372 |                     hyp['yseq'].append(self.eos)
373 |
374 |             # add ended hypotheses to a final list, and remove them from the current hypotheses
375 |             # (this will be a problem, number of hyps < beam)
376 |             remained_hyps = []
377 |             for hyp in hyps:
378 |                 if hyp['yseq'][-1] == self.eos:
379 |                     # only store the sequence that has more than minlen outputs
380 |                     # also add penalty
381 |                     if len(hyp['yseq']) > minlen:
382 |                         hyp['score'] += (i + 1) * penalty
383 |                         if rnnlm:  # Word LM needs to add final score
384 |                             hyp['score'] += recog_args.lm_weight * rnnlm.final(
385 |                                 hyp['rnnlm_prev'])
386 |                         ended_hyps.append(hyp)
387 |                 else:
388 |                     remained_hyps.append(hyp)
389 |
390 |             # end detection
391 |             from espnet.nets.e2e_asr_common import end_detect
392 |             if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
393 |                 logging.info('end detected at %d', i)
394 |                 break
395 |
396 |             hyps = remained_hyps
397 |             if len(hyps) > 0:
398 |                 logging.debug('remaining hypotheses: ' + str(len(hyps)))
399 |             else:
400 |                 logging.info('no hypothesis. Finish decoding.')
401 |                 break
402 |
403 |             if char_list is not None:
404 |                 for hyp in hyps:
405 |                     logging.debug(
406 |                         'hypo: ' + ''.join([char_list[int(x)] for x in hyp['yseq'][1:]]))
407 |
408 |             logging.debug('number of ended hypotheses: ' + str(len(ended_hyps)))
409 |
410 |         nbest_hyps = sorted(
411 |             ended_hyps, key=lambda x: x['score'], reverse=True)[:min(len(ended_hyps), recog_args.nbest)]
412 |
413 |         # check number of hypotheses
414 |         if len(nbest_hyps) == 0:
415 |             logging.warning('there are no N-best results, performing recognition again with a smaller minlenratio.')
416 |             # should copy because Namespace will be overwritten globally
417 |             recog_args = Namespace(**vars(recog_args))
418 |             recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
419 |             return self.recognize(x, recog_args, char_list, rnnlm)
420 |
421 |         logging.info('total log probability: ' + str(nbest_hyps[0]['score']))
422 |         logging.info('normalized log probability: ' + str(nbest_hyps[0]['score'] / len(nbest_hyps[0]['yseq'])))
423 |         return nbest_hyps
424 |
425 |     def calculate_all_attentions(self, xs_pad, ilens, ys_pad):
426 |         """E2E attention calculation.
427 |
428 |         :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
429 |         :param torch.Tensor ilens: batch of lengths of input sequences (B)
430 |         :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
431 |         :return: attention weights with the following shape,
432 |             1) multi-head case => attention weights (B, H, Lmax, Tmax),
433 |             2) other case => attention weights (B, Lmax, Tmax).
434 |         :rtype: float ndarray
435 |         """
436 |         with torch.no_grad():
437 |             self.forward(xs_pad, ilens, ys_pad)
438 |         ret = dict()
439 |         for name, m in self.named_modules():
440 |             if isinstance(m, MultiHeadedAttention):
441 |                 ret[name] = m.attn.cpu().numpy()
442 |         return ret
443 |
--------------------------------------------------------------------------------
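The E2E class above interpolates the CTC and attention objectives as loss = mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att and reuses the last output id as both the start and end symbol. The following is a minimal usage sketch, assuming the class is importable as module.track2_asr_transformer.E2E; the feature/label shapes, vocabulary size, and the extra Namespace fields set by hand (lsm_weight, mtlalpha, ctc_type, char_list, sym_space, sym_blank, report_cer, report_wer) are illustrative assumptions, not values taken from the recipe scripts.

import argparse

import torch

from module.track2_asr_transformer import E2E  # assumed import path for the class above

# Build the argument Namespace the model expects: add_arguments() supplies the
# transformer options; the remaining fields are filled in by hand (assumed values).
parser = argparse.ArgumentParser()
E2E.add_arguments(parser)
args = parser.parse_args([])      # take the defaults defined in add_arguments
args.lsm_weight = 0.1             # label smoothing weight (assumed)
args.mtlalpha = 0.3               # CTC/attention interpolation weight (assumed)
args.ctc_type = "builtin"         # PyTorch built-in CTC loss (assumed)
args.char_list = None
args.sym_space = "<space>"
args.sym_blank = "<blank>"
args.report_cer = False
args.report_wer = False

idim, odim = 80, 5002             # e.g. 80-dim fbank features, 5002 output tokens (assumed)
model = E2E(idim, odim, args)

# One dummy batch: two utterances of padded features and padded token ids,
# with -1 marking label padding (the default ignore_id).
xs_pad = torch.randn(2, 100, idim)
ilens = torch.tensor([100, 80])
ys_pad = torch.randint(1, odim - 2, (2, 12))
ys_pad[1, 8:] = -1

loss = model(xs_pad, ilens, ys_pad)   # mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att
loss.backward()
print(float(loss), model.acc)

In the recipe itself, training is driven through track2_espnet_transformer_train.sh and the ESPnet trainer rather than a hand-written loop like this; the sketch is only meant to make the expected inputs and the multitask loss weighting concrete.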