├── steps ├── decode_si.sh ├── train_nnet.sh ├── append_feats.sh ├── decode_nnet.sh ├── tandem │ └── decode_si.sh ├── score_kaldi.sh ├── score_kaldi_compare.sh ├── nnet2 │ ├── get_num_frames.sh │ ├── get_ivector_id.sh │ ├── check_ivectors_compatible.sh │ └── remove_egs.sh ├── tfrnnlm │ ├── check_py.py │ └── check_tensorflow_installed.sh ├── libs │ ├── common.pyc │ ├── __init__.pyc │ ├── nnet3 │ │ ├── __init__.pyc │ │ ├── xconfig │ │ │ ├── gru.pyc │ │ │ ├── lstm.pyc │ │ │ ├── utils.pyc │ │ │ ├── __init__.pyc │ │ │ ├── layers.pyc │ │ │ ├── parser.pyc │ │ │ ├── attention.pyc │ │ │ ├── basic_layers.pyc │ │ │ ├── convolution.pyc │ │ │ ├── stats_layer.pyc │ │ │ ├── trivial_layers.pyc │ │ │ ├── layers.py │ │ │ └── __init__.py │ │ ├── train │ │ │ ├── common.pyc │ │ │ ├── __init__.pyc │ │ │ ├── dropout_schedule.pyc │ │ │ ├── chain_objf │ │ │ │ ├── __init__.pyc │ │ │ │ ├── acoustic_model.pyc │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── frame_level_objf │ │ │ │ └── __init__.py │ │ ├── report │ │ │ ├── __init__.pyc │ │ │ ├── log_parse.pyc │ │ │ └── __init__.py │ │ └── __init__.py │ └── __init__.py ├── data │ ├── reverberate_data_dir.pyc │ ├── data_dir_manipulation_lib.pyc │ ├── __pycache__ │ │ ├── reverberate_data_dir.cpython-36.pyc │ │ └── data_dir_manipulation_lib.cpython-36.pyc │ └── data_dir_manipulation_lib.py ├── nnet3 │ ├── chain │ │ ├── e2e │ │ │ └── README.txt │ │ └── gen_topo.pl │ └── nnet3_to_dot.sh ├── conf │ ├── convert_ctm_to_tra.py │ ├── lattice_depth_per_frame.sh │ ├── parse_arpa_unigrams.py │ └── prepare_word_categories.py ├── segmentation │ ├── internal │ │ ├── verify_phones_list.py │ │ └── find_oov_phone.py │ ├── copy_targets_dir.sh │ ├── combine_targets_dirs.sh │ ├── decode_sad.sh │ └── post_process_sad_to_segments.sh ├── online │ └── nnet2 │ │ └── copy_ivector_dir.sh ├── word_align_lattices.sh ├── scoring │ └── score_kaldi_compare.sh ├── cleanup │ └── make_utterance_fsts.pl └── subset_ali_dir.sh ├── utils ├── pbs.pl ├── run.pl ├── fix_ctm.sh ├── queue.pl ├── slurm.pl ├── convert_ctm.pl ├── data │ ├── split_data.sh │ ├── combine_data.sh │ ├── copy_data_dir.sh │ ├── fix_data_dir.sh │ ├── subset_data_dir.sh │ ├── validate_data_dir.sh │ ├── perturb_data_dir_speed.sh │ ├── get_reco2utt_for_data.sh │ ├── get_num_frames.sh │ ├── get_segments_for_data.sh │ ├── extract_wav_segments_data_dir.sh │ ├── get_utt2num_frames.sh │ ├── resample_data_dir.sh │ ├── convert_data_dir_to_whole.sh │ ├── limit_feature_dim.sh │ ├── modify_speaker_info_to_recording.sh │ └── shift_feats.sh ├── lang │ ├── prepare_lang.sh │ ├── validate_lang.pl │ └── add_lex_disambig.pl ├── subset_data_dir_tr_cv.sh ├── filt.py ├── make_absolute.sh ├── ctm │ └── fix_ctm.sh ├── spk2utt_to_utt2spk.pl ├── s2eps.pl ├── eps2disambig.pl ├── build_const_arpa_lm.sh ├── summarize_warnings.pl ├── utt2spk_to_spk2utt.pl ├── shuffle_list.pl ├── analyze_segments.pl ├── show_lattice.sh ├── best_wer.sh ├── remove_oovs.pl ├── add_disambig.pl ├── remove_data_links.sh ├── nnet │ ├── gen_hamm_mat.py │ └── gen_splice.py ├── parallel │ └── limit_num_gpus.sh └── ln.pl ├── conf ├── pitch.conf ├── online_pitch.conf ├── g2p_model ├── online_cmvn.conf ├── decode.config ├── mfcc.conf ├── pinyin_initial ├── fbank.conf ├── cmu2pinyin ├── mfcc_hires.conf └── pinyin2cmu ├── .gitmodules ├── local ├── kaggle │ ├── __pycache__ │ │ ├── xlsx.cpython-36.pyc │ │ └── parse_choices.cpython-36.pyc │ ├── get_ppl.py │ ├── get_best_lambda.py │ ├── accumulate_lambda.py │ ├── get_id_list.py │ ├── check_sample_rate.sh │ ├── see_decode_time.sh │ ├── 
google_drive_download.sh │ ├── choose_lm.py │ ├── copy_error_files.py │ ├── max_ppl.py │ ├── test_lambda.sh │ ├── replace_iflytek_answer.py │ ├── choose_lm2.py │ ├── choose_lm.sh │ ├── parse_text.py │ ├── add.py │ ├── test │ │ ├── select_lm.py │ │ └── decode_test.sh │ ├── check_output.py │ ├── choose_lm2.sh │ ├── mix_LM_with_A.sh │ ├── data_prep_wav_seperate.py │ ├── replace_choice.py │ ├── demo.py │ ├── mix_LM_with_A.py │ ├── decode_demo.sh │ ├── replace_iflytek_choice.py │ ├── decode_kaggle_simulate.sh │ └── decode_kaggle.sh ├── data │ ├── get_total_dur.sh │ ├── __pycache__ │ │ ├── number2chinese.cpython-36.pyc │ │ ├── parse_choices.cpython-36.pyc │ │ ├── data_prep_kaggle.cpython-36.pyc │ │ └── normalize_utils.cpython-36.pyc │ ├── get_total_dur.py │ ├── data_prep_wav.sh │ ├── data_prep_Tl.sh │ ├── data_prep_NER.sh │ ├── data_prep_TOCFL.sh │ ├── data_prep_MATBN.sh │ ├── data_prep_seame.sh │ ├── data_prep_kaggle.sh │ ├── clean_up_data.sh │ ├── data_prep_PTS.sh │ ├── data_prep_cyberon_english.sh │ ├── data_prep_cyberon_chinese.sh │ ├── data_prep_PTS.py │ ├── corpus_path.sh │ ├── data_prep_noise.sh │ ├── word_segmentation.py │ ├── data_prep_wav.py │ ├── extract_ptt.py │ ├── fix_segments.py │ ├── data_prep_TOCFL.py │ ├── data_prep_NER.py │ ├── extract_wiki.py │ ├── data_prep_Tl.py │ ├── normalize.py │ ├── normalize_text.py │ └── merge_json.py ├── score.sh ├── lm │ ├── wfst │ │ ├── compose.sh │ │ ├── generate_choice_fst.sh │ │ ├── temp2.sh │ │ ├── run_wfst.sh │ │ └── format_data.sh │ ├── dirty │ │ ├── train_lms.sh │ │ ├── temp2.sh │ │ ├── mix_lm2.sh │ │ ├── temp.sh │ │ ├── lm_to_carpa.sh │ │ ├── get_3gram_prune.sh │ │ ├── mix_lm3.sh │ │ ├── kaggle4.sh │ │ ├── parse_text.py │ │ ├── compile_lm.sh │ │ ├── format_lm_from_text.sh │ │ └── mix_all_lms.sh │ ├── get_best_lambda.py │ ├── get_best_lambda2.py │ ├── run_4gram.sh │ ├── mix_lm2_test.sh │ ├── get_all_context.py │ ├── get_all_problem.py │ ├── mix_lm3_test.sh │ ├── generate_ori.sh │ ├── get_all_choices.py │ ├── prune_all_lm.sh │ ├── run_3gram_kaggle5.sh │ ├── news_crawler.py │ └── run_3gram.sh ├── show_all_cer.sh ├── modify_utt2spk.sh ├── change_machine.sh ├── nnet │ ├── copy_alignment.sh │ ├── DFSMN_M.proto │ ├── DFSMN_S.proto │ ├── DFSMN_M.proto.2560 │ ├── DFSMN_M.proto.8136 │ ├── DFSMN_S.proto.2560 │ ├── DFSMN_S.proto.8136 │ ├── DFSMN_M_ivector.proto │ ├── DFSMN_S_ivector.proto │ ├── DFSMN_M_ivector.proto.2560 │ ├── DFSMN_S_ivector.proto.2560 │ ├── retrain.sh │ ├── augment_data_only_kgb_noise.sh │ ├── DFSMN_L.proto │ ├── DFSMN_L.proto.2560 │ ├── DFSMN_L.proto.8136 │ ├── DFSMN_L_ivector.proto │ ├── DFSMN_L_ivector.proto.2560 │ └── augment_data.sh ├── combine_kaggle.sh ├── temp.sh ├── create_oov_char_lexicon.pl ├── extract_kaggle_feature.sh └── format_data.sh ├── path.sh ├── cmd.sh └── README.md /steps/decode_si.sh: -------------------------------------------------------------------------------- 1 | decode.sh -------------------------------------------------------------------------------- /utils/pbs.pl: -------------------------------------------------------------------------------- 1 | parallel/pbs.pl -------------------------------------------------------------------------------- /utils/run.pl: -------------------------------------------------------------------------------- 1 | parallel/run.pl -------------------------------------------------------------------------------- /steps/train_nnet.sh: -------------------------------------------------------------------------------- 1 | nnet/train.sh 
-------------------------------------------------------------------------------- /utils/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | ctm/fix_ctm.sh -------------------------------------------------------------------------------- /utils/queue.pl: -------------------------------------------------------------------------------- 1 | parallel/queue.pl -------------------------------------------------------------------------------- /utils/slurm.pl: -------------------------------------------------------------------------------- 1 | parallel/slurm.pl -------------------------------------------------------------------------------- /steps/append_feats.sh: -------------------------------------------------------------------------------- 1 | paste_feats.sh -------------------------------------------------------------------------------- /steps/decode_nnet.sh: -------------------------------------------------------------------------------- 1 | nnet/decode.sh -------------------------------------------------------------------------------- /steps/tandem/decode_si.sh: -------------------------------------------------------------------------------- 1 | decode.sh -------------------------------------------------------------------------------- /utils/convert_ctm.pl: -------------------------------------------------------------------------------- 1 | ctm/convert_ctm.pl -------------------------------------------------------------------------------- /conf/pitch.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=8000 2 | -------------------------------------------------------------------------------- /utils/data/split_data.sh: -------------------------------------------------------------------------------- 1 | ../split_data.sh -------------------------------------------------------------------------------- /steps/score_kaldi.sh: -------------------------------------------------------------------------------- 1 | scoring/score_kaldi_wer.sh -------------------------------------------------------------------------------- /utils/data/combine_data.sh: -------------------------------------------------------------------------------- 1 | ../combine_data.sh -------------------------------------------------------------------------------- /utils/data/copy_data_dir.sh: -------------------------------------------------------------------------------- 1 | ../copy_data_dir.sh -------------------------------------------------------------------------------- /utils/data/fix_data_dir.sh: -------------------------------------------------------------------------------- 1 | ../fix_data_dir.sh -------------------------------------------------------------------------------- /utils/lang/prepare_lang.sh: -------------------------------------------------------------------------------- 1 | ../prepare_lang.sh -------------------------------------------------------------------------------- /utils/lang/validate_lang.pl: -------------------------------------------------------------------------------- 1 | ../validate_lang.pl -------------------------------------------------------------------------------- /conf/online_pitch.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | -------------------------------------------------------------------------------- /utils/data/subset_data_dir.sh: -------------------------------------------------------------------------------- 1 | 
../subset_data_dir.sh -------------------------------------------------------------------------------- /utils/lang/add_lex_disambig.pl: -------------------------------------------------------------------------------- 1 | ../add_lex_disambig.pl -------------------------------------------------------------------------------- /steps/score_kaldi_compare.sh: -------------------------------------------------------------------------------- 1 | scoring/score_kaldi_compare.sh -------------------------------------------------------------------------------- /utils/data/validate_data_dir.sh: -------------------------------------------------------------------------------- 1 | ../validate_data_dir.sh -------------------------------------------------------------------------------- /utils/subset_data_dir_tr_cv.sh: -------------------------------------------------------------------------------- 1 | nnet/subset_data_tr_cv.sh -------------------------------------------------------------------------------- /steps/nnet2/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | ../../utils/data/get_num_frames.sh -------------------------------------------------------------------------------- /utils/data/perturb_data_dir_speed.sh: -------------------------------------------------------------------------------- 1 | ../perturb_data_dir_speed.sh -------------------------------------------------------------------------------- /steps/tfrnnlm/check_py.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | -------------------------------------------------------------------------------- /conf/g2p_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/conf/g2p_model -------------------------------------------------------------------------------- /steps/libs/common.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/common.pyc -------------------------------------------------------------------------------- /steps/libs/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/__init__.pyc -------------------------------------------------------------------------------- /conf/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/gru.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/gru.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/train/common.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/common.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/lstm.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/lstm.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/utils.pyc -------------------------------------------------------------------------------- /steps/data/reverberate_data_dir.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/reverberate_data_dir.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/report/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/report/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/report/log_parse.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/report/log_parse.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/train/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/layers.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/parser.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/parser.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/attention.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/attention.pyc -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "local/data/tool/jieba-zh_TW"] 2 | path = local/data/tool/jieba-zh_TW 3 | url = https://github.com/APCLab/jieba-tw 4 | -------------------------------------------------------------------------------- /steps/data/data_dir_manipulation_lib.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/data_dir_manipulation_lib.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/basic_layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/basic_layers.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/convolution.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/convolution.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/stats_layer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/stats_layer.pyc -------------------------------------------------------------------------------- /local/kaggle/__pycache__/xlsx.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/kaggle/__pycache__/xlsx.cpython-36.pyc -------------------------------------------------------------------------------- /local/kaggle/get_ppl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | S = sys.stdin.read() 3 | start = S.find('ppl=') 4 | endd = S.find('ppl1=') 5 | print(S[start+5:endd]) 6 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/dropout_schedule.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/dropout_schedule.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/trivial_layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/trivial_layers.pyc -------------------------------------------------------------------------------- /conf/decode.config: -------------------------------------------------------------------------------- 1 | beam=11.0 # beam for decoding. Was 13.0 in the scripts. 2 | first_beam=8.0 # beam for 1st-pass decoding in SAT. 
3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /local/data/get_total_dur.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | datadir=$1 3 | bash utils/data/get_utt2dur.sh $datadir 4 | python local/data/get_total_dur.py $datadir 5 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/chain_objf/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/chain_objf/__init__.pyc -------------------------------------------------------------------------------- /conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false # only non-default option. 2 | --sample-frequency=16000 # Switchboard is sampled at 8kHz 3 | --allow_downsample=true 4 | -------------------------------------------------------------------------------- /local/data/__pycache__/number2chinese.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/number2chinese.cpython-36.pyc -------------------------------------------------------------------------------- /local/data/__pycache__/parse_choices.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/parse_choices.cpython-36.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/train/chain_objf/acoustic_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/chain_objf/acoustic_model.pyc -------------------------------------------------------------------------------- /local/data/__pycache__/data_prep_kaggle.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/data_prep_kaggle.cpython-36.pyc -------------------------------------------------------------------------------- /local/data/__pycache__/normalize_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/normalize_utils.cpython-36.pyc -------------------------------------------------------------------------------- /local/kaggle/__pycache__/parse_choices.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/kaggle/__pycache__/parse_choices.cpython-36.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/report/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | from . 
import log_parse 7 | 8 | __all__ = ["log_parse"] 9 | -------------------------------------------------------------------------------- /local/score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -o pipefail 4 | set -x 5 | steps/score_kaldi.sh "$@" 6 | steps/scoring/score_kaldi_cer.sh --stage 2 "$@" 7 | 8 | echo "$0: Done" 9 | -------------------------------------------------------------------------------- /steps/data/__pycache__/reverberate_data_dir.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/__pycache__/reverberate_data_dir.cpython-36.pyc -------------------------------------------------------------------------------- /local/lm/wfst/compose.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | 4 | 5 | fsttablecompose G.fst b.fst | \ 6 | fstdeterminizestar --use-log=true | \ 7 | fstminimizeencoded > bG.fst 8 | -------------------------------------------------------------------------------- /conf/pinyin_initial: -------------------------------------------------------------------------------- 1 | B 2 | C 3 | CH 4 | D 5 | F 6 | G 7 | H 8 | J 9 | K 10 | L 11 | M 12 | N 13 | P 14 | Q 15 | R 16 | S 17 | SH 18 | T 19 | W 20 | X 21 | Y 22 | Z 23 | ZH 24 | -------------------------------------------------------------------------------- /steps/data/__pycache__/data_dir_manipulation_lib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/__pycache__/data_dir_manipulation_lib.cpython-36.pyc -------------------------------------------------------------------------------- /local/lm/dirty/train_lms.sh: -------------------------------------------------------------------------------- 1 | . 
../path.sh 2 | for x in guan water nie 20years laotsan water ; do 3 | ngram-count -text text_test/$x\.txt -lm text_test/$x\.lm -vocab text_test/vocab.txt -limit-vocab -order 4 4 | done 5 | -------------------------------------------------------------------------------- /local/show_all_cer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dirs=$1 3 | [ -z $dirs ] && dirs="exp/* exp/nnet/* exp/aishell2/* exp/nnet/aishell2/*" 4 | for x in $dirs/decode*; do 5 | [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; 6 | done 7 | -------------------------------------------------------------------------------- /local/lm/get_best_lambda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | log_file = sys.argv[1] 3 | s = open(log_file,encoding='utf8').read() 4 | start = s.find('best lambda') 5 | start = start + 13 6 | end = s[start:].find(' ') 7 | end+=start 8 | print(s[start:end]) 9 | -------------------------------------------------------------------------------- /local/kaggle/get_best_lambda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | log_file = sys.argv[1] 3 | s = open(log_file,encoding='utf8').read() 4 | start = s.find('best lambda') 5 | start = start + 13 6 | end = s[start:].find(' ') 7 | end+=start 8 | print(s[start:end]) 9 | -------------------------------------------------------------------------------- /steps/libs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | """ This package contains modules and subpackages used in kaldi scripts. 7 | """ 8 | 9 | from . import common 10 | 11 | __all__ = ["common"] 12 | -------------------------------------------------------------------------------- /local/lm/dirty/temp2.sh: -------------------------------------------------------------------------------- 1 | for novel in journey_west red_mansion ; do 2 | for x in A B C ; do 3 | local/mix_lm3.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\_kaggle12.lm $x\_lambda 4 | done 5 | done 6 | wait 7 | 8 | -------------------------------------------------------------------------------- /local/lm/get_best_lambda2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | log_file = sys.argv[1] 3 | s = open(log_file,encoding='utf8').read() 4 | start = s.find('best lambda') 5 | start = start + 13 6 | end = s[start:].find(' ') 7 | end+=start 8 | end2 = s[end+1:].find(' ') 9 | print(s[end+1:][:end2]) 10 | -------------------------------------------------------------------------------- /local/kaggle/accumulate_lambda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | best_lambda_file = sys.argv[1] 4 | L = [] 5 | with open(best_lambda_file,'r',encoding='utf-8') as f: 6 | for line in f: 7 | L.append(float(line.rstrip())) 8 | print(np.mean(L),np.var(L)) 9 | 10 | -------------------------------------------------------------------------------- /conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --htk-compat=false 2 | --window-type=hamming # disable Dans window, use the standard 3 | --use-energy=false # only fbank outputs 4 | --dither=1 5 | --num-mel-bins=80 # 8 filters/octave, 40 filters/16Khz as used by IBM 6 | 
--sample-frequency=16000 7 | --allow_downsample=true 8 | -------------------------------------------------------------------------------- /local/data/get_total_dur.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | datadir = sys.argv[1] 3 | 4 | total = 0.0 5 | with open(os.path.join(datadir,'utt2dur'),'r') as f: 6 | for line in f: 7 | tokens = line.rstrip().split() 8 | dur = float(tokens[1]) 9 | total += dur 10 | print('Total : {:f} minutes.'.format(total/60)) 11 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/chain_objf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | """ This is a subpackage containing modules for training of 7 | deep neural network acoustic model with chain objective. 8 | """ 9 | 10 | from . import acoustic_model 11 | 12 | __all__ = ["acoustic_model"] 13 | -------------------------------------------------------------------------------- /local/kaggle/get_id_list.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | wav_dir = sys.argv[1] 3 | output_json = sys.argv[2] 4 | L = [] 5 | for wav in os.listdir(wav_dir): 6 | if wav.endswith('.wav'): 7 | name = wav[1:].replace('.wav','') 8 | idx = int(name) 9 | L.append(idx) 10 | json.dump(L,open(output_json,'w')) 11 | 12 | 13 | -------------------------------------------------------------------------------- /local/modify_utt2spk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | datadir=$1 3 | if [ -f $datadir/utt2spk ] ; then 4 | mv $datadir/utt2spk $datadir/utt2spk_backup 5 | cat $datadir/utt2spk_backup | awk '{print $1 " " $1}' > $datadir/utt2spk 6 | cat $datadir/utt2spk | utils/utt2spk_to_spk2utt.pl > $datadir/spk2utt || exit 1; 7 | utils/fix_data_dir.sh $datadir || exit 1; 8 | fi 9 | -------------------------------------------------------------------------------- /local/change_machine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # /data/local/kgb/Chinese-ASR/data 3 | # /home/jacky/work/kgb/ 4 | 5 | 6 | for dir in data ; do 7 | for scp in $dir/*/*/*.scp ; do 8 | cat $scp | sed 's/home\/jackyyy/data\/local/g' > ${scp}2 9 | mv $scp ${scp}_backup 10 | mv ${scp}2 $scp 11 | echo "Changing path of $scp" 12 | done 13 | done 14 | -------------------------------------------------------------------------------- /local/lm/run_4gram.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | novels="ori 20years nie guan laotsan water journey_west red_mansion 3kingdom beauty_n hunghuang lai_ho old_time one_gan lu_shun news" 4 | #local/lm/generate_ori.sh 5 | for novel in $novels ; do 6 | ( 7 | txt=data/text/$novel.txt 8 | local/lm/text2Gfst.sh $txt 9 | )& 10 | done 11 | 12 | wait 13 | 14 | -------------------------------------------------------------------------------- /local/data/data_prep_wav.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# != 2 ]; then 4 | echo " Usage : data_prep_wav.sh <wav_dir> <data_dir>" 5 | fi 6 | 7 | wav=$1 8 | data=$2 9 | 10 | mkdir -p $data 11 | 12 | python3 local/data/data_prep_wav.py $wav $data 13 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 14 | utils/fix_data_dir.sh $data || exit 1; 15 | 
-------------------------------------------------------------------------------- /conf/cmu2pinyin: -------------------------------------------------------------------------------- 1 | AA A 2 | AE A 3 | AH A 4 | AO UO 5 | AW U 6 | AY AI 7 | B B 8 | CH CH 9 | D D 10 | DH S I 11 | EH AI 12 | ER E 13 | EY AI 14 | F F 15 | G G 16 | HH H 17 | IH I 18 | IY I 19 | JH ZH 20 | K K 21 | L L 22 | M M 23 | N N 24 | NG N 25 | OW UO 26 | OY UO 27 | P P 28 | R R 29 | S S 30 | SH SH 31 | T T 32 | TH S 33 | UH U 34 | UW U 35 | V W 36 | W W 37 | Y Y 38 | Z Z 39 | ZH X 40 | -------------------------------------------------------------------------------- /utils/filt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Apache 2.0 4 | 5 | import sys 6 | 7 | vocab=set() 8 | with open(sys.argv[1]) as vocabfile: 9 | for line in vocabfile: 10 | vocab.add(line.strip()) 11 | 12 | with open(sys.argv[2]) as textfile: 13 | for line in textfile: 14 | print " ".join(map(lambda word: word if word in vocab else '', line.strip().split())) 15 | -------------------------------------------------------------------------------- /local/kaggle/check_sample_rate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wav_dir=$1 3 | for wav in $wav_dir/*.wav ; do 4 | sr=`sox --i -r $wav` 5 | if [ "$sr" != "16000" ] ; then 6 | echo $wav $sr 7 | name=${wav::-4} 8 | sox $wav -r 16000 ${name}2.wav 9 | mv ${name}2.wav $wav 10 | fi 11 | name=${wav::-4} 12 | sox $wav -t wav -r 16000 -b 16 ${name}2.wav 13 | mv ${name}2.wav $wav 14 | done 15 | -------------------------------------------------------------------------------- /local/kaggle/see_decode_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | test_dir=$1 3 | for dir in $test_dir/* ; do 4 | if [ -d $dir ] && [ -f $dir/3small_time ] && [ -f $dir/rescore_time ] ; then 5 | echo $dir 6 | cat $dir/3small_time 7 | cat $dir/rescore_time 8 | du -sh $dir/decode*/lat* 9 | cat $dir/rescore_lang 10 | echo " " 11 | echo "-------------------------------" 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /path.sh: -------------------------------------------------------------------------------- 1 | export KALDI_ROOT=/home/kgb/kaldi-DFSMN/kaldi 2 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 3 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH 4 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 5 | . $KALDI_ROOT/tools/config/common_path.sh 6 | export LC_ALL=C 7 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2016 Vimal Manohar 3 | # Apache 2.0 4 | 5 | """ This library has classes and methods commonly used for training nnet3 6 | neural networks. 
7 | 8 | It has separate submodules for frame-level objectives and chain objective: 9 | frame_level_objf -- For both recurrent and non-recurrent architectures 10 | chain_objf -- LF-MMI objective training 11 | """ 12 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/layers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Johns Hopkins University (Dan Povey) 2 | # 2016 Vijayaditya Peddinti 3 | # 2016 Yiming Wang 4 | # Apache 2.0. 5 | 6 | from .basic_layers import * 7 | from .convolution import * 8 | from .attention import * 9 | from .lstm import * 10 | from .gru import * 11 | from .stats_layer import * 12 | from .trivial_layers import * 13 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/frame_level_objf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """ This library has classes and methods commonly used for training nnet3 7 | neural networks with frame-level objectives. 8 | """ 9 | 10 | from . import common 11 | from . import raw_model 12 | from . import acoustic_model 13 | 14 | __all__ = ["common", "raw_model", "acoustic_model"] 15 | -------------------------------------------------------------------------------- /local/data/data_prep_Tl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/Tl/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_Tl.py $Tl $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/data/data_prep_NER.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/NER/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_NER.py $NER $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/data/data_prep_TOCFL.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . 
./local/data/corpus_path.sh 4 | data=./data/TOCFL/mfcc39_pitch9 5 | mkdir -p $data 6 | for x in wav.scp text utt2spk ; do 7 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_TOCFL.py $TOCFL $x | sort -k1,1 -u > $data/$x || exit 1; 8 | done 9 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 10 | utils/fix_data_dir.sh $data || exit 1; 11 | -------------------------------------------------------------------------------- /local/kaggle/google_drive_download.sh: -------------------------------------------------------------------------------- 1 | function gdrive_download () { 2 | CONFIRM=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=$1" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p') 3 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$CONFIRM&id=$1" -O $2 4 | rm -rf /tmp/cookies.txt 5 | } 6 | -------------------------------------------------------------------------------- /local/lm/dirty/mix_lm2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ../path.sh 3 | lm1=$1 4 | lm2=$2 5 | test_text=$3 6 | lm_out=$4 7 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 8 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 9 | compute-best-mix lm1.ppl lm2.ppl > log 10 | lambda=`python3 local/get_best_lambda.py log` 11 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm2 -lambda $lambda -write-lm $lm_out 12 | rm lm1.ppl lm2.ppl log 13 | -------------------------------------------------------------------------------- /local/data/data_prep_MATBN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/MATBN/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text segments ; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_MATBN.py $MATBN $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/data/data_prep_seame.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/seame/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text segments ; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_seame.py $seame $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/lm/mix_lm2_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
path.sh 3 | lm1=$1 4 | lm2=$2 5 | lm_replace=$3 6 | test_text=$4 7 | lm_out=$5 8 | 9 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 10 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 11 | compute-best-mix lm1.ppl lm2.ppl > log 12 | lambda=`python3 local/lm/get_best_lambda.py log` 13 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm_replace -lambda $lambda -write-lm $lm_out 14 | rm lm1.ppl lm2.ppl log 15 | -------------------------------------------------------------------------------- /steps/libs/nnet3/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Johns Hopkins University (Dan Povey) 4 | # 2016 Vimal Manohar 5 | # 2016 Vijayaditya Peddinti 6 | # 2016 Yiming Wang 7 | # Apache 2.0. 8 | 9 | 10 | # This module has the python functions which facilitate the use of nnet3 toolkit 11 | # It has two sub-modules 12 | # xconfig : Library for parsing high level description of neural networks 13 | # train : Library for training scripts 14 | -------------------------------------------------------------------------------- /local/lm/dirty/temp.sh: -------------------------------------------------------------------------------- 1 | for novel in 3kingdom ; do 2 | for x in A B C ; do 3 | local/mix_lm3.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\_kaggle12.lm 4 | local/mix_lm3_test.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/kaggle123_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\.lm 5 | local/compile_lm.sh LM/$novel\_$x\_kaggle12.lm & 6 | local/compile_lm.sh LM/$novel\_$x\.lm & 7 | done 8 | done 9 | wait 10 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/data/') 3 | from normalize_utils import * 4 | 5 | text_file = sys.argv[1] 6 | output_dir = sys.argv[2] 7 | with open(text_file,'r',encoding='utf-8') as f: 8 | for line in f: 9 | start = line.find(' ') 10 | token1 = line.split()[0] 11 | tex = normalize(line[start:].replace(' ','')) 12 | with open(os.path.join(output_dir,token1),'w',encoding='utf-8') as f: 13 | f.write(tex) 14 | 15 | 16 | -------------------------------------------------------------------------------- /local/data/data_prep_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kaggle=/data/local/kgb/corpus/kgb/kaggle6 4 | 5 | for typ in A B C ; do 6 | data=./data/kaggle6/$typ/fbank 7 | mkdir -p $data 8 | for x in wav.scp utt2spk text ; do 9 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_kaggle.py $kaggle $x $typ data/lang/words.txt | sort -k1,1 -u > $data/$x || exit 1; 10 | done 11 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 12 | utils/fix_data_dir.sh $data || exit 1; 13 | done 14 | -------------------------------------------------------------------------------- /local/kaggle/copy_error_files.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from shutil import copyfile 3 | error_list = sys.argv[1] 4 | src_dir = sys.argv[2] 5 | target_dir = sys.argv[3] 6 | 7 | 8 | if not os.path.isdir(target_dir): 9 | os.makedirs(target_dir) 10 | 11 | with open(error_list,'r') as f: 12 | for line in f: 13 | name = line.rstrip() 14 | src = os.path.join(src_dir,name) 15 | target = 
os.path.join(target_dir,name) 16 | print("Copy {} to {} ".format(src,target)) 17 | copyfile(src,target) 18 | -------------------------------------------------------------------------------- /local/lm/dirty/lm_to_carpa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | for lang in lm_test/LM/* ; do 4 | ( 5 | if [ -d $lang ]; then 6 | bos=`grep "<s>" $lang/words.txt | awk '{print $2}'` 7 | eos=`grep "</s>" $lang/words.txt | awk '{print $2}'` 8 | unk=`cat $lang/oov.int` 9 | 10 | cat $lang.lm | \ 11 | utils/map_arpa_lm.pl $lang/words.txt | \ 12 | arpa-to-const-arpa --bos-symbol=$bos --eos-symbol=$eos \ 13 | --unk-symbol=$unk - $lang/G.carpa 14 | fi 15 | )& 16 | done 17 | wait 18 | -------------------------------------------------------------------------------- /local/lm/dirty/get_3gram_prune.sh: -------------------------------------------------------------------------------- 1 | . path.sh 2 | bash local/mix_lm3.sh lm_test/text_test/3gram_ori.lm lm_test/text_test/news.lm lm_test/text_test/all_novels_3gram.lm \ 3 | text_test/kaggle123_A.txt text_test/3gram_mix.lm 4 | ngram -lm lm_test/text_test/3gram_mix.lm -prune 0.0000001 -write-lm lm_test/text_test/3gram_mix_prune.lm 5 | gzip lm_test/text_test/3gram_mix_prune.lm 6 | bash local/format_data.sh lm_test/text_test/3gram_mix_prune.lm.gz data/lang_3small_mix_test 7 | utils/mkgraph.sh data/lang_3small_mix_test exp/tri4a exp/tri4a/graph_mix 8 | -------------------------------------------------------------------------------- /local/data/clean_up_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | . ./cmd.sh 4 | . ./utils/parse_options.sh 5 | 6 | set -e 7 | set -u 8 | set -o pipefail 9 | 10 | data=./data/kaggle3/mfcc39_pitch9 11 | name=kaggle3 12 | nj=40 13 | 14 | steps/align_fmllr.sh --cmd "$train_cmd" --nj $nj \ 15 | $data data/lang exp/tri4a exp/tri4a_ali_$name || exit 1; 16 | 17 | steps/cleanup/clean_and_segment_data.sh --cmd "$train_cmd" --nj $nj $data data/lang \ 18 | exp/tri4a_ali_$name exp/tri4a_cleanup_$name data/kaggle3/cleaned_mfcc39_pitch9 || exit 1; 19 | -------------------------------------------------------------------------------- /local/data/data_prep_PTS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | data=./data/PTS/mfcc39_pitch9 4 | mkdir -p $data 5 | 6 | cp $PTS/PTS_segmented/{text,segments} $data/ 7 | 8 | cat $PTS/PTS_segmented/segments | awk '{print $1 " " $1}' | sort -k1,1 -u > $data/utt2spk 9 | 10 | for x in wav.scp ; do 11 | python3 local/data/data_prep_PTS.py $PTS $x | sort -k1,1 -u > $data/$x || exit 1; 12 | done 13 | 14 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 15 | utils/fix_data_dir.sh $data || exit 1; 16 | -------------------------------------------------------------------------------- /local/data/data_prep_cyberon_english.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
./local/data/corpus_path.sh 3 | 4 | for part in cyberon_english_train cyberon_english_test ; do 5 | data=./data/$part/mfcc39_pitch9 6 | mkdir -p $data 7 | for x in wav.scp text utt2spk ; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_cyberon_english.py $cyberon_english $part $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | done 13 | -------------------------------------------------------------------------------- /local/data/data_prep_cyberon_chinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | 5 | for part in cyberon_chinese_train cyberon_chinese_test ; do 6 | data=./data/$part/mfcc39_pitch9 7 | mkdir -p $data 8 | for x in wav.scp text utt2spk ; do 9 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_cyberon_chinese.py $cyberon_chinese $part $x | sort -k1,1 -u > $data/$x || exit 1; 10 | done 11 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 12 | utils/fix_data_dir.sh $data || exit 1; 13 | done 14 | 15 | -------------------------------------------------------------------------------- /local/lm/get_all_context.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/kaggle') 3 | sys.path.append('local/data/') 4 | import xlsx 5 | from normalize_utils import * 6 | import itertools 7 | 8 | if __name__ == '__main__': 9 | word_list = get_word_list('data/wfst/lang/words.txt') 10 | for i in range(4,5): 11 | xlsx_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(i) 12 | tmp = xlsx.get_content(xlsx_path,True,word_list) 13 | for row in tmp: 14 | print(row[1]) 15 | print(' '.join(list(row[1].replace(' ','')))) 16 | -------------------------------------------------------------------------------- /local/lm/get_all_problem.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/kaggle') 3 | sys.path.append('local/data/') 4 | import xlsx 5 | from normalize_utils import * 6 | import itertools 7 | 8 | if __name__ == '__main__': 9 | word_list = get_word_list('data/wfst/lang/words.txt') 10 | for i in range(4,5): 11 | xlsx_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(i) 12 | tmp = xlsx.get_content(xlsx_path,True,word_list) 13 | for row in tmp: 14 | print(row[2]) 15 | print(' '.join(list(row[2].replace(' ','')))) 16 | -------------------------------------------------------------------------------- /local/data/data_prep_PTS.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | def main(pts_path,file_type): 3 | for root, dirs, files in os.walk(pts_path, topdown=False): 4 | for name in files: 5 | if name.endswith('.wav'): 6 | wav_label = name.split('.')[0] 7 | wav_path = os.path.join(root,name) 8 | wav_path = os.path.abspath(wav_path) 9 | if file_type == 'wav.scp': 10 | print(wav_label,wav_path) 11 | if __name__ == '__main__': 12 | pts_path = sys.argv[1] 13 | file_type = sys.argv[2] 14 | main(pts_path,file_type) 15 | 16 | -------------------------------------------------------------------------------- /local/data/corpus_path.sh: -------------------------------------------------------------------------------- 1 | cyberon_chinese=/home/jacky/work/kgb/corpus/CyberonChinese 2 | 
cyberon_english=/home/jacky/work/kgb/corpus/CyberonEnglish 3 | eatmic=/home/jacky/work/kgb/corpus/EatMic16 4 | PTS=/home/jacky/work/kgb/corpus/PTS-MSub-Vol1 5 | NER=/home/jacky/work/kgb/corpus/NER-Trs-Vol1 6 | TOCFL=/home/jacky/work/kgb/corpus/TOCFL/segmented 7 | seame=/home/jacky/work/kgb/corpus/seame 8 | Tl=/home/jacky/work/kgb/corpus/TlAlphaDigit 9 | wiki=/home/jacky/work/kgb/corpus/wiki 10 | ptt=/home/jacky/work/kgb/corpus/ptt 11 | MATBN=/home/jacky/work/kgb/corpus/MATBN 12 | aishell2=/data/local/kgb/corpus/AISHELL-2/iOS/data 13 | -------------------------------------------------------------------------------- /local/kaggle/max_ppl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | names = [] 4 | scores = [] 5 | wav_name = sys.argv[1] 6 | flag=sys.argv[3] 7 | with open(sys.argv[2],'r') as f: 8 | for idx,line in enumerate(f): 9 | if idx %2 == 0 : 10 | name = line.rstrip() 11 | names.append(name) 12 | else: 13 | score = line.rstrip() 14 | scores.append(float(score)) 15 | min_idx = np.argmin(scores) 16 | min_name = names[min_idx] 17 | min_score = scores[min_idx] 18 | if flag == '3': 19 | print(wav_name,min_name,min_score) 20 | else: 21 | print(min_name) 22 | 23 | -------------------------------------------------------------------------------- /local/lm/mix_lm3_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | lm1=$1 4 | lm2=$2 5 | lm3=$3 6 | lm_replace=$4 7 | test_text=$5 8 | lm_out=$6 9 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 10 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 11 | ngram -lm $lm3 -ppl $test_text -debug 2 > lm3.ppl 12 | compute-best-mix lm1.ppl lm2.ppl lm3.ppl > log 13 | lambda=`python3 local/lm/get_best_lambda.py log` 14 | lambda2=`python3 local/lm/get_best_lambda2.py log` 15 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm_replace -lambda $lambda -mix-lm2 $lm2 -mix-lambda2 $lambda2 -write-lm $lm_out 16 | rm lm1.ppl lm2.ppl lm3.ppl log 17 | -------------------------------------------------------------------------------- /local/lm/dirty/mix_lm3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ../path.sh 3 | lm1=$1 4 | lm2=$2 5 | lm3=$3 6 | test_text=$4 7 | lm_out=$5 8 | #lambda3=$6 9 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 10 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 11 | ngram -lm $lm3 -ppl $test_text -debug 2 > lm3.ppl 12 | compute-best-mix lm1.ppl lm2.ppl lm3.ppl > log 13 | lambda=`python3 local/get_best_lambda.py log` 14 | lambda2=`python3 local/get_best_lambda2.py log` 15 | #echo "$lambda $lambda2" >> $lambda3 16 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm3 -lambda $lambda -mix-lm2 $lm2 -mix-lambda2 $lambda2 -write-lm $lm_out 17 | rm lm1.ppl lm2.ppl lm3.ppl log 18 | -------------------------------------------------------------------------------- /local/lm/dirty/kaggle4.sh: -------------------------------------------------------------------------------- 1 | . 
../path.sh 2 | for novel in 20years guan laotsan nie water ; do 3 | ngram -lm text_test/ori.lm -mix-lm text_test/kaggle123_A.lm -lambda 0.15 -mix-lm2 text_test/$novel.lm -mix-lambda2 0.8 -write-lm LM/$novel\_A\.lm 4 | ngram -lm text_test/ori.lm -mix-lm text_test/kaggle123_B.lm -lambda 0.16 -mix-lm2 text_test/$novel.lm -mix-lambda2 0.35 -write-lm LM/$novel\_B\.lm 5 | ngram -lm text_test/ori.lm -mix-lm text_test/kaggle123_C.lm -lambda 0.13 -mix-lm2 text_test/$novel.lm -mix-lambda2 0.35 -write-lm LM/$novel\_C\.lm 6 | for x in A B C ; do 7 | lm=LM/$novel\_$x.lm 8 | local/compile_lm.sh $lm & 9 | done 10 | done 11 | wait 12 | -------------------------------------------------------------------------------- /local/data/data_prep_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | corpus_dir=/data/local/kgb/corpus/esc_speech_noise 3 | data_dir=data/esc_speech_noise 4 | mkdir -p $data_dir 5 | find -L $corpus_dir/ -iname "*.wav" | sort | xargs -I% basename % .wav | \ 6 | awk -v "dir=$corpus_dir" '{printf "%s %s/%s.wav \n", $0, dir, $0}' > $data_dir/wav.scp 7 | find -L $corpus_dir/ -iname "*.wav" | sort | xargs -I% basename % .wav | \ 8 | awk -v "dir=$corpus_dir" '{printf "%s %s.wav \n", $0, $0}' > $data_dir/utt2spk 9 | cat $data_dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $data_dir/spk2utt || exit 1; 10 | bash utils/data/get_reco2utt.sh $data_dir 11 | 12 | utils/fix_data_dir.sh $data_dir || exit 1; 13 | -------------------------------------------------------------------------------- /local/nnet/copy_alignment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | dir=$1 4 | aug(){ 5 | ali=$1 6 | gunzip -c $ali | copy-int-vector ark:- ark,t:- | python3 -c " 7 | import sys 8 | for line in sys.stdin.readlines(): 9 | tokens = line.rstrip().split() 10 | label = tokens[0] 11 | values = ' '.join(tokens[1:]) 12 | print(label + '-aug',values) 13 | print(label ,values) 14 | #print('rvb1_'+label,values) 15 | #print(label+'-aug_kgb_noise',values) 16 | " | copy-int-vector ark,t:- ark:- | gzip -c > $ali\_after 17 | mv $ali\_after $ali 18 | echo "Done $ali" 19 | } 20 | 21 | export -f aug 22 | 23 | parallel -j 20 "aug {}" ::: $dir/ali.*.gz 24 | 25 | wait 26 | 27 | -------------------------------------------------------------------------------- /local/lm/dirty/parse_text.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('../local/data/') 3 | from normalize_utils import * 4 | 5 | if __name__ == '__main__': 6 | text_path = sys.argv[1] 7 | output_path = sys.argv[2] 8 | new_text = '' 9 | #check_new_delete_word(text_path) 10 | word_list = get_word_list('data/lang/words.txt') 11 | with open(text_path,'r',encoding='utf-8') as f: 12 | for line in f: 13 | if 'ETtoday' in line: 14 | continue 15 | line = line.rstrip() 16 | new_text += normalize(line,word_list) + '\n' 17 | with open(output_path,'w',encoding='utf-8') as f: 18 | f.write(new_text) 19 | 20 | -------------------------------------------------------------------------------- /local/lm/generate_ori.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . 
path.sh 3 | LM=data/LM 4 | text=data/text 5 | lang=data/wfst/lang 6 | vocab=$lang/vocabs.txt 7 | words=$lang/words.txt 8 | ngram -lm data/wfst/LM/ori_4gram.lm -vocab $vocab -limit-vocab -write-lm $LM/ori.lm 9 | for x in A B C; do 10 | ( 11 | ngram-count -text $text/kaggle1234_$x.txt -lm $LM/kaggle1234_$x.lm -vocab $vocab -limit-vocab -order 4 12 | ngram-count -text $text/kaggle12345_$x.txt -lm $LM/kaggle12345_$x.lm -vocab $vocab -limit-vocab -order 4 13 | ) & 14 | done 15 | wait 16 | for x in A B C; do 17 | local/lm/mix_lm2_test.sh $LM/ori.lm $LM/kaggle1234_$x.lm $LM/kaggle12345_$x.lm $text/kaggle5_$x.txt $LM/ori_$x.lm 18 | done 19 | -------------------------------------------------------------------------------- /utils/data/get_reco2utt_for_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "This script outputs a mapping from recording to a list of utterances " 8 | echo "corresponding to the recording. It is analogous to the content of " 9 | echo "a spk2utt file, but is indexed by recording instead of speaker." 10 | echo "Usage: get_reco2utt.sh " 11 | echo " e.g.: get_reco2utt.sh data/train" 12 | exit 1 13 | fi 14 | 15 | data=$1 16 | 17 | if [ ! -s $data/segments ]; then 18 | utils/data/get_segments_for_data.sh $data > $data/segments 19 | fi 20 | 21 | cut -d ' ' -f 1,2 $data/segments | utils/utt2spk_to_spk2utt.pl 22 | -------------------------------------------------------------------------------- /conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training. 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --sample-frequency=8000 # Switchboard is sampled at 8kHz 7 | --num-mel-bins=40 # similar to Google's setup. 8 | --num-ceps=40 # there is no dimensionality reduction. 9 | --low-freq=40 # low cutoff frequency for mel bins 10 | --high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) 11 | --allow_downsample=true 12 | -------------------------------------------------------------------------------- /conf/pinyin2cmu: -------------------------------------------------------------------------------- 1 | A AA 2 | AI AY 3 | AN AE N 4 | ANG AE NG 5 | AO AW 6 | B B 7 | CH CH 8 | C T S 9 | D D 10 | E ER 11 | EI EY 12 | EN AH N 13 | ENG AH NG 14 | ER AA R 15 | F F 16 | G G 17 | H HH 18 | IA IY AA 19 | IANG IY AE NG 20 | IAN IY AE N 21 | IAO IY AW 22 | IE IY EH 23 | I IY 24 | ING IY NG 25 | IN IY N 26 | IONG IY UH NG 27 | IU IY UH 28 | J J 29 | K K 30 | L L 31 | M M 32 | N N 33 | O AO 34 | ONG UH NG 35 | OU OW 36 | P P 37 | Q Q 38 | R R 39 | SH SH 40 | S S 41 | T T 42 | UAI UW AY 43 | UANG UW AE NG 44 | UAN UW AE N 45 | UA UW AA 46 | UI UW IY 47 | UN UW AH N 48 | UO UW AO 49 | U UW 50 | UE IY EH 51 | VE IY EH 52 | V IY UW 53 | VN IY N 54 | W W 55 | X X 56 | Y Y 57 | ZH JH 58 | Z Z 59 | -------------------------------------------------------------------------------- /utils/data/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script works out the approximate number of frames in a training directory. 
4 | # This is sometimes needed by higher-level scripts 5 | 6 | 7 | if [ -f path.sh ]; then . ./path.sh; fi 8 | . parse_options.sh || exit 1; 9 | 10 | if [ $# -ne 1 ]; then 11 | ( 12 | echo "Usage: $0 " 13 | echo "Prints the number of frames of data in the data-dir" 14 | ) 1>&2 15 | fi 16 | 17 | data=$1 18 | 19 | if [ ! -f $data/utt2dur ]; then 20 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1 21 | fi 22 | 23 | frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 24 | 25 | awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur 26 | -------------------------------------------------------------------------------- /local/data/word_segmentation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | # Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) 4 | # 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) 5 | # Apache 2.0 6 | 7 | import sys 8 | from normalize_utils import * 9 | 10 | if len(sys.argv) < 3: 11 | sys.stderr.write("word_segmentation.py > \n") 12 | exit(1) 13 | 14 | vocab_file=sys.argv[1] 15 | trans_file=sys.argv[2] 16 | word_list = get_word_list(vocab_file) 17 | 18 | for line in open(trans_file,'r',encoding='utf-8'): 19 | key,trans = line.strip().split('\t',1) 20 | new_line = key + '\t' + normalize(trans,word_list) 21 | print(new_line) 22 | -------------------------------------------------------------------------------- /utils/make_absolute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script replaces the command readlink -f (which is not portable). 4 | # It turns a pathname into an absolute pathname, including following soft links. 5 | target_file=$1 6 | 7 | cd $(dirname $target_file) 8 | target_file=$(basename $target_file) 9 | 10 | # Iterate down a (possible) chain of symlinks 11 | while [ -L "$target_file" ]; do 12 | target_file=$(readlink $target_file) 13 | cd $(dirname $target_file) 14 | target_file=$(basename $target_file) 15 | done 16 | 17 | # Compute the canonicalized name by finding the physical path 18 | # for the directory we're in and appending the target file. 19 | phys_dir=$(pwd -P) 20 | result=$phys_dir/$target_file 21 | echo $result 22 | -------------------------------------------------------------------------------- /local/data/data_prep_wav.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | 4 | if __name__ == '__main__': 5 | wav_dir = sys.argv[1] 6 | data_dir = sys.argv[2] 7 | utt2spk_path = os.path.join(data_dir,'utt2spk') 8 | wavscp_path = os.path.join(data_dir,'wav.scp') 9 | with open(utt2spk_path,'w') as f1, open(wavscp_path,'w') as f2: 10 | for dirPath, dirNames, fileNames in os.walk(sys.argv[1]): 11 | for name in fileNames: 12 | if name.endswith('.wav'): 13 | file_name = os.path.join(dirPath, name) 14 | file_name = os.path.abspath(file_name) 15 | f1.write(name + ' ' + name + '\n') 16 | f2.write(name + ' ' + file_name + '\n') 17 | 18 | -------------------------------------------------------------------------------- /local/lm/wfst/generate_choice_fst.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | words=$1 3 | out_fst=$2 4 | text_fst=`dirname $2` 5 | text_fst=$text_fst/text.fst 6 | 7 | rm $text_fst 8 | 9 | . 
path.sh 10 | 11 | echo " 12 | 0 1 13 | 1 2 14 | 2 3 15 | 3 4 " >> $text_fst 16 | 17 | for i in 1 2 3 4 ; do 18 | cat $words | grep -v "" | grep -v "" |\ 19 | grep -v "" | grep -v "" |\ 20 | grep -v "" | grep -v "" |\ 21 | grep -v "" | awk -v i=$i '{print i " " i " " $1 " " $1 }' >> $text_fst 22 | done 23 | echo 4 >> $text_fst 24 | 25 | 26 | fstcompile --isymbols=$words --osymbols=$words \ 27 | --keep_isymbols=false --keep_osymbols=false $text_fst | fstarcsort --sort_type=olabel > $out_fst 28 | 29 | -------------------------------------------------------------------------------- /local/data/extract_ptt.py: -------------------------------------------------------------------------------- 1 | import os,sys,json,re 2 | sys.path.append('local/data/tool/jieba-zh_TW') 3 | import jieba 4 | from number2chinese import * 5 | 6 | ptt_corpus = sys.argv[1] 7 | crawl_path = os.path.join(ptt_corpus,'ptt_crawl.json') 8 | ptt = json.load(open(crawl_path,'r')) 9 | for item in ptt: 10 | text = item['Content'] 11 | text = text.replace('\n\n','\n').replace(' ','') 12 | tokens = jieba.cut(text) 13 | new_tokens = [] 14 | for token in tokens: 15 | if re.match('^[0-9]+$',token): 16 | if len(token) > 15: 17 | continue 18 | token = to_chinese(int(token)) 19 | new_tokens.append(token) 20 | text = ' '.join(new_tokens) 21 | text = text.upper() 22 | if len(text) > 0: 23 | print(text) 24 | 25 | -------------------------------------------------------------------------------- /local/kaggle/test_lambda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | thread_num=100 4 | #ngram ori -mix-lm nre lambda 5 | test_lambda(){ 6 | dir=$1 7 | Alm=$dir/A.lm 8 | orilm=/data/local/kgb/Chinese-ASR/lm_test/LM/C_kaggle12.lm 9 | echo $dir 10 | ngram-count -text $dir/A.txt -order 4 -lm $Alm 11 | ngram -lm $orilm -ppl $dir/C.txt -debug 2 > $dir/ori.ppl 12 | ngram -lm $Alm -ppl $dir/C.txt -debug 2 > $dir/A.ppl 13 | compute-best-mix $dir/ori.ppl $dir/A.ppl > $dir/log 14 | python3 local/data/get_best_lambda.py $dir/log >> $dir/../best_lambda 15 | } 16 | export -f test_lambda 17 | 18 | #PYTHOIOENCODING=utf-8 python3 local/data/test_lambda.py 19 | parallel -j $thread_num "test_lambda {}" ::: lambda_test/* 20 | python3 local/data/accumulate_lambda.py lambda_test/best_lambda 21 | 22 | wait 23 | -------------------------------------------------------------------------------- /steps/data/data_dir_manipulation_lib.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | def RunKaldiCommand(command, wait = True): 4 | """ Runs commands frequently seen in Kaldi scripts. 
These are usually a 5 | sequence of commands connected by pipes, so we use shell=True """ 6 | #logger.info("Running the command\n{0}".format(command)) 7 | p = subprocess.Popen(command, shell = True, 8 | stdout = subprocess.PIPE, 9 | stderr = subprocess.PIPE) 10 | 11 | if wait: 12 | [stdout, stderr] = p.communicate() 13 | if p.returncode != 0: 14 | raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) 15 | return stdout, stderr 16 | else: 17 | return p 18 | 19 | -------------------------------------------------------------------------------- /local/kaggle/replace_iflytek_answer.py: -------------------------------------------------------------------------------- 1 | from xlsx import * 2 | import sys,os,json 3 | 4 | 5 | if __name__ == '__main__': 6 | iflytek_json = sys.argv[1] 7 | output_json = sys.argv[2] 8 | kaggle_id = sys.argv[3] 9 | d = {} 10 | ans_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(kaggle_id) 11 | L = get_content(ans_path,False) 12 | 13 | for (No,p,q,c,answer) in L: 14 | d[No] = answer 15 | 16 | with open(iflytek_json,'r',encoding='utf8') as f: 17 | data = json.load(f) 18 | outputs = [] 19 | for sample in data: 20 | id = sample['id'] 21 | if id in d: 22 | sample['answer'] = d[id] 23 | outputs.append(sample) 24 | with open(output_json,'w',encoding='utf8') as f: 25 | json.dump(outputs,f,indent=4,ensure_ascii=False) 26 | 27 | -------------------------------------------------------------------------------- /local/combine_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nj=8 #number of job parallel running 3 | stage=0 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | 8 | 9 | if [ $stage -le 1 ] ; then 10 | mfccdir=data/mfcc_pitch 11 | mkdir -p $mfccdir 12 | 13 | for corpus in kaggle1 kaggle2 kaggle3 ; do 14 | combine48='' 15 | for typ in A B C ; do 16 | ##Extract MFCC39 + pitch9 feature 17 | data=./data/$corpus/$typ/mfcc39_pitch9 18 | name=$corpus\_$typ 19 | steps/make_mfcc_pitch_online.sh --cmd "$train_cmd" --nj $nj --name $name $data exp/make_mfcc/$name $mfccdir || exit 1; 20 | steps/compute_cmvn_stats.sh --name $corpus $data exp/make_mfcc/$name $mfccdir || exit 1; 21 | combine48="$data $combine48" 22 | done 23 | utils/combine_data.sh ./data/$corpus/mfcc39_pitch9 $combine48 24 | done 25 | fi 26 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm2.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import sys,os 3 | import numpy as np 4 | sys.path.append('local/data/') 5 | from normalize_utils import * 6 | 7 | iflytek_A = sys.argv[1] 8 | test_dir=sys.argv[2] 9 | 10 | L = [] 11 | with open(iflytek_A,'r') as f: 12 | for line in f: 13 | start = line.find(' ') 14 | token1 = line.split()[0] 15 | L.append(token1) 16 | L2 = [] 17 | lms = [] 18 | for lm in os.listdir(test_dir): 19 | lms.append(lm) 20 | temp = [] 21 | with open(os.path.join(test_dir,lm),'r') as f: 22 | for line in f: 23 | temp.append(float(line)) 24 | L2.append(temp) 25 | n_line,n_lm = len(L),len(L2) 26 | scores = np.array(L2).transpose() 27 | for i in range(n_line): 28 | max_score = np.min(scores[i]) 29 | lm = lms[np.argmin(scores[i])] 30 | print(L[i],lm,max_score) 31 | -------------------------------------------------------------------------------- /steps/nnet3/chain/e2e/README.txt: -------------------------------------------------------------------------------- 1 | 
The scripts related to end2end chain training are in this directory 2 | Currently it has 3 scripts: 3 | 4 | ** prepare_e2e.sh which is almost equivalent 5 | to regular chain's build-tree.sh (i.e. it creates the tree and 6 | the transition-model) except it does not require any previously 7 | trained models (in other terms, it does what stages -3 and -2 8 | of steps/train_mono.sh do). 9 | 10 | ** get_egs_e2e.sh: this is similar to chain/get_egs.sh except it 11 | uses training FSTs (instead of lattices) to generate end2end egs. 12 | 13 | ** train_e2e.py: this is very similar to chain/train.py but 14 | with fewer stages (e.g. it does not compute the preconditioning matrix) 15 | 16 | 17 | For details please see the comments at top of local/chain/e2e/run_flatstart_*.sh 18 | and also src/chain/chain-generic-numerator.h. 19 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | thread_num=56 4 | iflytek_A_text=$1 5 | test_dir=$2 6 | output=$3 7 | mkdir -p $test_dir 8 | 9 | export output=$output 10 | 11 | 12 | choose_lm(){ 13 | tex=$1 14 | if [ -f $tex\_result ] ; then 15 | rm $tex\_result 16 | fi 17 | for lm in ori news 20years nie guan laotsan water journey_west red_mansion 3kingdom beauty_n hunghuang lai_ho old_time one_gan lu_shun ; do 18 | echo $lm >> $tex\_result 19 | cat $tex | ngram -lm lm_test/LM/$lm\_A.lm -ppl - | python3 local/kaggle/get_ppl.py - >> $tex\_result 20 | done 21 | wav=`basename $tex` 22 | python3 local/kaggle/max_ppl.py $wav $tex\_result 3 >> $output 23 | } 24 | 25 | export -f choose_lm 26 | 27 | PYTHONIOENCODING=utf-8 python3 local/kaggle/choose_lm.py $iflytek_A_text $test_dir 28 | 29 | parallel -j $thread_num "choose_lm {}" ::: $test_dir/*.wav 30 | 31 | echo "Done choose_lm.sh." 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /utils/data/get_segments_for_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script operates on a data directory, such as in data/train/, 4 | # and writes new segments to stdout. The file 'segments' maps from 5 | # utterance to time offsets into a recording, with the format: 6 | # <utterance-id> <recording-id> <segment-begin> <segment-end> 7 | # This script assumes utterance and recording ids are the same (i.e., that 8 | # wav.scp is indexed by utterance), and uses durations from 'utt2dur', 9 | # created if necessary by get_utt2dur.sh. 10 | 11 | . ./path.sh 12 | 13 | if [ $# != 1 ]; then 14 | echo "Usage: $0 [options] <data-dir>" 15 | echo "e.g.:" 16 | echo " $0 data/train > data/train/segments" 17 | exit 1 18 | fi 19 | 20 | data=$1 21 | 22 | if [ ! -s $data/utt2dur ]; then 23 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1; 24 | fi 25 | 26 | # <utterance-id> <utterance-id> 0 <duration> 27 | awk '{ print $1, $1, 0, $2 }' $data/utt2dur 28 | 29 | exit 0 30 | -------------------------------------------------------------------------------- /local/temp.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | . ./path.sh 4 | . ./cmd.sh 5 | . ./utils/parse_options.sh 6 | mfccdir=data/mfcc 7 | fbankdir=data/fbank 8 | nj=40 9 | stage=2 10 | lang=data/wfst/lang 11 | lang_test=data/wfst/lang_test 12 | # Now make MFCC features. 13 | if [ $stage -le 1 ]; then 14 | # mfccdir should be some place with a largish disk where you 15 | # want to store MFCC features. 
16 | for corpus in cyberon_chinese_test ; do 17 | data=./data/$corpus/mfcc39 18 | utils/copy_data_dir.sh ./data/$corpus/mfcc40 $data 19 | steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --name $corpus $data exp/make_mfcc/$corpus $mfccdir || exit 1; 20 | steps/compute_cmvn_stats.sh --name $corpus $data exp/make_mfcc/$corpus $mfccdir || exit 1; 21 | done 22 | fi 23 | 24 | steps/decode.sh --cmd "$decode_cmd" --nj 12 --config conf/decode.config \ 25 | exp/aishell2/tri3/graph data/cyberon_chinese_test/mfcc39 exp/aishell2/tri3/decode_cyberon_chinese_test 26 | -------------------------------------------------------------------------------- /local/lm/get_all_choices.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/kaggle') 3 | sys.path.append('local/data/') 4 | import xlsx 5 | from normalize_utils import * 6 | import itertools 7 | 8 | if __name__ == '__main__': 9 | word_list = get_word_list('data/wfst/lang/words.txt') 10 | for i in range(4,5): 11 | xlsx_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(i) 12 | tmp = xlsx.get_content(xlsx_path,True,word_list) 13 | for row in tmp: 14 | for perm in list(itertools.permutations(row[3])): 15 | text = xlsx.merge_choice(perm,True) 16 | print(text) 17 | split_text = [] 18 | for x in row[3]: 19 | split_text.append(' '.join(list(x.replace(' ','')))) 20 | for perm in list(itertools.permutations(split_text)): 21 | text = xlsx.merge_choice(perm,True) 22 | print(text) 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /local/lm/dirty/compile_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | x=$1 4 | dir=${x::-3} 5 | 6 | mkdir -p $dir 7 | cp -r data/lang/* $dir 8 | 9 | ngram -lm $x -vocab lm_test/text/vocab.txt -limit-vocab -write-lm $x 10 | 11 | cat $x | \ 12 | arpa2fst --disambig-symbol=#0 \ 13 | --read-symbol-table=$dir/words.txt - $dir/G.fst || exit 1; 14 | ## compile Ldet.fst 15 | newlang=$dir 16 | phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'` 17 | fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ 18 | fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$newlang/Ldet.fst || exit 1; 19 | 20 | ##transform to G.carpa 21 | bos=`grep "" $dir/words.txt | awk '{print $2}'` 22 | eos=`grep "" $dir/words.txt | awk '{print $2}'` 23 | unk=`cat $dir/oov.int` 24 | 25 | cat $x | \ 26 | utils/map_arpa_lm.pl $dir/words.txt | \ 27 | arpa-to-const-arpa --bos-symbol=$bos --eos-symbol=$eos \ 28 | --unk-symbol=$unk - $dir/G.carpa 29 | -------------------------------------------------------------------------------- /utils/ctm/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /utils/spk2utt_to_utt2spk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | while(<>){ 19 | @A = split(" ", $_); 20 | @A > 1 || die "Invalid line in spk2utt file: $_"; 21 | $s = shift @A; 22 | foreach $u ( @A ) { 23 | print "$u $s\n"; 24 | } 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
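# For example, on a cluster with GridEngine the same variables would typically
# point at queue.pl instead, e.g. export train_cmd="queue.pl --mem 6G" and
# export cuda_cmd="queue.pl --gpu 1"; the run.pl settings below assume every
# job runs on the local machine.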
12 | 13 | export train_cmd="run.pl --mem 6G" 14 | export decode_cmd="run.pl --mem 6G" 15 | export mkgraph_cmd="run.pl --mem 8G" 16 | export cuda_cmd="run.pl --gpu 1" 17 | -------------------------------------------------------------------------------- /steps/tfrnnlm/check_tensorflow_installed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this script checks if TF is installed to be used with python 4 | # and if TF related binaries in kaldi is ready to use 5 | . ./path.sh 6 | 7 | if which lattice-lmrescore-tf-rnnlm 2>&1>/dev/null; then 8 | echo TensorFlow relate binaries found. This is good. 9 | else 10 | echo TF related binaries not compiled. 11 | echo You need to go to tools/ and run extras/install_tensorflow_cc.sh first 12 | echo and then do \"make\" under both src/tfrnnlm and src/tfrnnlmbin 13 | exit 1 14 | fi 15 | 16 | echo 17 | 18 | if python steps/tfrnnlm/check_py.py 2>/dev/null; then 19 | echo TensorFlow ready to use on the python side. This is good. 20 | else 21 | echo TensorFlow not found on the python side. 22 | echo Please go to tools/ and run extras/install_tensorflow_py.sh to install it 23 | echo If you already have TensorFlow installed somewhere else, you would need 24 | echo to add it to your PATH 25 | exit 1 26 | fi 27 | -------------------------------------------------------------------------------- /local/data/fix_segments.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | def get_labels(file_path): 4 | s = set() 5 | with open(file_path,'r') as f: 6 | for line in f: 7 | token = line.split()[0] 8 | s.add(token) 9 | return s 10 | 11 | if __name__ == '__main__': 12 | data_path = sys.argv[1] 13 | s1 = get_labels(os.path.join(data_path,'segments')) 14 | s2 = get_labels(os.path.join(data_path,'feats.scp')) 15 | s3 = s1 - s2 16 | for scp in ['text','utt2spk','spk2utt','segments']: 17 | all_lines = [] 18 | with open(os.path.join(data_path,scp),'r',encoding='utf-8') as f: 19 | for line in f: 20 | token = line.split()[0] 21 | if token in s3: 22 | continue 23 | else: 24 | all_lines.append(line) 25 | with open(os.path.join(data_path,scp),'w',encoding='utf-8') as f: 26 | for line in all_lines: 27 | f.write(line) 28 | 29 | 30 | -------------------------------------------------------------------------------- /local/kaggle/parse_text.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/data/') 3 | from normalize_utils import * 4 | def check_new_delete_word(text_path): 5 | new_text = '' 6 | with open(text_path,'r',encoding='utf-8') as f: 7 | for line in f: 8 | new_text += line 9 | S1 = check_not_chinese(new_text) 10 | S2 = set(delete_symbols) 11 | for x in list(S1-S2): 12 | print(x) 13 | exit() 14 | 15 | 16 | if __name__ == '__main__': 17 | text_path = sys.argv[1] 18 | output_path = sys.argv[2] 19 | new_text = '' 20 | #check_new_delete_word(text_path) 21 | word_list = get_word_list('data/lang/words.txt') 22 | with open(text_path,'r',encoding='utf-8') as f: 23 | for line in f: 24 | if 'ETtoday' in line: 25 | continue 26 | line = line.rstrip() 27 | new_text += normalize(line,word_list) + '\n' 28 | with open(output_path,'w',encoding='utf-8') as f: 29 | f.write(new_text) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /local/kaggle/add.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 
from shutil import copyfile 3 | null='開口,師兄,呆子嚇了一跳,起身了,初出江湖人稱把師傅變成,功能不夠涼快,一點,受傷之事,拉進去了,是這樣的效果,你掙扎着下來,還去當你的王太子,標上行裏最高的一張,坐盆裏去,沒人說你有類說,優雅,你可千萬不要偷懶,又要做師傅,趁着假日到花果山來了,阿姐一名男童說,不行不行,前幾天在白骨嶺上,他打死白骨精,我只當玩耍。老和尚揍他,那找和尚當日軍,把它作爲通訊協定書趕走,他,不知怎樣按摩,那風尚版六座,給我來上幾下,還活得成呢,滿滿說,的,會跟你記仇,你見了他別說吃住都難,確實辛苦他了,他見到這種情景,令人氣憤,定會有那怪爭鬥,管叫哪個妖精救出師傅扎針,八戒只有橫下一條心來。' 4 | def read_choose_lm(choose_lm): 5 | d = {} 6 | with open(choose_lm,'r') as f: 7 | for line in f: 8 | tokens = line.rstrip().split() 9 | idx = int(tokens[0][1:].replace('.wav','')) 10 | novel = null 11 | if len(tokens) > 1: 12 | novel = tokens[1] 13 | d[tokens[0]] = novel 14 | return d 15 | d = read_choose_lm(sys.argv[1]) 16 | wav_dir = sys.argv[2] 17 | L = [] 18 | for wav in os.listdir(wav_dir): 19 | if wav.endswith('.wav'): 20 | if wav not in d: 21 | d[wav] = null 22 | with open(sys.argv[3],'w') as f: 23 | for k,v in d.items(): 24 | f.write('{} {} \n'.format(k,v)) 25 | 26 | 27 | -------------------------------------------------------------------------------- /local/kaggle/test/select_lm.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | def process_C2(src_dir,dirname,C_lang_dir,lang): 4 | idx = int(dirname[1:]) 5 | lm = os.path.join(C_lang_dir,lang,'rescore') 6 | if lm is not None: 7 | rescore_lang = os.path.join(src_dir,dirname,'rescore_lang') 8 | with open(rescore_lang,'w') as f: 9 | f.write(lm) 10 | 11 | if __name__ == '__main__': 12 | src_dir = sys.argv[1] 13 | C_lang_dir = sys.argv[2] 14 | d_list1 = [] 15 | C_langs = os.listdir(C_lang_dir) 16 | lang_id = 0 17 | for dirname in os.listdir(src_dir): 18 | if not os.path.isdir(os.path.join(src_dir,dirname)): 19 | continue 20 | idx = int(dirname[1:]) 21 | typ = dirname[0] 22 | if idx % 3 == 0 and idx <= 1500: 23 | use_gpu = os.path.join(src_dir,dirname,'use_gpu') 24 | with open(use_gpu,'w') as f: 25 | f.write('yes') 26 | if typ == 'C': 27 | process_C2(src_dir,dirname,C_lang_dir,C_langs[lang_id]) 28 | lang_id += 1 29 | 30 | -------------------------------------------------------------------------------- /local/lm/prune_all_lm.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . path.sh 3 | lm_dir=data/LM 4 | lang=data/wfst/lang 5 | choice_fst=data/wfst/lang_test/choice.fst 6 | words=$lang/words.txt 7 | 8 | for x in $lm_dir/*C ; do 9 | if [ -d $x ]; then 10 | ( 11 | ngram -lm $x.lm -prune 2e-7 -write-lm $x\_pruned.lm 12 | xdir=$x\_pruned 13 | xlm=$x\_pruned.lm 14 | cp -r $lang $xdir 15 | cat $xlm | arpa2fst --disambig-symbol=#0 \ 16 | --read-symbol-table=$words - | fstarcsort --sort_type=olabel > $xdir/G.fst 17 | 18 | ## compile Ldet.fst 19 | phi=`grep -w '#0' $words | awk '{print $2}'` 20 | 21 | fstprint $xdir/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ 22 | fstdeterminizestar | fstrmsymbols $xdir/phones/disambig.int > $xdir/Ldet.fst || exit 1; 23 | 24 | mv $xdir/G.fst $xdir/G_head.fst 25 | fsttablecompose $xdir/G_head.fst $choice_fst | \ 26 | fstdeterminizestar --use-log=true | \ 27 | fstminimizeencoded > $xdir/G.fst 28 | ) & 29 | fi 30 | done 31 | 32 | wait 33 | 34 | 35 | -------------------------------------------------------------------------------- /local/lm/wfst/temp2.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . 
path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | model_dir=exp/tri4a 8 | LM=data/LM 9 | text=data/text 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | 13 | #LM training 14 | mkdir -p $LM/3gram 15 | #PYTHONENCODING=utf-8 python3 local/lm/get_all_choices.py #> $wfst/kaggle12_C.txt 16 | 17 | #ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 18 | 19 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle123_C.lm $LM/3gram/mix.lm \ 20 | $LM/3gram/kaggle1234_C.lm $text/kaggle4_C.txt $LM/3gram/ori_C_10.0.lm 21 | 22 | 23 | lm=$LM/3gram/ori_C_10.0.lm 24 | lang_test=./data/wfst/lang_test_pr10_C 25 | graph_dir=exp/tri4a/graph_pr10_C 26 | #G compilation and check L and G stochastic 27 | local/kaggle/wfst/format_data.sh $lm $lang $lang_test 28 | 29 | #compose HCLG(choice) 30 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /local/lm/dirty/format_lm_from_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | set -euo pipefail 4 | 5 | text_dir=lm_test/new_text 6 | text_test=lm_test/text_test 7 | LM=lm_test/LM 8 | novel=$1 9 | 10 | opencc -i $text_dir/$novel.txt -o $text_dir/$novel\_tra.txt 11 | 12 | PYTHONIOENCODING=utf-8 python3 local/kaggle/parse_text.py $text_dir/$novel\_tra.txt $text_dir/$novel\_norm.txt 13 | 14 | ngram-count -text $text_dir/$novel\_norm.txt -lm $text_test/$novel\.lm -vocab $text_test/vocab.txt -limit-vocab -order 4 15 | 16 | ngram -lm $text_test/ori.lm -mix-lm $text_test/kaggle123_A.lm -lambda 0.15 -mix-lm2 $text_test/$novel.lm \ 17 | -mix-lambda2 0.8 -write-lm $LM/$novel\_A\.lm 18 | ngram -lm $text_test/ori.lm -mix-lm $text_test/kaggle123_B.lm -lambda 0.16 -mix-lm2 $text_test/$novel.lm \ 19 | -mix-lambda2 0.35 -write-lm $LM/$novel\_B\.lm 20 | ngram -lm $text_test/ori.lm -mix-lm $text_test/kaggle123_C.lm -lambda 0.13 -mix-lm2 $text_test/$novel.lm \ 21 | -mix-lambda2 0.35 -write-lm $LM/$novel\_C\.lm 22 | 23 | for x in A B C ; do 24 | lm=$LM/$novel\_$x.lm 25 | lm_test/local/compile_lm.sh $lm & 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /utils/s2eps.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script replaces <s> and </s> with <eps> (on both input and output sides), 18 | # for the G.fst acceptor. 
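# For example, an arc line whose input and output labels are <s> or </s>,
# e.g. "0 1 <s> <s> 4.3", comes out with both labels replaced by <eps> (and the
# fields re-joined with tabs); lines with fewer than four fields, such as
# final-state lines, pass through unchanged.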
19 | 20 | while(<>){ 21 | @A = split(" ", $_); 22 | if ( @A >= 4 ) { 23 | if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; } 24 | if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; } 25 | } 26 | print join("\t", @A) . "\n"; 27 | } 28 | -------------------------------------------------------------------------------- /utils/eps2disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2015 Guoguo Chen 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This script replaces epsilon with #0 on the input side only, of the G.fst 19 | # acceptor. 20 | 21 | while(<>){ 22 | if (/\s+#0\s+/) { 23 | print STDERR "$0: ERROR: LM has word #0, " . 24 | "which is reserved as disambiguation symbol\n"; 25 | exit 1; 26 | } 27 | s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:; 28 | print; 29 | } 30 | -------------------------------------------------------------------------------- /local/kaggle/check_output.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | sys.path.append('local/data/') 3 | from normalize_utils import * 4 | src_dir=sys.argv[1] 5 | d_list = [] 6 | for d in os.listdir(src_dir): 7 | if os.path.isdir(os.path.join(src_dir,d)): 8 | d_list.append(d) 9 | output_path = os.path.join(src_dir,'output.txt') 10 | 11 | L = read_outputs(output_path) 12 | missing_trans = [] 13 | 14 | for name,trans in L: 15 | f_name = name.replace('.wav','') 16 | if len(trans) == 0 : 17 | missing_trans.append(name) 18 | 19 | 20 | 21 | missing_files =[] 22 | L2 = [x.replace('.wav','') for x,y in L] 23 | for d in d_list: 24 | if d not in L2: 25 | missing_files.append(d+'.wav') 26 | 27 | wrong = False 28 | if len(missing_files) > 0: 29 | wrong = True 30 | for f in missing_files: 31 | print(f) 32 | print("Missing {} files.".format(len(missing_files))) 33 | if len(missing_trans) > 0: 34 | wrong = True 35 | for f in missing_trans: 36 | print(f) 37 | print("Missing {} trans.".format(len(missing_trans))) 38 | if not wrong: 39 | print("All wav files have outputs.") 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /local/data/data_prep_TOCFL.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | 3 | def main(tocfl_path,file_type): 4 | wavdir_path = os.path.join(tocfl_path,'wav') 5 | wavdir_path = os.path.abspath(wavdir_path) 6 | txt_path = os.path.join(tocfl_path,'txt') 7 | for filename in os.listdir(wavdir_path): 8 | wav_label = filename.split('.')[0] 9 | wav_path = os.path.join(wavdir_path,filename) 10 | txt_file = os.path.join(txt_path,wav_label+'.txt') 11 | txt = open(txt_file,'r',encoding='UTF-8').read() 12 | trans = txt.rstrip() 13 | #trans = ' '.join(list(trans)) 14 | if file_type == 'text': 15 | sys.path.append('local/data/tool/jieba-zh_TW') 16 | import jieba 17 | trans = ' 
'.join(jieba.cut(trans)) 18 | trans = trans.upper() 19 | print(wav_label,trans) 20 | elif file_type == 'wav.scp': 21 | print(wav_label,wav_path) 22 | elif file_type == 'utt2spk': 23 | print(wav_label, wav_label) 24 | if __name__ == '__main__': 25 | tocfl_path = sys.argv[1] 26 | file_type = sys.argv[2] 27 | main(tocfl_path,file_type) 28 | 29 | -------------------------------------------------------------------------------- /local/data/data_prep_NER.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | 3 | def main(corpus_path,file_type): 4 | for root, dirs, files in os.walk(corpus_path, topdown=False): 5 | for name in files: 6 | if name.endswith('.wav'): 7 | wav_label = name.split('.')[0] 8 | wav_path = os.path.join(root,name) 9 | wav_path = os.path.abspath(wav_path) 10 | 11 | txt_path = wav_path.replace('Wav','Text').replace('.wav','.txt') 12 | if not os.path.isfile(txt_path): 13 | continue 14 | trans = open(txt_path,'r', encoding='utf-8').read() 15 | trans = trans.rstrip() 16 | trans = trans.upper() 17 | 18 | if file_type == 'wav.scp': 19 | print(wav_label, wav_path) 20 | elif file_type == 'utt2spk': 21 | print(wav_label, wav_label) 22 | elif file_type == 'text': 23 | print(wav_label, trans) 24 | if __name__ == '__main__': 25 | corpus_path = sys.argv[1] 26 | file_type = sys.argv[2] 27 | main(corpus_path,file_type) 28 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | iflytek_A_text=$1 4 | test_dir=$2 5 | output=$3 6 | mkdir -p $test_dir 7 | export LC_ALL='en_US.utf8' 8 | for lm in ori news 20years nie guan laotsan water journey_west red_mansion 3kingdom beauty_n hunghuang lai_ho old_time one_gan lu_shun ; do 9 | ( 10 | cat $iflytek_A_text | PYTHOIOENCODING="utf-8" python3 -c " 11 | import sys 12 | sys.path.append('local/data/') 13 | from normalize_utils import * 14 | for line in sys.stdin.readlines(): 15 | start = line.find(' ') 16 | token1 = line.split()[0] 17 | tex = normalize(line[start:].replace(' ','')) 18 | print(tex) 19 | " | ngram -lm data/LM/$lm\_A.lm -ppl - -debug 1 | PYTHOIOENCODING=utf-8 python3 -c " 20 | import sys 21 | for line in sys.stdin.readlines(): 22 | if 'zeroprobs' in line: 23 | start = line.find('ppl=') 24 | endd = line.find('ppl1=') 25 | print(line[start+5:endd]) 26 | if line.startswith('file'): 27 | break 28 | " > $test_dir/$lm 29 | ) & 30 | done 31 | wait 32 | PYTHOIOENCODING="utf-8" python3 local/kaggle/choose_lm2.py $iflytek_A_text $test_dir > $output 33 | 34 | 35 | echo "Done choose_lm.sh." 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /utils/data/extract_wav_segments_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Hossein Hadian 4 | # Apache 2.0 5 | 6 | # This script copies a data directory (which has a 'segments' file), extracting 7 | # wav segments (according to the 'segments' file) 8 | # so that the resulting data directory does not have a 'segments' file anymore. 9 | 10 | . utils/parse_options.sh 11 | . ./path.sh 12 | 13 | if [ $# != 2 ]; then 14 | echo "Usage: $0 " 15 | echo " This script copies data directory to and gets" 16 | echo "rid of the 'segments' file by extracting the wav segments." 
17 | exit 1; 18 | fi 19 | 20 | 21 | export LC_ALL=C 22 | 23 | srcdir=$1 24 | dir=$2 25 | 26 | 27 | if ! mkdir -p $dir/data; then 28 | echo "$0: failed to create directory $dir/data" 29 | exit 1 30 | fi 31 | 32 | set -e -o pipefail 33 | utils/copy_data_dir.sh $srcdir $dir 34 | 35 | extract-segments scp:$srcdir/wav.scp $srcdir/segments \ 36 | ark,scp:$dir/data/wav_segments.ark,$dir/data/wav_segments.scp 37 | cat $dir/data/wav_segments.scp | awk '{ print $1 " wav-copy " $2 " - |" }' >$dir/wav.scp 38 | rm $dir/reco2file_and_channel || true 39 | -------------------------------------------------------------------------------- /steps/nnet2/get_ivector_id.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2016, Johns Hopkins University (Yenda Trmal ) 3 | # License: Apache 2.0 4 | 5 | # Begin configuration section. 6 | # End configuration section 7 | set -e -o pipefail 8 | set -o nounset # Treat unset variables as an error 9 | 10 | # End configuration section. 11 | 12 | #echo >&2 "$0 $@" # Print the command line for logging 13 | 14 | if [ -f path.sh ]; then . ./path.sh; fi 15 | . parse_options.sh || exit 1; 16 | 17 | 18 | if [ $# != 1 ]; then 19 | echo >&2 "Usage: $0 " 20 | echo >&2 " e.g.: $0 exp/nnet3/extractor" 21 | exit 1 22 | fi 23 | 24 | ivecdir=$1 25 | 26 | if [ -f $ivecdir/final.ie.id ] ; then 27 | cat $ivecdir/final.ie.id 28 | elif [ -f $ivecdir/final.ie ] ; then 29 | # note the creation can fail in case the extractor directory 30 | # is not read-only media or the user des not have access rights 31 | # in that case we will just behave as if the id is not available 32 | id=$(md5sum $ivecdir/final.ie | awk '{print $1}') 33 | echo "$id" > $ivecdir/final.ie.id || true 34 | echo "$id" 35 | else 36 | exit 0 37 | fi 38 | 39 | exit 0 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /local/lm/dirty/mix_all_lms.sh: -------------------------------------------------------------------------------- 1 | #local/mix_lm2.sh text_test/ori.lm text_test/kaggle12_A.lm text_test/kaggle3_A.txt LM/A_kaggle12.lm 2 | #local/mix_lm2.sh text_test/ori.lm text_test/kaggle12_B.lm text_test/kaggle3_B.txt LM/B_kaggle12.lm 3 | #local/mix_lm2.sh text_test/ori.lm text_test/kaggle12_C.lm text_test/kaggle3_C.txt LM/C_kaggle12.lm 4 | 5 | #local/mix_lm2_test.sh text_test/ori.lm text_test/kaggle12_A.lm text_test/kaggle123_A.lm text_test/kaggle3_A.txt LM/A.lm 6 | #local/mix_lm2_test.sh text_test/ori.lm text_test/kaggle12_B.lm text_test/kaggle123_B.lm text_test/kaggle3_B.txt LM/B.lm 7 | #local/mix_lm2_test.sh text_test/ori.lm text_test/kaggle12_C.lm text_test/kaggle123_C.lm text_test/kaggle3_C.txt LM/C.lm 8 | 9 | for novel in 3kingdom journey_west red_mansion hunghuang ; do 10 | for x in A B C ; do 11 | local/mix_lm3.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\_kaggle12.lm 12 | local/mix_lm3_test.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/kaggle123_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\.lm 13 | done 14 | done 15 | for x in LM/*.lm ; do 16 | ( 17 | local/compile_lm.sh $x 18 | ) & 19 | done 20 | wait 21 | -------------------------------------------------------------------------------- /steps/conf/convert_ctm_to_tra.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 
5 | 6 | import sys, operator 7 | 8 | # This scripts loads a 'ctm' file and converts it into the 'tra' format: 9 | # "utt-key word1 word2 word3 ... wordN" 10 | # The 'utt-key' is the 1st column in the CTM. 11 | 12 | # Typically the CTM contains: 13 | # - utterance-relative timimng (i.e. prepared without 'utils/convert_ctm.pl') 14 | # - confidences 15 | 16 | if len(sys.argv) != 3: 17 | print 'Usage: %s ctm-in tra-out' % __file__ 18 | sys.exit(1) 19 | dummy, ctm_in, tra_out = sys.argv 20 | 21 | if ctm_in == '-': ctm_in = '/dev/stdin' 22 | if tra_out == '-': tra_out = '/dev/stdout' 23 | 24 | # Load the 'ctm' into dictionary, 25 | tra = dict() 26 | with open(ctm_in) as f: 27 | for l in f: 28 | utt, ch, beg, dur, wrd, conf = l.split() 29 | if not utt in tra: tra[utt] = [] 30 | tra[utt].append((float(beg),wrd)) 31 | 32 | # Store the in 'tra' format, 33 | with open(tra_out,'w') as f: 34 | for utt,tuples in tra.iteritems(): 35 | tuples.sort(key = operator.itemgetter(0)) # Sort by 'beg' time, 36 | f.write('%s %s\n' % (utt,' '.join([t[1] for t in tuples]))) 37 | 38 | -------------------------------------------------------------------------------- /local/kaggle/mix_LM_with_A.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | 4 | A_outputs=$1 5 | C_lang_dir=$2 6 | thread_num=15 7 | 8 | mix_lm(){ 9 | dir=$1 10 | Alm=$dir/A.lm 11 | orilm=`cat $dir/lm_path` 12 | echo $dir 13 | ngram-count -text $dir/A.txt -order 4 -lm $Alm 14 | ngram -lm $orilm -mix-lm $Alm -lambda 0.6582 -write-lm $dir/rescore.lm -limit-vocab -vocab ./lm_test/text/vocab.txt 15 | mkdir -p $dir/rescore 16 | cp -r data/lang/* $dir/rescore 17 | cat $dir/rescore.lm | \ 18 | arpa2fst --disambig-symbol=#0 \ 19 | --read-symbol-table=$dir/rescore/words.txt - $dir/rescore/G.fst || exit 1; 20 | rm $dir/rescore.lm 21 | 22 | newlang=$dir/rescore 23 | phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'` 24 | fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ 25 | fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$newlang/Ldet.fst || exit 1; 26 | } 27 | 28 | export -f mix_lm 29 | mkdir -p $C_lang_dir 30 | 31 | startt=`date +%s` 32 | python3 local/kaggle/mix_LM_with_A.py $A_outputs $C_lang_dir kaggle4_lm 33 | 34 | parallel -j $thread_num "mix_lm {}" ::: $C_lang_dir/* 35 | endt=`date +%s` 36 | runtime=$((endt-startt)) 37 | echo "Total time $runtime seconds" 38 | -------------------------------------------------------------------------------- /steps/segmentation/internal/verify_phones_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """This script verifies the list of phones read from stdin are valid 7 | phones present in lang/phones.txt.""" 8 | 9 | import argparse 10 | import sys 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser(description=""" 14 | This script verifies the list of phones read from stdin are valid 15 | phones present in lang/phones.txt.""") 16 | 17 | parser.add_argument("phones", type=str, 18 | help="File containing the list of all phones as the " 19 | "first column") 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def main(): 26 | args = get_args() 27 | phones = set() 28 | for line in open(args.phones): 29 | phones.add(line.strip().split()[0]) 30 | 31 | for line in sys.stdin.readlines(): 32 | p = line.strip() 33 | 34 | if p not in phones: 35 | 
sys.stderr.write("Could not find phone {p} in {f}" 36 | "\n".format(p=p, f=args.phones)) 37 | raise SystemExit(1) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /local/kaggle/data_prep_wav_seperate.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | from shutil import copyfile 3 | 4 | if __name__ == '__main__': 5 | wav_dir = sys.argv[1] 6 | data_dir = sys.argv[2] 7 | for dirPath, dirNames, fileNames in os.walk(sys.argv[1]): 8 | for fname in fileNames: 9 | if fname.endswith('.wav'): 10 | file_path = os.path.join(dirPath, fname) 11 | file_path = os.path.abspath(file_path) 12 | name = fname.replace('.wav','') 13 | decode_dir = os.path.join(data_dir,name,'data') 14 | if not os.path.isdir(decode_dir): 15 | os.makedirs(decode_dir) 16 | #os.symlink(file_path, os.path.join(decode_dir,fname)) 17 | utt2spk_path = os.path.join(decode_dir,'utt2spk') 18 | wavscp_path = os.path.join(decode_dir,'wav.scp') 19 | spk2utt_path = os.path.join(decode_dir,'spk2utt') 20 | with open(utt2spk_path,'w') as f: 21 | f.write(fname + ' ' + fname) 22 | with open(wavscp_path,'w') as f: 23 | f.write(fname + ' ' + file_path) 24 | with open(spk2utt_path,'w') as f: 25 | f.write(fname + ' ' + fname) 26 | 27 | -------------------------------------------------------------------------------- /steps/conf/lattice_depth_per_frame.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015 Brno University of Technology (Author: Karel Vesely) 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | 5 | # Extract lattice-depth for each frame. 6 | 7 | # Begin configuration 8 | cmd=run.pl 9 | # End configuration 10 | 11 | echo "$0 $@" # Print the command line for logging 12 | 13 | [ -f path.sh ] && . ./path.sh # source the path. 14 | . parse_options.sh || exit 1; 15 | 16 | if [ $# != 2 ]; then 17 | echo "usage: $0 [opts] " 18 | echo "main options (for others, see top of script file)" 19 | echo " --config # config containing options" 20 | echo " --cmd" 21 | exit 1; 22 | fi 23 | 24 | set -euo pipefail 25 | 26 | latdir=$1 27 | dir=$2 28 | 29 | [ ! -f $latdir/lat.1.gz ] && echo "Missing $latdir/lat.1.gz" && exit 1 30 | nj=$(cat $latdir/num_jobs) 31 | 32 | # Get the pdf-posterior vectors, 33 | $cmd JOB=1:$nj $dir/log/lattice_depth_per_frame.JOB.log \ 34 | lattice-depth-per-frame "ark:gunzip -c $latdir/lat.JOB.gz |" ark,t:$dir/lattice_frame_depth.JOB.ark 35 | # Merge, 36 | for ((n=1; n<=nj; n++)); do cat $dir/lattice_frame_depth.${n}.ark; done >$dir/lattice_frame_depth.ark 37 | rm $dir/lattice_frame_depth.*.ark 38 | 39 | # Done! 
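# The merged $dir/lattice_frame_depth.ark is a text archive: each line holds an
# utterance-id followed by one integer per frame, i.e. the number of lattice
# arcs crossing that frame.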
40 | -------------------------------------------------------------------------------- /steps/conf/parse_arpa_unigrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | import sys, gzip, re 7 | 8 | # Parse options, 9 | if len(sys.argv) != 4: 10 | print "Usage: %s " % __file__ 11 | sys.exit(0) 12 | words_txt, arpa_gz, unigrams_out = sys.argv[1:] 13 | 14 | if arpa_gz == '-': arpa_gz = '/dev/stdin' 15 | if unigrams_out == '-': unigrams_out = '/dev/stdout' 16 | 17 | # Load the words.txt, 18 | words = [ l.split() for l in open(words_txt) ] 19 | 20 | # Load the unigram probabilities in 10log from ARPA, 21 | wrd_log10 = dict() 22 | with gzip.open(arpa_gz,'r') as f: 23 | read = False 24 | for l in f: 25 | if l.strip() == '\\1-grams:': read = True 26 | if l.strip() == '\\2-grams:': break 27 | if read and len(l.split())>=2: 28 | log10_p_unigram, wrd = re.split('[\t ]+',l.strip(),2)[:2] 29 | wrd_log10[wrd] = float(log10_p_unigram) 30 | 31 | # Create list, 'wrd id log_p_unigram', 32 | words_unigram = [[wrd, id, (wrd_log10[wrd] if wrd in wrd_log10 else -99)] for wrd,id in words ] 33 | 34 | print >>sys.stderr, words_unigram[0] 35 | # Store, 36 | with open(unigrams_out,'w') as f: 37 | f.writelines(['%s %s %g\n' % (w,i,p) for (w,i,p) in words_unigram]) 38 | 39 | -------------------------------------------------------------------------------- /local/kaggle/replace_choice.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | sys.path.append('local/data/') 3 | from parse_choices import * 4 | from normalize_utils import * 5 | 6 | def process_outputs(outputs): 7 | L = read_outputs(outputs) 8 | L2 = [] 9 | for name,trans in L: 10 | idx = int(name[1:].replace('.wav','')) 11 | trans = trans.replace(' ','') 12 | L2.append((idx,trans)) 13 | L2 =sorted(L2, key=lambda s: s[0]) 14 | return L2 15 | def write_d(key,X_list,L): 16 | for idx,value in X_list: 17 | for i,l in enumerate(L): 18 | if l["id"] == idx: 19 | L[i][key] = value 20 | break 21 | return L 22 | 23 | 24 | if __name__ == '__main__': 25 | C_outputs = sys.argv[1] 26 | iflytek_json = sys.argv[2] 27 | output_json = sys.argv[3] 28 | d = {} 29 | 30 | C_list = process_outputs(C_outputs) 31 | 32 | C_list_parse = [] 33 | 34 | for idx,trans in C_list: 35 | options = parse(trans) 36 | C_list_parse.append((idx,options)) 37 | 38 | with open(iflytek_json,'r',encoding='utf8') as f: 39 | L = json.load(f) 40 | 41 | L = write_d("options",C_list_parse,L) 42 | with open(output_json,'w',encoding='utf8') as f: 43 | json.dump(L,f,indent=4,ensure_ascii=False) 44 | 45 | -------------------------------------------------------------------------------- /utils/build_const_arpa_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | # This script reads in an Arpa format language model, and converts it into the 7 | # ConstArpaLm format language model. 8 | 9 | # begin configuration section 10 | # end configuration section 11 | 12 | [ -f path.sh ] && . ./path.sh; 13 | 14 | . 
utils/parse_options.sh 15 | 16 | if [ $# != 3 ]; then 17 | echo "Usage: " 18 | echo " $0 [options] " 19 | echo "e.g.:" 20 | echo " $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed" 21 | echo "Options" 22 | exit 1; 23 | fi 24 | 25 | export LC_ALL=C 26 | 27 | arpa_lm=$1 28 | old_lang=$2 29 | new_lang=$3 30 | 31 | mkdir -p $new_lang 32 | 33 | mkdir -p $new_lang 34 | cp -r $old_lang/* $new_lang 35 | 36 | unk=`cat $new_lang/oov.int` 37 | bos=`grep -w "" $new_lang/words.txt | awk '{print $2}'` 38 | eos=`grep "" $new_lang/words.txt | awk '{print $2}'` 39 | if [[ -z $bos || -z $eos ]]; then 40 | echo "$0: and symbols are not in $new_lang/words.txt" 41 | exit 1 42 | fi 43 | 44 | 45 | arpa-to-const-arpa --bos-symbol=$bos \ 46 | --eos-symbol=$eos --unk-symbol=$unk \ 47 | "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1; 48 | 49 | exit 0; 50 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M.proto: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S.proto: -------------------------------------------------------------------------------- 1 | 2 | 720 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/create_oov_char_lexicon.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na) 3 | # 4 | # A script for char-based Chinese OOV lexicon generation. 5 | # 6 | # Input 1: char-based dictionary, example 7 | # CHAR1 ph1 ph2 8 | # CHAR2 ph3 9 | # CHAR3 ph2 ph4 10 | # 11 | # Input 2: OOV word list, example 12 | # WORD1 13 | # WORD2 14 | # WORD3 15 | # 16 | # where WORD1 is in the format of "CHAR1CHAR2". 
17 | # 18 | # Output: OOV lexicon, in the format of normal lexicon 19 | 20 | if($#ARGV != 1) { 21 | print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n"; 22 | print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n"; 23 | print STDERR "### oovwordlist: OOV word list\n"; 24 | print STDERR "### oovlex: output OOV lexicon\n"; 25 | exit; 26 | } 27 | 28 | use utf8; 29 | my %prons; 30 | open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); 31 | foreach () { 32 | chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; 33 | } 34 | close DICT; 35 | 36 | open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); 37 | while () { 38 | chomp; 39 | print $_; 40 | @A = split("", $_); 41 | foreach (@A) { 42 | print " $prons{$_}"; 43 | } 44 | print "\n"; 45 | } 46 | close WORDS; 47 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M.proto.8136: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 8136 1 17 | 8136 8136 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 720 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S.proto.8136: -------------------------------------------------------------------------------- 1 | 2 | 720 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 8136 1 17 | 8136 8136 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M_ivector.proto: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 
| 512 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S_ivector.proto: -------------------------------------------------------------------------------- 1 | 2 | 1020 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/kaggle/test/decode_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | all_wav_dir=$1 3 | dir=$2 4 | C_lang_dir=$3 5 | nnet_dir=exp/nnet/tri4a_DFSMN_L_woiv_nnet_ali 6 | nnet_dir=exp/nnet/tri4a_DFSMN_S_woiv 7 | 8 | export nnet_dir=$nnet_dir 9 | thread_num=100 10 | 11 | asr() { 12 | wav_dir=$1 13 | rescore_lang=data/lang_4large_test 14 | use_gpu="no" 15 | 16 | if [ -f $wav_dir/rescore_lang ]; then 17 | rescore_lang=`cat $wav_dir/rescore_lang` 18 | fi 19 | 20 | if [ -f $wav_dir/use_gpu ]; then 21 | use_gpu=`cat $wav_dir/use_gpu` 22 | fi 23 | 24 | local/nnet/decode_from_wav.sh \ 25 | --rescore_lang $rescore_lang \ 26 | --fbank_nj 1 \ 27 | --decode_nj 1 \ 28 | --stage 1 \ 29 | --use_gpu $use_gpu \ 30 | $wav_dir $nnet_dir $wav_dir > /dev/null || echo "error decoding $wav_dir" 31 | rm -r $wav_dir/data $wav_dir/final.mdl 32 | cat $wav_dir/output.txt >> $wav_dir/../output.txt 33 | echo "Done $wav_dir files" 34 | } 35 | 36 | export -f asr 37 | 38 | mkdir -p $dir 39 | startt=`date +%s` 40 | python3 local/data/data_prep_wav_seperate.py $all_wav_dir $dir 41 | python3 local/nnet/test/select_lm.py $dir $C_lang_dir 42 | 43 | parallel -j $thread_num "asr {}" ::: $dir/* 44 | 45 | endt=`date +%s` 46 | runtime=$((endt-startt)) 47 | echo "Total time $runtime seconds" 48 | echo "Total time $runtime seconds" > $dir/run_time 49 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M_ivector.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S_ivector.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 1020 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /utils/data/get_utt2num_frames.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | nj=4 8 | 9 | frame_shift=0.01 10 | frame_overlap=0.015 11 | 12 | . 
utils/parse_options.sh 13 | 14 | if [ $# -ne 1 ]; then 15 | echo "This script writes a file utt2num_frames with the " 16 | echo "number of frames in each utterance as measured based on the " 17 | echo "duration of the utterances (in utt2dur) and the specified " 18 | echo "frame_shift and frame_overlap." 19 | echo "Usage: $0 " 20 | exit 1 21 | fi 22 | 23 | data=$1 24 | 25 | if [ -s $data/utt2num_frames ]; then 26 | echo "$0: $data/utt2num_frames already present!" 27 | exit 0; 28 | fi 29 | 30 | if [ ! -f $data/feats.scp ]; then 31 | utils/data/get_utt2dur.sh $data 32 | awk -v fs=$frame_shift -v fovlp=$frame_overlap \ 33 | '{print $1" "int( ($2 - fovlp) / fs)}' $data/utt2dur > $data/utt2num_frames 34 | exit 0 35 | fi 36 | 37 | utils/split_data.sh --per-utt $data $nj || exit 1 38 | $cmd JOB=1:$nj $data/log/get_utt2num_frames.JOB.log \ 39 | feat-to-len scp:$data/split${nj}utt/JOB/feats.scp ark,t:$data/split${nj}utt/JOB/utt2num_frames || exit 1 40 | 41 | for n in `seq $nj`; do 42 | cat $data/split${nj}utt/$n/utt2num_frames 43 | done > $data/utt2num_frames 44 | 45 | echo "$0: Computed and wrote $data/utt2num_frames" 46 | -------------------------------------------------------------------------------- /utils/summarize_warnings.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | 5 | @ARGV != 1 && print STDERR "Usage: summarize_warnings.pl \n" && exit 1; 6 | 7 | $dir = $ARGV[0]; 8 | 9 | ! -d $dir && print STDERR "summarize_warnings.pl: no such directory $dir\n" && exit 1; 10 | 11 | $dir =~ s:/$::; # Remove trailing slash. 12 | 13 | 14 | # Group the files into categories where all have the same base-name. 15 | foreach $f (glob ("$dir/*.log")) { 16 | $f_category = $f; 17 | # do next expression twice; s///g doesn't work as they overlap. 18 | $f_category =~ s:\.\d+\.:.*.:; 19 | $f_category =~ s:\.\d+\.:.*.:; 20 | $fmap{$f_category} .= " $f"; 21 | } 22 | 23 | sub split_hundreds { # split list of filenames into groups of 100. 
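# Batching the log files into groups of 100 presumably keeps each `grep -w WARNING` invocation below the shell's maximum argument-list length when a directory holds thousands of logs.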
24 | my $names = shift @_; 25 | my @A = split(" ", $names); 26 | my @ans = (); 27 | while (@A > 0) { 28 | my $group = ""; 29 | for ($x = 0; $x < 100 && @A>0; $x++) { 30 | $fname = pop @A; 31 | $group .= "$fname "; 32 | } 33 | push @ans, $group; 34 | } 35 | return @ans; 36 | } 37 | 38 | foreach $c (keys %fmap) { 39 | $n = 0; 40 | foreach $fgroup (split_hundreds($fmap{$c})) { 41 | $n += `grep -w WARNING $fgroup | wc -l`; 42 | } 43 | if ($n != 0) { 44 | print "$n warnings in $c\n" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /local/kaggle/demo.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import parse_choices as pc 3 | 4 | def asr(A_path, B_path, C_path): 5 | # Inputs : 6 | # A_path : path of context wav 7 | # B_path : path of question wav 8 | # C_path : path of option wav 9 | # Outputs : 10 | # {"context":"","question":"","options":["","","",""], "answer":-1} 11 | 12 | outputs = subprocess.check_output(['bash' ,'/data/local/kgb/Chinese-ASR/local/kaggle/decode_demo.sh', A_path, B_path, C_path ]) 13 | d = {"context":"","question":"","options":["","","",""], "answer":-1} 14 | for line in outputs.decode('utf-8').split('\n'): 15 | if len(line) == 0 : 16 | continue 17 | line = line.replace('','') 18 | tokens = line.split() 19 | typ = tokens[0] 20 | trans = ' '.join(tokens[1:]) 21 | if typ == 'A': 22 | d['context'] = trans 23 | elif typ == 'B': 24 | d['question'] = trans 25 | elif typ == 'C': 26 | d['options'] = pc.parse(trans.replace(' ','')) 27 | return d 28 | 29 | if __name__ == '__main__': 30 | A_path = '/data/local/kgb/Chinese-ASR/one_qa/A0001500.wav' 31 | B_path = '/data/local/kgb/Chinese-ASR/one_qa/B0001500.wav' 32 | C_path = '/data/local/kgb/Chinese-ASR/one_qa/C0001500.wav' 33 | d = asr(A_path,B_path,C_path) 34 | print(d) 35 | -------------------------------------------------------------------------------- /local/kaggle/mix_LM_with_A.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('local/data/') 4 | from normalize_utils import * 5 | 6 | def read_choose_lm(lm_file): 7 | d = {} 8 | with open(lm_file,'r') as f: 9 | for line in f: 10 | tokens = line.rstrip().split() 11 | name = tokens[0][1:].replace('.wav','') 12 | idx = int(name) 13 | novel = tokens[1] 14 | d[idx] = novel 15 | return d 16 | def process_C(idx): 17 | lm = "lm_test/LM/"+ d[idx] + "_C.lm" 18 | return lm 19 | if __name__ == '__main__': 20 | A_outputs = sys.argv[1] 21 | C_lang_dir = sys.argv[2] 22 | choose_lm = sys.argv[3] 23 | outputs = read_outputs(A_outputs) 24 | d = read_choose_lm(choose_lm) 25 | for (name,trans) in outputs: 26 | idx = int(name[1:].replace('.wav','')) 27 | name = name.replace('.wav','').replace('A','C') 28 | src_dir = os.path.join(C_lang_dir,name) 29 | os.makedirs(src_dir) 30 | 31 | A_txt_path = os.path.join(src_dir,'A.txt') 32 | with open(A_txt_path,'w',encoding='utf-8') as f: 33 | f.write(trans) 34 | 35 | lm = process_C(idx) 36 | ori_lm = os.path.join(os.getcwd(),lm) 37 | lm_path = os.path.join(src_dir,'lm_path') 38 | with open(lm_path,'w') as f: 39 | f.write(lm) 40 | 41 | 42 | -------------------------------------------------------------------------------- /local/data/extract_wiki.py: -------------------------------------------------------------------------------- 1 | import os,sys,json,re 2 | sys.path.append('local/data/tool/jieba-zh_TW') 3 | import jieba 4 | from opencc import OpenCC 5 | from 
number2chinese import * 6 | 7 | def main(wiki_corpus): 8 | openCC = OpenCC('s2t') 9 | for root, dirs, files in os.walk(wiki_corpus, topdown=False): 10 | for name in files: 11 | txt_path = os.path.join(root,name) 12 | print(txt_path) 13 | with open(txt_path,'r',encoding='utf-8') as f: 14 | for line in f: 15 | d = json.loads(line) 16 | text = d['text'].replace('\n\n','\n') 17 | text = openCC.convert(text) 18 | text = text.upper() 19 | tokens = jieba.cut(text) 20 | new_tokens = [] 21 | for token in tokens: 22 | if re.match('^[0-9]+$',token): 23 | if len(token) > 15: 24 | continue 25 | token = to_chinese(int(token)) 26 | new_tokens.append(token) 27 | text = ' '.join(new_tokens) 28 | if len(text) > 0: 29 | print(text) 30 | 31 | 32 | if __name__ == '__main__': 33 | wiki_corpus = sys.argv[1] 34 | main(wiki_corpus) 35 | 36 | -------------------------------------------------------------------------------- /utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # converts an utt2spk file to a spk2utt file. 18 | # Takes input from the stdin or from a file argument; 19 | # output goes to the standard out. 20 | 21 | if ( @ARGV > 1 ) { 22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; 23 | } 24 | 25 | while(<>){ 26 | @A = split(" ", $_); 27 | @A == 2 || die "Invalid line in utt2spk file: $_"; 28 | ($u,$s) = @A; 29 | if(!$seen_spk{$s}) { 30 | $seen_spk{$s} = 1; 31 | push @spklist, $s; 32 | } 33 | push (@{$spk_hash{$s}}, "$u"); 34 | } 35 | foreach $s (@spklist) { 36 | $l = join(' ',@{$spk_hash{$s}}); 37 | print "$s $l\n"; 38 | } 39 | -------------------------------------------------------------------------------- /steps/online/nnet2/copy_ivector_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Johns Hopkins University (author: Hossein Hadian) 4 | # Apache 2.0 5 | 6 | # This script copies the necessary parts of an online ivector directory 7 | # optionally applying a mapping to the ivector_online.scp file 8 | 9 | utt2orig= 10 | 11 | . utils/parse_options.sh 12 | 13 | if [ $# != 2 ]; then 14 | echo "Usage: " 15 | echo " $0 [options] " 16 | echo "e.g.:" 17 | echo " $0 exp/nnet3/online_ivector_train exp/nnet3/online_ivector_train_fs" 18 | echo "Options" 19 | echo " --utt2orig= # utterance id mapping to use" 20 | exit 1; 21 | fi 22 | 23 | 24 | srcdir=$1 25 | destdir=$2 26 | 27 | if [ ! -f $srcdir/ivector_period ]; then 28 | echo "$0: no such file $srcdir/ivector_period" 29 | exit 1; 30 | fi 31 | 32 | if [ "$destdir" == "$srcdir" ]; then 33 | echo "$0: this script requires and to be different." 
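  # i.e. the source and destination iVector directories must be two different paths.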
34 | exit 1 35 | fi 36 | 37 | set -e; 38 | 39 | mkdir -p $destdir 40 | cp -r $srcdir/{conf,ivector_period} $destdir 41 | if [ -z $utt2orig ]; then 42 | cp $srcdir/ivector_online.scp $destdir 43 | else 44 | utils/apply_map.pl -f 2 $srcdir/ivector_online.scp < $utt2orig > $destdir/ivector_online.scp 45 | fi 46 | cp $srcdir/final.ie.id $destdir 47 | 48 | echo "$0: Copied necessary parts of online ivector directory $srcdir to $destdir" 49 | -------------------------------------------------------------------------------- /local/data/data_prep_Tl.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | 3 | def main(corpus_path,file_type): 4 | for root, dirs, files in os.walk(os.path.join(corpus_path,'syl'), topdown=False): 5 | for name in files: 6 | if name.endswith('.txt'): 7 | txt_path = os.path.join(root,name) 8 | with open(txt_path,'r') as f: 9 | for line in f: 10 | tokens = line.rstrip().split() 11 | wav_file, trans = tokens[-1],tokens[1] 12 | 13 | wav_label = wav_file.split('.')[0] 14 | trans = ' '.join(list(trans)) 15 | trans = trans.upper() 16 | 17 | spk = wav_file.split('_')[0] 18 | wav_path = os.path.join(corpus_path,'Wav/{}/{}'.format(spk,wav_file)) 19 | wav_path = os.path.abspath(wav_path) 20 | if file_type == 'wav.scp': 21 | print(wav_label, wav_path) 22 | elif file_type == 'utt2spk': 23 | print(wav_label, spk) 24 | elif file_type == 'text': 25 | print(wav_label, trans) 26 | if __name__ == '__main__': 27 | corpus_path = sys.argv[1] 28 | file_type = sys.argv[2] 29 | main(corpus_path,file_type) 30 | -------------------------------------------------------------------------------- /local/kaggle/decode_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /data/local/kgb/Chinese-ASR 3 | nnet_dir=exp/nnet/tri4a_DFSMN_S_woiv_aug_ori 4 | mode=4 # mode for lmrescoring 5 | 6 | . ./utils/parse_options.sh 7 | 8 | wavA=$1 9 | wavB=$2 10 | 11 | 12 | export nnet_dir=$nnet_dir 13 | export mode=$mode 14 | export graph=$graph 15 | export rescore_arpa=$rescore_arpa 16 | 17 | asr() { 18 | wav=$1 19 | typ=$2 20 | tmpdir=$3 21 | 22 | wav_dir=$tmpdir/$typ 23 | data_dir=$wav_dir/data 24 | 25 | mkdir -p $data_dir 26 | 27 | name=`basename $wav` 28 | 29 | echo $name $wav > $data_dir/wav.scp 30 | echo $name $name > $data_dir/utt2spk 31 | echo $name $name > $data_dir/spk2utt 32 | 33 | rescore_lang=data/LM/ori_$typ 34 | graph=exp/tri4a/graph_pr10_$typ 35 | 36 | local/kaggle/decode_from_wav.sh \ 37 | --rescore true \ 38 | --rescore_lang $rescore_lang \ 39 | --fbank_nj 1 --mode $mode \ 40 | --decode_nj 1 \ 41 | --stage 1 \ 42 | --graph $graph \ 43 | $wav_dir $nnet_dir $wav_dir > $wav_dir/log || echo "error decoding $wav_dir" 44 | 45 | cp $wav_dir/data/wav.scp $wav_dir 46 | rm -r $wav_dir/data 47 | rm -r $wav_dir/final.mdl 48 | 49 | output=`cat $wav_dir/output.txt | cut -d' ' -f2- ` 50 | echo $typ $output 51 | } 52 | 53 | tmpdir=`mktemp -d` 54 | 55 | 56 | ( asr $wavA A $tmpdir ) & 57 | ( asr $wavB B $tmpdir ) & 58 | 59 | 60 | wait 61 | 62 | rm -r $tmpdir 63 | -------------------------------------------------------------------------------- /steps/word_align_lattices.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Johns Hopkins University (Author: Daniel Povey) 2012 4 | # Apache 2.0. 5 | 6 | # Begin configuration section. 7 | silence_label=0 8 | cmd=run.pl 9 | # End configuration section. 
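# Example invocation (hypothetical paths): steps/word_align_lattices.sh data/lang exp/tri4a/decode_test exp/tri4a/decode_test_wordaligned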
10 | 11 | echo "$0 $@" # Print the command line for logging 12 | 13 | for x in `seq 2`; do 14 | [ "$1" == "--silence-label" ] && silence_label=$2 && shift 2; 15 | [ "$1" == "--cmd" ] && cmd="$2" && shift 2; 16 | done 17 | 18 | if [ $# != 3 ]; then 19 | echo "Word-align lattices (make the arcs sync up with words)" 20 | echo "" 21 | echo "Usage: $0 [options] " 22 | echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--silence-label ]" 23 | exit 1; 24 | fi 25 | 26 | . ./path.sh || exit 1; 27 | 28 | lang=$1 29 | indir=$2 30 | outdir=$3 31 | 32 | mdl=`dirname $indir`/final.mdl 33 | wbfile=$lang/phones/word_boundary.int 34 | 35 | for f in $mdl $wbfile $indir/num_jobs; do 36 | [ ! -f $f ] && echo "word_align_lattices.sh: no such file $f" && exit 1; 37 | done 38 | 39 | mkdir -p $outdir/log 40 | 41 | 42 | cp $indir/num_jobs $outdir; 43 | nj=`cat $indir/num_jobs` 44 | 45 | $cmd JOB=1:$nj $outdir/log/align.JOB.log \ 46 | lattice-align-words --silence-label=$silence_label --test=true \ 47 | $wbfile $mdl "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c >$outdir/lat.JOB.gz" || exit 1; 48 | 49 | -------------------------------------------------------------------------------- /local/nnet/retrain.sh: -------------------------------------------------------------------------------- 1 | . ./path.sh 2 | . ./cmd.sh 3 | 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | . utils/parse_options.sh || exit 1; 9 | 10 | 11 | data=data/train_sp_aug/fbank 12 | ali=exp/aishell2/tri4_ali_train_sp_aug 13 | dnn_model=$1 14 | oridir=$2 15 | visible_gpu=$3 16 | 17 | export CUDA_VISIBLE_DEVICES=$visible_gpu 18 | 19 | ######################### 20 | stage=2 21 | nj=10 22 | 23 | dir=$oridir\_train_more 24 | 25 | lrate=1.95313e-08 26 | mlp_init=$(cat $oridir/.mlp_best) 27 | 28 | if [ $stage -le 3 ]; then 29 | proto=local/nnet/${dnn_model}.proto 30 | ori_num_pdf=`cat $proto |grep "Softmax" |awk '{print $3}'` 31 | echo $ori_num_pdf 32 | new_num_pdf=`gmm-info ./exp/aishell2/tri4_ali_train_sp_aug/final.mdl | grep "number of pdfs" |awk '{print $4}'` 33 | echo $new_num_pdf 34 | new_proto=${proto}.$new_num_pdf 35 | sed -r "s/"$ori_num_pdf"/"$new_num_pdf"/g" $proto > $new_proto 36 | 37 | $cuda_cmd $dir/_train_nnet.log \ 38 | local/nnet/train_more.sh --learn-rate $lrate --nnet-proto $new_proto \ 39 | --start_half_lr 10 --momentum 0.9 \ 40 | --train-tool "nnet-train-fsmn-streams" \ 41 | --feat-type plain --splice 1 \ 42 | --cmvn-opts "--norm-means=true --norm-vars=false" --delta_opts "--delta-order=2" \ 43 | --train-tool-opts "--minibatch-size=4096" \ 44 | --max_iters 7 \ 45 | --split_feats 7 \ 46 | $mlp_init $data data/lang $ali $dir 47 | fi 48 | -------------------------------------------------------------------------------- /utils/data/resample_data_dir.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2018 Xiaohui Zhang 5 | # Apache 2.0. 6 | 7 | if [ $# -ne 2 ]; then 8 | echo "This script adds a sox line in wav.scp to resample the audio at a " 9 | echo "different sampling-rate" 10 | echo "Usage: $0 " 11 | echo " e.g.: $0 8000 data/dev" 12 | exit 1 13 | fi 14 | 15 | freq=$1 16 | dir=$2 17 | 18 | sox=`which sox` || { echo "Could not find sox in PATH"; exit 1; } 19 | 20 | if [ -f $dir/feats.scp ]; then 21 | mkdir -p $dir/.backup 22 | mv $dir/feats.scp $dir/.backup/ 23 | if [ -f $dir/cmvn.scp ]; then 24 | mv $dir/cmvn.scp $dir/.backup/ 25 | fi 26 | echo "$0: feats.scp already exists. 
Moving it to $dir/.backup" 27 | fi 28 | 29 | # After resampling we cannot compute utt2dur from wav.scp any more, 30 | # so we create utt2dur now, in case it's needed later 31 | if [ ! -s $dir/utt2dur ]; then 32 | utils/data/get_utt2dur.sh $dir 1>&2 || exit 1; 33 | fi 34 | 35 | mv $dir/wav.scp $dir/wav.scp.tmp 36 | cat $dir/wav.scp.tmp | python -c "import sys 37 | for line in sys.stdin.readlines(): 38 | splits = line.strip().split() 39 | if splits[-1] == '|': 40 | out_line = line.strip() + ' $sox -t wav - -c 1 -b 16 -t wav - rate $freq |' 41 | else: 42 | out_line = 'cat {0} {1} | $sox -t wav - -c 1 -b 16 -t wav - rate $freq |'.format(splits[0], ' '.join(splits[1:])) 43 | print (out_line)" > ${dir}/wav.scp 44 | rm $dir/wav.scp.tmp 45 | 46 | -------------------------------------------------------------------------------- /utils/shuffle_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | if ($ARGV[0] eq "--srand") { 20 | $n = $ARGV[1]; 21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; 22 | srand($ARGV[1]); 23 | shift; 24 | shift; 25 | } else { 26 | srand(0); # Gives inconsistent behavior if we don't seed. 27 | } 28 | 29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we 30 | # don't understand. 31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; 32 | print "randomizes the order of lines of input.\n"; 33 | exit(1); 34 | } 35 | 36 | @lines; 37 | while (<>) { 38 | push @lines, [ (rand(), $_)] ; 39 | } 40 | 41 | @lines = sort { $a->[0] cmp $b->[0] } @lines; 42 | foreach $l (@lines) { 43 | print $l->[1]; 44 | } 45 | -------------------------------------------------------------------------------- /steps/nnet2/check_ivectors_compatible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2016, Johns Hopkins University (Yenda Trmal ) 3 | # License: Apache 2.0 4 | 5 | # Begin configuration section. 6 | # End configuration section 7 | 8 | #echo >&2 "$0 $@" # Print the command line for logging 9 | if [ $# != 2 ] ; then 10 | echo >&2 "Usage: $0 " 11 | echo >&2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" 12 | fi 13 | 14 | dir_a=$1 15 | dir_b=$2 16 | 17 | id_a=$(steps/nnet2/get_ivector_id.sh $dir_a) 18 | ret_a=$? 19 | id_b=$(steps/nnet2/get_ivector_id.sh $dir_b) 20 | ret_b=$? 21 | 22 | if [ ! -z "$id_a" ] && [ ! -z "${id_b}" ] ; then 23 | if [ "${id_a}" == "${id_b}" ]; then 24 | exit 0 25 | else 26 | echo >&2 "$0: ERROR: iVector id ${id_a} in $dir_a and the iVector id ${id_b} in $dir_b do not match" 27 | echo >&2 "$0: ERROR: that means that the systems are not compatible." 
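    # (Mismatched IDs suggest the i-vectors in the two directories were produced by different iVector extractors, so models built on one should not be decoded with the other.)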
28 | exit 1 29 | fi 30 | elif [ -z "$id_a" ] && [ -z "${id_b}" ] ; then 31 | echo >&2 "$0: WARNING: The directories do not contain iVector ID." 32 | echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " 33 | echo >&2 "$0: WARNING: the directories compatible" 34 | exit 0 35 | else 36 | echo >&2 "$0: WARNING: One of the directories do not contain iVector ID." 37 | echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " 38 | echo >&2 "$0: WARNING: the directories compatible" 39 | exit 0 40 | fi 41 | -------------------------------------------------------------------------------- /utils/analyze_segments.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright 2015 GoVivace Inc. (Author: Nagendra Kumar Goel) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Analyze a segments file and print important stats on it. 18 | 19 | $dur = $total = 0; 20 | $maxDur = 0; 21 | $minDur = 9999999999; 22 | $n = 0; 23 | while(<>){ 24 | chomp; 25 | @t = split(/\s+/); 26 | $dur = $t[3] - $t[2]; 27 | $total += $dur; 28 | if ($dur > $maxDur) { 29 | $maxSegId = $t[0]; 30 | $maxDur = $dur; 31 | } 32 | if ($dur < $minDur) { 33 | $minSegId = $t[0]; 34 | $minDur = $dur; 35 | } 36 | $n++; 37 | } 38 | $avg=$total/$n; 39 | $hrs = $total/3600; 40 | print "Total $hrs hours of data\n"; 41 | print "Average segment length $avg seconds\n"; 42 | print "Segment $maxSegId has length of $maxDur seconds\n"; 43 | print "Segment $minSegId has length of $minDur seconds\n"; 44 | -------------------------------------------------------------------------------- /local/lm/run_3gram_kaggle5.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . 
path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | model_dir=exp/tri4a 8 | LM=data/LM 9 | text=data/text 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | 13 | #LM training 14 | mkdir -p $LM/3gram 15 | #ngram-count -text $text/mix.txt -lm $LM/3gram/mix.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 16 | #ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 17 | 18 | for x in A B C ; do 19 | #ngram-count -text $text/kaggle123_$x.txt -lm $LM/3gram/kaggle123_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 20 | #ngram-count -text $text/kaggle1234_$x.txt -lm $LM/3gram/kaggle1234_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 21 | 22 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle123_$x.lm $LM/3gram/mix.lm \ 23 | $LM/3gram/kaggle1234_$x.lm $text/kaggle4_$x.txt $LM/3gram/ori_$x\_10.0_kaggle1234.lm 24 | done 25 | 26 | for x in A B C ; do 27 | ( 28 | lm=$LM/3gram/ori_$x\_10.0_kaggle1234.lm 29 | lang_test=./data/wfst/lang_test_pr10_$x\_kaggle5 30 | graph_dir=exp/tri4a/graph_pr10_$x\_kaggle5 31 | #G compilation and check L and G stochastic 32 | local/lm/wfst/format_data.sh $lm $lang $lang_test 33 | 34 | #compose HCLG(choice) 35 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 36 | ) & 37 | done 38 | wait 39 | -------------------------------------------------------------------------------- /steps/nnet2/remove_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Johns Hopkins University (Author: Daniel Povey). 4 | # Apache 2.0. 5 | 6 | # This script removes the examples in an egs/ directory, e.g. 7 | # steps/nnet2/remove_egs.sh exp/nnet4b/egs/ 8 | # We give it its own script because we need to be careful about 9 | # things that are soft links to something in storage/ (i.e. remove the 10 | # data that's linked to as well as the soft link), and we want to not 11 | # delete the examples if someone has done "touch $dir/egs/.nodelete". 12 | 13 | 14 | if [ $# != 1 ]; then 15 | echo "Usage: $0 " 16 | echo "e.g.: $0 data/nnet4b/egs/" 17 | echo "e.g.: $0 data/nnet4b_mpe/degs/" 18 | echo "This script is usually equivalent to 'rm /egs.* /degs.*' but it follows" 19 | echo "soft links to /storage/; and it avoids deleting anything in the directory if" 20 | echo "someone did 'touch /.nodelete" 21 | exit 1; 22 | fi 23 | 24 | egs=$1 25 | 26 | if [ ! -d $egs ]; then 27 | echo "$0: expected directory $egs to exist" 28 | exit 1; 29 | fi 30 | 31 | if [ -f $egs/.nodelete ]; then 32 | echo "$0: not deleting egs in $egs since $egs/.nodelete exists" 33 | exit 0; 34 | fi 35 | 36 | 37 | 38 | for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do 39 | if [ -L $f ]; then 40 | rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. 41 | fi 42 | rm $f 2>/dev/null 43 | done 44 | 45 | 46 | echo "$0: Finished deleting examples in $egs" 47 | -------------------------------------------------------------------------------- /utils/data/convert_data_dir_to_whole.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016-2018 Vimal Manohar 4 | # Apache 2.0 5 | 6 | # This scripts converts a data directory into a "whole" data directory 7 | # by removing the segments and using the recordings themselves as 8 | # utterances 9 | 10 | set -o pipefail 11 | 12 | . 
./path.sh 13 | 14 | . utils/parse_options.sh 15 | 16 | if [ $# -ne 2 ]; then 17 | echo "Usage: convert_data_dir_to_whole.sh " 18 | echo " e.g.: convert_data_dir_to_whole.sh data/dev data/dev_whole" 19 | exit 1 20 | fi 21 | 22 | data=$1 23 | dir=$2 24 | 25 | if [ ! -f $data/segments ]; then 26 | echo "$0: Data directory already does not contain segments. So just copying it." 27 | utils/copy_data_dir.sh $data $dir 28 | exit 0 29 | fi 30 | 31 | mkdir -p $dir 32 | cp $data/wav.scp $dir 33 | if [ -f $data/reco2file_and_channel ]; then 34 | cp $data/reco2file_and_channel $dir; 35 | fi 36 | 37 | mkdir -p $dir/.backup 38 | mv $dir/feats.scp $dir/cmvn.scp $dir/.backup 39 | 40 | rm $dir/utt2spk || true 41 | 42 | [ -f $data/stm ] && cp $data/stm $dir 43 | [ -f $data/glm ] && cp $data/glm $dir 44 | 45 | utils/data/internal/combine_segments_to_recording.py \ 46 | --write-reco2utt=$dir/reco2sorted_utts $data/segments $dir/utt2spk || exit 1 47 | 48 | if [ -f $data/text ]; then 49 | utils/apply_map.pl -f 2- $data/text < $dir/reco2sorted_utts > $dir/text || exit 1 50 | fi 51 | 52 | rm $dir/reco2sorted_utts 53 | 54 | utils/fix_data_dir.sh $dir || exit 1 55 | 56 | exit 0 57 | -------------------------------------------------------------------------------- /utils/data/limit_feature_dim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Alibaba Robotics Corp. (author: Xingyu Na) 4 | # Apache 2.0 5 | 6 | # The script creates a new data directory by selecting a specified 7 | # dimension range of the features in the source directory. 8 | 9 | . utils/parse_options.sh 10 | 11 | if [ $# != 3 ]; then 12 | echo "Usage: " 13 | echo " $0 " 14 | echo "The script creates a new data directory by selecting a specified" 15 | echo "dimension range of the features in the source directory." 16 | echo "e.g.:" 17 | echo " $0 0:39 data/train_hires_pitch data/train_hires" 18 | exit 1; 19 | fi 20 | 21 | feat_dim_range=$1 22 | srcdir=$2 23 | destdir=$3 24 | 25 | if [ "$destdir" == "$srcdir" ]; then 26 | echo "$0: this script requires and to be different." 27 | exit 1 28 | fi 29 | 30 | if [ ! -f $srcdir/feats.scp ]; then 31 | echo "$0: no such file $srcdir/feats.scp" 32 | exit 1; 33 | fi 34 | 35 | mkdir -p $destdir 36 | utils/copy_data_dir.sh $srcdir $destdir 37 | 38 | if [ -f $destdir/cmvn.scp ]; then 39 | rm $destdir/cmvn.scp 40 | echo "$0: warning: removing $destdir/cmvn.cp, you will have to regenerate it from the features." 41 | fi 42 | 43 | rm $destdir/feats.scp 44 | sed 's/$/\[:,'${feat_dim_range}'\]/' $srcdir/feats.scp | \ 45 | utils/data/normalize_data_range.pl > $destdir/feats.scp 46 | 47 | [ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" 48 | utils/validate_data_dir.sh $validate_opts $destdir 49 | -------------------------------------------------------------------------------- /local/nnet/augment_data_only_kgb_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | fbankdir=data/fbank 3 | 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | set -euo pipefail 8 | noise_dir=data/kgb_noise 9 | 10 | for corpus in cyberon_chinese_test TOCFL train_sp ; do 11 | data=data/$corpus/fbank 12 | data_aug=data/$corpus\_aug_kgb_noise/fbank 13 | if [ ! -f $data/reco2dur ] ; then 14 | bash utils/data/get_reco2utt.sh $data || exit 1 15 | fi 16 | 17 | if [ ! 
-f $noise_dir ] ; then 18 | bash utils/data/get_reco2utt.sh $noise_dir || exit 1 19 | fi 20 | 21 | python2 steps/data/augment_data_dir.py --utt-suffix aug_kgb_noise --bg-snrs 9:7:5 --num-bg-noises 1 --bg-noise-dir $noise_dir $data $data_aug 22 | 23 | name=$corpus\_aug_kgb_noise 24 | steps/make_fbank.sh --nj 50 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name $data_aug exp/make_fbank/$name $fbankdir 25 | steps/compute_cmvn_stats.sh --name $name $data_aug exp/make_fbank/$name $fbankdir 26 | 27 | 28 | rm -rf ./data/$corpus\_rvb_aug/fbank 29 | utils/combine_data.sh ./data/$corpus\_aug_kgb_noise_ori/fbank $data_aug $data 30 | done 31 | 32 | ali_src=exp/tri4a_sp_ali 33 | ali_target=exp/tri4a_sp_aug_kgb_noise_ali 34 | rm -r $ali_target 35 | cp -r $ali_src $ali_target 36 | local/nnet/copy_alignment.sh $ali_target 37 | 38 | ali_src=exp/tri4a_ali_cyberon_chinese_test 39 | ali_target=exp/tri4a_ali_cyberon_chinese_test_aug_kgb_noise 40 | 41 | rm -r $ali_target 42 | cp -r $ali_src $ali_target 43 | local/nnet/copy_alignment.sh $ali_target 44 | -------------------------------------------------------------------------------- /utils/show_lattice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | format=pdf # pdf svg 4 | mode=save # display save 5 | lm_scale=0.0 6 | acoustic_scale=0.0 7 | #end of config 8 | 9 | . utils/parse_options.sh 10 | 11 | if [ $# != 3 ]; then 12 | echo "usage: $0 [--mode display|save] [--format pdf|svg] " 13 | echo "e.g.: $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt" 14 | exit 1; 15 | fi 16 | 17 | . ./path.sh 18 | 19 | uttid=$1 20 | lat=$2 21 | words=$3 22 | 23 | tmpdir=$(mktemp -d /tmp/kaldi.XXXX); # trap "rm -r $tmpdir" EXIT # cleanup 24 | 25 | gunzip -c $lat | lattice-to-fst --lm-scale=$lm_scale --acoustic-scale=$acoustic_scale ark:- "scp,p:echo $uttid $tmpdir/$uttid.fst|" || exit 1; 26 | ! [ -s $tmpdir/$uttid.fst ] && \ 27 | echo "Failed to extract lattice for utterance $uttid (not present?)" && exit 1; 28 | fstdraw --portrait=true --osymbols=$words $tmpdir/$uttid.fst | dot -T${format} > $tmpdir/$uttid.${format} 29 | 30 | if [ "$(uname)" == "Darwin" ]; then 31 | doc_open=open 32 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 33 | doc_open=xdg-open 34 | elif [ $mode == "display" ] ; then 35 | echo "Can not automaticaly open file on your operating system" 36 | mode=save 37 | fi 38 | 39 | [ $mode == "display" ] && $doc_open $tmpdir/$uttid.${format} 40 | [[ $mode == "display" && $? -ne 0 ]] && echo "Failed to open ${format} format." && mode=save 41 | [ $mode == "save" ] && echo "Saving to $uttid.${format}" && cp $tmpdir/$uttid.${format} . 
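# Note: the cleanup trap on $tmpdir is commented out above, so the temporary directory created under /tmp is left in place when the script exits.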
42 | 43 | exit 0 44 | -------------------------------------------------------------------------------- /local/kaggle/replace_iflytek_choice.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | from parse_choices import * 3 | from normalize_utils import * 4 | 5 | 6 | 7 | if __name__ == '__main__': 8 | choices_file = sys.argv[1] 9 | iflytek_json = sys.argv[2] 10 | output_json = sys.argv[3] 11 | d = {} 12 | kaggle_dir = '/data/local/kgb/corpus/kgb/kaggle3' 13 | 14 | xlxs_path = os.path.join(kaggle_dir,'answer.xlsx') 15 | content = read_xlsx(xlxs_path) 16 | ''' 17 | with open(choices_file,'r',encoding='utf8') as f: 18 | for line in f: 19 | tokens = line.rstrip().split() 20 | text = ''.join(tokens[1:]) 21 | name = tokens[0].replace('.wav','') 22 | No = int(name[1:]) 23 | d[No] = parse(text) 24 | ''' 25 | d2 = {} 26 | for idx,row in enumerate(content): 27 | if idx == 0: 28 | continue 29 | No,passage,question,c1,c2,c3,c4 = row[:7] 30 | No = int(No[1:]) 31 | print(question) 32 | n_q = normalize(str(question),[]) 33 | q = n_q.replace(' ','') 34 | d2[No] = q 35 | 36 | 37 | with open(iflytek_json,'r',encoding='utf8') as f: 38 | data = json.load(f) 39 | outputs = [] 40 | for sample in data: 41 | id = sample['id'] 42 | sample['options'] = d[id] 43 | sample['question'] = d2[id] 44 | outputs.append(sample) 45 | with open(output_json,'w',encoding='utf8') as f: 46 | json.dump(outputs,f,indent=4,ensure_ascii=False) 47 | 48 | -------------------------------------------------------------------------------- /steps/segmentation/copy_targets_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagendra Kumar Goel 4 | # 2014 Johns Hopkins University (author: Nagendra K Goel) 5 | # Apache 2.0 6 | 7 | # This script makes a copy of targets directory (by copying targets.scp), 8 | # possibly adding a specified prefix or a suffix to the utterance names. 9 | 10 | # begin configuration section 11 | utt_prefix= 12 | utt_suffix= 13 | # end configuration section 14 | 15 | if [ -f ./path.sh ]; then . ./path.sh; fi 16 | . ./utils/parse_options.sh 17 | 18 | if [ $# != 2 ]; then 19 | echo "Usage: " 20 | echo " $0 [options] " 21 | echo "e.g.:" 22 | echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" 23 | echo "Options" 24 | echo " --utt-prefix= # Prefix for utterance ids, default empty" 25 | echo " --utt-suffix= # Suffix for utterance ids, default empty" 26 | exit 1; 27 | fi 28 | 29 | export LC_ALL=C 30 | 31 | srcdir=$1 32 | destdir=$2 33 | 34 | mkdir -p $destdir 35 | 36 | if [ -f $srcdir/frame_subsampling_factor ]; then 37 | cp $srcdir/frame_subsampling_factor $destdir 38 | fi 39 | 40 | cat $srcdir/targets.scp | awk -v p=$utt_prefix -v s=$utt_suffix \ 41 | '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map 42 | 43 | cat $srcdir/targets.scp | utils/apply_map.pl -f 1 $destdir/utt_map | \ 44 | sort -k1,1 > $destdir/targets.scp 45 | 46 | echo "$0: copied targets from $srcdir to $destdir" 47 | -------------------------------------------------------------------------------- /utils/best_wer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2010-2011 Microsoft Corporation 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # To be run from one directory above this script. 19 | 20 | perl -e 'while(<>){ 21 | s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g; 22 | if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool. 23 | elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|: 24 | && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite. 25 | if (defined $bestline){ print $bestline; } ' | \ 26 | awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \ 27 | awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \ 28 | awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \ 29 | sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||' 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /local/extract_kaggle_feature.sh: -------------------------------------------------------------------------------- 1 | #$!/bin/bash 2 | nj=8 #number of job parallel running 3 | stage=0 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | 8 | 9 | if [ $stage -le 1 ] ; then 10 | mfccdir=data/mfcc_pitch 11 | mkdir -p $mfccdir 12 | 13 | for corpus in kaggle1 kaggle2 kaggle3 ; do 14 | combine48='' 15 | for typ in A B C ; do 16 | ##Extract MFCC39 + pitch9 feature 17 | data=./data/$corpus/$typ/mfcc39_pitch9 18 | name=$corpus\_$typ 19 | combine48="$data $combine48" 20 | steps/make_mfcc_pitch_online.sh --cmd "$train_cmd" --nj $nj --name $name $data exp/make_mfcc/$name $mfccdir || exit 1; 21 | steps/compute_cmvn_stats.sh --name $name $data exp/make_mfcc/$name $mfccdir || exit 1; 22 | done 23 | utils/combine_data.sh ./data/$corpus/mfcc39_pitch9 $combine48 24 | done 25 | fi 26 | 27 | if [ $stage -le 1 ] ; then 28 | fbankdir=data/fbank 29 | mkdir -p $fbankdir 30 | 31 | for corpus in kaggle1 kaggle2 kaggle3 ; do 32 | combine48='' 33 | for typ in A B C ; do 34 | mfccdata=./data/$corpus/$typ/mfcc39_pitch9 35 | data=./data/$corpus/$typ/fbank 36 | name=$corpus\_$typ 37 | combine48="$data $combine48" 38 | 39 | utils/copy_data_dir.sh $mfccdata $data 40 | steps/make_fbank.sh --nj 30 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name \ 41 | $data exp/make_fbank/$name $fbankdir 42 | steps/compute_cmvn_stats.sh --name $name $data exp/make_fbank/$name $fbankdir 43 | done 44 | utils/combine_data.sh ./data/$corpus/fbank $combine48 45 | done 46 | fi 47 | -------------------------------------------------------------------------------- /utils/remove_oovs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script removes lines that contain these OOVs on either the 18 | # third or fourth fields of the line. It is intended to remove arcs 19 | # with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). 20 | 21 | if ( @ARGV < 1 && @ARGV > 2) { 22 | die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; 23 | } 24 | 25 | $unklist = shift @ARGV; 26 | open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; 27 | while(){ 28 | @A = split(" ", $_); 29 | @A == 1 || die "Bad line in unknown-symbol list: $_"; 30 | $unk{$A[0]} = 1; 31 | } 32 | 33 | $num_removed = 0; 34 | while(<>){ 35 | @A = split(" ", $_); 36 | if(defined $unk{$A[2]} || defined $unk{$A[3]}) { 37 | $num_removed++; 38 | } else { 39 | print; 40 | } 41 | } 42 | print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; 43 | 44 | -------------------------------------------------------------------------------- /local/data/normalize.py: -------------------------------------------------------------------------------- 1 | import string,sys 2 | import re 3 | from number2chinese import * 4 | 5 | fin = sys.argv[1] 6 | fout = sys.argv[2] 7 | 8 | l = [] 9 | with open(fin,'r') as f: 10 | for line in f: 11 | for cha in [' ','、','「','」','”','“','…',')',')',':']: 12 | line = line.replace(cha,'') 13 | for cha in string.punctuation: 14 | line = line.replace(cha,'') 15 | for cha in [',',':','?','、','。',';','!']: 16 | line = line.replace(cha,'\n') 17 | line = line.replace('\n\n','\n') 18 | if len(line) >= 1: 19 | ## 我是john先生 -> 我 是 john 先 生 20 | newline = '' 21 | flag = True 22 | for char in line: 23 | if re.match('^[a-zA-Z0-9]+$',char): 24 | flag = False 25 | newline += char 26 | else: 27 | if not flag: 28 | newline += ' ' 29 | flag = True 30 | newline += char + ' ' 31 | if flag: 32 | newline = newline[:-1] 33 | #covert number to chinese 34 | line = '' 35 | for token in newline.split(' '): 36 | if re.match('^[0-9]+$',token): 37 | if len(token) > 15: 38 | break 39 | token = to_chinese(int(token)) 40 | token = ' '.join(list(token)) 41 | line += token + ' ' 42 | l.append(line[:-1]) 43 | with open(fout,'w') as f: 44 | for line in l: 45 | f.write(line) 46 | -------------------------------------------------------------------------------- /local/lm/news_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests,sys 3 | from bs4 import BeautifulSoup 4 | 5 | # https://www.ettoday.net/news/news-list-2017-07-15-5.htm 6 | # 1 政治 7 | # 17 財經 8 | # 2 國際 9 | # 6 社會 10 | # 9 影劇 11 | # 10 體育 12 | # 20 3c 13 | # 30 時尚 14 | # 24 遊戲 15 | # 5 生活 16 | for tt in [1, 17, 2, 6, 9, 10, 20, 30, 24, 5]: 17 | urls = [] 18 | for n in range(1,12): 19 | for n2 in [5,10,15,20,25,31]: 20 | u = "https://www.ettoday.net/news/news-list-"+str(sys.argv[1]) + "-" + str(n)+"-"+str(n2)+"-"+str(tt)+".htm" 21 | res = requests.get(u) 22 | soup = BeautifulSoup(res.content, "lxml") 23 | soup = soup.find("div", class_="part_list_2") 24 | domian = 
"https://www.ettoday.net" 25 | for a in soup.find_all("h3"): 26 | urls.append(domian+a.a['href']) 27 | allcontent = [] 28 | for u in urls: 29 | content = [] 30 | res = requests.get(u) 31 | soup = BeautifulSoup(res.content, "lxml") 32 | try: 33 | soup = soup.find("div", class_="story") 34 | for a in soup.find_all("p"): 35 | p = a.string 36 | if p != None: 37 | p = p.split('/') 38 | if len(p) > 1: 39 | content.append(p[1]) 40 | print(p[1].encode('utf-8')) 41 | else: 42 | content.append(p[0]) 43 | print(p[0].encode('utf-8')) 44 | allcontent.append(content.encode('utf-8')) 45 | except: 46 | pass 47 | print(len(allcontent)) 48 | -------------------------------------------------------------------------------- /steps/nnet3/nnet3_to_dot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # script showing use of nnet3_to_dot.py 4 | # Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). 5 | 6 | # Begin configuration section. 7 | component_attributes="name,type" 8 | node_prefixes="" 9 | info_bin=nnet3-am-info 10 | echo "$0 $@" # Print the command line for logging 11 | 12 | [ -f ./path.sh ] && . ./path.sh; # source the path. 13 | . parse_options.sh || exit 1; 14 | 15 | if [ $# != 3 ]; then 16 | echo "Usage: $0 [opts] " 17 | echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.png" 18 | echo "" 19 | echo "Main options (for others, see top of script file)" 20 | echo " --info-bin # Name of the binary to generate the nnet3 file" 21 | echo " --component-attributes # attributes to be printed in nnet3 components" 22 | echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" 23 | echo " # will be clustered together in the dot-graph" 24 | 25 | 26 | exit 1; 27 | fi 28 | 29 | model=$1 30 | dot_file=$2 31 | output_file=$3 32 | 33 | attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} 34 | $info_bin $model | \ 35 | steps/nnet3/dot/nnet3_to_dot.py \ 36 | --component-attributes "$component_attributes" \ 37 | $attr $dot_file 38 | echo "Generated the dot file $dot_file" 39 | 40 | command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } 41 | dot -Tpdf $dot_file -o $output_file 42 | -------------------------------------------------------------------------------- /steps/scoring/score_kaldi_compare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2016 Nicolas Serrano 3 | # Apache 2.0 4 | 5 | [ -f ./path.sh ] && . ./path.sh 6 | 7 | # begin configuration section. 8 | cmd=run.pl 9 | replications=10000 10 | #end configuration section. 11 | 12 | echo "$0 $@" # Print the command line for logging 13 | [ -f ./path.sh ] && . ./path.sh 14 | . parse_options.sh || exit 1; 15 | 16 | if [ $# -ne 3 ]; then 17 | echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " 18 | echo " Options:" 19 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 20 | echo " --replications # number of bootstrap evaluation to compute confidence." 21 | exit 1; 22 | fi 23 | 24 | dir1=$1 25 | dir2=$2 26 | dir_compare=$3 27 | 28 | mkdir -p $dir_compare/log 29 | 30 | for d in $dir1 $dir2; do 31 | for f in test_filt.txt best_wer; do 32 | [ ! 
-f $d/$f ] && echo "$0: no such file $d/$f" && exit 1; 33 | done 34 | done 35 | 36 | 37 | best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) 38 | best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ 39 | awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') 40 | 41 | best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) 42 | best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ 43 | awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') 44 | 45 | $cmd $dir_compare/log/score_compare.log \ 46 | compute-wer-bootci --replications=$replications \ 47 | ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ 48 | '>' $dir_compare/wer_bootci_comparison || exit 1; 49 | 50 | exit 0; 51 | -------------------------------------------------------------------------------- /utils/add_disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # Adds some specified number of disambig symbols to a symbol table. 19 | # Adds these as #1, #2, etc. 20 | # If the --include-zero option is specified, includes an extra one 21 | # #0. 22 | 23 | $include_zero = 0; 24 | if($ARGV[0] eq "--include-zero") { 25 | $include_zero = 1; 26 | shift @ARGV; 27 | } 28 | 29 | if(@ARGV != 2) { 30 | die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt "; 31 | } 32 | 33 | 34 | $input = $ARGV[0]; 35 | $nsyms = $ARGV[1]; 36 | 37 | open(F, "<$input") || die "Opening file $input"; 38 | 39 | while() { 40 | @A = split(" ", $_); 41 | @A == 2 || die "Bad line $_"; 42 | $lastsym = $A[1]; 43 | print; 44 | } 45 | 46 | if(!defined($lastsym)){ 47 | die "Empty symbol file?"; 48 | } 49 | 50 | if($include_zero) { 51 | $lastsym++; 52 | print "#0 $lastsym\n"; 53 | } 54 | 55 | for($n = 1; $n <= $nsyms; $n++) { 56 | $y = $n + $lastsym; 57 | print "#$n $y\n"; 58 | } 59 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L.proto: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 5777 1 21 | 5777 5777 22 | 23 | 24 | -------------------------------------------------------------------------------- /utils/data/modify_speaker_info_to_recording.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0. 
5 | 6 | # Copy the data directory, but modify it to use the recording-id as the 7 | # speaker. This is useful to get matching speaker information in the 8 | # whole recording data directory. 9 | # Note that this also appends the recording-id as a prefix to the 10 | # utterance-id. 11 | 12 | if [ $# -ne 2 ]; then 13 | echo "Usage: $0 " 14 | echo " e.g.: $0 data/train data/train_recospk" 15 | exit 1 16 | fi 17 | 18 | in_data=$1 19 | out_data=$2 20 | 21 | mkdir -p $out_data 22 | 23 | for f in wav.scp segments utt2spk; do 24 | if [ ! -f $in_data/$f ]; then 25 | echo "$0: Could not find file $in_data/$f" 26 | exit 1 27 | fi 28 | done 29 | 30 | cp $in_data/wav.scp $out_data/ || exit 1 31 | cp $in_data/reco2file_and_channel $out_data/ 2> /dev/null || true 32 | awk '{print $1" "$2"-"$1}' $in_data/segments > \ 33 | $out_data/old2new.uttmap || exit 1 34 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/segments > \ 35 | $out_data/segments || exit 1 36 | awk '{print $1" "$2}' $out_data/segments > $out_data/utt2spk || exit 1 37 | utils/utt2spk_to_spk2utt.pl $out_data/utt2spk > $out_data/spk2utt || exit 1 38 | 39 | if [ -f $in_data/text ]; then 40 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/text > \ 41 | $out_data/text || exit 1 42 | fi 43 | 44 | if [ -f $in_data/feats.scp ]; then 45 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/feats.scp > \ 46 | $out_data/feats.scp || exit 1 47 | fi 48 | 49 | utils/fix_data_dir.sh $out_data || exit 1 50 | utils/validate_data_dir.sh --no-text --no-feats $out_data || exit 1 51 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 2560 1 21 | 2560 2560 22 | 23 | 24 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L.proto.8136: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 8136 1 21 | 8136 8136 22 | 23 | 24 | -------------------------------------------------------------------------------- /steps/segmentation/combine_targets_dirs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagendra Kumar Goel 4 | # 2018 Vimal Manohar 5 | # Apache 2.0. 6 | 7 | # This script combines targets directory into a new targets directory 8 | # containing targets from all the input targets directories. 9 | 10 | echo "$0 $@" # Print the command line for logging 11 | 12 | if [ -f path.sh ]; then . ./path.sh; fi 13 | . parse_options.sh || exit 1; 14 | 15 | if [ $# -lt 3 ]; then 16 | echo "Usage: $0 [options] ..." 
17 | echo "e.g.: $0 data/train exp/targets_combined exp/targets_1 exp/targets_2" 18 | exit 1; 19 | fi 20 | 21 | export LC_ALL=C 22 | 23 | data=$1; 24 | shift; 25 | dest=$1; 26 | shift; 27 | first_src=$1; 28 | 29 | mkdir -p $dest; 30 | rm -f $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null 31 | 32 | frame_subsampling_factor=1 33 | if [ -f $first_src/frame_subsampling_factor ]; then 34 | cp $first_src/frame_subsampling_factor $dest 35 | frame_subsampling_factor=$(cat $dest/frame_subsampling_factor) 36 | fi 37 | 38 | for d in $*; do 39 | this_frame_subsampling_factor=1 40 | if [ -f $d/frame_subsampling_factor ]; then 41 | this_frame_subsampling_factor=$(cat $d/frame_subsampling_factor) 42 | fi 43 | 44 | if [ $this_frame_subsampling_factor != $frame_subsampling_factor ]; then 45 | echo "$0: Cannot combine targets directories with different frame-subsampling-factors" 1>&2 46 | exit 1 47 | fi 48 | 49 | cat $d/targets.scp 50 | done | sort -k1,1 > $dest/targets.scp || exit 1 51 | 52 | steps/segmentation/validate_targets_dir.sh $dest $data || exit 1 53 | 54 | echo "Combined targets and stored in $dest" 55 | exit 0 56 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L_ivector.proto: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 5777 1 21 | 5777 5777 22 | 23 | 24 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L_ivector.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 2560 1 21 | 2560 2560 22 | 23 | 24 | -------------------------------------------------------------------------------- /steps/segmentation/decode_sad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | # This script does Viterbi decoding using a matrix of frame log-likelihoods 7 | # with the columns corresponding to the pdfs. 8 | # It is a wrapper around the binary decode-faster. 9 | 10 | set -e 11 | set -o pipefail 12 | 13 | cmd=run.pl 14 | nj=4 15 | acwt=0.1 16 | beam=8 17 | max_active=1000 18 | transform= # Transformation matrix to apply on the input archives read from output.scp 19 | 20 | . ./path.sh 21 | 22 | . utils/parse_options.sh 23 | 24 | if [ $# -ne 3 ]; then 25 | echo "Usage: $0 " 26 | echo " e.g.: $0 " 27 | exit 1 28 | fi 29 | 30 | graph_dir=$1 31 | nnet_output_dir=$2 32 | dir=$3 33 | 34 | mkdir -p $dir/log 35 | 36 | echo $nj > $dir/num_jobs 37 | 38 | for f in $graph_dir/HCLG.fst $nnet_output_dir/output.scp $extra_files; do 39 | if [ ! 
-f $f ]; then 40 | echo "$0: Could not find file $f" 41 | exit 1 42 | fi 43 | done 44 | 45 | rspecifier="ark:utils/split_scp.pl -j $nj \$[JOB-1] $nnet_output_dir/output.scp | copy-feats scp:- ark:- |" 46 | 47 | # Apply a transformation on the input matrix to combine 48 | # probs from different columns to pseudo-likelihoods 49 | if [ ! -z "$transform" ]; then 50 | rspecifier="$rspecifier transform-feats $transform ark:- ark:- |" 51 | fi 52 | 53 | # Convert pseudo-likelihoods to pseudo log-likelihood 54 | rspecifier="$rspecifier copy-matrix --apply-log ark:- ark:- |" 55 | 56 | decoder_opts+=(--acoustic-scale=$acwt --beam=$beam --max-active=$max_active) 57 | 58 | $cmd JOB=1:$nj $dir/log/decode.JOB.log \ 59 | decode-faster ${decoder_opts[@]} \ 60 | $graph_dir/HCLG.fst "$rspecifier" \ 61 | ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" 62 | -------------------------------------------------------------------------------- /local/data/normalize_text.py: -------------------------------------------------------------------------------- 1 | import sys,re 2 | from number2chinese import * 3 | 4 | not_in_word=[ '`', '÷', '×', '≠', '<', '>', '|', '°', '┬', '┐', '├', '┼', '┤', '└', '┴', '│', '¯', '-', ';', '!', '¿', '·', '‘', '’', '"', '(', ')', '[', ']', '{', '}', '§', '®', '™', '@', '$', '€', '*', '&', '&&', '&&&', '±', '━', '←', '→', '↑', '↓', '♪', '╱', '╲', '◢', '◣', 'ˋ', '▁', '\x1b', '\x7f', '\x80', '¼', '½', '-', 'Á', 'À', 'Â', 'Å', 'Ä', 'Ā','( ','˙','!', '(', ')', '-', '.', ':', '<', '>', '·', 'β', '—', '•', '℃', '。', '《', '》', 'ㄅ', 'ㄆ', 'ㄇ', 'ㄈ', 'ㄔ', 'ㄙ', 'ㄞ', 'ㄟ', '一', '\ue015', '\ue028', '\ufeff', '.', ':', 'C', 'D', 'E', 'I', 'K', 'T'] 5 | 6 | if __name__ == '__main__': 7 | texts_path = sys.argv[1] 8 | words_path = sys.argv[2] 9 | 10 | words = [] 11 | with open(words_path,'r',encoding='utf-8') as f: 12 | for line in f: 13 | line = line.rstrip() 14 | words.append(line) 15 | words = set(words) 16 | 17 | with open(texts_path,'r',encoding='utf-8') as f: 18 | for line in f: 19 | line = line.rstrip() 20 | tokens = line.split() 21 | new_line = '' 22 | for token in tokens: 23 | if re.match('^[0-9]+$',token): 24 | if len(token) > 15: 25 | continue 26 | token = to_chinese(int(token)) 27 | if token in not_in_word : 28 | continue 29 | if token not in words: 30 | if len(re.findall(u'[\u4e00-\u9fff]+', token)) != 0: 31 | if len(token) > 1: 32 | token = ' '.join(token) 33 | new_line = new_line + token + ' ' 34 | print(new_line) 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /utils/remove_data_links.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This program searches within a directory for soft links that 4 | # appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory, 5 | # and it removes both the soft links and the things they point to. 6 | # for instance, if you have a soft link 7 | # foo/egs/1.1.egs -> storage/2/1.1.egs 8 | # it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs. 9 | 10 | ret=0 11 | 12 | dry_run=false 13 | 14 | if [ "$1" == "--dry-run" ]; then 15 | dry_run=true 16 | shift 17 | fi 18 | 19 | if [ $# == 0 ]; then 20 | echo "Usage: $0 [--dry-run] " 21 | echo "e.g.: $0 exp/nnet4a/egs/" 22 | echo " Removes from any subdirectories of the command-line arguments, soft links that " 23 | echo " appear to have been created by utils/create_data_link.pl, as well as the things" 24 | echo " that those soft links point to. 
Will typically be called on a directory prior" 25 | echo " to 'rm -r' on that directory, to ensure that data that was distributed on other" 26 | echo " volumes also gets deleted." 27 | echo " With --dry-run, just prints what it would do." 28 | fi 29 | 30 | for dir in $*; do 31 | if [ ! -d $dir ]; then 32 | echo "$0: not a directory: $dir" 33 | ret=1 34 | else 35 | for subdir in $(find $dir -type d); do 36 | if [ -d $subdir/storage ]; then 37 | for x in $(ls $subdir); do 38 | f=$subdir/$x 39 | if [ -L $f ] && [[ $(readlink $f) == storage/* ]]; then 40 | target=$subdir/$(readlink $f) 41 | if $dry_run; then 42 | echo rm $f $target 43 | else 44 | rm $f $target 45 | fi 46 | fi 47 | done 48 | fi 49 | done 50 | fi 51 | done 52 | 53 | exit $ret 54 | -------------------------------------------------------------------------------- /local/kaggle/decode_kaggle_simulate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | A_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/A 3 | B_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/B 4 | C_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/C 5 | #B_dir=/data/local/kgb/corpus/kgb/kaggle1/data/wav/B 6 | #C_dir=/data/local/kgb/corpus/kgb/kaggle1/data/wav/C 7 | #test_C_dir=/data/local/kgb/corpus/kgb/kaggle3/data/wav/C 8 | #iflytek_A=/data/local/kgb/Chinese-ASR/0622/iflytek_A 9 | src_dir=./1213_simulate 10 | decode_A=true 11 | 12 | set -e 13 | set -u 14 | set -o pipefail 15 | . path.sh 16 | 17 | mkdir -p $src_dir 18 | 19 | if $decode_A ; then 20 | python3 local/kaggle/get_id_list.py $A_dir $src_dir/idx.json || exit 1; 21 | 22 | #bash local/kaggle/check_sample_rate.sh $A_dir 23 | local/kaggle/decode_from_wav_seperate.sh $A_dir $src_dir/A || exit 1; #select lm itself 24 | 25 | python3 local/kaggle/check_output.py $src_dir/A 26 | #bash local/kaggle/mix_LM_with_A.sh $src_dir/A/output.txt $src_dir/C_lang 27 | #bash local/kaggle/test/decode_test.sh $test_C_dir $src_dir/C_test $src_dir/C_lang 28 | else 29 | #C:choices 把choose lm comment掉 30 | bash local/kaggle/check_sample_rate.sh $C_dir || exit 1; 31 | local/kaggle/decode_from_wav_seperate.sh --choose_lm_file $src_dir/kaggle_simulate_lm $C_dir $src_dir/C || exit 1; 32 | python3 local/kaggle/check_output.py $src_dir/C || exit 1; 33 | #B:question 34 | bash local/kaggle/check_sample_rate.sh $B_dir 35 | local/kaggle/decode_from_wav_seperate.sh --choose_lm_file $src_dir/kaggle_simulate_lm $B_dir $src_dir/B || exit 1; 36 | python3 local/kaggle/check_output.py $src_dir/B || exit 1; 37 | 38 | python3 local/kaggle/merge_json.py $src_dir/A/output.txt $src_dir/B/output.txt $src_dir/C/output.txt $src_dir/idx.json $src_dir/result_kaldi.json 39 | fi 40 | 41 | -------------------------------------------------------------------------------- /steps/segmentation/internal/find_oov_phone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """This script finds the OOV phone by reading the OOV word from 7 | oov.int in the input directory and the lexicon 8 | /phones/align_lexicon.int. 
9 | It prints the OOV phone to stdout, if it can find a single phone 10 | mapping for the OOV word.""" 11 | 12 | import sys 13 | 14 | 15 | def main(): 16 | if len(sys.argv) != 2: 17 | raise RuntimeError("Usage: {0} <lang>".format(sys.argv[0])) 18 | 19 | lang = sys.argv[1] 20 | 21 | oov_int = int(open("{0}/oov.int".format(lang)).readline()) 22 | assert oov_int > 0 23 | 24 | oov_mapped_to_multiple_phones = False 25 | for line in open("{0}/phones/align_lexicon.int".format(lang)): 26 | parts = line.strip().split() 27 | 28 | if len(parts) < 3: 29 | raise RuntimeError("Could not parse line {0} in " 30 | "{1}/phones/align_lexicon.int" 31 | "".format(line, lang)) 32 | 33 | w = int(parts[0]) 34 | if w != oov_int: 35 | continue 36 | 37 | if len(parts[2:]) > 1: 38 | # Try to find a single phone mapping for OOV 39 | oov_mapped_to_multiple_phones = True 40 | continue 41 | 42 | p = int(parts[2]) 43 | print ("{0}".format(p)) 44 | 45 | raise SystemExit(0) 46 | 47 | if oov_mapped_to_multiple_phones: 48 | raise RuntimeError("OOV word found, but is mapped to multiple phones. " 49 | "This is an unusual case.") 50 | 51 | raise RuntimeError("Could not find OOV word in " 52 | "{0}/phones/align_lexicon.int".format(lang)) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chinese-ASR 2 | A Chinese ASR system built on Kaldi 3 | 4 | ## Dependencies: 5 | OpenCC: converts simplified Chinese to traditional Chinese 6 | 7 | https://github.com/yichen0831/opencc-python 8 | 9 | jieba (zh_TW version): traditional Chinese word segmentation tool: 10 | 11 | https://github.com/ldkrsi/jieba-zh_TW 12 | 13 | ## Usage 14 | 15 | 1. Modify the kaldi path in path.sh 16 | 17 | 2. Modify the corpus paths in local/data/corpus_path.sh 18 | 19 | 3. Install Sequitur (G2P), sox and kaldi_lm in kaldi/tools/ 20 | 21 | 4. bash run.sh 22 | 23 | ## Some useful scripts 24 | 25 | 1. LM training and interpolation: local/lm 26 | 27 | 2. Custom WFST for the multiple-choice problems: local/lm/wfst 28 | 29 | -> Forces the outputs into the format " XXX XXX XXX XXX" 30 | 31 | 3. Scripts for training DFSMN: local/nnet 32 | 33 | ## Experiment 34 | 35 | | Model | TOCFL(CER%) | Cyberon_Chinese_test(CER%) | 36 | | ------------- |:--------------:| ----------------------------:| 37 | | mono0a | 97.76 | 100.71 | 38 | | tri1 | 50.55 | 63.64 | 39 | | tri2 | 56.62 | 46.65 | 40 | | tri3 | 34.78 | 46.78 | 41 | | tri4 | 37.02 | 34.02 | 42 | | tri5 | 65.60 | 49.96 | 43 | | tdnn_lstm1 | 18.30 | 24.82 | 44 | | tdnn_lstm(realign) | 15.88 | 22.24 | 45 | | DFSMN(Alibaba) | 11.22 | 12.14 | 46 | -------------------------------------------------------------------------------- /utils/nnet/gen_hamm_mat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2012 Brno University of Technology (author: Karel Vesely) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # ./gen_hamm_mat.py 19 | # script generates diagonal matrix with hamming window values 20 | 21 | from math import * 22 | import sys 23 | 24 | 25 | from optparse import OptionParser 26 | 27 | parser = OptionParser() 28 | parser.add_option('--fea-dim', dest='dim', help='feature dimension') 29 | parser.add_option('--splice', dest='splice', help='applied splice value') 30 | (options, args) = parser.parse_args() 31 | 32 | if(options.dim == None): 33 | parser.print_help() 34 | sys.exit(1) 35 | 36 | dim=int(options.dim) 37 | splice=int(options.splice) 38 | 39 | 40 | #generate the diagonal matrix with hammings 41 | M_2PI = 6.283185307179586476925286766559005 42 | 43 | dim_mat=(2*splice+1)*dim 44 | timeContext=2*splice+1 45 | print '[' 46 | for row in range(dim_mat): 47 | for col in range(dim_mat): 48 | if col!=row: 49 | print '0', 50 | else: 51 | i=int(row/dim) 52 | print str(0.54 - 0.46*cos((M_2PI * i) / (timeContext-1))), 53 | print 54 | 55 | print ']' 56 | 57 | 58 | -------------------------------------------------------------------------------- /local/nnet/augment_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | fbankdir=data/fbank 3 | 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | set -euo pipefail 8 | 9 | 10 | for corpus in cyberon_chinese_test TOCFL train_sp ; do 11 | data=data/$corpus/fbank 12 | data_aug=data/$corpus\_aug/fbank 13 | data_rvb=data/$corpus\_rvb/fbank 14 | if [ ! 
-f $data/reco2dur ] ; then 15 | bash utils/data/get_reco2utt.sh $data || exit 1 16 | fi 17 | 18 | python2 steps/data/augment_data_dir.py --utt-suffix aug --bg-snrs 20:10:5:3:0 --num-bg-noises 1:2 --bg-noise-dir data/noise $data $data_aug 19 | python2 steps/data/reverberate_data_dir.py --prefix rvb --speech-rvb-probability 1 --num-replications 1 \ 20 | --rir-set-parameters data/RIRS_NOISES/simulated_rirs/smallroom/rir_list $data $data_rvb 21 | 22 | name=$corpus\_aug 23 | steps/make_fbank.sh --nj 50 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name $data_aug exp/make_fbank/$name $fbankdir 24 | steps/compute_cmvn_stats.sh --name $name $data_aug exp/make_fbank/$name $fbankdir 25 | 26 | name=$corpus\_rvb 27 | steps/make_fbank.sh --nj 50 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name $data_rvb exp/make_fbank/$name $fbankdir 28 | steps/compute_cmvn_stats.sh --name $name $data_rvb exp/make_fbank/$name $fbankdir 29 | 30 | rm -rf ./data/$corpus\_rvb_aug/fbank 31 | utils/combine_data.sh ./data/$corpus\_rvb_aug/fbank $data_aug $data_rvb $data 32 | done 33 | 34 | rm -r exp/tri4a_sp_rvb_aug_ali 35 | cp -r exp/tri4a_sp_ali exp/tri4a_sp_rvb_aug_ali 36 | local/nnet/copy_alignment.sh exp/tri4a_sp_rvb_aug_ali/ 37 | 38 | rm -r exp/tri4a_ali_cyberon_chinese_test_rvb_aug 39 | cp -r exp/tri4a_ali_cyberon_chinese_test exp/tri4a_ali_cyberon_chinese_test_rvb_aug 40 | local/nnet/copy_alignment.sh exp/tri4a_ali_cyberon_chinese_test_rvb_aug 41 | 42 | -------------------------------------------------------------------------------- /utils/data/shift_feats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2017 Hossein Hadian 5 | # Apache 2.0 6 | 7 | echo "$0 $@" # Print the command line for logging 8 | if [ -f path.sh ]; then . ./path.sh; fi 9 | . parse_options.sh || exit 1; 10 | 11 | if [ $# != 3 ]; then 12 | echo " Usage: $0 " 13 | echo "e.g.: $0 -1 data/train data/train_fs-1" 14 | echo "The script creates a new data directory with the features modified" 15 | echo "using the program shift-feats with the specified frame-shift." 16 | echo "This program automatically adds the prefix 'fs-' to the" 17 | echo "utterance and speaker names. See also utils/data/shift_and_combine_feats.sh" 18 | exit 1 19 | fi 20 | 21 | frame_shift=$1 22 | srcdir=$2 23 | destdir=$3 24 | 25 | 26 | if [ "$destdir" == "$srcdir" ]; then 27 | echo "$0: this script requires and to be different." 28 | exit 1 29 | fi 30 | 31 | if [ ! -f $srcdir/feats.scp ]; then 32 | echo "$0: no such file $srcdir/feats.scp" 33 | exit 1; 34 | fi 35 | 36 | utt_prefix="fs$frame_shift-" 37 | spk_prefix="fs$frame_shift-" 38 | 39 | mkdir -p $destdir 40 | utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ 41 | $srcdir $destdir 42 | 43 | if grep --quiet "'" $srcdir/feats.scp; then 44 | echo "$0: the input features already use single quotes. Can't proceed." 
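# (Note: the awk command further below wraps each piped feats.scp entry in single
#  quotes when it prepends the shift-feats command, so entries that already contain
#  single quotes could not be nested safely; hence we give up here.)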
45 | exit 1; 46 | fi 47 | 48 | awk -v shift=$frame_shift 'NF == 2 {uttid=$1; feat=$2; qt="";} \ 49 | NF > 2 {idx=index($0, " "); uttid=$1; feat=substr($0, idx + 1); qt="\x27";} \ 50 | NF {print uttid " shift-feats --print-args=false --shift=" shift, qt feat qt " - |";}' \ 51 | $destdir/feats.scp >$destdir/feats_shifted.scp 52 | mv -f $destdir/feats_shifted.scp $destdir/feats.scp 53 | 54 | echo "$0: Done" 55 | 56 | -------------------------------------------------------------------------------- /utils/nnet/gen_splice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2012 Brno University of Technology (author: Karel Vesely) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # ./gen_splice.py 19 | # generates Component 20 | 21 | from math import * 22 | import sys 23 | 24 | 25 | from optparse import OptionParser 26 | 27 | parser = OptionParser() 28 | parser.add_option('--fea-dim', dest='dim_in', help='feature dimension') 29 | parser.add_option('--splice', dest='splice', help='number of frames to concatenate with the central frame') 30 | parser.add_option('--splice-step', dest='splice_step', help='splicing step (frames dont need to be consecutive, --splice 3 --splice-step 2 will select offsets: -6 -4 -2 0 2 4 6)', default='1' ) 31 | (options, args) = parser.parse_args() 32 | 33 | if(options.dim_in == None): 34 | parser.print_help() 35 | sys.exit(1) 36 | 37 | dim_in=int(options.dim_in) 38 | splice=int(options.splice) 39 | splice_step=int(options.splice_step) 40 | 41 | dim_out=(2*splice+1)*dim_in 42 | 43 | print '', dim_out, dim_in 44 | print '[', 45 | 46 | splice_vec = range(-splice*splice_step, splice*splice_step+1, splice_step) 47 | for idx in range(len(splice_vec)): 48 | print splice_vec[idx], 49 | 50 | print ']' 51 | 52 | -------------------------------------------------------------------------------- /local/format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | 4 | if [ -f ./path.sh ]; then . ./path.sh; fi 5 | 6 | silprob=0.5 7 | 8 | arpa_lm=data/local/lm/3gram-mincount/lm_pr4.0.gz 9 | lang_test=data/lang_3small_test 10 | arpa_lm=$1 11 | lang_test=$2 12 | . ./utils/parse_options.sh 13 | 14 | 15 | 16 | [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; 17 | 18 | 19 | rm -r $lang_test 20 | cp -r data/lang $lang_test 21 | 22 | echo $arpa_lm 23 | 24 | gunzip -c "$arpa_lm" | \ 25 | arpa2fst --disambig-symbol=#0 \ 26 | --read-symbol-table=$lang_test/words.txt - $lang_test/G.fst 27 | 28 | 29 | echo "Checking how stochastic G is (the first of these numbers should be small):" 30 | fstisstochastic $lang_test/G.fst 31 | 32 | ## Check lexicon. 33 | ## just have a look and make sure it seems sane. 
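## (For reference: fstprint writes one arc per line in OpenFst text format,
##  roughly "src-state dst-state input-phone output-word [weight]", so piping
##  the lexicon FST through head is enough for a quick sanity check.)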
34 | echo "First few lines of lexicon FST:" 35 | fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head 36 | 37 | echo Performing further checks 38 | 39 | # Checking that G.fst is determinizable. 40 | fstdeterminize $lang_test/G.fst /dev/null || echo Error determinizing G. 41 | 42 | # Checking that L_disambig.fst is determinizable. 43 | fstdeterminize $lang_test/L_disambig.fst /dev/null || echo Error determinizing L. 44 | 45 | # Checking that disambiguated lexicon times G is determinizable 46 | # Note: we do this with fstdeterminizestar not fstdeterminize, as 47 | # fstdeterminize was taking forever (presumbaly relates to a bug 48 | # in this version of OpenFst that makes determinization slow for 49 | # some case). 50 | fsttablecompose $lang_test/L_disambig.fst $lang_test/G.fst | \ 51 | fstdeterminizestar >/dev/null || echo Error 52 | 53 | # Checking that LG is stochastic: 54 | fsttablecompose data/lang/L_disambig.fst $lang_test/G.fst | \ 55 | fstisstochastic || echo LG is not stochastic 56 | 57 | 58 | echo format_data succeeded. 59 | -------------------------------------------------------------------------------- /local/data/merge_json.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | from normalize_utils import * 3 | from parse_choices import parse 4 | 5 | def process_outputs(outputs): 6 | L = read_outputs(outputs) 7 | L2 = [] 8 | for name,trans in L: 9 | idx = int(name[1:].replace('.wav','')) 10 | trans = trans.replace(' ','') 11 | L2.append((idx,trans)) 12 | L2 =sorted(L2, key=lambda s: s[0]) 13 | return L2 14 | def write_d(key,X_list,L): 15 | for idx,value in X_list: 16 | for i,l in enumerate(L): 17 | if l["id"] == idx: 18 | L[i][key] = value 19 | break 20 | return L 21 | 22 | if __name__ == '__main__': 23 | A_outputs = sys.argv[1] 24 | B_outputs = sys.argv[2] 25 | C_outputs = sys.argv[3] 26 | idx_json = sys.argv[4] 27 | result_json = sys.argv[5] 28 | A_list = process_outputs(A_outputs) 29 | if len(A_list) != 1500: 30 | print("len(A_list) = {}".format(len(A_list))) 31 | B_list = process_outputs(B_outputs) 32 | if len(B_list) != 1500: 33 | print("len(B_list) = {}".format(len(B_list))) 34 | C_list = process_outputs(C_outputs) 35 | if len(C_list) != 1500: 36 | print("len(C_list) = {}".format(len(C_list))) 37 | all_idx = json.load(open(idx_json,'r')) 38 | all_idx = sorted(all_idx) 39 | L = [] 40 | for idx in all_idx: 41 | d = {"context":"","question":"","options":["","","",""],"id":idx,"answer":-1} 42 | L.append(d) 43 | C_list_parse = [] 44 | for idx,trans in C_list: 45 | options = parse(trans) 46 | C_list_parse.append((idx,options)) 47 | L = write_d("context",A_list,L) 48 | L = write_d("question",B_list,L) 49 | L = write_d("options",C_list_parse,L) 50 | json.dump(L,open(result_json,'w',encoding='utf8'),indent=4,ensure_ascii=False) 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Johns Hopkins University (Dan Povey) 2 | # 2016 Vijayaditya Peddinti 3 | # 2016 Yiming Wang 4 | # Apache 2.0. 5 | 6 | """This library has classes and methods to form neural network computation graphs, 7 | in the nnet3 framework, using higher level abstractions called 'layers' 8 | (e.g. sub-graphs like LSTMS ). 
9 | 10 | Note : We use the term 'layer' though the computation graph can have a highly 11 | non-linear structure as, other terms such as nodes/components have already been 12 | used in C++ codebase of nnet3. 13 | 14 | This is basically a config parser module, where the configs have very concise 15 | descriptions of a neural network. 16 | 17 | This module has methods to convert the xconfigs into a configs interpretable by 18 | nnet3 C++ library. 19 | 20 | It generates three different configs: 21 | 'init.config' : which is the config with the info necessary for computing 22 | the preconditioning matrix i.e., LDA transform 23 | e.g. 24 | input-node name=input dim=40 25 | input-node name=ivector dim=100 26 | output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear 27 | 28 | 'ref.config' : which is a version of the config file used to generate 29 | a model for getting left and right context (it doesn't read 30 | anything for the LDA-like transform and/or 31 | presoftmax-prior-scale components) 32 | 33 | 'final.config' : which has the actual config used to initialize the model used 34 | in training i.e, it has file paths for LDA transform and 35 | other initialization files 36 | """ 37 | 38 | 39 | __all__ = ["utils", "layers", "parser"] 40 | -------------------------------------------------------------------------------- /steps/conf/prepare_word_categories.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | import sys 7 | 8 | from optparse import OptionParser 9 | desc = """ 10 | Prepare mapping of words into categories. Each word with minimal frequency 11 | has its own category, the rest is merged into single class. 12 | """ 13 | usage = "%prog [opts] words.txt ctm category_mapping" 14 | parser = OptionParser(usage=usage, description=desc) 15 | parser.add_option("--min-count", help="Minimum word-count to have a single word category. 
[default %default]", type='int', default=20) 16 | (o, args) = parser.parse_args() 17 | 18 | if len(args) != 3: 19 | parser.print_help() 20 | sys.exit(1) 21 | words_file, text_file, category_mapping_file = args 22 | 23 | if text_file == '-': text_file = '/dev/stdin' 24 | if category_mapping_file == '-': category_mapping_file = '/dev/stdout' 25 | 26 | # Read the words from the 'tra' file, 27 | with open(text_file) as f: 28 | text_words = [ l.split()[1:] for l in f ] 29 | 30 | # Flatten the array of arrays of words, 31 | import itertools 32 | text_words = list(itertools.chain.from_iterable(text_words)) 33 | 34 | # Count the words (regardless if correct or incorrect), 35 | word_counts = dict() 36 | for w in text_words: 37 | if w not in word_counts: word_counts[w] = 0 38 | word_counts[w] += 1 39 | 40 | # Read the words.txt, 41 | with open(words_file) as f: 42 | word_id = [ l.split() for l in f ] 43 | 44 | # Append the categories, 45 | n=1 46 | word_id_cat=[] 47 | for word, idx in word_id: 48 | cat = 0 49 | if word in word_counts: 50 | if word_counts[word] > o.min_count: 51 | cat = n; n += 1 52 | word_id_cat.append([word, idx, str(cat)]) 53 | 54 | # Store the mapping, 55 | with open(category_mapping_file,'w') as f: 56 | f.writelines([' '.join(record)+'\n' for record in word_id_cat]) 57 | -------------------------------------------------------------------------------- /local/lm/wfst/run_wfst.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | model_dir=exp/tri4a 8 | LM=data/LM 9 | text=data/text 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | 13 | #LM training 14 | mkdir -p $LM/3gram 15 | #PYTHONENCODING=utf-8 python3 local/lm/get_all_choices.py #> $wfst/kaggle12_C.txt 16 | ngram-count -text $text/kaggle1234_C.txt -lm $LM/3gram/kaggle1234_C.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 17 | ngram-count -text $text/kaggle12345_C.txt -lm $LM/3gram/kaggle12345_C.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 18 | ngram-count -text $text/mix.txt -lm $LM/3gram/mix.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 19 | 20 | 21 | ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 22 | 23 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle1234_C.lm $LM/3gram/mix.lm \ 24 | $LM/3gram/kaggle12345_C.lm $text/kaggle5_C.txt $LM/3gram/ori_C_10.0.lm 25 | 26 | ( 27 | lm=$LM/3gram/ori_C_10.0.lm 28 | lang_test=./data/wfst/lang_test_pr10_C 29 | graph_dir=exp/tri4a/graph_wfst_pr10_C 30 | #G compilation and check L and G stochastic 31 | local/kaggle/wfst/format_data.sh $lm $lang $lang_test 32 | if false ; then 33 | #Choice fst compilation 34 | local/kaggle/wfst/generate_choice_fst.sh $lang_test/words.txt $lang_test/choice.fst 35 | 36 | 37 | #compose choice.fst and G.fst 38 | mv $lang_test/G.fst $lang_test/G_head.fst 39 | fsttablecompose $lang_test/G_head.fst $lang_test/choice.fst | \ 40 | fstdeterminizestar --use-log=true | \ 41 | fstminimizeencoded > $lang_test/G.fst 42 | fi 43 | 44 | #compose HCLG(choice) 45 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 46 | ) 47 | 48 | wait 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /steps/cleanup/make_utterance_fsts.pl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | # makes unigram decoding-graph FSTs specific to each utterance, where the 5 | # supplied top-n-words list together with the supervision text of the utterance are 6 | # combined. 7 | 8 | if (@ARGV != 1) { 9 | print STDERR "** Warning: this script is deprecated and will be removed. See\n" . 10 | "** steps/cleanup/make_biased_lm_graphs.sh.\n" . 11 | "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" . 12 | "e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" . 13 | " make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n"; 14 | exit(1); 15 | } 16 | 17 | ($top_words_file) = @ARGV; 18 | 19 | open(F, "<$top_words_file") || die "opening $top_words_file"; 20 | 21 | %top_word_probs = ( ); 22 | 23 | while(<F>) { 24 | @A = split; 25 | (@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file"; 26 | $A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n"; 27 | $top_word_probs{$A[1]} += $A[0]; 28 | } 29 | 30 | while (<STDIN>) { 31 | @A = split; 32 | $utterance_id = shift @A; 33 | print "$utterance_id\n"; 34 | $num_words = @A + 0; # length of array @A 35 | %word_probs = %top_word_probs; 36 | foreach $w (@A) { 37 | $w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_"; 38 | $word_probs{$w} += 1.0 / $num_words; 39 | } 40 | foreach $w (keys %word_probs) { 41 | $prob = $word_probs{$w}; 42 | $prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n"; 43 | $cost = -log($prob); 44 | print "0 0 $w $w $cost\n"; 45 | } 46 | $final_cost = -log(1.0 / $num_words); 47 | print "0 $final_cost\n"; 48 | print "\n"; # Empty line terminates the FST in the text-archive format. 49 | } 50 | -------------------------------------------------------------------------------- /local/lm/run_3gram.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | .
path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | LM=data/LM 8 | text=data/text 9 | stage=1 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | if [ $stage -le 0 ] ; then 13 | #LM training 14 | mkdir -p $LM/3gram 15 | ngram-count -text $text/mix.txt -lm $LM/3gram/mix_novel.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 16 | ngram-count -text $text/news.txt -lm $LM/3gram/news.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 -prune 2e-7 17 | ngram -lm $LM/3gram/mix_novel.lm -mix-lm $LM/3gram/news.lm -lambda 0.9 -write-lm $LM/3gram/mix.lm 18 | 19 | ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 20 | 21 | for x in A B C ; do 22 | ngram-count -text $text/kaggle1234_$x.txt -lm $LM/3gram/kaggle1234_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 23 | ngram-count -text $text/kaggle12345_$x.txt -lm $LM/3gram/kaggle12345_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 24 | 25 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle1234_$x.lm $LM/3gram/mix.lm \ 26 | $LM/3gram/kaggle12345_$x.lm $text/kaggle5_$x.txt $LM/3gram/ori_$x\_10.0_kaggle12345.lm 27 | done 28 | fi 29 | if [ $stage -le 1 ] ; then 30 | for x in A B C ; do 31 | ( 32 | lm=$LM/3gram/ori_$x\_10.0_kaggle12345.lm 33 | lang_test=./data/wfst/lang_test_pr10_$x 34 | graph_dir=exp/tri4a/graph_pr10_$x 35 | model_dir=exp/tri4a 36 | model_dir=exp/aishell2/tri4_taiwanese 37 | graph_dir=$model_dir/graph_pr10_$x 38 | #G compilation and check L and G stochastic 39 | local/lm/wfst/format_data.sh $lm $lang $lang_test 40 | #compose HCLG(choice) 41 | rm -r $graph_dir 42 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 43 | ) & 44 | done 45 | wait 46 | fi 47 | -------------------------------------------------------------------------------- /local/kaggle/decode_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bas1 2 | A_dir=/data/local/kgb/corpus/kgb/semi-finals-2018/1/data/wav/A 3 | B_dir=/data/local/kgb/corpus/kgb/semi-finals-2018/1/data/wav/B 4 | C_dir=/data/local/kgb/corpus/kgb/semi-finals-2018/1/data/wav/C 5 | A_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/A 6 | B_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/B 7 | C_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/C 8 | 9 | C_dir=/data/local/kgb/corpus/kgb/kaggle6/data/wav/C 10 | iflytek_A=/data/local/kgb/Chinese-ASR/1110/iflytek_A 11 | src_dir=./1110 12 | decode_A=false 13 | 14 | set -e 15 | set -u 16 | set -o pipefail 17 | . 
path.sh 18 | 19 | mkdir -p $src_dir 20 | 21 | if $decode_A ; then 22 | 23 | #python3 local/kaggle/get_id_list.py $A_dir $src_dir/idx.json || exit 1; 24 | #bash local/kaggle/choose_lm2.sh $iflytek_A $src_dir/A_lm_test $src_dir/choose_lm || exit 1; 25 | 26 | #bash local/kaggle/check_sample_rate.sh $A_dir 27 | local/kaggle/decode_from_wav_seperate_by_lm.sh $A_dir $src_dir/A $src_dir/choose_lm A || exit 1; 28 | python3 local/kaggle/check_output_by_lm.py $src_dir/A 29 | #bash local/kaggle/mix_LM_with_A.sh $src_dir/A/output.txt $src_dir/C_lang 30 | #bash local/kaggle/test/decode_test.sh $test_C_dir $src_dir/C_test $src_dir/C_lang 31 | else 32 | #C:choices 33 | #bash local/kaggle/check_sample_rate.sh $C_dir || exit 1; 34 | local/kaggle/decode_from_wav_seperate_by_lm.sh $C_dir $src_dir/C_aishell_DFSMN_S_fine_tune $src_dir/choose_lm C || exit 1; 35 | python3 local/kaggle/check_output_by_lm.py $src_dir/C || exit 1; 36 | 37 | #B:question 38 | #bash local/kaggle/check_sample_rate.sh $B_dir 39 | #local/kaggle/decode_from_wav_seperate_by_lm.sh $B_dir $src_dir/B $src_dir/choose_lm B || exit 1; 40 | #python3 local/kaggle/check_output_by_lm.py $src_dir/B || exit 1; 41 | 42 | #python3 local/kaggle/merge_json.py $src_dir/A/output.txt $src_dir/B/output.txt $src_dir/C_n5/output.txt $src_dir/idx.json 5 $src_dir/result_kaldi_n.json 43 | 44 | fi 45 | -------------------------------------------------------------------------------- /steps/segmentation/post_process_sad_to_segments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015-17 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | # This script post-processes the output of steps/segmentation/decode_sad.sh, 7 | # which is in the form of frame-level alignments, into a 'segments' file. 8 | # The alignments must be speech activity detection marks i.e. 1 for silence 9 | # and 2 for speech. 10 | 11 | set -e -o pipefail -u 12 | . ./path.sh 13 | 14 | cmd=run.pl 15 | stage=-10 16 | nj=18 17 | 18 | # The values below are in seconds 19 | frame_shift=0.01 20 | segment_padding=0.2 21 | 22 | . utils/parse_options.sh 23 | 24 | if [ $# -ne 3 ]; then 25 | echo "This script post-processes the output of steps/segmentation/decode_sad.sh, " 26 | echo "which is in the form of frame-level alignments, into kaldi segments. " 27 | echo "The alignments must be speech activity detection marks i.e. 1 for silence " 28 | echo "and 2 for speech." 29 | echo "Usage: $0 " 30 | echo " e.g.: $0 data/dev_aspire_whole exp/vad_dev_aspire" 31 | exit 1 32 | fi 33 | 34 | data_dir=$1 35 | vad_dir=$2 # Alignment directory containing frame-level SAD labels 36 | dir=$3 37 | 38 | mkdir -p $dir 39 | 40 | for f in $vad_dir/ali.1.gz $vad_dir/num_jobs; do 41 | if [ ! 
-f $f ]; then 42 | echo "$0: Could not find file $f" && exit 1 43 | fi 44 | done 45 | 46 | nj=`cat $vad_dir/num_jobs` || exit 1 47 | utils/split_data.sh $data_dir $nj 48 | 49 | utils/data/get_utt2dur.sh $data_dir 50 | 51 | if [ $stage -le 0 ]; then 52 | $cmd JOB=1:$nj $dir/log/segmentation.JOB.log \ 53 | copy-int-vector "ark:gunzip -c $vad_dir/ali.JOB.gz |" ark,t:- \| \ 54 | steps/segmentation/internal/sad_to_segments.py \ 55 | --frame-shift=$frame_shift --segment-padding=$segment_padding \ 56 | --utt2dur=$data_dir/utt2dur - $dir/segments.JOB 57 | fi 58 | 59 | echo $nj > $dir/num_jobs 60 | 61 | for n in $(seq $nj); do 62 | cat $dir/segments.$n 63 | done > $dir/segments 64 | -------------------------------------------------------------------------------- /utils/parallel/limit_num_gpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script functions as a wrapper of a bash command that uses GPUs. 4 | # 5 | # It sets the CUDA_VISIBLE_DEVICES variable so that it limits the number of GPUs 6 | # used for programs. It is necessary for running a job on the grid if the job 7 | # would automatically grab all resources available on the system, e.g. a 8 | # TensorFlow program. 9 | 10 | num_gpus=1 # this variable indicates how many GPUs we will allow the command 11 | # passed to this script to run on. We achieve this by setting the 12 | # CUDA_VISIBLE_DEVICES variable 13 | set -e 14 | 15 | if [ "$1" == "--num-gpus" ]; then 16 | num_gpus=$2 17 | shift 18 | shift 19 | fi 20 | 21 | if ! printf "%d" "$num_gpus" >/dev/null || [ $num_gpus -le 0 ]; then 22 | echo $0: Must pass a positive integer after --num-gpus 23 | echo e.g. $0 --num-gpus 2 local/tfrnnlm/run_lstm.sh 24 | exit 1 25 | fi 26 | 27 | if [ $# -eq 0 ]; then 28 | echo "Usage: $0 [--num-gpus <num-gpus>] <command> [<arg1> ...]" 29 | echo "Runs <command> with its args after setting CUDA_VISIBLE_DEVICES to" 30 | echo "make sure exactly <num-gpus> GPUs are visible (default: 1)." 31 | exit 1 32 | fi 33 | 34 | CUDA_VISIBLE_DEVICES= 35 | num_total_gpus=`nvidia-smi -L | wc -l` 36 | num_gpus_assigned=0 37 | 38 | for i in `seq 0 $[$num_total_gpus-1]`; do 39 | # go over all GPUs, check whether each is idle, and add it to the list if so 40 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 41 | CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i, && num_gpus_assigned=$[$num_gpus_assigned+1] 42 | fi 43 | # once we have enough GPUs, break out of the loop 44 | [ $num_gpus_assigned -eq $num_gpus ] && break 45 | done 46 | 47 | [ $num_gpus_assigned -ne $num_gpus ] && echo Could not find enough idle GPUs && exit 1 48 | 49 | export CUDA_VISIBLE_DEVICES=$(echo $CUDA_VISIBLE_DEVICES | sed "s=,$==g") 50 | 51 | echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES" 52 | "$@" 53 | -------------------------------------------------------------------------------- /local/lm/wfst/format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | 4 | 5 | if [ -f ./path.sh ]; then . ./path.sh; fi 6 | 7 | arpa_lm=$1 8 | lang=$2 9 | lang_test=$3 10 | . ./utils/parse_options.sh 11 | 12 | [ !
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; 13 | 14 | 15 | rm -r $lang_test 16 | cp -r $lang $lang_test 17 | 18 | echo $arpa_lm 19 | cat $arpa_lm | arpa2fst --disambig-symbol=#0 \ 20 | --read-symbol-table=$lang_test/words.txt - | fstarcsort --sort_type=olabel > $lang_test/G.fst 21 | #cat $arpa_lm | arpa2fst - | fstprint |\ 22 | # fstcompile --isymbols=$lang_test/words.txt --osymbols=$lang_test/words.txt \ 23 | # --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=olabel > $lang_test/G.fst 24 | 25 | echo "Checking how stochastic G is (the first of these numbers should be small):" 26 | fstisstochastic $lang_test/G.fst 27 | 28 | ## Check lexicon. 29 | ## just have a look and make sure it seems sane. 30 | echo "First few lines of lexicon FST:" 31 | fstprint --isymbols=$lang/phones.txt --osymbols=$lang/words.txt $lang/L.fst | head 32 | 33 | echo Performing further checks 34 | 35 | # Checking that G.fst is determinizable. 36 | fstdeterminize $lang_test/G.fst /dev/null || echo Error determinizing G. 37 | 38 | # Checking that L_disambig.fst is determinizable. 39 | fstdeterminize $lang_test/L_disambig.fst /dev/null || echo Error determinizing L. 40 | 41 | # Checking that disambiguated lexicon times G is determinizable 42 | # Note: we do this with fstdeterminizestar not fstdeterminize, as 43 | # fstdeterminize was taking forever (presumbaly relates to a bug 44 | # in this version of OpenFst that makes determinization slow for 45 | # some case). 46 | fsttablecompose $lang_test/L_disambig.fst $lang_test/G.fst | \ 47 | fstdeterminizestar >/dev/null || echo Error 48 | 49 | # Checking that LG is stochastic: 50 | fsttablecompose $lang/L_disambig.fst $lang_test/G.fst | \ 51 | fstisstochastic || echo LG is not stochastic 52 | 53 | echo format_data succeeded. 54 | 55 | -------------------------------------------------------------------------------- /steps/subset_ali_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | 8 | if [ -f ./path.sh ]; then . ./path.sh; fi 9 | 10 | . ./utils/parse_options.sh 11 | 12 | if [ $# -ne 4 ]; then 13 | cat < from the 16 | original alignment directory containing alignments for utterances in 17 | . 18 | 19 | The number of split jobs in the output alignment directory is 20 | equal to the number of jobs in the original alignment directory, 21 | unless the subset data directory has too few speakers. 22 | 23 | Usage: $0 [options] 24 | e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali 25 | 26 | Options: 27 | --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. 
28 | EOF 29 | exit 1 30 | fi 31 | 32 | data=$1 33 | subset_data=$2 34 | ali_dir=$3 35 | dir=$4 36 | 37 | nj=$(cat $ali_dir/num_jobs) || exit 1 38 | utils/split_data.sh $data $nj 39 | 40 | mkdir -p $dir 41 | cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true 42 | cp -r $ali_dir/phones $dir 2>/dev/null || true 43 | 44 | $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ 45 | copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ 46 | ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 47 | 48 | for n in `seq $nj`; do 49 | cat $dir/ali_tmp.$n.scp 50 | done > $dir/ali_tmp.scp 51 | 52 | num_spk=$(cat $subset_data/spk2utt | wc -l) 53 | if [ $num_spk -lt $nj ]; then 54 | nj=$num_spk 55 | fi 56 | 57 | utils/split_data.sh $subset_data $nj 58 | $cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ 59 | copy-int-vector \ 60 | "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ 61 | "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 62 | 63 | echo $nj > $dir/num_jobs 64 | 65 | rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp 66 | 67 | exit 0 68 | -------------------------------------------------------------------------------- /utils/ln.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use File::Spec; 3 | 4 | if ( @ARGV < 2 ) { 5 | print STDERR "usage: ln.pl input1 input2 dest-dir\n" . 6 | "This script does a soft link of input1, input2, etc." . 7 | "to dest-dir, using relative links where possible\n" . 8 | "Note: input-n and dest-dir may both be absolute pathnames,\n" . 9 | "or relative pathnames, relative to the current directlory.\n"; 10 | exit(1); 11 | } 12 | 13 | $dir = pop @ARGV; 14 | if ( ! -d $dir ) { 15 | print STDERR "ln.pl: last argument must be a directory ($dir is not a directory)\n"; 16 | exit(1); 17 | } 18 | 19 | $ans = 1; # true. 20 | 21 | $absdir = File::Spec->rel2abs($dir); # Get $dir as abs path. 22 | defined $absdir || die "No such directory $dir"; 23 | foreach $file (@ARGV) { 24 | $absfile = File::Spec->rel2abs($file); # Get $file as abs path. 25 | defined $absfile || die "No such file or directory: $file"; 26 | @absdir_split = split("/", $absdir); 27 | @absfile_split = split("/", $absfile); 28 | 29 | $newfile = $absdir . "/" . $absfile_split[$#absfile_split]; # we'll use this 30 | # as the destination in the link command. 31 | $num_removed = 0; 32 | while (@absdir_split > 0 && $absdir_split[0] eq $absfile_split[0]) { 33 | shift @absdir_split; 34 | shift @absfile_split; 35 | $num_removed++; 36 | } 37 | if (-l $newfile) { # newfile is already a link -> safe to delete it. 38 | unlink($newfile); # "unlink" just means delete. 39 | } 40 | if ($num_removed == 0) { # will use absolute pathnames. 41 | $oldfile = "/" . join("/", @absfile_split); 42 | $ret = symlink($oldfile, $newfile); 43 | } else { 44 | $num_dots = @absdir_split; 45 | $oldfile = join("/", @absfile_split); 46 | for ($n = 0; $n < $num_dots; $n++) { 47 | $oldfile = "../" . $oldfile; 48 | } 49 | $ret = symlink($oldfile, $newfile); 50 | } 51 | $ans = $ans && $ret; 52 | if (! $ret) { 53 | print STDERR "Error linking $oldfile to $newfile\n"; 54 | } 55 | } 56 | 57 | exit ($ans == 1 ? 0 : 1); 58 | 59 | -------------------------------------------------------------------------------- /steps/nnet3/chain/gen_topo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2012 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Generate a topology file. 
This allows control of the number of states in the 6 | # non-silence HMMs, and in the silence HMMs. This is a modified version of 7 | # 'utils/gen_topo.pl' that generates a different type of topology, one that we 8 | # believe should be useful in the 'chain' model. Note: right now it doesn't 9 | # have any real options, and it treats silence and nonsilence the same. The 10 | # intention is that you write different versions of this script, or add options, 11 | # if you experiment with it. 12 | 13 | if (@ARGV != 2) { 14 | print STDERR "Usage: utils/gen_topo.pl <colon-separated-nonsilence-phones> <colon-separated-silence-phones>\n"; 15 | print STDERR "e.g.: utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n"; 16 | exit (1); 17 | } 18 | 19 | ($nonsil_phones, $sil_phones) = @ARGV; 20 | 21 | $nonsil_phones =~ s/:/ /g; 22 | $sil_phones =~ s/:/ /g; 23 | $nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n"; 24 | $sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n"; 25 | 26 | print "<Topology>\n"; 27 | print "<TopologyEntry>\n"; 28 | print "<ForPhones>\n"; 29 | print "$nonsil_phones $sil_phones\n"; 30 | print "</ForPhones>\n"; 31 | # The next two lines may look like a bug, but they are as intended. State 0 has 32 | # no self-loop, it happens exactly once. And it can go either to state 1 (with 33 | # a self-loop) or to state 2, so we can have zero or more instances of state 1 34 | # following state 0. 35 | # We make the transition-probs 0.5 so they normalize, to keep the code happy. 36 | # In fact, we always set the transition probability scale to 0.0 in the 'chain' 37 | # code, so they are never used. 38 | print "<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>\n"; 39 | print "<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>\n"; 40 | print "<State> 2 </State>\n"; 41 | print "</TopologyEntry>\n"; 42 | print "</Topology>\n"; 43 | --------------------------------------------------------------------------------
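For context, a minimal usage sketch of steps/nnet3/chain/gen_topo.pl (not part of the repository; the data/lang_chain path and the use of the *.csl phone lists are illustrative assumptions based on standard Kaldi 'chain' recipes):

# hypothetical invocation: build the 'chain' topology from a copy of the lang directory
cp -r data/lang data/lang_chain                      # assumed starting point
nonsil=$(cat data/lang_chain/phones/nonsilence.csl)  # colon-separated non-silence phone ids
sil=$(cat data/lang_chain/phones/silence.csl)        # colon-separated silence phone ids
steps/nnet3/chain/gen_topo.pl $nonsil $sil > data/lang_chain/topo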