├── steps ├── decode_si.sh ├── train_nnet.sh ├── append_feats.sh ├── decode_nnet.sh ├── tandem │ └── decode_si.sh ├── score_kaldi.sh ├── score_kaldi_compare.sh ├── nnet2 │ ├── get_num_frames.sh │ ├── get_ivector_id.sh │ ├── check_ivectors_compatible.sh │ └── remove_egs.sh ├── tfrnnlm │ ├── check_py.py │ └── check_tensorflow_installed.sh ├── libs │ ├── common.pyc │ ├── __init__.pyc │ ├── nnet3 │ │ ├── __init__.pyc │ │ ├── xconfig │ │ │ ├── gru.pyc │ │ │ ├── lstm.pyc │ │ │ ├── utils.pyc │ │ │ ├── __init__.pyc │ │ │ ├── layers.pyc │ │ │ ├── parser.pyc │ │ │ ├── attention.pyc │ │ │ ├── basic_layers.pyc │ │ │ ├── convolution.pyc │ │ │ ├── stats_layer.pyc │ │ │ ├── trivial_layers.pyc │ │ │ ├── layers.py │ │ │ └── __init__.py │ │ ├── train │ │ │ ├── common.pyc │ │ │ ├── __init__.pyc │ │ │ ├── dropout_schedule.pyc │ │ │ ├── chain_objf │ │ │ │ ├── __init__.pyc │ │ │ │ ├── acoustic_model.pyc │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── frame_level_objf │ │ │ │ └── __init__.py │ │ ├── report │ │ │ ├── __init__.pyc │ │ │ ├── log_parse.pyc │ │ │ └── __init__.py │ │ └── __init__.py │ └── __init__.py ├── data │ ├── reverberate_data_dir.pyc │ ├── data_dir_manipulation_lib.pyc │ ├── __pycache__ │ │ ├── reverberate_data_dir.cpython-36.pyc │ │ └── data_dir_manipulation_lib.cpython-36.pyc │ └── data_dir_manipulation_lib.py ├── nnet3 │ ├── chain │ │ ├── e2e │ │ │ └── README.txt │ │ └── gen_topo.pl │ └── nnet3_to_dot.sh ├── conf │ ├── convert_ctm_to_tra.py │ ├── lattice_depth_per_frame.sh │ ├── parse_arpa_unigrams.py │ └── prepare_word_categories.py ├── segmentation │ ├── internal │ │ ├── verify_phones_list.py │ │ └── find_oov_phone.py │ ├── copy_targets_dir.sh │ ├── combine_targets_dirs.sh │ ├── decode_sad.sh │ └── post_process_sad_to_segments.sh ├── online │ └── nnet2 │ │ └── copy_ivector_dir.sh ├── word_align_lattices.sh ├── scoring │ └── score_kaldi_compare.sh ├── cleanup │ └── make_utterance_fsts.pl └── subset_ali_dir.sh ├── utils ├── pbs.pl ├── run.pl ├── fix_ctm.sh ├── queue.pl ├── slurm.pl ├── convert_ctm.pl ├── data │ ├── split_data.sh │ ├── combine_data.sh │ ├── copy_data_dir.sh │ ├── fix_data_dir.sh │ ├── subset_data_dir.sh │ ├── validate_data_dir.sh │ ├── perturb_data_dir_speed.sh │ ├── get_reco2utt_for_data.sh │ ├── get_num_frames.sh │ ├── get_segments_for_data.sh │ ├── extract_wav_segments_data_dir.sh │ ├── get_utt2num_frames.sh │ ├── resample_data_dir.sh │ ├── convert_data_dir_to_whole.sh │ ├── limit_feature_dim.sh │ ├── modify_speaker_info_to_recording.sh │ └── shift_feats.sh ├── lang │ ├── prepare_lang.sh │ ├── validate_lang.pl │ └── add_lex_disambig.pl ├── subset_data_dir_tr_cv.sh ├── filt.py ├── make_absolute.sh ├── ctm │ └── fix_ctm.sh ├── spk2utt_to_utt2spk.pl ├── s2eps.pl ├── eps2disambig.pl ├── build_const_arpa_lm.sh ├── summarize_warnings.pl ├── utt2spk_to_spk2utt.pl ├── shuffle_list.pl ├── analyze_segments.pl ├── show_lattice.sh ├── best_wer.sh ├── remove_oovs.pl ├── add_disambig.pl ├── remove_data_links.sh ├── nnet │ ├── gen_hamm_mat.py │ └── gen_splice.py ├── parallel │ └── limit_num_gpus.sh └── ln.pl ├── conf ├── pitch.conf ├── online_pitch.conf ├── g2p_model ├── online_cmvn.conf ├── decode.config ├── mfcc.conf ├── pinyin_initial ├── fbank.conf ├── cmu2pinyin ├── mfcc_hires.conf └── pinyin2cmu ├── .gitmodules ├── local ├── kaggle │ ├── __pycache__ │ │ ├── xlsx.cpython-36.pyc │ │ └── parse_choices.cpython-36.pyc │ ├── get_ppl.py │ ├── get_best_lambda.py │ ├── accumulate_lambda.py │ ├── get_id_list.py │ ├── check_sample_rate.sh │ ├── see_decode_time.sh │ ├── 
google_drive_download.sh │ ├── choose_lm.py │ ├── copy_error_files.py │ ├── max_ppl.py │ ├── test_lambda.sh │ ├── replace_iflytek_answer.py │ ├── choose_lm2.py │ ├── choose_lm.sh │ ├── parse_text.py │ ├── add.py │ ├── test │ │ ├── select_lm.py │ │ └── decode_test.sh │ ├── check_output.py │ ├── choose_lm2.sh │ ├── mix_LM_with_A.sh │ ├── data_prep_wav_seperate.py │ ├── replace_choice.py │ ├── demo.py │ ├── mix_LM_with_A.py │ ├── decode_demo.sh │ ├── replace_iflytek_choice.py │ ├── decode_kaggle_simulate.sh │ └── decode_kaggle.sh ├── data │ ├── get_total_dur.sh │ ├── __pycache__ │ │ ├── number2chinese.cpython-36.pyc │ │ ├── parse_choices.cpython-36.pyc │ │ ├── data_prep_kaggle.cpython-36.pyc │ │ └── normalize_utils.cpython-36.pyc │ ├── get_total_dur.py │ ├── data_prep_wav.sh │ ├── data_prep_Tl.sh │ ├── data_prep_NER.sh │ ├── data_prep_TOCFL.sh │ ├── data_prep_MATBN.sh │ ├── data_prep_seame.sh │ ├── data_prep_kaggle.sh │ ├── clean_up_data.sh │ ├── data_prep_PTS.sh │ ├── data_prep_cyberon_english.sh │ ├── data_prep_cyberon_chinese.sh │ ├── data_prep_PTS.py │ ├── corpus_path.sh │ ├── data_prep_noise.sh │ ├── word_segmentation.py │ ├── data_prep_wav.py │ ├── extract_ptt.py │ ├── fix_segments.py │ ├── data_prep_TOCFL.py │ ├── data_prep_NER.py │ ├── extract_wiki.py │ ├── data_prep_Tl.py │ ├── normalize.py │ ├── normalize_text.py │ └── merge_json.py ├── score.sh ├── lm │ ├── wfst │ │ ├── compose.sh │ │ ├── generate_choice_fst.sh │ │ ├── temp2.sh │ │ ├── run_wfst.sh │ │ └── format_data.sh │ ├── dirty │ │ ├── train_lms.sh │ │ ├── temp2.sh │ │ ├── mix_lm2.sh │ │ ├── temp.sh │ │ ├── lm_to_carpa.sh │ │ ├── get_3gram_prune.sh │ │ ├── mix_lm3.sh │ │ ├── kaggle4.sh │ │ ├── parse_text.py │ │ ├── compile_lm.sh │ │ ├── format_lm_from_text.sh │ │ └── mix_all_lms.sh │ ├── get_best_lambda.py │ ├── get_best_lambda2.py │ ├── run_4gram.sh │ ├── mix_lm2_test.sh │ ├── get_all_context.py │ ├── get_all_problem.py │ ├── mix_lm3_test.sh │ ├── generate_ori.sh │ ├── get_all_choices.py │ ├── prune_all_lm.sh │ ├── run_3gram_kaggle5.sh │ ├── news_crawler.py │ └── run_3gram.sh ├── show_all_cer.sh ├── modify_utt2spk.sh ├── change_machine.sh ├── nnet │ ├── copy_alignment.sh │ ├── DFSMN_M.proto │ ├── DFSMN_S.proto │ ├── DFSMN_M.proto.2560 │ ├── DFSMN_M.proto.8136 │ ├── DFSMN_S.proto.2560 │ ├── DFSMN_S.proto.8136 │ ├── DFSMN_M_ivector.proto │ ├── DFSMN_S_ivector.proto │ ├── DFSMN_M_ivector.proto.2560 │ ├── DFSMN_S_ivector.proto.2560 │ ├── retrain.sh │ ├── augment_data_only_kgb_noise.sh │ ├── DFSMN_L.proto │ ├── DFSMN_L.proto.2560 │ ├── DFSMN_L.proto.8136 │ ├── DFSMN_L_ivector.proto │ ├── DFSMN_L_ivector.proto.2560 │ └── augment_data.sh ├── combine_kaggle.sh ├── temp.sh ├── create_oov_char_lexicon.pl ├── extract_kaggle_feature.sh └── format_data.sh ├── path.sh ├── cmd.sh └── README.md /steps/decode_si.sh: -------------------------------------------------------------------------------- 1 | decode.sh -------------------------------------------------------------------------------- /utils/pbs.pl: -------------------------------------------------------------------------------- 1 | parallel/pbs.pl -------------------------------------------------------------------------------- /utils/run.pl: -------------------------------------------------------------------------------- 1 | parallel/run.pl -------------------------------------------------------------------------------- /steps/train_nnet.sh: -------------------------------------------------------------------------------- 1 | nnet/train.sh 
-------------------------------------------------------------------------------- /utils/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | ctm/fix_ctm.sh -------------------------------------------------------------------------------- /utils/queue.pl: -------------------------------------------------------------------------------- 1 | parallel/queue.pl -------------------------------------------------------------------------------- /utils/slurm.pl: -------------------------------------------------------------------------------- 1 | parallel/slurm.pl -------------------------------------------------------------------------------- /steps/append_feats.sh: -------------------------------------------------------------------------------- 1 | paste_feats.sh -------------------------------------------------------------------------------- /steps/decode_nnet.sh: -------------------------------------------------------------------------------- 1 | nnet/decode.sh -------------------------------------------------------------------------------- /steps/tandem/decode_si.sh: -------------------------------------------------------------------------------- 1 | decode.sh -------------------------------------------------------------------------------- /utils/convert_ctm.pl: -------------------------------------------------------------------------------- 1 | ctm/convert_ctm.pl -------------------------------------------------------------------------------- /conf/pitch.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=8000 2 | -------------------------------------------------------------------------------- /utils/data/split_data.sh: -------------------------------------------------------------------------------- 1 | ../split_data.sh -------------------------------------------------------------------------------- /steps/score_kaldi.sh: -------------------------------------------------------------------------------- 1 | scoring/score_kaldi_wer.sh -------------------------------------------------------------------------------- /utils/data/combine_data.sh: -------------------------------------------------------------------------------- 1 | ../combine_data.sh -------------------------------------------------------------------------------- /utils/data/copy_data_dir.sh: -------------------------------------------------------------------------------- 1 | ../copy_data_dir.sh -------------------------------------------------------------------------------- /utils/data/fix_data_dir.sh: -------------------------------------------------------------------------------- 1 | ../fix_data_dir.sh -------------------------------------------------------------------------------- /utils/lang/prepare_lang.sh: -------------------------------------------------------------------------------- 1 | ../prepare_lang.sh -------------------------------------------------------------------------------- /utils/lang/validate_lang.pl: -------------------------------------------------------------------------------- 1 | ../validate_lang.pl -------------------------------------------------------------------------------- /conf/online_pitch.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | -------------------------------------------------------------------------------- /utils/data/subset_data_dir.sh: -------------------------------------------------------------------------------- 1 | 
../subset_data_dir.sh -------------------------------------------------------------------------------- /utils/lang/add_lex_disambig.pl: -------------------------------------------------------------------------------- 1 | ../add_lex_disambig.pl -------------------------------------------------------------------------------- /steps/score_kaldi_compare.sh: -------------------------------------------------------------------------------- 1 | scoring/score_kaldi_compare.sh -------------------------------------------------------------------------------- /utils/data/validate_data_dir.sh: -------------------------------------------------------------------------------- 1 | ../validate_data_dir.sh -------------------------------------------------------------------------------- /utils/subset_data_dir_tr_cv.sh: -------------------------------------------------------------------------------- 1 | nnet/subset_data_tr_cv.sh -------------------------------------------------------------------------------- /steps/nnet2/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | ../../utils/data/get_num_frames.sh -------------------------------------------------------------------------------- /utils/data/perturb_data_dir_speed.sh: -------------------------------------------------------------------------------- 1 | ../perturb_data_dir_speed.sh -------------------------------------------------------------------------------- /steps/tfrnnlm/check_py.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | -------------------------------------------------------------------------------- /conf/g2p_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/conf/g2p_model -------------------------------------------------------------------------------- /steps/libs/common.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/common.pyc -------------------------------------------------------------------------------- /steps/libs/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/__init__.pyc -------------------------------------------------------------------------------- /conf/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/gru.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/gru.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/train/common.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/common.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/lstm.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/lstm.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/utils.pyc -------------------------------------------------------------------------------- /steps/data/reverberate_data_dir.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/reverberate_data_dir.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/report/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/report/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/report/log_parse.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/report/log_parse.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/train/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/__init__.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/layers.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/parser.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/parser.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/attention.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/attention.pyc -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "local/data/tool/jieba-zh_TW"] 2 | path = local/data/tool/jieba-zh_TW 3 | url = https://github.com/APCLab/jieba-tw 4 | -------------------------------------------------------------------------------- /steps/data/data_dir_manipulation_lib.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/data_dir_manipulation_lib.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/basic_layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/basic_layers.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/convolution.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/convolution.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/stats_layer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/stats_layer.pyc -------------------------------------------------------------------------------- /local/kaggle/__pycache__/xlsx.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/kaggle/__pycache__/xlsx.cpython-36.pyc -------------------------------------------------------------------------------- /local/kaggle/get_ppl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | S = sys.stdin.read() 3 | start = S.find('ppl=') 4 | endd = S.find('ppl1=') 5 | print(S[start+5:endd]) 6 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/dropout_schedule.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/dropout_schedule.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/trivial_layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/xconfig/trivial_layers.pyc -------------------------------------------------------------------------------- /conf/decode.config: -------------------------------------------------------------------------------- 1 | beam=11.0 # beam for decoding. Was 13.0 in the scripts. 2 | first_beam=8.0 # beam for 1st-pass decoding in SAT. 
3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /local/data/get_total_dur.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | datadir=$1 3 | bash utils/data/get_utt2dur.sh $datadir 4 | python local/data/get_total_dur.py $datadir 5 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/chain_objf/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/chain_objf/__init__.pyc -------------------------------------------------------------------------------- /conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false # only non-default option. 2 | --sample-frequency=16000 # Switchboard is sampled at 8kHz 3 | --allow_downsample=true 4 | -------------------------------------------------------------------------------- /local/data/__pycache__/number2chinese.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/number2chinese.cpython-36.pyc -------------------------------------------------------------------------------- /local/data/__pycache__/parse_choices.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/parse_choices.cpython-36.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/train/chain_objf/acoustic_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/libs/nnet3/train/chain_objf/acoustic_model.pyc -------------------------------------------------------------------------------- /local/data/__pycache__/data_prep_kaggle.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/data_prep_kaggle.cpython-36.pyc -------------------------------------------------------------------------------- /local/data/__pycache__/normalize_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/data/__pycache__/normalize_utils.cpython-36.pyc -------------------------------------------------------------------------------- /local/kaggle/__pycache__/parse_choices.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/local/kaggle/__pycache__/parse_choices.cpython-36.pyc -------------------------------------------------------------------------------- /steps/libs/nnet3/report/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | from . 
import log_parse 7 | 8 | __all__ = ["log_parse"] 9 | -------------------------------------------------------------------------------- /local/score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -o pipefail 4 | set -x 5 | steps/score_kaldi.sh "$@" 6 | steps/scoring/score_kaldi_cer.sh --stage 2 "$@" 7 | 8 | echo "$0: Done" 9 | -------------------------------------------------------------------------------- /steps/data/__pycache__/reverberate_data_dir.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/__pycache__/reverberate_data_dir.cpython-36.pyc -------------------------------------------------------------------------------- /local/lm/wfst/compose.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | 4 | 5 | fsttablecompose G.fst b.fst | \ 6 | fstdeterminizestar --use-log=true | \ 7 | fstminimizeencoded > bG.fst 8 | -------------------------------------------------------------------------------- /conf/pinyin_initial: -------------------------------------------------------------------------------- 1 | B 2 | C 3 | CH 4 | D 5 | F 6 | G 7 | H 8 | J 9 | K 10 | L 11 | M 12 | N 13 | P 14 | Q 15 | R 16 | S 17 | SH 18 | T 19 | W 20 | X 21 | Y 22 | Z 23 | ZH 24 | -------------------------------------------------------------------------------- /steps/data/__pycache__/data_dir_manipulation_lib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackyyy0228/Chinese-ASR/HEAD/steps/data/__pycache__/data_dir_manipulation_lib.cpython-36.pyc -------------------------------------------------------------------------------- /local/lm/dirty/train_lms.sh: -------------------------------------------------------------------------------- 1 | . 
../path.sh 2 | for x in guan water nie 20years laotsan water ; do 3 | ngram-count -text text_test/$x\.txt -lm text_test/$x\.lm -vocab text_test/vocab.txt -limit-vocab -order 4 4 | done 5 | -------------------------------------------------------------------------------- /local/show_all_cer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dirs=$1 3 | [ -z $dirs ] && dirs="exp/* exp/nnet/* exp/aishell2/* exp/nnet/aishell2/*" 4 | for x in $dirs/decode*; do 5 | [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; 6 | done 7 | -------------------------------------------------------------------------------- /local/lm/get_best_lambda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | log_file = sys.argv[1] 3 | s = open(log_file,encoding='utf8').read() 4 | start = s.find('best lambda') 5 | start = start + 13 6 | end = s[start:].find(' ') 7 | end+=start 8 | print(s[start:end]) 9 | -------------------------------------------------------------------------------- /local/kaggle/get_best_lambda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | log_file = sys.argv[1] 3 | s = open(log_file,encoding='utf8').read() 4 | start = s.find('best lambda') 5 | start = start + 13 6 | end = s[start:].find(' ') 7 | end+=start 8 | print(s[start:end]) 9 | -------------------------------------------------------------------------------- /steps/libs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | """ This package contains modules and subpackages used in kaldi scripts. 7 | """ 8 | 9 | from . import common 10 | 11 | __all__ = ["common"] 12 | -------------------------------------------------------------------------------- /local/lm/dirty/temp2.sh: -------------------------------------------------------------------------------- 1 | for novel in journey_west red_mansion ; do 2 | for x in A B C ; do 3 | local/mix_lm3.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\_kaggle12.lm $x\_lambda 4 | done 5 | done 6 | wait 7 | 8 | -------------------------------------------------------------------------------- /local/lm/get_best_lambda2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | log_file = sys.argv[1] 3 | s = open(log_file,encoding='utf8').read() 4 | start = s.find('best lambda') 5 | start = start + 13 6 | end = s[start:].find(' ') 7 | end+=start 8 | end2 = s[end+1:].find(' ') 9 | print(s[end+1:][:end2]) 10 | -------------------------------------------------------------------------------- /local/kaggle/accumulate_lambda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | best_lambda_file = sys.argv[1] 4 | L = [] 5 | with open(best_lambda_file,'r',encoding='utf-8') as f: 6 | for line in f: 7 | L.append(float(line.rstrip())) 8 | print(np.mean(L),np.var(L)) 9 | 10 | -------------------------------------------------------------------------------- /conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --htk-compat=false 2 | --window-type=hamming # disable Dans window, use the standard 3 | --use-energy=false # only fbank outputs 4 | --dither=1 5 | --num-mel-bins=80 # 8 filters/octave, 40 filters/16Khz as used by IBM 6 | 
--sample-frequency=16000 7 | --allow_downsample=true 8 | -------------------------------------------------------------------------------- /local/data/get_total_dur.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | datadir = sys.argv[1] 3 | 4 | total = 0.0 5 | with open(os.path.join(datadir,'utt2dur'),'r') as f: 6 | for line in f: 7 | tokens = line.rstrip().split() 8 | dur = float(tokens[1]) 9 | total += dur 10 | print('Total : {:f} minutes.'.format(total/60)) 11 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/chain_objf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | """ This is a subpackage containing modules for training of 7 | deep neural network acoustic model with chain objective. 8 | """ 9 | 10 | from . import acoustic_model 11 | 12 | __all__ = ["acoustic_model"] 13 | -------------------------------------------------------------------------------- /local/kaggle/get_id_list.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | wav_dir = sys.argv[1] 3 | output_json = sys.argv[2] 4 | L = [] 5 | for wav in os.listdir(wav_dir): 6 | if wav.endswith('.wav'): 7 | name = wav[1:].replace('.wav','') 8 | idx = int(name) 9 | L.append(idx) 10 | json.dump(L,open(output_json,'w')) 11 | 12 | 13 | -------------------------------------------------------------------------------- /local/modify_utt2spk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | datadir=$1 3 | if [ -f $datadir/utt2spk ] ; then 4 | mv $datadir/utt2spk $datadir/utt2spk_backup 5 | cat $datadir/utt2spk_backup | awk '{print $1 " " $1}' > $datadir/utt2spk 6 | cat $datadir/utt2spk | utils/utt2spk_to_spk2utt.pl > $datadir/spk2utt || exit 1; 7 | utils/fix_data_dir.sh $datadir || exit 1; 8 | fi 9 | -------------------------------------------------------------------------------- /local/change_machine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # /data/local/kgb/Chinese-ASR/data 3 | # /home/jacky/work/kgb/ 4 | 5 | 6 | for dir in data ; do 7 | for scp in $dir/*/*/*.scp ; do 8 | cat $scp | sed 's/home\/jackyyy/data\/local/g' > ${scp}2 9 | mv $scp ${scp}_backup 10 | mv ${scp}2 $scp 11 | echo "Changing path of $scp" 12 | done 13 | done 14 | -------------------------------------------------------------------------------- /local/lm/run_4gram.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | novels="ori 20years nie guan laotsan water journey_west red_mansion 3kingdom beauty_n hunghuang lai_ho old_time one_gan lu_shun news" 4 | #local/lm/generate_ori.sh 5 | for novel in $novels ; do 6 | ( 7 | txt=data/text/$novel.txt 8 | local/lm/text2Gfst.sh $txt 9 | )& 10 | done 11 | 12 | wait 13 | 14 | -------------------------------------------------------------------------------- /local/data/data_prep_wav.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# != 2 ]; then 4 | echo " Usage : data_prep_wav.sh <wav_dir> <data_dir>" 5 | fi 6 | 7 | wav=$1 8 | data=$2 9 | 10 | mkdir -p $data 11 | 12 | python3 local/data/data_prep_wav.py $wav $data 13 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 14 | utils/fix_data_dir.sh $data || exit 1; 15 | 
-------------------------------------------------------------------------------- /conf/cmu2pinyin: -------------------------------------------------------------------------------- 1 | AA A 2 | AE A 3 | AH A 4 | AO UO 5 | AW U 6 | AY AI 7 | B B 8 | CH CH 9 | D D 10 | DH S I 11 | EH AI 12 | ER E 13 | EY AI 14 | F F 15 | G G 16 | HH H 17 | IH I 18 | IY I 19 | JH ZH 20 | K K 21 | L L 22 | M M 23 | N N 24 | NG N 25 | OW UO 26 | OY UO 27 | P P 28 | R R 29 | S S 30 | SH SH 31 | T T 32 | TH S 33 | UH U 34 | UW U 35 | V W 36 | W W 37 | Y Y 38 | Z Z 39 | ZH X 40 | -------------------------------------------------------------------------------- /utils/filt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Apache 2.0 4 | 5 | import sys 6 | 7 | vocab=set() 8 | with open(sys.argv[1]) as vocabfile: 9 | for line in vocabfile: 10 | vocab.add(line.strip()) 11 | 12 | with open(sys.argv[2]) as textfile: 13 | for line in textfile: 14 | print " ".join(map(lambda word: word if word in vocab else '', line.strip().split())) 15 | -------------------------------------------------------------------------------- /local/kaggle/check_sample_rate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wav_dir=$1 3 | for wav in $wav_dir/*.wav ; do 4 | sr=`sox --i -r $wav` 5 | if [ "$sr" != "16000" ] ; then 6 | echo $wav $sr 7 | name=${wav::-4} 8 | sox $wav -r 16000 ${name}2.wav 9 | mv ${name}2.wav $wav 10 | fi 11 | name=${wav::-4} 12 | sox $wav -t wav -r 16000 -b 16 ${name}2.wav 13 | mv ${name}2.wav $wav 14 | done 15 | -------------------------------------------------------------------------------- /local/kaggle/see_decode_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | test_dir=$1 3 | for dir in $test_dir/* ; do 4 | if [ -d $dir ] && [ -f $dir/3small_time ] && [ -f $dir/rescore_time ] ; then 5 | echo $dir 6 | cat $dir/3small_time 7 | cat $dir/rescore_time 8 | du -sh $dir/decode*/lat* 9 | cat $dir/rescore_lang 10 | echo " " 11 | echo "-------------------------------" 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /path.sh: -------------------------------------------------------------------------------- 1 | export KALDI_ROOT=/home/kgb/kaldi-DFSMN/kaldi 2 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 3 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH 4 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 5 | . $KALDI_ROOT/tools/config/common_path.sh 6 | export LC_ALL=C 7 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2016 Vimal Manohar 3 | # Apache 2.0 4 | 5 | """ This library has classes and methods commonly used for training nnet3 6 | neural networks. 
7 | 8 | It has separate submodules for frame-level objectives and chain objective: 9 | frame_level_objf -- For both recurrent and non-recurrent architectures 10 | chain_objf -- LF-MMI objective training 11 | """ 12 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/layers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Johns Hopkins University (Dan Povey) 2 | # 2016 Vijayaditya Peddinti 3 | # 2016 Yiming Wang 4 | # Apache 2.0. 5 | 6 | from .basic_layers import * 7 | from .convolution import * 8 | from .attention import * 9 | from .lstm import * 10 | from .gru import * 11 | from .stats_layer import * 12 | from .trivial_layers import * 13 | -------------------------------------------------------------------------------- /steps/libs/nnet3/train/frame_level_objf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """ This library has classes and methods commonly used for training nnet3 7 | neural networks with frame-level objectives. 8 | """ 9 | 10 | from . import common 11 | from . import raw_model 12 | from . import acoustic_model 13 | 14 | __all__ = ["common", "raw_model", "acoustic_model"] 15 | -------------------------------------------------------------------------------- /local/data/data_prep_Tl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/Tl/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_Tl.py $Tl $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/data/data_prep_NER.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/NER/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_NER.py $NER $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/data/data_prep_TOCFL.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . 
./local/data/corpus_path.sh 4 | data=./data/TOCFL/mfcc39_pitch9 5 | mkdir -p $data 6 | for x in wav.scp text utt2spk ; do 7 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_TOCFL.py $TOCFL $x | sort -k1,1 -u > $data/$x || exit 1; 8 | done 9 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 10 | utils/fix_data_dir.sh $data || exit 1; 11 | -------------------------------------------------------------------------------- /local/kaggle/google_drive_download.sh: -------------------------------------------------------------------------------- 1 | function gdrive_download () { 2 | CONFIRM=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=$1" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p') 3 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$CONFIRM&id=$1" -O $2 4 | rm -rf /tmp/cookies.txt 5 | } 6 | -------------------------------------------------------------------------------- /local/lm/dirty/mix_lm2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ../path.sh 3 | lm1=$1 4 | lm2=$2 5 | test_text=$3 6 | lm_out=$4 7 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 8 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 9 | compute-best-mix lm1.ppl lm2.ppl > log 10 | lambda=`python3 local/get_best_lambda.py log` 11 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm2 -lambda $lambda -write-lm $lm_out 12 | rm lm1.ppl lm2.ppl log 13 | -------------------------------------------------------------------------------- /local/data/data_prep_MATBN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/MATBN/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text segments ; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_MATBN.py $MATBN $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/data/data_prep_seame.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | data=./data/seame/mfcc39_pitch9 5 | mkdir -p $data 6 | 7 | for x in wav.scp utt2spk text segments ; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_seame.py $seame $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | -------------------------------------------------------------------------------- /local/lm/mix_lm2_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
path.sh 3 | lm1=$1 4 | lm2=$2 5 | lm_replace=$3 6 | test_text=$4 7 | lm_out=$5 8 | 9 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 10 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 11 | compute-best-mix lm1.ppl lm2.ppl > log 12 | lambda=`python3 local/lm/get_best_lambda.py log` 13 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm_replace -lambda $lambda -write-lm $lm_out 14 | rm lm1.ppl lm2.ppl log 15 | -------------------------------------------------------------------------------- /steps/libs/nnet3/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2016 Johns Hopkins University (Dan Povey) 4 | # 2016 Vimal Manohar 5 | # 2016 Vijayaditya Peddinti 6 | # 2016 Yiming Wang 7 | # Apache 2.0. 8 | 9 | 10 | # This module has the python functions which facilitate the use of nnet3 toolkit 11 | # It has two sub-modules 12 | # xconfig : Library for parsing high level description of neural networks 13 | # train : Library for training scripts 14 | -------------------------------------------------------------------------------- /local/lm/dirty/temp.sh: -------------------------------------------------------------------------------- 1 | for novel in 3kingdom ; do 2 | for x in A B C ; do 3 | local/mix_lm3.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\_kaggle12.lm 4 | local/mix_lm3_test.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/kaggle123_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\.lm 5 | local/compile_lm.sh LM/$novel\_$x\_kaggle12.lm & 6 | local/compile_lm.sh LM/$novel\_$x\.lm & 7 | done 8 | done 9 | wait 10 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/data/') 3 | from normalize_utils import * 4 | 5 | text_file = sys.argv[1] 6 | output_dir = sys.argv[2] 7 | with open(text_file,'r',encoding='utf-8') as f: 8 | for line in f: 9 | start = line.find(' ') 10 | token1 = line.split()[0] 11 | tex = normalize(line[start:].replace(' ','')) 12 | with open(os.path.join(output_dir,token1),'w',encoding='utf-8') as f: 13 | f.write(tex) 14 | 15 | 16 | -------------------------------------------------------------------------------- /local/data/data_prep_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kaggle=/data/local/kgb/corpus/kgb/kaggle6 4 | 5 | for typ in A B C ; do 6 | data=./data/kaggle6/$typ/fbank 7 | mkdir -p $data 8 | for x in wav.scp utt2spk text ; do 9 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_kaggle.py $kaggle $x $typ data/lang/words.txt | sort -k1,1 -u > $data/$x || exit 1; 10 | done 11 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 12 | utils/fix_data_dir.sh $data || exit 1; 13 | done 14 | -------------------------------------------------------------------------------- /local/kaggle/copy_error_files.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from shutil import copyfile 3 | error_list = sys.argv[1] 4 | src_dir = sys.argv[2] 5 | target_dir = sys.argv[3] 6 | 7 | 8 | if not os.path.isdir(target_dir): 9 | os.makedirs(target_dir) 10 | 11 | with open(error_list,'r') as f: 12 | for line in f: 13 | name = line.rstrip() 14 | src = os.path.join(src_dir,name) 15 | target = 
os.path.join(target_dir,name) 16 | print("Copy {} to {} ".format(src,target)) 17 | copyfile(src,target) 18 | -------------------------------------------------------------------------------- /local/lm/dirty/lm_to_carpa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | for lang in lm_test/LM/* ; do 4 | ( 5 | if [ -d $lang ]; then 6 | bos=`grep "<s>" $lang/words.txt | awk '{print $2}'` 7 | eos=`grep "</s>" $lang/words.txt | awk '{print $2}'` 8 | unk=`cat $lang/oov.int` 9 | 10 | cat $lang.lm | \ 11 | utils/map_arpa_lm.pl $lang/words.txt | \ 12 | arpa-to-const-arpa --bos-symbol=$bos --eos-symbol=$eos \ 13 | --unk-symbol=$unk - $lang/G.carpa 14 | fi 15 | )& 16 | done 17 | wait 18 | -------------------------------------------------------------------------------- /local/lm/dirty/get_3gram_prune.sh: -------------------------------------------------------------------------------- 1 | . path.sh 2 | bash local/mix_lm3.sh lm_test/text_test/3gram_ori.lm lm_test/text_test/news.lm lm_test/text_test/all_novels_3gram.lm \ 3 | text_test/kaggle123_A.txt text_test/3gram_mix.lm 4 | ngram -lm lm_test/text_test/3gram_mix.lm -prune 0.0000001 -write-lm lm_test/text_test/3gram_mix_prune.lm 5 | gzip lm_test/text_test/3gram_mix_prune.lm 6 | bash local/format_data.sh lm_test/text_test/3gram_mix_prune.lm.gz data/lang_3small_mix_test 7 | utils/mkgraph.sh data/lang_3small_mix_test exp/tri4a exp/tri4a/graph_mix 8 | -------------------------------------------------------------------------------- /local/data/clean_up_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | . ./cmd.sh 4 | . ./utils/parse_options.sh 5 | 6 | set -e 7 | set -u 8 | set -o pipefail 9 | 10 | data=./data/kaggle3/mfcc39_pitch9 11 | name=kaggle3 12 | nj=40 13 | 14 | steps/align_fmllr.sh --cmd "$train_cmd" --nj $nj \ 15 | $data data/lang exp/tri4a exp/tri4a_ali_$name || exit 1; 16 | 17 | steps/cleanup/clean_and_segment_data.sh --cmd "$train_cmd" --nj $nj $data data/lang \ 18 | exp/tri4a_ali_$name exp/tri4a_cleanup_$name data/kaggle3/cleaned_mfcc39_pitch9 || exit 1; 19 | -------------------------------------------------------------------------------- /local/data/data_prep_PTS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | data=./data/PTS/mfcc39_pitch9 4 | mkdir -p $data 5 | 6 | cp $PTS/PTS_segmented/{text,segments} $data/ 7 | 8 | cat $PTS/PTS_segmented/segments | awk '{print $1 " " $1}' | sort -k1,1 -u > $data/utt2spk 9 | 10 | for x in wav.scp ; do 11 | python3 local/data/data_prep_PTS.py $PTS $x | sort -k1,1 -u > $data/$x || exit 1; 12 | done 13 | 14 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 15 | utils/fix_data_dir.sh $data || exit 1; 16 | -------------------------------------------------------------------------------- /local/data/data_prep_cyberon_english.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
./local/data/corpus_path.sh 3 | 4 | for part in cyberon_english_train cyberon_english_test ; do 5 | data=./data/$part/mfcc39_pitch9 6 | mkdir -p $data 7 | for x in wav.scp text utt2spk ; do 8 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_cyberon_english.py $cyberon_english $part $x | sort -k1,1 -u > $data/$x || exit 1; 9 | done 10 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 11 | utils/fix_data_dir.sh $data || exit 1; 12 | done 13 | -------------------------------------------------------------------------------- /local/data/data_prep_cyberon_chinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./local/data/corpus_path.sh 3 | 4 | 5 | for part in cyberon_chinese_train cyberon_chinese_test ; do 6 | data=./data/$part/mfcc39_pitch9 7 | mkdir -p $data 8 | for x in wav.scp text utt2spk ; do 9 | PYTHONIOENCODING=utf-8 python3 local/data/data_prep_cyberon_chinese.py $cyberon_chinese $part $x | sort -k1,1 -u > $data/$x || exit 1; 10 | done 11 | cat $data/utt2spk | utils/utt2spk_to_spk2utt.pl > $data/spk2utt || exit 1; 12 | utils/fix_data_dir.sh $data || exit 1; 13 | done 14 | 15 | -------------------------------------------------------------------------------- /local/lm/get_all_context.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/kaggle') 3 | sys.path.append('local/data/') 4 | import xlsx 5 | from normalize_utils import * 6 | import itertools 7 | 8 | if __name__ == '__main__': 9 | word_list = get_word_list('data/wfst/lang/words.txt') 10 | for i in range(4,5): 11 | xlsx_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(i) 12 | tmp = xlsx.get_content(xlsx_path,True,word_list) 13 | for row in tmp: 14 | print(row[1]) 15 | print(' '.join(list(row[1].replace(' ','')))) 16 | -------------------------------------------------------------------------------- /local/lm/get_all_problem.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/kaggle') 3 | sys.path.append('local/data/') 4 | import xlsx 5 | from normalize_utils import * 6 | import itertools 7 | 8 | if __name__ == '__main__': 9 | word_list = get_word_list('data/wfst/lang/words.txt') 10 | for i in range(4,5): 11 | xlsx_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(i) 12 | tmp = xlsx.get_content(xlsx_path,True,word_list) 13 | for row in tmp: 14 | print(row[2]) 15 | print(' '.join(list(row[2].replace(' ','')))) 16 | -------------------------------------------------------------------------------- /local/data/data_prep_PTS.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | def main(pts_path,file_type): 3 | for root, dirs, files in os.walk(pts_path, topdown=False): 4 | for name in files: 5 | if name.endswith('.wav'): 6 | wav_label = name.split('.')[0] 7 | wav_path = os.path.join(root,name) 8 | wav_path = os.path.abspath(wav_path) 9 | if file_type == 'wav.scp': 10 | print(wav_label,wav_path) 11 | if __name__ == '__main__': 12 | pts_path = sys.argv[1] 13 | file_type = sys.argv[2] 14 | main(pts_path,file_type) 15 | 16 | -------------------------------------------------------------------------------- /local/data/corpus_path.sh: -------------------------------------------------------------------------------- 1 | cyberon_chinese=/home/jacky/work/kgb/corpus/CyberonChinese 2 | 
cyberon_english=/home/jacky/work/kgb/corpus/CyberonEnglish 3 | eatmic=/home/jacky/work/kgb/corpus/EatMic16 4 | PTS=/home/jacky/work/kgb/corpus/PTS-MSub-Vol1 5 | NER=/home/jacky/work/kgb/corpus/NER-Trs-Vol1 6 | TOCFL=/home/jacky/work/kgb/corpus/TOCFL/segmented 7 | seame=/home/jacky/work/kgb/corpus/seame 8 | Tl=/home/jacky/work/kgb/corpus/TlAlphaDigit 9 | wiki=/home/jacky/work/kgb/corpus/wiki 10 | ptt=/home/jacky/work/kgb/corpus/ptt 11 | MATBN=/home/jacky/work/kgb/corpus/MATBN 12 | aishell2=/data/local/kgb/corpus/AISHELL-2/iOS/data 13 | -------------------------------------------------------------------------------- /local/kaggle/max_ppl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | names = [] 4 | scores = [] 5 | wav_name = sys.argv[1] 6 | flag=sys.argv[3] 7 | with open(sys.argv[2],'r') as f: 8 | for idx,line in enumerate(f): 9 | if idx %2 == 0 : 10 | name = line.rstrip() 11 | names.append(name) 12 | else: 13 | score = line.rstrip() 14 | scores.append(float(score)) 15 | min_idx = np.argmin(scores) 16 | min_name = names[min_idx] 17 | min_score = scores[min_idx] 18 | if flag == '3': 19 | print(wav_name,min_name,min_score) 20 | else: 21 | print(min_name) 22 | 23 | -------------------------------------------------------------------------------- /local/lm/mix_lm3_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | lm1=$1 4 | lm2=$2 5 | lm3=$3 6 | lm_replace=$4 7 | test_text=$5 8 | lm_out=$6 9 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 10 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 11 | ngram -lm $lm3 -ppl $test_text -debug 2 > lm3.ppl 12 | compute-best-mix lm1.ppl lm2.ppl lm3.ppl > log 13 | lambda=`python3 local/lm/get_best_lambda.py log` 14 | lambda2=`python3 local/lm/get_best_lambda2.py log` 15 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm_replace -lambda $lambda -mix-lm2 $lm2 -mix-lambda2 $lambda2 -write-lm $lm_out 16 | rm lm1.ppl lm2.ppl lm3.ppl log 17 | -------------------------------------------------------------------------------- /local/lm/dirty/mix_lm3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ../path.sh 3 | lm1=$1 4 | lm2=$2 5 | lm3=$3 6 | test_text=$4 7 | lm_out=$5 8 | #lambda3=$6 9 | ngram -lm $lm1 -ppl $test_text -debug 2 > lm1.ppl 10 | ngram -lm $lm2 -ppl $test_text -debug 2 > lm2.ppl 11 | ngram -lm $lm3 -ppl $test_text -debug 2 > lm3.ppl 12 | compute-best-mix lm1.ppl lm2.ppl lm3.ppl > log 13 | lambda=`python3 local/get_best_lambda.py log` 14 | lambda2=`python3 local/get_best_lambda2.py log` 15 | #echo "$lambda $lambda2" >> $lambda3 16 | ngram -lm $lm1 -ppl $test_text -mix-lm $lm3 -lambda $lambda -mix-lm2 $lm2 -mix-lambda2 $lambda2 -write-lm $lm_out 17 | rm lm1.ppl lm2.ppl lm3.ppl log 18 | -------------------------------------------------------------------------------- /local/lm/dirty/kaggle4.sh: -------------------------------------------------------------------------------- 1 | . 
../path.sh 2 | for novel in 20years guan laotsan nie water ; do 3 | ngram -lm text_test/ori.lm -mix-lm text_test/kaggle123_A.lm -lambda 0.15 -mix-lm2 text_test/$novel.lm -mix-lambda2 0.8 -write-lm LM/$novel\_A\.lm 4 | ngram -lm text_test/ori.lm -mix-lm text_test/kaggle123_B.lm -lambda 0.16 -mix-lm2 text_test/$novel.lm -mix-lambda2 0.35 -write-lm LM/$novel\_B\.lm 5 | ngram -lm text_test/ori.lm -mix-lm text_test/kaggle123_C.lm -lambda 0.13 -mix-lm2 text_test/$novel.lm -mix-lambda2 0.35 -write-lm LM/$novel\_C\.lm 6 | for x in A B C ; do 7 | lm=LM/$novel\_$x.lm 8 | local/compile_lm.sh $lm & 9 | done 10 | done 11 | wait 12 | -------------------------------------------------------------------------------- /local/data/data_prep_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | corpus_dir=/data/local/kgb/corpus/esc_speech_noise 3 | data_dir=data/esc_speech_noise 4 | mkdir -p $data_dir 5 | find -L $corpus_dir/ -iname "*.wav" | sort | xargs -I% basename % .wav | \ 6 | awk -v "dir=$corpus_dir" '{printf "%s %s/%s.wav \n", $0, dir, $0}' > $data_dir/wav.scp 7 | find -L $corpus_dir/ -iname "*.wav" | sort | xargs -I% basename % .wav | \ 8 | awk -v "dir=$corpus_dir" '{printf "%s %s.wav \n", $0, $0}' > $data_dir/utt2spk 9 | cat $data_dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $data_dir/spk2utt || exit 1; 10 | bash utils/data/get_reco2utt.sh $data_dir 11 | 12 | utils/fix_data_dir.sh $data_dir || exit 1; 13 | -------------------------------------------------------------------------------- /local/nnet/copy_alignment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | dir=$1 4 | aug(){ 5 | ali=$1 6 | gunzip -c $ali | copy-int-vector ark:- ark,t:- | python3 -c " 7 | import sys 8 | for line in sys.stdin.readlines(): 9 | tokens = line.rstrip().split() 10 | label = tokens[0] 11 | values = ' '.join(tokens[1:]) 12 | print(label + '-aug',values) 13 | print(label ,values) 14 | #print('rvb1_'+label,values) 15 | #print(label+'-aug_kgb_noise',values) 16 | " | copy-int-vector ark,t:- ark:- | gzip -c > $ali\_after 17 | mv $ali\_after $ali 18 | echo "Done $ali" 19 | } 20 | 21 | export -f aug 22 | 23 | parallel -j 20 "aug {}" ::: $dir/ali.*.gz 24 | 25 | wait 26 | 27 | -------------------------------------------------------------------------------- /local/lm/dirty/parse_text.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('../local/data/') 3 | from normalize_utils import * 4 | 5 | if __name__ == '__main__': 6 | text_path = sys.argv[1] 7 | output_path = sys.argv[2] 8 | new_text = '' 9 | #check_new_delete_word(text_path) 10 | word_list = get_word_list('data/lang/words.txt') 11 | with open(text_path,'r',encoding='utf-8') as f: 12 | for line in f: 13 | if 'ETtoday' in line: 14 | continue 15 | line = line.rstrip() 16 | new_text += normalize(line,word_list) + '\n' 17 | with open(output_path,'w',encoding='utf-8') as f: 18 | f.write(new_text) 19 | 20 | -------------------------------------------------------------------------------- /local/lm/generate_ori.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . 
path.sh 3 | LM=data/LM 4 | text=data/text 5 | lang=data/wfst/lang 6 | vocab=$lang/vocabs.txt 7 | words=$lang/words.txt 8 | ngram -lm data/wfst/LM/ori_4gram.lm -vocab $vocab -limit-vocab -write-lm $LM/ori.lm 9 | for x in A B C; do 10 | ( 11 | ngram-count -text $text/kaggle1234_$x.txt -lm $LM/kaggle1234_$x.lm -vocab $vocab -limit-vocab -order 4 12 | ngram-count -text $text/kaggle12345_$x.txt -lm $LM/kaggle12345_$x.lm -vocab $vocab -limit-vocab -order 4 13 | ) & 14 | done 15 | wait 16 | for x in A B C; do 17 | local/lm/mix_lm2_test.sh $LM/ori.lm $LM/kaggle1234_$x.lm $LM/kaggle12345_$x.lm $text/kaggle5_$x.txt $LM/ori_$x.lm 18 | done 19 | -------------------------------------------------------------------------------- /utils/data/get_reco2utt_for_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "This script outputs a mapping from recording to a list of utterances " 8 | echo "corresponding to the recording. It is analogous to the content of " 9 | echo "a spk2utt file, but is indexed by recording instead of speaker." 10 | echo "Usage: get_reco2utt.sh " 11 | echo " e.g.: get_reco2utt.sh data/train" 12 | exit 1 13 | fi 14 | 15 | data=$1 16 | 17 | if [ ! -s $data/segments ]; then 18 | utils/data/get_segments_for_data.sh $data > $data/segments 19 | fi 20 | 21 | cut -d ' ' -f 1,2 $data/segments | utils/utt2spk_to_spk2utt.pl 22 | -------------------------------------------------------------------------------- /conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training. 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --sample-frequency=8000 # Switchboard is sampled at 8kHz 7 | --num-mel-bins=40 # similar to Google's setup. 8 | --num-ceps=40 # there is no dimensionality reduction. 9 | --low-freq=40 # low cutoff frequency for mel bins 10 | --high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) 11 | --allow_downsample=true 12 | -------------------------------------------------------------------------------- /conf/pinyin2cmu: -------------------------------------------------------------------------------- 1 | A AA 2 | AI AY 3 | AN AE N 4 | ANG AE NG 5 | AO AW 6 | B B 7 | CH CH 8 | C T S 9 | D D 10 | E ER 11 | EI EY 12 | EN AH N 13 | ENG AH NG 14 | ER AA R 15 | F F 16 | G G 17 | H HH 18 | IA IY AA 19 | IANG IY AE NG 20 | IAN IY AE N 21 | IAO IY AW 22 | IE IY EH 23 | I IY 24 | ING IY NG 25 | IN IY N 26 | IONG IY UH NG 27 | IU IY UH 28 | J J 29 | K K 30 | L L 31 | M M 32 | N N 33 | O AO 34 | ONG UH NG 35 | OU OW 36 | P P 37 | Q Q 38 | R R 39 | SH SH 40 | S S 41 | T T 42 | UAI UW AY 43 | UANG UW AE NG 44 | UAN UW AE N 45 | UA UW AA 46 | UI UW IY 47 | UN UW AH N 48 | UO UW AO 49 | U UW 50 | UE IY EH 51 | VE IY EH 52 | V IY UW 53 | VN IY N 54 | W W 55 | X X 56 | Y Y 57 | ZH JH 58 | Z Z 59 | -------------------------------------------------------------------------------- /utils/data/get_num_frames.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script works out the approximate number of frames in a training directory. 
4 | # This is sometimes needed by higher-level scripts 5 | 6 | 7 | if [ -f path.sh ]; then . ./path.sh; fi 8 | . parse_options.sh || exit 1; 9 | 10 | if [ $# -ne 1 ]; then 11 | ( 12 | echo "Usage: $0 " 13 | echo "Prints the number of frames of data in the data-dir" 14 | ) 1>&2 15 | fi 16 | 17 | data=$1 18 | 19 | if [ ! -f $data/utt2dur ]; then 20 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1 21 | fi 22 | 23 | frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 24 | 25 | awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur 26 | -------------------------------------------------------------------------------- /local/data/word_segmentation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | # Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) 4 | # 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) 5 | # Apache 2.0 6 | 7 | import sys 8 | from normalize_utils import * 9 | 10 | if len(sys.argv) < 3: 11 | sys.stderr.write("word_segmentation.py > \n") 12 | exit(1) 13 | 14 | vocab_file=sys.argv[1] 15 | trans_file=sys.argv[2] 16 | word_list = get_word_list(vocab_file) 17 | 18 | for line in open(trans_file,'r',encoding='utf-8'): 19 | key,trans = line.strip().split('\t',1) 20 | new_line = key + '\t' + normalize(trans,word_list) 21 | print(new_line) 22 | -------------------------------------------------------------------------------- /utils/make_absolute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script replaces the command readlink -f (which is not portable). 4 | # It turns a pathname into an absolute pathname, including following soft links. 5 | target_file=$1 6 | 7 | cd $(dirname $target_file) 8 | target_file=$(basename $target_file) 9 | 10 | # Iterate down a (possible) chain of symlinks 11 | while [ -L "$target_file" ]; do 12 | target_file=$(readlink $target_file) 13 | cd $(dirname $target_file) 14 | target_file=$(basename $target_file) 15 | done 16 | 17 | # Compute the canonicalized name by finding the physical path 18 | # for the directory we're in and appending the target file. 19 | phys_dir=$(pwd -P) 20 | result=$phys_dir/$target_file 21 | echo $result 22 | -------------------------------------------------------------------------------- /local/data/data_prep_wav.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | 4 | if __name__ == '__main__': 5 | wav_dir = sys.argv[1] 6 | data_dir = sys.argv[2] 7 | utt2spk_path = os.path.join(data_dir,'utt2spk') 8 | wavscp_path = os.path.join(data_dir,'wav.scp') 9 | with open(utt2spk_path,'w') as f1, open(wavscp_path,'w') as f2: 10 | for dirPath, dirNames, fileNames in os.walk(sys.argv[1]): 11 | for name in fileNames: 12 | if name.endswith('.wav'): 13 | file_name = os.path.join(dirPath, name) 14 | file_name = os.path.abspath(file_name) 15 | f1.write(name + ' ' + name + '\n') 16 | f2.write(name + ' ' + file_name + '\n') 17 | 18 | -------------------------------------------------------------------------------- /local/lm/wfst/generate_choice_fst.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | words=$1 3 | out_fst=$2 4 | text_fst=`dirname $2` 5 | text_fst=$text_fst/text.fst 6 | 7 | rm $text_fst 8 | 9 | . 
path.sh 10 | 11 | echo " 12 | 0 1 13 | 1 2 14 | 2 3 15 | 3 4 " >> $text_fst 16 | 17 | for i in 1 2 3 4 ; do 18 | cat $words | grep -v "" | grep -v "" |\ 19 | grep -v "" | grep -v "" |\ 20 | grep -v "" | grep -v "" |\ 21 | grep -v "" | awk -v i=$i '{print i " " i " " $1 " " $1 }' >> $text_fst 22 | done 23 | echo 4 >> $text_fst 24 | 25 | 26 | fstcompile --isymbols=$words --osymbols=$words \ 27 | --keep_isymbols=false --keep_osymbols=false $text_fst | fstarcsort --sort_type=olabel > $out_fst 28 | 29 | -------------------------------------------------------------------------------- /local/data/extract_ptt.py: -------------------------------------------------------------------------------- 1 | import os,sys,json,re 2 | sys.path.append('local/data/tool/jieba-zh_TW') 3 | import jieba 4 | from number2chinese import * 5 | 6 | ptt_corpus = sys.argv[1] 7 | crawl_path = os.path.join(ptt_corpus,'ptt_crawl.json') 8 | ptt = json.load(open(crawl_path,'r')) 9 | for item in ptt: 10 | text = item['Content'] 11 | text = text.replace('\n\n','\n').replace(' ','') 12 | tokens = jieba.cut(text) 13 | new_tokens = [] 14 | for token in tokens: 15 | if re.match('^[0-9]+$',token): 16 | if len(token) > 15: 17 | continue 18 | token = to_chinese(int(token)) 19 | new_tokens.append(token) 20 | text = ' '.join(new_tokens) 21 | text = text.upper() 22 | if len(text) > 0: 23 | print(text) 24 | 25 | -------------------------------------------------------------------------------- /local/kaggle/test_lambda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | thread_num=100 4 | #ngram ori -mix-lm nre lambda 5 | test_lambda(){ 6 | dir=$1 7 | Alm=$dir/A.lm 8 | orilm=/data/local/kgb/Chinese-ASR/lm_test/LM/C_kaggle12.lm 9 | echo $dir 10 | ngram-count -text $dir/A.txt -order 4 -lm $Alm 11 | ngram -lm $orilm -ppl $dir/C.txt -debug 2 > $dir/ori.ppl 12 | ngram -lm $Alm -ppl $dir/C.txt -debug 2 > $dir/A.ppl 13 | compute-best-mix $dir/ori.ppl $dir/A.ppl > $dir/log 14 | python3 local/data/get_best_lambda.py $dir/log >> $dir/../best_lambda 15 | } 16 | export -f test_lambda 17 | 18 | #PYTHOIOENCODING=utf-8 python3 local/data/test_lambda.py 19 | parallel -j $thread_num "test_lambda {}" ::: lambda_test/* 20 | python3 local/data/accumulate_lambda.py lambda_test/best_lambda 21 | 22 | wait 23 | -------------------------------------------------------------------------------- /steps/data/data_dir_manipulation_lib.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | def RunKaldiCommand(command, wait = True): 4 | """ Runs commands frequently seen in Kaldi scripts. 
These are usually a 5 | sequence of commands connected by pipes, so we use shell=True """ 6 | #logger.info("Running the command\n{0}".format(command)) 7 | p = subprocess.Popen(command, shell = True, 8 | stdout = subprocess.PIPE, 9 | stderr = subprocess.PIPE) 10 | 11 | if wait: 12 | [stdout, stderr] = p.communicate() 13 | if p.returncode != 0: 14 | raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) 15 | return stdout, stderr 16 | else: 17 | return p 18 | 19 | -------------------------------------------------------------------------------- /local/kaggle/replace_iflytek_answer.py: -------------------------------------------------------------------------------- 1 | from xlsx import * 2 | import sys,os,json 3 | 4 | 5 | if __name__ == '__main__': 6 | iflytek_json = sys.argv[1] 7 | output_json = sys.argv[2] 8 | kaggle_id = sys.argv[3] 9 | d = {} 10 | ans_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(kaggle_id) 11 | L = get_content(ans_path,False) 12 | 13 | for (No,p,q,c,answer) in L: 14 | d[No] = answer 15 | 16 | with open(iflytek_json,'r',encoding='utf8') as f: 17 | data = json.load(f) 18 | outputs = [] 19 | for sample in data: 20 | id = sample['id'] 21 | if id in d: 22 | sample['answer'] = d[id] 23 | outputs.append(sample) 24 | with open(output_json,'w',encoding='utf8') as f: 25 | json.dump(outputs,f,indent=4,ensure_ascii=False) 26 | 27 | -------------------------------------------------------------------------------- /local/combine_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nj=8 #number of job parallel running 3 | stage=0 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | 8 | 9 | if [ $stage -le 1 ] ; then 10 | mfccdir=data/mfcc_pitch 11 | mkdir -p $mfccdir 12 | 13 | for corpus in kaggle1 kaggle2 kaggle3 ; do 14 | combine48='' 15 | for typ in A B C ; do 16 | ##Extract MFCC39 + pitch9 feature 17 | data=./data/$corpus/$typ/mfcc39_pitch9 18 | name=$corpus\_$typ 19 | steps/make_mfcc_pitch_online.sh --cmd "$train_cmd" --nj $nj --name $name $data exp/make_mfcc/$name $mfccdir || exit 1; 20 | steps/compute_cmvn_stats.sh --name $corpus $data exp/make_mfcc/$name $mfccdir || exit 1; 21 | combine48="$data $combine48" 22 | done 23 | utils/combine_data.sh ./data/$corpus/mfcc39_pitch9 $combine48 24 | done 25 | fi 26 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm2.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import sys,os 3 | import numpy as np 4 | sys.path.append('local/data/') 5 | from normalize_utils import * 6 | 7 | iflytek_A = sys.argv[1] 8 | test_dir=sys.argv[2] 9 | 10 | L = [] 11 | with open(iflytek_A,'r') as f: 12 | for line in f: 13 | start = line.find(' ') 14 | token1 = line.split()[0] 15 | L.append(token1) 16 | L2 = [] 17 | lms = [] 18 | for lm in os.listdir(test_dir): 19 | lms.append(lm) 20 | temp = [] 21 | with open(os.path.join(test_dir,lm),'r') as f: 22 | for line in f: 23 | temp.append(float(line)) 24 | L2.append(temp) 25 | n_line,n_lm = len(L),len(L2) 26 | scores = np.array(L2).transpose() 27 | for i in range(n_line): 28 | max_score = np.min(scores[i]) 29 | lm = lms[np.argmin(scores[i])] 30 | print(L[i],lm,max_score) 31 | -------------------------------------------------------------------------------- /steps/nnet3/chain/e2e/README.txt: -------------------------------------------------------------------------------- 1 | 
The scripts related to end2end chain training are in this directory 2 | Currently it has 3 scripts: 3 | 4 | ** prepare_e2e.sh which is almost equivalent 5 | to regular chain's build-tree.sh (i.e. it creates the tree and 6 | the transition-model) except it does not require any previously 7 | trained models (in other terms, it does what stages -3 and -2 8 | of steps/train_mono.sh do). 9 | 10 | ** get_egs_e2e.sh: this is similar to chain/get_egs.sh except it 11 | uses training FSTs (instead of lattices) to generate end2end egs. 12 | 13 | ** train_e2e.py: this is very similar to chain/train.py but 14 | with fewer stages (e.g. it does not compute the preconditioning matrix) 15 | 16 | 17 | For details please see the comments at top of local/chain/e2e/run_flatstart_*.sh 18 | and also src/chain/chain-generic-numerator.h. 19 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | thread_num=56 4 | iflytek_A_text=$1 5 | test_dir=$2 6 | output=$3 7 | mkdir -p $test_dir 8 | 9 | export output=$output 10 | 11 | 12 | choose_lm(){ 13 | tex=$1 14 | if [ -f $tex\_result ] ; then 15 | rm $tex\_result 16 | fi 17 | for lm in ori news 20years nie guan laotsan water journey_west red_mansion 3kingdom beauty_n hunghuang lai_ho old_time one_gan lu_shun ; do 18 | echo $lm >> $tex\_result 19 | cat $tex | ngram -lm lm_test/LM/$lm\_A.lm -ppl - | python3 local/kaggle/get_ppl.py - >> $tex\_result 20 | done 21 | wav=`basename $tex` 22 | python3 local/kaggle/max_ppl.py $wav $tex\_result 3 >> $output 23 | } 24 | 25 | export -f choose_lm 26 | 27 | PYTHONIOENCODING=utf-8 python3 local/kaggle/choose_lm.py $iflytek_A_text $test_dir 28 | 29 | parallel -j $thread_num "choose_lm {}" ::: $test_dir/*.wav 30 | 31 | echo "Done choose_lm.sh." 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /utils/data/get_segments_for_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script operates on a data directory, such as in data/train/, 4 | # and writes new segments to stdout. The file 'segments' maps from 5 | # utterance to time offsets into a recording, with the format: 6 | # <utterance-id> <recording-id> <segment-begin> <segment-end> 7 | # This script assumes utterance and recording ids are the same (i.e., that 8 | # wav.scp is indexed by utterance), and uses durations from 'utt2dur', 9 | # created if necessary by get_utt2dur.sh. 10 | 11 | . ./path.sh 12 | 13 | if [ $# != 1 ]; then 14 | echo "Usage: $0 [options] <data-dir>" 15 | echo "e.g.:" 16 | echo " $0 data/train > data/train/segments" 17 | exit 1 18 | fi 19 | 20 | data=$1 21 | 22 | if [ ! -s $data/utt2dur ]; then 23 | utils/data/get_utt2dur.sh $data 1>&2 || exit 1; 24 | fi 25 | 26 | # <utterance-id> <utterance-id> 0 <duration> 27 | awk '{ print $1, $1, 0, $2 }' $data/utt2dur 28 | 29 | exit 0 30 | -------------------------------------------------------------------------------- /local/temp.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | . ./path.sh 4 | . ./cmd.sh 5 | . ./utils/parse_options.sh 6 | mfccdir=data/mfcc 7 | fbankdir=data/fbank 8 | nj=40 9 | stage=2 10 | lang=data/wfst/lang 11 | lang_test=data/wfst/lang_test 12 | # Now make MFCC features. 13 | if [ $stage -le 1 ]; then 14 | # mfccdir should be some place with a largish disk where you 15 | # want to store MFCC features. 
16 | for corpus in cyberon_chinese_test ; do 17 | data=./data/$corpus/mfcc39 18 | utils/copy_data_dir.sh ./data/$corpus/mfcc40 $data 19 | steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --name $corpus $data exp/make_mfcc/$corpus $mfccdir || exit 1; 20 | steps/compute_cmvn_stats.sh --name $corpus $data exp/make_mfcc/$corpus $mfccdir || exit 1; 21 | done 22 | fi 23 | 24 | steps/decode.sh --cmd "$decode_cmd" --nj 12 --config conf/decode.config \ 25 | exp/aishell2/tri3/graph data/cyberon_chinese_test/mfcc39 exp/aishell2/tri3/decode_cyberon_chinese_test 26 | -------------------------------------------------------------------------------- /local/lm/get_all_choices.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/kaggle') 3 | sys.path.append('local/data/') 4 | import xlsx 5 | from normalize_utils import * 6 | import itertools 7 | 8 | if __name__ == '__main__': 9 | word_list = get_word_list('data/wfst/lang/words.txt') 10 | for i in range(4,5): 11 | xlsx_path = '/data/local/kgb/corpus/kgb/kaggle{}/answer.xlsx'.format(i) 12 | tmp = xlsx.get_content(xlsx_path,True,word_list) 13 | for row in tmp: 14 | for perm in list(itertools.permutations(row[3])): 15 | text = xlsx.merge_choice(perm,True) 16 | print(text) 17 | split_text = [] 18 | for x in row[3]: 19 | split_text.append(' '.join(list(x.replace(' ','')))) 20 | for perm in list(itertools.permutations(split_text)): 21 | text = xlsx.merge_choice(perm,True) 22 | print(text) 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /local/lm/dirty/compile_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | x=$1 4 | dir=${x::-3} 5 | 6 | mkdir -p $dir 7 | cp -r data/lang/* $dir 8 | 9 | ngram -lm $x -vocab lm_test/text/vocab.txt -limit-vocab -write-lm $x 10 | 11 | cat $x | \ 12 | arpa2fst --disambig-symbol=#0 \ 13 | --read-symbol-table=$dir/words.txt - $dir/G.fst || exit 1; 14 | ## compile Ldet.fst 15 | newlang=$dir 16 | phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'` 17 | fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ 18 | fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$newlang/Ldet.fst || exit 1; 19 | 20 | ##transform to G.carpa 21 | bos=`grep "" $dir/words.txt | awk '{print $2}'` 22 | eos=`grep "" $dir/words.txt | awk '{print $2}'` 23 | unk=`cat $dir/oov.int` 24 | 25 | cat $x | \ 26 | utils/map_arpa_lm.pl $dir/words.txt | \ 27 | arpa-to-const-arpa --bos-symbol=$bos --eos-symbol=$eos \ 28 | --unk-symbol=$unk - $dir/G.carpa 29 | -------------------------------------------------------------------------------- /utils/ctm/fix_ctm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | stmfile=$1 4 | ctmfile=$2 5 | 6 | segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` 7 | segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` 8 | 9 | segments_stm_count=`echo "$segments_stm" | wc -l ` 10 | segments_ctm_count=`echo "$segments_ctm" | wc -l ` 11 | 12 | #echo $segments_stm_count 13 | #echo $segments_ctm_count 14 | 15 | if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then 16 | pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") 17 | ( 18 | for elem in $pp ; do 19 | echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" 20 | done 21 | ) >> $ctmfile 22 | echo "FIXED CTM FILE" 23 | exit 0 24 | elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then 25 | echo "Segment STM count: $segments_stm_count" 26 | echo "Segment CTM count: $segments_ctm_count" 27 | echo "FAILURE FIXING CTM FILE" 28 | exit 1 29 | else 30 | exit 0 31 | fi 32 | 33 | -------------------------------------------------------------------------------- /utils/spk2utt_to_utt2spk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | while(<>){ 19 | @A = split(" ", $_); 20 | @A > 1 || die "Invalid line in spk2utt file: $_"; 21 | $s = shift @A; 22 | foreach $u ( @A ) { 23 | print "$u $s\n"; 24 | } 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
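# For example, on a cluster with GridEngine the same variables would typically
# point at queue.pl instead, e.g. export train_cmd="queue.pl --mem 6G" and
# export cuda_cmd="queue.pl --gpu 1"; the run.pl settings below assume every
# job runs on the local machine.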
12 | 13 | export train_cmd="run.pl --mem 6G" 14 | export decode_cmd="run.pl --mem 6G" 15 | export mkgraph_cmd="run.pl --mem 8G" 16 | export cuda_cmd="run.pl --gpu 1" 17 | -------------------------------------------------------------------------------- /steps/tfrnnlm/check_tensorflow_installed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this script checks if TF is installed to be used with python 4 | # and if TF related binaries in kaldi is ready to use 5 | . ./path.sh 6 | 7 | if which lattice-lmrescore-tf-rnnlm 2>&1>/dev/null; then 8 | echo TensorFlow relate binaries found. This is good. 9 | else 10 | echo TF related binaries not compiled. 11 | echo You need to go to tools/ and run extras/install_tensorflow_cc.sh first 12 | echo and then do \"make\" under both src/tfrnnlm and src/tfrnnlmbin 13 | exit 1 14 | fi 15 | 16 | echo 17 | 18 | if python steps/tfrnnlm/check_py.py 2>/dev/null; then 19 | echo TensorFlow ready to use on the python side. This is good. 20 | else 21 | echo TensorFlow not found on the python side. 22 | echo Please go to tools/ and run extras/install_tensorflow_py.sh to install it 23 | echo If you already have TensorFlow installed somewhere else, you would need 24 | echo to add it to your PATH 25 | exit 1 26 | fi 27 | -------------------------------------------------------------------------------- /local/data/fix_segments.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | def get_labels(file_path): 4 | s = set() 5 | with open(file_path,'r') as f: 6 | for line in f: 7 | token = line.split()[0] 8 | s.add(token) 9 | return s 10 | 11 | if __name__ == '__main__': 12 | data_path = sys.argv[1] 13 | s1 = get_labels(os.path.join(data_path,'segments')) 14 | s2 = get_labels(os.path.join(data_path,'feats.scp')) 15 | s3 = s1 - s2 16 | for scp in ['text','utt2spk','spk2utt','segments']: 17 | all_lines = [] 18 | with open(os.path.join(data_path,scp),'r',encoding='utf-8') as f: 19 | for line in f: 20 | token = line.split()[0] 21 | if token in s3: 22 | continue 23 | else: 24 | all_lines.append(line) 25 | with open(os.path.join(data_path,scp),'w',encoding='utf-8') as f: 26 | for line in all_lines: 27 | f.write(line) 28 | 29 | 30 | -------------------------------------------------------------------------------- /local/kaggle/parse_text.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | sys.path.append('local/data/') 3 | from normalize_utils import * 4 | def check_new_delete_word(text_path): 5 | new_text = '' 6 | with open(text_path,'r',encoding='utf-8') as f: 7 | for line in f: 8 | new_text += line 9 | S1 = check_not_chinese(new_text) 10 | S2 = set(delete_symbols) 11 | for x in list(S1-S2): 12 | print(x) 13 | exit() 14 | 15 | 16 | if __name__ == '__main__': 17 | text_path = sys.argv[1] 18 | output_path = sys.argv[2] 19 | new_text = '' 20 | #check_new_delete_word(text_path) 21 | word_list = get_word_list('data/lang/words.txt') 22 | with open(text_path,'r',encoding='utf-8') as f: 23 | for line in f: 24 | if 'ETtoday' in line: 25 | continue 26 | line = line.rstrip() 27 | new_text += normalize(line,word_list) + '\n' 28 | with open(output_path,'w',encoding='utf-8') as f: 29 | f.write(new_text) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /local/kaggle/add.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 
from shutil import copyfile 3 | null='開口,師兄,呆子嚇了一跳,起身了,初出江湖人稱把師傅變成,功能不夠涼快,一點,受傷之事,拉進去了,是這樣的效果,你掙扎着下來,還去當你的王太子,標上行裏最高的一張,坐盆裏去,沒人說你有類說,優雅,你可千萬不要偷懶,又要做師傅,趁着假日到花果山來了,阿姐一名男童說,不行不行,前幾天在白骨嶺上,他打死白骨精,我只當玩耍。老和尚揍他,那找和尚當日軍,把它作爲通訊協定書趕走,他,不知怎樣按摩,那風尚版六座,給我來上幾下,還活得成呢,滿滿說,的,會跟你記仇,你見了他別說吃住都難,確實辛苦他了,他見到這種情景,令人氣憤,定會有那怪爭鬥,管叫哪個妖精救出師傅扎針,八戒只有橫下一條心來。' 4 | def read_choose_lm(choose_lm): 5 | d = {} 6 | with open(choose_lm,'r') as f: 7 | for line in f: 8 | tokens = line.rstrip().split() 9 | idx = int(tokens[0][1:].replace('.wav','')) 10 | novel = null 11 | if len(tokens) > 1: 12 | novel = tokens[1] 13 | d[tokens[0]] = novel 14 | return d 15 | d = read_choose_lm(sys.argv[1]) 16 | wav_dir = sys.argv[2] 17 | L = [] 18 | for wav in os.listdir(wav_dir): 19 | if wav.endswith('.wav'): 20 | if wav not in d: 21 | d[wav] = null 22 | with open(sys.argv[3],'w') as f: 23 | for k,v in d.items(): 24 | f.write('{} {} \n'.format(k,v)) 25 | 26 | 27 | -------------------------------------------------------------------------------- /local/kaggle/test/select_lm.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | 3 | def process_C2(src_dir,dirname,C_lang_dir,lang): 4 | idx = int(dirname[1:]) 5 | lm = os.path.join(C_lang_dir,lang,'rescore') 6 | if lm is not None: 7 | rescore_lang = os.path.join(src_dir,dirname,'rescore_lang') 8 | with open(rescore_lang,'w') as f: 9 | f.write(lm) 10 | 11 | if __name__ == '__main__': 12 | src_dir = sys.argv[1] 13 | C_lang_dir = sys.argv[2] 14 | d_list1 = [] 15 | C_langs = os.listdir(C_lang_dir) 16 | lang_id = 0 17 | for dirname in os.listdir(src_dir): 18 | if not os.path.isdir(os.path.join(src_dir,dirname)): 19 | continue 20 | idx = int(dirname[1:]) 21 | typ = dirname[0] 22 | if idx % 3 == 0 and idx <= 1500: 23 | use_gpu = os.path.join(src_dir,dirname,'use_gpu') 24 | with open(use_gpu,'w') as f: 25 | f.write('yes') 26 | if typ == 'C': 27 | process_C2(src_dir,dirname,C_lang_dir,C_langs[lang_id]) 28 | lang_id += 1 29 | 30 | -------------------------------------------------------------------------------- /local/lm/prune_all_lm.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . path.sh 3 | lm_dir=data/LM 4 | lang=data/wfst/lang 5 | choice_fst=data/wfst/lang_test/choice.fst 6 | words=$lang/words.txt 7 | 8 | for x in $lm_dir/*C ; do 9 | if [ -d $x ]; then 10 | ( 11 | ngram -lm $x.lm -prune 2e-7 -write-lm $x\_pruned.lm 12 | xdir=$x\_pruned 13 | xlm=$x\_pruned.lm 14 | cp -r $lang $xdir 15 | cat $xlm | arpa2fst --disambig-symbol=#0 \ 16 | --read-symbol-table=$words - | fstarcsort --sort_type=olabel > $xdir/G.fst 17 | 18 | ## compile Ldet.fst 19 | phi=`grep -w '#0' $words | awk '{print $2}'` 20 | 21 | fstprint $xdir/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ 22 | fstdeterminizestar | fstrmsymbols $xdir/phones/disambig.int > $xdir/Ldet.fst || exit 1; 23 | 24 | mv $xdir/G.fst $xdir/G_head.fst 25 | fsttablecompose $xdir/G_head.fst $choice_fst | \ 26 | fstdeterminizestar --use-log=true | \ 27 | fstminimizeencoded > $xdir/G.fst 28 | ) & 29 | fi 30 | done 31 | 32 | wait 33 | 34 | 35 | -------------------------------------------------------------------------------- /local/lm/wfst/temp2.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . 
path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | model_dir=exp/tri4a 8 | LM=data/LM 9 | text=data/text 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | 13 | #LM training 14 | mkdir -p $LM/3gram 15 | #PYTHONENCODING=utf-8 python3 local/lm/get_all_choices.py #> $wfst/kaggle12_C.txt 16 | 17 | #ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 18 | 19 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle123_C.lm $LM/3gram/mix.lm \ 20 | $LM/3gram/kaggle1234_C.lm $text/kaggle4_C.txt $LM/3gram/ori_C_10.0.lm 21 | 22 | 23 | lm=$LM/3gram/ori_C_10.0.lm 24 | lang_test=./data/wfst/lang_test_pr10_C 25 | graph_dir=exp/tri4a/graph_pr10_C 26 | #G compilation and check L and G stochastic 27 | local/kaggle/wfst/format_data.sh $lm $lang $lang_test 28 | 29 | #compose HCLG(choice) 30 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /local/lm/dirty/format_lm_from_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | set -euo pipefail 4 | 5 | text_dir=lm_test/new_text 6 | text_test=lm_test/text_test 7 | LM=lm_test/LM 8 | novel=$1 9 | 10 | opencc -i $text_dir/$novel.txt -o $text_dir/$novel\_tra.txt 11 | 12 | PYTHONIOENCODING=utf-8 python3 local/kaggle/parse_text.py $text_dir/$novel\_tra.txt $text_dir/$novel\_norm.txt 13 | 14 | ngram-count -text $text_dir/$novel\_norm.txt -lm $text_test/$novel\.lm -vocab $text_test/vocab.txt -limit-vocab -order 4 15 | 16 | ngram -lm $text_test/ori.lm -mix-lm $text_test/kaggle123_A.lm -lambda 0.15 -mix-lm2 $text_test/$novel.lm \ 17 | -mix-lambda2 0.8 -write-lm $LM/$novel\_A\.lm 18 | ngram -lm $text_test/ori.lm -mix-lm $text_test/kaggle123_B.lm -lambda 0.16 -mix-lm2 $text_test/$novel.lm \ 19 | -mix-lambda2 0.35 -write-lm $LM/$novel\_B\.lm 20 | ngram -lm $text_test/ori.lm -mix-lm $text_test/kaggle123_C.lm -lambda 0.13 -mix-lm2 $text_test/$novel.lm \ 21 | -mix-lambda2 0.35 -write-lm $LM/$novel\_C\.lm 22 | 23 | for x in A B C ; do 24 | lm=$LM/$novel\_$x.lm 25 | lm_test/local/compile_lm.sh $lm & 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /utils/s2eps.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script replaces <s> and </s> with <eps> (on both input and output sides), 18 | # for the G.fst acceptor. 
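# For example, an arc line whose input and output labels are <s> or </s>,
# e.g. "0 1 <s> <s> 4.3", comes out with both labels replaced by <eps> (and the
# fields re-joined with tabs); lines with fewer than four fields, such as
# final-state lines, pass through unchanged.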
19 | 20 | while(<>){ 21 | @A = split(" ", $_); 22 | if ( @A >= 4 ) { 23 | if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; } 24 | if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; } 25 | } 26 | print join("\t", @A) . "\n"; 27 | } 28 | -------------------------------------------------------------------------------- /utils/eps2disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | # 2015 Guoguo Chen 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This script replaces epsilon with #0 on the input side only, of the G.fst 19 | # acceptor. 20 | 21 | while(<>){ 22 | if (/\s+#0\s+/) { 23 | print STDERR "$0: ERROR: LM has word #0, " . 24 | "which is reserved as disambiguation symbol\n"; 25 | exit 1; 26 | } 27 | s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:; 28 | print; 29 | } 30 | -------------------------------------------------------------------------------- /local/kaggle/check_output.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | sys.path.append('local/data/') 3 | from normalize_utils import * 4 | src_dir=sys.argv[1] 5 | d_list = [] 6 | for d in os.listdir(src_dir): 7 | if os.path.isdir(os.path.join(src_dir,d)): 8 | d_list.append(d) 9 | output_path = os.path.join(src_dir,'output.txt') 10 | 11 | L = read_outputs(output_path) 12 | missing_trans = [] 13 | 14 | for name,trans in L: 15 | f_name = name.replace('.wav','') 16 | if len(trans) == 0 : 17 | missing_trans.append(name) 18 | 19 | 20 | 21 | missing_files =[] 22 | L2 = [x.replace('.wav','') for x,y in L] 23 | for d in d_list: 24 | if d not in L2: 25 | missing_files.append(d+'.wav') 26 | 27 | wrong = False 28 | if len(missing_files) > 0: 29 | wrong = True 30 | for f in missing_files: 31 | print(f) 32 | print("Missing {} files.".format(len(missing_files))) 33 | if len(missing_trans) > 0: 34 | wrong = True 35 | for f in missing_trans: 36 | print(f) 37 | print("Missing {} trans.".format(len(missing_trans))) 38 | if not wrong: 39 | print("All wav files have outputs.") 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /local/data/data_prep_TOCFL.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | 3 | def main(tocfl_path,file_type): 4 | wavdir_path = os.path.join(tocfl_path,'wav') 5 | wavdir_path = os.path.abspath(wavdir_path) 6 | txt_path = os.path.join(tocfl_path,'txt') 7 | for filename in os.listdir(wavdir_path): 8 | wav_label = filename.split('.')[0] 9 | wav_path = os.path.join(wavdir_path,filename) 10 | txt_file = os.path.join(txt_path,wav_label+'.txt') 11 | txt = open(txt_file,'r',encoding='UTF-8').read() 12 | trans = txt.rstrip() 13 | #trans = ' '.join(list(trans)) 14 | if file_type == 'text': 15 | sys.path.append('local/data/tool/jieba-zh_TW') 16 | import jieba 17 | trans = ' 
'.join(jieba.cut(trans)) 18 | trans = trans.upper() 19 | print(wav_label,trans) 20 | elif file_type == 'wav.scp': 21 | print(wav_label,wav_path) 22 | elif file_type == 'utt2spk': 23 | print(wav_label, wav_label) 24 | if __name__ == '__main__': 25 | tocfl_path = sys.argv[1] 26 | file_type = sys.argv[2] 27 | main(tocfl_path,file_type) 28 | 29 | -------------------------------------------------------------------------------- /local/data/data_prep_NER.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | 3 | def main(corpus_path,file_type): 4 | for root, dirs, files in os.walk(corpus_path, topdown=False): 5 | for name in files: 6 | if name.endswith('.wav'): 7 | wav_label = name.split('.')[0] 8 | wav_path = os.path.join(root,name) 9 | wav_path = os.path.abspath(wav_path) 10 | 11 | txt_path = wav_path.replace('Wav','Text').replace('.wav','.txt') 12 | if not os.path.isfile(txt_path): 13 | continue 14 | trans = open(txt_path,'r', encoding='utf-8').read() 15 | trans = trans.rstrip() 16 | trans = trans.upper() 17 | 18 | if file_type == 'wav.scp': 19 | print(wav_label, wav_path) 20 | elif file_type == 'utt2spk': 21 | print(wav_label, wav_label) 22 | elif file_type == 'text': 23 | print(wav_label, trans) 24 | if __name__ == '__main__': 25 | corpus_path = sys.argv[1] 26 | file_type = sys.argv[2] 27 | main(corpus_path,file_type) 28 | -------------------------------------------------------------------------------- /local/kaggle/choose_lm2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | iflytek_A_text=$1 4 | test_dir=$2 5 | output=$3 6 | mkdir -p $test_dir 7 | export LC_ALL='en_US.utf8' 8 | for lm in ori news 20years nie guan laotsan water journey_west red_mansion 3kingdom beauty_n hunghuang lai_ho old_time one_gan lu_shun ; do 9 | ( 10 | cat $iflytek_A_text | PYTHOIOENCODING="utf-8" python3 -c " 11 | import sys 12 | sys.path.append('local/data/') 13 | from normalize_utils import * 14 | for line in sys.stdin.readlines(): 15 | start = line.find(' ') 16 | token1 = line.split()[0] 17 | tex = normalize(line[start:].replace(' ','')) 18 | print(tex) 19 | " | ngram -lm data/LM/$lm\_A.lm -ppl - -debug 1 | PYTHOIOENCODING=utf-8 python3 -c " 20 | import sys 21 | for line in sys.stdin.readlines(): 22 | if 'zeroprobs' in line: 23 | start = line.find('ppl=') 24 | endd = line.find('ppl1=') 25 | print(line[start+5:endd]) 26 | if line.startswith('file'): 27 | break 28 | " > $test_dir/$lm 29 | ) & 30 | done 31 | wait 32 | PYTHOIOENCODING="utf-8" python3 local/kaggle/choose_lm2.py $iflytek_A_text $test_dir > $output 33 | 34 | 35 | echo "Done choose_lm.sh." 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /utils/data/extract_wav_segments_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Hossein Hadian 4 | # Apache 2.0 5 | 6 | # This script copies a data directory (which has a 'segments' file), extracting 7 | # wav segments (according to the 'segments' file) 8 | # so that the resulting data directory does not have a 'segments' file anymore. 9 | 10 | . utils/parse_options.sh 11 | . ./path.sh 12 | 13 | if [ $# != 2 ]; then 14 | echo "Usage: $0 " 15 | echo " This script copies data directory to and gets" 16 | echo "rid of the 'segments' file by extracting the wav segments." 
17 | exit 1; 18 | fi 19 | 20 | 21 | export LC_ALL=C 22 | 23 | srcdir=$1 24 | dir=$2 25 | 26 | 27 | if ! mkdir -p $dir/data; then 28 | echo "$0: failed to create directory $dir/data" 29 | exit 1 30 | fi 31 | 32 | set -e -o pipefail 33 | utils/copy_data_dir.sh $srcdir $dir 34 | 35 | extract-segments scp:$srcdir/wav.scp $srcdir/segments \ 36 | ark,scp:$dir/data/wav_segments.ark,$dir/data/wav_segments.scp 37 | cat $dir/data/wav_segments.scp | awk '{ print $1 " wav-copy " $2 " - |" }' >$dir/wav.scp 38 | rm $dir/reco2file_and_channel || true 39 | -------------------------------------------------------------------------------- /steps/nnet2/get_ivector_id.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2016, Johns Hopkins University (Yenda Trmal ) 3 | # License: Apache 2.0 4 | 5 | # Begin configuration section. 6 | # End configuration section 7 | set -e -o pipefail 8 | set -o nounset # Treat unset variables as an error 9 | 10 | # End configuration section. 11 | 12 | #echo >&2 "$0 $@" # Print the command line for logging 13 | 14 | if [ -f path.sh ]; then . ./path.sh; fi 15 | . parse_options.sh || exit 1; 16 | 17 | 18 | if [ $# != 1 ]; then 19 | echo >&2 "Usage: $0 " 20 | echo >&2 " e.g.: $0 exp/nnet3/extractor" 21 | exit 1 22 | fi 23 | 24 | ivecdir=$1 25 | 26 | if [ -f $ivecdir/final.ie.id ] ; then 27 | cat $ivecdir/final.ie.id 28 | elif [ -f $ivecdir/final.ie ] ; then 29 | # note the creation can fail in case the extractor directory 30 | # is not read-only media or the user des not have access rights 31 | # in that case we will just behave as if the id is not available 32 | id=$(md5sum $ivecdir/final.ie | awk '{print $1}') 33 | echo "$id" > $ivecdir/final.ie.id || true 34 | echo "$id" 35 | else 36 | exit 0 37 | fi 38 | 39 | exit 0 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /local/lm/dirty/mix_all_lms.sh: -------------------------------------------------------------------------------- 1 | #local/mix_lm2.sh text_test/ori.lm text_test/kaggle12_A.lm text_test/kaggle3_A.txt LM/A_kaggle12.lm 2 | #local/mix_lm2.sh text_test/ori.lm text_test/kaggle12_B.lm text_test/kaggle3_B.txt LM/B_kaggle12.lm 3 | #local/mix_lm2.sh text_test/ori.lm text_test/kaggle12_C.lm text_test/kaggle3_C.txt LM/C_kaggle12.lm 4 | 5 | #local/mix_lm2_test.sh text_test/ori.lm text_test/kaggle12_A.lm text_test/kaggle123_A.lm text_test/kaggle3_A.txt LM/A.lm 6 | #local/mix_lm2_test.sh text_test/ori.lm text_test/kaggle12_B.lm text_test/kaggle123_B.lm text_test/kaggle3_B.txt LM/B.lm 7 | #local/mix_lm2_test.sh text_test/ori.lm text_test/kaggle12_C.lm text_test/kaggle123_C.lm text_test/kaggle3_C.txt LM/C.lm 8 | 9 | for novel in 3kingdom journey_west red_mansion hunghuang ; do 10 | for x in A B C ; do 11 | local/mix_lm3.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\_kaggle12.lm 12 | local/mix_lm3_test.sh text_test/ori.lm text_test/$novel.lm text_test/kaggle12_$x.lm text_test/kaggle123_$x.lm text_test/$novel\_$x.txt LM/$novel\_$x\.lm 13 | done 14 | done 15 | for x in LM/*.lm ; do 16 | ( 17 | local/compile_lm.sh $x 18 | ) & 19 | done 20 | wait 21 | -------------------------------------------------------------------------------- /steps/conf/convert_ctm_to_tra.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 
5 | 6 | import sys, operator 7 | 8 | # This scripts loads a 'ctm' file and converts it into the 'tra' format: 9 | # "utt-key word1 word2 word3 ... wordN" 10 | # The 'utt-key' is the 1st column in the CTM. 11 | 12 | # Typically the CTM contains: 13 | # - utterance-relative timimng (i.e. prepared without 'utils/convert_ctm.pl') 14 | # - confidences 15 | 16 | if len(sys.argv) != 3: 17 | print 'Usage: %s ctm-in tra-out' % __file__ 18 | sys.exit(1) 19 | dummy, ctm_in, tra_out = sys.argv 20 | 21 | if ctm_in == '-': ctm_in = '/dev/stdin' 22 | if tra_out == '-': tra_out = '/dev/stdout' 23 | 24 | # Load the 'ctm' into dictionary, 25 | tra = dict() 26 | with open(ctm_in) as f: 27 | for l in f: 28 | utt, ch, beg, dur, wrd, conf = l.split() 29 | if not utt in tra: tra[utt] = [] 30 | tra[utt].append((float(beg),wrd)) 31 | 32 | # Store the in 'tra' format, 33 | with open(tra_out,'w') as f: 34 | for utt,tuples in tra.iteritems(): 35 | tuples.sort(key = operator.itemgetter(0)) # Sort by 'beg' time, 36 | f.write('%s %s\n' % (utt,' '.join([t[1] for t in tuples]))) 37 | 38 | -------------------------------------------------------------------------------- /local/kaggle/mix_LM_with_A.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | 4 | A_outputs=$1 5 | C_lang_dir=$2 6 | thread_num=15 7 | 8 | mix_lm(){ 9 | dir=$1 10 | Alm=$dir/A.lm 11 | orilm=`cat $dir/lm_path` 12 | echo $dir 13 | ngram-count -text $dir/A.txt -order 4 -lm $Alm 14 | ngram -lm $orilm -mix-lm $Alm -lambda 0.6582 -write-lm $dir/rescore.lm -limit-vocab -vocab ./lm_test/text/vocab.txt 15 | mkdir -p $dir/rescore 16 | cp -r data/lang/* $dir/rescore 17 | cat $dir/rescore.lm | \ 18 | arpa2fst --disambig-symbol=#0 \ 19 | --read-symbol-table=$dir/rescore/words.txt - $dir/rescore/G.fst || exit 1; 20 | rm $dir/rescore.lm 21 | 22 | newlang=$dir/rescore 23 | phi=`grep -w '#0' $newlang/words.txt | awk '{print $2}'` 24 | fstprint $newlang/L_disambig.fst | awk '{if($4 != '$phi'){print;}}' | fstcompile | \ 25 | fstdeterminizestar | fstrmsymbols $newlang/phones/disambig.int >$newlang/Ldet.fst || exit 1; 26 | } 27 | 28 | export -f mix_lm 29 | mkdir -p $C_lang_dir 30 | 31 | startt=`date +%s` 32 | python3 local/kaggle/mix_LM_with_A.py $A_outputs $C_lang_dir kaggle4_lm 33 | 34 | parallel -j $thread_num "mix_lm {}" ::: $C_lang_dir/* 35 | endt=`date +%s` 36 | runtime=$((endt-startt)) 37 | echo "Total time $runtime seconds" 38 | -------------------------------------------------------------------------------- /steps/segmentation/internal/verify_phones_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """This script verifies the list of phones read from stdin are valid 7 | phones present in lang/phones.txt.""" 8 | 9 | import argparse 10 | import sys 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser(description=""" 14 | This script verifies the list of phones read from stdin are valid 15 | phones present in lang/phones.txt.""") 16 | 17 | parser.add_argument("phones", type=str, 18 | help="File containing the list of all phones as the " 19 | "first column") 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def main(): 26 | args = get_args() 27 | phones = set() 28 | for line in open(args.phones): 29 | phones.add(line.strip().split()[0]) 30 | 31 | for line in sys.stdin.readlines(): 32 | p = line.strip() 33 | 34 | if p not in phones: 35 | 
sys.stderr.write("Could not find phone {p} in {f}" 36 | "\n".format(p=p, f=args.phones)) 37 | raise SystemExit(1) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /local/kaggle/data_prep_wav_seperate.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | from shutil import copyfile 3 | 4 | if __name__ == '__main__': 5 | wav_dir = sys.argv[1] 6 | data_dir = sys.argv[2] 7 | for dirPath, dirNames, fileNames in os.walk(sys.argv[1]): 8 | for fname in fileNames: 9 | if fname.endswith('.wav'): 10 | file_path = os.path.join(dirPath, fname) 11 | file_path = os.path.abspath(file_path) 12 | name = fname.replace('.wav','') 13 | decode_dir = os.path.join(data_dir,name,'data') 14 | if not os.path.isdir(decode_dir): 15 | os.makedirs(decode_dir) 16 | #os.symlink(file_path, os.path.join(decode_dir,fname)) 17 | utt2spk_path = os.path.join(decode_dir,'utt2spk') 18 | wavscp_path = os.path.join(decode_dir,'wav.scp') 19 | spk2utt_path = os.path.join(decode_dir,'spk2utt') 20 | with open(utt2spk_path,'w') as f: 21 | f.write(fname + ' ' + fname) 22 | with open(wavscp_path,'w') as f: 23 | f.write(fname + ' ' + file_path) 24 | with open(spk2utt_path,'w') as f: 25 | f.write(fname + ' ' + fname) 26 | 27 | -------------------------------------------------------------------------------- /steps/conf/lattice_depth_per_frame.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015 Brno University of Technology (Author: Karel Vesely) 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | 5 | # Extract lattice-depth for each frame. 6 | 7 | # Begin configuration 8 | cmd=run.pl 9 | # End configuration 10 | 11 | echo "$0 $@" # Print the command line for logging 12 | 13 | [ -f path.sh ] && . ./path.sh # source the path. 14 | . parse_options.sh || exit 1; 15 | 16 | if [ $# != 2 ]; then 17 | echo "usage: $0 [opts] " 18 | echo "main options (for others, see top of script file)" 19 | echo " --config # config containing options" 20 | echo " --cmd" 21 | exit 1; 22 | fi 23 | 24 | set -euo pipefail 25 | 26 | latdir=$1 27 | dir=$2 28 | 29 | [ ! -f $latdir/lat.1.gz ] && echo "Missing $latdir/lat.1.gz" && exit 1 30 | nj=$(cat $latdir/num_jobs) 31 | 32 | # Get the pdf-posterior vectors, 33 | $cmd JOB=1:$nj $dir/log/lattice_depth_per_frame.JOB.log \ 34 | lattice-depth-per-frame "ark:gunzip -c $latdir/lat.JOB.gz |" ark,t:$dir/lattice_frame_depth.JOB.ark 35 | # Merge, 36 | for ((n=1; n<=nj; n++)); do cat $dir/lattice_frame_depth.${n}.ark; done >$dir/lattice_frame_depth.ark 37 | rm $dir/lattice_frame_depth.*.ark 38 | 39 | # Done! 
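# The merged $dir/lattice_frame_depth.ark is a text archive: each line holds an
# utterance-id followed by one integer per frame, i.e. the number of lattice
# arcs crossing that frame.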
40 | -------------------------------------------------------------------------------- /steps/conf/parse_arpa_unigrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | import sys, gzip, re 7 | 8 | # Parse options, 9 | if len(sys.argv) != 4: 10 | print "Usage: %s " % __file__ 11 | sys.exit(0) 12 | words_txt, arpa_gz, unigrams_out = sys.argv[1:] 13 | 14 | if arpa_gz == '-': arpa_gz = '/dev/stdin' 15 | if unigrams_out == '-': unigrams_out = '/dev/stdout' 16 | 17 | # Load the words.txt, 18 | words = [ l.split() for l in open(words_txt) ] 19 | 20 | # Load the unigram probabilities in 10log from ARPA, 21 | wrd_log10 = dict() 22 | with gzip.open(arpa_gz,'r') as f: 23 | read = False 24 | for l in f: 25 | if l.strip() == '\\1-grams:': read = True 26 | if l.strip() == '\\2-grams:': break 27 | if read and len(l.split())>=2: 28 | log10_p_unigram, wrd = re.split('[\t ]+',l.strip(),2)[:2] 29 | wrd_log10[wrd] = float(log10_p_unigram) 30 | 31 | # Create list, 'wrd id log_p_unigram', 32 | words_unigram = [[wrd, id, (wrd_log10[wrd] if wrd in wrd_log10 else -99)] for wrd,id in words ] 33 | 34 | print >>sys.stderr, words_unigram[0] 35 | # Store, 36 | with open(unigrams_out,'w') as f: 37 | f.writelines(['%s %s %g\n' % (w,i,p) for (w,i,p) in words_unigram]) 38 | 39 | -------------------------------------------------------------------------------- /local/kaggle/replace_choice.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | sys.path.append('local/data/') 3 | from parse_choices import * 4 | from normalize_utils import * 5 | 6 | def process_outputs(outputs): 7 | L = read_outputs(outputs) 8 | L2 = [] 9 | for name,trans in L: 10 | idx = int(name[1:].replace('.wav','')) 11 | trans = trans.replace(' ','') 12 | L2.append((idx,trans)) 13 | L2 =sorted(L2, key=lambda s: s[0]) 14 | return L2 15 | def write_d(key,X_list,L): 16 | for idx,value in X_list: 17 | for i,l in enumerate(L): 18 | if l["id"] == idx: 19 | L[i][key] = value 20 | break 21 | return L 22 | 23 | 24 | if __name__ == '__main__': 25 | C_outputs = sys.argv[1] 26 | iflytek_json = sys.argv[2] 27 | output_json = sys.argv[3] 28 | d = {} 29 | 30 | C_list = process_outputs(C_outputs) 31 | 32 | C_list_parse = [] 33 | 34 | for idx,trans in C_list: 35 | options = parse(trans) 36 | C_list_parse.append((idx,options)) 37 | 38 | with open(iflytek_json,'r',encoding='utf8') as f: 39 | L = json.load(f) 40 | 41 | L = write_d("options",C_list_parse,L) 42 | with open(output_json,'w',encoding='utf8') as f: 43 | json.dump(L,f,indent=4,ensure_ascii=False) 44 | 45 | -------------------------------------------------------------------------------- /utils/build_const_arpa_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | # This script reads in an Arpa format language model, and converts it into the 7 | # ConstArpaLm format language model. 8 | 9 | # begin configuration section 10 | # end configuration section 11 | 12 | [ -f path.sh ] && . ./path.sh; 13 | 14 | . 
utils/parse_options.sh 15 | 16 | if [ $# != 3 ]; then 17 | echo "Usage: " 18 | echo " $0 [options] " 19 | echo "e.g.:" 20 | echo " $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed" 21 | echo "Options" 22 | exit 1; 23 | fi 24 | 25 | export LC_ALL=C 26 | 27 | arpa_lm=$1 28 | old_lang=$2 29 | new_lang=$3 30 | 31 | mkdir -p $new_lang 32 | 33 | mkdir -p $new_lang 34 | cp -r $old_lang/* $new_lang 35 | 36 | unk=`cat $new_lang/oov.int` 37 | bos=`grep -w "" $new_lang/words.txt | awk '{print $2}'` 38 | eos=`grep "" $new_lang/words.txt | awk '{print $2}'` 39 | if [[ -z $bos || -z $eos ]]; then 40 | echo "$0: and symbols are not in $new_lang/words.txt" 41 | exit 1 42 | fi 43 | 44 | 45 | arpa-to-const-arpa --bos-symbol=$bos \ 46 | --eos-symbol=$eos --unk-symbol=$unk \ 47 | "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1; 48 | 49 | exit 0; 50 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M.proto: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S.proto: -------------------------------------------------------------------------------- 1 | 2 | 720 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/create_oov_char_lexicon.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na) 3 | # 4 | # A script for char-based Chinese OOV lexicon generation. 5 | # 6 | # Input 1: char-based dictionary, example 7 | # CHAR1 ph1 ph2 8 | # CHAR2 ph3 9 | # CHAR3 ph2 ph4 10 | # 11 | # Input 2: OOV word list, example 12 | # WORD1 13 | # WORD2 14 | # WORD3 15 | # 16 | # where WORD1 is in the format of "CHAR1CHAR2". 
17 | # 18 | # Output: OOV lexicon, in the format of normal lexicon 19 | 20 | if($#ARGV != 1) { 21 | print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n"; 22 | print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n"; 23 | print STDERR "### oovwordlist: OOV word list\n"; 24 | print STDERR "### oovlex: output OOV lexicon\n"; 25 | exit; 26 | } 27 | 28 | use utf8; 29 | my %prons; 30 | open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); 31 | foreach () { 32 | chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; 33 | } 34 | close DICT; 35 | 36 | open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); 37 | while () { 38 | chomp; 39 | print $_; 40 | @A = split("", $_); 41 | foreach (@A) { 42 | print " $prons{$_}"; 43 | } 44 | print "\n"; 45 | } 46 | close WORDS; 47 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M.proto.8136: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 8136 1 17 | 8136 8136 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 720 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S.proto.8136: -------------------------------------------------------------------------------- 1 | 2 | 720 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 8136 1 17 | 8136 8136 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M_ivector.proto: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 
| 512 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S_ivector.proto: -------------------------------------------------------------------------------- 1 | 2 | 1020 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 5777 1 17 | 5777 5777 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/kaggle/test/decode_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | all_wav_dir=$1 3 | dir=$2 4 | C_lang_dir=$3 5 | nnet_dir=exp/nnet/tri4a_DFSMN_L_woiv_nnet_ali 6 | nnet_dir=exp/nnet/tri4a_DFSMN_S_woiv 7 | 8 | export nnet_dir=$nnet_dir 9 | thread_num=100 10 | 11 | asr() { 12 | wav_dir=$1 13 | rescore_lang=data/lang_4large_test 14 | use_gpu="no" 15 | 16 | if [ -f $wav_dir/rescore_lang ]; then 17 | rescore_lang=`cat $wav_dir/rescore_lang` 18 | fi 19 | 20 | if [ -f $wav_dir/use_gpu ]; then 21 | use_gpu=`cat $wav_dir/use_gpu` 22 | fi 23 | 24 | local/nnet/decode_from_wav.sh \ 25 | --rescore_lang $rescore_lang \ 26 | --fbank_nj 1 \ 27 | --decode_nj 1 \ 28 | --stage 1 \ 29 | --use_gpu $use_gpu \ 30 | $wav_dir $nnet_dir $wav_dir > /dev/null || echo "error decoding $wav_dir" 31 | rm -r $wav_dir/data $wav_dir/final.mdl 32 | cat $wav_dir/output.txt >> $wav_dir/../output.txt 33 | echo "Done $wav_dir files" 34 | } 35 | 36 | export -f asr 37 | 38 | mkdir -p $dir 39 | startt=`date +%s` 40 | python3 local/data/data_prep_wav_seperate.py $all_wav_dir $dir 41 | python3 local/nnet/test/select_lm.py $dir $C_lang_dir 42 | 43 | parallel -j $thread_num "asr {}" ::: $dir/* 44 | 45 | endt=`date +%s` 46 | runtime=$((endt-startt)) 47 | echo "Total time $runtime seconds" 48 | echo "Total time $runtime seconds" > $dir/run_time 49 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_M_ivector.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 0.000000 1 3 | 2048 2048 4 | 2048 512 0.010000 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 2048 1 12 | 2048 2048 13 | 2048 2048 1 14 | 2048 2048 15 | 2048 512 1 16 | 512 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_S_ivector.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 1020 1024 0.000000 1 3 | 1024 1024 4 | 1024 384 0.010000 1 5 | 384 384 20 20 2 2 6 | 384 384 1024 20 20 2 2 7 | 384 384 1024 20 20 2 2 8 | 384 384 1024 20 20 2 2 9 | 384 384 1024 20 20 2 2 10 | 384 384 1024 20 20 2 2 11 | 384 1024 1 12 | 1024 1024 13 | 1024 1024 1 14 | 1024 1024 15 | 1024 384 1 16 | 384 2560 1 17 | 2560 2560 18 | 19 | 20 | -------------------------------------------------------------------------------- /utils/data/get_utt2num_frames.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | nj=4 8 | 9 | frame_shift=0.01 10 | frame_overlap=0.015 11 | 12 | . 
utils/parse_options.sh 13 | 14 | if [ $# -ne 1 ]; then 15 | echo "This script writes a file utt2num_frames with the " 16 | echo "number of frames in each utterance as measured based on the " 17 | echo "duration of the utterances (in utt2dur) and the specified " 18 | echo "frame_shift and frame_overlap." 19 | echo "Usage: $0 " 20 | exit 1 21 | fi 22 | 23 | data=$1 24 | 25 | if [ -s $data/utt2num_frames ]; then 26 | echo "$0: $data/utt2num_frames already present!" 27 | exit 0; 28 | fi 29 | 30 | if [ ! -f $data/feats.scp ]; then 31 | utils/data/get_utt2dur.sh $data 32 | awk -v fs=$frame_shift -v fovlp=$frame_overlap \ 33 | '{print $1" "int( ($2 - fovlp) / fs)}' $data/utt2dur > $data/utt2num_frames 34 | exit 0 35 | fi 36 | 37 | utils/split_data.sh --per-utt $data $nj || exit 1 38 | $cmd JOB=1:$nj $data/log/get_utt2num_frames.JOB.log \ 39 | feat-to-len scp:$data/split${nj}utt/JOB/feats.scp ark,t:$data/split${nj}utt/JOB/utt2num_frames || exit 1 40 | 41 | for n in `seq $nj`; do 42 | cat $data/split${nj}utt/$n/utt2num_frames 43 | done > $data/utt2num_frames 44 | 45 | echo "$0: Computed and wrote $data/utt2num_frames" 46 | -------------------------------------------------------------------------------- /utils/summarize_warnings.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 4 | 5 | @ARGV != 1 && print STDERR "Usage: summarize_warnings.pl \n" && exit 1; 6 | 7 | $dir = $ARGV[0]; 8 | 9 | ! -d $dir && print STDERR "summarize_warnings.pl: no such directory $dir\n" && exit 1; 10 | 11 | $dir =~ s:/$::; # Remove trailing slash. 12 | 13 | 14 | # Group the files into categories where all have the same base-name. 15 | foreach $f (glob ("$dir/*.log")) { 16 | $f_category = $f; 17 | # do next expression twice; s///g doesn't work as they overlap. 18 | $f_category =~ s:\.\d+\.:.*.:; 19 | $f_category =~ s:\.\d+\.:.*.:; 20 | $fmap{$f_category} .= " $f"; 21 | } 22 | 23 | sub split_hundreds { # split list of filenames into groups of 100. 
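# Batching the log files into groups of 100 presumably keeps each `grep -w WARNING` invocation below the shell's maximum argument-list length when a directory holds thousands of logs.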
24 | my $names = shift @_; 25 | my @A = split(" ", $names); 26 | my @ans = (); 27 | while (@A > 0) { 28 | my $group = ""; 29 | for ($x = 0; $x < 100 && @A>0; $x++) { 30 | $fname = pop @A; 31 | $group .= "$fname "; 32 | } 33 | push @ans, $group; 34 | } 35 | return @ans; 36 | } 37 | 38 | foreach $c (keys %fmap) { 39 | $n = 0; 40 | foreach $fgroup (split_hundreds($fmap{$c})) { 41 | $n += `grep -w WARNING $fgroup | wc -l`; 42 | } 43 | if ($n != 0) { 44 | print "$n warnings in $c\n" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /local/kaggle/demo.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import parse_choices as pc 3 | 4 | def asr(A_path, B_path, C_path): 5 | # Inputs : 6 | # A_path : path of context wav 7 | # B_path : path of question wav 8 | # C_path : path of option wav 9 | # Outputs : 10 | # {"context":"","question":"","options":["","","",""], "answer":-1} 11 | 12 | outputs = subprocess.check_output(['bash' ,'/data/local/kgb/Chinese-ASR/local/kaggle/decode_demo.sh', A_path, B_path, C_path ]) 13 | d = {"context":"","question":"","options":["","","",""], "answer":-1} 14 | for line in outputs.decode('utf-8').split('\n'): 15 | if len(line) == 0 : 16 | continue 17 | line = line.replace('','') 18 | tokens = line.split() 19 | typ = tokens[0] 20 | trans = ' '.join(tokens[1:]) 21 | if typ == 'A': 22 | d['context'] = trans 23 | elif typ == 'B': 24 | d['question'] = trans 25 | elif typ == 'C': 26 | d['options'] = pc.parse(trans.replace(' ','')) 27 | return d 28 | 29 | if __name__ == '__main__': 30 | A_path = '/data/local/kgb/Chinese-ASR/one_qa/A0001500.wav' 31 | B_path = '/data/local/kgb/Chinese-ASR/one_qa/B0001500.wav' 32 | C_path = '/data/local/kgb/Chinese-ASR/one_qa/C0001500.wav' 33 | d = asr(A_path,B_path,C_path) 34 | print(d) 35 | -------------------------------------------------------------------------------- /local/kaggle/mix_LM_with_A.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('local/data/') 4 | from normalize_utils import * 5 | 6 | def read_choose_lm(lm_file): 7 | d = {} 8 | with open(lm_file,'r') as f: 9 | for line in f: 10 | tokens = line.rstrip().split() 11 | name = tokens[0][1:].replace('.wav','') 12 | idx = int(name) 13 | novel = tokens[1] 14 | d[idx] = novel 15 | return d 16 | def process_C(idx): 17 | lm = "lm_test/LM/"+ d[idx] + "_C.lm" 18 | return lm 19 | if __name__ == '__main__': 20 | A_outputs = sys.argv[1] 21 | C_lang_dir = sys.argv[2] 22 | choose_lm = sys.argv[3] 23 | outputs = read_outputs(A_outputs) 24 | d = read_choose_lm(choose_lm) 25 | for (name,trans) in outputs: 26 | idx = int(name[1:].replace('.wav','')) 27 | name = name.replace('.wav','').replace('A','C') 28 | src_dir = os.path.join(C_lang_dir,name) 29 | os.makedirs(src_dir) 30 | 31 | A_txt_path = os.path.join(src_dir,'A.txt') 32 | with open(A_txt_path,'w',encoding='utf-8') as f: 33 | f.write(trans) 34 | 35 | lm = process_C(idx) 36 | ori_lm = os.path.join(os.getcwd(),lm) 37 | lm_path = os.path.join(src_dir,'lm_path') 38 | with open(lm_path,'w') as f: 39 | f.write(lm) 40 | 41 | 42 | -------------------------------------------------------------------------------- /local/data/extract_wiki.py: -------------------------------------------------------------------------------- 1 | import os,sys,json,re 2 | sys.path.append('local/data/tool/jieba-zh_TW') 3 | import jieba 4 | from opencc import OpenCC 5 | from 
number2chinese import * 6 | 7 | def main(wiki_corpus): 8 | openCC = OpenCC('s2t') 9 | for root, dirs, files in os.walk(wiki_corpus, topdown=False): 10 | for name in files: 11 | txt_path = os.path.join(root,name) 12 | print(txt_path) 13 | with open(txt_path,'r',encoding='utf-8') as f: 14 | for line in f: 15 | d = json.loads(line) 16 | text = d['text'].replace('\n\n','\n') 17 | text = openCC.convert(text) 18 | text = text.upper() 19 | tokens = jieba.cut(text) 20 | new_tokens = [] 21 | for token in tokens: 22 | if re.match('^[0-9]+$',token): 23 | if len(token) > 15: 24 | continue 25 | token = to_chinese(int(token)) 26 | new_tokens.append(token) 27 | text = ' '.join(new_tokens) 28 | if len(text) > 0: 29 | print(text) 30 | 31 | 32 | if __name__ == '__main__': 33 | wiki_corpus = sys.argv[1] 34 | main(wiki_corpus) 35 | 36 | -------------------------------------------------------------------------------- /utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # converts an utt2spk file to a spk2utt file. 18 | # Takes input from the stdin or from a file argument; 19 | # output goes to the standard out. 20 | 21 | if ( @ARGV > 1 ) { 22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; 23 | } 24 | 25 | while(<>){ 26 | @A = split(" ", $_); 27 | @A == 2 || die "Invalid line in utt2spk file: $_"; 28 | ($u,$s) = @A; 29 | if(!$seen_spk{$s}) { 30 | $seen_spk{$s} = 1; 31 | push @spklist, $s; 32 | } 33 | push (@{$spk_hash{$s}}, "$u"); 34 | } 35 | foreach $s (@spklist) { 36 | $l = join(' ',@{$spk_hash{$s}}); 37 | print "$s $l\n"; 38 | } 39 | -------------------------------------------------------------------------------- /steps/online/nnet2/copy_ivector_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Johns Hopkins University (author: Hossein Hadian) 4 | # Apache 2.0 5 | 6 | # This script copies the necessary parts of an online ivector directory 7 | # optionally applying a mapping to the ivector_online.scp file 8 | 9 | utt2orig= 10 | 11 | . utils/parse_options.sh 12 | 13 | if [ $# != 2 ]; then 14 | echo "Usage: " 15 | echo " $0 [options] " 16 | echo "e.g.:" 17 | echo " $0 exp/nnet3/online_ivector_train exp/nnet3/online_ivector_train_fs" 18 | echo "Options" 19 | echo " --utt2orig= # utterance id mapping to use" 20 | exit 1; 21 | fi 22 | 23 | 24 | srcdir=$1 25 | destdir=$2 26 | 27 | if [ ! -f $srcdir/ivector_period ]; then 28 | echo "$0: no such file $srcdir/ivector_period" 29 | exit 1; 30 | fi 31 | 32 | if [ "$destdir" == "$srcdir" ]; then 33 | echo "$0: this script requires and to be different." 
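  # i.e. the source and destination iVector directories must be two different paths.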
34 | exit 1 35 | fi 36 | 37 | set -e; 38 | 39 | mkdir -p $destdir 40 | cp -r $srcdir/{conf,ivector_period} $destdir 41 | if [ -z $utt2orig ]; then 42 | cp $srcdir/ivector_online.scp $destdir 43 | else 44 | utils/apply_map.pl -f 2 $srcdir/ivector_online.scp < $utt2orig > $destdir/ivector_online.scp 45 | fi 46 | cp $srcdir/final.ie.id $destdir 47 | 48 | echo "$0: Copied necessary parts of online ivector directory $srcdir to $destdir" 49 | -------------------------------------------------------------------------------- /local/data/data_prep_Tl.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | 3 | def main(corpus_path,file_type): 4 | for root, dirs, files in os.walk(os.path.join(corpus_path,'syl'), topdown=False): 5 | for name in files: 6 | if name.endswith('.txt'): 7 | txt_path = os.path.join(root,name) 8 | with open(txt_path,'r') as f: 9 | for line in f: 10 | tokens = line.rstrip().split() 11 | wav_file, trans = tokens[-1],tokens[1] 12 | 13 | wav_label = wav_file.split('.')[0] 14 | trans = ' '.join(list(trans)) 15 | trans = trans.upper() 16 | 17 | spk = wav_file.split('_')[0] 18 | wav_path = os.path.join(corpus_path,'Wav/{}/{}'.format(spk,wav_file)) 19 | wav_path = os.path.abspath(wav_path) 20 | if file_type == 'wav.scp': 21 | print(wav_label, wav_path) 22 | elif file_type == 'utt2spk': 23 | print(wav_label, spk) 24 | elif file_type == 'text': 25 | print(wav_label, trans) 26 | if __name__ == '__main__': 27 | corpus_path = sys.argv[1] 28 | file_type = sys.argv[2] 29 | main(corpus_path,file_type) 30 | -------------------------------------------------------------------------------- /local/kaggle/decode_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /data/local/kgb/Chinese-ASR 3 | nnet_dir=exp/nnet/tri4a_DFSMN_S_woiv_aug_ori 4 | mode=4 # mode for lmrescoring 5 | 6 | . ./utils/parse_options.sh 7 | 8 | wavA=$1 9 | wavB=$2 10 | 11 | 12 | export nnet_dir=$nnet_dir 13 | export mode=$mode 14 | export graph=$graph 15 | export rescore_arpa=$rescore_arpa 16 | 17 | asr() { 18 | wav=$1 19 | typ=$2 20 | tmpdir=$3 21 | 22 | wav_dir=$tmpdir/$typ 23 | data_dir=$wav_dir/data 24 | 25 | mkdir -p $data_dir 26 | 27 | name=`basename $wav` 28 | 29 | echo $name $wav > $data_dir/wav.scp 30 | echo $name $name > $data_dir/utt2spk 31 | echo $name $name > $data_dir/spk2utt 32 | 33 | rescore_lang=data/LM/ori_$typ 34 | graph=exp/tri4a/graph_pr10_$typ 35 | 36 | local/kaggle/decode_from_wav.sh \ 37 | --rescore true \ 38 | --rescore_lang $rescore_lang \ 39 | --fbank_nj 1 --mode $mode \ 40 | --decode_nj 1 \ 41 | --stage 1 \ 42 | --graph $graph \ 43 | $wav_dir $nnet_dir $wav_dir > $wav_dir/log || echo "error decoding $wav_dir" 44 | 45 | cp $wav_dir/data/wav.scp $wav_dir 46 | rm -r $wav_dir/data 47 | rm -r $wav_dir/final.mdl 48 | 49 | output=`cat $wav_dir/output.txt | cut -d' ' -f2- ` 50 | echo $typ $output 51 | } 52 | 53 | tmpdir=`mktemp -d` 54 | 55 | 56 | ( asr $wavA A $tmpdir ) & 57 | ( asr $wavB B $tmpdir ) & 58 | 59 | 60 | wait 61 | 62 | rm -r $tmpdir 63 | -------------------------------------------------------------------------------- /steps/word_align_lattices.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright Johns Hopkins University (Author: Daniel Povey) 2012 4 | # Apache 2.0. 5 | 6 | # Begin configuration section. 7 | silence_label=0 8 | cmd=run.pl 9 | # End configuration section. 
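# Example invocation (hypothetical paths): steps/word_align_lattices.sh data/lang exp/tri4a/decode_test exp/tri4a/decode_test_wordaligned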
10 | 11 | echo "$0 $@" # Print the command line for logging 12 | 13 | for x in `seq 2`; do 14 | [ "$1" == "--silence-label" ] && silence_label=$2 && shift 2; 15 | [ "$1" == "--cmd" ] && cmd="$2" && shift 2; 16 | done 17 | 18 | if [ $# != 3 ]; then 19 | echo "Word-align lattices (make the arcs sync up with words)" 20 | echo "" 21 | echo "Usage: $0 [options] " 22 | echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--silence-label ]" 23 | exit 1; 24 | fi 25 | 26 | . ./path.sh || exit 1; 27 | 28 | lang=$1 29 | indir=$2 30 | outdir=$3 31 | 32 | mdl=`dirname $indir`/final.mdl 33 | wbfile=$lang/phones/word_boundary.int 34 | 35 | for f in $mdl $wbfile $indir/num_jobs; do 36 | [ ! -f $f ] && echo "word_align_lattices.sh: no such file $f" && exit 1; 37 | done 38 | 39 | mkdir -p $outdir/log 40 | 41 | 42 | cp $indir/num_jobs $outdir; 43 | nj=`cat $indir/num_jobs` 44 | 45 | $cmd JOB=1:$nj $outdir/log/align.JOB.log \ 46 | lattice-align-words --silence-label=$silence_label --test=true \ 47 | $wbfile $mdl "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c >$outdir/lat.JOB.gz" || exit 1; 48 | 49 | -------------------------------------------------------------------------------- /local/nnet/retrain.sh: -------------------------------------------------------------------------------- 1 | . ./path.sh 2 | . ./cmd.sh 3 | 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | . utils/parse_options.sh || exit 1; 9 | 10 | 11 | data=data/train_sp_aug/fbank 12 | ali=exp/aishell2/tri4_ali_train_sp_aug 13 | dnn_model=$1 14 | oridir=$2 15 | visible_gpu=$3 16 | 17 | export CUDA_VISIBLE_DEVICES=$visible_gpu 18 | 19 | ######################### 20 | stage=2 21 | nj=10 22 | 23 | dir=$oridir\_train_more 24 | 25 | lrate=1.95313e-08 26 | mlp_init=$(cat $oridir/.mlp_best) 27 | 28 | if [ $stage -le 3 ]; then 29 | proto=local/nnet/${dnn_model}.proto 30 | ori_num_pdf=`cat $proto |grep "Softmax" |awk '{print $3}'` 31 | echo $ori_num_pdf 32 | new_num_pdf=`gmm-info ./exp/aishell2/tri4_ali_train_sp_aug/final.mdl | grep "number of pdfs" |awk '{print $4}'` 33 | echo $new_num_pdf 34 | new_proto=${proto}.$new_num_pdf 35 | sed -r "s/"$ori_num_pdf"/"$new_num_pdf"/g" $proto > $new_proto 36 | 37 | $cuda_cmd $dir/_train_nnet.log \ 38 | local/nnet/train_more.sh --learn-rate $lrate --nnet-proto $new_proto \ 39 | --start_half_lr 10 --momentum 0.9 \ 40 | --train-tool "nnet-train-fsmn-streams" \ 41 | --feat-type plain --splice 1 \ 42 | --cmvn-opts "--norm-means=true --norm-vars=false" --delta_opts "--delta-order=2" \ 43 | --train-tool-opts "--minibatch-size=4096" \ 44 | --max_iters 7 \ 45 | --split_feats 7 \ 46 | $mlp_init $data data/lang $ali $dir 47 | fi 48 | -------------------------------------------------------------------------------- /utils/data/resample_data_dir.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2018 Xiaohui Zhang 5 | # Apache 2.0. 6 | 7 | if [ $# -ne 2 ]; then 8 | echo "This script adds a sox line in wav.scp to resample the audio at a " 9 | echo "different sampling-rate" 10 | echo "Usage: $0 " 11 | echo " e.g.: $0 8000 data/dev" 12 | exit 1 13 | fi 14 | 15 | freq=$1 16 | dir=$2 17 | 18 | sox=`which sox` || { echo "Could not find sox in PATH"; exit 1; } 19 | 20 | if [ -f $dir/feats.scp ]; then 21 | mkdir -p $dir/.backup 22 | mv $dir/feats.scp $dir/.backup/ 23 | if [ -f $dir/cmvn.scp ]; then 24 | mv $dir/cmvn.scp $dir/.backup/ 25 | fi 26 | echo "$0: feats.scp already exists. 
Moving it to $dir/.backup" 27 | fi 28 | 29 | # After resampling we cannot compute utt2dur from wav.scp any more, 30 | # so we create utt2dur now, in case it's needed later 31 | if [ ! -s $dir/utt2dur ]; then 32 | utils/data/get_utt2dur.sh $dir 1>&2 || exit 1; 33 | fi 34 | 35 | mv $dir/wav.scp $dir/wav.scp.tmp 36 | cat $dir/wav.scp.tmp | python -c "import sys 37 | for line in sys.stdin.readlines(): 38 | splits = line.strip().split() 39 | if splits[-1] == '|': 40 | out_line = line.strip() + ' $sox -t wav - -c 1 -b 16 -t wav - rate $freq |' 41 | else: 42 | out_line = 'cat {0} {1} | $sox -t wav - -c 1 -b 16 -t wav - rate $freq |'.format(splits[0], ' '.join(splits[1:])) 43 | print (out_line)" > ${dir}/wav.scp 44 | rm $dir/wav.scp.tmp 45 | 46 | -------------------------------------------------------------------------------- /utils/shuffle_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | if ($ARGV[0] eq "--srand") { 20 | $n = $ARGV[1]; 21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; 22 | srand($ARGV[1]); 23 | shift; 24 | shift; 25 | } else { 26 | srand(0); # Gives inconsistent behavior if we don't seed. 27 | } 28 | 29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we 30 | # don't understand. 31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; 32 | print "randomizes the order of lines of input.\n"; 33 | exit(1); 34 | } 35 | 36 | @lines; 37 | while (<>) { 38 | push @lines, [ (rand(), $_)] ; 39 | } 40 | 41 | @lines = sort { $a->[0] cmp $b->[0] } @lines; 42 | foreach $l (@lines) { 43 | print $l->[1]; 44 | } 45 | -------------------------------------------------------------------------------- /steps/nnet2/check_ivectors_compatible.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2016, Johns Hopkins University (Yenda Trmal ) 3 | # License: Apache 2.0 4 | 5 | # Begin configuration section. 6 | # End configuration section 7 | 8 | #echo >&2 "$0 $@" # Print the command line for logging 9 | if [ $# != 2 ] ; then 10 | echo >&2 "Usage: $0 " 11 | echo >&2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" 12 | fi 13 | 14 | dir_a=$1 15 | dir_b=$2 16 | 17 | id_a=$(steps/nnet2/get_ivector_id.sh $dir_a) 18 | ret_a=$? 19 | id_b=$(steps/nnet2/get_ivector_id.sh $dir_b) 20 | ret_b=$? 21 | 22 | if [ ! -z "$id_a" ] && [ ! -z "${id_b}" ] ; then 23 | if [ "${id_a}" == "${id_b}" ]; then 24 | exit 0 25 | else 26 | echo >&2 "$0: ERROR: iVector id ${id_a} in $dir_a and the iVector id ${id_b} in $dir_b do not match" 27 | echo >&2 "$0: ERROR: that means that the systems are not compatible." 
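    # (Mismatched IDs suggest the i-vectors in the two directories were produced by different iVector extractors, so models built on one should not be decoded with the other.)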
28 | exit 1 29 | fi 30 | elif [ -z "$id_a" ] && [ -z "${id_b}" ] ; then 31 | echo >&2 "$0: WARNING: The directories do not contain iVector ID." 32 | echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " 33 | echo >&2 "$0: WARNING: the directories compatible" 34 | exit 0 35 | else 36 | echo >&2 "$0: WARNING: One of the directories do not contain iVector ID." 37 | echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping " 38 | echo >&2 "$0: WARNING: the directories compatible" 39 | exit 0 40 | fi 41 | -------------------------------------------------------------------------------- /utils/analyze_segments.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright 2015 GoVivace Inc. (Author: Nagendra Kumar Goel) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Analyze a segments file and print important stats on it. 18 | 19 | $dur = $total = 0; 20 | $maxDur = 0; 21 | $minDur = 9999999999; 22 | $n = 0; 23 | while(<>){ 24 | chomp; 25 | @t = split(/\s+/); 26 | $dur = $t[3] - $t[2]; 27 | $total += $dur; 28 | if ($dur > $maxDur) { 29 | $maxSegId = $t[0]; 30 | $maxDur = $dur; 31 | } 32 | if ($dur < $minDur) { 33 | $minSegId = $t[0]; 34 | $minDur = $dur; 35 | } 36 | $n++; 37 | } 38 | $avg=$total/$n; 39 | $hrs = $total/3600; 40 | print "Total $hrs hours of data\n"; 41 | print "Average segment length $avg seconds\n"; 42 | print "Segment $maxSegId has length of $maxDur seconds\n"; 43 | print "Segment $minSegId has length of $minDur seconds\n"; 44 | -------------------------------------------------------------------------------- /local/lm/run_3gram_kaggle5.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . 
path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | model_dir=exp/tri4a 8 | LM=data/LM 9 | text=data/text 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | 13 | #LM training 14 | mkdir -p $LM/3gram 15 | #ngram-count -text $text/mix.txt -lm $LM/3gram/mix.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 16 | #ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 17 | 18 | for x in A B C ; do 19 | #ngram-count -text $text/kaggle123_$x.txt -lm $LM/3gram/kaggle123_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 20 | #ngram-count -text $text/kaggle1234_$x.txt -lm $LM/3gram/kaggle1234_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 21 | 22 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle123_$x.lm $LM/3gram/mix.lm \ 23 | $LM/3gram/kaggle1234_$x.lm $text/kaggle4_$x.txt $LM/3gram/ori_$x\_10.0_kaggle1234.lm 24 | done 25 | 26 | for x in A B C ; do 27 | ( 28 | lm=$LM/3gram/ori_$x\_10.0_kaggle1234.lm 29 | lang_test=./data/wfst/lang_test_pr10_$x\_kaggle5 30 | graph_dir=exp/tri4a/graph_pr10_$x\_kaggle5 31 | #G compilation and check L and G stochastic 32 | local/lm/wfst/format_data.sh $lm $lang $lang_test 33 | 34 | #compose HCLG(choice) 35 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 36 | ) & 37 | done 38 | wait 39 | -------------------------------------------------------------------------------- /steps/nnet2/remove_egs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Johns Hopkins University (Author: Daniel Povey). 4 | # Apache 2.0. 5 | 6 | # This script removes the examples in an egs/ directory, e.g. 7 | # steps/nnet2/remove_egs.sh exp/nnet4b/egs/ 8 | # We give it its own script because we need to be careful about 9 | # things that are soft links to something in storage/ (i.e. remove the 10 | # data that's linked to as well as the soft link), and we want to not 11 | # delete the examples if someone has done "touch $dir/egs/.nodelete". 12 | 13 | 14 | if [ $# != 1 ]; then 15 | echo "Usage: $0 " 16 | echo "e.g.: $0 data/nnet4b/egs/" 17 | echo "e.g.: $0 data/nnet4b_mpe/degs/" 18 | echo "This script is usually equivalent to 'rm /egs.* /degs.*' but it follows" 19 | echo "soft links to /storage/; and it avoids deleting anything in the directory if" 20 | echo "someone did 'touch /.nodelete" 21 | exit 1; 22 | fi 23 | 24 | egs=$1 25 | 26 | if [ ! -d $egs ]; then 27 | echo "$0: expected directory $egs to exist" 28 | exit 1; 29 | fi 30 | 31 | if [ -f $egs/.nodelete ]; then 32 | echo "$0: not deleting egs in $egs since $egs/.nodelete exists" 33 | exit 0; 34 | fi 35 | 36 | 37 | 38 | for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do 39 | if [ -L $f ]; then 40 | rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. 41 | fi 42 | rm $f 2>/dev/null 43 | done 44 | 45 | 46 | echo "$0: Finished deleting examples in $egs" 47 | -------------------------------------------------------------------------------- /utils/data/convert_data_dir_to_whole.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2016-2018 Vimal Manohar 4 | # Apache 2.0 5 | 6 | # This scripts converts a data directory into a "whole" data directory 7 | # by removing the segments and using the recordings themselves as 8 | # utterances 9 | 10 | set -o pipefail 11 | 12 | . 
./path.sh 13 | 14 | . utils/parse_options.sh 15 | 16 | if [ $# -ne 2 ]; then 17 | echo "Usage: convert_data_dir_to_whole.sh " 18 | echo " e.g.: convert_data_dir_to_whole.sh data/dev data/dev_whole" 19 | exit 1 20 | fi 21 | 22 | data=$1 23 | dir=$2 24 | 25 | if [ ! -f $data/segments ]; then 26 | echo "$0: Data directory already does not contain segments. So just copying it." 27 | utils/copy_data_dir.sh $data $dir 28 | exit 0 29 | fi 30 | 31 | mkdir -p $dir 32 | cp $data/wav.scp $dir 33 | if [ -f $data/reco2file_and_channel ]; then 34 | cp $data/reco2file_and_channel $dir; 35 | fi 36 | 37 | mkdir -p $dir/.backup 38 | mv $dir/feats.scp $dir/cmvn.scp $dir/.backup 39 | 40 | rm $dir/utt2spk || true 41 | 42 | [ -f $data/stm ] && cp $data/stm $dir 43 | [ -f $data/glm ] && cp $data/glm $dir 44 | 45 | utils/data/internal/combine_segments_to_recording.py \ 46 | --write-reco2utt=$dir/reco2sorted_utts $data/segments $dir/utt2spk || exit 1 47 | 48 | if [ -f $data/text ]; then 49 | utils/apply_map.pl -f 2- $data/text < $dir/reco2sorted_utts > $dir/text || exit 1 50 | fi 51 | 52 | rm $dir/reco2sorted_utts 53 | 54 | utils/fix_data_dir.sh $dir || exit 1 55 | 56 | exit 0 57 | -------------------------------------------------------------------------------- /utils/data/limit_feature_dim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Alibaba Robotics Corp. (author: Xingyu Na) 4 | # Apache 2.0 5 | 6 | # The script creates a new data directory by selecting a specified 7 | # dimension range of the features in the source directory. 8 | 9 | . utils/parse_options.sh 10 | 11 | if [ $# != 3 ]; then 12 | echo "Usage: " 13 | echo " $0 " 14 | echo "The script creates a new data directory by selecting a specified" 15 | echo "dimension range of the features in the source directory." 16 | echo "e.g.:" 17 | echo " $0 0:39 data/train_hires_pitch data/train_hires" 18 | exit 1; 19 | fi 20 | 21 | feat_dim_range=$1 22 | srcdir=$2 23 | destdir=$3 24 | 25 | if [ "$destdir" == "$srcdir" ]; then 26 | echo "$0: this script requires and to be different." 27 | exit 1 28 | fi 29 | 30 | if [ ! -f $srcdir/feats.scp ]; then 31 | echo "$0: no such file $srcdir/feats.scp" 32 | exit 1; 33 | fi 34 | 35 | mkdir -p $destdir 36 | utils/copy_data_dir.sh $srcdir $destdir 37 | 38 | if [ -f $destdir/cmvn.scp ]; then 39 | rm $destdir/cmvn.scp 40 | echo "$0: warning: removing $destdir/cmvn.cp, you will have to regenerate it from the features." 41 | fi 42 | 43 | rm $destdir/feats.scp 44 | sed 's/$/\[:,'${feat_dim_range}'\]/' $srcdir/feats.scp | \ 45 | utils/data/normalize_data_range.pl > $destdir/feats.scp 46 | 47 | [ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" 48 | utils/validate_data_dir.sh $validate_opts $destdir 49 | -------------------------------------------------------------------------------- /local/nnet/augment_data_only_kgb_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | fbankdir=data/fbank 3 | 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | set -euo pipefail 8 | noise_dir=data/kgb_noise 9 | 10 | for corpus in cyberon_chinese_test TOCFL train_sp ; do 11 | data=data/$corpus/fbank 12 | data_aug=data/$corpus\_aug_kgb_noise/fbank 13 | if [ ! -f $data/reco2dur ] ; then 14 | bash utils/data/get_reco2utt.sh $data || exit 1 15 | fi 16 | 17 | if [ ! 
-f $noise_dir ] ; then 18 | bash utils/data/get_reco2utt.sh $noise_dir || exit 1 19 | fi 20 | 21 | python2 steps/data/augment_data_dir.py --utt-suffix aug_kgb_noise --bg-snrs 9:7:5 --num-bg-noises 1 --bg-noise-dir $noise_dir $data $data_aug 22 | 23 | name=$corpus\_aug_kgb_noise 24 | steps/make_fbank.sh --nj 50 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name $data_aug exp/make_fbank/$name $fbankdir 25 | steps/compute_cmvn_stats.sh --name $name $data_aug exp/make_fbank/$name $fbankdir 26 | 27 | 28 | rm -rf ./data/$corpus\_rvb_aug/fbank 29 | utils/combine_data.sh ./data/$corpus\_aug_kgb_noise_ori/fbank $data_aug $data 30 | done 31 | 32 | ali_src=exp/tri4a_sp_ali 33 | ali_target=exp/tri4a_sp_aug_kgb_noise_ali 34 | rm -r $ali_target 35 | cp -r $ali_src $ali_target 36 | local/nnet/copy_alignment.sh $ali_target 37 | 38 | ali_src=exp/tri4a_ali_cyberon_chinese_test 39 | ali_target=exp/tri4a_ali_cyberon_chinese_test_aug_kgb_noise 40 | 41 | rm -r $ali_target 42 | cp -r $ali_src $ali_target 43 | local/nnet/copy_alignment.sh $ali_target 44 | -------------------------------------------------------------------------------- /utils/show_lattice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | format=pdf # pdf svg 4 | mode=save # display save 5 | lm_scale=0.0 6 | acoustic_scale=0.0 7 | #end of config 8 | 9 | . utils/parse_options.sh 10 | 11 | if [ $# != 3 ]; then 12 | echo "usage: $0 [--mode display|save] [--format pdf|svg] " 13 | echo "e.g.: $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt" 14 | exit 1; 15 | fi 16 | 17 | . ./path.sh 18 | 19 | uttid=$1 20 | lat=$2 21 | words=$3 22 | 23 | tmpdir=$(mktemp -d /tmp/kaldi.XXXX); # trap "rm -r $tmpdir" EXIT # cleanup 24 | 25 | gunzip -c $lat | lattice-to-fst --lm-scale=$lm_scale --acoustic-scale=$acoustic_scale ark:- "scp,p:echo $uttid $tmpdir/$uttid.fst|" || exit 1; 26 | ! [ -s $tmpdir/$uttid.fst ] && \ 27 | echo "Failed to extract lattice for utterance $uttid (not present?)" && exit 1; 28 | fstdraw --portrait=true --osymbols=$words $tmpdir/$uttid.fst | dot -T${format} > $tmpdir/$uttid.${format} 29 | 30 | if [ "$(uname)" == "Darwin" ]; then 31 | doc_open=open 32 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 33 | doc_open=xdg-open 34 | elif [ $mode == "display" ] ; then 35 | echo "Can not automaticaly open file on your operating system" 36 | mode=save 37 | fi 38 | 39 | [ $mode == "display" ] && $doc_open $tmpdir/$uttid.${format} 40 | [[ $mode == "display" && $? -ne 0 ]] && echo "Failed to open ${format} format." && mode=save 41 | [ $mode == "save" ] && echo "Saving to $uttid.${format}" && cp $tmpdir/$uttid.${format} . 
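# Note: the cleanup trap on $tmpdir is commented out above, so the temporary directory created under /tmp is left in place when the script exits.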
42 | 43 | exit 0 44 | -------------------------------------------------------------------------------- /local/kaggle/replace_iflytek_choice.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | from parse_choices import * 3 | from normalize_utils import * 4 | 5 | 6 | 7 | if __name__ == '__main__': 8 | choices_file = sys.argv[1] 9 | iflytek_json = sys.argv[2] 10 | output_json = sys.argv[3] 11 | d = {} 12 | kaggle_dir = '/data/local/kgb/corpus/kgb/kaggle3' 13 | 14 | xlxs_path = os.path.join(kaggle_dir,'answer.xlsx') 15 | content = read_xlsx(xlxs_path) 16 | ''' 17 | with open(choices_file,'r',encoding='utf8') as f: 18 | for line in f: 19 | tokens = line.rstrip().split() 20 | text = ''.join(tokens[1:]) 21 | name = tokens[0].replace('.wav','') 22 | No = int(name[1:]) 23 | d[No] = parse(text) 24 | ''' 25 | d2 = {} 26 | for idx,row in enumerate(content): 27 | if idx == 0: 28 | continue 29 | No,passage,question,c1,c2,c3,c4 = row[:7] 30 | No = int(No[1:]) 31 | print(question) 32 | n_q = normalize(str(question),[]) 33 | q = n_q.replace(' ','') 34 | d2[No] = q 35 | 36 | 37 | with open(iflytek_json,'r',encoding='utf8') as f: 38 | data = json.load(f) 39 | outputs = [] 40 | for sample in data: 41 | id = sample['id'] 42 | sample['options'] = d[id] 43 | sample['question'] = d2[id] 44 | outputs.append(sample) 45 | with open(output_json,'w',encoding='utf8') as f: 46 | json.dump(outputs,f,indent=4,ensure_ascii=False) 47 | 48 | -------------------------------------------------------------------------------- /steps/segmentation/copy_targets_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagendra Kumar Goel 4 | # 2014 Johns Hopkins University (author: Nagendra K Goel) 5 | # Apache 2.0 6 | 7 | # This script makes a copy of targets directory (by copying targets.scp), 8 | # possibly adding a specified prefix or a suffix to the utterance names. 9 | 10 | # begin configuration section 11 | utt_prefix= 12 | utt_suffix= 13 | # end configuration section 14 | 15 | if [ -f ./path.sh ]; then . ./path.sh; fi 16 | . ./utils/parse_options.sh 17 | 18 | if [ $# != 2 ]; then 19 | echo "Usage: " 20 | echo " $0 [options] " 21 | echo "e.g.:" 22 | echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" 23 | echo "Options" 24 | echo " --utt-prefix= # Prefix for utterance ids, default empty" 25 | echo " --utt-suffix= # Suffix for utterance ids, default empty" 26 | exit 1; 27 | fi 28 | 29 | export LC_ALL=C 30 | 31 | srcdir=$1 32 | destdir=$2 33 | 34 | mkdir -p $destdir 35 | 36 | if [ -f $srcdir/frame_subsampling_factor ]; then 37 | cp $srcdir/frame_subsampling_factor $destdir 38 | fi 39 | 40 | cat $srcdir/targets.scp | awk -v p=$utt_prefix -v s=$utt_suffix \ 41 | '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map 42 | 43 | cat $srcdir/targets.scp | utils/apply_map.pl -f 1 $destdir/utt_map | \ 44 | sort -k1,1 > $destdir/targets.scp 45 | 46 | echo "$0: copied targets from $srcdir to $destdir" 47 | -------------------------------------------------------------------------------- /utils/best_wer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2010-2011 Microsoft Corporation 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # To be run from one directory above this script. 19 | 20 | perl -e 'while(<>){ 21 | s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g; 22 | if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool. 23 | elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|: 24 | && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite. 25 | if (defined $bestline){ print $bestline; } ' | \ 26 | awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \ 27 | awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \ 28 | awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \ 29 | sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||' 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /local/extract_kaggle_feature.sh: -------------------------------------------------------------------------------- 1 | #$!/bin/bash 2 | nj=8 #number of job parallel running 3 | stage=0 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | 8 | 9 | if [ $stage -le 1 ] ; then 10 | mfccdir=data/mfcc_pitch 11 | mkdir -p $mfccdir 12 | 13 | for corpus in kaggle1 kaggle2 kaggle3 ; do 14 | combine48='' 15 | for typ in A B C ; do 16 | ##Extract MFCC39 + pitch9 feature 17 | data=./data/$corpus/$typ/mfcc39_pitch9 18 | name=$corpus\_$typ 19 | combine48="$data $combine48" 20 | steps/make_mfcc_pitch_online.sh --cmd "$train_cmd" --nj $nj --name $name $data exp/make_mfcc/$name $mfccdir || exit 1; 21 | steps/compute_cmvn_stats.sh --name $name $data exp/make_mfcc/$name $mfccdir || exit 1; 22 | done 23 | utils/combine_data.sh ./data/$corpus/mfcc39_pitch9 $combine48 24 | done 25 | fi 26 | 27 | if [ $stage -le 1 ] ; then 28 | fbankdir=data/fbank 29 | mkdir -p $fbankdir 30 | 31 | for corpus in kaggle1 kaggle2 kaggle3 ; do 32 | combine48='' 33 | for typ in A B C ; do 34 | mfccdata=./data/$corpus/$typ/mfcc39_pitch9 35 | data=./data/$corpus/$typ/fbank 36 | name=$corpus\_$typ 37 | combine48="$data $combine48" 38 | 39 | utils/copy_data_dir.sh $mfccdata $data 40 | steps/make_fbank.sh --nj 30 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name \ 41 | $data exp/make_fbank/$name $fbankdir 42 | steps/compute_cmvn_stats.sh --name $name $data exp/make_fbank/$name $fbankdir 43 | done 44 | utils/combine_data.sh ./data/$corpus/fbank $combine48 45 | done 46 | fi 47 | -------------------------------------------------------------------------------- /utils/remove_oovs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script removes lines that contain these OOVs on either the 18 | # third or fourth fields of the line. It is intended to remove arcs 19 | # with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). 20 | 21 | if ( @ARGV < 1 && @ARGV > 2) { 22 | die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; 23 | } 24 | 25 | $unklist = shift @ARGV; 26 | open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; 27 | while(){ 28 | @A = split(" ", $_); 29 | @A == 1 || die "Bad line in unknown-symbol list: $_"; 30 | $unk{$A[0]} = 1; 31 | } 32 | 33 | $num_removed = 0; 34 | while(<>){ 35 | @A = split(" ", $_); 36 | if(defined $unk{$A[2]} || defined $unk{$A[3]}) { 37 | $num_removed++; 38 | } else { 39 | print; 40 | } 41 | } 42 | print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; 43 | 44 | -------------------------------------------------------------------------------- /local/data/normalize.py: -------------------------------------------------------------------------------- 1 | import string,sys 2 | import re 3 | from number2chinese import * 4 | 5 | fin = sys.argv[1] 6 | fout = sys.argv[2] 7 | 8 | l = [] 9 | with open(fin,'r') as f: 10 | for line in f: 11 | for cha in [' ','、','「','」','”','“','…',')',')',':']: 12 | line = line.replace(cha,'') 13 | for cha in string.punctuation: 14 | line = line.replace(cha,'') 15 | for cha in [',',':','?','、','。',';','!']: 16 | line = line.replace(cha,'\n') 17 | line = line.replace('\n\n','\n') 18 | if len(line) >= 1: 19 | ## 我是john先生 -> 我 是 john 先 生 20 | newline = '' 21 | flag = True 22 | for char in line: 23 | if re.match('^[a-zA-Z0-9]+$',char): 24 | flag = False 25 | newline += char 26 | else: 27 | if not flag: 28 | newline += ' ' 29 | flag = True 30 | newline += char + ' ' 31 | if flag: 32 | newline = newline[:-1] 33 | #covert number to chinese 34 | line = '' 35 | for token in newline.split(' '): 36 | if re.match('^[0-9]+$',token): 37 | if len(token) > 15: 38 | break 39 | token = to_chinese(int(token)) 40 | token = ' '.join(list(token)) 41 | line += token + ' ' 42 | l.append(line[:-1]) 43 | with open(fout,'w') as f: 44 | for line in l: 45 | f.write(line) 46 | -------------------------------------------------------------------------------- /local/lm/news_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests,sys 3 | from bs4 import BeautifulSoup 4 | 5 | # https://www.ettoday.net/news/news-list-2017-07-15-5.htm 6 | # 1 政治 7 | # 17 財經 8 | # 2 國際 9 | # 6 社會 10 | # 9 影劇 11 | # 10 體育 12 | # 20 3c 13 | # 30 時尚 14 | # 24 遊戲 15 | # 5 生活 16 | for tt in [1, 17, 2, 6, 9, 10, 20, 30, 24, 5]: 17 | urls = [] 18 | for n in range(1,12): 19 | for n2 in [5,10,15,20,25,31]: 20 | u = "https://www.ettoday.net/news/news-list-"+str(sys.argv[1]) + "-" + str(n)+"-"+str(n2)+"-"+str(tt)+".htm" 21 | res = requests.get(u) 22 | soup = BeautifulSoup(res.content, "lxml") 23 | soup = soup.find("div", class_="part_list_2") 24 | domian = 
"https://www.ettoday.net" 25 | for a in soup.find_all("h3"): 26 | urls.append(domian+a.a['href']) 27 | allcontent = [] 28 | for u in urls: 29 | content = [] 30 | res = requests.get(u) 31 | soup = BeautifulSoup(res.content, "lxml") 32 | try: 33 | soup = soup.find("div", class_="story") 34 | for a in soup.find_all("p"): 35 | p = a.string 36 | if p != None: 37 | p = p.split('/') 38 | if len(p) > 1: 39 | content.append(p[1]) 40 | print(p[1].encode('utf-8')) 41 | else: 42 | content.append(p[0]) 43 | print(p[0].encode('utf-8')) 44 | allcontent.append(content.encode('utf-8')) 45 | except: 46 | pass 47 | print(len(allcontent)) 48 | -------------------------------------------------------------------------------- /steps/nnet3/nnet3_to_dot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # script showing use of nnet3_to_dot.py 4 | # Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). 5 | 6 | # Begin configuration section. 7 | component_attributes="name,type" 8 | node_prefixes="" 9 | info_bin=nnet3-am-info 10 | echo "$0 $@" # Print the command line for logging 11 | 12 | [ -f ./path.sh ] && . ./path.sh; # source the path. 13 | . parse_options.sh || exit 1; 14 | 15 | if [ $# != 3 ]; then 16 | echo "Usage: $0 [opts] " 17 | echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.png" 18 | echo "" 19 | echo "Main options (for others, see top of script file)" 20 | echo " --info-bin # Name of the binary to generate the nnet3 file" 21 | echo " --component-attributes # attributes to be printed in nnet3 components" 22 | echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" 23 | echo " # will be clustered together in the dot-graph" 24 | 25 | 26 | exit 1; 27 | fi 28 | 29 | model=$1 30 | dot_file=$2 31 | output_file=$3 32 | 33 | attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} 34 | $info_bin $model | \ 35 | steps/nnet3/dot/nnet3_to_dot.py \ 36 | --component-attributes "$component_attributes" \ 37 | $attr $dot_file 38 | echo "Generated the dot file $dot_file" 39 | 40 | command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } 41 | dot -Tpdf $dot_file -o $output_file 42 | -------------------------------------------------------------------------------- /steps/scoring/score_kaldi_compare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2016 Nicolas Serrano 3 | # Apache 2.0 4 | 5 | [ -f ./path.sh ] && . ./path.sh 6 | 7 | # begin configuration section. 8 | cmd=run.pl 9 | replications=10000 10 | #end configuration section. 11 | 12 | echo "$0 $@" # Print the command line for logging 13 | [ -f ./path.sh ] && . ./path.sh 14 | . parse_options.sh || exit 1; 15 | 16 | if [ $# -ne 3 ]; then 17 | echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " 18 | echo " Options:" 19 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 20 | echo " --replications # number of bootstrap evaluation to compute confidence." 21 | exit 1; 22 | fi 23 | 24 | dir1=$1 25 | dir2=$2 26 | dir_compare=$3 27 | 28 | mkdir -p $dir_compare/log 29 | 30 | for d in $dir1 $dir2; do 31 | for f in test_filt.txt best_wer; do 32 | [ ! 
-f $d/$f ] && echo "$0: no such file $d/$f" && exit 1; 33 | done 34 | done 35 | 36 | 37 | best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) 38 | best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ 39 | awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') 40 | 41 | best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) 42 | best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ 43 | awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') 44 | 45 | $cmd $dir_compare/log/score_compare.log \ 46 | compute-wer-bootci --replications=$replications \ 47 | ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ 48 | '>' $dir_compare/wer_bootci_comparison || exit 1; 49 | 50 | exit 0; 51 | -------------------------------------------------------------------------------- /utils/add_disambig.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # Adds some specified number of disambig symbols to a symbol table. 19 | # Adds these as #1, #2, etc. 20 | # If the --include-zero option is specified, includes an extra one 21 | # #0. 22 | 23 | $include_zero = 0; 24 | if($ARGV[0] eq "--include-zero") { 25 | $include_zero = 1; 26 | shift @ARGV; 27 | } 28 | 29 | if(@ARGV != 2) { 30 | die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt "; 31 | } 32 | 33 | 34 | $input = $ARGV[0]; 35 | $nsyms = $ARGV[1]; 36 | 37 | open(F, "<$input") || die "Opening file $input"; 38 | 39 | while() { 40 | @A = split(" ", $_); 41 | @A == 2 || die "Bad line $_"; 42 | $lastsym = $A[1]; 43 | print; 44 | } 45 | 46 | if(!defined($lastsym)){ 47 | die "Empty symbol file?"; 48 | } 49 | 50 | if($include_zero) { 51 | $lastsym++; 52 | print "#0 $lastsym\n"; 53 | } 54 | 55 | for($n = 1; $n <= $nsyms; $n++) { 56 | $y = $n + $lastsym; 57 | print "#$n $y\n"; 58 | } 59 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L.proto: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 5777 1 21 | 5777 5777 22 | 23 | 24 | -------------------------------------------------------------------------------- /utils/data/modify_speaker_info_to_recording.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0. 
5 | 6 | # Copy the data directory, but modify it to use the recording-id as the 7 | # speaker. This is useful to get matching speaker information in the 8 | # whole recording data directory. 9 | # Note that this also appends the recording-id as a prefix to the 10 | # utterance-id. 11 | 12 | if [ $# -ne 2 ]; then 13 | echo "Usage: $0 " 14 | echo " e.g.: $0 data/train data/train_recospk" 15 | exit 1 16 | fi 17 | 18 | in_data=$1 19 | out_data=$2 20 | 21 | mkdir -p $out_data 22 | 23 | for f in wav.scp segments utt2spk; do 24 | if [ ! -f $in_data/$f ]; then 25 | echo "$0: Could not find file $in_data/$f" 26 | exit 1 27 | fi 28 | done 29 | 30 | cp $in_data/wav.scp $out_data/ || exit 1 31 | cp $in_data/reco2file_and_channel $out_data/ 2> /dev/null || true 32 | awk '{print $1" "$2"-"$1}' $in_data/segments > \ 33 | $out_data/old2new.uttmap || exit 1 34 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/segments > \ 35 | $out_data/segments || exit 1 36 | awk '{print $1" "$2}' $out_data/segments > $out_data/utt2spk || exit 1 37 | utils/utt2spk_to_spk2utt.pl $out_data/utt2spk > $out_data/spk2utt || exit 1 38 | 39 | if [ -f $in_data/text ]; then 40 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/text > \ 41 | $out_data/text || exit 1 42 | fi 43 | 44 | if [ -f $in_data/feats.scp ]; then 45 | utils/apply_map.pl -f 1 $out_data/old2new.uttmap < $in_data/feats.scp > \ 46 | $out_data/feats.scp || exit 1 47 | fi 48 | 49 | utils/fix_data_dir.sh $out_data || exit 1 50 | utils/validate_data_dir.sh --no-text --no-feats $out_data || exit 1 51 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 2560 1 21 | 2560 2560 22 | 23 | 24 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L.proto.8136: -------------------------------------------------------------------------------- 1 | 2 | 720 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 8136 1 21 | 8136 8136 22 | 23 | 24 | -------------------------------------------------------------------------------- /steps/segmentation/combine_targets_dirs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Nagendra Kumar Goel 4 | # 2018 Vimal Manohar 5 | # Apache 2.0. 6 | 7 | # This script combines targets directory into a new targets directory 8 | # containing targets from all the input targets directories. 9 | 10 | echo "$0 $@" # Print the command line for logging 11 | 12 | if [ -f path.sh ]; then . ./path.sh; fi 13 | . parse_options.sh || exit 1; 14 | 15 | if [ $# -lt 3 ]; then 16 | echo "Usage: $0 [options] ..." 
17 | echo "e.g.: $0 data/train exp/targets_combined exp/targets_1 exp/targets_2" 18 | exit 1; 19 | fi 20 | 21 | export LC_ALL=C 22 | 23 | data=$1; 24 | shift; 25 | dest=$1; 26 | shift; 27 | first_src=$1; 28 | 29 | mkdir -p $dest; 30 | rm -f $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null 31 | 32 | frame_subsampling_factor=1 33 | if [ -f $first_src/frame_subsampling_factor ]; then 34 | cp $first_src/frame_subsampling_factor $dest 35 | frame_subsampling_factor=$(cat $dest/frame_subsampling_factor) 36 | fi 37 | 38 | for d in $*; do 39 | this_frame_subsampling_factor=1 40 | if [ -f $d/frame_subsampling_factor ]; then 41 | this_frame_subsampling_factor=$(cat $d/frame_subsampling_factor) 42 | fi 43 | 44 | if [ $this_frame_subsampling_factor != $frame_subsampling_factor ]; then 45 | echo "$0: Cannot combine targets directories with different frame-subsampling-factors" 1>&2 46 | exit 1 47 | fi 48 | 49 | cat $d/targets.scp 50 | done | sort -k1,1 > $dest/targets.scp || exit 1 51 | 52 | steps/segmentation/validate_targets_dir.sh $dest $data || exit 1 53 | 54 | echo "Combined targets and stored in $dest" 55 | exit 0 56 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L_ivector.proto: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 5777 1 21 | 5777 5777 22 | 23 | 24 | -------------------------------------------------------------------------------- /local/nnet/DFSMN_L_ivector.proto.2560: -------------------------------------------------------------------------------- 1 | 2 | 1020 2048 1 3 | 2048 2048 4 | 2048 512 1 5 | 512 512 20 20 2 2 6 | 512 512 2048 20 20 2 2 7 | 512 512 2048 20 20 2 2 8 | 512 512 2048 20 20 2 2 9 | 512 512 2048 20 20 2 2 10 | 512 512 2048 20 20 2 2 11 | 512 512 2048 20 20 2 2 12 | 512 512 2048 20 20 2 2 13 | 512 512 2048 20 20 2 2 14 | 512 512 2048 20 20 2 2 15 | 512 2048 1 16 | 2048 2048 17 | 2048 2048 1 18 | 2048 2048 19 | 2048 512 1 20 | 512 2560 1 21 | 2560 2560 22 | 23 | 24 | -------------------------------------------------------------------------------- /steps/segmentation/decode_sad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | # This script does Viterbi decoding using a matrix of frame log-likelihoods 7 | # with the columns corresponding to the pdfs. 8 | # It is a wrapper around the binary decode-faster. 9 | 10 | set -e 11 | set -o pipefail 12 | 13 | cmd=run.pl 14 | nj=4 15 | acwt=0.1 16 | beam=8 17 | max_active=1000 18 | transform= # Transformation matrix to apply on the input archives read from output.scp 19 | 20 | . ./path.sh 21 | 22 | . utils/parse_options.sh 23 | 24 | if [ $# -ne 3 ]; then 25 | echo "Usage: $0 " 26 | echo " e.g.: $0 " 27 | exit 1 28 | fi 29 | 30 | graph_dir=$1 31 | nnet_output_dir=$2 32 | dir=$3 33 | 34 | mkdir -p $dir/log 35 | 36 | echo $nj > $dir/num_jobs 37 | 38 | for f in $graph_dir/HCLG.fst $nnet_output_dir/output.scp $extra_files; do 39 | if [ ! 
-f $f ]; then 40 | echo "$0: Could not find file $f" 41 | exit 1 42 | fi 43 | done 44 | 45 | rspecifier="ark:utils/split_scp.pl -j $nj \$[JOB-1] $nnet_output_dir/output.scp | copy-feats scp:- ark:- |" 46 | 47 | # Apply a transformation on the input matrix to combine 48 | # probs from different columns to pseudo-likelihoods 49 | if [ ! -z "$transform" ]; then 50 | rspecifier="$rspecifier transform-feats $transform ark:- ark:- |" 51 | fi 52 | 53 | # Convert pseudo-likelihoods to pseudo log-likelihood 54 | rspecifier="$rspecifier copy-matrix --apply-log ark:- ark:- |" 55 | 56 | decoder_opts+=(--acoustic-scale=$acwt --beam=$beam --max-active=$max_active) 57 | 58 | $cmd JOB=1:$nj $dir/log/decode.JOB.log \ 59 | decode-faster ${decoder_opts[@]} \ 60 | $graph_dir/HCLG.fst "$rspecifier" \ 61 | ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" 62 | -------------------------------------------------------------------------------- /local/data/normalize_text.py: -------------------------------------------------------------------------------- 1 | import sys,re 2 | from number2chinese import * 3 | 4 | not_in_word=[ '`', '÷', '×', '≠', '<', '>', '|', '°', '┬', '┐', '├', '┼', '┤', '└', '┴', '│', '¯', '-', ';', '!', '¿', '·', '‘', '’', '"', '(', ')', '[', ']', '{', '}', '§', '®', '™', '@', '$', '€', '*', '&', '&&', '&&&', '±', '━', '←', '→', '↑', '↓', '♪', '╱', '╲', '◢', '◣', 'ˋ', '▁', '\x1b', '\x7f', '\x80', '¼', '½', '-', 'Á', 'À', 'Â', 'Å', 'Ä', 'Ā','( ','˙','!', '(', ')', '-', '.', ':', '<', '>', '·', 'β', '—', '•', '℃', '。', '《', '》', 'ㄅ', 'ㄆ', 'ㄇ', 'ㄈ', 'ㄔ', 'ㄙ', 'ㄞ', 'ㄟ', '一', '\ue015', '\ue028', '\ufeff', '.', ':', 'C', 'D', 'E', 'I', 'K', 'T'] 5 | 6 | if __name__ == '__main__': 7 | texts_path = sys.argv[1] 8 | words_path = sys.argv[2] 9 | 10 | words = [] 11 | with open(words_path,'r',encoding='utf-8') as f: 12 | for line in f: 13 | line = line.rstrip() 14 | words.append(line) 15 | words = set(words) 16 | 17 | with open(texts_path,'r',encoding='utf-8') as f: 18 | for line in f: 19 | line = line.rstrip() 20 | tokens = line.split() 21 | new_line = '' 22 | for token in tokens: 23 | if re.match('^[0-9]+$',token): 24 | if len(token) > 15: 25 | continue 26 | token = to_chinese(int(token)) 27 | if token in not_in_word : 28 | continue 29 | if token not in words: 30 | if len(re.findall(u'[\u4e00-\u9fff]+', token)) != 0: 31 | if len(token) > 1: 32 | token = ' '.join(token) 33 | new_line = new_line + token + ' ' 34 | print(new_line) 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /utils/remove_data_links.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This program searches within a directory for soft links that 4 | # appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory, 5 | # and it removes both the soft links and the things they point to. 6 | # for instance, if you have a soft link 7 | # foo/egs/1.1.egs -> storage/2/1.1.egs 8 | # it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs. 9 | 10 | ret=0 11 | 12 | dry_run=false 13 | 14 | if [ "$1" == "--dry-run" ]; then 15 | dry_run=true 16 | shift 17 | fi 18 | 19 | if [ $# == 0 ]; then 20 | echo "Usage: $0 [--dry-run] " 21 | echo "e.g.: $0 exp/nnet4a/egs/" 22 | echo " Removes from any subdirectories of the command-line arguments, soft links that " 23 | echo " appear to have been created by utils/create_data_link.pl, as well as the things" 24 | echo " that those soft links point to. 
Will typically be called on a directory prior" 25 | echo " to 'rm -r' on that directory, to ensure that data that was distributed on other" 26 | echo " volumes also gets deleted." 27 | echo " With --dry-run, just prints what it would do." 28 | fi 29 | 30 | for dir in $*; do 31 | if [ ! -d $dir ]; then 32 | echo "$0: not a directory: $dir" 33 | ret=1 34 | else 35 | for subdir in $(find $dir -type d); do 36 | if [ -d $subdir/storage ]; then 37 | for x in $(ls $subdir); do 38 | f=$subdir/$x 39 | if [ -L $f ] && [[ $(readlink $f) == storage/* ]]; then 40 | target=$subdir/$(readlink $f) 41 | if $dry_run; then 42 | echo rm $f $target 43 | else 44 | rm $f $target 45 | fi 46 | fi 47 | done 48 | fi 49 | done 50 | fi 51 | done 52 | 53 | exit $ret 54 | -------------------------------------------------------------------------------- /local/kaggle/decode_kaggle_simulate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | A_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/A 3 | B_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/B 4 | C_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/C 5 | #B_dir=/data/local/kgb/corpus/kgb/kaggle1/data/wav/B 6 | #C_dir=/data/local/kgb/corpus/kgb/kaggle1/data/wav/C 7 | #test_C_dir=/data/local/kgb/corpus/kgb/kaggle3/data/wav/C 8 | #iflytek_A=/data/local/kgb/Chinese-ASR/0622/iflytek_A 9 | src_dir=./1213_simulate 10 | decode_A=true 11 | 12 | set -e 13 | set -u 14 | set -o pipefail 15 | . path.sh 16 | 17 | mkdir -p $src_dir 18 | 19 | if $decode_A ; then 20 | python3 local/kaggle/get_id_list.py $A_dir $src_dir/idx.json || exit 1; 21 | 22 | #bash local/kaggle/check_sample_rate.sh $A_dir 23 | local/kaggle/decode_from_wav_seperate.sh $A_dir $src_dir/A || exit 1; #select lm itself 24 | 25 | python3 local/kaggle/check_output.py $src_dir/A 26 | #bash local/kaggle/mix_LM_with_A.sh $src_dir/A/output.txt $src_dir/C_lang 27 | #bash local/kaggle/test/decode_test.sh $test_C_dir $src_dir/C_test $src_dir/C_lang 28 | else 29 | #C:choices 把choose lm comment掉 30 | bash local/kaggle/check_sample_rate.sh $C_dir || exit 1; 31 | local/kaggle/decode_from_wav_seperate.sh --choose_lm_file $src_dir/kaggle_simulate_lm $C_dir $src_dir/C || exit 1; 32 | python3 local/kaggle/check_output.py $src_dir/C || exit 1; 33 | #B:question 34 | bash local/kaggle/check_sample_rate.sh $B_dir 35 | local/kaggle/decode_from_wav_seperate.sh --choose_lm_file $src_dir/kaggle_simulate_lm $B_dir $src_dir/B || exit 1; 36 | python3 local/kaggle/check_output.py $src_dir/B || exit 1; 37 | 38 | python3 local/kaggle/merge_json.py $src_dir/A/output.txt $src_dir/B/output.txt $src_dir/C/output.txt $src_dir/idx.json $src_dir/result_kaldi.json 39 | fi 40 | 41 | -------------------------------------------------------------------------------- /steps/segmentation/internal/find_oov_phone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0 5 | 6 | """This script finds the OOV phone by reading the OOV word from 7 | oov.int in the input directory and the lexicon 8 | /phones/align_lexicon.int. 
9 | It prints the OOV phone to stdout, if it can find a single phone 10 | mapping for the OOV word.""" 11 | 12 | import sys 13 | 14 | 15 | def main(): 16 | if len(sys.argv) != 2: 17 | raise RuntimeError("Usage: {0} <lang>".format(sys.argv[0])) 18 | 19 | lang = sys.argv[1] 20 | 21 | oov_int = int(open("{0}/oov.int".format(lang)).readline()) 22 | assert oov_int > 0 23 | 24 | oov_mapped_to_multiple_phones = False 25 | for line in open("{0}/phones/align_lexicon.int".format(lang)): 26 | parts = line.strip().split() 27 | 28 | if len(parts) < 3: 29 | raise RuntimeError("Could not parse line {0} in " 30 | "{1}/phones/align_lexicon.int" 31 | "".format(line, lang)) 32 | 33 | w = int(parts[0]) 34 | if w != oov_int: 35 | continue 36 | 37 | if len(parts[2:]) > 1: 38 | # Try to find a single phone mapping for OOV 39 | oov_mapped_to_multiple_phones = True 40 | continue 41 | 42 | p = int(parts[2]) 43 | print ("{0}".format(p)) 44 | 45 | raise SystemExit(0) 46 | 47 | if oov_mapped_to_multiple_phones: 48 | raise RuntimeError("OOV word found, but is mapped to multiple phones. " 49 | "This is an unusual case.") 50 | 51 | raise RuntimeError("Could not find OOV word in " 52 | "{0}/phones/align_lexicon.int".format(lang)) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chinese-ASR 2 | A Chinese ASR system built on Kaldi 3 | 4 | ## Dependencies: 5 | OpenCC: converts simplified Chinese to traditional Chinese 6 | 7 | https://github.com/yichen0831/opencc-python 8 | 9 | jieba (zh_TW version): traditional Chinese word segmentation tool: 10 | 11 | https://github.com/ldkrsi/jieba-zh_TW 12 | 13 | ## Usage 14 | 15 | 1. Modify the kaldi path in path.sh 16 | 17 | 2. Modify the corpus paths in local/data/corpus_path.sh 18 | 19 | 3. Install Sequitur (G2P), sox and kaldi_lm in kaldi/tools/ 20 | 21 | 4. bash run.sh 22 | 23 | ## Some useful scripts 24 | 25 | 1. LM training and interpolation: local/lm 26 | 27 | 2. Custom WFST for the multiple-choice problems: local/lm/wfst 28 | 29 | -> Forces the outputs into the format " XXX XXX XXX XXX" 30 | 31 | 3. Scripts for training DFSMN: local/nnet 32 | 33 | ## Experiment 34 | 35 | | Model | TOCFL(CER%) | Cyberon_Chinese_test(CER%) | 36 | | ------------- |:--------------:| ----------------------------:| 37 | | mono0a | 97.76 | 100.71 | 38 | | tri1 | 50.55 | 63.64 | 39 | | tri2 | 56.62 | 46.65 | 40 | | tri3 | 34.78 | 46.78 | 41 | | tri4 | 37.02 | 34.02 | 42 | | tri5 | 65.60 | 49.96 | 43 | | tdnn_lstm1 | 18.30 | 24.82 | 44 | | tdnn_lstm(realign) | 15.88 | 22.24 | 45 | | DFSMN(Alibaba) | 11.22 | 12.14 | 46 | -------------------------------------------------------------------------------- /utils/nnet/gen_hamm_mat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2012 Brno University of Technology (author: Karel Vesely) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # ./gen_hamm_mat.py 19 | # script generates diagonal matrix with hamming window values 20 | 21 | from math import * 22 | import sys 23 | 24 | 25 | from optparse import OptionParser 26 | 27 | parser = OptionParser() 28 | parser.add_option('--fea-dim', dest='dim', help='feature dimension') 29 | parser.add_option('--splice', dest='splice', help='applied splice value') 30 | (options, args) = parser.parse_args() 31 | 32 | if(options.dim == None): 33 | parser.print_help() 34 | sys.exit(1) 35 | 36 | dim=int(options.dim) 37 | splice=int(options.splice) 38 | 39 | 40 | #generate the diagonal matrix with hammings 41 | M_2PI = 6.283185307179586476925286766559005 42 | 43 | dim_mat=(2*splice+1)*dim 44 | timeContext=2*splice+1 45 | print '[' 46 | for row in range(dim_mat): 47 | for col in range(dim_mat): 48 | if col!=row: 49 | print '0', 50 | else: 51 | i=int(row/dim) 52 | print str(0.54 - 0.46*cos((M_2PI * i) / (timeContext-1))), 53 | print 54 | 55 | print ']' 56 | 57 | 58 | -------------------------------------------------------------------------------- /local/nnet/augment_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | fbankdir=data/fbank 3 | 4 | . ./path.sh 5 | . ./cmd.sh 6 | . ./utils/parse_options.sh 7 | set -euo pipefail 8 | 9 | 10 | for corpus in cyberon_chinese_test TOCFL train_sp ; do 11 | data=data/$corpus/fbank 12 | data_aug=data/$corpus\_aug/fbank 13 | data_rvb=data/$corpus\_rvb/fbank 14 | if [ ! 
-f $data/reco2dur ] ; then 15 | bash utils/data/get_reco2utt.sh $data || exit 1 16 | fi 17 | 18 | python2 steps/data/augment_data_dir.py --utt-suffix aug --bg-snrs 20:10:5:3:0 --num-bg-noises 1:2 --bg-noise-dir data/noise $data $data_aug 19 | python2 steps/data/reverberate_data_dir.py --prefix rvb --speech-rvb-probability 1 --num-replications 1 \ 20 | --rir-set-parameters data/RIRS_NOISES/simulated_rirs/smallroom/rir_list $data $data_rvb 21 | 22 | name=$corpus\_aug 23 | steps/make_fbank.sh --nj 50 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name $data_aug exp/make_fbank/$name $fbankdir 24 | steps/compute_cmvn_stats.sh --name $name $data_aug exp/make_fbank/$name $fbankdir 25 | 26 | name=$corpus\_rvb 27 | steps/make_fbank.sh --nj 50 --cmd "$train_cmd" --fbank-config conf/fbank.conf --name $name $data_rvb exp/make_fbank/$name $fbankdir 28 | steps/compute_cmvn_stats.sh --name $name $data_rvb exp/make_fbank/$name $fbankdir 29 | 30 | rm -rf ./data/$corpus\_rvb_aug/fbank 31 | utils/combine_data.sh ./data/$corpus\_rvb_aug/fbank $data_aug $data_rvb $data 32 | done 33 | 34 | rm -r exp/tri4a_sp_rvb_aug_ali 35 | cp -r exp/tri4a_sp_ali exp/tri4a_sp_rvb_aug_ali 36 | local/nnet/copy_alignment.sh exp/tri4a_sp_rvb_aug_ali/ 37 | 38 | rm -r exp/tri4a_ali_cyberon_chinese_test_rvb_aug 39 | cp -r exp/tri4a_ali_cyberon_chinese_test exp/tri4a_ali_cyberon_chinese_test_rvb_aug 40 | local/nnet/copy_alignment.sh exp/tri4a_ali_cyberon_chinese_test_rvb_aug 41 | 42 | -------------------------------------------------------------------------------- /utils/data/shift_feats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2016 Vimal Manohar 4 | # 2017 Hossein Hadian 5 | # Apache 2.0 6 | 7 | echo "$0 $@" # Print the command line for logging 8 | if [ -f path.sh ]; then . ./path.sh; fi 9 | . parse_options.sh || exit 1; 10 | 11 | if [ $# != 3 ]; then 12 | echo " Usage: $0 " 13 | echo "e.g.: $0 -1 data/train data/train_fs-1" 14 | echo "The script creates a new data directory with the features modified" 15 | echo "using the program shift-feats with the specified frame-shift." 16 | echo "This program automatically adds the prefix 'fs-' to the" 17 | echo "utterance and speaker names. See also utils/data/shift_and_combine_feats.sh" 18 | exit 1 19 | fi 20 | 21 | frame_shift=$1 22 | srcdir=$2 23 | destdir=$3 24 | 25 | 26 | if [ "$destdir" == "$srcdir" ]; then 27 | echo "$0: this script requires and to be different." 28 | exit 1 29 | fi 30 | 31 | if [ ! -f $srcdir/feats.scp ]; then 32 | echo "$0: no such file $srcdir/feats.scp" 33 | exit 1; 34 | fi 35 | 36 | utt_prefix="fs$frame_shift-" 37 | spk_prefix="fs$frame_shift-" 38 | 39 | mkdir -p $destdir 40 | utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ 41 | $srcdir $destdir 42 | 43 | if grep --quiet "'" $srcdir/feats.scp; then 44 | echo "$0: the input features already use single quotes. Can't proceed." 
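# (Note: the awk command further below wraps each piped feats.scp entry in single
#  quotes when it prepends the shift-feats command, so entries that already contain
#  single quotes could not be nested safely; hence we give up here.)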
45 | exit 1; 46 | fi 47 | 48 | awk -v shift=$frame_shift 'NF == 2 {uttid=$1; feat=$2; qt="";} \ 49 | NF > 2 {idx=index($0, " "); uttid=$1; feat=substr($0, idx + 1); qt="\x27";} \ 50 | NF {print uttid " shift-feats --print-args=false --shift=" shift, qt feat qt " - |";}' \ 51 | $destdir/feats.scp >$destdir/feats_shifted.scp 52 | mv -f $destdir/feats_shifted.scp $destdir/feats.scp 53 | 54 | echo "$0: Done" 55 | 56 | -------------------------------------------------------------------------------- /utils/nnet/gen_splice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2012 Brno University of Technology (author: Karel Vesely) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # ./gen_splice.py 19 | # generates Component 20 | 21 | from math import * 22 | import sys 23 | 24 | 25 | from optparse import OptionParser 26 | 27 | parser = OptionParser() 28 | parser.add_option('--fea-dim', dest='dim_in', help='feature dimension') 29 | parser.add_option('--splice', dest='splice', help='number of frames to concatenate with the central frame') 30 | parser.add_option('--splice-step', dest='splice_step', help='splicing step (frames dont need to be consecutive, --splice 3 --splice-step 2 will select offsets: -6 -4 -2 0 2 4 6)', default='1' ) 31 | (options, args) = parser.parse_args() 32 | 33 | if(options.dim_in == None): 34 | parser.print_help() 35 | sys.exit(1) 36 | 37 | dim_in=int(options.dim_in) 38 | splice=int(options.splice) 39 | splice_step=int(options.splice_step) 40 | 41 | dim_out=(2*splice+1)*dim_in 42 | 43 | print '', dim_out, dim_in 44 | print '[', 45 | 46 | splice_vec = range(-splice*splice_step, splice*splice_step+1, splice_step) 47 | for idx in range(len(splice_vec)): 48 | print splice_vec[idx], 49 | 50 | print ']' 51 | 52 | -------------------------------------------------------------------------------- /local/format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | 4 | if [ -f ./path.sh ]; then . ./path.sh; fi 5 | 6 | silprob=0.5 7 | 8 | arpa_lm=data/local/lm/3gram-mincount/lm_pr4.0.gz 9 | lang_test=data/lang_3small_test 10 | arpa_lm=$1 11 | lang_test=$2 12 | . ./utils/parse_options.sh 13 | 14 | 15 | 16 | [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; 17 | 18 | 19 | rm -r $lang_test 20 | cp -r data/lang $lang_test 21 | 22 | echo $arpa_lm 23 | 24 | gunzip -c "$arpa_lm" | \ 25 | arpa2fst --disambig-symbol=#0 \ 26 | --read-symbol-table=$lang_test/words.txt - $lang_test/G.fst 27 | 28 | 29 | echo "Checking how stochastic G is (the first of these numbers should be small):" 30 | fstisstochastic $lang_test/G.fst 31 | 32 | ## Check lexicon. 33 | ## just have a look and make sure it seems sane. 
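## (For reference: fstprint writes one arc per line in OpenFst text format,
##  roughly "src-state dst-state input-phone output-word [weight]", so piping
##  the lexicon FST through head is enough for a quick sanity check.)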
34 | echo "First few lines of lexicon FST:" 35 | fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head 36 | 37 | echo Performing further checks 38 | 39 | # Checking that G.fst is determinizable. 40 | fstdeterminize $lang_test/G.fst /dev/null || echo Error determinizing G. 41 | 42 | # Checking that L_disambig.fst is determinizable. 43 | fstdeterminize $lang_test/L_disambig.fst /dev/null || echo Error determinizing L. 44 | 45 | # Checking that disambiguated lexicon times G is determinizable 46 | # Note: we do this with fstdeterminizestar not fstdeterminize, as 47 | # fstdeterminize was taking forever (presumbaly relates to a bug 48 | # in this version of OpenFst that makes determinization slow for 49 | # some case). 50 | fsttablecompose $lang_test/L_disambig.fst $lang_test/G.fst | \ 51 | fstdeterminizestar >/dev/null || echo Error 52 | 53 | # Checking that LG is stochastic: 54 | fsttablecompose data/lang/L_disambig.fst $lang_test/G.fst | \ 55 | fstisstochastic || echo LG is not stochastic 56 | 57 | 58 | echo format_data succeeded. 59 | -------------------------------------------------------------------------------- /local/data/merge_json.py: -------------------------------------------------------------------------------- 1 | import os,sys,json 2 | from normalize_utils import * 3 | from parse_choices import parse 4 | 5 | def process_outputs(outputs): 6 | L = read_outputs(outputs) 7 | L2 = [] 8 | for name,trans in L: 9 | idx = int(name[1:].replace('.wav','')) 10 | trans = trans.replace(' ','') 11 | L2.append((idx,trans)) 12 | L2 =sorted(L2, key=lambda s: s[0]) 13 | return L2 14 | def write_d(key,X_list,L): 15 | for idx,value in X_list: 16 | for i,l in enumerate(L): 17 | if l["id"] == idx: 18 | L[i][key] = value 19 | break 20 | return L 21 | 22 | if __name__ == '__main__': 23 | A_outputs = sys.argv[1] 24 | B_outputs = sys.argv[2] 25 | C_outputs = sys.argv[3] 26 | idx_json = sys.argv[4] 27 | result_json = sys.argv[5] 28 | A_list = process_outputs(A_outputs) 29 | if len(A_list) != 1500: 30 | print("len(A_list) = {}".format(len(A_list))) 31 | B_list = process_outputs(B_outputs) 32 | if len(B_list) != 1500: 33 | print("len(B_list) = {}".format(len(B_list))) 34 | C_list = process_outputs(C_outputs) 35 | if len(C_list) != 1500: 36 | print("len(C_list) = {}".format(len(C_list))) 37 | all_idx = json.load(open(idx_json,'r')) 38 | all_idx = sorted(all_idx) 39 | L = [] 40 | for idx in all_idx: 41 | d = {"context":"","question":"","options":["","","",""],"id":idx,"answer":-1} 42 | L.append(d) 43 | C_list_parse = [] 44 | for idx,trans in C_list: 45 | options = parse(trans) 46 | C_list_parse.append((idx,options)) 47 | L = write_d("context",A_list,L) 48 | L = write_d("question",B_list,L) 49 | L = write_d("options",C_list_parse,L) 50 | json.dump(L,open(result_json,'w',encoding='utf8'),indent=4,ensure_ascii=False) 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /steps/libs/nnet3/xconfig/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Johns Hopkins University (Dan Povey) 2 | # 2016 Vijayaditya Peddinti 3 | # 2016 Yiming Wang 4 | # Apache 2.0. 5 | 6 | """This library has classes and methods to form neural network computation graphs, 7 | in the nnet3 framework, using higher level abstractions called 'layers' 8 | (e.g. sub-graphs like LSTMS ). 
9 | 10 | Note : We use the term 'layer' though the computation graph can have a highly 11 | non-linear structure as, other terms such as nodes/components have already been 12 | used in C++ codebase of nnet3. 13 | 14 | This is basically a config parser module, where the configs have very concise 15 | descriptions of a neural network. 16 | 17 | This module has methods to convert the xconfigs into a configs interpretable by 18 | nnet3 C++ library. 19 | 20 | It generates three different configs: 21 | 'init.config' : which is the config with the info necessary for computing 22 | the preconditioning matrix i.e., LDA transform 23 | e.g. 24 | input-node name=input dim=40 25 | input-node name=ivector dim=100 26 | output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear 27 | 28 | 'ref.config' : which is a version of the config file used to generate 29 | a model for getting left and right context (it doesn't read 30 | anything for the LDA-like transform and/or 31 | presoftmax-prior-scale components) 32 | 33 | 'final.config' : which has the actual config used to initialize the model used 34 | in training i.e, it has file paths for LDA transform and 35 | other initialization files 36 | """ 37 | 38 | 39 | __all__ = ["utils", "layers", "parser"] 40 | -------------------------------------------------------------------------------- /steps/conf/prepare_word_categories.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2015 Brno University of Technology (author: Karel Vesely) 4 | # Apache 2.0 5 | 6 | import sys 7 | 8 | from optparse import OptionParser 9 | desc = """ 10 | Prepare mapping of words into categories. Each word with minimal frequency 11 | has its own category, the rest is merged into single class. 12 | """ 13 | usage = "%prog [opts] words.txt ctm category_mapping" 14 | parser = OptionParser(usage=usage, description=desc) 15 | parser.add_option("--min-count", help="Minimum word-count to have a single word category. 
[default %default]", type='int', default=20) 16 | (o, args) = parser.parse_args() 17 | 18 | if len(args) != 3: 19 | parser.print_help() 20 | sys.exit(1) 21 | words_file, text_file, category_mapping_file = args 22 | 23 | if text_file == '-': text_file = '/dev/stdin' 24 | if category_mapping_file == '-': category_mapping_file = '/dev/stdout' 25 | 26 | # Read the words from the 'tra' file, 27 | with open(text_file) as f: 28 | text_words = [ l.split()[1:] for l in f ] 29 | 30 | # Flatten the array of arrays of words, 31 | import itertools 32 | text_words = list(itertools.chain.from_iterable(text_words)) 33 | 34 | # Count the words (regardless if correct or incorrect), 35 | word_counts = dict() 36 | for w in text_words: 37 | if w not in word_counts: word_counts[w] = 0 38 | word_counts[w] += 1 39 | 40 | # Read the words.txt, 41 | with open(words_file) as f: 42 | word_id = [ l.split() for l in f ] 43 | 44 | # Append the categories, 45 | n=1 46 | word_id_cat=[] 47 | for word, idx in word_id: 48 | cat = 0 49 | if word in word_counts: 50 | if word_counts[word] > o.min_count: 51 | cat = n; n += 1 52 | word_id_cat.append([word, idx, str(cat)]) 53 | 54 | # Store the mapping, 55 | with open(category_mapping_file,'w') as f: 56 | f.writelines([' '.join(record)+'\n' for record in word_id_cat]) 57 | -------------------------------------------------------------------------------- /local/lm/wfst/run_wfst.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | . path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | model_dir=exp/tri4a 8 | LM=data/LM 9 | text=data/text 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | 13 | #LM training 14 | mkdir -p $LM/3gram 15 | #PYTHONENCODING=utf-8 python3 local/lm/get_all_choices.py #> $wfst/kaggle12_C.txt 16 | ngram-count -text $text/kaggle1234_C.txt -lm $LM/3gram/kaggle1234_C.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 17 | ngram-count -text $text/kaggle12345_C.txt -lm $LM/3gram/kaggle12345_C.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 18 | ngram-count -text $text/mix.txt -lm $LM/3gram/mix.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 19 | 20 | 21 | ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 22 | 23 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle1234_C.lm $LM/3gram/mix.lm \ 24 | $LM/3gram/kaggle12345_C.lm $text/kaggle5_C.txt $LM/3gram/ori_C_10.0.lm 25 | 26 | ( 27 | lm=$LM/3gram/ori_C_10.0.lm 28 | lang_test=./data/wfst/lang_test_pr10_C 29 | graph_dir=exp/tri4a/graph_wfst_pr10_C 30 | #G compilation and check L and G stochastic 31 | local/kaggle/wfst/format_data.sh $lm $lang $lang_test 32 | if false ; then 33 | #Choice fst compilation 34 | local/kaggle/wfst/generate_choice_fst.sh $lang_test/words.txt $lang_test/choice.fst 35 | 36 | 37 | #compose choice.fst and G.fst 38 | mv $lang_test/G.fst $lang_test/G_head.fst 39 | fsttablecompose $lang_test/G_head.fst $lang_test/choice.fst | \ 40 | fstdeterminizestar --use-log=true | \ 41 | fstminimizeencoded > $lang_test/G.fst 42 | fi 43 | 44 | #compose HCLG(choice) 45 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 46 | ) 47 | 48 | wait 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /steps/cleanup/make_utterance_fsts.pl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | # makes unigram decoding-graph FSTs specific to each utterance, where the 5 | # supplied top-n-words list together with the supervision text of the utterance are 6 | # combined. 7 | 8 | if (@ARGV != 1) { 9 | print STDERR "** Warning: this script is deprecated and will be removed. See\n" . 10 | "** steps/cleanup/make_biased_lm_graphs.sh.\n" . 11 | "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" . 12 | "e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" . 13 | " make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ... \n"; 14 | exit(1); 15 | } 16 | 17 | ($top_words_file) = @ARGV; 18 | 19 | open(F, "<$top_words_file") || die "opening $top_words_file"; 20 | 21 | %top_word_probs = ( ); 22 | 23 | while(<F>) { 24 | @A = split; 25 | (@A == 2 && $A[0] > 0.0) || die "Bad line $_ in $top_words_file"; 26 | $A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $_\n"; 27 | $top_word_probs{$A[1]} += $A[0]; 28 | } 29 | 30 | while (<STDIN>) { 31 | @A = split; 32 | $utterance_id = shift @A; 33 | print "$utterance_id\n"; 34 | $num_words = @A + 0; # length of array @A 35 | %word_probs = %top_word_probs; 36 | foreach $w (@A) { 37 | $w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $_"; 38 | $word_probs{$w} += 1.0 / $num_words; 39 | } 40 | foreach $w (keys %word_probs) { 41 | $prob = $word_probs{$w}; 42 | $prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n"; 43 | $cost = -log($prob); 44 | print "0 0 $w $w $cost\n"; 45 | } 46 | $final_cost = -log(1.0 / $num_words); 47 | print "0 $final_cost\n"; 48 | print "\n"; # Empty line terminates the FST in the text-archive format. 49 | } 50 | -------------------------------------------------------------------------------- /local/lm/run_3gram.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | .
path.sh 3 | wfst=./data/wfst 4 | dict=./data/wfst/dict 5 | lang=./data/wfst/lang 6 | tmp_lang=./data/wfst/local/lang 7 | LM=data/LM 8 | text=data/text 9 | stage=1 10 | #modify dict/lexicon.txt lexiconp.txt 11 | #utils/prepare_lang.sh $dict "" $tmp_lang $lang 12 | if [ $stage -le 0 ] ; then 13 | #LM training 14 | mkdir -p $LM/3gram 15 | ngram-count -text $text/mix.txt -lm $LM/3gram/mix_novel.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 16 | ngram-count -text $text/news.txt -lm $LM/3gram/news.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 -prune 2e-7 17 | ngram -lm $LM/3gram/mix_novel.lm -mix-lm $LM/3gram/news.lm -lambda 0.9 -write-lm $LM/3gram/mix.lm 18 | 19 | ngram -lm data/local/lm/3gram-mincount/lm_pr10.0 -vocab $lang/vocabs.txt -limit-vocab -write-lm $LM/3gram/ori_pr10.0.lm 20 | 21 | for x in A B C ; do 22 | ngram-count -text $text/kaggle1234_$x.txt -lm $LM/3gram/kaggle1234_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 23 | ngram-count -text $text/kaggle12345_$x.txt -lm $LM/3gram/kaggle12345_$x.lm -vocab $lang/vocabs.txt -limit-vocab -order 3 24 | 25 | local/lm/mix_lm3_test.sh $LM/3gram/ori_pr10.0.lm $LM/3gram/kaggle1234_$x.lm $LM/3gram/mix.lm \ 26 | $LM/3gram/kaggle12345_$x.lm $text/kaggle5_$x.txt $LM/3gram/ori_$x\_10.0_kaggle12345.lm 27 | done 28 | fi 29 | if [ $stage -le 1 ] ; then 30 | for x in A B C ; do 31 | ( 32 | lm=$LM/3gram/ori_$x\_10.0_kaggle12345.lm 33 | lang_test=./data/wfst/lang_test_pr10_$x 34 | graph_dir=exp/tri4a/graph_pr10_$x 35 | model_dir=exp/tri4a 36 | model_dir=exp/aishell2/tri4_taiwanese 37 | graph_dir=$model_dir/graph_pr10_$x 38 | #G compilation and check L and G stochastic 39 | local/lm/wfst/format_data.sh $lm $lang $lang_test 40 | #compose HCLG(choice) 41 | rm -r $graph_dir 42 | utils/mkgraph.sh $lang_test $model_dir $graph_dir 43 | ) & 44 | done 45 | wait 46 | fi 47 | -------------------------------------------------------------------------------- /local/kaggle/decode_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bas1 2 | A_dir=/data/local/kgb/corpus/kgb/semi-finals-2018/1/data/wav/A 3 | B_dir=/data/local/kgb/corpus/kgb/semi-finals-2018/1/data/wav/B 4 | C_dir=/data/local/kgb/corpus/kgb/semi-finals-2018/1/data/wav/C 5 | A_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/A 6 | B_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/B 7 | C_dir=/data/local/kgb/Chinese-ASR/1213_simulate/wav/C 8 | 9 | C_dir=/data/local/kgb/corpus/kgb/kaggle6/data/wav/C 10 | iflytek_A=/data/local/kgb/Chinese-ASR/1110/iflytek_A 11 | src_dir=./1110 12 | decode_A=false 13 | 14 | set -e 15 | set -u 16 | set -o pipefail 17 | . 
path.sh 18 | 19 | mkdir -p $src_dir 20 | 21 | if $decode_A ; then 22 | 23 | #python3 local/kaggle/get_id_list.py $A_dir $src_dir/idx.json || exit 1; 24 | #bash local/kaggle/choose_lm2.sh $iflytek_A $src_dir/A_lm_test $src_dir/choose_lm || exit 1; 25 | 26 | #bash local/kaggle/check_sample_rate.sh $A_dir 27 | local/kaggle/decode_from_wav_seperate_by_lm.sh $A_dir $src_dir/A $src_dir/choose_lm A || exit 1; 28 | python3 local/kaggle/check_output_by_lm.py $src_dir/A 29 | #bash local/kaggle/mix_LM_with_A.sh $src_dir/A/output.txt $src_dir/C_lang 30 | #bash local/kaggle/test/decode_test.sh $test_C_dir $src_dir/C_test $src_dir/C_lang 31 | else 32 | #C:choices 33 | #bash local/kaggle/check_sample_rate.sh $C_dir || exit 1; 34 | local/kaggle/decode_from_wav_seperate_by_lm.sh $C_dir $src_dir/C_aishell_DFSMN_S_fine_tune $src_dir/choose_lm C || exit 1; 35 | python3 local/kaggle/check_output_by_lm.py $src_dir/C || exit 1; 36 | 37 | #B:question 38 | #bash local/kaggle/check_sample_rate.sh $B_dir 39 | #local/kaggle/decode_from_wav_seperate_by_lm.sh $B_dir $src_dir/B $src_dir/choose_lm B || exit 1; 40 | #python3 local/kaggle/check_output_by_lm.py $src_dir/B || exit 1; 41 | 42 | #python3 local/kaggle/merge_json.py $src_dir/A/output.txt $src_dir/B/output.txt $src_dir/C_n5/output.txt $src_dir/idx.json 5 $src_dir/result_kaldi_n.json 43 | 44 | fi 45 | -------------------------------------------------------------------------------- /steps/segmentation/post_process_sad_to_segments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015-17 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | # This script post-processes the output of steps/segmentation/decode_sad.sh, 7 | # which is in the form of frame-level alignments, into a 'segments' file. 8 | # The alignments must be speech activity detection marks i.e. 1 for silence 9 | # and 2 for speech. 10 | 11 | set -e -o pipefail -u 12 | . ./path.sh 13 | 14 | cmd=run.pl 15 | stage=-10 16 | nj=18 17 | 18 | # The values below are in seconds 19 | frame_shift=0.01 20 | segment_padding=0.2 21 | 22 | . utils/parse_options.sh 23 | 24 | if [ $# -ne 3 ]; then 25 | echo "This script post-processes the output of steps/segmentation/decode_sad.sh, " 26 | echo "which is in the form of frame-level alignments, into kaldi segments. " 27 | echo "The alignments must be speech activity detection marks i.e. 1 for silence " 28 | echo "and 2 for speech." 29 | echo "Usage: $0 " 30 | echo " e.g.: $0 data/dev_aspire_whole exp/vad_dev_aspire" 31 | exit 1 32 | fi 33 | 34 | data_dir=$1 35 | vad_dir=$2 # Alignment directory containing frame-level SAD labels 36 | dir=$3 37 | 38 | mkdir -p $dir 39 | 40 | for f in $vad_dir/ali.1.gz $vad_dir/num_jobs; do 41 | if [ ! 
-f $f ]; then 42 | echo "$0: Could not find file $f" && exit 1 43 | fi 44 | done 45 | 46 | nj=`cat $vad_dir/num_jobs` || exit 1 47 | utils/split_data.sh $data_dir $nj 48 | 49 | utils/data/get_utt2dur.sh $data_dir 50 | 51 | if [ $stage -le 0 ]; then 52 | $cmd JOB=1:$nj $dir/log/segmentation.JOB.log \ 53 | copy-int-vector "ark:gunzip -c $vad_dir/ali.JOB.gz |" ark,t:- \| \ 54 | steps/segmentation/internal/sad_to_segments.py \ 55 | --frame-shift=$frame_shift --segment-padding=$segment_padding \ 56 | --utt2dur=$data_dir/utt2dur - $dir/segments.JOB 57 | fi 58 | 59 | echo $nj > $dir/num_jobs 60 | 61 | for n in $(seq $nj); do 62 | cat $dir/segments.$n 63 | done > $dir/segments 64 | -------------------------------------------------------------------------------- /utils/parallel/limit_num_gpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script functions as a wrapper of a bash command that uses GPUs. 4 | # 5 | # It sets the CUDA_VISIBLE_DEVICES variable so that it limits the number of GPUs 6 | # used for programs. It is necessary for running a job on the grid if the job 7 | # would automatically grab all resources available on the system, e.g. a 8 | # TensorFlow program. 9 | 10 | num_gpus=1 # this variable indicates how many GPUs we will allow the command 11 | # passed to this script to run on. We achieve this by setting the 12 | # CUDA_VISIBLE_DEVICES variable 13 | set -e 14 | 15 | if [ "$1" == "--num-gpus" ]; then 16 | num_gpus=$2 17 | shift 18 | shift 19 | fi 20 | 21 | if ! printf "%d" "$num_gpus" >/dev/null || [ $num_gpus -le 0 ]; then 22 | echo $0: Must pass a positive integer after --num-gpus 23 | echo e.g. $0 --num-gpus 2 local/tfrnnlm/run_lstm.sh 24 | exit 1 25 | fi 26 | 27 | if [ $# -eq 0 ]; then 28 | echo "Usage: $0 [--num-gpus <num-gpus>] <command> [<arg1> ...]" 29 | echo "Runs <command> with its args after setting CUDA_VISIBLE_DEVICES to" 30 | echo "make sure exactly <num-gpus> GPUs are visible (default: 1)." 31 | exit 1 32 | fi 33 | 34 | CUDA_VISIBLE_DEVICES= 35 | num_total_gpus=`nvidia-smi -L | wc -l` 36 | num_gpus_assigned=0 37 | 38 | for i in `seq 0 $[$num_total_gpus-1]`; do 39 | # go over all GPUs, check whether each is idle, and add it to the list if so 40 | if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then 41 | CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i, && num_gpus_assigned=$[$num_gpus_assigned+1] 42 | fi 43 | # once we have enough GPUs, break out of the loop 44 | [ $num_gpus_assigned -eq $num_gpus ] && break 45 | done 46 | 47 | [ $num_gpus_assigned -ne $num_gpus ] && echo Could not find enough idle GPUs && exit 1 48 | 49 | export CUDA_VISIBLE_DEVICES=$(echo $CUDA_VISIBLE_DEVICES | sed "s=,$==g") 50 | 51 | echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES" 52 | "$@" 53 | -------------------------------------------------------------------------------- /local/lm/wfst/format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . path.sh 3 | 4 | 5 | if [ -f ./path.sh ]; then . ./path.sh; fi 6 | 7 | arpa_lm=$1 8 | lang=$2 9 | lang_test=$3 10 | . ./utils/parse_options.sh 11 | 12 | [ !
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; 13 | 14 | 15 | rm -r $lang_test 16 | cp -r $lang $lang_test 17 | 18 | echo $arpa_lm 19 | cat $arpa_lm | arpa2fst --disambig-symbol=#0 \ 20 | --read-symbol-table=$lang_test/words.txt - | fstarcsort --sort_type=olabel > $lang_test/G.fst 21 | #cat $arpa_lm | arpa2fst - | fstprint |\ 22 | # fstcompile --isymbols=$lang_test/words.txt --osymbols=$lang_test/words.txt \ 23 | # --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=olabel > $lang_test/G.fst 24 | 25 | echo "Checking how stochastic G is (the first of these numbers should be small):" 26 | fstisstochastic $lang_test/G.fst 27 | 28 | ## Check lexicon. 29 | ## just have a look and make sure it seems sane. 30 | echo "First few lines of lexicon FST:" 31 | fstprint --isymbols=$lang/phones.txt --osymbols=$lang/words.txt $lang/L.fst | head 32 | 33 | echo Performing further checks 34 | 35 | # Checking that G.fst is determinizable. 36 | fstdeterminize $lang_test/G.fst /dev/null || echo Error determinizing G. 37 | 38 | # Checking that L_disambig.fst is determinizable. 39 | fstdeterminize $lang_test/L_disambig.fst /dev/null || echo Error determinizing L. 40 | 41 | # Checking that disambiguated lexicon times G is determinizable 42 | # Note: we do this with fstdeterminizestar not fstdeterminize, as 43 | # fstdeterminize was taking forever (presumbaly relates to a bug 44 | # in this version of OpenFst that makes determinization slow for 45 | # some case). 46 | fsttablecompose $lang_test/L_disambig.fst $lang_test/G.fst | \ 47 | fstdeterminizestar >/dev/null || echo Error 48 | 49 | # Checking that LG is stochastic: 50 | fsttablecompose $lang/L_disambig.fst $lang_test/G.fst | \ 51 | fstisstochastic || echo LG is not stochastic 52 | 53 | echo format_data succeeded. 54 | 55 | -------------------------------------------------------------------------------- /steps/subset_ali_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Vimal Manohar 4 | # Apache 2.0. 5 | 6 | cmd=run.pl 7 | 8 | if [ -f ./path.sh ]; then . ./path.sh; fi 9 | 10 | . ./utils/parse_options.sh 11 | 12 | if [ $# -ne 4 ]; then 13 | cat < from the 16 | original alignment directory containing alignments for utterances in 17 | . 18 | 19 | The number of split jobs in the output alignment directory is 20 | equal to the number of jobs in the original alignment directory, 21 | unless the subset data directory has too few speakers. 22 | 23 | Usage: $0 [options] 24 | e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali 25 | 26 | Options: 27 | --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. 
28 | EOF 29 | exit 1 30 | fi 31 | 32 | data=$1 33 | subset_data=$2 34 | ali_dir=$3 35 | dir=$4 36 | 37 | nj=$(cat $ali_dir/num_jobs) || exit 1 38 | utils/split_data.sh $data $nj 39 | 40 | mkdir -p $dir 41 | cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true 42 | cp -r $ali_dir/phones $dir 2>/dev/null || true 43 | 44 | $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ 45 | copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ 46 | ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 47 | 48 | for n in `seq $nj`; do 49 | cat $dir/ali_tmp.$n.scp 50 | done > $dir/ali_tmp.scp 51 | 52 | num_spk=$(cat $subset_data/spk2utt | wc -l) 53 | if [ $num_spk -lt $nj ]; then 54 | nj=$num_spk 55 | fi 56 | 57 | utils/split_data.sh $subset_data $nj 58 | $cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ 59 | copy-int-vector \ 60 | "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ 61 | "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 62 | 63 | echo $nj > $dir/num_jobs 64 | 65 | rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp 66 | 67 | exit 0 68 | -------------------------------------------------------------------------------- /utils/ln.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use File::Spec; 3 | 4 | if ( @ARGV < 2 ) { 5 | print STDERR "usage: ln.pl input1 input2 dest-dir\n" . 6 | "This script does a soft link of input1, input2, etc." . 7 | "to dest-dir, using relative links where possible\n" . 8 | "Note: input-n and dest-dir may both be absolute pathnames,\n" . 9 | "or relative pathnames, relative to the current directlory.\n"; 10 | exit(1); 11 | } 12 | 13 | $dir = pop @ARGV; 14 | if ( ! -d $dir ) { 15 | print STDERR "ln.pl: last argument must be a directory ($dir is not a directory)\n"; 16 | exit(1); 17 | } 18 | 19 | $ans = 1; # true. 20 | 21 | $absdir = File::Spec->rel2abs($dir); # Get $dir as abs path. 22 | defined $absdir || die "No such directory $dir"; 23 | foreach $file (@ARGV) { 24 | $absfile = File::Spec->rel2abs($file); # Get $file as abs path. 25 | defined $absfile || die "No such file or directory: $file"; 26 | @absdir_split = split("/", $absdir); 27 | @absfile_split = split("/", $absfile); 28 | 29 | $newfile = $absdir . "/" . $absfile_split[$#absfile_split]; # we'll use this 30 | # as the destination in the link command. 31 | $num_removed = 0; 32 | while (@absdir_split > 0 && $absdir_split[0] eq $absfile_split[0]) { 33 | shift @absdir_split; 34 | shift @absfile_split; 35 | $num_removed++; 36 | } 37 | if (-l $newfile) { # newfile is already a link -> safe to delete it. 38 | unlink($newfile); # "unlink" just means delete. 39 | } 40 | if ($num_removed == 0) { # will use absolute pathnames. 41 | $oldfile = "/" . join("/", @absfile_split); 42 | $ret = symlink($oldfile, $newfile); 43 | } else { 44 | $num_dots = @absdir_split; 45 | $oldfile = join("/", @absfile_split); 46 | for ($n = 0; $n < $num_dots; $n++) { 47 | $oldfile = "../" . $oldfile; 48 | } 49 | $ret = symlink($oldfile, $newfile); 50 | } 51 | $ans = $ans && $ret; 52 | if (! $ret) { 53 | print STDERR "Error linking $oldfile to $newfile\n"; 54 | } 55 | } 56 | 57 | exit ($ans == 1 ? 0 : 1); 58 | 59 | -------------------------------------------------------------------------------- /steps/nnet3/chain/gen_topo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2012 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Generate a topology file. 
This allows control of the number of states in the 6 | # non-silence HMMs, and in the silence HMMs. This is a modified version of 7 | # 'utils/gen_topo.pl' that generates a different type of topology, one that we 8 | # believe should be useful in the 'chain' model. Note: right now it doesn't 9 | # have any real options, and it treats silence and nonsilence the same. The 10 | # intention is that you write different versions of this script, or add options, 11 | # if you experiment with it. 12 | 13 | if (@ARGV != 2) { 14 | print STDERR "Usage: utils/gen_topo.pl <colon-separated-nonsilence-phones> <colon-separated-silence-phones>\n"; 15 | print STDERR "e.g.: utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n"; 16 | exit (1); 17 | } 18 | 19 | ($nonsil_phones, $sil_phones) = @ARGV; 20 | 21 | $nonsil_phones =~ s/:/ /g; 22 | $sil_phones =~ s/:/ /g; 23 | $nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n"; 24 | $sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n"; 25 | 26 | print "<Topology>\n"; 27 | print "<TopologyEntry>\n"; 28 | print "<ForPhones>\n"; 29 | print "$nonsil_phones $sil_phones\n"; 30 | print "</ForPhones>\n"; 31 | # The next two lines may look like a bug, but they are as intended. State 0 has 32 | # no self-loop, it happens exactly once. And it can go either to state 1 (with 33 | # a self-loop) or to state 2, so we can have zero or more instances of state 1 34 | # following state 0. 35 | # We make the transition-probs 0.5 so they normalize, to keep the code happy. 36 | # In fact, we always set the transition probability scale to 0.0 in the 'chain' 37 | # code, so they are never used. 38 | print "<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>\n"; 39 | print "<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>\n"; 40 | print "<State> 2 </State>\n"; 41 | print "</TopologyEntry>\n"; 42 | print "</Topology>\n"; 43 | --------------------------------------------------------------------------------
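For context, a minimal usage sketch of steps/nnet3/chain/gen_topo.pl (not part of the repository; the data/lang_chain path and the use of the *.csl phone lists are illustrative assumptions based on standard Kaldi 'chain' recipes):

# hypothetical invocation: build the 'chain' topology from a copy of the lang directory
cp -r data/lang data/lang_chain                      # assumed starting point
nonsil=$(cat data/lang_chain/phones/nonsilence.csl)  # colon-separated non-silence phone ids
sil=$(cat data/lang_chain/phones/silence.csl)        # colon-separated silence phone ids
steps/nnet3/chain/gen_topo.pl $nonsil $sil > data/lang_chain/topo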