├── .gitignore ├── .vscode ├── settings.json ├── sgm_hyper.json └── sgm_nn.json ├── LICENSE ├── README.md ├── assets ├── G.fst.png ├── JoinAP.png ├── L.fst(no NOISE).png ├── L.fst.png ├── ME2E.png ├── MVDR.png ├── PSD.png ├── T.fst(no NOISE).png ├── T.fst.png ├── TLG.fst.png ├── TLG.png ├── WFST.png ├── den.png ├── h_f.png ├── intellisense.gif ├── logo.png ├── loss.png ├── phonological_feature.png ├── pipeline_rnnt.png └── potential.png ├── cat ├── __init__.py ├── ctc │ ├── __init__.py │ ├── __main__.py │ ├── cal_logit.py │ ├── decode.py │ ├── decode_me2e.py │ ├── fst_decode.sh │ ├── train.py │ ├── train_me2e.py │ ├── train_me2e_chunk.py │ ├── train_me2e_kaldi.py │ ├── train_me2e_kaldi_chunk.py │ └── train_unified.py ├── front │ ├── beamformer_net.py │ ├── conv_beamformer.py │ ├── dnn_beamformer.py │ ├── dnn_wpe_new.py │ ├── filter_net.py │ ├── kaldifbank.py │ ├── log_mel.py │ ├── mask_estimator.py │ ├── multi2mono.py │ ├── nets_utils.py │ └── stft.py ├── lm │ ├── __init__.py │ ├── __main__.py │ ├── ppl.py │ ├── rescore.py │ ├── train.py │ └── trf │ │ ├── __init__.py │ │ ├── model.py │ │ └── train.py ├── rnnt │ ├── __init__.py │ ├── __main__.py │ ├── ctct_decoder.py │ ├── decode.py │ ├── joiner.py │ ├── rnnt_decoder.py │ ├── train.py │ └── train_unified.py ├── shared │ ├── __init__.py │ ├── _constants.py │ ├── coreutils.py │ ├── data.py │ ├── decoder.py │ ├── encoder.py │ ├── layer.py │ ├── manager.py │ ├── manager_simu.py │ ├── manager_wo.py │ ├── scheduler.py │ ├── simu_net.py │ ├── specaug.py │ └── tokenizer.py └── utils │ ├── __init__.py │ ├── avgmodel.py │ ├── cleandir.py │ ├── compat │ ├── repl_am_to_encoder.py │ └── update_transformer_lm.py │ ├── data │ ├── __init__.py │ ├── _data_prep_kaldi.py │ ├── corpus2index.py │ ├── data_prep.py │ ├── data_prep_kaldi.sh │ ├── exclude_corpus.py │ ├── pack_corpus.py │ ├── prep_wds.py │ ├── resolvedata.py │ └── text2nbest.py │ ├── lm │ ├── __init__.py │ ├── interpolate_nbests.py │ └── lmweight_search.py │ ├── parseopt.py │ ├── parseschema.py │ ├── pipeline │ ├── __init__.py │ ├── _constants.py │ ├── asr.py │ ├── common_utils.py │ ├── lm.py │ └── ngram.sh │ ├── plot_tb.py │ ├── tool │ ├── build_ctc_topo.py │ ├── build_decoding_graph.sh │ ├── get_prune_args.py │ ├── pack_audios.py │ ├── pack_audios_multi.py │ ├── prep_bigcidian.sh │ ├── prep_decoding_graph_materials.py │ ├── prep_den_lm.sh │ ├── prep_syllable_converter.py │ └── prep_wlm_lodr.sh │ └── wer.py ├── docs ├── ME2E_ASR_ch.md ├── configure_guide.md ├── contributing.md ├── cuside-array.md ├── cuside.md ├── cuside_ch.md ├── energy-based_LM_training.md ├── guide_for_third_party_tools.md ├── how_to_prepare_large_dataset.md ├── how_to_prepare_large_dataset_ch.md ├── joinap_tutorial_ch.md ├── significance_test.md ├── toolkitworkflow.md ├── whatsnew.md └── yesno_tutorial_ch.md ├── egs ├── IuMien │ ├── README-zh.md │ ├── README.md │ ├── cat │ ├── exp │ │ ├── Mono-phoneme │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp-monitor.png │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Mono-subword │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Mul10-sub-PT-sub-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── 
readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Wav2vec2-cv10-phoneme-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Wav2vec2-cv10-sub-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Whistle-phoneme-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp-monitor.png │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ ├── run.history.sh │ │ │ └── unpack_mulingual_param.py │ │ ├── Whistle-sub-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp-monitor.png │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ └── decode_lm │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── readme.md │ │ │ └── run.history.sh │ ├── exp_data │ │ ├── exp-1 │ │ │ ├── dev_data-1 │ │ │ └── train_data-1 │ │ ├── exp-2 │ │ │ ├── dev-2_data-2 │ │ │ ├── test-2_data-2 │ │ │ └── train-2_data-2 │ │ └── exp-3 │ │ │ ├── dev-3_data-3 │ │ │ ├── test-3_data-3 │ │ │ └── train-3_data-3 │ ├── exp_dict │ │ ├── lexicon │ │ ├── lexicon-2 │ │ └── lexicon-3 │ ├── local │ │ ├── bpe_wfst_run.sh │ │ ├── data_kaldi.sh │ │ ├── extract_fbank.py │ │ ├── fliter_data.py │ │ ├── get_lexicon.py │ │ ├── get_wordlist.py │ │ ├── lexicon_wfst_run.sh │ │ ├── process_model_for_subword_ft.py │ │ ├── split_data.py │ │ └── unpack_mulingual_param.py │ └── utils ├── TEMPLATE │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ │ ├── asr-ctc-crf │ │ │ ├── config.json │ │ │ ├── decode-lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ └── run.sh │ │ ├── asr-ctc-large-corpora │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ ├── run.sh │ │ │ └── tokenizer │ │ │ │ └── hyper-p.json │ │ ├── asr-ctc-lexicon │ │ │ ├── config.json │ │ │ ├── decode_lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ ├── hyper-p.json │ │ │ ├── readme.md │ │ │ └── run.sh │ │ ├── asr-ctc │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt-cuside │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── lm-ebm │ │ │ ├── cfg_aux.json │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ └── run.sh │ │ ├── lm-ngram-word │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ └── lm-nn │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── audio2ark.sh │ │ ├── data.sh │ │ ├── eval_fst_decode.sh │ │ ├── extract_feat.py │ │ ├── lm_data.sh │ │ ├── prep_wds.py │ │ └── significance_test.py │ └── utils ├── aishell │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ │ ├── ctc-crf-cuside │ │ │ ├── config.json │ │ │ ├── decode_lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ ├── run.sh │ │ │ └── run_lexicon.sh │ │ ├── ctc-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ 
├── monitor.png │ │ │ └── readme.md │ │ ├── ebm-lm │ │ │ ├── GN-ELM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── GN-ELM-ML │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── GN-ELM-NCE │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── TRF-LM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── config_trf.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ ├── lm │ │ │ ├── lm-v1-char-5gram │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ │ └── lm-v2-word-3gram │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ ├── rnnt-cuside │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ └── rnnt-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── data.sh │ │ ├── data_kaldi.sh │ │ ├── extract_fbank.py │ │ └── extract_meta_kaldi.sh │ └── utils ├── aishell4 │ ├── README.md │ ├── cat │ ├── exp │ │ ├── Exp1-SingalChannel_E2E │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp10~12-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID+simu_data) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp13-CUSIDE-Array(OOD)+Pre-trained_BE │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ ├── Exp2-SingalChannel_E2E+JT(CUSIDE) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp3-MultiChannel_E2E │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp4-MultiChannel_E2E+JT(CUSIDE-Array) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp5-CUSIDE-Array+real_right_ctx │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp6-CUSIDE-Array+simu_right_ctx │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp7-CUSIDE+Pre-trained_BE │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ ├── Exp8-CUSIDE-Array+Pre-trained_BE │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── hyper-p_ori.json │ │ │ └── readme.md │ │ ├── Exp9-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── hyper-p_ori.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── ctc-e2e-chunk+simu │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ ├── right_context.png │ │ │ └── simu_right_context.png │ │ ├── ctc-e2e-chunk-kaldi │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ └── ctc-e2e-chunk │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ └── tokenizer.tknz │ ├── local │ │ ├── Statistical_Significance_Test_Tools │ │ │ ├── Readme.md │ │ │ ├── Readme_ch.md │ │ │ ├── cer.py │ │ │ ├── p_cal.bash │ │ │ └── significance_test.py │ │ ├── after_data_char_dealing.py │ │ ├── audio2ark_multi.sh │ │ ├── data_char_dealing.py │ │ ├── data_multi.sh │ │ ├── extract_fbank_multi.py │ │ ├── mix_gen.py │ │ └── ori_data_prep.py │ └── utils ├── commonvoice │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp 
│ │ ├── asr-ctc-russian │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt-chinese │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt-russian │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ └── joinap │ │ │ ├── decode-lm-indonesia │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ │ ├── decode-lm-russian │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ │ ├── finetune-id │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mono-indonesia-L │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mono-indonesia-NL │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mono-indonesia-flat │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mul-ru+id-L │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── prep_mul_pv.sh │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── unpack_mulingual_param.py │ ├── local │ │ ├── data.sh │ │ ├── data │ │ │ ├── bipa.txt │ │ │ └── ipa_extend.txt │ │ ├── get_ipa_mapping.py │ │ ├── make_fbank.py │ │ ├── prep_ipa_lexicon.sh │ │ ├── repl_nonIPA.py │ │ └── text_normalize.sh │ └── utils ├── cv-lang10 │ ├── cat │ ├── data │ │ └── metainfo.json │ ├── exp │ │ ├── Crosslingual │ │ │ ├── id │ │ │ │ ├── Multi._phoneme_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── 
Wav2vec-En_ft_phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── Wav2vec-lang10_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ ├── pl │ │ │ │ ├── Multi._phoneme_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ 
├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── Wav2vec-lang10_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ └── readme.md │ │ ├── Monolingual │ │ │ ├── en │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── es │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── fr │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── id │ │ │ │ ├── Mono._phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ ├── it │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── ky │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── nl │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── pl │ │ │ │ ├── Mono._phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── 
monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ ├── readme.md │ │ │ ├── ru │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── sv-SE │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── tr │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── tt │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ └── Multilingual │ │ │ ├── Multi._phoneme_L │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Multi._phoneme_M │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Multi._phoneme_S │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Multi._subword │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Wav2vec-lang10 │ │ │ ├── Wav2vec-lang10.yaml │ │ │ └── readme.md │ │ │ └── readme.md │ ├── lang-process │ │ ├── en │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── es │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── fr │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── id │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── it │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── ky │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── lang-process.md │ │ ├── nl │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── pl │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── ru │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── sv-SE │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── tr │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ └── tt │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ ├── local │ │ ├── data_prep.md │ │ ├── data_prep.sh │ │ ├── eval_fst_decode.sh │ │ ├── expect.py │ │ ├── parseopt.py │ │ └── tools │ │ │ ├── calculate_dur.py │ │ │ ├── char_list.py │ │ │ ├── get_ipa_mapping.py │ │ │ ├── phone_list.py │ │ │ ├── prep_ld.py │ │ │ ├── sample_data.py │ │ │ ├── subset.sh │ │ │ └── unpack_mulingual_param.py │ ├── readme.md │ ├── run.sh │ └── utils ├── libri │ ├── README.md │ ├── cat │ ├── exp │ │ ├── crf-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.jpg │ │ │ └── readme.md │ │ ├── lm │ │ │ └── lm-v1-transformer │ │ │ │ ├── config.json │ │ │ │ ├── 
hyper-p.json │ │ │ │ ├── monitor.jpg │ │ │ │ └── readme.md │ │ └── rnnt-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── data.sh │ │ ├── data_kaldi.sh │ │ ├── extract_fbank.py │ │ ├── extract_meta_kaldi.py │ │ ├── prep_lexicon.sh │ │ └── prep_libri_corpus.sh │ └── utils ├── wenetspeech │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ │ ├── ebm-lm │ │ │ ├── GN-ELM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── TRF-LM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── config_trf.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ ├── lm │ │ │ ├── lm-trans-l │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ │ └── lm-trans-m │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ ├── train_l │ │ │ ├── crf-v1 │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── rnnt-v1 │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ └── train_m │ │ │ ├── crf-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── ctc-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ └── rnnt-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── data_kaldi.sh │ │ ├── extract_meta.py │ │ └── wenetspeech_data_prep.sh │ └── utils └── wsj │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ ├── asr-ctc-crf-phone │ │ ├── config.json │ │ ├── decode_lm │ │ │ ├── config.json │ │ │ └── hyper-p.json │ │ ├── hyper-p.json │ │ ├── monitor.png │ │ ├── readme.md │ │ └── run.sh │ ├── asr-ctc-phone │ │ ├── config.json │ │ ├── decode_lm │ │ │ ├── config.json │ │ │ └── hyper-p.json │ │ ├── hyper-p.json │ │ ├── monitor.png │ │ ├── readme.md │ │ └── run.sh │ └── asr-rnnt-bpe │ │ ├── config.json │ │ ├── hyper-p.json │ │ ├── monitor.png │ │ └── readme.md │ ├── local │ └── data_kaldi.sh │ └── utils ├── install.sh ├── requirements.txt ├── setup.py └── src ├── ctc_crf ├── Makefile ├── binding.cpp ├── ctc_crf │ └── __init__.py ├── gpu_ctc │ ├── CMakeLists.txt │ ├── LICENSE │ ├── README.txt │ ├── contrib │ │ └── moderngpu │ │ │ ├── LICENSE │ │ │ └── include │ │ │ ├── device │ │ │ ├── ctaloadbalance.cuh │ │ │ ├── ctamerge.cuh │ │ │ ├── ctascan.cuh │ │ │ ├── ctasearch.cuh │ │ │ ├── ctasegreduce.cuh │ │ │ ├── ctasegscan.cuh │ │ │ ├── ctasegsort.cuh │ │ │ ├── ctasortedsearch.cuh │ │ │ ├── devicetypes.cuh │ │ │ ├── deviceutil.cuh │ │ │ ├── intrinsics.cuh │ │ │ ├── loadstore.cuh │ │ │ ├── serialsets.cuh │ │ │ └── sortnetwork.cuh │ │ │ ├── mgpudevice.cuh │ │ │ ├── mgpuenums.h │ │ │ └── util │ │ │ └── static.h │ ├── ctc.h │ ├── ctc_entrypoint.cu │ ├── ctc_helper.h │ ├── gpu_ctc.h │ ├── gpu_ctc_kernels.h │ └── hostdevice.h ├── gpu_den │ ├── CMakeLists.txt │ ├── den_calculate.cu │ └── fst_read.cc ├── setup.py └── test │ ├── den_lm.fst │ └── main.py ├── fst-decoder ├── Makefile └── latgen-faster.cc └── g2p-tool └── build.sh /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "json.schemas": [ 3 | { 4 | "fileMatch": [ 5 | "exp/**/config.json" 6 | ], 7 | "url": ".vscode/sgm_nn.json" 8 | }, 9 | { 10 | "fileMatch": [ 11 | "exp/**/hyper-p.json" 12 | ], 13 | "url": ".vscode/sgm_hyper.json" 14 | } 
15 | ] 16 | } -------------------------------------------------------------------------------- /assets/G.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/G.fst.png -------------------------------------------------------------------------------- /assets/JoinAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/JoinAP.png -------------------------------------------------------------------------------- /assets/L.fst(no NOISE).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/L.fst(no NOISE).png -------------------------------------------------------------------------------- /assets/L.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/L.fst.png -------------------------------------------------------------------------------- /assets/ME2E.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/ME2E.png -------------------------------------------------------------------------------- /assets/MVDR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/MVDR.png -------------------------------------------------------------------------------- /assets/PSD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/PSD.png -------------------------------------------------------------------------------- /assets/T.fst(no NOISE).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/T.fst(no NOISE).png -------------------------------------------------------------------------------- /assets/T.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/T.fst.png -------------------------------------------------------------------------------- /assets/TLG.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/TLG.fst.png -------------------------------------------------------------------------------- /assets/TLG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/TLG.png -------------------------------------------------------------------------------- /assets/WFST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/WFST.png 
-------------------------------------------------------------------------------- /assets/den.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/den.png -------------------------------------------------------------------------------- /assets/h_f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/h_f.png -------------------------------------------------------------------------------- /assets/intellisense.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/intellisense.gif -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/logo.png -------------------------------------------------------------------------------- /assets/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/loss.png -------------------------------------------------------------------------------- /assets/phonological_feature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/phonological_feature.png -------------------------------------------------------------------------------- /assets/pipeline_rnnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/pipeline_rnnt.png -------------------------------------------------------------------------------- /assets/potential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/potential.png -------------------------------------------------------------------------------- /cat/__init__.py: -------------------------------------------------------------------------------- 1 | """Transducer/CTC/CRF/LM training/inference tool 2 | """ 3 | 4 | from . import ctc 5 | from . import lm 6 | from . import rnnt 7 | from . import shared 8 | from . import utils 9 | -------------------------------------------------------------------------------- /cat/ctc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 
3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """CTC-related modules 6 | """ 7 | 8 | 9 | from .train import build_model as ctc_builder 10 | 11 | __all__ = [ctc_builder] 12 | -------------------------------------------------------------------------------- /cat/ctc/__main__.py: -------------------------------------------------------------------------------- 1 | from .train import main 2 | main() 3 | -------------------------------------------------------------------------------- /cat/lm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """LM-related modules 6 | """ 7 | 8 | from .train import build_model as lm_builder 9 | 10 | __all__ = [lm_builder] 11 | -------------------------------------------------------------------------------- /cat/lm/__main__.py: -------------------------------------------------------------------------------- 1 | from .train import main 2 | main() 3 | -------------------------------------------------------------------------------- /cat/lm/trf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/lm/trf/__init__.py -------------------------------------------------------------------------------- /cat/rnnt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """RNN-Transducer related module 6 | """ 7 | 8 | 9 | from .train import build_model as rnnt_builder 10 | -------------------------------------------------------------------------------- /cat/rnnt/__main__.py: -------------------------------------------------------------------------------- 1 | from .train import main 2 | main() 3 | -------------------------------------------------------------------------------- /cat/shared/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """Common files and variables for different trainer. 6 | """ 7 | 8 | from . import tokenizer 9 | from . import scheduler 10 | from . import manager 11 | from . import layer 12 | from . import encoder 13 | from . import decoder 14 | from . import data 15 | from . 
import coreutils 16 | from .specaug import SpecAug 17 | from .manager import Manager 18 | -------------------------------------------------------------------------------- /cat/shared/_constants.py: -------------------------------------------------------------------------------- 1 | """Declare all used global constants (like file names)""" 2 | 3 | # number of utterances per file for wds 4 | UTTS_PER_FILE = 2048 5 | 6 | # folder 7 | D_CHECKPOINT = "check" 8 | D_LOG = "log" 9 | D_INFER = "decode" 10 | D_TMP = "tmp" 11 | D_CACHE = ".cache" 12 | 13 | # file 14 | ## configurations 15 | F_NN_CONFIG = "config.json" 16 | F_HYPER_CONFIG = "hyper-p.json" 17 | 18 | ## monitor/log related 19 | F_MONITOR_FIG = "monitor.png" 20 | 21 | ## checkpoint 22 | F_CHECKPOINT_LIST = "checkpoint.list" 23 | 24 | ## others 25 | F_TOKENIZER = 'tokenizer.tknz' 26 | F_TRAINING_INFO = "readme.md" 27 | F_DATAINFO = "data/metainfo.json" 28 | 29 | # schema 30 | SCHEMA_NN_CONFIG = "sgm_nn.json" 31 | SCHEMA_HYPER_CONFIG = "sgm_hyper.json" 32 | -------------------------------------------------------------------------------- /cat/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/__init__.py -------------------------------------------------------------------------------- /cat/utils/compat/repl_am_to_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | In previous versions, the encoder in the CTC trainer was named 'am', so the 3 | parameter names were like 'module.am.xxx'. 4 | 5 | Now 'am' has been renamed to 'encoder' to be consistent with other non-CTC models. 6 | So we need to replace 'am' with 'encoder' to allow loading 7 | models from previous checkpoints.
8 | 9 | Usage: 10 | python utils/compat/repl_am_to_encoder.py /path/to/checkpoint.pt 11 | """ 12 | import torch 13 | import sys 14 | import os 15 | from collections import OrderedDict 16 | 17 | if __name__ == "__main__": 18 | if len(sys.argv[1:]) != 1: 19 | raise RuntimeError("Require one argument to specify the checkpoint.") 20 | 21 | file = sys.argv[1] 22 | assert os.path.isfile(file), file 23 | 24 | check = torch.load(file, "cpu") 25 | m = check["model"] 26 | newdict = OrderedDict() 27 | 28 | for k, v in m.items(): 29 | newdict[k.replace(".am.", ".encoder.")] = v 30 | 31 | check["model"] = newdict 32 | torch.save(check, file) 33 | -------------------------------------------------------------------------------- /cat/utils/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/data/__init__.py -------------------------------------------------------------------------------- /cat/utils/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/lm/__init__.py -------------------------------------------------------------------------------- /cat/utils/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/pipeline/__init__.py -------------------------------------------------------------------------------- /cat/utils/pipeline/_constants.py: -------------------------------------------------------------------------------- 1 | ../../shared/_constants.py -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Guidelines for contributing 2 | 3 | ## Add dependencies 4 | 5 | If your contribution relies on external dependencies (such as the Python module `matplotlib`), you need to ensure that they are properly installed. 6 | 7 | - For Python modules that `cat` relies on, add them to [requirements.txt](../requirements.txt) 8 | - Some modules have special installation processes (like `kenlm`); for these, add the installation to [install.sh](../install.sh), where you'll: 9 | 1. add the new module to the `choices` list of the parser 10 | 2. add the installation process in `exc_install()` 11 | 3. add the uninstallation process in `exc_rm()` -------------------------------------------------------------------------------- /docs/significance_test.md: -------------------------------------------------------------------------------- 1 | # Significance Test 2 | 3 | To see whether the difference between two experiments is significant, we need to conduct a significance test and calculate the $p$-value. If we set the significance level $\alpha=0.05$ (typical values are 0.05, 0.01 and 0.001), then all experiment pairs with a $p$-value less than 0.05 are considered significantly different. 4 | 5 | ```bash 6 | # in egs/xxx/ 7 | python ../TEMPLATE/local/significance_test.py ${result_path1} ${result_path2} --method mp 8 | ``` 9 | 10 | `result_path1` and `result_path2` denote the metric values on all the test samples extracted from the results of the two experiments.
`--method mp` denotes the matched-pairs test; you can also set `--method mc`, which denotes the McNemar test. Note that the metric values can only be 0 or 1 for the McNemar test. 11 | 12 | ### References 13 | 14 | L. Gillick and S. J. Cox, “Some statistical issues in the comparison of speech recognition algorithms,” in International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 1989, pp. 532–535. -------------------------------------------------------------------------------- /egs/IuMien/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 54 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/exp-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-phoneme/monitor/exp-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-phoneme/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-phoneme/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of loss and learning rate in three independent cross-validation runs is shown below.
5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 89.99 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | Phone modeling, using Mien language data for training from scratch. 12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | ### Result 18 | 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 20 | 21 | | | PER | WER | 22 | |---| ---|--- | 23 | | exp1 | 4.04 | 4.44 | 24 | | exp2 | 5.18 | 5.64 | 25 | | exp3 | 3.45 | 3.99 | 26 | | avg-3 | 4.22 | 4.69 | 27 | 28 | 29 | ### Training process monitor 30 | 31 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 32 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/run.history.sh: -------------------------------------------------------------------------------- 1 | # train model 2 | # python utils/pipeline/asr.py exp/Mono-phoneme --sta 1 --sto 3 3 | 4 | # test model per 5 | # python utils/pipeline/asr.py exp/Mono-phoneme --sta 4 --sto 4 6 | 7 | 8 | # test model wer 9 | # First, you need to modify the hyper-p.json file. 10 | # "infer": { 11 | # "bin": "cat.ctc.cal_logit", 12 | # "option": { 13 | # "beam_size": 32, 14 | # "nj": 16, 15 | # "store_ark": true 16 | # } 17 | # }, 18 | # python utils/pipeline/asr.py exp/Mono-phoneme --sta 4 --sto 4 19 | # bash exp/lexicon_wfst_run.sh --exp_dir exp/Mono-phoneme --lm_dir exp/decode_lm --dataset_name test 20 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-subword/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-subword/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-subword/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.22 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | BPE modeling, using Mien language data for training from scratch. 12 | 13 | 14 | ### How to run exp 15 | 16 | Please refer to the [`run.history.sh`](./run.history.sh) 17 | 18 | ### Result 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 20 | 21 | | | WER w/o LM| WER with LM | 22 | |---| ---|--- | 23 | | exp1 | 9.80 | 7.11 | 24 | | exp2 | 10.04 | 7.04 | 25 | | exp3 | 9.29 | 6.46 | 26 | | avg-3 | 9.71 | 6.87 | 27 | 28 | 29 | ### training process monitor 30 | 31 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 32 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/run.history.sh: -------------------------------------------------------------------------------- 1 | # train model 2 | # python utils/pipeline/asr.py exp2/Mono-subword --sta 1 --sto 3 3 | # decode w/o lm 4 | # python utils/pipeline/asr.py exp2/Mono-subword --sta 4 --sto 4 5 | 6 | 7 | # decode with lm 8 | # cal_logit 9 | # First, you need to modify the hyper-p.json file. 
10 | # "infer": { 11 | # "bin": "cat.ctc.cal_logit", 12 | # "option": { 13 | # "beam_size": 32, 14 | # "nj": 16, 15 | # "store_ark": true 16 | # } 17 | # }, 18 | # python utils/pipeline/asr.py exp/Mono-subword --sta 4 --sto 4 19 | # to decode 20 | # bash exp/bpe_wfst_run.sh --exp_dir exp/Mono-subword --lm_dir exp/decode_lm --word_list dict/word_list --dataset_name test 21 | 22 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.22 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | BPE modeling, fine-tuning with Mien language data based on a pretrained model with subwords from cv-10. 
12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | ### Result 18 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 19 | 20 | | | WER w/o LM| WER with LM | 21 | |---| ---|--- | 22 | | exp1 | 4.18 | 3.42 | 23 | | exp2 | 4.79 | 3.92 | 24 | | exp3 | 4.02 | 3.05 | 25 | | avg-3 | 4.33 | 3.46 | 26 | 27 | ### training process monitor 28 | 29 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 30 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.21 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | Phone modeling, fine-tuning with Mien language data based on the Wav2vec2-cv10 pretrained model. 12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | ### Result 18 | 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 
20 | 21 | | | PER | WER | 22 | |---| ---|--- | 23 | | exp1 | 2.40 | 2.71 | 24 | | exp2 | 2.82 | 3.06 | 25 | | exp3 | 2.39 | 2.53 | 26 | | avg-3 | 2.53 | 2.76 | 27 | 28 | ### training process monitor 29 | 30 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 31 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.55 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | BPE modeling, fine-tuning with Mien language data based on the Wav2Vec2-cv10 pretrained model. 12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | 18 | ### Result 19 | 20 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 
21 | 22 | | | WER w/o LM| WER with LM | 23 | |---| ---|--- | 24 | | exp1 | 3.75 | 3.16 | 25 | | exp2 | 4.08 | 3.33 | 26 | | exp3 | 3.47 | 2.69 | 27 | | avg-3 | 3.76 | 3.06 | 28 | 29 | ### training process monitor 30 | 31 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 32 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/run.history.sh: -------------------------------------------------------------------------------- 1 | # Firstly,you could download pretrain model from https://github.com/thu-spmi/CAT/tree/master/egs/cv-lang10/exp/Multilingual/Wav2vec-lang10 2 | # and then we should modify pt model classfier layer 3 | # python local/process_model_for_subword_ft.py --pt_model_path --output_model_path --vocab_size 4 | 5 | 6 | # train 7 | # python utils/pipeline/asr.py exp/Wav2vec2-cv10-sub-FT --sta 1 --sto 3 8 | # decode w/o lm 9 | # python utils/pipeline/asr.py exp/Wav2vec2-cv10-sub-FT --sta 4 --sto 4 10 | 11 | # decode with lm 12 | # First, you need to modify the hyper-p.json file. 13 | # "infer": { 14 | # "bin": "cat.ctc.cal_logit", 15 | # "option": { 16 | # "beam_size": 32, 17 | # "nj": 16, 18 | # "store_ark": true 19 | # } 20 | # }, 21 | # cal_logit 22 | # python utils/pipeline/asr.py exp/Wav2vec2-cv10-sub-FT --sta 4 --sto 4 23 | # to decode 24 | # bash exp/bpe_wfst_run.sh --exp_dir exp/Wav2vec2-cv10-sub-FT --lm_dir exp/decode_lm --word_list dict/word_list-2 --dataset_name test-2_raw 25 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 54 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 10, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 89.99 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | Phone modeling, fine-tuning with Mien language data based on the Whistle-small pretrained model. 12 | 13 | 14 | ### How to run exp 15 | 16 | Please refer to the [`run.history.sh`](./run.history.sh) 17 | 18 | ### Result 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 20 | 21 | | | PER | WER | 22 | |---| ---|--- | 23 | | exp1 | 2.45 | 2.93 | 24 | | exp2 | 2.65 | 3.08 | 25 | | exp3 | 2.13 | 2.38 | 26 | | avg-3 | 2.41 | 2.71 | 27 | 28 | ### training process monitor 29 | 30 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 31 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/monitor/exp-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-sub-FT/monitor/exp-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-sub-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- 
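One reading of the `SchedulerEarlyStop` fields used throughout these configs: train at the given Adam learning rate, multiply it by `gamma` after `n_tol` evaluations without improvement, and stop once it falls below `stop_lr`. This is an assumption about the field semantics rather than a description of CAT's scheduler code; a toy sketch:

```python
# Toy decay-on-plateau schedule mirroring the config fields (assumed semantics).
# The min_step warm-up period before early stopping applies is omitted here.
class EarlyStopLR:
    def __init__(self, lr=3e-5, n_tol=8, gamma=0.5, stop_lr=1e-6):
        self.lr, self.n_tol, self.gamma, self.stop_lr = lr, n_tol, gamma, stop_lr
        self.best, self.bad = float("inf"), 0

    def step(self, dev_loss):
        if dev_loss < self.best:
            self.best, self.bad = dev_loss, 0
        else:
            self.bad += 1
            if self.bad >= self.n_tol:   # n_tol evaluations without improvement
                self.lr *= self.gamma
                self.bad = 0
        return self.lr >= self.stop_lr   # False -> stop training

sched = EarlyStopLR()
for loss in [1.0, 0.9] + [0.95] * 40:
    if not sched.step(loss):
        print("stopped at lr", sched.lr)
        break
```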
/egs/IuMien/exp/Whistle-sub-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-sub-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | * \# of parameters (million): 90.22 4 | * GPU info \[1\] 5 | * \[1\] NVIDIA GeForce RTX 4090 6 | 7 | ### Notes 8 | 9 | BPE modeling, fine-tuning with Mien language data based on the Whistle-small pretrained model. 10 | 11 | ### How to run exp 12 | 13 | Please refer to the [`run.history.sh`](./run.history.sh) 14 | 15 | ### Result 16 | 17 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 18 | 19 | | | WER w/o LM| WER with LM | 20 | |---| ---|--- | 21 | | exp1 | 3.17 | 2.88 | 22 | | exp2 | 3.71 | 3.29 | 23 | | exp3 | 3.04 | 2.70 | 24 | | avg-3 | 3.30 | 2.95 | 25 | 26 | ### training process monitor 27 | 28 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 29 | -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "gram_order": 4, 6 | "f_binlm": "exp2/decode_lm/4gram.arpa", 7 | "num_classes": 1549 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train", 4 | "dev": "dev", 5 | "test": "test" 6 | }, 7 | "tokenizer": { 8 | "type": "SimpleTokenizer", 9 | "option-init": { 10 | "dmap": "/home/dlk/code/asr/cat/egs/MightLJSpeech/dict/word_list" 11 | }, 12 | "|V|": 1549, 13 | "file": "exp2/decode_lm/tokenizer.tknz" 14 | }, 15 | "inference": {}, 16 | "commit": "618a15f70780200cdc42eed3f69f6ce1d61a4e61" 17 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp2/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 3 --sto 3 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 3.2MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/run.history.sh: -------------------------------------------------------------------------------- 1 | # you need to run python 
local/get_wordlist.py to get word_list if you don't have word list 2 | # train tokenizer and pickle data 3 | utils/pipeline/ngram.sh exp/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 1 --sto 2 4 | # train lm 5 | utils/pipeline/ngram.sh exp/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 3 --sto 3 6 | # test lm 7 | # utils/pipeline/ngram.sh exp2/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 4 --sto 4 -------------------------------------------------------------------------------- /egs/IuMien/local/get_wordlist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: DongLukuan (330293721@qq.com) 2 | 3 | text_path = './data/src/train/text' 4 | 5 | word_set = set() 6 | with open(text_path,'r',encoding='utf-8') as f: 7 | for line in f: 8 | try: 9 | ids,sentence = line.strip().split('\t') 10 | except: 11 | print(line.strip().split('\t')) 12 | word_set.update(sentence.split(' ')) 13 | # word_set(set(sentence.split(' '))) 14 | word_set = sorted(word_set) 15 | 16 | word_list_path = './dict/word_list' 17 | with open(word_list_path,'w',encoding='utf-8') as f: 18 | for word in word_set: 19 | f.write(word+'\n') -------------------------------------------------------------------------------- /egs/IuMien/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/TEMPLATE/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/TEMPLATE/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "use_crf": true, 4 | "lamb": 0.1, 5 | "den_lm": "exp/asr-ctc-crf/den_lm.fst", 6 | "decoder": { 7 | "beam_size": 4, 8 | "num_classes": 8 9 | } 10 | }, 11 | "encoder": { 12 | "type": "LSTM", 13 | "kwargs": { 14 | "bidirectional": true, 15 | "dropout": 0.2, 16 | "hdim": 512, 17 | "idim": 80, 18 | "num_layers": 2, 19 | "num_classes": 8, 20 | "with_head": true 21 | } 22 | }, 23 | "scheduler": { 24 | "type": "SchedulerCosineAnnealing", 25 | "kwargs": { 26 | "min_lr": 0.0001, 27 | "stop_step": 200 28 | }, 29 | "optimizer": { 30 | "type": "Adam", 31 | "kwargs": { 32 | "lr": 0.001 33 | }, 34 | "zeroredundancy": true 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/decode-lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-crf/decode-lm/3gram.bin", 6 | "gram_order": 3, 7 | "num_classes": 8 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/decode-lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno", 4 | "dev": "yesno", 5 | "test": "yesno" 6 | }, 7 | "tokenizer": { 8 | "|V|": 8, 9 | "file": "exp/asr-ctc-crf/tokenizer.tknz" 10 | }, 11 | "inference": {} 12 | } 
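The `NGram` decoder config above points at a binary LM built by `utils/pipeline/ngram.sh`; the "probing" data structure mentioned in the n-gram readmes indicates a KenLM binary, so it can be inspected directly with the `kenlm` Python bindings. A small query sketch (assumes the binary has already been built and `kenlm` is installed; the sentence is just a toy example):

```python
# Query the binary n-gram LM referenced by f_binlm in the decode-lm config.
import kenlm

lm = kenlm.Model("exp/asr-ctc-crf/decode-lm/3gram.bin")
print(lm.score("yes no yes no", bos=True, eos=True))  # log10 probability of the sentence
print(lm.perplexity("yes no yes no"))                 # per-token perplexity
```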
-------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-ctc-crf/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 8.74 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * example of training CTC-CRF model, including den-lm preparation. 12 | 13 | ### Result 14 | ``` 15 | yesno %SER 100.00 | %WER 66.04 [ 317 / 480, 0 ins, 302 del, 15 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "decoder": { 4 | "beam_size": 4, 5 | "num_classes": 4 6 | } 7 | }, 8 | "encoder": { 9 | "type": "LSTM", 10 | "kwargs": { 11 | "bidirectional": true, 12 | "proj_size": 128, 13 | "hdim": 256, 14 | "idim": 80, 15 | "num_layers": 3, 16 | "num_classes": 4 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "SchedulerCosineAnnealing", 21 | "kwargs": { 22 | "min_lr": 1e-05, 23 | "stop_step": 300 24 | }, 25 | "optimizer": { 26 | "type": "Adam", 27 | "kwargs": { 28 | "lr": 0.001 29 | } 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "dev": "yesno", 4 | "test": "yesno" 5 | }, 6 | "tokenizer": { 7 | "type": "SentencePieceTokenizer", 8 | "file": "exp/asr-ctc-large-corpora/tokenizer.tknz" 9 | }, 10 | "env": { 11 | "CUDA_VISIBLE_DEVICES": "0" 12 | }, 13 | "train": { 14 | "bin": "cat.ctc.train", 15 | "option": { 16 | "amp": false, 17 | "batch_size": 1, 18 | "eval_error_rate": true, 19 | "ld": "data/wds/10_1000/*.tar", 20 | "check_freq": 120 21 | } 22 | }, 23 | "inference": { 24 | "infer": { 25 | "bin": "cat.ctc.decode", 26 | "option": { 27 | "beam_size": 16, 28 | "nj": 2 29 | } 30 | }, 31 | "er": {} 32 | }, 33 | "commit": "9bb2af8441e590ebf522e24924284f8f994c54c7" 34 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-ctc-large-corpora/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 2.21 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * this template shows how to deal with very large corpora. 
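The `ld` option in the hyper-parameters above streams training samples from tar shards written by `local/prep_wds.py`, so the full set never has to fit in memory. Assuming those shards follow the WebDataset convention (which the script name suggests, but is not confirmed here), they can be iterated lazily like this; the keys inside each sample depend on how the packing script writes them:

```python
# Sketch: iterate sharded training data lazily (WebDataset-format assumption).
import glob
import webdataset as wds

shards = sorted(glob.glob("data/wds/10_1000/*.tar"))  # same pattern as the "ld" option
dataset = wds.WebDataset(shards)
for i, sample in enumerate(dataset):
    print(sample["__key__"], sorted(sample.keys()))   # per-sample entries
    if i == 2:
        break
```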
12 | 13 | ### Result 14 | ``` 15 | yesno %SER 100.00 | %WER 64.79 [ 311 / 480, 0 ins, 300 del, 11 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This is an example of processing and training 4 | # ... very large corpora. In this example, we 5 | # ... assume the 'train' set is too large to fit into memory. 6 | set -e -u 7 | 8 | dir=$(dirname $0) 9 | 10 | [ ! -f $dir/.processed_data.done ] && { 11 | bash local/data.sh 12 | 13 | python local/prep_wds.py >/dev/null || exit 1 14 | 15 | touch $dir/.processed_data.done 16 | } 17 | 18 | # train tokenizer 19 | python utils/pipeline/asr.py \ 20 | $dir/tokenizer \ 21 | --sto 1 || exit 1 22 | 23 | # finish following steps 24 | # NOTE: 25 | # with --ld in train:option, the epoch id will always 26 | # be 1. However, you can estimate the #epochs 27 | # according to #steps by 28 | # #epochs = #steps * batch_size / #total_utts 29 | python utils/pipeline/asr.py \ 30 | $dir --sta 2 || exit 1 31 | 32 | echo "$0 done" 33 | exit 0 34 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/tokenizer/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno" 4 | }, 5 | "tokenizer": { 6 | "type": "SentencePieceTokenizer", 7 | "option-train": { 8 | "model_type": "word", 9 | "use_all_vocab": true, 10 | "vocab_size": 4, 11 | "model_prefix": "sentencepiece/yesno_word/spm" 12 | }, 13 | "file": "exp/asr-ctc-large-corpora/tokenizer.tknz" 14 | } 15 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "decoder": { 4 | "beam_size": 16, 5 | "num_classes": 7 6 | } 7 | }, 8 | "encoder": { 9 | "type": "LSTM", 10 | "kwargs": { 11 | "bidirectional": true, 12 | "dropout": 0.2, 13 | "hdim": 512, 14 | "idim": 80, 15 | "num_layers": 2, 16 | "num_classes": 7, 17 | "with_head": true 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerCosineAnnealing", 22 | "kwargs": { 23 | "min_lr": 0.0001, 24 | "stop_step": 200 25 | }, 26 | "optimizer": { 27 | "type": "Adam", 28 | "kwargs": { 29 | "lr": 0.001 30 | }, 31 | "zeroredundancy": true 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-lexicon/decode_lm/2gram.arpa", 6 | "gram_order": 2, 7 | "num_classes": 4 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno" 4 | }, 5 | "tokenizer": { 6 | "type": "SimpleTokenizer", 7 | "option-init": { 8 | "dmap": "exp/asr-ctc-lexicon/local/lexicon.txt", 9 | "read_index_from_file": false 10 | }, 11 | "|V|": 4, 12 | "file": "exp/asr-ctc-lexicon/decode_lm/tokenizer.tknz" 13 | }, 14 | 
"inference": {}, 15 | "commit": "d43b70416911b47882f6f360ec41add206a2fb1d" 16 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 8.74 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * CTC loss with lexicon 12 | 13 | ### Result 14 | ``` 15 | exp/asr-ctc-lexicon/decode/yesno/text_ac1.0_lm0.2_wip0.0.hyp %SER 100.00 | %CER 85.14 [ 1031 / 1211, 0 ins, 1031 del, 0 sub ] 16 | ``` 17 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "decoder": { 4 | "beam_size": 4, 5 | "num_classes": 4 6 | } 7 | }, 8 | "encoder": { 9 | "type": "LSTM", 10 | "kwargs": { 11 | "bidirectional": true, 12 | "proj_size": 128, 13 | "hdim": 256, 14 | "idim": 80, 15 | "num_layers": 3, 16 | "num_classes": 4 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "SchedulerCosineAnnealing", 21 | "kwargs": { 22 | "min_lr": 1e-05, 23 | "stop_step": 300 24 | }, 25 | "optimizer": { 26 | "type": "Adam", 27 | "kwargs": { 28 | "lr": 0.001 29 | } 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno", 4 | "dev": "yesno", 5 | "test": "yesno", 6 | "filter": "10:2000" 7 | }, 8 | "tokenizer": { 9 | "type": "SentencePieceTokenizer", 10 | "option-train": { 11 | "model_type": "word", 12 | "use_all_vocab": true, 13 | "vocab_size": 4, 14 | "model_prefix": "sentencepiece/yesno_word/spm" 15 | } 16 | }, 17 | "env": { 18 | "CUDA_VISIBLE_DEVICES": "0" 19 | }, 20 | "train": { 21 | "bin": "cat.ctc.train", 22 | "option": { 23 | "amp": false, 24 | "batch_size": 1, 25 | "eval_error_rate": true 26 | } 27 | }, 28 | "inference": { 29 | "infer": { 30 | "bin": "cat.ctc.decode", 31 | "option": { 32 | "beam_size": 16, 33 | "nj": 2 34 | } 35 | }, 36 | "er": {} 37 | } 38 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-ctc/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 2.21 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * template of training a ctc model on yesno corpora. 
12 | 13 | ### Result 14 | ``` 15 | yesno %SER 100.00 | %WER 55.83 [ 268 / 480, 0 ins, 241 del, 27 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt-cuside/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-rnnt-cuside/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt-cuside/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 4.90 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * chunk and streaming experiment 12 | * use LSTM as encoder and there is no subsampling in the encoder. 13 | 14 | ### Result 15 | ``` 16 | yesno %SER 100.00 | %WER 67.92 [ 326 / 480, 0 ins, 317 del, 9 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno", 4 | "dev": "yesno", 5 | "test": "yesno", 6 | "filter": "10:2000" 7 | }, 8 | "tokenizer": { 9 | "type": "SentencePieceTokenizer", 10 | "option-train": { 11 | "model_type": "char", 12 | "use_all_vocab": true, 13 | "vocab_size": 8, 14 | "model_prefix": "sentencepiece/yesno_char/spm" 15 | } 16 | }, 17 | "env": { 18 | "CUDA_VISIBLE_DEVICES": "0" 19 | }, 20 | "train": { 21 | "bin": "cat.rnnt.train", 22 | "option": { 23 | "amp": true, 24 | "batch_size": 4, 25 | "check_freq": 50 26 | } 27 | }, 28 | "inference": { 29 | "infer": { 30 | "bin": "cat.rnnt.decode", 31 | "option": { 32 | "beam_size": 16, 33 | "cpu": true, 34 | "nj": 4 35 | } 36 | }, 37 | "er": {} 38 | } 39 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-rnnt/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 3.20 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * template experiment using RNN-T loss to train on the yesno data. 
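For orientation, the shape logic behind RNN-T training is that encoder frames and prediction-network states are combined into a T x (U+1) x V joint lattice before the transducer loss. The sketch below is the generic formulation, not necessarily the exact joiner in `cat/rnnt/joiner.py`:

```python
# Toy sketch of the RNN-T joint network shapes (generic RNN-T).
import torch

T, U, H, V = 6, 3, 8, 4          # frames, label length, hidden dim, vocab (incl. blank)
enc = torch.randn(T, H)          # acoustic encoder output
pred = torch.randn(U + 1, H)     # prediction network output (with start state)
joint = torch.tanh(enc[:, None, :] + pred[None, :, :])   # broadcast to (T, U+1, H)
logits = torch.nn.Linear(H, V)(joint)
print(logits.shape)              # torch.Size([6, 4, 4])
```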
12 | 13 | ### Result 14 | ``` 15 | yesno %SER 98.33 | %WER 48.96 [ 235 / 480, 25 ins, 25 del, 185 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/cfg_aux.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "CausalTransformer", 4 | "kwargs": { 5 | "attn_dropout": 0.1, 6 | "dim_hid": 320, 7 | "num_classes": 4000, 8 | "num_head": 8, 9 | "num_layers": 6 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "sumtargetlogit", 8 | "config_ebm_model": "exp/lm-ebm/cfg_aux.json", 9 | "config_noise_model": "exp/lm-ebm/cfg_aux.json", 10 | "tokenizer_path": "exp/lm-ebm/tokenizer.tknz", 11 | "bert_tokenizer": false 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "SchedulerNoam", 16 | "kwargs": { 17 | "dim_model": 768, 18 | "peak_factor": 0.01, 19 | "warmup_step": 100, 20 | "stop_step": 500 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "kwargs": { 25 | "lr": 0.0008 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/local-lm/libri-part.train", 4 | "dev": "data/local-lm/libri-part.dev", 5 | "test": [ 6 | "data/local-lm/libri-part.dev" 7 | ], 8 | "packing-text-lm": { 9 | "nj": 8, 10 | "truncate": 128, 11 | "prune_shorter": 4 12 | } 13 | }, 14 | "tokenizer": { 15 | "type": "SentencePieceTokenizer", 16 | "option-train": { 17 | "model_type": "bpe", 18 | "vocab_size": 4000, 19 | "model_prefix": "sentencepiece/temp-bpe4000/spm" 20 | } 21 | }, 22 | "env": { 23 | "CUDA_VISIBLE_DEVICES": "0" 24 | }, 25 | "train": { 26 | "bin": "cat.lm.trf.train", 27 | "option": { 28 | "amp": true, 29 | "batch_size": 16, 30 | "check-freq": 100, 31 | "grad-norm": 5.0 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/lm-ebm/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 20.58 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * An example of training energy-based language model. 
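Behind the `dnce` method is the standard noise-contrastive estimation objective — a binary discrimination between training sentences and sentences drawn from the noise model — with DNCE additionally updating the noise model during training. A purely numeric sketch of the plain NCE loss with `noise_rate` ν = 1 (toy log-probabilities, not CAT's exact implementation):

```python
# NCE loss for one data sentence and one noise sentence (nu = noise_rate).
import math

def nce_loss(logp_model_data, logp_noise_data, logp_model_noise, logp_noise_noise, nu=1.0):
    # P(data | x) = p_model(x) / (p_model(x) + nu * p_noise(x)), in log space
    def post_data(lm, ln):
        return 1.0 / (1.0 + nu * math.exp(ln - lm))
    loss = -math.log(post_data(logp_model_data, logp_noise_data))                 # data term
    loss += -nu * math.log(1.0 - post_data(logp_model_noise, logp_noise_noise))   # noise term
    return loss

print(nce_loss(-40.0, -45.0, -50.0, -44.0))  # small loss: model already separates the two
```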
12 | 13 | 14 | | training process | 15 | |:-----------------------:| 16 | |![tb-plot](./monitor.png)| 17 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u 3 | 4 | dir=$(dirname $0) 5 | 6 | bash local/lm_data.sh 7 | 8 | python utils/pipeline/lm.py $dir --sto 3 || exit 1 9 | 10 | echo "$0 done" && exit 0 11 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ngram-word/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm-ngram-word/3gram.bin", 6 | "gram_order": 3, 7 | "num_classes": 45899 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ngram-word/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/local-lm/libri-part.train", 4 | "test": [ 5 | "data/local-lm/libri-part.dev" 6 | ] 7 | }, 8 | "tokenizer": { 9 | "type": "SentencePieceTokenizer", 10 | "option-train": { 11 | "model_type": "word", 12 | "vocab_size": 45899, 13 | "use_all_vocab": true, 14 | "model_prefix": "sentencepiece/lm-word/spm" 15 | } 16 | }, 17 | "inference": {} 18 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ngram-word/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm-ngram-word -o 3 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 25MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | data: data/local-lm/libri-part.dev 18 | ppl: 436.06 | 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-nn/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "CausalTransformer", 4 | "kwargs": { 5 | "attn_dropout": 0.2, 6 | "dim_hid": 256, 7 | "num_classes": 45899, 8 | "num_head": 2, 9 | "num_layers": 4 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "SchedulerCosineAnnealing", 14 | "kwargs": { 15 | "min_lr": 1e-05, 16 | "stop_step": 2000 17 | }, 18 | "optimizer": { 19 | "type": "Adam", 20 | "zeroredundancy": true, 21 | "kwargs": { 22 | "lr": 0.001 23 | } 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-nn/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/lm-nn/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-nn/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 26.97 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * template experiment training a Transformer LM. 
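The ppl figures reported below (and in the n-gram readmes above) follow the usual definition: perplexity is the exponentiated average negative log-likelihood per token. A toy computation with made-up numbers:

```python
# ppl = exp(-(1/N) * sum_i log p(w_i | history)); toy per-token log-probs only.
import math

token_logprobs = [-6.2, -5.1, -7.4, -5.9]           # natural-log probs of 4 tokens
ppl = math.exp(-sum(token_logprobs) / len(token_logprobs))
print(round(ppl, 2))                                 # ~469 for these values
```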
12 | 13 | ### Result 14 | ``` 15 | data: data/local-lm/libri-part.dev 16 | ppl: 437.87 | 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/TEMPLATE/local/data.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | set -u 3 | 4 | mkdir -p data/local 5 | cwd=$(pwd) 6 | 7 | cd data/local 8 | if [ ! -f .completed ]; then 9 | # download data 10 | [ ! -f yesno.tar.gz ] && 11 | wget https://www.openslr.org/resources/1/waves_yesno.tar.gz -O yesno.tar.gz 12 | 13 | # check downloaded file 14 | [ "$(md5sum yesno.tar.gz | awk '{print $1}')" != "962ff6e904d2df1126132ecec6978786" ] && ( 15 | echo "MD5SUM check failed for yesno.tar.gz, please rm it then re-run the script." 16 | exit 1 17 | ) 18 | 19 | # untar 20 | tar -zxf yesno.tar.gz 21 | touch .completed 22 | else 23 | echo "Found previous processed data. Skip download" 24 | fi 25 | cd $cwd 26 | 27 | [ ! $(command -v python) ] && ( 28 | echo "No python executable found in PATH" 29 | exit 1 30 | ) 31 | 32 | python local/extract_feat.py data/local/waves_yesno/ 33 | echo "FBank spectrum generate done." 34 | 35 | python utils/data/resolvedata.py 36 | 37 | echo "$0 done" 38 | exit 0 39 | -------------------------------------------------------------------------------- /egs/TEMPLATE/local/lm_data.sh: -------------------------------------------------------------------------------- 1 | set -e -u 2 | 3 | dir="data/local-lm" 4 | n_utts=50000 5 | url="https://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz" 6 | 7 | [ $n_utts -le 500 ] && { 8 | echo "#utterances must > 500 for spliting train & dev" >&2 9 | exit 1 10 | } 11 | 12 | mkdir -p $dir 13 | cd $dir 14 | if [ ! -f .completed ]; then 15 | # download and process data 16 | echo "Start downloading corpus, please wait..." 17 | wget $url -q -O - | gunzip -c | head -n $n_utts | 18 | tr '[:upper:]' '[:lower:]' >libri-part.txt 19 | echo "Corpus downloaded. ($n_utts utterances from librispeech corpus)" 20 | 21 | # take the last 500 utterances as dev 22 | head -n $(($n_utts - 500)) libri-part.txt >libri-part.train 23 | tail -n 500 libri-part.txt >libri-part.dev 24 | touch .completed 25 | else 26 | echo "Found previous processed data." 
27 | fi 28 | cd - >/dev/null 29 | 30 | echo "$0 done" 31 | exit 0 32 | -------------------------------------------------------------------------------- /egs/TEMPLATE/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/aishell/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/aishell/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "tyep": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/ctc-crf-cuside/decode_lm/3gram.arpa", 6 | "gram_order": 3, 7 | "num_classes": 137076 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train" 4 | }, 5 | "tokenizer": { 6 | "type": "JiebaTokenizer", 7 | "option-init": { 8 | "userdict": "exp/ctc-crf-cuside/prepare_lexicon/dict.txt" 9 | } 10 | }, 11 | "inference": {} 12 | } 13 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ctc-crf-cuside/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 117.07 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * ctc-crf-cuside training 12 | * use torchaudio for feature extraction w/o CMVN 13 | * This experiment is conducted on the `v3` released code base, and it may slightly differ from the results in [CUSIDE paper](https://arxiv.org/abs/2203.16758). 14 | 15 | ### Result 16 | ``` 17 | test %SER 41.60 | %CER 5.57 [ 5840 / 104765, 137 ins, 105 del, 5598 sub ]/streaming 18 | test %SER 37.56 | %CER 4.99 [ 5228 / 104765, 142 ins, 115 del, 4971 sub ]/non-streaming 19 | ``` 20 | 21 | | training process | 22 | |:-----------------------:| 23 | |![monitor](./monitor.png)| 24 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/run_lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dir=$(dirname $0) 4 | mkdir -p $dir/prepare_lexicon 5 | 6 | cd $dir/prepare_lexicon 7 | [[ ! -f lexicon.txt || ! -f dict.txt ]] && { 8 | [ ! -f resource_aishell.tgz ] && 9 | wget https://www.openslr.org/resources/33/resource_aishell.tgz 10 | 11 | [ ! -f lexicon.txt ] && { 12 | tar -zxf resource_aishell.tgz 13 | mv resource_aishell/lexicon.txt ./ 14 | } 15 | 16 | [ ! 
-f dict.txt ] && ( 17 | # prepare word segmentation dictionary for jieba token 18 | cut dict.txt 20 | ) 21 | } 22 | echo "finished: lexicon and dict" 23 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ctc-v1/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-v1/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 79.50 6 | * GPU info \[6\] 7 | * \[6\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * data prepare 12 | 13 | ```bash 14 | bash local/data_kaldi.sh /path/to/data -use-3way-sp 15 | ``` 16 | 17 | * same encoder as `rnnt-v1` 18 | * batch size of 128 with 500k iters. 19 | 20 | ### Result 21 | ``` 22 | dev %SER 36.97 | %CER 4.65 [ 9544 / 205341, 167 ins, 144 del, 9233 sub ] 23 | test %SER 39.62 | %CER 5.21 [ 5462 / 104765, 88 ins, 125 del, 5249 sub ] 24 | 25 | +lm-v1-char-5gram 5g char 0.25 26 | dev %SER 35.08 | %CER 4.49 [ 9211 / 205341, 137 ins, 165 del, 8909 sub ] 27 | test %SER 37.25 | %CER 4.95 [ 5184 / 104765, 73 ins, 142 del, 4969 sub ] 28 | 29 | +lm-v2-word-3gram 3g word 0.3 30 | dev %SER 33.05 | %CER 4.25 [ 8732 / 205341, 136 ins, 168 del, 8428 sub ] 31 | test %SER 35.37 | %CER 4.72 [ 4948 / 104765, 71 ins, 143 del, 4734 sub ] 32 | ``` 33 | 34 | | training process | 35 | |:-----------------------:| 36 | |![monitor](./monitor.png)| 37 | -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "hidden2scalar-sum", 8 | "config_ebm_model": "exp/ebm-lm/GN-ELM-DNCE/config_ebm.json", 9 | "config_noise_model": "exp/ebm-lm/GN-ELM-DNCE/config_noise.json", 10 | "tokenizer_path": "exp/ebm-lm/GN-ELM-DNCE/tokenizer.tknz", 11 | "bert_tokenizer": true 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "SchedulerNoam", 16 | "kwargs": { 17 | "dim_model": 768, 18 | "peak_factor": 0.3, 19 | "warmup_step": 5000, 20 | "stop_step": 2000 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "kwargs": { 25 | "lr": 0.001 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } 
-------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/GN-ELM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-ML/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM_IS", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "IS", 7 | "energy_func": "sumtokenlogit", 8 | "config_ebm_model": "exp/ebm-lm/GN-ELM-ML/config_ebm.json", 9 | "config_noise_model": "exp/ebm-lm/GN-ELM-DNCE/config_noise.json", 10 | "tokenizer_path": "exp/ebm-lm/GN-ELM-DNCE/tokenizer.tknz", 11 | "bert_tokenizer": true, 12 | "freeze_noise": false 13 | } 14 | }, 15 | "scheduler": { 16 | "type": "SchedulerNoam", 17 | "kwargs": { 18 | "dim_model": 768, 19 | "peak_factor": 0.15, 20 | "warmup_step": 3000, 21 | "stop_step": 5000 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "kwargs": { 26 | "lr": 0.001 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-ML/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertLMHeadModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-ML/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/GN-ELM-ML/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "nce", 7 | "energy_func": "hidden2scalar-sum", 8 | "config_ebm_model": "exp/lm/GN-ELM-NCE/config_ebm.json", 9 | "config_noise_model": "exp/lm/GN-ELM-NCE/config_noise.json", 10 | "check_noise_model": "exp/lm/lm-gpt2/check/best-2.pt", 11 | "tokenizer_path": "exp/lm/GN-ELM-NCE/tokenizer.tknz", 12 | "bert_tokenizer": true 13 | } 14 | }, 15 | "scheduler": { 16 | "type": "SchedulerNoam", 17 | "kwargs": { 18 | "dim_model": 768, 19 | "peak_factor": 0.45, 20 | "warmup_step": 5000, 21 | "stop_step": 2000 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "kwargs": { 26 | "lr": 0.001 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/config_noise.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/GN-ELM-NCE/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/readme.md: -------------------------------------------------------------------------------- 1 | # Train GN-ELM with NCE 2 | 3 | The training and testing process is basically consistent with [Train GN-ELM with DNCE](../GN-ELM-DNCE/). We only explain the differences here. 4 | 5 | ## Notes 6 | 7 | * In NCE training (set `config['decoder']['kwargs']['method']=nce`), the noise model is fixed, so we need a trained language model to initialize the noise model. We use a [finetuned GPT-2](../lm-gpt2/) for initialization, which is specified in `config['decoder']['kwargs']['check_noise_model']`. 8 | 9 | ## Result 10 | 11 | We also try 3 different energy functions, whose results are as follows: 12 | 13 | |CER type | SumTargetLogit | Hidden2Scalar | SumTokenLogit | 14 | | ------- | -------- | ----------- | ----------- | 15 | | in-domain | 3.32 | 3.20 | 3.27 | 16 | | cross-domain| 3.39 | 3.36 | 3.43 | 17 | 18 | The training curve of the best model is shown below. 19 | 20 | | training curve | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/config_trf.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/TRF-LM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/readme.md: -------------------------------------------------------------------------------- 1 | # Train TRF-LM with DNCE 2 | The training and testing process is basically consistent with [Train GN-ELM with DNCE](../GN-ELM-DNCE/). We only explain the differences here. 
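Both the NCE and DNCE configs above use the same pretrained Chinese GPT-2 (`uer/gpt2-chinese-cluecorpussmall`) as the noise model, loaded with a BERT-style tokenizer (`bert_tokenizer: true`). A sketch of scoring one sentence with that noise LM through HuggingFace Transformers, independent of CAT's own wrappers (downloading the checkpoint is assumed):

```python
# Score a sentence with the noise LM used by these EBM recipes.
import torch
from transformers import BertTokenizer, GPT2LMHeadModel

tok = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
lm = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall").eval()

ids = tok("今天天气很好", return_tensors="pt")["input_ids"]
with torch.no_grad():
    out = lm(ids, labels=ids)                # .loss is mean cross-entropy per predicted token
print(float(-out.loss) * (ids.shape[1] - 1))  # approximate total log-likelihood in nats
```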
3 | ## Notes 4 | * **In stage 2 (data packing)**, for training TRF, we need to calculate the length distribution after packing data and before training. 5 | ``` 6 | python -m cat.lm.trf.prep_feats exp/TRF-LM-DNCE/pkl/train.pkl exp/TRF-LM-DNCE/linfo.pkl 7 | ``` 8 | 9 | ## Result 10 | We also try 3 different energy functions, whose results are as follows: 11 | |CER type | SumTargetLogit | Hidden2Scalar | SumTokenLogit | 12 | | ------- | -------- | ----------- | ----------- | 13 | | in-domain | 3.11 | 3.13 | 3.21 | 14 | | cross-domain| 3.44 | 3.39 | 3.47 | 15 | 16 | The training curve of the best model is shown below. 17 | | training curve | 18 | |:-----------------------:| 19 | |![monitor](./monitor.png)| -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v1-char-5gram/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-v1-char-5gram/5gram.bin", 6 | "gram_order": 5, 7 | "num_classes": 4232 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v1-char-5gram/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train", 4 | "dev": "dev", 5 | "test": [ 6 | "dev", 7 | "test" 8 | ] 9 | }, 10 | "tokenizer": { 11 | "type": "SentencePieceTokenizer", 12 | "option-train": { 13 | "model_type": "char", 14 | "vocab_size": 4232, 15 | "add_dummy_prefix": false, 16 | "use_all_vocab": true, 17 | "model_prefix": "sentencepiece/aishell_char/spm" 18 | } 19 | }, 20 | "inference": {} 21 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v1-char-5gram/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-v1-char-5gram 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 73MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | data: dev test 18 | ppl: 59.06 | 58.44 | 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v2-word-3gram/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-v2-word-3gram/3gram.klm", 6 | "gram_order": 3, 7 | "num_classes": 498115 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v2-word-3gram/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train", 4 | "test": [ 5 | "dev", 6 | "test" 7 | ], 8 | "lang": "zh-cn" 9 | }, 10 | "tokenizer": { 11 | "type": "JiebaTokenizer", 12 | "option-init": {} 13 | }, 14 | "inference": {} 15 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v2-word-3gram/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-v2-word-3gram -o 3 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 26MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | using jieba default dict produces better results: 18 | Test file: dev.tmp -> ppl: 788.34 19 | Test file: 
test.tmp -> ppl: 840.97 20 | 21 | with bigcidian dict: 22 | ppl ~1000 23 | ``` 24 | -------------------------------------------------------------------------------- /egs/aishell/exp/rnnt-cuside/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/rnnt-cuside/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/rnnt-cuside/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 86.00 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | 13 | ### Result 14 | ``` 15 | test %SER 41.76 | %CER 6.02 [ 6302 / 104765, 225 ins, 255 del, 5822 sub ]/streaming 16 | test %SER 36.97 | %CER 5.12 [ 5369 / 104765, 102 ins, 180 del, 5087 sub ]/non-streaming 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![monitor](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/aishell/exp/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/aishell/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/aishell4/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp1-SingalChannel_E2E/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp1-SingalChannel_E2E/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp1-SingalChannel_E2E/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 20.70 6 | * GPU info \[2\] 7 | * \[2\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * Choose channel 0 as input_channel 12 | 13 | ### Result 14 | ``` 15 | Streaming: 16 | test_raw_ori %SER 98.02 | %CER 55.07 [ 72303 / 131298, 2006 ins, 25601 del, 44696 sub ] 17 | ------------------------- 18 | Non-streaming 19 | test_raw_ori %SER 91.26 | %CER 38.76 [ 50886 / 131298, 4611 ins, 6505 del, 39770 sub ] 20 | 21 | ``` 22 | 23 | | training process | 24 | |:-----------------------:| 25 | |![tb-plot](./monitor.png)| 26 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp10~12-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID+simu_data)/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp10~12-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID+simu_data)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp2-SingalChannel_E2E+JT(CUSIDE)/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp2-SingalChannel_E2E+JT(CUSIDE)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp3-MultiChannel_E2E/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp3-MultiChannel_E2E/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp3-MultiChannel_E2E/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 25.77 6 | * GPU info \[2\] 7 | * \[2\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_raw_ori %SER 98.55 | %CER 56.84 [ 74626 / 131298, 2181 ins, 30414 del, 42031 sub ] 17 | ----------------------- 18 | Non-streaming 19 | test_raw_ori %SER 88.28 | %CER 27.93 [ 36673 / 131298, 3925 ins, 4613 del, 28135 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp4-MultiChannel_E2E+JT(CUSIDE-Array)/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp4-MultiChannel_E2E+JT(CUSIDE-Array)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp5-CUSIDE-Array+real_right_ctx/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp5-CUSIDE-Array+real_right_ctx/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp5-CUSIDE-Array+real_right_ctx/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 25.77 6 | * GPU info \[4\] 7 | * \[4\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_raw_ori %SER 89.57 | %CER 32.51 [ 42688 / 131298, 4416 ins, 5202 del, 33070 sub ] 17 | ----------------------- 18 | Non-streaming 19 | test_raw_ori %SER 89.07 | %CER 31.21 [ 40975 / 131298, 4239 ins, 4902 del, 31834 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp6-CUSIDE-Array+simu_right_ctx/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp6-CUSIDE-Array+simu_right_ctx/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp6-CUSIDE-Array+simu_right_ctx/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 27.64 6 | * GPU info \[4\] 7 | * \[4\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_raw_ori %SER 90.56 | %CER 35.96 [ 47215 / 131298, 4954 ins, 5610 del, 36651 sub ] 17 | ----------------------- 18 | Non-streaming 19 | test_raw_ori %SER 89.39 | %CER 31.70 [ 41623 / 131298, 4432 ins, 4906 del, 32285 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp7-CUSIDE+Pre-trained_BE/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 80.72 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_alimeeting_raw_ori %SER 79.73 | %CER 28.83 [ 17637 / 61184, 546 ins, 8243 del, 8848 sub ] 17 | dev_alimeeting_raw_ori %SER 77.96 | %CER 29.07 [ 5597 / 19256, 189 ins, 2842 del, 2566 sub ] 18 | test_raw_ori %SER 91.67 | %CER 35.72 [ 46899 / 131298, 1445 ins, 30181 del, 15273 sub ] 19 | test_706_array_raw_ori %SER 100.00 | %CER 41.09 [ 415 / 1010, 3 ins, 159 del, 253 sub ] 20 | Non-streaming 21 | test_alimeeting_raw_ori %SER 65.44 | %CER 20.29 [ 12415 / 61184, 467 ins, 4150 del, 7798 sub ] 22 | dev_alimeeting_raw_ori %SER 64.97 | %CER 20.55 [ 3957 / 19256, 175 ins, 1634 del, 2148 sub ] 23 | test_raw_ori %SER 83.03 | %CER 26.42 [ 34689 / 131298, 1734 ins, 18568 del, 14387 sub ] 24 | test_706_array_raw_ori %SER 95.00 | %CER 29.80 [ 301 / 1010, 2 ins, 93 del, 206 sub ] 25 | 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp9-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID)/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp9-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk+simu/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk+simu/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk+simu/right_context.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk+simu/right_context.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk+simu/simu_right_context.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk+simu/simu_right_context.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk-kaldi/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 25.77 6 | * GPU info \[4\] 7 | * \[4\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | - "bf": null, used to load the front-end model 13 | - "am": null, used to load the back-end model 14 | - "unfreeze": null, in partial training, selects the modules that require gradients; not used when null 15 | 16 | ### Result 17 | ``` 18 | - Streaming: 36.68 19 | - Non-streaming: 31.21 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk/tokenizer.tknz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk/tokenizer.tknz -------------------------------------------------------------------------------- /egs/aishell4/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/commonvoice/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/commonvoice/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | Source: [Common Voice Corpus](https://commonvoice.mozilla.org) 4 | 5 | ### Data preparation 6 | 7 | You should first follow the **Common Voice** official guide to download the data. 8 | 9 | Then prepare data with: 10 | 11 | ``` 12 | # Any version of Common Voice data is OK. Here CV-11.0 is used by default 13 | bash local/data.sh /path/to/data -lang xx 14 | ``` 15 | 16 | ### Result 17 | 18 | Performance is evaluated on CER (%). 19 | 20 | 130 hours **Chinese (China)** speech data 21 | 22 | | model | Unit | dev | test | 23 | | ----------------------------- | ----- | ----- | ---- | 24 | | [rnnt](exp/asr-rnnt-chinese/) | char | 18.14 | 17.14 | 25 | 26 | 27 | Performance is evaluated on WER (%).
28 | 29 | 180 hours **Russian** speech data 30 | 31 | | model | Unit | dev | test | 32 | | ----------------------------- | ----- | ----- | ---- | 33 | | [rnnt](exp/asr-rnnt-russian/) | bpe-2k | 6.44 | 8.55 | 34 | | [ctc](exp/asr-ctc-russian/) | bpe-2K | 16.22 | 19.50 | 35 | -------------------------------------------------------------------------------- /egs/commonvoice/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-ctc-russian/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/asr-ctc-russian/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-ctc-russian/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 46.51 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | * please note that `add_dummy_prefix = false` in tokenizer training setting of SentencePiece tokenizer is erroneous, but would probably only introduce minor differences to results. 13 | 14 | ### Result 15 | ``` 16 | dev %SER 59.08 | %WER 16.22 [ 13632 / 84022, 1105 ins, 2330 del, 10197 sub ] 17 | test %SER 63.25 | %WER 19.50 [ 15970 / 81896, 1233 ins, 2868 del, 11869 sub ] 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-chinese/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/asr-rnnt-chinese/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-chinese/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 53.01 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | 13 | ### Result 14 | ``` 15 | dev %SER 66.03 | %CER 18.14 [ 31626 / 174359, 860 ins, 11275 del, 19491 sub ] 16 | test %SER 73.29 | %CER 17.14 [ 29549 / 172400, 975 ins, 4791 del, 23783 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![monitor](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-russian/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/asr-rnnt-russian/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-russian/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add 
your details in Appendix** 4 | 5 | * \# of parameters (million): 50.43 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | * please note that `add_dummy_prefix = false` in tokenizer training setting of SentencePiece tokenizer is erroneous, but would probably only introduce minor differences to results. 13 | 14 | ### Result 15 | ``` 16 | dev %SER 29.11 | %WER 6.44 [ 5412 / 84022, 437 ins, 1061 del, 3914 sub ] 17 | test %SER 33.46 | %WER 8.55 [ 7001 / 81896, 553 ins, 1566 del, 4882 sub ] 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-indonesia/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "num_classes": 12673, 6 | "f_binlm": "exp/joinap/decode-lm-indonesia/3gram.arpa", 7 | "gram_order": 3 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-indonesia/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "id-excluded_train", 4 | "dev": [ 5 | "id-dev" 6 | ], 7 | "test": [ 8 | "id-dev", 9 | "id-test" 10 | ], 11 | "packing-text-lm": { 12 | "nj": 4, 13 | "prune_shorter": 5 14 | } 15 | }, 16 | "tokenizer": { 17 | "type": "SimpleTokenizer", 18 | "option-init": { 19 | "dmap": "data/lang-id/lexicon" 20 | } 21 | }, 22 | "inference": {} 23 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-indonesia/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/debug-decode-lm-id/ -o 3 --arpa 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 3.8MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-russian/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "num_classes": 50171, 6 | "f_binlm": "exp/joinap/decode-lm-russian/3gram.arpa", 7 | "gram_order": 3 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-russian/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "ru-excluded_train", 4 | "dev": [ 5 | "ru-dev" 6 | ], 7 | "test": [ 8 | "ru-dev", 9 | "ru-test" 10 | ], 11 | "packing-text-lm": { 12 | "nj": 4, 13 | "prune_shorter": 5 14 | } 15 | }, 16 | "tokenizer": { 17 | "type": "SimpleTokenizer", 18 | "option-init": { 19 | "dmap": "data/lang-ru/lexicon" 20 | } 21 | }, 22 | "inference": {} 23 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-russian/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/decode-lm-russian -o 3 --arpa 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 17MB 
13 | 14 | perplexity: 15 | 16 | ``` 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/finetune-id/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/finetune-id/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/finetune-id/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.47 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * Use `russian+indonesia` data pretrained model 12 | 13 | ### Result 14 | ``` 15 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 33.24 | %WER 14.93 [ 3278 / 21951, 409 ins, 454 del, 2415 sub ] 16 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 21.28 | %WER 7.63 [ 1654 / 21664, 194 ins, 256 del, 1204 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-L/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mono-indonesia-L/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-L/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.47 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | 16 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 30.32 | %WER 14.35 [ 3149 / 21951, 430 ins, 342 del, 2377 sub ] 17 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 18.62 | %WER 6.92 [ 1500 / 21664, 174 ins, 217 del, 1109 sub ] 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![tb-plot](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-NL/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mono-indonesia-NL/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-NL/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 45.77 6 | * GPU info \[8\] 7 | * \[8\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 31.93 | %WER 15.10 [ 3315 / 21951, 437 ins, 345 del, 2533 sub ] 16 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 19.41 | %WER 7.11 [ 1540 / 21664, 164 ins, 206 del, 1170 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | 
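The `decode-lm-*` readmes above record the n-gram training command (`utils/pipeline/ngram.sh ... -o 3 --arpa`) but leave their perplexity sections empty. The sketch below shows how such a 3-gram ARPA model could be evaluated on held-out text; it assumes the `kenlm` Python bindings are installed and that the text file holds one normalized, space-separated sentence per line. It is only an illustration, not part of CAT's own pipeline.

```python
import kenlm  # assumption: the kenlm Python bindings are installed

# Hypothetical paths -- substitute the experiment and data at hand.
lm_path = "exp/joinap/decode-lm-russian/3gram.arpa"
text_path = "ru-dev.txt"  # one normalized sentence per line

model = kenlm.Model(lm_path)
total_log10, n_tokens = 0.0, 0
with open(text_path, encoding="utf-8") as f:
    for line in f:
        sent = line.strip()
        if not sent:
            continue
        # score() returns log10 P(sentence), including the </s> transition
        total_log10 += model.score(sent, bos=True, eos=True)
        n_tokens += len(sent.split()) + 1  # +1 for </s>

print(f"ppl: {10 ** (-total_log10 / n_tokens):.2f}")
```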
-------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-flat/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mono-indonesia-flat/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-flat/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.46 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 30.20 | %WER 13.64 [ 2994 / 21951, 385 ins, 374 del, 2235 sub ] 16 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 16.89 | %WER 6.25 [ 1353 / 21664, 144 ins, 190 del, 1019 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mul-ru+id-L/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mul-ru+id-L/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mul-ru+id-L/prep_mul_pv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | lang="ru id" 4 | 5 | dlang="data/lang-mul" 6 | mkdir -p $dlang 7 | echo $lang >>$dlang/lang.txt 8 | export LC_ALL=C.UTF-8 9 | 10 | for l in $lang; do 11 | cat data/lang-$l/lexicon 12 | done | sort -k 1,1 -u -s \ 13 | >$dlang/lexicon 14 | 15 | cut <$dlang/lexicon -f 2- | tr ' ' '\n' | sort -u -s >$dlang/phonemes.txt 16 | 17 | [ !
-f local/data/ipa_all.csv ] && { 18 | wget https://raw.githubusercontent.com/dmort27/panphon/master/panphon/data/ipa_all.csv \ 19 | -O local/data/ipa_all.csv 20 | } 21 | python local/get_ipa_mapping.py \ 22 | $dlang/phonemes.txt \ 23 | local/data/ipa_all.csv \ 24 | $dlang/mul-pv.npy || exit 1 25 | 26 | echo "$0 done" && exit 0 27 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mul-ru+id-L/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.47 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | id 16 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 43.52 | %WER 21.99 [ 4827 / 21951, 267 ins, 1249 del, 3311 sub ] 17 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 32.90 | %WER 12.89 [ 2792 / 21664, 147 ins, 790 del, 1855 sub ] 18 | 19 | ru 20 | ru-dev_ac1.0_lm1.0_wip0.0.hyp %SER 31.61 | %WER 7.63 [ 6413 / 84022, 460 ins, 2108 del, 3845 sub ] 21 | ru-test_ac1.0_lm1.0_wip0.0.hyp %SER 36.68 | %WER 9.76 [ 7989 / 81896, 508 ins, 2918 del, 4563 sub ] 22 | ``` 23 | 24 | | training process | 25 | |:-----------------------:| 26 | |![tb-plot](./monitor.png)| 27 | -------------------------------------------------------------------------------- /egs/commonvoice/local/data/ipa_extend.txt: -------------------------------------------------------------------------------- 1 | # russian 2 | oʲ o 3 | iʲ i 4 | ɨʲ ɨ 5 | æʲ æ 6 | yʲ y 7 | aʲ a 8 | eʲ e 9 | ʉʲ ʉ 10 | jʲ j 11 | ɵʲ ɵ 12 | # indonesia 13 | au a u 14 | ai a i 15 | ʊi ʊ i 16 | oi̯ o i̯ -------------------------------------------------------------------------------- /egs/commonvoice/local/text_normalize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 3 | # Text normalize 4 | set -e -u 5 | 6 | for file in $(python -c "import json;\ 7 | print(' '.join(x['trans'] for x in json.load(open('data/metainfo.json', 'r')).values()))"); do 8 | [ ! 
-f $file.bak ] && mv $file $file.bak 9 | cut <$file.bak -f 2- | sed -e 's/[.]//g; s/!//g; s/?//g' \ 10 | -e 's/“//g; s/"//g; s/,//g; s/”//g' \ 11 | -e "s/'//g; s/’//g; s/‘//g" \ 12 | -e 's/://g; s/[;]//g; s/[(]//g; s/[)]//g;' \ 13 | -e 's/[\]//g' | 14 | tr '[:upper:]' '[:lower:]' >$file.trans.tmp 15 | 16 | cut <$file.bak -f 1 >$file.id.tmp 17 | paste $file.{id,trans}.tmp >$file 18 | rm -rf $file.{id,trans}.tmp 19 | done 20 | -------------------------------------------------------------------------------- /egs/commonvoice/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/cv-lang10/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 30, 24 | "n_tol": 5, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10h/monitor.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- 
/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_10h/monitor.png 
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 140, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_130h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 1670, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 0.0003, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- 
/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 140, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 640, 24 | "n_tol": 10, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 
17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 100, 24 | "n_tol": 5, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_130h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 3340, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 0.0003, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 140, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- 
/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 320, 24 | "n_tol": 10, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 6e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 100, 24 | "n_tol": 5, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_130h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 3340, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 0.0003, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } 
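The cv-lang10 `config.json` files above all share the same layout: `specaug` settings plus `encoder`, `scheduler`, and nested `optimizer` blocks, where the latter three each name a `type` and pass a `kwargs` dict. The sketch below shows how such a `type`/`kwargs` convention is typically consumed; the registry here is hypothetical (only `Adam` is wired up, to PyTorch's optimizer) and is not CAT's actual builder code.

```python
import json
import torch

# Hypothetical registry mapping config "type" names to classes. In CAT the
# encoder types (e.g. "ConformerNet") would be resolved from its own modules;
# here only the optimizer is wired up, as an illustration.
REGISTRY = {"Adam": torch.optim.Adam}

def build(section: dict, **extra):
    """Instantiate an object from a {"type": ..., "kwargs": ...} block."""
    cls = REGISTRY[section["type"]]
    return cls(**extra, **section.get("kwargs", {}))

# Path as listed above, relative to egs/cv-lang10.
with open("exp/Crosslingual/pl/Multi._subword_ft_subword_130h/config.json") as f:
    cfg = json.load(f)

# The optimizer block is nested inside "scheduler" in these configs.
dummy_params = [torch.nn.Parameter(torch.zeros(1))]  # placeholder parameters
optimizer = build(cfg["scheduler"]["optimizer"], params=dummy_params)
print(optimizer)  # prints the Adam settings: lr=0.0003, betas=(0.9, 0.98), weight_decay=1e-06
```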
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 210, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/en/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/es/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 87908 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/en/lm/hyper-p.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_en" 5 | ], 6 | "dev": [ 7 | "dev_en" 8 | ], 9 | "test": [ 10 | "test_en" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/en/word_list" 21 | }, 22 | "|V|": 246234, 23 | "file": "dict/en/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/en/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/en/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/es/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/en/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 246234 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/es/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_es" 5 | ], 6 | "dev": [ 7 | "dev_es" 8 | ], 9 | "test": [ 10 | "test_es" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/es/word_list" 21 | }, 22 | "|V|": 87908, 23 | "file": "dict/es/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/es/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/es/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/fr/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/fr/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 217706 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/fr/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_fr" 5 | ], 6 | "dev": [ 7 | "dev_fr" 8 | ], 9 | "test": [ 10 | "test_fr" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/fr/word_list" 21 | }, 22 | "|V|": 217706, 23 | "file": "dict/fr/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/fr/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/fr/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_10h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 1000, 24 | "n_tol": 10, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 100, 24 | "n_tol": 10, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 1e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._subword_10h/monitor.png 
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/id/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 13660 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_id" 5 | ], 6 | "dev": [ 7 | "dev_id" 8 | ], 9 | "test": [ 10 | "test_id" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/id/word_list" 21 | }, 22 | "|V|": 13660, 23 | "file": "dict/id/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/it/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/it/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 85831 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/it/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_it" 5 | ], 6 | "dev": [ 7 | "dev_it" 8 | ], 9 | "test": [ 10 | "test_it" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/it/word_list" 21 | }, 22 | "|V|": 85831, 23 | "file": "dict/it/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/it/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/it/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ky/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/ky/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 10608 8 | } 
9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ky/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_ky" 5 | ], 6 | "dev": [ 7 | "dev_ky" 8 | ], 9 | "test": [ 10 | "test_ky" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/ky/word_list" 21 | }, 22 | "|V|": 10608, 23 | "file": "dict/ky/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ky/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/ky/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/nl/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/nl/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 24518 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/nl/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_nl" 5 | ], 6 | "dev": [ 7 | "dev_nl" 8 | ], 9 | "test": [ 10 | "test_nl" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/nl/word_list" 21 | }, 22 | "|V|": 24518, 23 | "file": "dict/nl/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/nl/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/nl/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_1h/monitor.png 
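The per-language n-gram LM entries above all follow one pattern: `num_classes` in `lm/config.json` mirrors the `|V|` declared by the `SimpleTokenizer` in the matching `lm/hyper-p.json`, and both point at the same `dict/<lang>/` directory. Below is a minimal sketch of a cross-check for that invariant; it assumes the `egs/cv-lang10` layout shown in these files and that `dict/<lang>/word_list` holds one entry per line, and the helper itself is illustrative rather than part of CAT.

```python
# Sketch: verify that each monolingual LM config is self-consistent.
# Assumed layout (run from egs/cv-lang10): exp/Monolingual/<lang>/lm/{config.json,hyper-p.json}
# and dict/<lang>/word_list with one entry per line.
import json
import os

def check_lm_vocab(lang: str, exp_root: str = "exp/Monolingual", dict_root: str = "dict") -> None:
    lm_dir = os.path.join(exp_root, lang, "lm")
    with open(os.path.join(lm_dir, "config.json"), encoding="utf-8") as f:
        num_classes = json.load(f)["decoder"]["kwargs"]["num_classes"]
    with open(os.path.join(lm_dir, "hyper-p.json"), encoding="utf-8") as f:
        declared_v = json.load(f)["tokenizer"]["|V|"]
    with open(os.path.join(dict_root, lang, "word_list"), encoding="utf-8") as f:
        n_words = sum(1 for line in f if line.strip())
    # num_classes and |V| should agree exactly; word_list size may differ by special tokens.
    assert num_classes == declared_v, f"{lang}: num_classes={num_classes} != |V|={declared_v}"
    print(f"{lang}: num_classes={num_classes}, word_list entries={n_words}")

if __name__ == "__main__":
    for lang in ("en", "es", "fr", "id", "it", "ky", "nl", "pl", "ru", "sv-SE", "tr", "tt"):
        check_lm_vocab(lang)
```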
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/pl/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 43748 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_pl" 5 | ], 6 | "dev": [ 7 | "dev_pl" 8 | ], 9 | "test": [ 10 | "test_pl" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/pl/word_list" 21 | }, 22 | "|V|": 43748, 23 | "file": "dict/pl/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ru/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/ru/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 45653 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ru/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_ru" 5 | ], 6 | "dev": [ 7 | "dev_ru" 8 | ], 9 | "test": [ 10 | "test_ru" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/ru/word_list" 21 | }, 22 | "|V|": 45653, 23 | "file": "dict/ru/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ru/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/ru/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/sv-SE/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/sv-SE/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 18689 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/sv-SE/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_sv-SE" 5 | ], 6 | "dev": [ 7 | "dev_sv-SE" 8 | ], 9 | "test": [ 10 | "test_sv-SE" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/sv-SE/word_list" 21 | }, 22 | "|V|": 18689, 23 | "file": "dict/sv-SE/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/sv-SE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/sv-SE/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tr/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/tr/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 38397 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tr/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_tr" 5 | ], 6 | "dev": [ 7 | "dev_tr" 8 | ], 9 | "test": [ 10 | "test_tr" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/tr/word_list" 21 | }, 22 | "|V|": 38397, 23 | "file": "dict/tr/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tr/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/tr/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tt/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/tt/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 22496 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tt/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | 
"train": [ 4 | "train_tt" 5 | ], 6 | "dev": [ 7 | "dev_tt" 8 | ], 9 | "test": [ 10 | "test_tt" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/tt/word_list" 21 | }, 22 | "|V|": 22496, 23 | "file": "dict/tt/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tt/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/tt/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._phoneme_L/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._phoneme_L/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._phoneme_M/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._phoneme_M/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._phoneme_S/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._phoneme_S/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._subword/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._subword/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/en/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for English. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models # The path containing G2P models from https://github.com/uiuc-sst/g2ps 7 | phonetisaurus-apply --model $g2ps/american-english.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/ˌ//g; s/l̩/l/g; s/n̩/n/g; s/#//g; s/[.]//g; s/g/ɡ/g; s/ei/e i/g; s/aɪ/a ɪ/g; s/ɔi/ɔ i/g; s/oʊ/o ʊ/g; s/aʊ/a ʊ/g; s/ɔɪ/ɔ ɪ/g; s/ɑɪ/ɑ ɪ/g; s/ɝ/ɜ/g; s/ɚ/ə/g; s/tʃ/t͡ʃ/g; s/dʒ/d͡ʒ/g; s/d ʒ/d͡ʒ/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/es/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Spanish. 
3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models # The path containing G2P models from https://github.com/uiuc-sst/g2ps 7 | phonetisaurus-apply --model $g2ps/spanish_4_3_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/g/ɡ/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/fr/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for French. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/french_8_4_3.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/w ˈa//g; s/g/ʒ/g; 11 | s/R/ʁ/g; s/í/i/g; s/ì/i/g; s/ò/o/g; s/ó/o/g; s/ü/u/g; s/ú/u/g; s/ù/u/g; s/á/a/g; 12 | s/ɑ̃/ɑ/g; s/œ̃/œ/g; s/ɛ̃/ɛ/g; s/ÿ/y/g; s/ë/e/g; s/ɔ̃/ɔ/g;' \ 13 | -e 's/[ ]*$//g; s/^[ ]*//g; s/[ ][ ]*/ /g' > $dict_dir/phone.txt 14 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/id/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Indonesian. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=local/g2ps/models 7 | phonetisaurus-apply --model $g2ps/Indonesian.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/it/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Italian. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/italian_8_2_3.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/dʒ/d͡ʒ/g; s/dz/d͡z/g; s/tʃ/t͡ʃ/g; s/ts/t͡s/g; s/∅/ø/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/ky/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Kirghiz. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/kirghiz_8_2_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/lang-process.md: -------------------------------------------------------------------------------- 1 | # Language process 2 | ## 1. 
Text normalization 3 | The training datasets of our models are sourced from the publicly available [`Common Voice`](https://commonvoice.mozilla.org/) 11.0. They contain some redundant symbols and foreign words that may affect model performance, so we perform text normalization for each language to remove them. 4 | 5 | ## 2. Lexicon generation and correction 6 | The %PER of the FST (Finite State Transducer) based G2P (Grapheme-to-Phoneme) models that we used to generate the pronunciation lexicons ranges from 7% to 45%, so the lexicons need to be corrected. 7 | 8 | ## 3. Check of phonemes 9 | Even after lexicon correction, the final lexicon is still not perfect and contains some noise. We further check our phonemes by referring to the IPA symbol tables in LanguageNet and Phoible, together with a Google Translate listening test. -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/nl/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Dutch. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/dutch.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/dʒ/d͡ʒ/g; s/œɪ/œ y/g; s/ɛɪ/ɛ i/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/nl/text_norm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # Acknowledgement: This script refers to the code of Huahuan Zheng (maxwellzh@outlook.com) 3 | # This script completes text normalization for the Dutch dataset from CommonVoice 4 | 5 | data_dir=$1 6 | for set in dev test excluded_train; do 7 | paste $data_dir/$set/text > $data_dir/$set/text.bak 8 | cut <$data_dir/$set/text.bak -f 2- | \ 9 | sed -e 's/,/ /g; s/"/ /g; s/“/ /g; s/[;]/ /g; s/[—]/ /g; s/[.]/ /g; s/:/ /g; s/!/ /g; s/”/ /g; s/?/ /g; s/«/ /g; s/»/ /g' | \ 10 | sed -e 's/[ ][ ]*/ /g; s/^[ ]*//g; s/[ ]*$//g' | \ 11 | python -c "import sys; print(sys.stdin.read().lower())" > $data_dir/$set/text.trans.tmp 12 | cut <$data_dir/$set/text.bak -f 1 > $data_dir/$set/text.id.tmp 13 | paste $data_dir/$set/text.{id,trans}.tmp > $data_dir/$set/text 14 | cat $data_dir/$set/text | sed -e 's/^[ ]*//g' | grep -v "^$" > $data_dir/$set/text_new 15 | mv $data_dir/$set/text_new $data_dir/$set/text 16 | rm -rf $data_dir/$set/text.{id,trans}.tmp 17 | done -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/pl/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Polish. 
3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/polish_2_2_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/ts/t͡s/g; s/dz/d͡z/g; s/ɖʐ/ɖ͡ʐ/g; s/tʂ/ʈ͡ʂ/g; s/dʑ/d͡ʑ/g; s/tɕ/t͡ɕ/g; s/ɔ̃/ɔ/g; s/ɨ̃/ɨ/g; s/ɛ̃/ɛ/g; s/w̃/w/g; s/ɛ̝/ɛ/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/pl/text_norm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # Acknowlegement: This script refer to the code of Huahuan Zheng (maxwellzh@outlook.com) 3 | # This script completes text normalization for Polish dataset from CommonVoice 4 | 5 | data_dir=$1 6 | for set in dev test excluded_train; do 7 | paste $data_dir/$set/text > $data_dir/$set/text.bak 8 | cut <$data_dir/$set/text.bak -f 2- | \ 9 | sed -e 's/,/ /g; s/"/ /g; s/“/ /g; s/[;]/ /g; s/[—]/ /g; s/[.]/ /g; s/:/ /g; s/!/ /g; s/”/ /g; s/?/ /g; s/«/ /g; s/»/ /g' | \ 10 | sed -e 's/[ ][ ]*/ /g; s/^[ ]*//g; s/[ ]*$//g' | \ 11 | python -c "import sys; print(sys.stdin.read().lower())" > $data_dir/$set/text.trans.tmp 12 | cut <$data_dir/$set/text.bak -f 1 > $data_dir/$set/text.id.tmp 13 | paste $data_dir/$set/text.{id,trans}.tmp > $data_dir/$set/text 14 | cat $data_dir/$set/text | sed -e 's/^[ ]*//g' | grep -v "^$" > $data_dir/$set/text_new 15 | mv $data_dir/$set/text_new $data_dir/$set/text 16 | rm -rf $data_dir/$set/text.{id,trans}.tmp 17 | done -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/ru/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) / 2 | # This script prepares phoneme-based lexicon and corrects it for Russian. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/russian.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/sv-SE/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Swedish. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/swedish_4_4_4.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/tr/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Turkish. 
3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/turkish.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/d ʒ/d͡ʒ/g; s/dʒ/d͡ʒ/g; s/t ʃ/t͡ʃ/g; s/tʃ/t͡ʃ/g; s/ɡj/ɡ/g; s/g/ɡ/g; s/â/a/g; s/é/e/g; s/û/u/g; s/*//g; s/ ̇//g; s/[.]//g; s/ë/e/g; s/î/i/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/tt/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Tatar. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/tatar_2_2_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/jo/j o/g; s/g/ɡ/g' > $dict_dir/phone.txt -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/tt/text_norm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # Acknowledgement: This script refers to the code of Huahuan Zheng (maxwellzh@outlook.com) 3 | # This script completes text normalization for the Tatar dataset from CommonVoice 4 | 5 | data_dir=$1 6 | for set in dev test excluded_train; do 7 | paste $data_dir/$set/text > $data_dir/$set/text.bak 8 | cut <$data_dir/$set/text.bak -f 2- | \ 9 | sed -e 's/,/ /g; s/"/ /g; s/“/ /g; s/[;]/ /g; s/[—]/ /g; s/[.]/ /g; s/:/ /g; s/!/ /g; s/”/ /g; s/?/ /g; s/«/ /g; s/»/ /g' | \ 10 | sed -e 's/[ ][ ]*/ /g; s/^[ ]*//g; s/[ ]*$//g' | \ 11 | python -c "import sys; print(sys.stdin.read().lower())" > $data_dir/$set/text.trans.tmp 12 | cut <$data_dir/$set/text.bak -f 1 > $data_dir/$set/text.id.tmp 13 | paste $data_dir/$set/text.{id,trans}.tmp > $data_dir/$set/text 14 | cat $data_dir/$set/text | sed -e 's/^[ ]*//g' | grep -v "^$" > $data_dir/$set/text_new 15 | mv $data_dir/$set/text_new $data_dir/$set/text 16 | rm -rf $data_dir/$set/text.{id,trans}.tmp 17 | done -------------------------------------------------------------------------------- /egs/cv-lang10/local/tools/calculate_dur.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("data_dir", type=str, help="path of data dir") 8 | args = parser.parse_args() 9 | file_name = os.path.join(args.data_dir, "utt2dur") 10 | 11 | assert os.path.isfile(file_name), "this script requires utt2dur to calculate the total duration." 12 | 13 | # start_time = time.time() 14 | total_duration = 0. 
15 | with open(file_name, "r") as f: 16 | for line in f: 17 | path = line.split()[1] 18 | duration = float(path) 19 | # duration = librosa.get_duration(filename=path) 20 | total_duration += duration 21 | # end_time = time.time() 22 | print(f"total duration: {total_duration/3600:2f} hour") 23 | # print(f"process time : {end_time-start_time:2f} second") 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /egs/cv-lang10/local/tools/char_list.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("word_list", type=str, help="text file") 8 | parser.add_argument("--out", type=str, help="path of output char list file") 9 | args = parser.parse_args() 10 | 11 | assert os.path.isfile(args.word_list), f"word_list={args.word_list} is not a valid file." 12 | 13 | char_list = set() 14 | 15 | with open(args.word_list, "r", encoding="utf-8") as f: 16 | for line in f: 17 | char_list.update(list(line.strip())) 18 | 19 | out = args.out if args.out else os.path.join(os.path.dirname(args.word_list), "char_list.txt") 20 | 21 | with open(out, "w", encoding="utf-8") as wf: 22 | for char in char_list: 23 | wf.write(char + "\n") 24 | 25 | 26 | -------------------------------------------------------------------------------- /egs/cv-lang10/local/tools/phone_list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("lexicon", type=str, help="lexicon file") 7 | parser.add_argument("--out", type=str, help="path of output phone list file") 8 | args = parser.parse_args() 9 | 10 | assert os.path.isfile(args.lexicon), f"phone_list={args.lexicon} is not a valid file." 11 | 12 | phone_list = set() 13 | 14 | with open(args.lexicon, "r", encoding="utf-8") as f: 15 | for line in f: 16 | phone_seq = line.strip().split('\t', maxsplit=1)[1] 17 | phone_list.update(phone_seq.split()) 18 | 19 | out = args.out if args.out else os.path.join(os.path.dirname(args.lexicon), "phone_list.txt") 20 | 21 | with open(out, "w", encoding="utf-8") as wf: 22 | for phone in phone_list: 23 | if phone != ' ': 24 | wf.write(phone + "\n") 25 | 26 | -------------------------------------------------------------------------------- /egs/cv-lang10/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils -------------------------------------------------------------------------------- /egs/libri/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 960 hour English speech data. Book reading speech. 3 | 4 | **Data prepare** 5 | 6 | Use one of the options: 7 | 8 | - Prepare data with Kaldi (default in results) 9 | 10 | ```bash 11 | bash local/data_kaldi.sh -h 12 | ``` 13 | 14 | - Prepare data with `torchaudio`: run following command to get help 15 | 16 | ```bash 17 | bash local/data.sh -h 18 | ``` 19 | 20 | ## Result 21 | 22 | Summarize experiments here. 
23 | 24 | Evaluated by WER (%) 25 | 26 | | EXPID | dev-clean | dev-other | test-clean | test-other | 27 | | ------------------------------------------------------------------ | --------- | --------- | ---------- | ---------- | 28 | | [rnnt](exp/rnnt-v1) + transformer [lm](exp/lm/lm-v1-transformer) | 1.81 | 4.03 | 1.94 | 4.39 | 29 | | [ctc-crf](exp/crf-v1) + transformer [lm](exp/lm/lm-v1-transformer) | 2.05 | 4.54 | 2.25 | 4.73 | 30 | 31 | -------------------------------------------------------------------------------- /egs/libri/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/libri/exp/crf-v1/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train-clean-100", 5 | "train-clean-360", 6 | "train-other-500" 7 | ], 8 | "dev": [ 9 | "dev-clean", 10 | "dev-other" 11 | ], 12 | "test": [ 13 | "dev-clean", 14 | "dev-other", 15 | "test-clean", 16 | "test-other" 17 | ], 18 | "filter": ":2000" 19 | }, 20 | "tokenizer": { 21 | "type": "LexiconTokenizer", 22 | "option-init": { 23 | "lexicon": "data/local/librispeech-lexicon.txt" 24 | } 25 | }, 26 | "train": { 27 | "bin": "cat.ctc.train", 28 | "option": { 29 | "amp": true, 30 | "batch_size": 128, 31 | "grad_accum_fold": 16, 32 | "grad_norm": 5.0, 33 | "dynamic_batch_mode": 1 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /egs/libri/exp/crf-v1/monitor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/libri/exp/crf-v1/monitor.jpg -------------------------------------------------------------------------------- /egs/libri/exp/lm/lm-v1-transformer/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "CausalTransformer", 4 | "kwargs": { 5 | "attn_dropout": 0.1, 6 | "dim_hid": 768, 7 | "num_classes": 1024, 8 | "num_head": 12, 9 | "num_layers": 12 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "SchedulerNoam", 14 | "optimizer": { 15 | "type": "Adam", 16 | "kwargs": { 17 | "lr": 0.001, 18 | "weight_decay": 1e-06 19 | } 20 | }, 21 | "kwargs": { 22 | "dim_model": 768, 23 | "peak_factor": 1.0, 24 | "warmup_step": 25000, 25 | "stop_step": 1200000 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /egs/libri/exp/lm/lm-v1-transformer/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train-clean-100", 5 | "train-clean-360", 6 | "train-other-500", 7 | "data/librispeech.txt" 8 | ], 9 | "dev": [ 10 | "dev-clean", 11 | "dev-other" 12 | ], 13 | "test": [ 14 | "dev-clean", 15 | "dev-other", 16 | "test-clean", 17 | "test-other" 18 | ], 19 | "packing-text-lm": { 20 | "truncate": 128 21 | } 22 | }, 23 | "tokenizer": { 24 | "type": "SentencePieceTokenizer", 25 | "option-train": { 26 | "model_type": "unigram", 27 | "vocab_size": 1024, 28 | "model_prefix": "sentencepiece/libri_unigram_1024/spm" 29 | } 30 | }, 31 | "train": { 32 | "bin": "cat.lm.train", 33 | "option": { 34 | "amp": true, 35 | "batch_size": 2048, 36 | "grad-norm": 5.0 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- 
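The `SchedulerNoam` settings in the transformer LM config above (`dim_model: 768`, `peak_factor: 1.0`, `warmup_step: 25000`, `stop_step: 1200000`) are easier to interpret as a curve. Below is a minimal sketch assuming the standard Noam schedule from "Attention Is All You Need", i.e. `lr = peak_factor * dim_model^-0.5 * min(step^-0.5, step * warmup_step^-1.5)`; how `cat.shared.scheduler` scales this against the optimizer's base `lr` of 0.001 is an assumption not verified here.

```python
# Sketch of the learning-rate shape implied by the SchedulerNoam block above,
# under the standard Noam formula (assumption: CAT's SchedulerNoam follows it).
def noam_lr(step: int, dim_model: int = 768, warmup: int = 25000, peak_factor: float = 1.0) -> float:
    step = max(step, 1)  # guard against step 0
    return peak_factor * dim_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Warmup rises linearly to the peak at step 25000 (about 2.3e-4 here), then decays as step^-0.5.
for step in (1000, 25000, 100000, 1200000):
    print(f"step {step:>7d}: lr ~ {noam_lr(step):.3e}")
```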
/egs/libri/exp/lm/lm-v1-transformer/monitor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/libri/exp/lm/lm-v1-transformer/monitor.jpg -------------------------------------------------------------------------------- /egs/libri/exp/lm/lm-v1-transformer/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * Model size/M: 87.42 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | - Transformer LM for libri corpus. 12 | - The training might take over 100 hours. 13 | 14 | ### Result 15 | ``` 16 | dev_clean | dev_other | test_other | test_clean 17 | 12.49 | 13.34 | 13.50 | 12.49 18 | ``` 19 | 20 | ### Monitor figure 21 | ![monitor](./monitor.jpg) 22 | -------------------------------------------------------------------------------- /egs/libri/exp/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/libri/exp/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/libri/local/prep_lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/local 4 | [ ! -f data/local/librispeech-lexicon.txt ] && 5 | wget https://www.openslr.org/resources/11/librispeech-lexicon.txt -P data/local 6 | 7 | echo "$0 done." 8 | exit 0 9 | -------------------------------------------------------------------------------- /egs/libri/local/prep_libri_corpus.sh: -------------------------------------------------------------------------------- 1 | # Author: Zheng Huahuan (maxwellzh@outlook.com) 2 | # This script includes the processing of librispeech extra corpus text 3 | set -e -u 4 | 5 | d_out=data 6 | 7 | mkdir -p $d_out 8 | text=$d_out/librispeech.txt 9 | if [ ! -f $text ]; then 10 | archive=$d_out/librispeech-lm-norm.txt.gz 11 | if [ ! -f $archive ]; then 12 | wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P $d_out || exit 1 13 | fi 14 | # check archive 15 | if [ $(md5sum $archive | cut -d ' ' -f 1) != "c83c64c726a1aedfe65f80aa311de402" ]; then 16 | echo "MD5 checking failed for $archive, please rm it then run this script again." 17 | exit 1 18 | fi 19 | gunzip -c $archive >$text || exit 1 20 | rm $archive 21 | echo "Fetched librispeech extra text corpus at $text" 22 | else 23 | echo "$text file exist. skipped" 24 | fi 25 | 26 | # check md5sum 27 | if [ $(md5sum $text | cut -d ' ' -f 1) != "c8288034566b62698db24f6cd414160d" ]; then 28 | echo "MD5 checking failed for $text, please rm it then run this script again." 
29 | exit 1 30 | fi 31 | -------------------------------------------------------------------------------- /egs/libri/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/wenetspeech/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/wenetspeech/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "sumtokenlogit", 8 | "config_ebm_model": "exp/lm/GN-ELM-DNCE/config_ebm.json", 9 | "config_noise_model": "exp/lm/GN-ELM-DNCE/config_noise.json", 10 | "tokenizer_path": "exp/lm/GN-ELM-DNCE/tokenizer.tknz", 11 | "bert_tokenizer": true 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "SchedulerNoam", 16 | "kwargs": { 17 | "dim_model": 768, 18 | "peak_factor": 0.16, 19 | "warmup_step": 10000, 20 | "stop_step": 30000 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "kwargs": { 25 | "lr": 0.0008 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertLMHeadModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "TRFLM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "sumtargetlogit", 8 | "f_linfo": "exp/ebm-lm/TRF-LM-DNCE/pkl/train.pkl", 9 | "config_ebm_model": "exp/ebm-lm/TRF-LM-DNCE/config_trf.json", 10 | "config_noise_model": "exp/ebm-lm/TRF-LM-DNCE/config_noise.json", 11 | "tokenizer_path": "exp/ebm-lm/TRF-LM-DNCE/tokenizer.tknz", 12 | "alpha": 1, 13 | "with_end_mark": false, 14 | "bert_tokenizer": true 15 | } 16 | }, 17 | "scheduler": { 18 | "type": "SchedulerNoam", 19 | "kwargs": { 20 | "dim_model": 768, 21 | 
"peak_factor": 0.16, 22 | "warmup_step": 10000, 23 | "stop_step": 30000 24 | }, 25 | "optimizer": { 26 | "type": "Adam", 27 | "kwargs": { 28 | "lr": 0.0008 29 | } 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/config_trf.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/readme.md: -------------------------------------------------------------------------------- 1 | # Train TRF-LM with DNCE 2 | 3 | ## Result 4 | We also try 3 different energy functions, whose results are as follows: 5 | 6 | | CER type | SumTargetLogit | Hidden2Scalar | SumTokenLogit | 7 | | ------------ | -------------- | ------------- | ------------- | 8 | | in-domain | 8.97 | 8.95 | 9.00 | 9 | | cross-domain | 15.77 | 15.67 | 15.65 | 10 | 11 | The training curve of the best model (Hidden2Scalar) is shown below. 
12 | 13 | | training curve | 14 | |:-----------------------:| 15 | |![monitor](./monitor.png)| -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-l/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-trans-l/5gram.bin", 6 | "gram_order": 5, 7 | "num_classes": 5536 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-l/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_l", 4 | "test": [ 5 | "test_net", 6 | "test_meeting" 7 | ] 8 | }, 9 | "tokenizer": { 10 | "type": "SentencePieceTokenizer", 11 | "option-train": { 12 | "model_type": "char", 13 | "add_dummy_prefix": false, 14 | "use_all_vocab": true, 15 | "model_prefix": "sentencepiece/wenetspeech_l_char/spm", 16 | "vocab_size": 5536 17 | } 18 | }, 19 | "inference": {} 20 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-l/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-trans-l 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: trie 12 | - size: 1.3GB 13 | 14 | perplexity: 15 | 16 | ``` 17 | Test file: test_net -> ppl: 59.07 18 | Test file: test_meeting -> ppl: 55.39 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-trans-m/5gram.bin", 6 | "gram_order": 5, 7 | "num_classes": 5147 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-m/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_m", 4 | "dev": [ 5 | "dev" 6 | ], 7 | "test": [ 8 | "dev", 9 | "test_net", 10 | "test_meeting" 11 | ] 12 | }, 13 | "tokenizer": { 14 | "type": "SentencePieceTokenizer", 15 | "option-train": { 16 | "model_type": "char", 17 | "add_dummy_prefix": false, 18 | "use_all_vocab": true, 19 | "model_prefix": "sentencepiece/wenetspeech_m_char/spm", 20 | "vocab_size": 5147 21 | } 22 | }, 23 | "inference": {} 24 | } 25 | -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-m/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-trans-m 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 467MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | data: dev test_net test_meeting 18 | ppl: 61.33 | 73.53 | 65.49 | 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_l/crf-v1/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "dtrain": "train_l", 4 | "dev": [ 5 | "dev" 6 | ], 7 | "dtest": [ 8 | "dev", 9 | "test_net", 10 | "test_meeting", 11 | "aishell-test" 12 | ], 13 | "filter": "10:1000" 14 | }, 15 | 
"tokenizer": { 16 | "type": "JiebaComposeLexiconTokenizer", 17 | "option-init": { 18 | "lexicon": "", 19 | "userdict": "" 20 | } 21 | }, 22 | "train": { 23 | "bin": "cat.ctc.train", 24 | "option": { 25 | "amp": true, 26 | "batch_size": 864, 27 | "grad-norm": 2.0, 28 | "grad-accum-fold": 3, 29 | "check-freq": 500 30 | } 31 | }, 32 | "inference": { 33 | "avgmodel": { 34 | "mode": "best", 35 | "num": 10 36 | }, 37 | "infer": { 38 | "bin": "cat.ctc.cal_logit", 39 | "option": { 40 | "nj": 48 41 | } 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_l/crf-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_l/crf-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_l/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_l/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/crf-v1/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_m", 4 | "dev": [ 5 | "dev" 6 | ], 7 | "test": [ 8 | "dev", 9 | "test_net", 10 | "test_meeting" 11 | ], 12 | "filter": ":1000" 13 | }, 14 | "train": { 15 | "bin": "cat.ctc.train", 16 | "option": { 17 | "amp": true, 18 | "batch_size": 512, 19 | "grad_norm": 5.0, 20 | "dynamic_batch_mode": 1 21 | } 22 | }, 23 | "inference": { 24 | "avgmodel": { 25 | "mode": "last", 26 | "num": 10 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/crf-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_m/crf-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/ctc-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_m/ctc-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/ctc-v1/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * Model size/M: 86.01 6 | * GPU info \[7\] 7 | * \[7\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | - CTC topo of `rnnt-v1` 12 | 13 | ### Result 14 | ``` 15 | no lm 16 | dev %SER 75.46 | %CER 11.80 [ 39003 / 330498, 1282 ins, 14446 del, 23275 sub ] 17 | test_net %SER 70.72 | %CER 14.28 [ 59372 / 415746, 2004 ins, 12692 del, 44676 sub ] 18 | test_meeting %SER 94.50 | %CER 22.23 [ 48983 / 220385, 1622 ins, 17767 del, 29594 sub ] 19 | aishell-test %SER 61.05 | %CER 9.05 [ 9478 / 104765, 347 ins, 201 del, 8930 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | 
-------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_m/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/rnnt-v1/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * Model size/M: 91.27 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Appendix 10 | 11 | * trained on wenet speech M subset (1000 hour speech) 12 | 13 | ### Result 14 | ``` 15 | beamsize 128 no lm 16 | dev %SER 71.10 | %CER 11.14 [ 36833 / 330498, 1284 ins, 16210 del, 19339 sub ] 17 | test_net %SER 65.51 | %CER 12.75 [ 52991 / 415746, 1942 ins, 12914 del, 38135 sub ] 18 | test_meeting %SER 91.74 | %CER 20.88 [ 46025 / 220385, 1236 ins, 22703 del, 22086 sub ] 19 | aishell-dev %SER 45.05 | %CER 6.32 [ 12985 / 205341, 420 ins, 248 del, 12317 sub ] 20 | aishell-test %SER 49.97 | %CER 7.22 [ 7562 / 104765, 253 ins, 204 del, 7105 sub ] 21 | ``` 22 | 23 | 24 | ### Monitor figure 25 | ![monitor](./monitor.png) 26 | -------------------------------------------------------------------------------- /egs/wenetspeech/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/wsj/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/wsj/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-crf-phone/decode_lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 127924 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/extra.corpus" 4 | }, 5 | "tokenizer": { 6 | "type": "SimpleTokenizer", 7 | "option-init": { 8 | "dmap": "data/cmudict.txt", 9 | "read_index_from_file": false 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_si284-3sp", 4 | "dev": "test_dev93-3sp", 5 | "test": [ 6 | "test_dev93", 7 | "test_eval92" 8 | ], 9 | "filter": ":2000" 10 | }, 11 | "tokenizer": { 12 | "type": "LexiconTokenizer", 13 | "option-init": { 14 | "lexicon": "data/cmudict.txt" 15 | } 16 | }, 17 | "train": { 18 | "bin": "cat.ctc.train", 19 | "option": { 20 | "amp": true, 21 | "batch_size": 256 22 | } 23 | }, 24 | "inference": { 25 | "avgmodel": { 26 | "mode": "last", 27 | "num": 2 28 | }, 29 | "infer": { 30 | 
"bin": "cat.ctc.cal_logit", 31 | "option": { 32 | "nj": 16 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wsj/exp/asr-ctc-crf-phone/monitor.png -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 13.39 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * CTC-CRF training for around 23 epochs 12 | 13 | ### Result 14 | ``` 15 | eval92 %SER 29.73 | %WER 2.87 [ 162 / 5643, 14 ins, 14 del, 134 sub ] 16 | dev93 %SER 46.32 | %WER 5.53 [ 455 / 8234, 49 ins, 52 del, 354 sub ] 17 | 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-phone/decode_lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 127924 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/extra.corpus" 4 | }, 5 | "tokenizer": { 6 | "type": "SimpleTokenizer", 7 | "option-init": { 8 | "dmap": "data/cmudict.txt", 9 | "read_index_from_file": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_si284-3sp", 4 | "dev": "test_dev93-3sp", 5 | "test": [ 6 | "test_dev93", 7 | "test_eval92" 8 | ], 9 | "filter": ":2000" 10 | }, 11 | "tokenizer": { 12 | "type": "LexiconTokenizer", 13 | "option-init": { 14 | "lexicon": "data/cmudict.txt" 15 | } 16 | }, 17 | "train": { 18 | "bin": "cat.ctc.train", 19 | "option": { 20 | "amp": true, 21 | "batch_size": 256 22 | } 23 | }, 24 | "inference": { 25 | "avgmodel": { 26 | "mode": "last", 27 | "num": 2 28 | }, 29 | "infer": { 30 | "bin": "cat.ctc.cal_logit", 31 | "option": { 32 | "nj": 16 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wsj/exp/asr-ctc-phone/monitor.png -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 13.39 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### 
Notes 10 | 11 | * CTC training for 23 epochs 12 | 13 | ### Result 14 | ``` 15 | eval92 %SER 51.49 | %WER 6.79 [ 383 / 5643, 30 ins, 54 del, 299 sub ] 16 | dev93 %SER 69.77 | %WER 11.88 [ 978 / 8234, 60 ins, 211 del, 707 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![monitor](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Huahuan Zheng 3 | set -e -u 4 | 5 | dir=$(dirname $0) 6 | # Use a hack to re-use the script 7 | touch $dir/den_lm.fst 8 | bash ../asr-ctc-crf-phone/run.sh 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-rnnt-bpe/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_si284-3sp", 4 | "dev": "test_dev93-3sp", 5 | "test": [ 6 | "test_dev93", 7 | "test_eval92" 8 | ], 9 | "filter": ":2000" 10 | }, 11 | "tokenizer": { 12 | "type": "SentencePieceTokenizer", 13 | "option-train": { 14 | "model_type": "bpe", 15 | "vocab_size": 2000, 16 | "model_prefix": "sentencepiece/wsj_bpe2k/spm" 17 | } 18 | }, 19 | "train": { 20 | "bin": "cat.rnnt.train", 21 | "option": { 22 | "amp": true, 23 | "batch_size": 256 24 | } 25 | }, 26 | "inference": { 27 | "avgmodel": { 28 | "mode": "best", 29 | "num": 5 30 | }, 31 | "infer": { 32 | "bin": "cat.rnnt.decode", 33 | "option": { 34 | "beam_size": 16, 35 | "nj": 16 36 | } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-rnnt-bpe/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wsj/exp/asr-rnnt-bpe/monitor.png -------------------------------------------------------------------------------- /egs/wsj/exp/asr-rnnt-bpe/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 21.60 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | ```bash 12 | # prepare data 13 | bash local/data_kaldi.sh -use-3way-sp 14 | 15 | # train and inference 16 | python utils/pipeline/asr.py exp/asr-rnnt-bpe 17 | ``` 18 | 19 | * RNN-T training and beam search decoding (a SentencePiece sketch of the BPE tokenizer settings follows this file) 20 | 21 | ### Result 22 | ``` 23 | eval92 %SER 66.37 | %WER 9.87 [ 557 / 5643, 35 ins, 74 del, 448 sub ] 24 | dev93 %SER 72.76 | %WER 12.63 [ 1040 / 8234, 103 ins, 174 del, 763 sub ] 25 | ``` 26 | 27 | | training process | 28 | |:-----------------------:| 29 | |![monitor](./monitor.png)| 30 |
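The `tokenizer` section of `exp/asr-rnnt-bpe/hyper-p.json` above asks for a SentencePiece BPE model with a 2000-token vocabulary under the prefix `sentencepiece/wsj_bpe2k/spm`. As a rough sketch of what those options correspond to at the SentencePiece command line (the training-text path below is a hypothetical placeholder; in practice the pipeline script `utils/pipeline/asr.py` presumably trains the tokenizer from this config itself):

```bash
# Sketch only: standalone spm_train call mirroring the "option-train" settings.
# /path/to/train_text.txt is a placeholder for the training transcripts.
spm_train \
    --input=/path/to/train_text.txt \
    --model_prefix=sentencepiece/wsj_bpe2k/spm \
    --model_type=bpe \
    --vocab_size=2000
```

SentencePiece writes `spm.model` and `spm.vocab` under the given prefix, which is presumably what the `SentencePieceTokenizer` then loads for tokenization during training and decoding.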
-------------------------------------------------------------------------------- /egs/wsj/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # gather >= 0.2.3 2 | -e git+https://github.com/maxwellzh/torch-gather.git#egg=gather 3 | # warp-rnnt >= 0.9.0 4 | -e git+https://github.com/maxwellzh/warp-rnnt.git#egg=warp-rnnt 5 | # webdataset: do not use 'pip install webdataset'; the PyPI release is outdated 6 | -e git+https://github.com/webdataset/webdataset.git@d7334016f44a03c4a385971aa835c4f460d3f30a#egg=webdataset 7 | # warp-ctct >= 0.3.0 8 | -e git+https://github.com/maxwellzh/warp-ctct.git#egg=warp-ctct 9 | 10 | # module dependencies 11 | torch>=1.9.0 12 | tqdm>=4.62.3 13 | matplotlib>=3.4.3 14 | sentencepiece>=0.1.96 15 | kaldiio>=2.17.2 16 | # dependency issue, see https://github.com/protocolbuffers/protobuf/issues/10051 17 | protobuf==3.20.2 18 | tensorboard>=2.6.0 19 | jiwer>=2.2.0 20 | pyyaml>=6.0 21 | transformers>=4.12.3 22 | jieba>=0.42.1 23 | levenshtein -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='cat', 5 | version='3.0.1', 6 | packages=find_packages(exclude=['src', 'tools']), 7 | description="CRF-based ASR Toolkit.", 8 | long_description=open('README.md', 'r').read(), 9 | author="THU-SPMI Lab.", 10 | url="https://github.com/thu-spmi/CAT", 11 | platforms=["Linux x86-64"], 12 | license="Apache 2.0" 13 | ) 14 | -------------------------------------------------------------------------------- /src/ctc_crf/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2019 Tsinghua University, Author: Hongyu Xiang 2 | # 2021 Tsinghua University, Author: Huahuan Zheng 3 | # Apache 2.0. 4 | # CTC-CRF Makefile 5 | 6 | openfst_dir=${CURDIR}/openfst-1.6.7/build 7 | 8 | .PHONY: OPENFST GPUCTC GPUDEN CTCCRF 9 | all: CTCCRF 10 | OPENFST: 11 | if [ ! -f "openfst-1.6.7.tar.gz" ]; then wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.7.tar.gz; fi 12 | tar -zxf openfst-1.6.7.tar.gz 13 | cd openfst-1.6.7; ./configure --prefix=${openfst_dir} 14 | cd openfst-1.6.7; make -j $(nproc) && make install 15 | GPUCTC: 16 | mkdir -p gpu_ctc/build 17 | cd gpu_ctc/build && cmake .. 18 | cd gpu_ctc/build && make 19 | GPUDEN: OPENFST 20 | mkdir -p gpu_den/build 21 | cd gpu_den/build && cmake -D openfst_dir:STRING=${openfst_dir} .. 22 | cd gpu_den/build && make 23 | CTCCRF: GPUCTC GPUDEN 24 | python3 setup.py install 25 | 26 | clean: 27 | python setup.py clean --all 28 | rm -rf gpu_{ctc,den}/build build/ \ 29 | openfst-1.6.7/ *.tar.gz \ 30 | dist/ ctc_crf.egg-info/ 31 | -------------------------------------------------------------------------------- /src/ctc_crf/gpu_ctc/README.txt: -------------------------------------------------------------------------------- 1 | The code in this directory is from Baidu's warp-ctc (https://github.com/baidu-research/warp-ctc). 2 | We reorganize the code and modify the input to be the softmax of the logits.
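The `src/ctc_crf/Makefile` above chains the whole CTC-CRF extension build: the default target downloads and builds OpenFST 1.6.7, compiles the `gpu_ctc` and `gpu_den` CUDA kernels with CMake, and finally installs the Python binding via `python3 setup.py install`. A minimal usage sketch, assuming `wget`, `cmake`, a CUDA toolchain, and a PyTorch-enabled Python environment are already available (these prerequisites are not stated in the Makefile itself):

```bash
# Sketch: build and install the CTC-CRF extension with the Makefile above.
cd src/ctc_crf
make          # OPENFST + GPUCTC + GPUDEN, then `python3 setup.py install`
# make clean  # removes the OpenFST sources/tarball, build directories, and egg-info
```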
-------------------------------------------------------------------------------- /src/ctc_crf/gpu_ctc/hostdevice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | #define HOSTDEVICE __host__ __device__ 5 | #else 6 | #define HOSTDEVICE 7 | #endif 8 | -------------------------------------------------------------------------------- /src/ctc_crf/test/den_lm.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/src/ctc_crf/test/den_lm.fst -------------------------------------------------------------------------------- /src/fst-decoder/Makefile: -------------------------------------------------------------------------------- 1 | # author: Huahuan Zheng (maxwellzh@outlook.com) 2 | # This file is modified from kaldi/src/bin/Makefile 3 | 4 | kaldi_root_src=${KALDI_ROOT}/src 5 | EXTRA_CXXFLAGS = -Wno-sign-compare -I${kaldi_root_src} 6 | include ${kaldi_root_src}/kaldi.mk 7 | 8 | BINFILES = latgen-faster 9 | 10 | 11 | OBJFILES = 12 | 13 | ADDLIBS = ${kaldi_root_src}/decoder/kaldi-decoder.a ${kaldi_root_src}/lat/kaldi-lat.a ${kaldi_root_src}/lm/kaldi-lm.a \ 14 | ${kaldi_root_src}/fstext/kaldi-fstext.a ${kaldi_root_src}/hmm/kaldi-hmm.a \ 15 | ${kaldi_root_src}/transform/kaldi-transform.a ${kaldi_root_src}/gmm/kaldi-gmm.a \ 16 | ${kaldi_root_src}/tree/kaldi-tree.a ${kaldi_root_src}/util/kaldi-util.a ${kaldi_root_src}/matrix/kaldi-matrix.a \ 17 | ${kaldi_root_src}/base/kaldi-base.a 18 | 19 | 20 | TESTFILES = 21 | 22 | include ${kaldi_root_src}/makefiles/default_rules.mk 23 | --------------------------------------------------------------------------------
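The `src/fst-decoder/Makefile` above reuses Kaldi's build system: it includes `${KALDI_ROOT}/src/kaldi.mk` and links `latgen-faster` against the listed Kaldi static libraries. A hedged build sketch, assuming `KALDI_ROOT` points at an already-compiled Kaldi checkout (the path below is a placeholder):

```bash
# Sketch: build the latgen-faster decoder against an existing Kaldi build.
export KALDI_ROOT=/path/to/kaldi   # placeholder; must contain compiled src/*.a libraries
cd src/fst-decoder
make                               # produces the latgen-faster binary
```

Since the Makefile is modified from `kaldi/src/bin/Makefile`, the build mirrors how Kaldi's own binaries are compiled.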