├── .gitignore ├── .vscode ├── settings.json ├── sgm_hyper.json └── sgm_nn.json ├── LICENSE ├── README.md ├── assets ├── G.fst.png ├── JoinAP.png ├── L.fst(no NOISE).png ├── L.fst.png ├── ME2E.png ├── MVDR.png ├── PSD.png ├── T.fst(no NOISE).png ├── T.fst.png ├── TLG.fst.png ├── TLG.png ├── WFST.png ├── den.png ├── h_f.png ├── intellisense.gif ├── logo.png ├── loss.png ├── phonological_feature.png ├── pipeline_rnnt.png └── potential.png ├── cat ├── __init__.py ├── ctc │ ├── __init__.py │ ├── __main__.py │ ├── cal_logit.py │ ├── decode.py │ ├── decode_me2e.py │ ├── fst_decode.sh │ ├── train.py │ ├── train_me2e.py │ ├── train_me2e_chunk.py │ ├── train_me2e_kaldi.py │ ├── train_me2e_kaldi_chunk.py │ └── train_unified.py ├── front │ ├── beamformer_net.py │ ├── conv_beamformer.py │ ├── dnn_beamformer.py │ ├── dnn_wpe_new.py │ ├── filter_net.py │ ├── kaldifbank.py │ ├── log_mel.py │ ├── mask_estimator.py │ ├── multi2mono.py │ ├── nets_utils.py │ └── stft.py ├── lm │ ├── __init__.py │ ├── __main__.py │ ├── ppl.py │ ├── rescore.py │ ├── train.py │ └── trf │ │ ├── __init__.py │ │ ├── model.py │ │ └── train.py ├── rnnt │ ├── __init__.py │ ├── __main__.py │ ├── ctct_decoder.py │ ├── decode.py │ ├── joiner.py │ ├── rnnt_decoder.py │ ├── train.py │ └── train_unified.py ├── shared │ ├── __init__.py │ ├── _constants.py │ ├── coreutils.py │ ├── data.py │ ├── decoder.py │ ├── encoder.py │ ├── layer.py │ ├── manager.py │ ├── manager_simu.py │ ├── manager_wo.py │ ├── scheduler.py │ ├── simu_net.py │ ├── specaug.py │ └── tokenizer.py └── utils │ ├── __init__.py │ ├── avgmodel.py │ ├── cleandir.py │ ├── compat │ ├── repl_am_to_encoder.py │ └── update_transformer_lm.py │ ├── data │ ├── __init__.py │ ├── _data_prep_kaldi.py │ ├── corpus2index.py │ ├── data_prep.py │ ├── data_prep_kaldi.sh │ ├── exclude_corpus.py │ ├── pack_corpus.py │ ├── prep_wds.py │ ├── resolvedata.py │ └── text2nbest.py │ ├── lm │ ├── __init__.py │ ├── interpolate_nbests.py │ └── lmweight_search.py │ ├── parseopt.py │ ├── parseschema.py │ ├── pipeline │ ├── __init__.py │ ├── _constants.py │ ├── asr.py │ ├── common_utils.py │ ├── lm.py │ └── ngram.sh │ ├── plot_tb.py │ ├── tool │ ├── build_ctc_topo.py │ ├── build_decoding_graph.sh │ ├── get_prune_args.py │ ├── pack_audios.py │ ├── pack_audios_multi.py │ ├── prep_bigcidian.sh │ ├── prep_decoding_graph_materials.py │ ├── prep_den_lm.sh │ ├── prep_syllable_converter.py │ └── prep_wlm_lodr.sh │ └── wer.py ├── docs ├── ME2E_ASR_ch.md ├── configure_guide.md ├── contributing.md ├── cuside-array.md ├── cuside.md ├── cuside_ch.md ├── energy-based_LM_training.md ├── guide_for_third_party_tools.md ├── how_to_prepare_large_dataset.md ├── how_to_prepare_large_dataset_ch.md ├── joinap_tutorial_ch.md ├── significance_test.md ├── toolkitworkflow.md ├── whatsnew.md └── yesno_tutorial_ch.md ├── egs ├── IuMien │ ├── README-zh.md │ ├── README.md │ ├── cat │ ├── exp │ │ ├── Mono-phoneme │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp-monitor.png │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Mono-subword │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Mul10-sub-PT-sub-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── 
readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Wav2vec2-cv10-phoneme-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Wav2vec2-cv10-sub-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ ├── Whistle-phoneme-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp-monitor.png │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ ├── run.history.sh │ │ │ └── unpack_mulingual_param.py │ │ ├── Whistle-sub-FT │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor │ │ │ │ ├── exp-monitor.png │ │ │ │ ├── exp2-monitor.png │ │ │ │ ├── exp3-monitor.png │ │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── run.history.sh │ │ └── decode_lm │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── readme.md │ │ │ └── run.history.sh │ ├── exp_data │ │ ├── exp-1 │ │ │ ├── dev_data-1 │ │ │ └── train_data-1 │ │ ├── exp-2 │ │ │ ├── dev-2_data-2 │ │ │ ├── test-2_data-2 │ │ │ └── train-2_data-2 │ │ └── exp-3 │ │ │ ├── dev-3_data-3 │ │ │ ├── test-3_data-3 │ │ │ └── train-3_data-3 │ ├── exp_dict │ │ ├── lexicon │ │ ├── lexicon-2 │ │ └── lexicon-3 │ ├── local │ │ ├── bpe_wfst_run.sh │ │ ├── data_kaldi.sh │ │ ├── extract_fbank.py │ │ ├── fliter_data.py │ │ ├── get_lexicon.py │ │ ├── get_wordlist.py │ │ ├── lexicon_wfst_run.sh │ │ ├── process_model_for_subword_ft.py │ │ ├── split_data.py │ │ └── unpack_mulingual_param.py │ └── utils ├── TEMPLATE │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ │ ├── asr-ctc-crf │ │ │ ├── config.json │ │ │ ├── decode-lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ └── run.sh │ │ ├── asr-ctc-large-corpora │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ ├── run.sh │ │ │ └── tokenizer │ │ │ │ └── hyper-p.json │ │ ├── asr-ctc-lexicon │ │ │ ├── config.json │ │ │ ├── decode_lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ ├── hyper-p.json │ │ │ ├── readme.md │ │ │ └── run.sh │ │ ├── asr-ctc │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt-cuside │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── lm-ebm │ │ │ ├── cfg_aux.json │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ └── run.sh │ │ ├── lm-ngram-word │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ └── lm-nn │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── audio2ark.sh │ │ ├── data.sh │ │ ├── eval_fst_decode.sh │ │ ├── extract_feat.py │ │ ├── lm_data.sh │ │ ├── prep_wds.py │ │ └── significance_test.py │ └── utils ├── aishell │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ │ ├── ctc-crf-cuside │ │ │ ├── config.json │ │ │ ├── decode_lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ ├── run.sh │ │ │ └── run_lexicon.sh │ │ ├── ctc-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ 
├── monitor.png │ │ │ └── readme.md │ │ ├── ebm-lm │ │ │ ├── GN-ELM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── GN-ELM-ML │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── GN-ELM-NCE │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── TRF-LM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── config_trf.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ ├── lm │ │ │ ├── lm-v1-char-5gram │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ │ └── lm-v2-word-3gram │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ ├── rnnt-cuside │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ └── rnnt-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── data.sh │ │ ├── data_kaldi.sh │ │ ├── extract_fbank.py │ │ └── extract_meta_kaldi.sh │ └── utils ├── aishell4 │ ├── README.md │ ├── cat │ ├── exp │ │ ├── Exp1-SingalChannel_E2E │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp10~12-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID+simu_data) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp13-CUSIDE-Array(OOD)+Pre-trained_BE │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ ├── Exp2-SingalChannel_E2E+JT(CUSIDE) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp3-MultiChannel_E2E │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp4-MultiChannel_E2E+JT(CUSIDE-Array) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp5-CUSIDE-Array+real_right_ctx │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp6-CUSIDE-Array+simu_right_ctx │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── Exp7-CUSIDE+Pre-trained_BE │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ ├── Exp8-CUSIDE-Array+Pre-trained_BE │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── hyper-p_ori.json │ │ │ └── readme.md │ │ ├── Exp9-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID) │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── hyper-p_ori.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── ctc-e2e-chunk+simu │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ ├── right_context.png │ │ │ └── simu_right_context.png │ │ ├── ctc-e2e-chunk-kaldi │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ └── ctc-e2e-chunk │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── readme.md │ │ │ └── tokenizer.tknz │ ├── local │ │ ├── Statistical_Significance_Test_Tools │ │ │ ├── Readme.md │ │ │ ├── Readme_ch.md │ │ │ ├── cer.py │ │ │ ├── p_cal.bash │ │ │ └── significance_test.py │ │ ├── after_data_char_dealing.py │ │ ├── audio2ark_multi.sh │ │ ├── data_char_dealing.py │ │ ├── data_multi.sh │ │ ├── extract_fbank_multi.py │ │ ├── mix_gen.py │ │ └── ori_data_prep.py │ └── utils ├── commonvoice │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp 
│ │ ├── asr-ctc-russian │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt-chinese │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ ├── asr-rnnt-russian │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ └── joinap │ │ │ ├── decode-lm-indonesia │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ │ ├── decode-lm-russian │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ └── readme.md │ │ │ ├── finetune-id │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mono-indonesia-L │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mono-indonesia-NL │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mono-indonesia-flat │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── mul-ru+id-L │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ ├── prep_mul_pv.sh │ │ │ └── readme.md │ │ │ ├── readme.md │ │ │ └── unpack_mulingual_param.py │ ├── local │ │ ├── data.sh │ │ ├── data │ │ │ ├── bipa.txt │ │ │ └── ipa_extend.txt │ │ ├── get_ipa_mapping.py │ │ ├── make_fbank.py │ │ ├── prep_ipa_lexicon.sh │ │ ├── repl_nonIPA.py │ │ └── text_normalize.sh │ └── utils ├── cv-lang10 │ ├── cat │ ├── data │ │ └── metainfo.json │ ├── exp │ │ ├── Crosslingual │ │ │ ├── id │ │ │ │ ├── Multi._phoneme_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── 
Wav2vec-En_ft_phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── Wav2vec-lang10_ft_subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ ├── pl │ │ │ │ ├── Multi._phoneme_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._phoneme_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_10m │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Multi._subword_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ 
├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-En_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Wav2vec-lang10_ft_subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── Wav2vec-lang10_ft_subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ └── readme.md │ │ ├── Monolingual │ │ │ ├── en │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── es │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── fr │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── id │ │ │ │ ├── Mono._phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_20h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ ├── it │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── ky │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── nl │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── pl │ │ │ │ ├── Mono._phoneme_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── 
monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._phoneme_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_10h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_130h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ ├── Mono._subword_1h │ │ │ │ │ ├── config.json │ │ │ │ │ ├── hyper-p.json │ │ │ │ │ ├── monitor.png │ │ │ │ │ └── readme.md │ │ │ │ └── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ ├── readme.md │ │ │ ├── ru │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── sv-SE │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ ├── tr │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ │ ├── config.json │ │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── tt │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── lm │ │ │ │ ├── config.json │ │ │ │ └── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ └── Multilingual │ │ │ ├── Multi._phoneme_L │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Multi._phoneme_M │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Multi._phoneme_S │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Multi._subword │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── Wav2vec-lang10 │ │ │ ├── Wav2vec-lang10.yaml │ │ │ └── readme.md │ │ │ └── readme.md │ ├── lang-process │ │ ├── en │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── es │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── fr │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── id │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── it │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── ky │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── lang-process.md │ │ ├── nl │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── pl │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── ru │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── sv-SE │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ ├── tr │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ │ └── tt │ │ │ ├── lang_process.md │ │ │ ├── lexicon.sh │ │ │ └── text_norm.sh │ ├── local │ │ ├── data_prep.md │ │ ├── data_prep.sh │ │ ├── eval_fst_decode.sh │ │ ├── expect.py │ │ ├── parseopt.py │ │ └── tools │ │ │ ├── calculate_dur.py │ │ │ ├── char_list.py │ │ │ ├── get_ipa_mapping.py │ │ │ ├── phone_list.py │ │ │ ├── prep_ld.py │ │ │ ├── sample_data.py │ │ │ ├── subset.sh │ │ │ └── unpack_mulingual_param.py │ ├── readme.md │ ├── run.sh │ └── utils ├── libri │ ├── README.md │ ├── cat │ ├── exp │ │ ├── crf-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.jpg │ │ │ └── readme.md │ │ ├── lm │ │ │ └── lm-v1-transformer │ │ │ │ ├── config.json │ │ │ │ ├── 
hyper-p.json │ │ │ │ ├── monitor.jpg │ │ │ │ └── readme.md │ │ └── rnnt-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── data.sh │ │ ├── data_kaldi.sh │ │ ├── extract_fbank.py │ │ ├── extract_meta_kaldi.py │ │ ├── prep_lexicon.sh │ │ └── prep_libri_corpus.sh │ └── utils ├── wenetspeech │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ │ ├── ebm-lm │ │ │ ├── GN-ELM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_ebm.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── TRF-LM-DNCE │ │ │ │ ├── config.json │ │ │ │ ├── config_noise.json │ │ │ │ ├── config_trf.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ ├── lm │ │ │ ├── lm-trans-l │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ │ └── lm-trans-m │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ └── readme.md │ │ ├── train_l │ │ │ ├── crf-v1 │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ │ └── rnnt-v1 │ │ │ │ ├── config.json │ │ │ │ ├── hyper-p.json │ │ │ │ ├── monitor.png │ │ │ │ └── readme.md │ │ └── train_m │ │ │ ├── crf-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ ├── ctc-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ │ │ └── rnnt-v1 │ │ │ ├── config.json │ │ │ ├── hyper-p.json │ │ │ ├── monitor.png │ │ │ └── readme.md │ ├── local │ │ ├── data_kaldi.sh │ │ ├── extract_meta.py │ │ └── wenetspeech_data_prep.sh │ └── utils └── wsj │ ├── .vscode │ ├── README.md │ ├── cat │ ├── exp │ ├── asr-ctc-crf-phone │ │ ├── config.json │ │ ├── decode_lm │ │ │ ├── config.json │ │ │ └── hyper-p.json │ │ ├── hyper-p.json │ │ ├── monitor.png │ │ ├── readme.md │ │ └── run.sh │ ├── asr-ctc-phone │ │ ├── config.json │ │ ├── decode_lm │ │ │ ├── config.json │ │ │ └── hyper-p.json │ │ ├── hyper-p.json │ │ ├── monitor.png │ │ ├── readme.md │ │ └── run.sh │ └── asr-rnnt-bpe │ │ ├── config.json │ │ ├── hyper-p.json │ │ ├── monitor.png │ │ └── readme.md │ ├── local │ └── data_kaldi.sh │ └── utils ├── install.sh ├── requirements.txt ├── setup.py └── src ├── ctc_crf ├── Makefile ├── binding.cpp ├── ctc_crf │ └── __init__.py ├── gpu_ctc │ ├── CMakeLists.txt │ ├── LICENSE │ ├── README.txt │ ├── contrib │ │ └── moderngpu │ │ │ ├── LICENSE │ │ │ └── include │ │ │ ├── device │ │ │ ├── ctaloadbalance.cuh │ │ │ ├── ctamerge.cuh │ │ │ ├── ctascan.cuh │ │ │ ├── ctasearch.cuh │ │ │ ├── ctasegreduce.cuh │ │ │ ├── ctasegscan.cuh │ │ │ ├── ctasegsort.cuh │ │ │ ├── ctasortedsearch.cuh │ │ │ ├── devicetypes.cuh │ │ │ ├── deviceutil.cuh │ │ │ ├── intrinsics.cuh │ │ │ ├── loadstore.cuh │ │ │ ├── serialsets.cuh │ │ │ └── sortnetwork.cuh │ │ │ ├── mgpudevice.cuh │ │ │ ├── mgpuenums.h │ │ │ └── util │ │ │ └── static.h │ ├── ctc.h │ ├── ctc_entrypoint.cu │ ├── ctc_helper.h │ ├── gpu_ctc.h │ ├── gpu_ctc_kernels.h │ └── hostdevice.h ├── gpu_den │ ├── CMakeLists.txt │ ├── den_calculate.cu │ └── fst_read.cc ├── setup.py └── test │ ├── den_lm.fst │ └── main.py ├── fst-decoder ├── Makefile └── latgen-faster.cc └── g2p-tool └── build.sh /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "json.schemas": [ 3 | { 4 | "fileMatch": [ 5 | "exp/**/config.json" 6 | ], 7 | "url": ".vscode/sgm_nn.json" 8 | }, 9 | { 10 | "fileMatch": [ 11 | "exp/**/hyper-p.json" 12 | ], 13 | "url": ".vscode/sgm_hyper.json" 14 | } 
15 | ] 16 | } -------------------------------------------------------------------------------- /assets/G.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/G.fst.png -------------------------------------------------------------------------------- /assets/JoinAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/JoinAP.png -------------------------------------------------------------------------------- /assets/L.fst(no NOISE).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/L.fst(no NOISE).png -------------------------------------------------------------------------------- /assets/L.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/L.fst.png -------------------------------------------------------------------------------- /assets/ME2E.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/ME2E.png -------------------------------------------------------------------------------- /assets/MVDR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/MVDR.png -------------------------------------------------------------------------------- /assets/PSD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/PSD.png -------------------------------------------------------------------------------- /assets/T.fst(no NOISE).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/T.fst(no NOISE).png -------------------------------------------------------------------------------- /assets/T.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/T.fst.png -------------------------------------------------------------------------------- /assets/TLG.fst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/TLG.fst.png -------------------------------------------------------------------------------- /assets/TLG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/TLG.png -------------------------------------------------------------------------------- /assets/WFST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/WFST.png 
-------------------------------------------------------------------------------- /assets/den.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/den.png -------------------------------------------------------------------------------- /assets/h_f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/h_f.png -------------------------------------------------------------------------------- /assets/intellisense.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/intellisense.gif -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/logo.png -------------------------------------------------------------------------------- /assets/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/loss.png -------------------------------------------------------------------------------- /assets/phonological_feature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/phonological_feature.png -------------------------------------------------------------------------------- /assets/pipeline_rnnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/pipeline_rnnt.png -------------------------------------------------------------------------------- /assets/potential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/assets/potential.png -------------------------------------------------------------------------------- /cat/__init__.py: -------------------------------------------------------------------------------- 1 | """Transducer/CTC/CRF/LM training/inference tool 2 | """ 3 | 4 | from . import ctc 5 | from . import lm 6 | from . import rnnt 7 | from . import shared 8 | from . import utils 9 | -------------------------------------------------------------------------------- /cat/ctc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 
3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """CTC-related modules 6 | """ 7 | 8 | 9 | from .train import build_model as ctc_builder 10 | 11 | __all__ = [ctc_builder] 12 | -------------------------------------------------------------------------------- /cat/ctc/__main__.py: -------------------------------------------------------------------------------- 1 | from .train import main 2 | main() 3 | -------------------------------------------------------------------------------- /cat/lm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """LM-related modules 6 | """ 7 | 8 | from .train import build_model as lm_builder 9 | 10 | __all__ = [lm_builder] 11 | -------------------------------------------------------------------------------- /cat/lm/__main__.py: -------------------------------------------------------------------------------- 1 | from .train import main 2 | main() 3 | -------------------------------------------------------------------------------- /cat/lm/trf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/lm/trf/__init__.py -------------------------------------------------------------------------------- /cat/rnnt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """RNN-Transducer related module 6 | """ 7 | 8 | 9 | from .train import build_model as rnnt_builder 10 | -------------------------------------------------------------------------------- /cat/rnnt/__main__.py: -------------------------------------------------------------------------------- 1 | from .train import main 2 | main() 3 | -------------------------------------------------------------------------------- /cat/shared/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tsinghua University 2 | # Apache 2.0. 3 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 4 | 5 | """Common files and variables for different trainer. 6 | """ 7 | 8 | from . import tokenizer 9 | from . import scheduler 10 | from . import manager 11 | from . import layer 12 | from . import encoder 13 | from . import decoder 14 | from . import data 15 | from . 
import coreutils 16 | from .specaug import SpecAug 17 | from .manager import Manager 18 | -------------------------------------------------------------------------------- /cat/shared/_constants.py: -------------------------------------------------------------------------------- 1 | """Declare all used global constants (like file names)""" 2 | 3 | # number of utterances per file for wds 4 | UTTS_PER_FILE = 2048 5 | 6 | # folder 7 | D_CHECKPOINT = "check" 8 | D_LOG = "log" 9 | D_INFER = "decode" 10 | D_TMP = "tmp" 11 | D_CACHE = ".cache" 12 | 13 | # file 14 | ## configurations 15 | F_NN_CONFIG = "config.json" 16 | F_HYPER_CONFIG = "hyper-p.json" 17 | 18 | ## monitor/log related 19 | F_MONITOR_FIG = "monitor.png" 20 | 21 | ## checkpoint 22 | F_CHECKPOINT_LIST = "checkpoint.list" 23 | 24 | ## others 25 | F_TOKENIZER = 'tokenizer.tknz' 26 | F_TRAINING_INFO = "readme.md" 27 | F_DATAINFO = "data/metainfo.json" 28 | 29 | # schema 30 | SCHEMA_NN_CONFIG = "sgm_nn.json" 31 | SCHEMA_HYPER_CONFIG = "sgm_hyper.json" 32 | -------------------------------------------------------------------------------- /cat/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/__init__.py -------------------------------------------------------------------------------- /cat/utils/compat/repl_am_to_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | In previous versions, the encoder in the CTC trainer was named 'am', so the 3 | parameter names were like 'module.am.xxx'. 4 | 5 | Now 'am' has been renamed to 'encoder' to be consistent with other non-CTC models. 6 | So we need to replace 'am' with 'encoder' to allow loading 7 | models from previous checkpoints.
8 | 9 | Usage: 10 | python utils/compat/repl_am_to_encoder.py /path/to/checkpoint.pt 11 | """ 12 | import torch 13 | import sys 14 | import os 15 | from collections import OrderedDict 16 | 17 | if __name__ == "__main__": 18 | if len(sys.argv[1:]) != 1: 19 | raise RuntimeError("Require one argument to specify the checkpoint.") 20 | 21 | file = sys.argv[1] 22 | assert os.path.isfile(file), file 23 | 24 | check = torch.load(file, "cpu") 25 | m = check["model"] 26 | newdict = OrderedDict() 27 | 28 | for k, v in m.items(): 29 | newdict[k.replace(".am.", ".encoder.")] = v 30 | 31 | check["model"] = newdict 32 | torch.save(check, file) 33 | -------------------------------------------------------------------------------- /cat/utils/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/data/__init__.py -------------------------------------------------------------------------------- /cat/utils/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/lm/__init__.py -------------------------------------------------------------------------------- /cat/utils/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/cat/utils/pipeline/__init__.py -------------------------------------------------------------------------------- /cat/utils/pipeline/_constants.py: -------------------------------------------------------------------------------- 1 | ../../shared/_constants.py -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Guidelines for contributing 2 | 3 | ## Add dependencies 4 | 5 | If your contribution relies on external dependencies (such as the Python module `matplotlib`), you need to ensure that they are properly installed. 6 | 7 | - For Python modules that `cat` relies on, add them to [requirements.txt](../requirements.txt) 8 | - Some modules have special installation processes (like `kenlm`); for these, add the installation to [install.sh](../install.sh), where you'll: 9 | 1. add the new module to the `choices` list of the parser 10 | 2. add the installation process in `exc_install()` 11 | 3. add the uninstallation process in `exc_rm()` -------------------------------------------------------------------------------- /docs/significance_test.md: -------------------------------------------------------------------------------- 1 | # Significance Test 2 | 3 | To see whether the difference between two experiments is significant, we need to conduct a significance test and calculate the $p$-value. If we set the significance level $\alpha=0.05$ (typical values are 0.05, 0.01 and 0.001), then all experiment pairs with a $p$-value less than 0.05 are considered significantly different. 4 | 5 | ```bash 6 | # in egs/xxx/ 7 | python ../TEMPLATE/local/significance_test.py ${result_path1} ${result_path2} --method mp 8 | ``` 9 | 10 | `result_path1` and `result_path2` denote the metric values on all the test samples extracted from the results of the two experiments.
`--method mp` denotes the matched-pairs test; you can also set `--method mc`, which denotes the McNemar test. Note that the metric values can only be 0 or 1 for the McNemar test. 11 | 12 | ### References 13 | 14 | L. Gillick and S. J. Cox, “Some statistical issues in the comparison of speech recognition algorithms,” in International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 1989, pp. 532–535. -------------------------------------------------------------------------------- /egs/IuMien/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 54 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/exp-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-phoneme/monitor/exp-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-phoneme/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-phoneme/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of loss and learning rate in three independent cross-validation runs is shown below.
5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 89.99 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | Phone modeling, using Mien language data for training from scratch. 12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | ### Result 18 | 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 20 | 21 | | | PER | WER | 22 | |---| ---|--- | 23 | | exp1 | 4.04 | 4.44 | 24 | | exp2 | 5.18 | 5.64 | 25 | | exp3 | 3.45 | 3.99 | 26 | | avg-3 | 4.22 | 4.69 | 27 | 28 | 29 | ### Training process monitor 30 | 31 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 32 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-phoneme/run.history.sh: -------------------------------------------------------------------------------- 1 | # train model 2 | # python utils/pipeline/asr.py exp/Mono-phoneme --sta 1 --sto 3 3 | 4 | # test model per 5 | # python utils/pipeline/asr.py exp/Mono-phoneme --sta 4 --sto 4 6 | 7 | 8 | # test model wer 9 | # First, you need to modify the hyper-p.json file. 10 | # "infer": { 11 | # "bin": "cat.ctc.cal_logit", 12 | # "option": { 13 | # "beam_size": 32, 14 | # "nj": 16, 15 | # "store_ark": true 16 | # } 17 | # }, 18 | # python utils/pipeline/asr.py exp/Mono-phoneme --sta 4 --sto 4 19 | # bash exp/lexicon_wfst_run.sh --exp_dir exp/Mono-phoneme --lm_dir exp/decode_lm --dataset_name test 20 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-subword/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-subword/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mono-subword/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.22 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | BPE modeling, using Mien language data for training from scratch. 12 | 13 | 14 | ### How to run exp 15 | 16 | Please refer to the [`run.history.sh`](./run.history.sh) 17 | 18 | ### Result 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 20 | 21 | | | WER w/o LM| WER with LM | 22 | |---| ---|--- | 23 | | exp1 | 9.80 | 7.11 | 24 | | exp2 | 10.04 | 7.04 | 25 | | exp3 | 9.29 | 6.46 | 26 | | avg-3 | 9.71 | 6.87 | 27 | 28 | 29 | ### training process monitor 30 | 31 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 32 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mono-subword/run.history.sh: -------------------------------------------------------------------------------- 1 | # train model 2 | # python utils/pipeline/asr.py exp2/Mono-subword --sta 1 --sto 3 3 | # decode w/o lm 4 | # python utils/pipeline/asr.py exp2/Mono-subword --sta 4 --sto 4 5 | 6 | 7 | # decode with lm 8 | # cal_logit 9 | # First, you need to modify the hyper-p.json file. 
10 | # "infer": { 11 | # "bin": "cat.ctc.cal_logit", 12 | # "option": { 13 | # "beam_size": 32, 14 | # "nj": 16, 15 | # "store_ark": true 16 | # } 17 | # }, 18 | # python utils/pipeline/asr.py exp/Mono-subword --sta 4 --sto 4 19 | # to decode 20 | # bash exp/bpe_wfst_run.sh --exp_dir exp/Mono-subword --lm_dir exp/decode_lm --word_list dict/word_list --dataset_name test 21 | 22 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Mul10-sub-PT-sub-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.22 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | BPE modeling, fine-tuning with Mien language data based on a pretrained model with subwords from cv-10. 
12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | ### Result 18 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 19 | 20 | | | WER w/o LM| WER with LM | 21 | |---| ---|--- | 22 | | exp1 | 4.18 | 3.42 | 23 | | exp2 | 4.79 | 3.92 | 24 | | exp3 | 4.02 | 3.05 | 25 | | avg-3 | 4.33 | 3.46 | 26 | 27 | ### training process monitor 28 | 29 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 30 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-phoneme-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.21 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | Phone modeling, fine-tuning with Mien language data based on the Wav2vec2-cv10 pretrained model. 12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | ### Result 18 | 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 
20 | 21 | | | PER | WER | 22 | |---| ---|--- | 23 | | exp1 | 2.40 | 2.71 | 24 | | exp2 | 2.82 | 3.06 | 25 | | exp3 | 2.39 | 2.53 | 26 | | avg-3 | 2.53 | 2.76 | 27 | 28 | ### training process monitor 29 | 30 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 31 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 90.55 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | BPE modeling, fine-tuning with Mien language data based on the Wav2Vec2-cv10 pretrained model. 12 | 13 | ### How to run exp 14 | 15 | Please refer to the [`run.history.sh`](./run.history.sh) 16 | 17 | 18 | ### Result 19 | 20 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 
21 | 22 | | | WER w/o LM| WER with LM | 23 | |---| ---|--- | 24 | | exp1 | 3.75 | 3.16 | 25 | | exp2 | 4.08 | 3.33 | 26 | | exp3 | 3.47 | 2.69 | 27 | | avg-3 | 3.76 | 3.06 | 28 | 29 | ### training process monitor 30 | 31 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 32 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Wav2vec2-cv10-sub-FT/run.history.sh: -------------------------------------------------------------------------------- 1 | # Firstly,you could download pretrain model from https://github.com/thu-spmi/CAT/tree/master/egs/cv-lang10/exp/Multilingual/Wav2vec-lang10 2 | # and then we should modify pt model classfier layer 3 | # python local/process_model_for_subword_ft.py --pt_model_path --output_model_path --vocab_size 4 | 5 | 6 | # train 7 | # python utils/pipeline/asr.py exp/Wav2vec2-cv10-sub-FT --sta 1 --sto 3 8 | # decode w/o lm 9 | # python utils/pipeline/asr.py exp/Wav2vec2-cv10-sub-FT --sta 4 --sto 4 10 | 11 | # decode with lm 12 | # First, you need to modify the hyper-p.json file. 13 | # "infer": { 14 | # "bin": "cat.ctc.cal_logit", 15 | # "option": { 16 | # "beam_size": 32, 17 | # "nj": 16, 18 | # "store_ark": true 19 | # } 20 | # }, 21 | # cal_logit 22 | # python utils/pipeline/asr.py exp/Wav2vec2-cv10-sub-FT --sta 4 --sto 4 23 | # to decode 24 | # bash exp/bpe_wfst_run.sh --exp_dir exp/Wav2vec2-cv10-sub-FT --lm_dir exp/decode_lm --word_list dict/word_list-2 --dataset_name test-2_raw 25 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 54 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 10, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-phoneme-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-phoneme-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 89.99 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 4090 8 | 9 | ### Notes 10 | 11 | Phone modeling, fine-tuning with Mien language data based on the Whistle-small pretrained model. 12 | 13 | 14 | ### How to run exp 15 | 16 | Please refer to the [`run.history.sh`](./run.history.sh) 17 | 18 | ### Result 19 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 20 | 21 | | | PER | WER | 22 | |---| ---|--- | 23 | | exp1 | 2.45 | 2.93 | 24 | | exp2 | 2.65 | 3.08 | 25 | | exp3 | 2.13 | 2.38 | 26 | | avg-3 | 2.41 | 2.71 | 27 | 28 | ### training process monitor 29 | 30 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 31 | -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 2020, 24 | "n_tol": 8, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/monitor/exp-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-sub-FT/monitor/exp-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/monitor/exp2-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-sub-FT/monitor/exp2-monitor.png -------------------------------------------------------------------------------- 
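One reading of the `SchedulerEarlyStop` fields used throughout these configs: train at the given Adam learning rate, multiply it by `gamma` after `n_tol` evaluations without improvement, and stop once it falls below `stop_lr`. This is an assumption about the field semantics rather than a description of CAT's scheduler code; a toy sketch:

```python
# Toy decay-on-plateau schedule mirroring the config fields (assumed semantics).
# The min_step warm-up period before early stopping applies is omitted here.
class EarlyStopLR:
    def __init__(self, lr=3e-5, n_tol=8, gamma=0.5, stop_lr=1e-6):
        self.lr, self.n_tol, self.gamma, self.stop_lr = lr, n_tol, gamma, stop_lr
        self.best, self.bad = float("inf"), 0

    def step(self, dev_loss):
        if dev_loss < self.best:
            self.best, self.bad = dev_loss, 0
        else:
            self.bad += 1
            if self.bad >= self.n_tol:   # n_tol evaluations without improvement
                self.lr *= self.gamma
                self.bad = 0
        return self.lr >= self.stop_lr   # False -> stop training

sched = EarlyStopLR()
for loss in [1.0, 0.9] + [0.95] * 40:
    if not sched.step(loss):
        print("stopped at lr", sched.lr)
        break
```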
/egs/IuMien/exp/Whistle-sub-FT/monitor/exp3-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/IuMien/exp/Whistle-sub-FT/monitor/exp3-monitor.png -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/monitor/readme.md: -------------------------------------------------------------------------------- 1 | # training process monitor 2 | 3 | 4 | The variation of Loss and learning rate in three independent cross-validation runs are shown below. 5 | 6 | 7 | | training process | 8 | |:-----------------------:| 9 | |![tb-plot](./exp-monitor.png)| 10 | |![tb-plot](./exp2-monitor.png)| 11 | |![tb-plot](./exp3-monitor.png)| -------------------------------------------------------------------------------- /egs/IuMien/exp/Whistle-sub-FT/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | * \# of parameters (million): 90.22 4 | * GPU info \[1\] 5 | * \[1\] NVIDIA GeForce RTX 4090 6 | 7 | ### Notes 8 | 9 | BPE modeling, fine-tuning with Mien language data based on the Whistle-small pretrained model. 10 | 11 | ### How to run exp 12 | 13 | Please refer to the [`run.history.sh`](./run.history.sh) 14 | 15 | ### Result 16 | 17 | We did three independent experiments, and the results of each independent experiment on its corresponding test set are as follows. 18 | 19 | | | WER w/o LM| WER with LM | 20 | |---| ---|--- | 21 | | exp1 | 3.17 | 2.88 | 22 | | exp2 | 3.71 | 3.29 | 23 | | exp3 | 3.04 | 2.70 | 24 | | avg-3 | 3.30 | 2.95 | 25 | 26 | ### training process monitor 27 | 28 | During the training process, the loss change curve can be seen in the [training process monitor](./monitor/readme.md) 29 | -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "gram_order": 4, 6 | "f_binlm": "exp2/decode_lm/4gram.arpa", 7 | "num_classes": 1549 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train", 4 | "dev": "dev", 5 | "test": "test" 6 | }, 7 | "tokenizer": { 8 | "type": "SimpleTokenizer", 9 | "option-init": { 10 | "dmap": "/home/dlk/code/asr/cat/egs/MightLJSpeech/dict/word_list" 11 | }, 12 | "|V|": 1549, 13 | "file": "exp2/decode_lm/tokenizer.tknz" 14 | }, 15 | "inference": {}, 16 | "commit": "618a15f70780200cdc42eed3f69f6ce1d61a4e61" 17 | } -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp2/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 3 --sto 3 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 3.2MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /egs/IuMien/exp/decode_lm/run.history.sh: -------------------------------------------------------------------------------- 1 | # you need to run python 
local/get_wordlist.py to get word_list if you don't have word list 2 | # train tokenizer and pickle data 3 | utils/pipeline/ngram.sh exp/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 1 --sto 2 4 | # train lm 5 | utils/pipeline/ngram.sh exp/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 3 --sto 3 6 | # test lm 7 | # utils/pipeline/ngram.sh exp2/decode_lm -o 4 --arpa --output exp2/decode_lm/4gram.arpa --sta 4 --sto 4 -------------------------------------------------------------------------------- /egs/IuMien/local/get_wordlist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: DongLukuan (330293721@qq.com) 2 | 3 | text_path = './data/src/train/text' 4 | 5 | word_set = set() 6 | with open(text_path,'r',encoding='utf-8') as f: 7 | for line in f: 8 | try: 9 | ids,sentence = line.strip().split('\t') 10 | except: 11 | print(line.strip().split('\t')) 12 | word_set.update(sentence.split(' ')) 13 | # word_set(set(sentence.split(' '))) 14 | word_set = sorted(word_set) 15 | 16 | word_list_path = './dict/word_list' 17 | with open(word_list_path,'w',encoding='utf-8') as f: 18 | for word in word_set: 19 | f.write(word+'\n') -------------------------------------------------------------------------------- /egs/IuMien/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/TEMPLATE/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/TEMPLATE/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "use_crf": true, 4 | "lamb": 0.1, 5 | "den_lm": "exp/asr-ctc-crf/den_lm.fst", 6 | "decoder": { 7 | "beam_size": 4, 8 | "num_classes": 8 9 | } 10 | }, 11 | "encoder": { 12 | "type": "LSTM", 13 | "kwargs": { 14 | "bidirectional": true, 15 | "dropout": 0.2, 16 | "hdim": 512, 17 | "idim": 80, 18 | "num_layers": 2, 19 | "num_classes": 8, 20 | "with_head": true 21 | } 22 | }, 23 | "scheduler": { 24 | "type": "SchedulerCosineAnnealing", 25 | "kwargs": { 26 | "min_lr": 0.0001, 27 | "stop_step": 200 28 | }, 29 | "optimizer": { 30 | "type": "Adam", 31 | "kwargs": { 32 | "lr": 0.001 33 | }, 34 | "zeroredundancy": true 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/decode-lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-crf/decode-lm/3gram.bin", 6 | "gram_order": 3, 7 | "num_classes": 8 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/decode-lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno", 4 | "dev": "yesno", 5 | "test": "yesno" 6 | }, 7 | "tokenizer": { 8 | "|V|": 8, 9 | "file": "exp/asr-ctc-crf/tokenizer.tknz" 10 | }, 11 | "inference": {} 12 | } 
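The `NGram` decoder config above points at a binary LM built by `utils/pipeline/ngram.sh`; the "probing" data structure mentioned in the n-gram readmes indicates a KenLM binary, so it can be inspected directly with the `kenlm` Python bindings. A small query sketch (assumes the binary has already been built and `kenlm` is installed; the sentence is just a toy example):

```python
# Query the binary n-gram LM referenced by f_binlm in the decode-lm config.
import kenlm

lm = kenlm.Model("exp/asr-ctc-crf/decode-lm/3gram.bin")
print(lm.score("yes no yes no", bos=True, eos=True))  # log10 probability of the sentence
print(lm.perplexity("yes no yes no"))                 # per-token perplexity
```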
-------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-ctc-crf/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-crf/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 8.74 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * example of training CTC-CRF model, including den-lm preparation. 12 | 13 | ### Result 14 | ``` 15 | yesno %SER 100.00 | %WER 66.04 [ 317 / 480, 0 ins, 302 del, 15 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "decoder": { 4 | "beam_size": 4, 5 | "num_classes": 4 6 | } 7 | }, 8 | "encoder": { 9 | "type": "LSTM", 10 | "kwargs": { 11 | "bidirectional": true, 12 | "proj_size": 128, 13 | "hdim": 256, 14 | "idim": 80, 15 | "num_layers": 3, 16 | "num_classes": 4 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "SchedulerCosineAnnealing", 21 | "kwargs": { 22 | "min_lr": 1e-05, 23 | "stop_step": 300 24 | }, 25 | "optimizer": { 26 | "type": "Adam", 27 | "kwargs": { 28 | "lr": 0.001 29 | } 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "dev": "yesno", 4 | "test": "yesno" 5 | }, 6 | "tokenizer": { 7 | "type": "SentencePieceTokenizer", 8 | "file": "exp/asr-ctc-large-corpora/tokenizer.tknz" 9 | }, 10 | "env": { 11 | "CUDA_VISIBLE_DEVICES": "0" 12 | }, 13 | "train": { 14 | "bin": "cat.ctc.train", 15 | "option": { 16 | "amp": false, 17 | "batch_size": 1, 18 | "eval_error_rate": true, 19 | "ld": "data/wds/10_1000/*.tar", 20 | "check_freq": 120 21 | } 22 | }, 23 | "inference": { 24 | "infer": { 25 | "bin": "cat.ctc.decode", 26 | "option": { 27 | "beam_size": 16, 28 | "nj": 2 29 | } 30 | }, 31 | "er": {} 32 | }, 33 | "commit": "9bb2af8441e590ebf522e24924284f8f994c54c7" 34 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-ctc-large-corpora/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 2.21 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * this template shows how to deal with very large corpora. 
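The `ld` option in the hyper-parameters above streams training samples from tar shards written by `local/prep_wds.py`, so the full set never has to fit in memory. Assuming those shards follow the WebDataset convention (which the script name suggests, but is not confirmed here), they can be iterated lazily like this; the keys inside each sample depend on how the packing script writes them:

```python
# Sketch: iterate sharded training data lazily (WebDataset-format assumption).
import glob
import webdataset as wds

shards = sorted(glob.glob("data/wds/10_1000/*.tar"))  # same pattern as the "ld" option
dataset = wds.WebDataset(shards)
for i, sample in enumerate(dataset):
    print(sample["__key__"], sorted(sample.keys()))   # per-sample entries
    if i == 2:
        break
```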
12 | 13 | ### Result 14 | ``` 15 | yesno %SER 100.00 | %WER 64.79 [ 311 / 480, 0 ins, 300 del, 11 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This is an example of processing and training 4 | # ... very large corpora. In this example, we 5 | # ... assume the 'train' set is too large to fit into memory. 6 | set -e -u 7 | 8 | dir=$(dirname $0) 9 | 10 | [ ! -f $dir/.processed_data.done ] && { 11 | bash local/data.sh 12 | 13 | python local/prep_wds.py >/dev/null || exit 1 14 | 15 | touch $dir/.processed_data.done 16 | } 17 | 18 | # train tokenizer 19 | python utils/pipeline/asr.py \ 20 | $dir/tokenizer \ 21 | --sto 1 || exit 1 22 | 23 | # finish following steps 24 | # NOTE: 25 | # with --ld in train:option, the epoch id will always 26 | # be 1. However, you can estimate the #epochs 27 | # according to #steps by 28 | # #epochs = #steps * batch_size / #total_utts 29 | python utils/pipeline/asr.py \ 30 | $dir --sta 2 || exit 1 31 | 32 | echo "$0 done" 33 | exit 0 34 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-large-corpora/tokenizer/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno" 4 | }, 5 | "tokenizer": { 6 | "type": "SentencePieceTokenizer", 7 | "option-train": { 8 | "model_type": "word", 9 | "use_all_vocab": true, 10 | "vocab_size": 4, 11 | "model_prefix": "sentencepiece/yesno_word/spm" 12 | }, 13 | "file": "exp/asr-ctc-large-corpora/tokenizer.tknz" 14 | } 15 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "decoder": { 4 | "beam_size": 16, 5 | "num_classes": 7 6 | } 7 | }, 8 | "encoder": { 9 | "type": "LSTM", 10 | "kwargs": { 11 | "bidirectional": true, 12 | "dropout": 0.2, 13 | "hdim": 512, 14 | "idim": 80, 15 | "num_layers": 2, 16 | "num_classes": 7, 17 | "with_head": true 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerCosineAnnealing", 22 | "kwargs": { 23 | "min_lr": 0.0001, 24 | "stop_step": 200 25 | }, 26 | "optimizer": { 27 | "type": "Adam", 28 | "kwargs": { 29 | "lr": 0.001 30 | }, 31 | "zeroredundancy": true 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-lexicon/decode_lm/2gram.arpa", 6 | "gram_order": 2, 7 | "num_classes": 4 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno" 4 | }, 5 | "tokenizer": { 6 | "type": "SimpleTokenizer", 7 | "option-init": { 8 | "dmap": "exp/asr-ctc-lexicon/local/lexicon.txt", 9 | "read_index_from_file": false 10 | }, 11 | "|V|": 4, 12 | "file": "exp/asr-ctc-lexicon/decode_lm/tokenizer.tknz" 13 | }, 14 | 
"inference": {}, 15 | "commit": "d43b70416911b47882f6f360ec41add206a2fb1d" 16 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc-lexicon/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 8.74 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * CTC loss with lexicon 12 | 13 | ### Result 14 | ``` 15 | exp/asr-ctc-lexicon/decode/yesno/text_ac1.0_lm0.2_wip0.0.hyp %SER 100.00 | %CER 85.14 [ 1031 / 1211, 0 ins, 1031 del, 0 sub ] 16 | ``` 17 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "decoder": { 4 | "beam_size": 4, 5 | "num_classes": 4 6 | } 7 | }, 8 | "encoder": { 9 | "type": "LSTM", 10 | "kwargs": { 11 | "bidirectional": true, 12 | "proj_size": 128, 13 | "hdim": 256, 14 | "idim": 80, 15 | "num_layers": 3, 16 | "num_classes": 4 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "SchedulerCosineAnnealing", 21 | "kwargs": { 22 | "min_lr": 1e-05, 23 | "stop_step": 300 24 | }, 25 | "optimizer": { 26 | "type": "Adam", 27 | "kwargs": { 28 | "lr": 0.001 29 | } 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno", 4 | "dev": "yesno", 5 | "test": "yesno", 6 | "filter": "10:2000" 7 | }, 8 | "tokenizer": { 9 | "type": "SentencePieceTokenizer", 10 | "option-train": { 11 | "model_type": "word", 12 | "use_all_vocab": true, 13 | "vocab_size": 4, 14 | "model_prefix": "sentencepiece/yesno_word/spm" 15 | } 16 | }, 17 | "env": { 18 | "CUDA_VISIBLE_DEVICES": "0" 19 | }, 20 | "train": { 21 | "bin": "cat.ctc.train", 22 | "option": { 23 | "amp": false, 24 | "batch_size": 1, 25 | "eval_error_rate": true 26 | } 27 | }, 28 | "inference": { 29 | "infer": { 30 | "bin": "cat.ctc.decode", 31 | "option": { 32 | "beam_size": 16, 33 | "nj": 2 34 | } 35 | }, 36 | "er": {} 37 | } 38 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-ctc/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-ctc/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 2.21 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * template of training a ctc model on yesno corpora. 
12 | 13 | ### Result 14 | ``` 15 | yesno %SER 100.00 | %WER 55.83 [ 268 / 480, 0 ins, 241 del, 27 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt-cuside/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-rnnt-cuside/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt-cuside/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 4.90 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * chunk and streaming experiment 12 | * use LSTM as encoder and there is no subsampling in the encoder. 13 | 14 | ### Result 15 | ``` 16 | yesno %SER 100.00 | %WER 67.92 [ 326 / 480, 0 ins, 317 del, 9 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "yesno", 4 | "dev": "yesno", 5 | "test": "yesno", 6 | "filter": "10:2000" 7 | }, 8 | "tokenizer": { 9 | "type": "SentencePieceTokenizer", 10 | "option-train": { 11 | "model_type": "char", 12 | "use_all_vocab": true, 13 | "vocab_size": 8, 14 | "model_prefix": "sentencepiece/yesno_char/spm" 15 | } 16 | }, 17 | "env": { 18 | "CUDA_VISIBLE_DEVICES": "0" 19 | }, 20 | "train": { 21 | "bin": "cat.rnnt.train", 22 | "option": { 23 | "amp": true, 24 | "batch_size": 4, 25 | "check_freq": 50 26 | } 27 | }, 28 | "inference": { 29 | "infer": { 30 | "bin": "cat.rnnt.decode", 31 | "option": { 32 | "beam_size": 16, 33 | "cpu": true, 34 | "nj": 4 35 | } 36 | }, 37 | "er": {} 38 | } 39 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/asr-rnnt/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/asr-rnnt/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 3.20 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * template experiment using RNN-T loss to train on the yesno data. 
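For orientation, the shape logic behind RNN-T training is that encoder frames and prediction-network states are combined into a T x (U+1) x V joint lattice before the transducer loss. The sketch below is the generic formulation, not necessarily the exact joiner in `cat/rnnt/joiner.py`:

```python
# Toy sketch of the RNN-T joint network shapes (generic RNN-T).
import torch

T, U, H, V = 6, 3, 8, 4          # frames, label length, hidden dim, vocab (incl. blank)
enc = torch.randn(T, H)          # acoustic encoder output
pred = torch.randn(U + 1, H)     # prediction network output (with start state)
joint = torch.tanh(enc[:, None, :] + pred[None, :, :])   # broadcast to (T, U+1, H)
logits = torch.nn.Linear(H, V)(joint)
print(logits.shape)              # torch.Size([6, 4, 4])
```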
12 | 13 | ### Result 14 | ``` 15 | yesno %SER 98.33 | %WER 48.96 [ 235 / 480, 25 ins, 25 del, 185 sub ] 16 | ``` 17 | 18 | | training process | 19 | |:-----------------------:| 20 | |![tb-plot](./monitor.png)| 21 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/cfg_aux.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "CausalTransformer", 4 | "kwargs": { 5 | "attn_dropout": 0.1, 6 | "dim_hid": 320, 7 | "num_classes": 4000, 8 | "num_head": 8, 9 | "num_layers": 6 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "sumtargetlogit", 8 | "config_ebm_model": "exp/lm-ebm/cfg_aux.json", 9 | "config_noise_model": "exp/lm-ebm/cfg_aux.json", 10 | "tokenizer_path": "exp/lm-ebm/tokenizer.tknz", 11 | "bert_tokenizer": false 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "SchedulerNoam", 16 | "kwargs": { 17 | "dim_model": 768, 18 | "peak_factor": 0.01, 19 | "warmup_step": 100, 20 | "stop_step": 500 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "kwargs": { 25 | "lr": 0.0008 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/local-lm/libri-part.train", 4 | "dev": "data/local-lm/libri-part.dev", 5 | "test": [ 6 | "data/local-lm/libri-part.dev" 7 | ], 8 | "packing-text-lm": { 9 | "nj": 8, 10 | "truncate": 128, 11 | "prune_shorter": 4 12 | } 13 | }, 14 | "tokenizer": { 15 | "type": "SentencePieceTokenizer", 16 | "option-train": { 17 | "model_type": "bpe", 18 | "vocab_size": 4000, 19 | "model_prefix": "sentencepiece/temp-bpe4000/spm" 20 | } 21 | }, 22 | "env": { 23 | "CUDA_VISIBLE_DEVICES": "0" 24 | }, 25 | "train": { 26 | "bin": "cat.lm.trf.train", 27 | "option": { 28 | "amp": true, 29 | "batch_size": 16, 30 | "check-freq": 100, 31 | "grad-norm": 5.0 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/lm-ebm/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 20.58 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * An example of training energy-based language model. 
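Behind the `dnce` method is the standard noise-contrastive estimation objective — a binary discrimination between training sentences and sentences drawn from the noise model — with DNCE additionally updating the noise model during training. A purely numeric sketch of the plain NCE loss with `noise_rate` ν = 1 (toy log-probabilities, not CAT's exact implementation):

```python
# NCE loss for one data sentence and one noise sentence (nu = noise_rate).
import math

def nce_loss(logp_model_data, logp_noise_data, logp_model_noise, logp_noise_noise, nu=1.0):
    # P(data | x) = p_model(x) / (p_model(x) + nu * p_noise(x)), in log space
    def post_data(lm, ln):
        return 1.0 / (1.0 + nu * math.exp(ln - lm))
    loss = -math.log(post_data(logp_model_data, logp_noise_data))                 # data term
    loss += -nu * math.log(1.0 - post_data(logp_model_noise, logp_noise_noise))   # noise term
    return loss

print(nce_loss(-40.0, -45.0, -50.0, -44.0))  # small loss: model already separates the two
```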
12 | 13 | 14 | | training process | 15 | |:-----------------------:| 16 | |![tb-plot](./monitor.png)| 17 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ebm/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u 3 | 4 | dir=$(dirname $0) 5 | 6 | bash local/lm_data.sh 7 | 8 | python utils/pipeline/lm.py $dir --sto 3 || exit 1 9 | 10 | echo "$0 done" && exit 0 11 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ngram-word/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm-ngram-word/3gram.bin", 6 | "gram_order": 3, 7 | "num_classes": 45899 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ngram-word/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/local-lm/libri-part.train", 4 | "test": [ 5 | "data/local-lm/libri-part.dev" 6 | ] 7 | }, 8 | "tokenizer": { 9 | "type": "SentencePieceTokenizer", 10 | "option-train": { 11 | "model_type": "word", 12 | "vocab_size": 45899, 13 | "use_all_vocab": true, 14 | "model_prefix": "sentencepiece/lm-word/spm" 15 | } 16 | }, 17 | "inference": {} 18 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-ngram-word/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm-ngram-word -o 3 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 25MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | data: data/local-lm/libri-part.dev 18 | ppl: 436.06 | 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-nn/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "CausalTransformer", 4 | "kwargs": { 5 | "attn_dropout": 0.2, 6 | "dim_hid": 256, 7 | "num_classes": 45899, 8 | "num_head": 2, 9 | "num_layers": 4 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "SchedulerCosineAnnealing", 14 | "kwargs": { 15 | "min_lr": 1e-05, 16 | "stop_step": 2000 17 | }, 18 | "optimizer": { 19 | "type": "Adam", 20 | "zeroredundancy": true, 21 | "kwargs": { 22 | "lr": 0.001 23 | } 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-nn/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/TEMPLATE/exp/lm-nn/monitor.png -------------------------------------------------------------------------------- /egs/TEMPLATE/exp/lm-nn/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 26.97 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * template experiment training a Transformer LM. 
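The ppl figures reported below (and in the n-gram readmes above) follow the usual definition: perplexity is the exponentiated average negative log-likelihood per token. A toy computation with made-up numbers:

```python
# ppl = exp(-(1/N) * sum_i log p(w_i | history)); toy per-token log-probs only.
import math

token_logprobs = [-6.2, -5.1, -7.4, -5.9]           # natural-log probs of 4 tokens
ppl = math.exp(-sum(token_logprobs) / len(token_logprobs))
print(round(ppl, 2))                                 # ~469 for these values
```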
12 | 13 | ### Result 14 | ``` 15 | data: data/local-lm/libri-part.dev 16 | ppl: 437.87 | 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/TEMPLATE/local/data.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | set -u 3 | 4 | mkdir -p data/local 5 | cwd=$(pwd) 6 | 7 | cd data/local 8 | if [ ! -f .completed ]; then 9 | # download data 10 | [ ! -f yesno.tar.gz ] && 11 | wget https://www.openslr.org/resources/1/waves_yesno.tar.gz -O yesno.tar.gz 12 | 13 | # check downloaded file 14 | [ "$(md5sum yesno.tar.gz | awk '{print $1}')" != "962ff6e904d2df1126132ecec6978786" ] && ( 15 | echo "MD5SUM check failed for yesno.tar.gz, please rm it then re-run the script." 16 | exit 1 17 | ) 18 | 19 | # untar 20 | tar -zxf yesno.tar.gz 21 | touch .completed 22 | else 23 | echo "Found previous processed data. Skip download" 24 | fi 25 | cd $cwd 26 | 27 | [ ! $(command -v python) ] && ( 28 | echo "No python executable found in PATH" 29 | exit 1 30 | ) 31 | 32 | python local/extract_feat.py data/local/waves_yesno/ 33 | echo "FBank spectrum generate done." 34 | 35 | python utils/data/resolvedata.py 36 | 37 | echo "$0 done" 38 | exit 0 39 | -------------------------------------------------------------------------------- /egs/TEMPLATE/local/lm_data.sh: -------------------------------------------------------------------------------- 1 | set -e -u 2 | 3 | dir="data/local-lm" 4 | n_utts=50000 5 | url="https://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz" 6 | 7 | [ $n_utts -le 500 ] && { 8 | echo "#utterances must > 500 for spliting train & dev" >&2 9 | exit 1 10 | } 11 | 12 | mkdir -p $dir 13 | cd $dir 14 | if [ ! -f .completed ]; then 15 | # download and process data 16 | echo "Start downloading corpus, please wait..." 17 | wget $url -q -O - | gunzip -c | head -n $n_utts | 18 | tr '[:upper:]' '[:lower:]' >libri-part.txt 19 | echo "Corpus downloaded. ($n_utts utterances from librispeech corpus)" 20 | 21 | # take the last 500 utterances as dev 22 | head -n $(($n_utts - 500)) libri-part.txt >libri-part.train 23 | tail -n 500 libri-part.txt >libri-part.dev 24 | touch .completed 25 | else 26 | echo "Found previous processed data." 
27 | fi 28 | cd - >/dev/null 29 | 30 | echo "$0 done" 31 | exit 0 32 | -------------------------------------------------------------------------------- /egs/TEMPLATE/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/aishell/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/aishell/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "tyep": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/ctc-crf-cuside/decode_lm/3gram.arpa", 6 | "gram_order": 3, 7 | "num_classes": 137076 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train" 4 | }, 5 | "tokenizer": { 6 | "type": "JiebaTokenizer", 7 | "option-init": { 8 | "userdict": "exp/ctc-crf-cuside/prepare_lexicon/dict.txt" 9 | } 10 | }, 11 | "inference": {} 12 | } 13 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ctc-crf-cuside/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 117.07 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * ctc-crf-cuside training 12 | * use torchaudio for feature extraction w/o CMVN 13 | * This experiment is conducted on the `v3` released code base, and it may slightly differ from the results in [CUSIDE paper](https://arxiv.org/abs/2203.16758). 14 | 15 | ### Result 16 | ``` 17 | test %SER 41.60 | %CER 5.57 [ 5840 / 104765, 137 ins, 105 del, 5598 sub ]/streaming 18 | test %SER 37.56 | %CER 4.99 [ 5228 / 104765, 142 ins, 115 del, 4971 sub ]/non-streaming 19 | ``` 20 | 21 | | training process | 22 | |:-----------------------:| 23 | |![monitor](./monitor.png)| 24 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-crf-cuside/run_lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dir=$(dirname $0) 4 | mkdir -p $dir/prepare_lexicon 5 | 6 | cd $dir/prepare_lexicon 7 | [[ ! -f lexicon.txt || ! -f dict.txt ]] && { 8 | [ ! -f resource_aishell.tgz ] && 9 | wget https://www.openslr.org/resources/33/resource_aishell.tgz 10 | 11 | [ ! -f lexicon.txt ] && { 12 | tar -zxf resource_aishell.tgz 13 | mv resource_aishell/lexicon.txt ./ 14 | } 15 | 16 | [ ! 
-f dict.txt ] && ( 17 | # prepare word segmentation dictionary for jieba token 18 | cut dict.txt 20 | ) 21 | } 22 | echo "finished: lexicon and dict" 23 | -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ctc-v1/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ctc-v1/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 79.50 6 | * GPU info \[6\] 7 | * \[6\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * data prepare 12 | 13 | ```bash 14 | bash local/data_kaldi.sh /path/to/data -use-3way-sp 15 | ``` 16 | 17 | * same encoder as `rnnt-v1` 18 | * batch size of 128 with 500k iters. 19 | 20 | ### Result 21 | ``` 22 | dev %SER 36.97 | %CER 4.65 [ 9544 / 205341, 167 ins, 144 del, 9233 sub ] 23 | test %SER 39.62 | %CER 5.21 [ 5462 / 104765, 88 ins, 125 del, 5249 sub ] 24 | 25 | +lm-v1-char-5gram 5g char 0.25 26 | dev %SER 35.08 | %CER 4.49 [ 9211 / 205341, 137 ins, 165 del, 8909 sub ] 27 | test %SER 37.25 | %CER 4.95 [ 5184 / 104765, 73 ins, 142 del, 4969 sub ] 28 | 29 | +lm-v2-word-3gram 3g word 0.3 30 | dev %SER 33.05 | %CER 4.25 [ 8732 / 205341, 136 ins, 168 del, 8428 sub ] 31 | test %SER 35.37 | %CER 4.72 [ 4948 / 104765, 71 ins, 143 del, 4734 sub ] 32 | ``` 33 | 34 | | training process | 35 | |:-----------------------:| 36 | |![monitor](./monitor.png)| 37 | -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "hidden2scalar-sum", 8 | "config_ebm_model": "exp/ebm-lm/GN-ELM-DNCE/config_ebm.json", 9 | "config_noise_model": "exp/ebm-lm/GN-ELM-DNCE/config_noise.json", 10 | "tokenizer_path": "exp/ebm-lm/GN-ELM-DNCE/tokenizer.tknz", 11 | "bert_tokenizer": true 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "SchedulerNoam", 16 | "kwargs": { 17 | "dim_model": 768, 18 | "peak_factor": 0.3, 19 | "warmup_step": 5000, 20 | "stop_step": 2000 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "kwargs": { 25 | "lr": 0.001 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } 
-------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/GN-ELM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-ML/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM_IS", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "IS", 7 | "energy_func": "sumtokenlogit", 8 | "config_ebm_model": "exp/ebm-lm/GN-ELM-ML/config_ebm.json", 9 | "config_noise_model": "exp/ebm-lm/GN-ELM-DNCE/config_noise.json", 10 | "tokenizer_path": "exp/ebm-lm/GN-ELM-DNCE/tokenizer.tknz", 11 | "bert_tokenizer": true, 12 | "freeze_noise": false 13 | } 14 | }, 15 | "scheduler": { 16 | "type": "SchedulerNoam", 17 | "kwargs": { 18 | "dim_model": 768, 19 | "peak_factor": 0.15, 20 | "warmup_step": 3000, 21 | "stop_step": 5000 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "kwargs": { 26 | "lr": 0.001 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-ML/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertLMHeadModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-ML/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/GN-ELM-ML/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "nce", 7 | "energy_func": "hidden2scalar-sum", 8 | "config_ebm_model": "exp/lm/GN-ELM-NCE/config_ebm.json", 9 | "config_noise_model": "exp/lm/GN-ELM-NCE/config_noise.json", 10 | "check_noise_model": "exp/lm/lm-gpt2/check/best-2.pt", 11 | "tokenizer_path": "exp/lm/GN-ELM-NCE/tokenizer.tknz", 12 | "bert_tokenizer": true 13 | } 14 | }, 15 | "scheduler": { 16 | "type": "SchedulerNoam", 17 | "kwargs": { 18 | "dim_model": 768, 19 | "peak_factor": 0.45, 20 | "warmup_step": 5000, 21 | "stop_step": 2000 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "kwargs": { 26 | "lr": 0.001 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/config_noise.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/GN-ELM-NCE/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/GN-ELM-NCE/readme.md: -------------------------------------------------------------------------------- 1 | # Train GN-ELM with NCE 2 | 3 | The training and testing process is basically consistent with [Train GN-ELM with DNCE](../GN-ELM-DNCE/). We only explain the differences here. 4 | 5 | ## Notes 6 | 7 | * In NCE training (set `config['decoder']['kwargs']['method']=nce`), the noise model is fixed, so we need a trained language model to initialize the noise model. We use a [finetuned GPT-2](../lm-gpt2/) for initialization, which is specified in `config['decoder']['kwargs']['check_noise_model']`. 8 | 9 | ## Result 10 | 11 | We also try 3 different energy functions, whose results are as follows: 12 | 13 | |CER type | SumTargetLogit | Hidden2Scalar | SumTokenLogit | 14 | | ------- | -------- | ----------- | ----------- | 15 | | in-domain | 3.32 | 3.20 | 3.27 | 16 | | cross-domain| 3.39 | 3.36 | 3.43 | 17 | 18 | The training curve of the best model is shown below. 19 | 20 | | training curve | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/config_trf.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/ebm-lm/TRF-LM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/ebm-lm/TRF-LM-DNCE/readme.md: -------------------------------------------------------------------------------- 1 | # Train TRF-LM with DNCE 2 | The training and testing process is basically consistent with [Train GN-ELM with DNCE](../GN-ELM-DNCE/). We only explain the differences here. 
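Both the NCE and DNCE configs above use the same pretrained Chinese GPT-2 (`uer/gpt2-chinese-cluecorpussmall`) as the noise model, loaded with a BERT-style tokenizer (`bert_tokenizer: true`). A sketch of scoring one sentence with that noise LM through HuggingFace Transformers, independent of CAT's own wrappers (downloading the checkpoint is assumed):

```python
# Score a sentence with the noise LM used by these EBM recipes.
import torch
from transformers import BertTokenizer, GPT2LMHeadModel

tok = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
lm = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall").eval()

ids = tok("今天天气很好", return_tensors="pt")["input_ids"]
with torch.no_grad():
    out = lm(ids, labels=ids)                # .loss is mean cross-entropy per predicted token
print(float(-out.loss) * (ids.shape[1] - 1))  # approximate total log-likelihood in nats
```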
3 | ## Notes 4 | * **In stage 2 (data packing)**, for training TRF, we need to calculate the length distribution after packing data and before training. 5 | ``` 6 | python -m cat.lm.trf.prep_feats exp/TRF-LM-DNCE/pkl/train.pkl exp/TRF-LM-DNCE/linfo.pkl 7 | ``` 8 | 9 | ## Result 10 | We also try 3 different energy functions, whose results are as follows: 11 | |CER type | SumTargetLogit | Hidden2Scalar | SumTokenLogit | 12 | | ------- | -------- | ----------- | ----------- | 13 | | in-domain | 3.11 | 3.13 | 3.21 | 14 | | cross-domain| 3.44 | 3.39 | 3.47 | 15 | 16 | The training curve of the best model is shown below. 17 | | training curve | 18 | |:-----------------------:| 19 | |![monitor](./monitor.png)| -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v1-char-5gram/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-v1-char-5gram/5gram.bin", 6 | "gram_order": 5, 7 | "num_classes": 4232 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v1-char-5gram/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train", 4 | "dev": "dev", 5 | "test": [ 6 | "dev", 7 | "test" 8 | ] 9 | }, 10 | "tokenizer": { 11 | "type": "SentencePieceTokenizer", 12 | "option-train": { 13 | "model_type": "char", 14 | "vocab_size": 4232, 15 | "add_dummy_prefix": false, 16 | "use_all_vocab": true, 17 | "model_prefix": "sentencepiece/aishell_char/spm" 18 | } 19 | }, 20 | "inference": {} 21 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v1-char-5gram/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-v1-char-5gram 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 73MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | data: dev test 18 | ppl: 59.06 | 58.44 | 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v2-word-3gram/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-v2-word-3gram/3gram.klm", 6 | "gram_order": 3, 7 | "num_classes": 498115 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v2-word-3gram/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train", 4 | "test": [ 5 | "dev", 6 | "test" 7 | ], 8 | "lang": "zh-cn" 9 | }, 10 | "tokenizer": { 11 | "type": "JiebaTokenizer", 12 | "option-init": {} 13 | }, 14 | "inference": {} 15 | } -------------------------------------------------------------------------------- /egs/aishell/exp/lm/lm-v2-word-3gram/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-v2-word-3gram -o 3 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 26MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | using jieba default dict produces better results: 18 | Test file: dev.tmp -> ppl: 788.34 19 | Test file: 
test.tmp -> ppl: 840.97 20 | 21 | with bigcidian dict: 22 | ppl ~1000 23 | ``` 24 | -------------------------------------------------------------------------------- /egs/aishell/exp/rnnt-cuside/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/rnnt-cuside/monitor.png -------------------------------------------------------------------------------- /egs/aishell/exp/rnnt-cuside/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 86.00 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | 13 | ### Result 14 | ``` 15 | test %SER 41.76 | %CER 6.02 [ 6302 / 104765, 225 ins, 255 del, 5822 sub ]/streaming 16 | test %SER 36.97 | %CER 5.12 [ 5369 / 104765, 102 ins, 180 del, 5087 sub ]/non-streaming 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![monitor](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/aishell/exp/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell/exp/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/aishell/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/aishell4/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp1-SingalChannel_E2E/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp1-SingalChannel_E2E/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp1-SingalChannel_E2E/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 20.70 6 | * GPU info \[2\] 7 | * \[2\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * Choose channel 0 as input_channel 12 | 13 | ### Result 14 | ``` 15 | Streaming: 16 | test_raw_ori %SER 98.02 | %CER 55.07 [ 72303 / 131298, 2006 ins, 25601 del, 44696 sub ] 17 | ------------------------- 18 | Non-streaming 19 | test_raw_ori %SER 91.26 | %CER 38.76 [ 50886 / 131298, 4611 ins, 6505 del, 39770 sub ] 20 | 21 | ``` 22 | 23 | | training process | 24 | |:-----------------------:| 25 | |![tb-plot](./monitor.png)| 26 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp10~12-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID+simu_data)/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp10~12-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID+simu_data)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp2-SingalChannel_E2E+JT(CUSIDE)/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp2-SingalChannel_E2E+JT(CUSIDE)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp3-MultiChannel_E2E/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp3-MultiChannel_E2E/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp3-MultiChannel_E2E/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 25.77 6 | * GPU info \[2\] 7 | * \[2\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_raw_ori %SER 98.55 | %CER 56.84 [ 74626 / 131298, 2181 ins, 30414 del, 42031 sub ] 17 | ----------------------- 18 | Non-streaming 19 | test_raw_ori %SER 88.28 | %CER 27.93 [ 36673 / 131298, 3925 ins, 4613 del, 28135 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp4-MultiChannel_E2E+JT(CUSIDE-Array)/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp4-MultiChannel_E2E+JT(CUSIDE-Array)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp5-CUSIDE-Array+real_right_ctx/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp5-CUSIDE-Array+real_right_ctx/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp5-CUSIDE-Array+real_right_ctx/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 25.77 6 | * GPU info \[4\] 7 | * \[4\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_raw_ori %SER 89.57 | %CER 32.51 [ 42688 / 131298, 4416 ins, 5202 del, 33070 sub ] 17 | ----------------------- 18 | Non-streaming 19 | test_raw_ori %SER 89.07 | %CER 31.21 [ 40975 / 131298, 4239 ins, 4902 del, 31834 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp6-CUSIDE-Array+simu_right_ctx/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp6-CUSIDE-Array+simu_right_ctx/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp6-CUSIDE-Array+simu_right_ctx/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 27.64 6 | * GPU info \[4\] 7 | * \[4\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_raw_ori %SER 90.56 | %CER 35.96 [ 47215 / 131298, 4954 ins, 5610 del, 36651 sub ] 17 | ----------------------- 18 | Non-streaming 19 | test_raw_ori %SER 89.39 | %CER 31.70 [ 41623 / 131298, 4432 ins, 4906 del, 32285 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp7-CUSIDE+Pre-trained_BE/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 80.72 6 | * GPU info \[1\] 7 | * \[1\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | Streaming 16 | test_alimeeting_raw_ori %SER 79.73 | %CER 28.83 [ 17637 / 61184, 546 ins, 8243 del, 8848 sub ] 17 | dev_alimeeting_raw_ori %SER 77.96 | %CER 29.07 [ 5597 / 19256, 189 ins, 2842 del, 2566 sub ] 18 | test_raw_ori %SER 91.67 | %CER 35.72 [ 46899 / 131298, 1445 ins, 30181 del, 15273 sub ] 19 | test_706_array_raw_ori %SER 100.00 | %CER 41.09 [ 415 / 1010, 3 ins, 159 del, 253 sub ] 20 | Non-streaming 21 | test_alimeeting_raw_ori %SER 65.44 | %CER 20.29 [ 12415 / 61184, 467 ins, 4150 del, 7798 sub ] 22 | dev_alimeeting_raw_ori %SER 64.97 | %CER 20.55 [ 3957 / 19256, 175 ins, 1634 del, 2148 sub ] 23 | test_raw_ori %SER 83.03 | %CER 26.42 [ 34689 / 131298, 1734 ins, 18568 del, 14387 sub ] 24 | test_706_array_raw_ori %SER 95.00 | %CER 29.80 [ 301 / 1010, 2 ins, 93 del, 206 sub ] 25 | 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /egs/aishell4/exp/Exp9-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID)/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/Exp9-CUSIDE-Array+Pre-trained_BE+E2E-FT(ID)/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk+simu/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk+simu/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk+simu/right_context.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk+simu/right_context.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk+simu/simu_right_context.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk+simu/simu_right_context.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk-kaldi/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 25.77 6 | * GPU info \[4\] 7 | * \[4\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | - "bf": null, used to load the front-end model 13 | - "am": null, used to load the back-end model 14 | - "unfreeze": null, in partial training, selects the modules that require gradients; not used when null 15 | 16 | ### Result 17 | ``` 18 | - Streaming: 36.68 19 | - Non-streaming: 31.21 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk/monitor.png -------------------------------------------------------------------------------- /egs/aishell4/exp/ctc-e2e-chunk/tokenizer.tknz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/aishell4/exp/ctc-e2e-chunk/tokenizer.tknz -------------------------------------------------------------------------------- /egs/aishell4/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/commonvoice/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/commonvoice/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | Source: [Common Voice Corpus](https://commonvoice.mozilla.org) 4 | 5 | ### Data preparation 6 | 7 | You should first follow the **Common Voice** official guide to download the data. 8 | 9 | Then prepare data with: 10 | 11 | ``` 12 | # Any version of Common Voice data is OK. Here CV-11.0 is used by default 13 | bash local/data.sh /path/to/data -lang xx 14 | ``` 15 | 16 | ### Result 17 | 18 | Performance is evaluated on CER (%). 19 | 20 | 130 hours **Chinese (China)** speech data 21 | 22 | | model | Unit | dev | test | 23 | | ----------------------------- | ----- | ----- | ---- | 24 | | [rnnt](exp/asr-rnnt-chinese/) | char | 18.14 | 17.14 | 25 | 26 | 27 | Performance is evaluated on WER (%).
28 | 29 | 180 hours **Russian** speech data 30 | 31 | | model | Unit | dev | test | 32 | | ----------------------------- | ----- | ----- | ---- | 33 | | [rnnt](exp/asr-rnnt-russian/) | bpe-2k | 6.44 | 8.55 | 34 | | [ctc](exp/asr-ctc-russian/) | bpe-2K | 16.22 | 19.50 | 35 | -------------------------------------------------------------------------------- /egs/commonvoice/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-ctc-russian/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/asr-ctc-russian/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-ctc-russian/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 46.51 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | * please note that `add_dummy_prefix = false` in tokenizer training setting of SentencePiece tokenizer is erroneous, but would probably only introduce minor differences to results. 13 | 14 | ### Result 15 | ``` 16 | dev %SER 59.08 | %WER 16.22 [ 13632 / 84022, 1105 ins, 2330 del, 10197 sub ] 17 | test %SER 63.25 | %WER 19.50 [ 15970 / 81896, 1233 ins, 2868 del, 11869 sub ] 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-chinese/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/asr-rnnt-chinese/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-chinese/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 53.01 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | 13 | ### Result 14 | ``` 15 | dev %SER 66.03 | %CER 18.14 [ 31626 / 174359, 860 ins, 11275 del, 19491 sub ] 16 | test %SER 73.29 | %CER 17.14 [ 29549 / 172400, 975 ins, 4791 del, 23783 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![monitor](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-russian/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/asr-rnnt-russian/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/asr-rnnt-russian/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add 
your details in Appendix** 4 | 5 | * \# of parameters (million): 50.43 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * use torchaudio for feature extraction w/o CMVN 12 | * please note that `add_dummy_prefix = false` in tokenizer training setting of SentencePiece tokenizer is erroneous, but would probably only introduce minor differences to results. 13 | 14 | ### Result 15 | ``` 16 | dev %SER 29.11 | %WER 6.44 [ 5412 / 84022, 437 ins, 1061 del, 3914 sub ] 17 | test %SER 33.46 | %WER 8.55 [ 7001 / 81896, 553 ins, 1566 del, 4882 sub ] 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-indonesia/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "num_classes": 12673, 6 | "f_binlm": "exp/joinap/decode-lm-indonesia/3gram.arpa", 7 | "gram_order": 3 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-indonesia/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "id-excluded_train", 4 | "dev": [ 5 | "id-dev" 6 | ], 7 | "test": [ 8 | "id-dev", 9 | "id-test" 10 | ], 11 | "packing-text-lm": { 12 | "nj": 4, 13 | "prune_shorter": 5 14 | } 15 | }, 16 | "tokenizer": { 17 | "type": "SimpleTokenizer", 18 | "option-init": { 19 | "dmap": "data/lang-id/lexicon" 20 | } 21 | }, 22 | "inference": {} 23 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-indonesia/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/debug-decode-lm-id/ -o 3 --arpa 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 3.8MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-russian/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "num_classes": 50171, 6 | "f_binlm": "exp/joinap/decode-lm-russian/3gram.arpa", 7 | "gram_order": 3 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-russian/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "ru-excluded_train", 4 | "dev": [ 5 | "ru-dev" 6 | ], 7 | "test": [ 8 | "ru-dev", 9 | "ru-test" 10 | ], 11 | "packing-text-lm": { 12 | "nj": 4, 13 | "prune_shorter": 5 14 | } 15 | }, 16 | "tokenizer": { 17 | "type": "SimpleTokenizer", 18 | "option-init": { 19 | "dmap": "data/lang-ru/lexicon" 20 | } 21 | }, 22 | "inference": {} 23 | } -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/decode-lm-russian/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/decode-lm-russian -o 3 --arpa 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 17MB 
13 | 14 | perplexity: 15 | 16 | ``` 17 | 18 | ``` 19 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/finetune-id/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/finetune-id/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/finetune-id/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.47 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * Use `russian+indonesia` data pretrained model 12 | 13 | ### Result 14 | ``` 15 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 33.24 | %WER 14.93 [ 3278 / 21951, 409 ins, 454 del, 2415 sub ] 16 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 21.28 | %WER 7.63 [ 1654 / 21664, 194 ins, 256 del, 1204 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-L/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mono-indonesia-L/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-L/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.47 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | 16 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 30.32 | %WER 14.35 [ 3149 / 21951, 430 ins, 342 del, 2377 sub ] 17 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 18.62 | %WER 6.92 [ 1500 / 21664, 174 ins, 217 del, 1109 sub ] 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![tb-plot](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-NL/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mono-indonesia-NL/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-NL/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 45.77 6 | * GPU info \[8\] 7 | * \[8\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 31.93 | %WER 15.10 [ 3315 / 21951, 437 ins, 345 del, 2533 sub ] 16 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 19.41 | %WER 7.11 [ 1540 / 21664, 164 ins, 206 del, 1170 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | 
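The `decode-lm-*` readmes above record the n-gram training command (`utils/pipeline/ngram.sh ... -o 3 --arpa`) but leave their perplexity sections empty. The sketch below shows how such a 3-gram ARPA model could be evaluated on held-out text; it assumes the `kenlm` Python bindings are installed and that the text file holds one normalized, space-separated sentence per line. It is only an illustration, not part of CAT's own pipeline.

```python
import kenlm  # assumption: the kenlm Python bindings are installed

# Hypothetical paths -- substitute the experiment and data at hand.
lm_path = "exp/joinap/decode-lm-russian/3gram.arpa"
text_path = "ru-dev.txt"  # one normalized sentence per line

model = kenlm.Model(lm_path)
total_log10, n_tokens = 0.0, 0
with open(text_path, encoding="utf-8") as f:
    for line in f:
        sent = line.strip()
        if not sent:
            continue
        # score() returns log10 P(sentence), including the </s> transition
        total_log10 += model.score(sent, bos=True, eos=True)
        n_tokens += len(sent.split()) + 1  # +1 for </s>

print(f"ppl: {10 ** (-total_log10 / n_tokens):.2f}")
```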
-------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-flat/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mono-indonesia-flat/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mono-indonesia-flat/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.46 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 30.20 | %WER 13.64 [ 2994 / 21951, 385 ins, 374 del, 2235 sub ] 16 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 16.89 | %WER 6.25 [ 1353 / 21664, 144 ins, 190 del, 1019 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![tb-plot](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mul-ru+id-L/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/commonvoice/exp/joinap/mul-ru+id-L/monitor.png -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mul-ru+id-L/prep_mul_pv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | lang="ru id" 4 | 5 | dlang="data/lang-mul" 6 | mkdir -p $dlang 7 | echo $lang >>$dlang/lang.txt 8 | export LC_ALL=C.UTF-8 9 | 10 | for l in $lang; do 11 | cat data/lang-$l/lexicon 12 | done | sort -k 1,1 -u -s \ 13 | >$dlang/lexicon 14 | 15 | cut <$dlang/lexicon -f 2- | tr ' ' '\n' | sort -u -s >$dlang/phonemes.txt 16 | 17 | [ !
-f local/data/ipa_all.csv ] && { 18 | wget https://raw.githubusercontent.com/dmort27/panphon/master/panphon/data/ipa_all.csv \ 19 | -O local/data/ipa_all.csv 20 | } 21 | python local/get_ipa_mapping.py \ 22 | $dlang/phonemes.txt \ 23 | local/data/ipa_all.csv \ 24 | $dlang/mul-pv.npy || exit 1 25 | 26 | echo "$0 done" && exit 0 27 | -------------------------------------------------------------------------------- /egs/commonvoice/exp/joinap/mul-ru+id-L/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 39.47 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * 12 | 13 | ### Result 14 | ``` 15 | id 16 | id-dev_ac1.0_lm1.0_wip0.0.hyp %SER 43.52 | %WER 21.99 [ 4827 / 21951, 267 ins, 1249 del, 3311 sub ] 17 | id-test_ac1.0_lm1.0_wip0.0.hyp %SER 32.90 | %WER 12.89 [ 2792 / 21664, 147 ins, 790 del, 1855 sub ] 18 | 19 | ru 20 | ru-dev_ac1.0_lm1.0_wip0.0.hyp %SER 31.61 | %WER 7.63 [ 6413 / 84022, 460 ins, 2108 del, 3845 sub ] 21 | ru-test_ac1.0_lm1.0_wip0.0.hyp %SER 36.68 | %WER 9.76 [ 7989 / 81896, 508 ins, 2918 del, 4563 sub ] 22 | ``` 23 | 24 | | training process | 25 | |:-----------------------:| 26 | |![tb-plot](./monitor.png)| 27 | -------------------------------------------------------------------------------- /egs/commonvoice/local/data/ipa_extend.txt: -------------------------------------------------------------------------------- 1 | # russian 2 | oʲ o 3 | iʲ i 4 | ɨʲ ɨ 5 | æʲ æ 6 | yʲ y 7 | aʲ a 8 | eʲ e 9 | ʉʲ ʉ 10 | jʲ j 11 | ɵʲ ɵ 12 | # indonesia 13 | au a u 14 | ai a i 15 | ʊi ʊ i 16 | oi̯ o i̯ -------------------------------------------------------------------------------- /egs/commonvoice/local/text_normalize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Huahuan Zheng (maxwellzh@outlook.com) 3 | # Text normalize 4 | set -e -u 5 | 6 | for file in $(python -c "import json;\ 7 | print(' '.join(x['trans'] for x in json.load(open('data/metainfo.json', 'r')).values()))"); do 8 | [ ! 
-f $file.bak ] && mv $file $file.bak 9 | cut <$file.bak -f 2- | sed -e 's/[.]//g; s/!//g; s/?//g' \ 10 | -e 's/“//g; s/"//g; s/,//g; s/”//g' \ 11 | -e "s/'//g; s/’//g; s/‘//g" \ 12 | -e 's/://g; s/[;]//g; s/[(]//g; s/[)]//g;' \ 13 | -e 's/[\]//g' | 14 | tr '[:upper:]' '[:lower:]' >$file.trans.tmp 15 | 16 | cut <$file.bak -f 1 >$file.id.tmp 17 | paste $file.{id,trans}.tmp >$file 18 | rm -rf $file.{id,trans}.tmp 19 | done 20 | -------------------------------------------------------------------------------- /egs/commonvoice/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/cv-lang10/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 30, 24 | "n_tol": 5, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10h/monitor.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._phoneme_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Multi._subword_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- 
/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-En_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_10h/monitor.png 
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/id/Wav2vec-lang10_ft_subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 140, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_130h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 1670, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 0.0003, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- 
/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 140, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 640, 24 | "n_tol": 10, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 
17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 100, 24 | "n_tol": 5, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_130h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 3340, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 0.0003, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 140, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._phoneme_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- 
/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 320, 24 | "n_tol": 10, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 6e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 100, 24 | "n_tol": 5, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10m/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_10m/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_130h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 3340, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 0.0003, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } 
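The cv-lang10 `config.json` files above all share the same layout: `specaug` settings plus `encoder`, `scheduler`, and nested `optimizer` blocks, where the latter three each name a `type` and pass a `kwargs` dict. The sketch below shows how such a `type`/`kwargs` convention is typically consumed; the registry here is hypothetical (only `Adam` is wired up, to PyTorch's optimizer) and is not CAT's actual builder code.

```python
import json
import torch

# Hypothetical registry mapping config "type" names to classes. In CAT the
# encoder types (e.g. "ConformerNet") would be resolved from its own modules;
# here only the optimizer is wired up, as an illustration.
REGISTRY = {"Adam": torch.optim.Adam}

def build(section: dict, **extra):
    """Instantiate an object from a {"type": ..., "kwargs": ...} block."""
    cls = REGISTRY[section["type"]]
    return cls(**extra, **section.get("kwargs", {}))

# Path as listed above, relative to egs/cv-lang10.
with open("exp/Crosslingual/pl/Multi._subword_ft_subword_130h/config.json") as f:
    cfg = json.load(f)

# The optimizer block is nested inside "scheduler" in these configs.
dummy_params = [torch.nn.Parameter(torch.zeros(1))]  # placeholder parameters
optimizer = build(cfg["scheduler"]["optimizer"], params=dummy_params)
print(optimizer)  # prints the Adam settings: lr=0.0003, betas=(0.9, 0.98), weight_decay=1e-06
```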
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 500 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 210, 24 | "n_tol": 20, 25 | "gamma": 0.5 26 | }, 27 | "optimizer": { 28 | "type": "Adam", 29 | "kwargs": { 30 | "lr": 3e-05, 31 | "betas": [ 32 | 0.9, 33 | 0.98 34 | ], 35 | "weight_decay": 1e-06 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Multi._subword_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-En_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Crosslingual/pl/Wav2vec-lang10_ft_subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/en/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/es/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 87908 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/en/lm/hyper-p.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_en" 5 | ], 6 | "dev": [ 7 | "dev_en" 8 | ], 9 | "test": [ 10 | "test_en" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/en/word_list" 21 | }, 22 | "|V|": 246234, 23 | "file": "dict/en/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/en/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/en/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/es/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/en/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 246234 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/es/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_es" 5 | ], 6 | "dev": [ 7 | "dev_es" 8 | ], 9 | "test": [ 10 | "test_es" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/es/word_list" 21 | }, 22 | "|V|": 87908, 23 | "file": "dict/es/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/es/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/es/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/fr/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/fr/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 217706 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/fr/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_fr" 5 | ], 6 | "dev": [ 7 | "dev_fr" 8 | ], 9 | "test": [ 10 | "test_fr" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/fr/word_list" 21 | }, 22 | "|V|": 217706, 23 | "file": "dict/fr/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/fr/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/fr/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_10h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 1000, 24 | "n_tol": 10, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 3e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_1h/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "specaug": { 3 | "freq_mask_width_range": 0.35, 4 | "num_freq_mask": 2, 5 | "num_time_mask": 10, 6 | "time_mask_width_range": 0.05 7 | }, 8 | "encoder": { 9 | "type": "ConformerNet", 10 | "kwargs": { 11 | "num_cells": 14, 12 | "idim": 80, 13 | "hdim": 512, 14 | "conv": "vgg2l", 15 | "num_heads": 4, 16 | "kernel_size": 15, 17 | "num_classes": 37 18 | } 19 | }, 20 | "scheduler": { 21 | "type": "SchedulerEarlyStop", 22 | "kwargs": { 23 | "min_step": 100, 24 | "n_tol": 10, 25 | "gamma": 0.5, 26 | "stop_lr": 1e-06 27 | }, 28 | "optimizer": { 29 | "type": "Adam", 30 | "kwargs": { 31 | "lr": 1e-05, 32 | "betas": [ 33 | 0.9, 34 | 0.98 35 | ], 36 | "weight_decay": 1e-06 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._phoneme_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._subword_10h/monitor.png 
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/Mono._subword_20h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/id/Mono._subword_20h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/id/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 13660 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/id/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_id" 5 | ], 6 | "dev": [ 7 | "dev_id" 8 | ], 9 | "test": [ 10 | "test_id" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/id/word_list" 21 | }, 22 | "|V|": 13660, 23 | "file": "dict/id/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/it/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/it/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 85831 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/it/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_it" 5 | ], 6 | "dev": [ 7 | "dev_it" 8 | ], 9 | "test": [ 10 | "test_it" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/it/word_list" 21 | }, 22 | "|V|": 85831, 23 | "file": "dict/it/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/it/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/it/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ky/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/ky/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 10608 8 | } 
9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ky/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_ky" 5 | ], 6 | "dev": [ 7 | "dev_ky" 8 | ], 9 | "test": [ 10 | "test_ky" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/ky/word_list" 21 | }, 22 | "|V|": 10608, 23 | "file": "dict/ky/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ky/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/ky/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/nl/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/nl/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 24518 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/nl/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_nl" 5 | ], 6 | "dev": [ 7 | "dev_nl" 8 | ], 9 | "test": [ 10 | "test_nl" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/nl/word_list" 21 | }, 22 | "|V|": 24518, 23 | "file": "dict/nl/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/nl/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/nl/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._phoneme_1h/monitor.png 
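The per-language n-gram LM entries above all follow one pattern: `num_classes` in `lm/config.json` mirrors the `|V|` declared by the `SimpleTokenizer` in the matching `lm/hyper-p.json`, and both point at the same `dict/<lang>/` directory. Below is a minimal sketch of a cross-check for that invariant; it assumes the `egs/cv-lang10` layout shown in these files and that `dict/<lang>/word_list` holds one entry per line, and the helper itself is illustrative rather than part of CAT.

```python
# Sketch: verify that each monolingual LM config is self-consistent.
# Assumed layout (run from egs/cv-lang10): exp/Monolingual/<lang>/lm/{config.json,hyper-p.json}
# and dict/<lang>/word_list with one entry per line.
import json
import os

def check_lm_vocab(lang: str, exp_root: str = "exp/Monolingual", dict_root: str = "dict") -> None:
    lm_dir = os.path.join(exp_root, lang, "lm")
    with open(os.path.join(lm_dir, "config.json"), encoding="utf-8") as f:
        num_classes = json.load(f)["decoder"]["kwargs"]["num_classes"]
    with open(os.path.join(lm_dir, "hyper-p.json"), encoding="utf-8") as f:
        declared_v = json.load(f)["tokenizer"]["|V|"]
    with open(os.path.join(dict_root, lang, "word_list"), encoding="utf-8") as f:
        n_words = sum(1 for line in f if line.strip())
    # num_classes and |V| should agree exactly; word_list size may differ by special tokens.
    assert num_classes == declared_v, f"{lang}: num_classes={num_classes} != |V|={declared_v}"
    print(f"{lang}: num_classes={num_classes}, word_list entries={n_words}")

if __name__ == "__main__":
    for lang in ("en", "es", "fr", "id", "it", "ky", "nl", "pl", "ru", "sv-SE", "tr", "tt"):
        check_lm_vocab(lang)
```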
-------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._subword_10h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._subword_10h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._subword_130h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._subword_130h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/Mono._subword_1h/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/pl/Mono._subword_1h/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/pl/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 43748 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/pl/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_pl" 5 | ], 6 | "dev": [ 7 | "dev_pl" 8 | ], 9 | "test": [ 10 | "test_pl" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/pl/word_list" 21 | }, 22 | "|V|": 43748, 23 | "file": "dict/pl/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ru/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/ru/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 45653 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ru/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_ru" 5 | ], 6 | "dev": [ 7 | "dev_ru" 8 | ], 9 | "test": [ 10 | "test_ru" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/ru/word_list" 21 | }, 22 | "|V|": 45653, 23 | "file": "dict/ru/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/ru/monitor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/ru/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/sv-SE/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/sv-SE/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 18689 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/sv-SE/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_sv-SE" 5 | ], 6 | "dev": [ 7 | "dev_sv-SE" 8 | ], 9 | "test": [ 10 | "test_sv-SE" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/sv-SE/word_list" 21 | }, 22 | "|V|": 18689, 23 | "file": "dict/sv-SE/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/sv-SE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/sv-SE/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tr/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/tr/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 38397 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tr/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train_tr" 5 | ], 6 | "dev": [ 7 | "dev_tr" 8 | ], 9 | "test": [ 10 | "test_tr" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/tr/word_list" 21 | }, 22 | "|V|": 38397, 23 | "file": "dict/tr/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tr/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/tr/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tt/lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "dict/tt/lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 22496 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tt/lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | 
"train": [ 4 | "train_tt" 5 | ], 6 | "dev": [ 7 | "dev_tt" 8 | ], 9 | "test": [ 10 | "test_tt" 11 | ], 12 | "packing-text-lm": { 13 | "nj": 4, 14 | "prune_shorter": 5 15 | } 16 | }, 17 | "tokenizer": { 18 | "type": "SimpleTokenizer", 19 | "option-init": { 20 | "dmap": "dict/tt/word_list" 21 | }, 22 | "|V|": 22496, 23 | "file": "dict/tt/lm/tokenizer_lm.tknz" 24 | }, 25 | "commit": "c102b404d8bbce612eecb7e5fa6cb7679609ec5c" 26 | } -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Monolingual/tt/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Monolingual/tt/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._phoneme_L/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._phoneme_L/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._phoneme_M/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._phoneme_M/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._phoneme_S/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._phoneme_S/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/exp/Multilingual/Multi._subword/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/cv-lang10/exp/Multilingual/Multi._subword/monitor.png -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/en/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for English. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models # The path containing G2P models from https://github.com/uiuc-sst/g2ps 7 | phonetisaurus-apply --model $g2ps/american-english.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/ˌ//g; s/l̩/l/g; s/n̩/n/g; s/#//g; s/[.]//g; s/g/ɡ/g; s/ei/e i/g; s/aɪ/a ɪ/g; s/ɔi/ɔ i/g; s/oʊ/o ʊ/g; s/aʊ/a ʊ/g; s/ɔɪ/ɔ ɪ/g; s/ɑɪ/ɑ ɪ/g; s/ɝ/ɜ/g; s/ɚ/ə/g; s/tʃ/t͡ʃ/g; s/dʒ/d͡ʒ/g; s/d ʒ/d͡ʒ/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/es/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Spanish. 
3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models # The path containing G2P models from https://github.com/uiuc-sst/g2ps 7 | phonetisaurus-apply --model $g2ps/spanish_4_3_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/g/ɡ/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/fr/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for French. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/french_8_4_3.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/w ˈa//g; s/g/ʒ/g; 11 | s/R/ʁ/g; s/í/i/g; s/ì/i/g; s/ò/o/g; s/ó/o/g; s/ü/u/g; s/ú/u/g; s/ù/u/g; s/á/a/g; 12 | s/ɑ̃/ɑ/g; s/œ̃/œ/g; s/ɛ̃/ɛ/g; s/ÿ/y/g; s/ë/e/g; s/ɔ̃/ɔ/g;' \ 13 | -e 's/[ ]*$//g; s/^[ ]*//g; s/[ ][ ]*/ /g' > $dict_dir/phone.txt 14 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/id/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Indonesian. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=local/g2ps/models 7 | phonetisaurus-apply --model $g2ps/Indonesian.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/it/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Italian. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/italian_8_2_3.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/dʒ/d͡ʒ/g; s/dz/d͡z/g; s/tʃ/t͡ʃ/g; s/ts/t͡s/g; s/∅/ø/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/ky/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Kirghiz. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/kirghiz_8_2_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/lang-process.md: -------------------------------------------------------------------------------- 1 | # Language process 2 | ## 1. 
Text normalization 3 | The training datasets of our models are sourced from the publicly available [`Common Voice`](https://commonvoice.mozilla.org/) 11.0. They contain some redundant symbols and foreign words that may affect model performance, so we perform text normalization for each language to remove them. 4 | 5 | ## 2. Lexicon generation and correction 6 | The %PER of the FST (Finite State Transducer) based G2P (Grapheme-to-Phoneme) models that we used to generate the pronunciation lexicons ranges from 7% to 45%, so the lexicons need to be corrected. 7 | 8 | ## 3. Check of phonemes 9 | Even after lexicon correction, the final lexicon is still not perfect and contains some noise. We further check our phonemes by referring to the IPA symbol tables in LanguageNet and Phoible, together with a Google Translate listening test. -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/nl/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Dutch. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/dutch.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/dʒ/d͡ʒ/g; s/œɪ/œ y/g; s/ɛɪ/ɛ i/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/nl/text_norm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # Acknowledgement: This script refers to the code of Huahuan Zheng (maxwellzh@outlook.com) 3 | # This script completes text normalization for the Dutch dataset from CommonVoice 4 | 5 | data_dir=$1 6 | for set in dev test excluded_train; do 7 | paste $data_dir/$set/text > $data_dir/$set/text.bak 8 | cut <$data_dir/$set/text.bak -f 2- | \ 9 | sed -e 's/,/ /g; s/"/ /g; s/“/ /g; s/[;]/ /g; s/[—]/ /g; s/[.]/ /g; s/:/ /g; s/!/ /g; s/”/ /g; s/?/ /g; s/«/ /g; s/»/ /g' | \ 10 | sed -e 's/[ ][ ]*/ /g; s/^[ ]*//g; s/[ ]*$//g' | \ 11 | python -c "import sys; print(sys.stdin.read().lower())" > $data_dir/$set/text.trans.tmp 12 | cut <$data_dir/$set/text.bak -f 1 > $data_dir/$set/text.id.tmp 13 | paste $data_dir/$set/text.{id,trans}.tmp > $data_dir/$set/text 14 | cat $data_dir/$set/text | sed -e 's/^[ ]*//g' | grep -v "^$" > $data_dir/$set/text_new 15 | mv $data_dir/$set/text_new $data_dir/$set/text 16 | rm -rf $data_dir/$set/text.{id,trans}.tmp 17 | done -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/pl/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Polish. 
3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/polish_2_2_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/ts/t͡s/g; s/dz/d͡z/g; s/ɖʐ/ɖ͡ʐ/g; s/tʂ/ʈ͡ʂ/g; s/dʑ/d͡ʑ/g; s/tɕ/t͡ɕ/g; s/ɔ̃/ɔ/g; s/ɨ̃/ɨ/g; s/ɛ̃/ɛ/g; s/w̃/w/g; s/ɛ̝/ɛ/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/pl/text_norm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # Acknowlegement: This script refer to the code of Huahuan Zheng (maxwellzh@outlook.com) 3 | # This script completes text normalization for Polish dataset from CommonVoice 4 | 5 | data_dir=$1 6 | for set in dev test excluded_train; do 7 | paste $data_dir/$set/text > $data_dir/$set/text.bak 8 | cut <$data_dir/$set/text.bak -f 2- | \ 9 | sed -e 's/,/ /g; s/"/ /g; s/“/ /g; s/[;]/ /g; s/[—]/ /g; s/[.]/ /g; s/:/ /g; s/!/ /g; s/”/ /g; s/?/ /g; s/«/ /g; s/»/ /g' | \ 10 | sed -e 's/[ ][ ]*/ /g; s/^[ ]*//g; s/[ ]*$//g' | \ 11 | python -c "import sys; print(sys.stdin.read().lower())" > $data_dir/$set/text.trans.tmp 12 | cut <$data_dir/$set/text.bak -f 1 > $data_dir/$set/text.id.tmp 13 | paste $data_dir/$set/text.{id,trans}.tmp > $data_dir/$set/text 14 | cat $data_dir/$set/text | sed -e 's/^[ ]*//g' | grep -v "^$" > $data_dir/$set/text_new 15 | mv $data_dir/$set/text_new $data_dir/$set/text 16 | rm -rf $data_dir/$set/text.{id,trans}.tmp 17 | done -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/ru/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) / 2 | # This script prepares phoneme-based lexicon and corrects it for Russian. 3 | 4 | dict_dir=$1 5 | # Generating lexicon 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/russian.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/sv-SE/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Swedish. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/swedish_4_4_4.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/tr/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Turkish. 
3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/turkish.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/d ʒ/d͡ʒ/g; s/dʒ/d͡ʒ/g; s/t ʃ/t͡ʃ/g; s/tʃ/t͡ʃ/g; s/ɡj/ɡ/g; s/g/ɡ/g; s/â/a/g; s/é/e/g; s/û/u/g; s/*//g; s/ ̇//g; s/[.]//g; s/ë/e/g; s/î/i/g' > $dict_dir/phone.txt 11 | -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/tt/lexicon.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # This script prepares phoneme-based lexicon and corrects it for Tatar. 3 | 4 | # Generating lexicon 5 | dict_dir=$1 6 | g2ps=g2ps/models 7 | phonetisaurus-apply --model $g2ps/tatar_2_2_2.fst --word_list $dict_dir/word_list > $dict_dir/lexicon.txt 8 | 9 | # Lexicon correction 10 | cat $dict_dir/lexicon.txt | awk '{$1=""; print $0}' | sed -e 's/jo/j o/g; s/g/ɡ/g' > $dict_dir/phone.txt -------------------------------------------------------------------------------- /egs/cv-lang10/lang-process/tt/text_norm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Tsinghua SPMI Lab, Author: Ma, Te (mate153125@gmail.com) 2 | # Acknowledgement: This script refers to the code of Huahuan Zheng (maxwellzh@outlook.com) 3 | # This script completes text normalization for the Tatar dataset from CommonVoice 4 | 5 | data_dir=$1 6 | for set in dev test excluded_train; do 7 | paste $data_dir/$set/text > $data_dir/$set/text.bak 8 | cut <$data_dir/$set/text.bak -f 2- | \ 9 | sed -e 's/,/ /g; s/"/ /g; s/“/ /g; s/[;]/ /g; s/[—]/ /g; s/[.]/ /g; s/:/ /g; s/!/ /g; s/”/ /g; s/?/ /g; s/«/ /g; s/»/ /g' | \ 10 | sed -e 's/[ ][ ]*/ /g; s/^[ ]*//g; s/[ ]*$//g' | \ 11 | python -c "import sys; print(sys.stdin.read().lower())" > $data_dir/$set/text.trans.tmp 12 | cut <$data_dir/$set/text.bak -f 1 > $data_dir/$set/text.id.tmp 13 | paste $data_dir/$set/text.{id,trans}.tmp > $data_dir/$set/text 14 | cat $data_dir/$set/text | sed -e 's/^[ ]*//g' | grep -v "^$" > $data_dir/$set/text_new 15 | mv $data_dir/$set/text_new $data_dir/$set/text 16 | rm -rf $data_dir/$set/text.{id,trans}.tmp 17 | done -------------------------------------------------------------------------------- /egs/cv-lang10/local/tools/calculate_dur.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("data_dir", type=str, help="path of data dir") 8 | args = parser.parse_args() 9 | file_name = os.path.join(args.data_dir, "utt2dur") 10 | 11 | assert os.path.isfile(file_name), "this script requires utt2dur to calculate the total duration." 12 | 13 | # start_time = time.time() 14 | total_duration = 0. 
15 | with open(file_name, "r") as f: 16 | for line in f: 17 | path = line.split()[1] 18 | duration = float(path) 19 | # duration = librosa.get_duration(filename=path) 20 | total_duration += duration 21 | # end_time = time.time() 22 | print(f"total duration: {total_duration/3600:2f} hour") 23 | # print(f"process time : {end_time-start_time:2f} second") 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /egs/cv-lang10/local/tools/char_list.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("word_list", type=str, help="text file") 8 | parser.add_argument("--out", type=str, help="path of output char list file") 9 | args = parser.parse_args() 10 | 11 | assert os.path.isfile(args.word_list), f"word_list={args.word_list} is not a valid file." 12 | 13 | char_list = set() 14 | 15 | with open(args.word_list, "r", encoding="utf-8") as f: 16 | for line in f: 17 | char_list.update(list(line.strip())) 18 | 19 | out = args.out if args.out else os.path.join(os.path.dirname(args.word_list), "char_list.txt") 20 | 21 | with open(out, "w", encoding="utf-8") as wf: 22 | for char in char_list: 23 | wf.write(char + "\n") 24 | 25 | 26 | -------------------------------------------------------------------------------- /egs/cv-lang10/local/tools/phone_list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("lexicon", type=str, help="lexicon file") 7 | parser.add_argument("--out", type=str, help="path of output phone list file") 8 | args = parser.parse_args() 9 | 10 | assert os.path.isfile(args.lexicon), f"phone_list={args.lexicon} is not a valid file." 11 | 12 | phone_list = set() 13 | 14 | with open(args.lexicon, "r", encoding="utf-8") as f: 15 | for line in f: 16 | phone_seq = line.strip().split('\t', maxsplit=1)[1] 17 | phone_list.update(phone_seq.split()) 18 | 19 | out = args.out if args.out else os.path.join(os.path.dirname(args.lexicon), "phone_list.txt") 20 | 21 | with open(out, "w", encoding="utf-8") as wf: 22 | for phone in phone_list: 23 | if phone != ' ': 24 | wf.write(phone + "\n") 25 | 26 | -------------------------------------------------------------------------------- /egs/cv-lang10/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils -------------------------------------------------------------------------------- /egs/libri/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 960 hour English speech data. Book reading speech. 3 | 4 | **Data prepare** 5 | 6 | Use one of the options: 7 | 8 | - Prepare data with Kaldi (default in results) 9 | 10 | ```bash 11 | bash local/data_kaldi.sh -h 12 | ``` 13 | 14 | - Prepare data with `torchaudio`: run following command to get help 15 | 16 | ```bash 17 | bash local/data.sh -h 18 | ``` 19 | 20 | ## Result 21 | 22 | Summarize experiments here. 
23 | 24 | Evaluated by WER (%) 25 | 26 | | EXPID | dev-clean | dev-other | test-clean | test-other | 27 | | ------------------------------------------------------------------ | --------- | --------- | ---------- | ---------- | 28 | | [rnnt](exp/rnnt-v1) + transformer [lm](exp/lm/lm-v1-transformer) | 1.81 | 4.03 | 1.94 | 4.39 | 29 | | [ctc-crf](exp/crf-v1) + transformer [lm](exp/lm/lm-v1-transformer) | 2.05 | 4.54 | 2.25 | 4.73 | 30 | 31 | -------------------------------------------------------------------------------- /egs/libri/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/libri/exp/crf-v1/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train-clean-100", 5 | "train-clean-360", 6 | "train-other-500" 7 | ], 8 | "dev": [ 9 | "dev-clean", 10 | "dev-other" 11 | ], 12 | "test": [ 13 | "dev-clean", 14 | "dev-other", 15 | "test-clean", 16 | "test-other" 17 | ], 18 | "filter": ":2000" 19 | }, 20 | "tokenizer": { 21 | "type": "LexiconTokenizer", 22 | "option-init": { 23 | "lexicon": "data/local/librispeech-lexicon.txt" 24 | } 25 | }, 26 | "train": { 27 | "bin": "cat.ctc.train", 28 | "option": { 29 | "amp": true, 30 | "batch_size": 128, 31 | "grad_accum_fold": 16, 32 | "grad_norm": 5.0, 33 | "dynamic_batch_mode": 1 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /egs/libri/exp/crf-v1/monitor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/libri/exp/crf-v1/monitor.jpg -------------------------------------------------------------------------------- /egs/libri/exp/lm/lm-v1-transformer/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "CausalTransformer", 4 | "kwargs": { 5 | "attn_dropout": 0.1, 6 | "dim_hid": 768, 7 | "num_classes": 1024, 8 | "num_head": 12, 9 | "num_layers": 12 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "SchedulerNoam", 14 | "optimizer": { 15 | "type": "Adam", 16 | "kwargs": { 17 | "lr": 0.001, 18 | "weight_decay": 1e-06 19 | } 20 | }, 21 | "kwargs": { 22 | "dim_model": 768, 23 | "peak_factor": 1.0, 24 | "warmup_step": 25000, 25 | "stop_step": 1200000 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /egs/libri/exp/lm/lm-v1-transformer/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": [ 4 | "train-clean-100", 5 | "train-clean-360", 6 | "train-other-500", 7 | "data/librispeech.txt" 8 | ], 9 | "dev": [ 10 | "dev-clean", 11 | "dev-other" 12 | ], 13 | "test": [ 14 | "dev-clean", 15 | "dev-other", 16 | "test-clean", 17 | "test-other" 18 | ], 19 | "packing-text-lm": { 20 | "truncate": 128 21 | } 22 | }, 23 | "tokenizer": { 24 | "type": "SentencePieceTokenizer", 25 | "option-train": { 26 | "model_type": "unigram", 27 | "vocab_size": 1024, 28 | "model_prefix": "sentencepiece/libri_unigram_1024/spm" 29 | } 30 | }, 31 | "train": { 32 | "bin": "cat.lm.train", 33 | "option": { 34 | "amp": true, 35 | "batch_size": 2048, 36 | "grad-norm": 5.0 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- 
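The `SchedulerNoam` settings in the transformer LM config above (`dim_model: 768`, `peak_factor: 1.0`, `warmup_step: 25000`, `stop_step: 1200000`) are easier to interpret as a curve. Below is a minimal sketch assuming the standard Noam schedule from "Attention Is All You Need", i.e. `lr = peak_factor * dim_model^-0.5 * min(step^-0.5, step * warmup_step^-1.5)`; how `cat.shared.scheduler` scales this against the optimizer's base `lr` of 0.001 is an assumption not verified here.

```python
# Sketch of the learning-rate shape implied by the SchedulerNoam block above,
# under the standard Noam formula (assumption: CAT's SchedulerNoam follows it).
def noam_lr(step: int, dim_model: int = 768, warmup: int = 25000, peak_factor: float = 1.0) -> float:
    step = max(step, 1)  # guard against step 0
    return peak_factor * dim_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Warmup rises linearly to the peak at step 25000 (about 2.3e-4 here), then decays as step^-0.5.
for step in (1000, 25000, 100000, 1200000):
    print(f"step {step:>7d}: lr ~ {noam_lr(step):.3e}")
```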
/egs/libri/exp/lm/lm-v1-transformer/monitor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/libri/exp/lm/lm-v1-transformer/monitor.jpg -------------------------------------------------------------------------------- /egs/libri/exp/lm/lm-v1-transformer/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * Model size/M: 87.42 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | - Transformer LM for libri corpus. 12 | - The training might take over 100 hours. 13 | 14 | ### Result 15 | ``` 16 | dev_clean | dev_other | test_other | test_clean 17 | 12.49 | 13.34 | 13.50 | 12.49 18 | ``` 19 | 20 | ### Monitor figure 21 | ![monitor](./monitor.jpg) 22 | -------------------------------------------------------------------------------- /egs/libri/exp/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/libri/exp/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/libri/local/prep_lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/local 4 | [ ! -f data/local/librispeech-lexicon.txt ] && 5 | wget https://www.openslr.org/resources/11/librispeech-lexicon.txt -P data/local 6 | 7 | echo "$0 done." 8 | exit 0 9 | -------------------------------------------------------------------------------- /egs/libri/local/prep_libri_corpus.sh: -------------------------------------------------------------------------------- 1 | # Author: Zheng Huahuan (maxwellzh@outlook.com) 2 | # This script includes the processing of librispeech extra corpus text 3 | set -e -u 4 | 5 | d_out=data 6 | 7 | mkdir -p $d_out 8 | text=$d_out/librispeech.txt 9 | if [ ! -f $text ]; then 10 | archive=$d_out/librispeech-lm-norm.txt.gz 11 | if [ ! -f $archive ]; then 12 | wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P $d_out || exit 1 13 | fi 14 | # check archive 15 | if [ $(md5sum $archive | cut -d ' ' -f 1) != "c83c64c726a1aedfe65f80aa311de402" ]; then 16 | echo "MD5 checking failed for $archive, please rm it then run this script again." 17 | exit 1 18 | fi 19 | gunzip -c $archive >$text || exit 1 20 | rm $archive 21 | echo "Fetched librispeech extra text corpus at $text" 22 | else 23 | echo "$text file exist. skipped" 24 | fi 25 | 26 | # check md5sum 27 | if [ $(md5sum $text | cut -d ' ' -f 1) != "c8288034566b62698db24f6cd414160d" ]; then 28 | echo "MD5 checking failed for $text, please rm it then run this script again." 
29 | exit 1 30 | fi 31 | -------------------------------------------------------------------------------- /egs/libri/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/wenetspeech/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/wenetspeech/cat: -------------------------------------------------------------------------------- 1 | ../../cat/ -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "EBM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "sumtokenlogit", 8 | "config_ebm_model": "exp/lm/GN-ELM-DNCE/config_ebm.json", 9 | "config_noise_model": "exp/lm/GN-ELM-DNCE/config_noise.json", 10 | "tokenizer_path": "exp/lm/GN-ELM-DNCE/tokenizer.tknz", 11 | "bert_tokenizer": true 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "SchedulerNoam", 16 | "kwargs": { 17 | "dim_model": 768, 18 | "peak_factor": 0.16, 19 | "warmup_step": 10000, 20 | "stop_step": 30000 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "kwargs": { 25 | "lr": 0.0008 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/config_ebm.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "BertLMHeadModel", 6 | "T_config": "BertConfig", 7 | "pretrained": "bert-base-chinese", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/ebm-lm/GN-ELM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "TRFLM", 4 | "kwargs": { 5 | "noise_rate": 1, 6 | "method": "dnce", 7 | "energy_func": "sumtargetlogit", 8 | "f_linfo": "exp/ebm-lm/TRF-LM-DNCE/pkl/train.pkl", 9 | "config_ebm_model": "exp/ebm-lm/TRF-LM-DNCE/config_trf.json", 10 | "config_noise_model": "exp/ebm-lm/TRF-LM-DNCE/config_noise.json", 11 | "tokenizer_path": "exp/ebm-lm/TRF-LM-DNCE/tokenizer.tknz", 12 | "alpha": 1, 13 | "with_end_mark": false, 14 | "bert_tokenizer": true 15 | } 16 | }, 17 | "scheduler": { 18 | "type": "SchedulerNoam", 19 | "kwargs": { 20 | "dim_model": 768, 21 | 
"peak_factor": 0.16, 22 | "warmup_step": 10000, 23 | "stop_step": 30000 24 | }, 25 | "optimizer": { 26 | "type": "Adam", 27 | "kwargs": { 28 | "lr": 0.0008 29 | } 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/config_noise.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/config_trf.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "PretrainedTransformer", 4 | "kwargs": { 5 | "T_model": "GPT2LMHeadModel", 6 | "T_config": "GPT2Config", 7 | "pretrained": "uer/gpt2-chinese-cluecorpussmall", 8 | "with_head": false 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/ebm-lm/TRF-LM-DNCE/readme.md: -------------------------------------------------------------------------------- 1 | # Train TRF-LM with DNCE 2 | 3 | ## Result 4 | We also try 3 different energy functions, whose results are as follows: 5 | 6 | | CER type | SumTargetLogit | Hidden2Scalar | SumTokenLogit | 7 | | ------------ | -------------- | ------------- | ------------- | 8 | | in-domain | 8.97 | 8.95 | 9.00 | 9 | | cross-domain | 15.77 | 15.67 | 15.65 | 10 | 11 | The training curve of the best model (Hidden2Scalar) is shown below. 
12 | 13 | | training curve | 14 | |:-----------------------:| 15 | |![monitor](./monitor.png)| -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-l/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-trans-l/5gram.bin", 6 | "gram_order": 5, 7 | "num_classes": 5536 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-l/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_l", 4 | "test": [ 5 | "test_net", 6 | "test_meeting" 7 | ] 8 | }, 9 | "tokenizer": { 10 | "type": "SentencePieceTokenizer", 11 | "option-train": { 12 | "model_type": "char", 13 | "add_dummy_prefix": false, 14 | "use_all_vocab": true, 15 | "model_prefix": "sentencepiece/wenetspeech_l_char/spm", 16 | "vocab_size": 5536 17 | } 18 | }, 19 | "inference": {} 20 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-l/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-trans-l 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: trie 12 | - size: 1.3GB 13 | 14 | perplexity: 15 | 16 | ``` 17 | Test file: test_net -> ppl: 59.07 18 | Test file: test_meeting -> ppl: 55.39 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-m/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/lm/lm-trans-m/5gram.bin", 6 | "gram_order": 5, 7 | "num_classes": 5147 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-m/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_m", 4 | "dev": [ 5 | "dev" 6 | ], 7 | "test": [ 8 | "dev", 9 | "test_net", 10 | "test_meeting" 11 | ] 12 | }, 13 | "tokenizer": { 14 | "type": "SentencePieceTokenizer", 15 | "option-train": { 16 | "model_type": "char", 17 | "add_dummy_prefix": false, 18 | "use_all_vocab": true, 19 | "model_prefix": "sentencepiece/wenetspeech_m_char/spm", 20 | "vocab_size": 5147 21 | } 22 | }, 23 | "inference": {} 24 | } 25 | -------------------------------------------------------------------------------- /egs/wenetspeech/exp/lm/lm-trans-m/readme.md: -------------------------------------------------------------------------------- 1 | 2 | train command: 3 | 4 | ```bash 5 | utils/pipeline/ngram.sh exp/lm/lm-trans-m 6 | ``` 7 | 8 | property: 9 | 10 | - prune: 11 | - type: probing 12 | - size: 467MB 13 | 14 | perplexity: 15 | 16 | ``` 17 | data: dev test_net test_meeting 18 | ppl: 61.33 | 73.53 | 65.49 | 19 | ``` 20 | -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_l/crf-v1/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "dtrain": "train_l", 4 | "dev": [ 5 | "dev" 6 | ], 7 | "dtest": [ 8 | "dev", 9 | "test_net", 10 | "test_meeting", 11 | "aishell-test" 12 | ], 13 | "filter": "10:1000" 14 | }, 15 | 
"tokenizer": { 16 | "type": "JiebaComposeLexiconTokenizer", 17 | "option-init": { 18 | "lexicon": "", 19 | "userdict": "" 20 | } 21 | }, 22 | "train": { 23 | "bin": "cat.ctc.train", 24 | "option": { 25 | "amp": true, 26 | "batch_size": 864, 27 | "grad-norm": 2.0, 28 | "grad-accum-fold": 3, 29 | "check-freq": 500 30 | } 31 | }, 32 | "inference": { 33 | "avgmodel": { 34 | "mode": "best", 35 | "num": 10 36 | }, 37 | "infer": { 38 | "bin": "cat.ctc.cal_logit", 39 | "option": { 40 | "nj": 48 41 | } 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_l/crf-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_l/crf-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_l/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_l/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/crf-v1/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_m", 4 | "dev": [ 5 | "dev" 6 | ], 7 | "test": [ 8 | "dev", 9 | "test_net", 10 | "test_meeting" 11 | ], 12 | "filter": ":1000" 13 | }, 14 | "train": { 15 | "bin": "cat.ctc.train", 16 | "option": { 17 | "amp": true, 18 | "batch_size": 512, 19 | "grad_norm": 5.0, 20 | "dynamic_batch_mode": 1 21 | } 22 | }, 23 | "inference": { 24 | "avgmodel": { 25 | "mode": "last", 26 | "num": 10 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/crf-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_m/crf-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/ctc-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_m/ctc-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/ctc-v1/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * Model size/M: 86.01 6 | * GPU info \[7\] 7 | * \[7\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | - CTC topo of `rnnt-v1` 12 | 13 | ### Result 14 | ``` 15 | no lm 16 | dev %SER 75.46 | %CER 11.80 [ 39003 / 330498, 1282 ins, 14446 del, 23275 sub ] 17 | test_net %SER 70.72 | %CER 14.28 [ 59372 / 415746, 2004 ins, 12692 del, 44676 sub ] 18 | test_meeting %SER 94.50 | %CER 22.23 [ 48983 / 220385, 1622 ins, 17767 del, 29594 sub ] 19 | aishell-test %SER 61.05 | %CER 9.05 [ 9478 / 104765, 347 ins, 201 del, 8930 sub ] 20 | ``` 21 | 22 | | training process | 23 | |:-----------------------:| 24 | |![tb-plot](./monitor.png)| 25 | 
-------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/rnnt-v1/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wenetspeech/exp/train_m/rnnt-v1/monitor.png -------------------------------------------------------------------------------- /egs/wenetspeech/exp/train_m/rnnt-v1/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * Model size/M: 91.27 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Appendix 10 | 11 | * trained on wenet speech M subset (1000 hour speech) 12 | 13 | ### Result 14 | ``` 15 | beamsize 128 no lm 16 | dev %SER 71.10 | %CER 11.14 [ 36833 / 330498, 1284 ins, 16210 del, 19339 sub ] 17 | test_net %SER 65.51 | %CER 12.75 [ 52991 / 415746, 1942 ins, 12914 del, 38135 sub ] 18 | test_meeting %SER 91.74 | %CER 20.88 [ 46025 / 220385, 1236 ins, 22703 del, 22086 sub ] 19 | aishell-dev %SER 45.05 | %CER 6.32 [ 12985 / 205341, 420 ins, 248 del, 12317 sub ] 20 | aishell-test %SER 49.97 | %CER 7.22 [ 7562 / 104765, 253 ins, 204 del, 7105 sub ] 21 | ``` 22 | 23 | 24 | ### Monitor figure 25 | ![monitor](./monitor.png) 26 | -------------------------------------------------------------------------------- /egs/wenetspeech/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /egs/wsj/.vscode: -------------------------------------------------------------------------------- 1 | ../../.vscode/ -------------------------------------------------------------------------------- /egs/wsj/cat: -------------------------------------------------------------------------------- 1 | ../../cat -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-crf-phone/decode_lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 127924 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/extra.corpus" 4 | }, 5 | "tokenizer": { 6 | "type": "SimpleTokenizer", 7 | "option-init": { 8 | "dmap": "data/cmudict.txt", 9 | "read_index_from_file": false 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_si284-3sp", 4 | "dev": "test_dev93-3sp", 5 | "test": [ 6 | "test_dev93", 7 | "test_eval92" 8 | ], 9 | "filter": ":2000" 10 | }, 11 | "tokenizer": { 12 | "type": "LexiconTokenizer", 13 | "option-init": { 14 | "lexicon": "data/cmudict.txt" 15 | } 16 | }, 17 | "train": { 18 | "bin": "cat.ctc.train", 19 | "option": { 20 | "amp": true, 21 | "batch_size": 256 22 | } 23 | }, 24 | "inference": { 25 | "avgmodel": { 26 | "mode": "last", 27 | "num": 2 28 | }, 29 | "infer": { 30 | 
"bin": "cat.ctc.cal_logit", 31 | "option": { 32 | "nj": 16 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wsj/exp/asr-ctc-crf-phone/monitor.png -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-crf-phone/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 13.39 6 | * GPU info \[5\] 7 | * \[5\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | * CTC-CRF training for around 23 epochs 12 | 13 | ### Result 14 | ``` 15 | eval92 %SER 29.73 | %WER 2.87 [ 162 / 5643, 14 ins, 14 del, 134 sub ] 16 | dev93 %SER 46.32 | %WER 5.53 [ 455 / 8234, 49 ins, 52 del, 354 sub ] 17 | 18 | ``` 19 | 20 | | training process | 21 | |:-----------------------:| 22 | |![monitor](./monitor.png)| 23 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/decode_lm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decoder": { 3 | "type": "NGram", 4 | "kwargs": { 5 | "f_binlm": "exp/asr-ctc-phone/decode_lm/4gram.arpa", 6 | "gram_order": 4, 7 | "num_classes": 127924 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/decode_lm/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "data/extra.corpus" 4 | }, 5 | "tokenizer": { 6 | "type": "SimpleTokenizer", 7 | "option-init": { 8 | "dmap": "data/cmudict.txt", 9 | "read_index_from_file": false 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_si284-3sp", 4 | "dev": "test_dev93-3sp", 5 | "test": [ 6 | "test_dev93", 7 | "test_eval92" 8 | ], 9 | "filter": ":2000" 10 | }, 11 | "tokenizer": { 12 | "type": "LexiconTokenizer", 13 | "option-init": { 14 | "lexicon": "data/cmudict.txt" 15 | } 16 | }, 17 | "train": { 18 | "bin": "cat.ctc.train", 19 | "option": { 20 | "amp": true, 21 | "batch_size": 256 22 | } 23 | }, 24 | "inference": { 25 | "avgmodel": { 26 | "mode": "last", 27 | "num": 2 28 | }, 29 | "infer": { 30 | "bin": "cat.ctc.cal_logit", 31 | "option": { 32 | "nj": 16 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wsj/exp/asr-ctc-phone/monitor.png -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 13.39 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### 
Notes 10 | 11 | * CTC training for 23 epochs 12 | 13 | ### Result 14 | ``` 15 | eval92 %SER 51.49 | %WER 6.79 [ 383 / 5643, 30 ins, 54 del, 299 sub ] 16 | dev93 %SER 69.77 | %WER 11.88 [ 978 / 8234, 60 ins, 211 del, 707 sub ] 17 | ``` 18 | 19 | | training process | 20 | |:-----------------------:| 21 | |![monitor](./monitor.png)| 22 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-ctc-phone/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Huahuan Zheng 3 | set -e -u 4 | 5 | dir=$(dirname $0) 6 | # Use a hack to re-use the script 7 | touch $dir/den_lm.fst 8 | bash ../asr-ctc-crf-phone/run.sh 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-rnnt-bpe/hyper-p.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "train": "train_si284-3sp", 4 | "dev": "test_dev93-3sp", 5 | "test": [ 6 | "test_dev93", 7 | "test_eval92" 8 | ], 9 | "filter": ":2000" 10 | }, 11 | "tokenizer": { 12 | "type": "SentencePieceTokenizer", 13 | "option-train": { 14 | "model_type": "bpe", 15 | "vocab_size": 2000, 16 | "model_prefix": "sentencepiece/wsj_bpe2k/spm" 17 | } 18 | }, 19 | "train": { 20 | "bin": "cat.rnnt.train", 21 | "option": { 22 | "amp": true, 23 | "batch_size": 256 24 | } 25 | }, 26 | "inference": { 27 | "avgmodel": { 28 | "mode": "best", 29 | "num": 5 30 | }, 31 | "infer": { 32 | "bin": "cat.rnnt.decode", 33 | "option": { 34 | "beam_size": 16, 35 | "nj": 16 36 | } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /egs/wsj/exp/asr-rnnt-bpe/monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/egs/wsj/exp/asr-rnnt-bpe/monitor.png -------------------------------------------------------------------------------- /egs/wsj/exp/asr-rnnt-bpe/readme.md: -------------------------------------------------------------------------------- 1 | ### Basic info 2 | 3 | **This part is auto-generated, add your details in Appendix** 4 | 5 | * \# of parameters (million): 21.60 6 | * GPU info \[10\] 7 | * \[10\] NVIDIA GeForce RTX 3090 8 | 9 | ### Notes 10 | 11 | ```bash 12 | # prepare data 13 | bash local/data_kaldi.sh -use-3way-sp 14 | 15 | # train and inference 16 | python utils/pipeline/asr.py exp/asr-rnnt-bpe 17 | ``` 18 | 19 | * RNN-T training and beam search decoding (a SentencePiece sketch of the BPE tokenizer settings follows this file) 20 | 21 | ### Result 22 | ``` 23 | eval92 %SER 66.37 | %WER 9.87 [ 557 / 5643, 35 ins, 74 del, 448 sub ] 24 | dev93 %SER 72.76 | %WER 12.63 [ 1040 / 8234, 103 ins, 174 del, 763 sub ] 25 | ``` 26 | 27 | | training process | 28 | |:-----------------------:| 29 | |![monitor](./monitor.png)| 30 |
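The `tokenizer` section of `exp/asr-rnnt-bpe/hyper-p.json` above asks for a SentencePiece BPE model with a 2000-token vocabulary under the prefix `sentencepiece/wsj_bpe2k/spm`. As a rough sketch of what those options correspond to at the SentencePiece command line (the training-text path below is a hypothetical placeholder; in practice the pipeline script `utils/pipeline/asr.py` presumably trains the tokenizer from this config itself):

```bash
# Sketch only: standalone spm_train call mirroring the "option-train" settings.
# /path/to/train_text.txt is a placeholder for the training transcripts.
spm_train \
    --input=/path/to/train_text.txt \
    --model_prefix=sentencepiece/wsj_bpe2k/spm \
    --model_type=bpe \
    --vocab_size=2000
```

SentencePiece writes `spm.model` and `spm.vocab` under the given prefix, which is presumably what the `SentencePieceTokenizer` then loads for tokenization during training and decoding.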
-------------------------------------------------------------------------------- /egs/wsj/utils: -------------------------------------------------------------------------------- 1 | ../../cat/utils/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # gather >= 0.2.3 2 | -e git+https://github.com/maxwellzh/torch-gather.git#egg=gather 3 | # warp-rnnt >= 0.9.0 4 | -e git+https://github.com/maxwellzh/warp-rnnt.git#egg=warp-rnnt 5 | # webdataset: do not use 'pip install webdataset'; the PyPI release is outdated 6 | -e git+https://github.com/webdataset/webdataset.git@d7334016f44a03c4a385971aa835c4f460d3f30a#egg=webdataset 7 | # warp-ctct >= 0.3.0 8 | -e git+https://github.com/maxwellzh/warp-ctct.git#egg=warp-ctct 9 | 10 | # module dependencies 11 | torch>=1.9.0 12 | tqdm>=4.62.3 13 | matplotlib>=3.4.3 14 | sentencepiece>=0.1.96 15 | kaldiio>=2.17.2 16 | # dependency issue, see https://github.com/protocolbuffers/protobuf/issues/10051 17 | protobuf==3.20.2 18 | tensorboard>=2.6.0 19 | jiwer>=2.2.0 20 | pyyaml>=6.0 21 | transformers>=4.12.3 22 | jieba>=0.42.1 23 | levenshtein -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='cat', 5 | version='3.0.1', 6 | packages=find_packages(exclude=['src', 'tools']), 7 | description="CRF-based ASR Toolkit.", 8 | long_description=open('README.md', 'r').read(), 9 | author="THU-SPMI Lab.", 10 | url="https://github.com/thu-spmi/CAT", 11 | platforms=["Linux x86-64"], 12 | license="Apache 2.0" 13 | ) 14 | -------------------------------------------------------------------------------- /src/ctc_crf/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2019 Tsinghua University, Author: Hongyu Xiang 2 | # 2021 Tsinghua University, Author: Huahuan Zheng 3 | # Apache 2.0. 4 | # CTC-CRF Makefile 5 | 6 | openfst_dir=${CURDIR}/openfst-1.6.7/build 7 | 8 | .PHONY: OPENFST GPUCTC GPUDEN CTCCRF 9 | all: CTCCRF 10 | OPENFST: 11 | if [ ! -f "openfst-1.6.7.tar.gz" ]; then wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.7.tar.gz; fi 12 | tar -zxf openfst-1.6.7.tar.gz 13 | cd openfst-1.6.7; ./configure --prefix=${openfst_dir} 14 | cd openfst-1.6.7; make -j $(nproc) && make install 15 | GPUCTC: 16 | mkdir -p gpu_ctc/build 17 | cd gpu_ctc/build && cmake .. 18 | cd gpu_ctc/build && make 19 | GPUDEN: OPENFST 20 | mkdir -p gpu_den/build 21 | cd gpu_den/build && cmake -D openfst_dir:STRING=${openfst_dir} .. 22 | cd gpu_den/build && make 23 | CTCCRF: GPUCTC GPUDEN 24 | python3 setup.py install 25 | 26 | clean: 27 | python setup.py clean --all 28 | rm -rf gpu_{ctc,den}/build build/ \ 29 | openfst-1.6.7/ *.tar.gz \ 30 | dist/ ctc_crf.egg-info/ 31 | -------------------------------------------------------------------------------- /src/ctc_crf/gpu_ctc/README.txt: -------------------------------------------------------------------------------- 1 | The code in this directory is from Baidu's warp-ctc (https://github.com/baidu-research/warp-ctc). 2 | We reorganize the code and modify the input to be the softmax of the logits.
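The `src/ctc_crf/Makefile` above chains the whole CTC-CRF extension build: the default target downloads and builds OpenFST 1.6.7, compiles the `gpu_ctc` and `gpu_den` CUDA kernels with CMake, and finally installs the Python binding via `python3 setup.py install`. A minimal usage sketch, assuming `wget`, `cmake`, a CUDA toolchain, and a PyTorch-enabled Python environment are already available (these prerequisites are not stated in the Makefile itself):

```bash
# Sketch: build and install the CTC-CRF extension with the Makefile above.
cd src/ctc_crf
make          # OPENFST + GPUCTC + GPUDEN, then `python3 setup.py install`
# make clean  # removes the OpenFST sources/tarball, build directories, and egg-info
```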
-------------------------------------------------------------------------------- /src/ctc_crf/gpu_ctc/hostdevice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | #define HOSTDEVICE __host__ __device__ 5 | #else 6 | #define HOSTDEVICE 7 | #endif 8 | -------------------------------------------------------------------------------- /src/ctc_crf/test/den_lm.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-spmi/CAT/d203db6151327e1f947428daefefd0bdba934dd1/src/ctc_crf/test/den_lm.fst -------------------------------------------------------------------------------- /src/fst-decoder/Makefile: -------------------------------------------------------------------------------- 1 | # author: Huahuan Zheng (maxwellzh@outlook.com) 2 | # This file is modified from kaldi/src/bin/Makefile 3 | 4 | kaldi_root_src=${KALDI_ROOT}/src 5 | EXTRA_CXXFLAGS = -Wno-sign-compare -I${kaldi_root_src} 6 | include ${kaldi_root_src}/kaldi.mk 7 | 8 | BINFILES = latgen-faster 9 | 10 | 11 | OBJFILES = 12 | 13 | ADDLIBS = ${kaldi_root_src}/decoder/kaldi-decoder.a ${kaldi_root_src}/lat/kaldi-lat.a ${kaldi_root_src}/lm/kaldi-lm.a \ 14 | ${kaldi_root_src}/fstext/kaldi-fstext.a ${kaldi_root_src}/hmm/kaldi-hmm.a \ 15 | ${kaldi_root_src}/transform/kaldi-transform.a ${kaldi_root_src}/gmm/kaldi-gmm.a \ 16 | ${kaldi_root_src}/tree/kaldi-tree.a ${kaldi_root_src}/util/kaldi-util.a ${kaldi_root_src}/matrix/kaldi-matrix.a \ 17 | ${kaldi_root_src}/base/kaldi-base.a 18 | 19 | 20 | TESTFILES = 21 | 22 | include ${kaldi_root_src}/makefiles/default_rules.mk 23 | --------------------------------------------------------------------------------
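The `src/fst-decoder/Makefile` above reuses Kaldi's build system: it includes `${KALDI_ROOT}/src/kaldi.mk` and links `latgen-faster` against the listed Kaldi static libraries. A hedged build sketch, assuming `KALDI_ROOT` points at an already-compiled Kaldi checkout (the path below is a placeholder):

```bash
# Sketch: build the latgen-faster decoder against an existing Kaldi build.
export KALDI_ROOT=/path/to/kaldi   # placeholder; must contain compiled src/*.a libraries
cd src/fst-decoder
make                               # produces the latgen-faster binary
```

Since the Makefile is modified from `kaldi/src/bin/Makefile`, the build mirrors how Kaldi's own binaries are compiled.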