├── espnet2 ├── asr │ ├── mamba_ssm │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── triton │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ │ ├── layernorm.cpython-39.pyc │ │ │ │ │ └── selective_state_update.cpython-39.pyc │ │ │ │ └── selective_state_update.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ └── selective_scan_interface.cpython-39.pyc │ │ │ └── selective_scan_interface.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── mamba_simple.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── hf.py │ │ │ └── generation.py │ │ └── __init__.py │ ├── uma.py │ ├── decoder │ │ └── unimodal_attention_decoder.py │ └── encoder │ │ ├── mamba_encoder.py │ │ └── conformer_encoder.py └── bin │ └── asr_unimodal_train.py ├── uma.png ├── mamba_uma.png ├── egs2 ├── aishell │ ├── umaconf │ │ ├── decode_asr_uma.yaml │ │ ├── train_asr_uma_mamba.yaml │ │ ├── train_asr_uma_conformer.yaml │ │ └── train_asr_uma_conformer_condition.yaml │ ├── exp_uma_mamba_0617 │ │ └── asr_train_asr_uma_mamba_raw_zh_char_sp │ │ │ ├── images │ │ │ ├── cer.png │ │ │ ├── loss.png │ │ │ ├── cer_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── loss_ctc.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── forward_time.png │ │ │ ├── text_vs_uma.png │ │ │ ├── backward_time.png │ │ │ ├── optim_step_time.png │ │ │ ├── uma_reduction.png │ │ │ └── gpu_max_cached_mem_GB.png │ │ │ └── RESULTS.md │ ├── exp_uma_conformer_12e_69 │ │ └── asr_train_asr_unimodal_conformer_raw_zh_char_sp │ │ │ ├── images │ │ │ ├── cer.png │ │ │ ├── loss.png │ │ │ ├── cer_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── loss_ctc.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── forward_time.png │ │ │ ├── backward_time.png │ │ │ ├── optim_step_time.png │ │ │ └── gpu_max_cached_mem_GB.png │ │ │ └── RESULTS.md │ ├── exp_uma_conformer_condition0302_32_731 │ │ └── asr_train_asr_uma_conformer_condition_raw_zh_char_sp │ │ │ ├── images │ │ │ ├── cer.png │ │ │ ├── loss.png │ │ │ 
├── cer_ctc.png │ │ │ ├── loss_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── backward_time.png │ │ │ ├── forward_time.png │ │ │ ├── optim_step_time.png │ │ │ ├── gpu_max_cached_mem_GB.png │ │ │ ├── cer_interctc_declayer2.png │ │ │ ├── cer_interctc_declayer4.png │ │ │ ├── cer_interctc_enclayer12.png │ │ │ ├── cer_interctc_enclayer6.png │ │ │ ├── cer_interctc_enclayer9.png │ │ │ ├── loss_interctc_declayer2.png │ │ │ ├── loss_interctc_declayer4.png │ │ │ ├── loss_interctc_enclayer6.png │ │ │ ├── loss_interctc_enclayer9.png │ │ │ └── loss_interctc_enclayer12.png │ │ │ └── RESULTS.md │ └── run_unimodal.sh ├── hkust │ ├── umaconf │ │ ├── decode_asr_uma.yaml │ │ ├── train_asr_uma_conformer.yaml │ │ ├── train_asr_uma_branchformer.yaml │ │ ├── train_asr_uma_conformer_condition.yaml │ │ └── train_asr_uma_branchformer_condition.yaml │ ├── exp_uma_conformer_12e_67 │ │ └── asr_train_asr_uma_conformer_raw_zh_char_sp │ │ │ ├── images │ │ │ ├── cer.png │ │ │ ├── loss.png │ │ │ ├── cer_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── loss_ctc.png │ │ │ ├── forward_time.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── backward_time.png │ │ │ ├── optim_step_time.png │ │ │ └── gpu_max_cached_mem_GB.png │ │ │ └── RESULTS.md │ ├── exp_uma_branchformer_12e_69 │ │ └── asr_train_asr_uma_branchformer_raw_zh_char_sp │ │ │ ├── images │ │ │ ├── cer.png │ │ │ ├── loss.png │ │ │ ├── cer_ctc.png │ │ │ ├── loss_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── backward_time.png │ │ │ ├── forward_time.png │ │ │ ├── optim_step_time.png │ │ │ └── gpu_max_cached_mem_GB.png │ │ │ └── RESULTS.md │ ├── exp_uma_conformer_condition0302_32_712 │ │ └── asr_train_asr_uma_conformer_condition_raw_zh_char_sp │ │ │ ├── images │ │ │ ├── cer.png │ │ │ ├── loss.png │ │ │ ├── cer_ctc.png │ │ │ ├── loss_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── backward_time.png │ │ │ ├── 
forward_time.png │ │ │ ├── optim_step_time.png │ │ │ ├── cer_interctc_declayer2.png │ │ │ ├── cer_interctc_declayer4.png │ │ │ ├── cer_interctc_enclayer12.png │ │ │ ├── cer_interctc_enclayer6.png │ │ │ ├── cer_interctc_enclayer9.png │ │ │ ├── gpu_max_cached_mem_GB.png │ │ │ ├── loss_interctc_declayer2.png │ │ │ ├── loss_interctc_declayer4.png │ │ │ ├── loss_interctc_enclayer6.png │ │ │ ├── loss_interctc_enclayer9.png │ │ │ └── loss_interctc_enclayer12.png │ │ │ └── RESULTS.md │ ├── exp_uma_branchformer_condition0302_32_711 │ │ └── asr_train_asr_uma_branchformer_condition_raw_zh_char_sp │ │ │ ├── images │ │ │ ├── cer.png │ │ │ ├── loss.png │ │ │ ├── cer_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── loss_ctc.png │ │ │ ├── forward_time.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── backward_time.png │ │ │ ├── optim_step_time.png │ │ │ ├── cer_interctc_declayer2.png │ │ │ ├── cer_interctc_declayer4.png │ │ │ ├── cer_interctc_enclayer6.png │ │ │ ├── cer_interctc_enclayer9.png │ │ │ ├── gpu_max_cached_mem_GB.png │ │ │ ├── cer_interctc_enclayer12.png │ │ │ ├── loss_interctc_declayer2.png │ │ │ ├── loss_interctc_declayer4.png │ │ │ ├── loss_interctc_enclayer12.png │ │ │ ├── loss_interctc_enclayer6.png │ │ │ └── loss_interctc_enclayer9.png │ │ │ └── RESULTS.md │ └── run_unimodal.sh └── aishell2 │ ├── umaconf │ ├── decode_asr_uma.yaml │ ├── train_asr_uma_mamba_b.yaml │ ├── train_asr_uma_conformer.yaml │ └── train_asr_uma_conformer_condition.yaml │ ├── exp_uma_mamba_0819 │ └── asr_train_asr_uma_mamba_b_raw_zh_char_sp │ │ ├── images │ │ ├── cer.png │ │ ├── loss.png │ │ ├── cer_ctc.png │ │ ├── iter_time.png │ │ ├── loss_ctc.png │ │ ├── forward_time.png │ │ ├── optim0_lr0.png │ │ ├── text_vs_uma.png │ │ ├── train_time.png │ │ ├── backward_time.png │ │ ├── uma_reduction.png │ │ ├── optim_step_time.png │ │ └── gpu_max_cached_mem_GB.png │ │ └── RESULTS.md │ ├── exp_uma_conformer_12e_718 │ ├── asr_train_asr_uma_conformer_raw_zh_char_sp │ │ ├── images │ │ │ ├── cer.png │ 
│ │ ├── loss.png │ │ │ ├── cer_ctc.png │ │ │ ├── loss_ctc.png │ │ │ ├── iter_time.png │ │ │ ├── optim0_lr0.png │ │ │ ├── train_time.png │ │ │ ├── backward_time.png │ │ │ ├── forward_time.png │ │ │ ├── optim_step_time.png │ │ │ └── gpu_max_cached_mem_GB.png │ │ └── RESULTS.md │ └── asr_train_asr_uma_conformer_condition_raw_zh_char_sp │ │ ├── images │ │ ├── cer.png │ │ ├── loss.png │ │ ├── cer_ctc.png │ │ ├── loss_ctc.png │ │ ├── iter_time.png │ │ ├── optim0_lr0.png │ │ ├── train_time.png │ │ ├── backward_time.png │ │ ├── forward_time.png │ │ ├── optim_step_time.png │ │ ├── cer_interctc_declayer2.png │ │ ├── cer_interctc_declayer4.png │ │ ├── cer_interctc_enclayer12.png │ │ ├── cer_interctc_enclayer6.png │ │ ├── cer_interctc_enclayer9.png │ │ ├── gpu_max_cached_mem_GB.png │ │ ├── loss_interctc_declayer2.png │ │ ├── loss_interctc_declayer4.png │ │ ├── loss_interctc_enclayer6.png │ │ ├── loss_interctc_enclayer9.png │ │ └── loss_interctc_enclayer12.png │ │ └── RESULTS.md │ └── run_unimodal.sh └── README.md /espnet2/asr/mamba_ssm/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/ops/triton/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /uma.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/uma.png -------------------------------------------------------------------------------- /mamba_uma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/mamba_uma.png -------------------------------------------------------------------------------- /egs2/aishell/umaconf/decode_asr_uma.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 1 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 1 6 | lm_weight: 0.7 -------------------------------------------------------------------------------- /egs2/hkust/umaconf/decode_asr_uma.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 1 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 1 6 | lm_weight: 0.3 -------------------------------------------------------------------------------- /egs2/aishell2/umaconf/decode_asr_uma.yaml: -------------------------------------------------------------------------------- 1 | beam_size: 1 2 | penalty: 0.0 3 | maxlenratio: 0.0 4 | minlenratio: 0.0 5 | ctc_weight: 1 6 | lm_weight: 0.3 -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/ops/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/espnet2/asr/mamba_ssm/ops/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/ops/triton/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/espnet2/asr/mamba_ssm/ops/triton/__pycache__/__init__.cpython-39.pyc 
-------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/ops/triton/__pycache__/layernorm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/espnet2/asr/mamba_ssm/ops/triton/__pycache__/layernorm.cpython-39.pyc -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/ops/__pycache__/selective_scan_interface.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/espnet2/asr/mamba_ssm/ops/__pycache__/selective_scan_interface.cpython-39.pyc -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/ops/triton/__pycache__/selective_state_update.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/espnet2/asr/mamba_ssm/ops/triton/__pycache__/selective_state_update.cpython-39.pyc -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/loss.png 
-------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/text_vs_uma.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/text_vs_uma.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- 
/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/uma_reduction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/uma_reduction.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/text_vs_uma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/text_vs_uma.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- 
/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/uma_reduction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/uma_reduction.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- 
/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- 
/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/train_time.png 
-------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_mamba_0819/asr_train_asr_uma_mamba_b_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/iter_time.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/forward_time.png 
-------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- 
/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_12e_67/asr_train_asr_uma_conformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/train_time.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_12e_69/asr_train_asr_uma_branchformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_ctc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_ctc.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/iter_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/iter_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_ctc.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_ctc.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/forward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/forward_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/optim0_lr0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/optim0_lr0.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/train_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/train_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/backward_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/backward_time.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/optim_step_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/optim_step_time.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png -------------------------------------------------------------------------------- 
/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png 
-------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_declayer2.png -------------------------------------------------------------------------------- 
/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer9.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/gpu_max_cached_mem_GB.png 
-------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/cer_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_declayer2.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_declayer4.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer12.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer6.png -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Audio-WestlakeU/UMA-ASR/HEAD/egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/images/loss_interctc_enclayer9.png -------------------------------------------------------------------------------- /espnet2/asr/mamba_ssm/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: FnoY fangying@westlake.edu.cn 3 | LastEditors: FnoY0723 fangying@westlake.edu.cn 4 | LastEditTime: 2024-03-15 20:55:13 5 | FilePath: /espnet/espnet2/asr/mamba_ssm/__init__.py 6 | ''' 7 | __version__ = "1.2.0.post1" 8 | 9 | from espnet2.asr.mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn 10 | from espnet2.asr.mamba_ssm.modules.mamba_simple import Mamba 11 | 12 | -------------------------------------------------------------------------------- /espnet2/bin/asr_unimodal_train.py: 
def get_parser():
    """Return the argument parser produced by ``ASRTask.get_parser``."""
    return ASRTask.get_parser()


def main(cmd=None):
    r"""Run ASR training by delegating to ``ASRTask.main``.

    Example:

        % python asr_train.py asr --print_config --optim adadelta \
                > conf/train_asr.yaml
        % python asr_train.py --config conf/train_asr.yaml

    Args:
        cmd: Forwarded unchanged as the ``cmd`` keyword of ``ASRTask.main``.
    """
    ASRTask.main(cmd=cmd)


if __name__ == "__main__":
    main()
def load_config_hf(model_name):
    """Load a model's configuration file and return it as parsed JSON.

    The file named by ``CONFIG_NAME`` is resolved for ``model_name`` through
    ``cached_file`` and decoded with ``json.load``.

    Args:
        model_name: Identifier passed to ``cached_file`` as its first argument.

    Returns:
        The object produced by ``json.load`` for the resolved file.
    """
    resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False)
    # A context manager closes the handle deterministically; the previous
    # ``json.load(open(...))`` form left that to the garbage collector.
    # JSON interchange files are UTF-8, so do not depend on the locale default.
    with open(resolved_archive_file, encoding="utf-8") as config_file:
        return json.load(config_file)


def load_state_dict_hf(model_name, device=None, dtype=None):
    """Load a model's weights file and return the state dict on ``device``.

    The file named by ``WEIGHTS_NAME`` is resolved for ``model_name`` through
    ``cached_file`` and read with ``torch.load``. Each value is then cast to
    ``dtype`` (when given) and moved to ``device``.

    Args:
        model_name: Identifier passed to ``cached_file`` as its first argument.
        device: Target device for the returned tensors. ``None`` leaves them
            where ``torch.load`` placed them.
        dtype: Optional dtype every value is converted to. ``None`` keeps the
            dtypes stored in the file.

    Returns:
        A dict with the same keys as the loaded file, whose values have been
        converted with ``.to(dtype=...)`` (if requested) and ``.to(device=...)``.
    """
    # If not fp32, then we don't want to load directly to the GPU
    mapped_device = "cpu" if dtype not in [torch.float32, None] else device
    resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
    # Keep the loaded dict in a variable instead of returning it right away:
    # the early ``return`` made the conversion below unreachable, so ``dtype``
    # was silently ignored and tensors stayed on ``mapped_device``.
    state_dict = torch.load(resolved_archive_file, map_location=mapped_device)
    # Convert dtype before moving to GPU to save memory
    if dtype is not None:
        state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
    state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
    return state_dict
12:33:51 CST 2023` 5 | - python version: `3.9.16 (main, Jan 11 2023, 16:05:54) [GCC 11.2.0]` 6 | - espnet version: `espnet 202301` 7 | - pytorch version: `pytorch 1.12.1` 8 | - Git hash: `58d7c097f69a5ddc15aa4658e9462e028157f326` 9 | - Commit date: `Thu Jun 29 15:27:10 2023 +0800` 10 | 11 | ## exp_uma_conformer_condition0302_32_712/asr_train_asr_uma_conformer_condition_raw_zh_char_sp 12 | ### WER 13 | 14 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 15 | |---|---|---|---|---|---|---|---|---| 16 | |decode_uma_asr_model_valid.cer.ave_10best/dev|5413|5240|29.6|70.2|0.1|0.6|71.0|68.7| 17 | 18 | ### CER 19 | 20 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 21 | |---|---|---|---|---|---|---|---|---| 22 | |decode_uma_asr_model_valid.cer.ave_10best/dev|5413|56154|83.0|14.4|2.6|3.1|20.0|67.3| 23 | 24 | ### TER 25 | 26 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 27 | |---|---|---|---|---|---|---|---|---| 28 | -------------------------------------------------------------------------------- /egs2/hkust/exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp/RESULTS.md: -------------------------------------------------------------------------------- 1 | 2 | # RESULTS 3 | ## Environments 4 | - date: `Wed Jul 12 11:10:43 CST 2023` 5 | - python version: `3.9.16 (main, Jan 11 2023, 16:05:54) [GCC 11.2.0]` 6 | - espnet version: `espnet 202301` 7 | - pytorch version: `pytorch 1.12.1` 8 | - Git hash: `58d7c097f69a5ddc15aa4658e9462e028157f326` 9 | - Commit date: `Thu Jun 29 15:27:10 2023 +0800` 10 | 11 | ## exp_uma_branchformer_condition0302_32_711/asr_train_asr_uma_branchformer_condition_raw_zh_char_sp 12 | ### WER 13 | 14 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 15 | |---|---|---|---|---|---|---|---|---| 16 | |decode_uma_asr_model_valid.cer.ave_10best/dev|5413|5240|30.8|69.0|0.2|0.4|69.6|67.3| 17 | 18 | ### CER 19 | 20 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 21 | |---|---|---|---|---|---|---|---|---| 22 | 
|decode_uma_asr_model_valid.cer.ave_10best/dev|5413|56154|83.7|13.7|2.6|2.9|19.2|66.0| 23 | 24 | ### TER 25 | 26 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 27 | |---|---|---|---|---|---|---|---|---| 28 | -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp/RESULTS.md: -------------------------------------------------------------------------------- 1 | 2 | # RESULTS 3 | ## Environments 4 | - date: `Thu Oct 31 12:45:06 CST 2024` 5 | - python version: `3.9.16 (main, Jan 11 2023, 16:05:54) [GCC 11.2.0]` 6 | - espnet version: `espnet 202301` 7 | - pytorch version: `pytorch 1.12.1` 8 | - Git hash: `4f786daee971a55f5d2e299f071d5e661de4a3ca` 9 | - Commit date: `Thu Oct 10 20:19:06 2024 +0800` 10 | 11 | ## exp_uma_mamba_0617/asr_train_asr_uma_mamba_raw_zh_char_sp 12 | ### WER 13 | 14 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 15 | |---|---|---|---|---|---|---|---|---| 16 | |decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|7176|59.0|41.0|0.0|0.0|41.0|41.0| 17 | |peak_decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|7176|59.0|41.0|0.0|0.0|41.0|41.0| 18 | 19 | ### CER 20 | 21 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 22 | |---|---|---|---|---|---|---|---|---| 23 | |decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|104765|94.6|5.2|0.2|0.1|5.5|41.0| 24 | |peak_decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|104765|94.6|5.2|0.1|0.2|5.6|41.0| 25 | 26 | ### TER 27 | 28 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 29 | |---|---|---|---|---|---|---|---|---| 30 | -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/RESULTS.md: -------------------------------------------------------------------------------- 1 | 2 | # RESULTS 3 | ## Environments 4 | - date: `Mon Jun 12 
02:23:35 CST 2023` 5 | - python version: `3.9.16 (main, Jan 11 2023, 16:05:54) [GCC 11.2.0]` 6 | - espnet version: `espnet 202301` 7 | - pytorch version: `pytorch 1.12.1` 8 | - Git hash: `69054b7b6203973d158be95a0816e551da4d4bd6` 9 | - Commit date: `Tue Jun 6 11:23:40 2023 +0800` 10 | 11 | ## exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp 12 | ### WER 13 | 14 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 15 | |---|---|---|---|---|---|---|---|---| 16 | |decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|7176|62.7|37.3|0.0|0.0|37.3|37.3| 17 | 18 | ### CER 19 | 20 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 21 | |---|---|---|---|---|---|---|---|---| 22 | |decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|104765|95.3|4.5|0.2|0.1|4.8|37.3| 23 | 24 | ### TER 25 | 26 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 27 | |---|---|---|---|---|---|---|---|---| 28 | ## exp_uma_conformer_12e_69/asr_train_asr_unimodal_conformer_raw_zh_char_sp/decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best 29 | ### WER 30 | 31 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 32 | |---|---|---|---|---|---|---|---|---| 33 | |org/dev|14326|14326|64.3|35.7|0.0|0.0|35.7|35.7| 34 | 35 | ### CER 36 | 37 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 38 | |---|---|---|---|---|---|---|---|---| 39 | |org/dev|14326|205341|95.6|4.3|0.1|0.1|4.5|35.7| 40 | 41 | ### TER 42 | 43 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 44 | |---|---|---|---|---|---|---|---|---| 45 | -------------------------------------------------------------------------------- /egs2/aishell/exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/RESULTS.md: -------------------------------------------------------------------------------- 1 | 2 | # RESULTS 3 | ## Environments 4 | - date: `Tue Aug 1 16:39:06 CST 2023` 5 | - python version: `3.9.16 (main, Jan 11 2023, 16:05:54) [GCC 11.2.0]` 6 | - espnet version: `espnet 
202301` 7 | - pytorch version: `pytorch 1.12.1` 8 | - Git hash: `58d7c097f69a5ddc15aa4658e9462e028157f326` 9 | - Commit date: `Thu Jun 29 15:27:10 2023 +0800` 10 | 11 | ## exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp 12 | ### WER 13 | 14 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 15 | |---|---|---|---|---|---|---|---|---| 16 | |decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|7176|64.1|35.9|0.0|0.0|35.9|35.9| 17 | 18 | ### CER 19 | 20 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 21 | |---|---|---|---|---|---|---|---|---| 22 | |decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best/test|7176|104765|95.4|4.4|0.1|0.1|4.7|35.9| 23 | 24 | ### TER 25 | 26 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 27 | |---|---|---|---|---|---|---|---|---| 28 | ## exp_uma_conformer_condition0302_32_731/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/decode_asr_unimodal_attention_asr_model_valid.cer.ave_10best 29 | ### WER 30 | 31 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 32 | |---|---|---|---|---|---|---|---|---| 33 | |org/dev|14326|14326|66.0|34.0|0.0|0.0|34.0|34.0| 34 | 35 | ### CER 36 | 37 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 38 | |---|---|---|---|---|---|---|---|---| 39 | |org/dev|14326|205341|95.7|4.1|0.1|0.1|4.4|34.0| 40 | 41 | ### TER 42 | 43 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 44 | |---|---|---|---|---|---|---|---|---| 45 | -------------------------------------------------------------------------------- /egs2/aishell/umaconf/train_asr_uma_mamba.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: mamba 4 | encoder_conf: 5 | output_size: 256 6 | num_blocks: 36 7 | dropout_rate: 0.1 8 | input_layer: causal_conv2d 9 | rms_norm: true 10 | fused_add_norm: true 11 | residual_in_fp32: true 12 | normalize_before: true 13 | lookahead_kernel: 17 14 | 15 | # decoder related 16 | 
decoder: unimodal_transformer 17 | decoder_conf: 18 | output_size: 256 19 | attention_heads: 4 20 | linear_units: 2048 21 | num_blocks: 6 22 | dropout_rate: 0.1 23 | positional_dropout_rate: 0.1 24 | 25 | # hybrid CTC/attention 26 | model_conf: 27 | ctc_weight: 1.0 28 | lsm_weight: 0.1 # label smoothing option 29 | length_normalized_loss: false 30 | 31 | 32 | # minibatch related 33 | batch_type: folded 34 | batch_size: 128 35 | 36 | # optimization related 37 | accum_grad: 1 38 | grad_clip: 5.0 39 | max_epoch: 50 40 | val_scheduler_criterion: 41 | - valid 42 | - loss 43 | best_model_criterion: 44 | - - valid 45 | - cer 46 | - min 47 | keep_nbest_models: 10 48 | 49 | optim: adamw 50 | optim_conf: 51 | lr: 0.001 52 | weight_decay: 0.01 53 | scheduler: warmuplr 54 | scheduler_conf: 55 | warmup_steps: 25000 56 | 57 | 58 | num_workers: 4 # num of workers of data loader 59 | use_amp: true # automatic mixed precision 60 | unused_parameters: false # set as true if some params are unused in DDP 61 | 62 | specaug: specaug 63 | specaug_conf: 64 | apply_time_warp: true 65 | time_warp_window: 5 66 | time_warp_mode: bicubic 67 | apply_freq_mask: true 68 | freq_mask_width_range: 69 | - 0 70 | - 27 71 | num_freq_mask: 2 72 | apply_time_mask: true 73 | time_mask_width_ratio_range: 74 | - 0. 
75 | - 0.05 76 | num_time_mask: 10 77 | -------------------------------------------------------------------------------- /egs2/aishell2/umaconf/train_asr_uma_mamba_b.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | encoder: mamba 4 | encoder_conf: 5 | output_size: 512 6 | num_blocks: 36 7 | dropout_rate: 0.1 8 | input_layer: causal_conv2d 9 | rms_norm: true 10 | fused_add_norm: true 11 | residual_in_fp32: true 12 | normalize_before: true 13 | lookahead_kernel: 29 14 | 15 | # decoder related 16 | decoder: unimodal_transformer 17 | decoder_conf: 18 | output_size: 512 19 | attention_heads: 8 20 | linear_units: 2048 21 | num_blocks: 6 22 | dropout_rate: 0.1 23 | positional_dropout_rate: 0.1 24 | 25 | # hybrid CTC/attention 26 | model_conf: 27 | ctc_weight: 1.0 28 | lsm_weight: 0.1 # label smoothing option 29 | length_normalized_loss: false 30 | 31 | 32 | # minibatch related 33 | batch_type: folded 34 | batch_size: 128 35 | num_iters_per_epoch: 7126 36 | 37 | # optimization related 38 | accum_grad: 2 39 | grad_clip: 5.0 40 | max_epoch: 150 41 | log_interval: 200 42 | val_scheduler_criterion: 43 | - valid 44 | - loss 45 | best_model_criterion: 46 | - - valid 47 | - cer 48 | - min 49 | keep_nbest_models: 10 50 | 51 | optim: adamw 52 | optim_conf: 53 | lr: 0.0005 54 | weight_decay: 0.1 55 | scheduler: warmuplr 56 | scheduler_conf: 57 | warmup_steps: 30000 58 | 59 | 60 | num_workers: 4 # num of workers of data loader 61 | use_amp: true # automatic mixed precision 62 | unused_parameters: false # set as true if some params are unused in DDP 63 | 64 | specaug: specaug 65 | specaug_conf: 66 | apply_time_warp: true 67 | time_warp_window: 5 68 | time_warp_mode: bicubic 69 | apply_freq_mask: true 70 | freq_mask_width_range: 71 | - 0 72 | - 27 73 | num_freq_mask: 2 74 | apply_time_mask: true 75 | time_mask_width_ratio_range: 76 | - 0. 
77 | - 0.05 78 | num_time_mask: 10 79 | -------------------------------------------------------------------------------- /egs2/hkust/umaconf/train_asr_uma_conformer.yaml: -------------------------------------------------------------------------------- 1 | encoder: conformer 2 | encoder_conf: 3 | # conformer encoder 4 | output_size: 256 # dimension of attention 5 | attention_heads: 4 6 | linear_units: 2048 # the number of units of position-wise feed forward 7 | num_blocks: 12 # the number of encoder blocks 8 | dropout_rate: 0.1 9 | positional_dropout_rate: 0.1 10 | attention_dropout_rate: 0.0 11 | input_layer: conv2d # encoder architecture type 12 | normalize_before: true 13 | rel_pos_type: latest 14 | pos_enc_layer_type: rel_pos 15 | selfattention_layer_type: rel_selfattn 16 | activation_type: swish 17 | macaron_style: true 18 | use_cnn_module: true 19 | cnn_module_kernel: 31 20 | 21 | # decoder related 22 | decoder: unimodal_transformer 23 | decoder_conf: 24 | attention_heads: 4 25 | linear_units: 2048 26 | num_blocks: 6 27 | dropout_rate: 0.1 28 | positional_dropout_rate: 0.1 29 | 30 | # hybrid CTC/attention 31 | model_conf: 32 | ctc_weight: 1 33 | lsm_weight: 0.1 # label smoothing option 34 | length_normalized_loss: false 35 | 36 | # minibatch related 37 | batch_type: numel 38 | batch_bins: 20000000 39 | 40 | # optimization related 41 | accum_grad: 2 42 | grad_clip: 5 43 | max_epoch: 70 44 | val_scheduler_criterion: 45 | - valid 46 | - loss 47 | best_model_criterion: 48 | - - valid 49 | - cer 50 | - min 51 | keep_nbest_models: 10 52 | 53 | optim: adam 54 | optim_conf: 55 | lr: 0.0005 56 | scheduler: warmuplr 57 | scheduler_conf: 58 | warmup_steps: 30000 59 | 60 | specaug: specaug 61 | specaug_conf: 62 | apply_time_warp: true 63 | time_warp_window: 5 64 | time_warp_mode: bicubic 65 | apply_freq_mask: true 66 | freq_mask_width_range: 67 | - 0 68 | - 30 69 | num_freq_mask: 2 70 | apply_time_mask: true 71 | time_mask_width_range: 72 | - 0 73 | - 40 74 | 
num_time_mask: 2 75 | -------------------------------------------------------------------------------- /egs2/aishell/run_unimodal.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ### 3 | # @Author: FnoY fangying@westlake.edu.cn 4 | # @LastEditTime: 2023-09-15 13:28:12 5 | # @FilePath: /espnet/egs2/aishell/asr1/run_unimodal.sh 6 | ### 7 | # Set bash to 'debug' mode, it will exit on : 8 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 9 | # CUDA_VISIBLE_DEVICES=6 10 | set -e 11 | set -u 12 | set -o pipefail 13 | 14 | train_set=train 15 | valid_set=dev 16 | test_sets="dev test" 17 | 18 | asr_config=umaconf/train_asr_uma_conformer.yaml 19 | inference_config=umaconf/decode_asr_uma.yaml 20 | 21 | lm_config=conf/train_lm_transformer.yaml 22 | use_lm=false 23 | use_wordlm=false 24 | expdir=exp_uma_conformer 25 | inference_asr_model=valid.cer.ave_10best.pth 26 | 27 | # speed perturbation related 28 | # (train_set will be "${train_set}_sp" if speed_perturb_factors is specified) 29 | speed_perturb_factors="0.9 1.0 1.1" 30 | 31 | ./asr_unimodal.sh \ 32 | --nj 64 \ 33 | --inference_nj 64 \ 34 | --ngpu 1 \ 35 | --lang zh \ 36 | --audio_format "flac.ark" \ 37 | --feats_type raw \ 38 | --token_type char \ 39 | --use_lm ${use_lm} \ 40 | --use_word_lm ${use_wordlm} \ 41 | --expdir ${expdir} \ 42 | --inference_asr_model ${inference_asr_model} \ 43 | --lm_config "${lm_config}" \ 44 | --asr_config "${asr_config}" \ 45 | --inference_config "${inference_config}" \ 46 | --train_set "${train_set}" \ 47 | --valid_set "${valid_set}" \ 48 | --test_sets "${test_sets}" \ 49 | --speed_perturb_factors "${speed_perturb_factors}" \ 50 | --asr_speech_fold_length 512 \ 51 | --asr_text_fold_length 150 \ 52 | --lm_fold_length 150 \ 53 | --lm_train_text "data/${train_set}/text" "$@" 54 | -------------------------------------------------------------------------------- 
/egs2/aishell2/umaconf/train_asr_uma_conformer.yaml: -------------------------------------------------------------------------------- 1 | # Trained with A100 (80GB) x 2 GPUs. It takes about 6 days. 2 | encoder: conformer 3 | encoder_conf: 4 | output_size: 512 # dimension of attention 5 | attention_heads: 8 6 | linear_units: 2048 # the number of units of position-wise feed forward 7 | num_blocks: 12 # the number of encoder blocks 8 | dropout_rate: 0.1 9 | positional_dropout_rate: 0.1 10 | attention_dropout_rate: 0.0 11 | input_layer: conv2d # encoder architecture type 12 | normalize_before: true 13 | pos_enc_layer_type: rel_pos 14 | selfattention_layer_type: rel_selfattn 15 | activation_type: swish 16 | macaron_style: true 17 | use_cnn_module: true 18 | cnn_module_kernel: 31 19 | 20 | # decoder related 21 | decoder: unimodal_transformer 22 | decoder_conf: 23 | attention_heads: 4 24 | linear_units: 2048 25 | num_blocks: 6 26 | dropout_rate: 0.1 27 | positional_dropout_rate: 0.1 28 | 29 | # hybrid CTC/attention 30 | model_conf: 31 | ctc_weight: 1 32 | lsm_weight: 0.1 # label smoothing option 33 | length_normalized_loss: false 34 | 35 | # minibatch related 36 | batch_type: numel 37 | batch_bins: 20000000 38 | num_workers: 4 39 | 40 | # optimization related 41 | accum_grad: 4 42 | grad_clip: 5 43 | max_epoch: 50 44 | val_scheduler_criterion: 45 | - valid 46 | - loss 47 | best_model_criterion: 48 | - - valid 49 | - cer 50 | - min 51 | keep_nbest_models: 10 52 | 53 | optim: adam 54 | optim_conf: 55 | lr: 0.0005 56 | scheduler: warmuplr 57 | scheduler_conf: 58 | warmup_steps: 30000 59 | 60 | specaug: specaug 61 | specaug_conf: 62 | apply_time_warp: true 63 | time_warp_window: 5 64 | time_warp_mode: bicubic 65 | apply_freq_mask: true 66 | freq_mask_width_range: 67 | - 0 68 | - 30 69 | num_freq_mask: 2 70 | apply_time_mask: true 71 | time_mask_width_range: 72 | - 0 73 | - 40 74 | num_time_mask: 2 75 | 
-------------------------------------------------------------------------------- /egs2/hkust/run_unimodal.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ### 3 | # @Author: FnoY fangying@westlake.edu.cn 4 | # @LastEditTime: 2023-09-15 13:43:56 5 | # @FilePath: /espnet/egs2/hkust/asr1/run_unimodal.sh 6 | ### 7 | # Set bash to 'debug' mode, it will exit on : 8 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 9 | # CUDA_VISIBLE_DEVICES=7 10 | set -e 11 | set -u 12 | set -o pipefail 13 | 14 | train_set=train_nodup 15 | valid_set=train_dev 16 | test_sets="dev" 17 | 18 | 19 | asr_config=umaconf/train_asr_uma_conformer.yaml 20 | inference_config=umaconf/decode_asr_uma.yaml 21 | 22 | lm_config=conf/tuning/train_lm_transformer.yaml 23 | use_lm=false 24 | expdir=exp_uma_conformer 25 | inference_asr_model=valid.cer.ave_10best.pth 26 | 27 | # speed perturbation related 28 | # (train_set will be "${train_set}_sp" if speed_perturb_factors is specified) 29 | speed_perturb_factors="0.9 1.0 1.1" 30 | 31 | ./asr_unimodal.sh \ 32 | --nj 64 \ 33 | --inference_nj 1 \ 34 | --ngpu 1 \ 35 | --lang zh \ 36 | --audio_format flac \ 37 | --feats_type raw \ 38 | --token_type char \ 39 | --nlsyms_txt data/nlsyms.txt \ 40 | --use_lm ${use_lm} \ 41 | --expdir ${expdir} \ 42 | --inference_asr_model ${inference_asr_model} \ 43 | --lm_config "${lm_config}" \ 44 | --asr_config "${asr_config}" \ 45 | --inference_config "${inference_config}" \ 46 | --train_set "${train_set}" \ 47 | --valid_set "${valid_set}" \ 48 | --test_sets "${test_sets}" \ 49 | --speed_perturb_factors "${speed_perturb_factors}" \ 50 | --lm_train_text "data/${train_set}/text" "$@" -------------------------------------------------------------------------------- /egs2/aishell2/run_unimodal.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ### 3 | # @Author: FnoY 
fangying@westlake.edu.cn 4 | # @LastEditTime: 2023-09-15 13:35:03 5 | # @FilePath: /espnet/egs2/aishell2/asr1/run_unimodal.sh 6 | ### 7 | # Set bash to 'debug' mode, it will exit on : 8 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 9 | # CUDA_VISIBLE_DEVICES=4,5 10 | set -e 11 | set -u 12 | set -o pipefail 13 | 14 | train_set=train_noeng 15 | valid_set=dev_ios 16 | test_sets="dev_ios test_android test_ios test_mic" 17 | 18 | asr_config=umaconf/train_asr_uma_conformer.yaml 19 | inference_config=umaconf/decode_uma.yaml 20 | 21 | lm_config=conf/train_lm_transformer.yaml 22 | use_lm=false 23 | use_wordlm=false 24 | expdir=exp_uma_conformer_12e_718 25 | inference_asr_model=valid.cer.ave_10best.pth 26 | 27 | # speed perturbation related 28 | # (train_set will be "${train_set}_sp" if speed_perturb_factors is specified) 29 | speed_perturb_factors="0.9 1.0 1.1" 30 | 31 | ./asr_unimodal.sh \ 32 | --nj 64 \ 33 | --inference_nj 1 \ 34 | --ngpu 2 \ 35 | --lang zh \ 36 | --audio_format wav \ 37 | --feats_type raw \ 38 | --token_type char \ 39 | --use_lm ${use_lm} \ 40 | --use_word_lm ${use_wordlm} \ 41 | --expdir ${expdir} \ 42 | --inference_asr_model ${inference_asr_model} \ 43 | --lm_config "${lm_config}" \ 44 | --asr_config "${asr_config}" \ 45 | --inference_config "${inference_config}" \ 46 | --train_set "${train_set}" \ 47 | --valid_set "${valid_set}" \ 48 | --test_sets "${test_sets}" \ 49 | --speed_perturb_factors "${speed_perturb_factors}" \ 50 | --asr_speech_fold_length 512 \ 51 | --asr_text_fold_length 150 \ 52 | --lm_fold_length 150 \ 53 | --lm_train_text "data/${train_set}/text" "$@" 54 | -------------------------------------------------------------------------------- /egs2/hkust/umaconf/train_asr_uma_branchformer.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | # encoder: unimodal_branchformer 4 | encoder: e_branchformer 5 | 
encoder_conf: 6 | output_size: 256 7 | attention_heads: 4 8 | attention_layer_type: rel_selfattn 9 | pos_enc_layer_type: rel_pos 10 | rel_pos_type: latest 11 | cgmlp_linear_units: 1024 12 | cgmlp_conv_kernel: 31 13 | use_linear_after_conv: false 14 | gate_activation: identity 15 | num_blocks: 12 16 | dropout_rate: 0.1 17 | positional_dropout_rate: 0.1 18 | attention_dropout_rate: 0.1 19 | input_layer: conv2d 20 | layer_drop_rate: 0.0 21 | linear_units: 1024 22 | positionwise_layer_type: linear 23 | use_ffn: true 24 | macaron_ffn: true 25 | merge_conv_kernel: 31 26 | 27 | # # decoder related 28 | decoder: unimodal_transformer 29 | decoder_conf: 30 | attention_heads: 4 31 | linear_units: 2048 32 | num_blocks: 6 33 | dropout_rate: 0.1 34 | positional_dropout_rate: 0.1 35 | 36 | # hybrid CTC/attention 37 | model_conf: 38 | ctc_weight: 1 39 | lsm_weight: 0.1 # label smoothing option 40 | length_normalized_loss: false 41 | 42 | # minibatch related 43 | batch_type: numel 44 | batch_bins: 40000000 45 | 46 | # optimization related 47 | accum_grad: 1 48 | grad_clip: 5 49 | max_epoch: 70 50 | best_model_criterion: 51 | - - valid 52 | - cer 53 | - min 54 | keep_nbest_models: 10 55 | 56 | optim: adam 57 | optim_conf: 58 | lr: 0.001 59 | weight_decay: 0.000001 60 | scheduler: warmuplr 61 | scheduler_conf: 62 | warmup_steps: 35000 63 | 64 | num_workers: 4 # num of workers of data loader 65 | use_amp: true # automatic mixed precision 66 | unused_parameters: false # set as true if some params are unused in DDP 67 | 68 | specaug: specaug 69 | specaug_conf: 70 | apply_time_warp: true 71 | time_warp_window: 5 72 | time_warp_mode: bicubic 73 | apply_freq_mask: true 74 | freq_mask_width_range: 75 | - 0 76 | - 27 77 | num_freq_mask: 2 78 | apply_time_mask: true 79 | time_mask_width_ratio_range: 80 | - 0. 
81 | - 0.05 82 | num_time_mask: 10 83 | -------------------------------------------------------------------------------- /egs2/hkust/umaconf/train_asr_uma_conformer_condition.yaml: -------------------------------------------------------------------------------- 1 | encoder: conformer 2 | encoder_conf: 3 | # conformer encoder 4 | output_size: 256 # dimension of attention 5 | attention_heads: 4 6 | linear_units: 2048 # the number of units of position-wise feed forward 7 | num_blocks: 12 # the number of encoder blocks 8 | dropout_rate: 0.1 9 | positional_dropout_rate: 0.1 10 | attention_dropout_rate: 0.0 11 | input_layer: conv2d # encoder architecture type 12 | normalize_before: true 13 | rel_pos_type: latest 14 | pos_enc_layer_type: rel_pos 15 | selfattention_layer_type: rel_selfattn 16 | activation_type: swish 17 | macaron_style: true 18 | use_cnn_module: true 19 | cnn_module_kernel: 31 20 | interctc_layer_idx: [6,9,12] 21 | interctc_use_conditioning: true 22 | 23 | # decoder related 24 | decoder: unimodal_transformer 25 | decoder_conf: 26 | attention_heads: 4 27 | linear_units: 2048 28 | num_blocks: 6 29 | dropout_rate: 0.1 30 | positional_dropout_rate: 0.1 31 | interctc_layer_idx: [2,4] 32 | interctc_use_conditioning: true 33 | 34 | # hybrid CTC/attention 35 | model_conf: 36 | ctc_weight: 1 37 | interctc_weight_enc: 0.3 38 | interctc_weight_dec: 0.2 39 | lsm_weight: 0.1 # label smoothing option 40 | length_normalized_loss: false 41 | 42 | # minibatch related 43 | batch_type: numel 44 | batch_bins: 20000000 45 | 46 | # optimization related 47 | accum_grad: 2 48 | grad_clip: 5 49 | max_epoch: 70 50 | val_scheduler_criterion: 51 | - valid 52 | - loss 53 | best_model_criterion: 54 | - - valid 55 | - cer 56 | - min 57 | keep_nbest_models: 10 58 | 59 | optim: adam 60 | optim_conf: 61 | lr: 0.0005 62 | scheduler: warmuplr 63 | scheduler_conf: 64 | warmup_steps: 30000 65 | 66 | specaug: specaug 67 | specaug_conf: 68 | apply_time_warp: true 69 | time_warp_window: 5 70 | 
time_warp_mode: bicubic 71 | apply_freq_mask: true 72 | freq_mask_width_range: 73 | - 0 74 | - 30 75 | num_freq_mask: 2 76 | apply_time_mask: true 77 | time_mask_width_range: 78 | - 0 79 | - 40 80 | num_time_mask: 2 81 | -------------------------------------------------------------------------------- /egs2/aishell2/umaconf/train_asr_uma_conformer_condition.yaml: -------------------------------------------------------------------------------- 1 | # Trained with A100 (80GB) x 2 GPUs. It takes about 6 days. 2 | encoder: conformer 3 | encoder_conf: 4 | output_size: 512 # dimension of attention 5 | attention_heads: 8 6 | linear_units: 2048 # the number of units of position-wise feed forward 7 | num_blocks: 12 # the number of encoder blocks 8 | dropout_rate: 0.1 9 | positional_dropout_rate: 0.1 10 | attention_dropout_rate: 0.0 11 | input_layer: conv2d # encoder architecture type 12 | normalize_before: true 13 | pos_enc_layer_type: rel_pos 14 | selfattention_layer_type: rel_selfattn 15 | activation_type: swish 16 | macaron_style: true 17 | use_cnn_module: true 18 | cnn_module_kernel: 31 19 | interctc_layer_idx: [6,9,12] 20 | interctc_use_conditioning: true 21 | 22 | # decoder related 23 | decoder: unimodal_transformer 24 | decoder_conf: 25 | attention_heads: 4 26 | linear_units: 2048 27 | num_blocks: 6 28 | dropout_rate: 0.1 29 | positional_dropout_rate: 0.1 30 | interctc_layer_idx: [2,4] 31 | interctc_use_conditioning: true 32 | 33 | # hybrid CTC/attention 34 | model_conf: 35 | ctc_weight: 1 36 | interctc_weight_enc: 0.3 37 | interctc_weight_dec: 0.2 38 | lsm_weight: 0.1 # label smoothing option 39 | length_normalized_loss: false 40 | 41 | # minibatch related 42 | batch_type: numel 43 | batch_bins: 20000000 44 | 45 | # optimization related 46 | accum_grad: 4 47 | grad_clip: 5 48 | max_epoch: 50 49 | val_scheduler_criterion: 50 | - valid 51 | - loss 52 | best_model_criterion: 53 | - - valid 54 | - cer 55 | - min 56 | keep_nbest_models: 10 57 | 58 | optim: adam 59 | 
optim_conf: 60 | lr: 0.0005 61 | scheduler: warmuplr 62 | scheduler_conf: 63 | warmup_steps: 30000 64 | 65 | specaug: specaug 66 | specaug_conf: 67 | apply_time_warp: true 68 | time_warp_window: 5 69 | time_warp_mode: bicubic 70 | apply_freq_mask: true 71 | freq_mask_width_range: 72 | - 0 73 | - 30 74 | num_freq_mask: 2 75 | apply_time_mask: true 76 | time_mask_width_range: 77 | - 0 78 | - 40 79 | num_time_mask: 2 80 | -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/RESULTS.md: -------------------------------------------------------------------------------- 1 | 2 | # RESULTS 3 | ## Environments 4 | - date: `Wed Aug 23 15:31:26 CST 2023` 5 | - python version: `3.9.16 (main, Jan 11 2023, 16:05:54) [GCC 11.2.0]` 6 | - espnet version: `espnet 202301` 7 | - pytorch version: `pytorch 1.12.1` 8 | - Git hash: `58d7c097f69a5ddc15aa4658e9462e028157f326` 9 | - Commit date: `Thu Jun 29 15:27:10 2023 +0800` 10 | 11 | ## exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp 12 | ### WER 13 | 14 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 15 | |---|---|---|---|---|---|---|---|---| 16 | |50epoch_decode_uma_asr_model_valid.cer.ave_10best/test_android|5000|5002|63.4|36.5|0.0|0.0|36.6|36.6| 17 | |50epoch_decode_uma_asr_model_valid.cer.ave_10best/test_ios|5000|5002|66.1|33.9|0.0|0.0|33.9|33.9| 18 | |50epoch_decode_uma_asr_model_valid.cer.ave_10best/test_mic|5000|5002|63.7|36.2|0.0|0.0|36.3|36.3| 19 | |decode_uma_asr_model_valid.cer.ave_10best/test_mic|50|50|48.0|52.0|0.0|0.0|52.0|52.0| 20 | 21 | ### CER 22 | 23 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 24 | |---|---|---|---|---|---|---|---|---| 25 | |50epoch_decode_uma_asr_model_valid.cer.ave_10best/test_android|5000|49534|94.1|5.6|0.3|0.2|6.0|36.6| 26 | |50epoch_decode_uma_asr_model_valid.cer.ave_10best/test_ios|5000|49534|94.8|5.0|0.2|0.2|5.3|33.9| 27 | 
|50epoch_decode_uma_asr_model_valid.cer.ave_10best/test_mic|5000|49534|94.2|5.6|0.2|0.2|5.9|36.3| 28 | |decode_uma_asr_model_valid.cer.ave_10best/test_mic|50|458|89.3|10.5|0.2|0.2|10.9|52.0| 29 | 30 | ### TER 31 | 32 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 33 | |---|---|---|---|---|---|---|---|---| 34 | ## exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_condition_raw_zh_char_sp/50epoch_decode_uma_asr_model_valid.cer.ave_10best 35 | ### WER 36 | 37 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 38 | |---|---|---|---|---|---|---|---|---| 39 | |org/dev_ios|2500|2500|67.9|32.1|0.0|0.0|32.1|32.1| 40 | 41 | ### CER 42 | 43 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 44 | |---|---|---|---|---|---|---|---|---| 45 | |org/dev_ios|2500|24802|95.2|4.6|0.2|0.1|4.9|32.1| 46 | 47 | ### TER 48 | 49 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 50 | |---|---|---|---|---|---|---|---|---| 51 | -------------------------------------------------------------------------------- /egs2/aishell/umaconf/train_asr_uma_conformer.yaml: -------------------------------------------------------------------------------- 1 | encoder: conformer 2 | encoder_conf: 3 | # conformer encoder 4 | output_size: 256 # dimension of attention 5 | attention_heads: 4 6 | linear_units: 2048 # the number of units of position-wise feed forward 7 | num_blocks: 12 # the number of encoder blocks 8 | dropout_rate: 0.1 9 | positional_dropout_rate: 0.1 10 | attention_dropout_rate: 0.1 11 | input_layer: conv2d # encoder architecture type 12 | normalize_before: true 13 | rel_pos_type: latest 14 | pos_enc_layer_type: rel_pos 15 | selfattention_layer_type: rel_selfattn 16 | activation_type: swish 17 | macaron_style: true 18 | use_cnn_module: true 19 | cnn_module_kernel: 31 20 | 21 | # decoder related 22 | decoder: unimodal_transformer 23 | decoder_conf: 24 | attention_heads: 4 25 | linear_units: 2048 26 | num_blocks: 6 27 | dropout_rate: 0.1 28 | positional_dropout_rate: 0.1 29 | 30 | # hybrid CTC/attention 31 | 
model_conf: 32 | ctc_weight: 1 33 | lsm_weight: 0.1 # label smoothing option 34 | length_normalized_loss: false 35 | 36 | # minibatch related 37 | batch_type: numel 38 | batch_bins: 25000000 39 | 40 | # optimization related 41 | accum_grad: 1 42 | grad_clip: 5 43 | # patience: 3 44 | max_epoch: 60 45 | val_scheduler_criterion: 46 | - valid 47 | - loss 48 | best_model_criterion: 49 | - - valid 50 | - cer 51 | - min 52 | keep_nbest_models: 10 53 | 54 | # NoamLR is deprecated. Use WarmupLR. 55 | # The following is equivalent setting for NoamLR: 56 | optim: adam 57 | optim_conf: 58 | lr: 0.001 59 | weight_decay: 0.000001 60 | scheduler: warmuplr # pytorch v1.1.0+ required 61 | scheduler_conf: 62 | warmup_steps: 35000 63 | 64 | num_workers: 4 # num of workers of data loader 65 | use_amp: true # automatic mixed precision 66 | unused_parameters: false # set as true if some params are unused in DDP 67 | 68 | specaug: specaug 69 | specaug_conf: 70 | apply_time_warp: true 71 | time_warp_window: 5 72 | time_warp_mode: bicubic 73 | apply_freq_mask: true 74 | freq_mask_width_range: 75 | - 0 76 | - 27 77 | num_freq_mask: 2 78 | apply_time_mask: true 79 | time_mask_width_ratio_range: 80 | - 0. 
81 | - 0.05 82 | num_time_mask: 10 -------------------------------------------------------------------------------- /egs2/aishell2/exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/RESULTS.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | # RESULTS 9 | ## Environments 10 | - date: `Wed Aug 23 15:04:44 CST 2023` 11 | - python version: `3.9.16 (main, Jan 11 2023, 16:05:54) [GCC 11.2.0]` 12 | - espnet version: `espnet 202301` 13 | - pytorch version: `pytorch 1.12.1` 14 | - Git hash: `58d7c097f69a5ddc15aa4658e9462e028157f326` 15 | - Commit date: `Thu Jun 29 15:27:10 2023 +0800` 16 | 17 | ## exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp 18 | ### WER 19 | 20 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 21 | |---|---|---|---|---|---|---|---|---| 22 | |decode_uma_asr_model_valid.cer.ave_10best/test_android|5000|5002|62.7|37.3|0.0|0.0|37.3|37.3| 23 | |decode_uma_asr_model_valid.cer.ave_10best/test_ios|5000|5002|65.6|34.3|0.0|0.0|34.4|34.3| 24 | |decode_uma_asr_model_valid.cer.ave_10best/test_mic|5000|5002|62.6|37.4|0.0|0.0|37.4|37.4| 25 | 26 | ### CER 27 | 28 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 29 | |---|---|---|---|---|---|---|---|---| 30 | |decode_uma_asr_model_valid.cer.ave_10best/test_android|5000|49534|94.1|5.7|0.2|0.1|6.0|37.3| 31 | |decode_uma_asr_model_valid.cer.ave_10best/test_ios|5000|49534|94.8|5.0|0.2|0.1|5.3|34.3| 32 | |decode_uma_asr_model_valid.cer.ave_10best/test_mic|5000|49534|94.1|5.7|0.2|0.2|6.0|37.4| 33 | 34 | ### TER 35 | 36 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 37 | |---|---|---|---|---|---|---|---|---| 38 | ## exp_uma_conformer_12e_718/asr_train_asr_uma_conformer_raw_zh_char_sp/decode_uma_asr_model_valid.cer.ave_10best 39 | ### WER 40 | 41 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 42 | |---|---|---|---|---|---|---|---|---| 43 | |org/dev_ios|2500|2500|67.5|32.5|0.0|0.0|32.5|32.5| 44 | 45 | ### CER 46 | 47 | 
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 48 | |---|---|---|---|---|---|---|---|---| 49 | |org/dev_ios|2500|24802|95.2|4.6|0.2|0.1|4.9|32.5| 50 | 51 | ### TER 52 | 53 | |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| 54 | |---|---|---|---|---|---|---|---|---| 55 | -------------------------------------------------------------------------------- /egs2/hkust/umaconf/train_asr_uma_branchformer_condition.yaml: -------------------------------------------------------------------------------- 1 | # network architecture 2 | # encoder related 3 | # encoder: unimodal_branchformer 4 | encoder: e_branchformer 5 | encoder_conf: 6 | output_size: 256 7 | attention_heads: 4 8 | attention_layer_type: rel_selfattn 9 | pos_enc_layer_type: rel_pos 10 | rel_pos_type: latest 11 | cgmlp_linear_units: 1024 12 | cgmlp_conv_kernel: 31 13 | use_linear_after_conv: false 14 | gate_activation: identity 15 | num_blocks: 12 16 | dropout_rate: 0.1 17 | positional_dropout_rate: 0.1 18 | attention_dropout_rate: 0.1 19 | input_layer: conv2d 20 | layer_drop_rate: 0.0 21 | linear_units: 1024 22 | positionwise_layer_type: linear 23 | use_ffn: true 24 | macaron_ffn: true 25 | merge_conv_kernel: 31 26 | interctc_layer_idx: [6,9,12] 27 | interctc_use_conditioning: true 28 | 29 | # # decoder related 30 | decoder: unimodal_transformer 31 | decoder_conf: 32 | attention_heads: 4 33 | linear_units: 2048 34 | num_blocks: 6 35 | dropout_rate: 0.1 36 | positional_dropout_rate: 0.1 37 | interctc_layer_idx: [2,4] 38 | interctc_use_conditioning: true 39 | 40 | # hybrid CTC/attention 41 | model_conf: 42 | ctc_weight: 1 43 | interctc_weight_enc: 0.3 44 | interctc_weight_dec: 0.2 45 | lsm_weight: 0.1 # label smoothing option 46 | length_normalized_loss: false 47 | 48 | # minibatch related 49 | batch_type: numel 50 | batch_bins: 40000000 51 | 52 | # optimization related 53 | accum_grad: 1 54 | grad_clip: 5 55 | max_epoch: 70 56 | best_model_criterion: 57 | - - valid 58 | - cer 59 | - min 60 | keep_nbest_models: 10 61 | 62 
| optim: adam 63 | optim_conf: 64 | lr: 0.001 65 | weight_decay: 0.000001 66 | scheduler: warmuplr 67 | scheduler_conf: 68 | warmup_steps: 35000 69 | 70 | num_workers: 4 # num of workers of data loader 71 | use_amp: true # automatic mixed precision 72 | unused_parameters: false # set as true if some params are unused in DDP 73 | 74 | specaug: specaug 75 | specaug_conf: 76 | apply_time_warp: true 77 | time_warp_window: 5 78 | time_warp_mode: bicubic 79 | apply_freq_mask: true 80 | freq_mask_width_range: 81 | - 0 82 | - 27 83 | num_freq_mask: 2 84 | apply_time_mask: true 85 | time_mask_width_ratio_range: 86 | - 0. 87 | - 0.05 88 | num_time_mask: 10 89 | -------------------------------------------------------------------------------- /egs2/aishell/umaconf/train_asr_uma_conformer_condition.yaml: -------------------------------------------------------------------------------- 1 | encoder: conformer 2 | encoder_conf: 3 | # conformer encoder 4 | output_size: 256 # dimension of attention 5 | attention_heads: 4 6 | linear_units: 2048 # the number of units of position-wise feed forward 7 | num_blocks: 12 # the number of encoder blocks 8 | dropout_rate: 0.1 9 | positional_dropout_rate: 0.1 10 | attention_dropout_rate: 0.0 11 | input_layer: conv2d # encoder architecture type 12 | normalize_before: true 13 | rel_pos_type: latest 14 | pos_enc_layer_type: rel_pos 15 | selfattention_layer_type: rel_selfattn 16 | activation_type: swish 17 | macaron_style: true 18 | use_cnn_module: true 19 | cnn_module_kernel: 31 20 | interctc_layer_idx: [6,9,12] 21 | interctc_use_conditioning: true 22 | 23 | # decoder related 24 | decoder: unimodal_transformer 25 | decoder_conf: 26 | attention_heads: 4 27 | linear_units: 2048 28 | num_blocks: 6 29 | dropout_rate: 0.1 30 | positional_dropout_rate: 0.1 31 | interctc_layer_idx: [2,4] 32 | interctc_use_conditioning: true 33 | 34 | # hybrid CTC/attention 35 | model_conf: 36 | ctc_weight: 1 37 | interctc_weight_enc: 0.3 38 | interctc_weight_dec: 0.2 39 | 
class UMA(torch.nn.Module):
    """Unimodal aggregation (UMA) module.

    A Linear-Sigmoid scorer assigns every frame a scalar weight. Frames whose
    weight is a strict local minimum ("valleys") act as segment boundaries,
    and each segment is collapsed into a single vector by a weighted average
    of its frames, using the same weights.
    """

    def __init__(
        self,
        input_size: int = 512,
        output_size: int = 256,
    ):
        """Build the frame scorer.

        Args:
            input_size: Accepted for interface compatibility only; it is
                overwritten with ``output_size`` before the scorer is built.
            output_size: Feature width of the frames that ``forward``
                receives; also the value reported by ``output_size()``.
        """
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size
        # The caller-supplied input_size is overridden here, so the scorer is
        # always built for features of width output_size.
        input_size = output_size

        self.linear_sigmoid = torch.nn.Sequential(
            torch.nn.Linear(input_size, 1),
            torch.nn.Sigmoid(),
        )

        # When True (and the module is not in training mode), local maxima of
        # the weights are used as segment boundaries in addition to valleys.
        self.EarlyTermination = False

    def output_size(self) -> int:
        """Return the feature width given at construction time."""
        return self._output_size

    def forward(
        self,
        xs_pad: torch.Tensor,
        olens: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        """Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            olens (torch.Tensor): Input length (#batch). It is not read; the
                whole time axis of ``xs_pad``, padding included, is segmented.
        Returns:
            torch.Tensor: Aggregated tensor (#batch, I, input_size), where I is
                the largest number of segments found in the batch. Rows past a
                sample's own segment count are all zeros.
            torch.Tensor: Number of segments of each sample (#batch).
            None: Placeholder, not used now.
        """
        batch, length, _ = xs_pad.size()
        device = xs_pad.device

        # Use Linear-Sigmoid to generate unimodal aggregation weights
        # uma_weights: (#batch, L, 1)
        uma_weights = self.linear_sigmoid(xs_pad)

        # Unimodal detection: compare each weight with its two neighbours.
        # The missing neighbours at both ends are padded with 0.
        neighbours = uma_weights.detach()
        scalar_before = torch.nn.functional.pad(
            neighbours[:, :-1, :], (0, 0, 1, 0)
        )  # (#batch, L, 1)
        scalar_after = torch.nn.functional.pad(
            neighbours[:, 1:, :], (0, 0, 0, 1)
        )  # (#batch, L, 1)

        # Strict local minima are the segment boundaries.
        mask = uma_weights.lt(scalar_before) & uma_weights.lt(scalar_after)
        if not self.training and self.EarlyTermination:
            # At inference, local maxima may act as boundaries as well.
            peaks = uma_weights.gt(scalar_before) & uma_weights.gt(scalar_after)
            mask = mask | peaks
        mask = mask.reshape(batch, -1)  # bool tensor (#batch, L)

        # The final frame must never open a segment: below it only serves as
        # the end sentinel of the last segment. A valley can never sit there,
        # but a peak can; without clearing the flag, starts would outnumber
        # ends and the concatenation further down would fail.
        mask[:, -1] = False
        mask[:, 0] = True  # every sample starts a segment at frame 0
        # starts: (K, 2) rows of [batch index, frame index], in row-major
        # order; K is the total number of segments in the batch.
        starts = mask.nonzero()
        batch_index = starts[:, 0]
        valley_index_start = starts[:, 1]

        # Segment ends: each boundary after frame 0 closes the previous
        # segment two frames later, and the last frame closes the final one.
        # Ends are capped at the sequence length.
        mask[:, 0] = False
        mask[:, -1] = True
        valley_index_end = (mask.nonzero()[:, 1] + 2).clamp(max=length)

        # Every batch index occurs at least once (frame 0), so counts holds
        # one entry per sample: its number of segments.
        _, counts = torch.unique(batch_index, return_counts=True)
        max_counts = int(counts.max().item())

        # Scatter the flat (start, end) pairs into a padded
        # (#batch, max_counts, 2) table; unused slots stay (0, 0).
        slot_used = torch.arange(max_counts, device=device) < counts.unsqueeze(1)
        valleys = valley_index_start.new_zeros(batch, max_counts, 2)
        valleys[slot_used] = torch.stack(
            (valley_index_start, valley_index_end), dim=1
        )

        # Aggregation
        alpha_h = torch.mul(uma_weights, xs_pad)

        # output_mask[b, k, t] is 1 when frame t belongs to segment k of
        # sample b (start <= t < end), else 0. Unused slots have start == end
        # and therefore select nothing. It is built in the feature dtype so
        # torch.bmm also accepts non-float32 inputs.
        frame_pos = torch.arange(length, device=device)
        seg_start = valleys[..., 0].unsqueeze(-1)
        seg_end = valleys[..., 1].unsqueeze(-1)
        output_mask = ((frame_pos >= seg_start) & (frame_pos < seg_end)).to(
            alpha_h.dtype
        )

        # Weighted average per segment; the clamp keeps empty (padding)
        # segments from dividing by zero, leaving them as zeros.
        weight_sum = torch.bmm(output_mask, uma_weights).clamp(min=1e-6)
        xs_pad = torch.bmm(output_mask, alpha_h) / weight_sum

        olens = counts

        return xs_pad, olens, None
for streaming ASR: ["Mamba for Streaming ASR Combined with Unimodal Aggregation" (submitted to ICASSP 2025)](https://arxiv.org/abs/2410.00070) 13 | 14 |
20 | 21 | [Poster :star_struck:](https://sigport.org/sites/default/files/docs/fangying_UMA_poster4.0.pdf) **|** [Issues :sweat_smile:](https://github.com/Audio-WestlakeU/UMA-ASR/issues) 22 | **|** [Lab :hear_no_evil:](https://github.com/Audio-WestlakeU) **|** [Contact :kissing_heart:](mailto:fangying@westlake.edu.cn) 23 | 24 | ## Introduction 25 | 26 | ### For Non-autoregressive Offline ASR 27 | A unimodal aggregation (UMA) is proposed to segment and integrate the feature frames that belong to the same text token, and thus to learn better feature representations for text tokens. The frame-wise features and weights are both derived from an encoder. Then, the feature frames with unimodal weights are integrated and further processed by a decoder. Connectionist temporal classification (CTC) loss is applied for training. Moreover, integrating self-conditioned CTC into the proposed framework noticeably improves performance further. 28 | 29 |