├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── examples ├── aishell │ ├── README.txt │ └── s5 │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln.yaml │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln_2mtl.yaml │ │ │ ├── mma │ │ │ │ ├── lc_transformer_mma_hie_subsample8_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ │ ├── lc_transformer_mma_hie_subsample8_ma4H_ca4H_w16_from4L_96_64_32.yaml │ │ │ │ └── transformer_mma_hie_subsample8_ma4H_ca4H_w16_from4L.yaml │ │ │ ├── mocha │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ └── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ ├── transformer.yaml │ │ │ └── transformer_hie_subsample8.yaml │ │ ├── data │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ ├── spec_augment_speed_perturb_pretrain.yaml │ │ │ └── speed_perturb_pretrain.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ └── rnnlm.yaml │ │ ├── local │ │ ├── aishell_data_prep.sh │ │ ├── download_and_untar.sh │ │ └── plot_attention.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── aishell2 │ ├── README.md │ └── s5 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ └── conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml │ │ └── fbank.conf │ │ ├── local │ │ └── prepare_data.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── ami │ ├── README.txt │ └── s5b │ │ ├── README.txt │ │ ├── cmd.sh │ │ ├── conf │ │ ├── ami_beamformit.cfg │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── blstm_mocha.yaml │ │ │ ├── blstm_rnnt.yaml │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ ├── lcblstm_rnnt_40_40.yaml │ │ │ └── transformer.yaml │ │ ├── data │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ ├── spec_augment_speed_perturb_pretrain_F27_T100.yaml │ │ │ ├── spec_augment_speed_perturb_pretrain_F27_T50.yaml │ │ │ └── speed_perturb.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ └── rnnlm.yaml │ │ ├── local │ │ ├── ami_beamform.sh │ │ ├── ami_download.sh │ │ ├── ami_format_data.sh │ │ ├── ami_ihm_data_prep.sh │ │ ├── ami_ihm_scoring_data_prep.sh │ │ ├── ami_mdm_data_prep.sh │ │ ├── ami_mdm_scoring_data_prep.sh │ │ ├── ami_prepare_dict.sh │ │ ├── ami_sdm_data_prep.sh │ │ ├── ami_sdm_scoring_data_prep.sh │ │ ├── ami_split_segments.pl │ │ ├── ami_text_prep.sh │ │ ├── ami_xml2text.sh │ │ ├── beamformit.sh │ │ ├── convert2stm.pl │ │ ├── english.glm │ │ ├── split_REAMDE.txt │ │ ├── split_dev.orig │ │ ├── split_eval.orig │ │ └── split_train.orig │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── ci_test │ ├── cmd.sh │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── blstm_las_2mtl.yaml │ │ │ ├── blstm_las_2mtl_per_batch.yaml │ │ │ ├── blstm_transformer.yaml │ │ │ ├── conformer.yaml │ │ │ ├── lc_transformer_mma_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ ├── lcblstm_transducer.yaml │ │ │ ├── lstm_ctc.yaml │ │ │ ├── tds_las.yaml │ │ │ ├── transformer.yaml │ │ │ ├── transformer_2mtl.yaml │ │ │ ├── transformer_ctc.yaml │ │ │ └── transformer_las.yaml │ │ ├── data │ │ │ ├── adaptive_spec_augment.yaml │ │ │ └── spec_augment.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ ├── transformer_xl.yaml │ │ │ └── transformerlm.yaml │ ├── ctc_forced_align.sh │ ├── data │ │ └── train │ │ │ ├── spk2utt │ │ │ ├── text │ │ │ ├── text.phone │ │ │ ├── utt2spk │ │ │ └── wav.scp │ ├── local │ │ 
└── download_sample.sh │ ├── path.sh │ ├── plot_attention.sh │ ├── plot_ctc.sh │ ├── run.sh │ ├── run_2mtl.sh │ ├── score.sh │ ├── steps │ └── utils ├── csj │ ├── README.txt │ └── s5 │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── las │ │ │ │ ├── blstm_las.yaml │ │ │ │ ├── blstm_las_2mtl.yaml │ │ │ │ ├── lcblstm_las_chunk4040.yaml │ │ │ │ └── lstm_las.yaml │ │ │ ├── mocha │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_decot16.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_minlt.yaml │ │ │ │ ├── lstm_mocha.yaml │ │ │ │ └── lstm_mocha_ctc_sync.yaml │ │ │ └── transformer │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln.yaml │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml │ │ │ │ ├── transformer.yaml │ │ │ │ └── transformer_hie_subsample8.yaml │ │ ├── data │ │ │ ├── pretrain.yaml │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_pretrain_F13_T50.yaml │ │ │ ├── spec_augment_pretrain_F27_T100.yaml │ │ │ ├── spec_augment_pretrain_F27_T50.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ └── speed_perturb.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ ├── transformer_xl.yaml │ │ │ └── transformerlm.yaml │ │ ├── local │ │ ├── csj_data_prep.sh │ │ ├── csj_eval_data_prep.sh │ │ ├── csj_make_trans │ │ │ ├── csj2kaldi4m.pl │ │ │ ├── csj_autorun.sh │ │ │ ├── csjconnect.pl │ │ │ ├── kana2phone │ │ │ └── vocab2dic.pl │ │ ├── csj_prepare_dict.sh │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── plot_lm_cache.sh │ │ ├── remove_disfluency.py │ │ ├── remove_pos.py │ │ └── score_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── run_streaming.sh │ │ ├── score.sh │ │ ├── score_streaming.sh │ │ ├── steps │ │ └── utils ├── laborotv │ └── s5 │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln.yaml │ │ │ └── conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ └── rnnlm.yaml │ │ ├── local │ │ ├── laborotv_data_prep.sh │ │ ├── prepare_dict.sh │ │ ├── remove_pos.py │ │ └── tedx-jp-10k_data_prep.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_plus_csj.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── language_model │ ├── ptb │ │ ├── RESULTS │ │ ├── cmd.sh │ │ ├── conf │ │ │ └── rnnlm.yaml │ │ ├── local │ │ │ └── plot_lm_cache.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score_lm.sh │ │ ├── steps │ │ └── utils │ └── wikitext2 │ │ ├── RESULTS │ │ ├── cmd.sh │ │ ├── conf │ │ ├── gcnn.yaml │ │ ├── rnnlm.yaml │ │ ├── transformer_xl.yaml │ │ └── transformerlm.yaml │ │ ├── local │ │ └── plot_lm_cache.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score_lm.sh │ │ ├── steps │ │ └── utils ├── librispeech │ ├── README.txt │ └── s5 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── mma │ │ │ │ ├── offline │ │ │ │ │ ├── transformer_mma_subsample8_ma4H_ca4H_w16_from4L.yaml │ │ │ │ │ ├── transformer_mma_subsample8_ma4H_ca4H_w16_from4L_512dmodel_8H.yaml │ │ │ │ │ └── transformer_mma_subsample8_ma4H_ca4H_w16_from4L_768dmodel_3072dff_8H.yaml │ │ │ │ └── streaming │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_512dmodel_8H_64_128_64.yaml │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_512dmodel_8H_96_64_32.yaml │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ │ │ ├── 
lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_768dmodel_3072dff_8H_64_128_64.yaml │ │ │ │ │ └── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_96_64_32.yaml │ │ │ ├── mocha │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ │ ├── lstm_mocha.yaml │ │ │ │ ├── lstm_mocha_ctc_sync.yaml │ │ │ │ ├── lstm_mocha_decot12.yaml │ │ │ │ ├── lstm_mocha_decot16.yaml │ │ │ │ ├── lstm_mocha_minlt.yaml │ │ │ │ └── uni_conformer_kernel7_clamp10_hie_subsample8_mocha_ln_stableemit0.2_qua0.2.yaml │ │ │ ├── transducer │ │ │ │ ├── blstm_transducer_bpe1k.yaml │ │ │ │ ├── lcblstm_rnnt_chunk4040_bpe1k.yaml │ │ │ │ └── lstm_rnnt_bpe1k.yaml │ │ │ └── transformer │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_long_ln.yaml │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_long_ln_large.yaml │ │ │ │ ├── transformer.yaml │ │ │ │ ├── transformer_512dmodel_8H.yaml │ │ │ │ ├── transformer_768dmodel_3072dff_8H.yaml │ │ │ │ ├── transformer_subsample8.yaml │ │ │ │ ├── transformer_subsample8_512dmodel_8H.yaml │ │ │ │ └── transformer_subsample8_768dmodel_3072dff_8H.yaml │ │ ├── data │ │ │ ├── pretrain.yaml │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_pretrain_F13_T50.yaml │ │ │ ├── spec_augment_pretrain_F27_T100.yaml │ │ │ ├── spec_augment_pretrain_F27_T50.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ └── spec_augment_speed_perturb_pretrain_F27_T100.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ └── rnnlm_6L.yaml │ │ ├── ctc_forced_align.sh │ │ ├── local │ │ ├── data_prep.sh │ │ ├── download_and_untar.sh │ │ ├── download_lm.sh │ │ ├── format_data.sh │ │ ├── format_lms.sh │ │ ├── g2p.sh │ │ ├── g2p │ │ │ └── train_g2p.sh │ │ ├── lm │ │ │ ├── est-gcc4.7.patch │ │ │ ├── install_festival.sh │ │ │ ├── normalize_text.sh │ │ │ ├── python │ │ │ │ ├── pre_filter.py │ │ │ │ ├── text_post_process.py │ │ │ │ └── text_pre_process.py │ │ │ └── train_lm.sh │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── prepare_dict.sh │ │ ├── prepare_example_data.sh │ │ └── score_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── swbd │ ├── README.txt │ └── s5c │ │ ├── RESULTS │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── blstm_las_2mtl.yaml │ │ │ ├── blstm_las_3mtl.yaml │ │ │ ├── blstm_las_fisher_swbd.yaml │ │ │ ├── blstm_mocha.yaml │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ ├── transformer.yaml │ │ │ └── transformer_fisher_swbd.yaml │ │ ├── data │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ ├── speed_perturb.yaml │ │ │ └── speed_perturb_pretrain.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ ├── transformer_xl.yaml │ │ │ └── transformerlm.yaml │ │ ├── local │ │ ├── MSU_single_letter.txt │ │ ├── dict.patch │ │ ├── eval2000_data_prep.sh │ │ ├── extend_segments.pl │ │ ├── fisher_data_prep.sh │ │ ├── fisher_map_words.pl │ │ ├── fisher_swbd_prepare_dict.sh │ │ ├── format_acronyms_dict.py │ │ ├── format_acronyms_dict_fisher_swbd.py │ │ ├── map_acronyms_ctm.py │ │ ├── map_acronyms_transcripts.py │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── plot_lm_cache.sh │ │ ├── remove_disfluency.py │ │ ├── rt03_data_prep.sh │ │ ├── score_lm.sh │ │ ├── score_sclite.sh │ │ ├── swbd1_data_download.sh │ │ ├── swbd1_data_prep.sh │ │ ├── swbd1_fix_speakerid.pl │ │ ├── swbd1_map_words.pl │ │ └── swbd1_prepare_dict.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── 
run_2mtl.sh │ │ ├── run_3mtl.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── tedlium │ ├── s5_r2 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ │ ├── asr │ │ │ │ ├── blstm_triggered_attention.yaml │ │ │ │ ├── las │ │ │ │ │ ├── blstm_las.yaml │ │ │ │ │ ├── blstm_las_2mtl.yaml │ │ │ │ │ ├── blstm_las_ctc_sync.yaml │ │ │ │ │ ├── lcblstm_las_chunk4020.yaml │ │ │ │ │ ├── lcblstm_las_chunk4040.yaml │ │ │ │ │ └── lstm_las.yaml │ │ │ │ ├── lcblstm_las_chunk4020.yaml │ │ │ │ ├── lcblstm_las_chunk4040.yaml │ │ │ │ ├── lstm_las.yaml │ │ │ │ ├── mma │ │ │ │ │ ├── offline │ │ │ │ │ │ └── transformer_mma_subsample8_ma4H_ca4H_w16_from4L.yaml │ │ │ │ │ └── streaming │ │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ │ │ │ └── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_96_64_32.yaml │ │ │ │ ├── mocha │ │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4020.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4020_ctc_sync.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4040_mbr.yaml │ │ │ │ │ ├── lstm_mocha.yaml │ │ │ │ │ ├── lstm_mocha_ctc_sync.yaml │ │ │ │ │ ├── lstm_mocha_decot16.yaml │ │ │ │ │ ├── lstm_mocha_minlt.yaml │ │ │ │ │ ├── lstm_mocha_rsp_enc.yaml │ │ │ │ │ ├── lstm_mocha_stableemit0.1.yaml │ │ │ │ │ ├── uni_conformer_kernel7_clamp10_hie_subsample8_mocha_long_ln.yaml │ │ │ │ │ └── uni_conformer_kernel7_clamp10_hie_subsample8_mocha_long_ln_stableemit0.1.yaml │ │ │ │ ├── transducer │ │ │ │ │ ├── blstm_rnnt_bpe1k.yaml │ │ │ │ │ ├── lcblstm_rnnt_40_20_bpe1k.yaml │ │ │ │ │ ├── lcblstm_rnnt_40_40_bpe1k.yaml │ │ │ │ │ ├── lstm_rnnt_bpe1k.yaml │ │ │ │ │ └── uni_conformer_kernel7_clamp10_hie_subsample8_rnnt_long_ln_bpe1k.yaml │ │ │ │ └── transformer │ │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_long_ln.yaml │ │ │ │ │ ├── transformer_hie_subsample8.yaml │ │ │ │ │ └── transformer_hie_subsample8_las_long.yaml │ │ │ ├── data │ │ │ │ ├── pretrain.yaml │ │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ │ ├── spec_augment_speed_perturb_pretrain_F13_T50.yaml │ │ │ │ ├── spec_augment_speed_perturb_pretrain_F27_T100.yaml │ │ │ │ └── spec_augment_speed_perturb_pretrain_F27_T50.yaml │ │ │ ├── fbank.conf │ │ │ └── lm │ │ │ │ └── rnnlm.yaml │ │ ├── ctc_forced_align.sh │ │ ├── local │ │ │ ├── download_data.sh │ │ │ ├── format_lms.sh │ │ │ ├── join_suffix.py │ │ │ ├── plot_attention.sh │ │ │ ├── prepare_data.sh │ │ │ ├── prepare_dict.sh │ │ │ ├── ted_download_lm.sh │ │ │ └── ted_train_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── run_streaming.sh │ │ ├── score.sh │ │ ├── score_streaming.sh │ │ ├── steps │ │ └── utils │ └── s5_r3 │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ └── blstm_las.yaml │ │ ├── fbank.conf │ │ ├── lm │ │ │ └── rnnlm.yaml │ │ ├── spec_augment.yaml │ │ ├── spec_augment_speed_perturb.yaml │ │ └── speed_perturb.yaml │ │ ├── local │ │ ├── download_data.sh │ │ ├── format_lms.sh │ │ ├── join_suffix.py │ │ ├── prepare_data.sh │ │ ├── prepare_dict.sh │ │ ├── ted_download_lm.sh │ │ └── ted_train_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── timit │ ├── README.txt │ └── s5 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── blstm_ctc.yaml │ │ ├── blstm_las.yaml │ │ ├── dev_spk.list │ │ ├── fbank.conf │ │ ├── phones.60-48-39.map │ │ ├── rnn_transducer.yaml │ │ ├── test_spk.list │ │ ├── transformer.yaml │ │ └── transformer_relative.yaml │ │ ├── local │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── 
score_sclite.sh │ │ ├── timit_data_prep.sh │ │ ├── timit_format_data.sh │ │ └── timit_norm_trans.pl │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils └── wsj │ ├── README.txt │ └── s5 │ ├── RESULTS │ ├── cmd.sh │ ├── conf │ ├── asr │ │ ├── blstm_las.yaml │ │ ├── glu_encoder.yaml │ │ ├── tds_encoder.yaml │ │ └── transformer.yaml │ ├── data │ │ ├── spec_augment.yaml │ │ ├── spec_augment_speed_perturb.yaml │ │ └── speed_perturb.yaml │ ├── fbank.conf │ └── lm │ │ ├── gated_convlm.yaml │ │ ├── rnnlm.yaml │ │ └── transformerlm.yaml │ ├── local │ ├── append_utterances.sh │ ├── cstr_ndx2flist.pl │ ├── cstr_wsj_data_prep.sh │ ├── cstr_wsj_extend_dict.sh │ ├── dict │ │ ├── add_counts.pl │ │ ├── count_rules.pl │ │ ├── filter_dict.pl │ │ ├── find_acronyms.pl │ │ ├── get_acronym_prons.pl │ │ ├── get_candidate_prons.pl │ │ ├── get_rule_hierarchy.pl │ │ ├── get_rules.pl │ │ ├── limit_candidate_prons.pl │ │ ├── reverse_candidates.pl │ │ ├── reverse_dict.pl │ │ ├── score_prons.pl │ │ ├── score_rules.pl │ │ └── select_candidate_prons.pl │ ├── find_transcripts.pl │ ├── flist2scp.pl │ ├── ndx2flist.pl │ ├── normalize_trans.sh │ ├── normalize_transcript.pl │ ├── plot_attention.sh │ ├── plot_ctc.sh │ ├── score_lm.sh │ ├── wsj_data_prep.sh │ ├── wsj_extend_dict.sh │ ├── wsj_format_data.sh │ ├── wsj_format_local_lms.sh │ └── wsj_prepare_dict.sh │ ├── path.sh │ ├── run.sh │ ├── score.sh │ ├── steps │ └── utils ├── neural_sp ├── __init__.py ├── bin │ ├── __init__.py │ ├── args_asr.py │ ├── args_common.py │ ├── args_lm.py │ ├── asr │ │ ├── __init__.py │ │ ├── ctc_forced_align.py │ │ ├── eval.py │ │ ├── plot_attention.py │ │ ├── plot_ctc.py │ │ └── train.py │ ├── eval_utils.py │ ├── lm │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── plot_cache.py │ │ └── train.py │ ├── model_name.py │ ├── plot_utils.py │ └── train_utils.py ├── datasets │ ├── __init__.py │ ├── alignment.py │ ├── asr │ │ ├── __init__.py │ │ ├── build.py │ │ ├── dataloader.py │ │ ├── dataset.py │ │ └── sampler.py │ ├── lm.py │ ├── token_converter │ │ ├── __init__.py │ │ ├── character.py │ │ ├── phone.py │ │ ├── word.py │ │ └── wordpiece.py │ └── utils.py ├── evaluators │ ├── __init__.py │ ├── accuracy.py │ ├── character.py │ ├── edit_distance.py │ ├── phone.py │ ├── ppl.py │ ├── resolving_unk.py │ ├── word.py │ ├── wordpiece.py │ └── wordpiece_bleu.py ├── models │ ├── __init__.py │ ├── base.py │ ├── criterion.py │ ├── data_parallel.py │ ├── lm │ │ ├── __init__.py │ │ ├── build.py │ │ ├── gated_convlm.py │ │ ├── lm_base.py │ │ ├── rnnlm.py │ │ ├── transformer_xl.py │ │ └── transformerlm.py │ ├── modules │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── causal_conv.py │ │ ├── cif.py │ │ ├── conformer_convolution.py │ │ ├── gelu.py │ │ ├── glu.py │ │ ├── gmm_attention.py │ │ ├── headdrop.py │ │ ├── initialization.py │ │ ├── mocha │ │ │ ├── __init__.py │ │ │ ├── chunk_energy.py │ │ │ ├── hma_test.py │ │ │ ├── hma_train.py │ │ │ ├── mocha.py │ │ │ ├── mocha_test.py │ │ │ ├── mocha_train.py │ │ │ └── monotonic_energy.py │ │ ├── multihead_attention.py │ │ ├── positional_embedding.py │ │ ├── positionwise_feed_forward.py │ │ ├── relative_multihead_attention.py │ │ ├── softplus.py │ │ ├── swish.py │ │ ├── sync_bidir_multihead_attention.py │ │ ├── transformer.py │ │ └── zoneout.py │ ├── seq2seq │ │ ├── __init___.py │ │ ├── decoders │ │ │ ├── __init__.py │ │ │ ├── beam_search.py │ │ │ ├── build.py │ │ │ ├── ctc.py │ │ │ ├── decoder_base.py │ │ │ ├── fwd_bwd_attention.py │ │ │ ├── las.py │ │ │ ├── rnn_transducer.py │ │ │ └── transformer.py │ │ 
├── encoders │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── conformer.py │ │ │ ├── conformer_block.py │ │ │ ├── conformer_block_v2.py │ │ │ ├── conv.py │ │ │ ├── encoder_base.py │ │ │ ├── gated_conv.py │ │ │ ├── rnn.py │ │ │ ├── subsampling.py │ │ │ ├── tds.py │ │ │ ├── transformer.py │ │ │ ├── transformer_block.py │ │ │ └── utils.py │ │ ├── frontends │ │ │ ├── __init__.py │ │ │ ├── frame_stacking.py │ │ │ ├── input_noise.py │ │ │ ├── sequence_summary.py │ │ │ ├── spec_augment.py │ │ │ ├── splicing.py │ │ │ └── streaming.py │ │ └── speech2text.py │ └── torch_utils.py ├── trainers │ ├── __init__.py │ ├── lr_scheduler.py │ ├── optimizer.py │ └── reporter.py └── utils.py ├── setup.cfg ├── setup.py ├── test ├── __init__.py ├── decoders │ ├── dict.txt │ ├── test_las_decoder.py │ ├── test_rnn_transducer_decoder.py │ └── test_transformer_decoder.py ├── encoders │ ├── test_conformer_encoder.py │ ├── test_conv_encoder.py │ ├── test_rnn_encoder.py │ ├── test_rnn_encoder_streaming_chunkwise.py │ ├── test_tds_encoder.py │ ├── test_transformer_encoder.py │ ├── test_transformer_encoder_streaming_chunkwise.py │ └── test_utils.py ├── frontends │ ├── test_frame_stacking.py │ ├── test_input_noise.py │ ├── test_sequence_summary.py │ ├── test_specaugment.py │ ├── test_splicing.py │ └── test_streaming.py ├── install.sh ├── lm │ ├── test_rnnlm.py │ ├── test_transformer_xl_lm.py │ └── test_transformerlm.py ├── modules │ ├── test_attention.py │ ├── test_causal_conv.py │ ├── test_cif.py │ ├── test_conformer_convolution.py │ ├── test_gmm_attention.py │ ├── test_mocha.py │ ├── test_multihead_attention.py │ ├── test_pointwise_feed_forward.py │ ├── test_relative_multihead_attention.py │ └── test_zoneout.py ├── test_python.sh └── test_training.sh ├── tools └── Makefile └── utils ├── compute_oov_rate.py ├── concat_ref.py ├── dump_feat.sh ├── make_dataset.sh ├── make_tsv.py ├── make_vocab.sh ├── map2phone.py ├── speed_perturb_3way.sh ├── text2dict.py ├── trn2ctm.py └── update_dataset.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = neural_sp 4 | 5 | [report] 6 | exclude_lines = 7 | raise ValueError 8 | raise TypeError 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | 12 | ignore_errors = True -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.log 4 | __pycache__ 5 | .dropbox.attr 6 | .ftpconfig 7 | .nfs* 8 | .idea 9 | .pytest_cache 10 | .vscode 11 | *.done 12 | .coverage 13 | coverage.xml 14 | neural_sp.egg-info 15 | wandb 16 | 17 | # CI test 18 | examples/ci_test/data 19 | examples/ci_test/results 20 | examples/ci_test/sample 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | notifications: 2 | email: false 3 | 4 | dist: xenial 5 | 6 | language: python 7 | 8 | os: linux 9 | 10 | python: 11 | - "3.7" 12 | 13 | cache: 14 | - pip 15 | - ccache 16 | 17 | sudo: false 18 | 19 | install: 20 | - travis_retry ./test/install.sh 21 | 22 | script: 23 | - ./test/test_python.sh 24 | - ./test/test_training.sh 25 | 26 | after_success: 27 | - bash <(curl -s https://codecov.io/bash) 28 | 29 | env: 30 | - PYTORCH_VERSION=1.0.0 CC=gcc-7 CXX=g++-7 31 | - PYTORCH_VERSION=1.1.0 CC=gcc-7 CXX=g++-7 32 | - PYTORCH_VERSION=1.3.0 CC=gcc-7 CXX=g++-7 33 | - 
PYTORCH_VERSION=1.4.0 CC=gcc-7 CXX=g++-7 34 | - PYTORCH_VERSION=1.5.0 CC=gcc-7 CXX=g++-7 35 | - PYTORCH_VERSION=1.6.0 CC=gcc-7 CXX=g++-7 36 | - PYTORCH_VERSION=1.7.1 CC=gcc-7 CXX=g++-7 37 | - PYTORCH_VERSION=1.8.1 CC=gcc-7 CXX=g++-7 38 | 39 | addons: 40 | apt: 41 | sources: 42 | - ubuntu-toolchain-r-test 43 | packages: 44 | - cmake 45 | - g++-7 46 | - sox 47 | -------------------------------------------------------------------------------- /examples/aishell/README.txt: -------------------------------------------------------------------------------- 1 | Aishell is an open Chinese Mandarin speech database published by Beijing Shell Shell Technology Co., Ltd. 2 | 3 | 400 people from different accent areas in China were invited to participate in the recording, which was conducted in a quiet indoor environment using a high-fidelity microphone and downsampled to 16kHz. The manual transcription accuracy is above 95%, through professional speech annotation and strict quality inspection. The data is free for academic use. The corpus contains 170 hours of speech, and is divided into training (85%), development (10%) and testing (5%) sets. The development set is used to tune the hyperparameters in training. 4 | 5 | The database can be downloaded from openslr: 6 | http://www.openslr.org/33/ 7 | 8 | This folder contains two subfolders: 9 | s5: a speech recognition recipe 10 | v1: a speaker recognition recipe 11 | 12 | For more details, please visit: 13 | http://www.aishelltech.com/kysjcp 14 | -------------------------------------------------------------------------------- /examples/aishell/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 35 29 | convert_to_sgd_epoch: 100 30 | print_step: 1200 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/data/spec_augment_speed_perturb_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 1 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/data/speed_perturb_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 20->15->20 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 1 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/fbank.conf: 
-------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 64 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 15 16 | convert_to_sgd_epoch: 15 17 | print_step: 50 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.5 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/aishell/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/aishell/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/aishell/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/aishell2/s5/RESULTS.md: -------------------------------------------------------------------------------- 1 | ### Conformer-LAS + SpecAugment (no LM), hierarchical subsample1/8 2 | - conf: `conf/asr/conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml` 3 | - decoding parameters 4 | - epoch: 30 5 | - n_average: 10 6 | - beam width: 10 7 | - lm_weight: 0.0 8 | 9 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 10 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 11 | |test_android|5000|49532|94.0|5.8|0.2|0.1|**6.1**|36.7| 12 | |test_ios|5000|49532|94.6|5.2|0.2|0.1|**5.5**|34.1| 13 | |test_mic|5000|49532|94.3|5.6|0.2|0.1|**5.9**|35.7| 14 | -------------------------------------------------------------------------------- /examples/aishell2/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/aishell2/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/aishell2/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/aishell2/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/aishell2/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/ami/s5b/README.txt: -------------------------------------------------------------------------------- 1 | 2 | This s5b recipe is a streamlined and simplified version of the s5 recipe, with 3 | many components removed. 4 | 5 | Before running run.sh, please run run_prepare_shared.sh. 6 | 7 | Afterwards, you can run: 8 | run.sh --mic ihm # builds system for independent headset microphone 9 | run.sh --mic sdm1 # single distant microphone 10 | run.sh --mic mdm8 # multiple distant microphones + beamforming. 11 | 12 | Note: the sdm1 and mdm8 systems depend on the ihm system, because for 13 | best results we use the IHM alignments to train the neural nets. 14 | Please see RESULTS_* for results. 
15 | 16 | - For information about the database see: http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml 17 | 18 | -------------------------------------------------------------------------------- /examples/ami/s5b/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/ami_beamformit.cfg: -------------------------------------------------------------------------------- 1 | #BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) 2 | 3 | # scrolling size to compute the delays 4 | scroll_size = 250 5 | 6 | # cross correlation computation window size 7 | window_size = 500 8 | 9 | #amount of maximum points for the xcorrelation taken into account 10 | nbest_amount = 4 11 | 12 | #flag whether to apply an automatic noise thresholding 13 | do_noise_threshold = 1 14 | 15 | #Percentage of frames with lower xcorr taken as noisy 16 | noise_percent = 10 17 | 18 | ######## acoustic modelling parameters 19 | 20 | #transition probabilities weight for multichannel decoding 21 | trans_weight_multi = 25 22 | trans_weight_nbest = 25 23 | 24 | ### 25 | 26 | #flag whether to print the features after setting them, or not 27 | print_features = 1 28 | 29 | #flag whether to use the bad frames in the sum process 30 | do_avoid_bad_frames = 1 31 | 32 | #flag to use the best channel (SNR) as a reference 33 | #defined from command line 34 | do_compute_reference = 1 35 | 36 | #flag whether to use a uem file or not (process the whole file) 37 | do_use_uem_file = 0 38 | 39 | #flag whether to use an adaptive weights scheme or fixed weights 40 | do_adapt_weights = 1 41 | 42 | #flag whether to output the sph files or just run the system to create the auxiliary files 43 | do_write_sph_files = 1 44 | 45 | ####directories where to store/retrieve info#### 46 | #channels_file = ./cfg-files/channels 47 | 48 | #show needs to be passed as argument normally, here a default one is given just in case 49 | #show_id = Ttmp 50 | 51 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | 
conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 512 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 1024 26 | dec_n_projs: 0 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 1024 ### this is effective 29 | emb_dim: 512 30 | tie_embedding: false 31 | ctc_fc_list: "512" 32 | ### optimization 33 | batch_size: 30 34 | optimizer: adam 35 | n_epochs: 35 36 | convert_to_sgd_epoch: 100 37 | print_step: 200 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 15 42 | lr_decay_rate: 0.9 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 5 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 1 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 4000 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.0 54 | dropout_enc: 0.4 55 | dropout_dec: 0.4 56 | dropout_emb: 0.4 57 | dropout_att: 0.0 58 | weight_decay: 1e-6 59 | ss_prob: 0.2 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/blstm_rnnt.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 15 29 | optimizer: adam 30 | n_epochs: 35 31 | convert_to_sgd_epoch: 100 32 | print_step: 200 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 15 37 | lr_decay_rate: 0.9 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/lcblstm_rnnt_40_40.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: 
"(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 35 31 | convert_to_sgd_epoch: 100 32 | print_step: 200 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 15 37 | lr_decay_rate: 0.9 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 70 29 | convert_to_sgd_epoch: 100 30 | print_step: 1200 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 70 3 | print_step: 400 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.95 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment_speed_perturb.yaml: 
-------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 35->60 3 | print_step: 1200 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.925 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment_speed_perturb_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 35->60 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.925 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment_speed_perturb_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 35->60 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.925 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 3 | print_step: 600 # 200->600 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.85 6 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 50 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 10 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 34 | backward: false 35 | adaptive_softmax: false 36 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/ami_text_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015, Brno University of Technology (Author: Karel Vesely) 4 | # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski), 2014, Apache 2.0 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "Usage: $0 " 8 | echo " is download space." 
9 | exit 1; 10 | fi 11 | 12 | set -eux 13 | 14 | dir=$1 15 | mkdir -p $dir 16 | 17 | echo "Downloading annotations..." 18 | 19 | amiurl=http://groups.inf.ed.ac.uk/ami 20 | annotver=ami_public_manual_1.6.1 21 | annot="$dir/$annotver" 22 | 23 | logdir=${data}/local/downloads; mkdir -p $logdir/log 24 | [ ! -f $annot.zip ] && wget -nv -O $annot.zip $amiurl/AMICorpusAnnotations/$annotver.zip &> $logdir/log/download_ami_annot.log 25 | 26 | if [ ! -d $dir/annotations ]; then 27 | mkdir -p $dir/annotations 28 | unzip -o -d $dir/annotations $annot.zip &> /dev/null 29 | fi 30 | 31 | [ ! -f "$dir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-metadata.xml not found under $dir/annotations." && exit 1; 32 | 33 | 34 | # extract text from AMI XML annotations, 35 | local/ami_xml2text.sh $dir 36 | 37 | wdir=${data}/local/annotations 38 | [ ! -f $wdir/transcripts1 ] && echo "$0: File $wdir/transcripts1 not found." && exit 1; 39 | 40 | echo "Preprocessing transcripts..." 41 | local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log 42 | 43 | # make final train/dev/eval splits 44 | for dset in train eval dev; do 45 | grep -f local/split_$dset.orig $wdir/transcripts2 > $wdir/$dset.txt 46 | done 47 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/beamformit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) 4 | 5 | . ./path.sh 6 | 7 | nj=$1 8 | job=$2 9 | numch=$3 10 | meetings=$4 11 | sdir=$5 12 | odir=$6 13 | wdir=${data}/local/beamforming 14 | 15 | set -e 16 | set -u 17 | 18 | utils/split_scp.pl -j $nj $((job-1)) $meetings $meetings.$job 19 | 20 | while read line; do 21 | 22 | mkdir -p $odir/$line 23 | BeamformIt -s $line -c $wdir/channels_$numch \ 24 | --config_file `pwd`/conf/ami_beamformit.cfg \ 25 | --source_dir $sdir \ 26 | --result_dir $odir/$line 27 | mkdir -p $odir/$line 28 | mv $odir/$line/${line}.del $odir/$line/${line}_MDM$numch.del 29 | mv $odir/$line/${line}.del2 $odir/$line/${line}_MDM$numch.del2 30 | mv $odir/$line/${line}.info $odir/$line/${line}_MDM$numch.info 31 | mv $odir/$line/${line}.weat $odir/$line/${line}_MDM$numch.weat 32 | mv $odir/$line/${line}.wav $odir/$line/${line}_MDM$numch.wav 33 | #mv $odir/$line/${line}.ovl $odir/$line/${line}_MDM$numch.ovl # Was not created! 34 | 35 | done < $meetings.$job 36 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/english.glm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/examples/ami/s5b/local/english.glm -------------------------------------------------------------------------------- /examples/ami/s5b/local/split_REAMDE.txt: -------------------------------------------------------------------------------- 1 | The splits in this directory follow the official AMI Corpus Full-ASR split 2 | into train, dev and eval sets. 3 | 4 | If for some reason one needs to use a different split, the way to do so is 5 | to create split_*.final versions in this directory and run the recipe. 
6 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/split_dev.orig: -------------------------------------------------------------------------------- 1 | ES2011a 2 | ES2011b 3 | ES2011c 4 | ES2011d 5 | IB4001 6 | IB4002 7 | IB4003 8 | IB4004 9 | IB4010 10 | IB4011 11 | IS1008a 12 | IS1008b 13 | IS1008c 14 | IS1008d 15 | TS3004a 16 | TS3004b 17 | TS3004c 18 | TS3004d 19 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/split_eval.orig: -------------------------------------------------------------------------------- 1 | EN2002a 2 | EN2002b 3 | EN2002c 4 | EN2002d 5 | ES2004a 6 | ES2004b 7 | ES2004c 8 | ES2004d 9 | IS1009a 10 | IS1009b 11 | IS1009c 12 | IS1009d 13 | TS3003a 14 | TS3003b 15 | TS3003c 16 | TS3003d 17 | -------------------------------------------------------------------------------- /examples/ami/s5b/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/ami/s5b/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/ami/s5b/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/ci_test/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 16 14 | enc_n_projs: 8 15 | enc_n_layers: 1 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 16 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 16 26 | dec_n_projs: 8 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 16 29 | emb_dim: 16 30 | tie_embedding: false 31 | ctc_fc_list: "8" 32 | ### optimization 33 | batch_size: 1 34 | optimizer: adam 35 | n_epochs: 4 36 | convert_to_sgd_epoch: 100 37 | print_step: 1 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 2 42 | lr_decay_rate: 0.85 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 2 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 2 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 2 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.1 54 | dropout_enc: 0.1 55 | dropout_dec: 0.1 56 | dropout_emb: 0.1 57 | dropout_att: 0.1 58 | weight_decay: 1e-6 59 | ss_prob: 0.1 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/blstm_transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 16 14 | enc_n_projs: 8 15 | enc_n_layers: 1 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: transformer 20 | dec_n_layers: 1 21 | transformer_dec_attn_type: scaled_dot 22 | transformer_dec_pe_type: 1dconv3L 23 | transformer_dec_d_model: 8 24 | transformer_dec_d_ff: 64 25 | transformer_dec_n_heads: 4 26 | tie_embedding: false 27 | ctc_fc_list: "8" 28 | ### optimization 29 | batch_size: 1 30 | optimizer: noam 31 | n_epochs: 4 32 | convert_to_sgd_epoch: 100 33 | print_step: 1 34 | metric: accuracy 35 | lr_factor: 5.0 36 | early_stop_patient_n_epochs: 2 37 | shuffle_bucket: true 38 | sort_stop_epoch: 100 39 | eval_start_epoch: 2 40 | warmup_n_steps: 2 41 | accum_grad_n_steps: 2 42 | ### regularization 43 | clip_grad_norm: 5.0 44 | dropout_in: 0.1 45 | dropout_enc: 0.1 46 | dropout_dec: 0.1 47 | dropout_emb: 
0.1 48 | dropout_att: 0.1 49 | weight_decay: 1e-6 50 | lsm_prob: 0.1 51 | ### MTL 52 | ctc_weight: 0.3 53 | ctc_lsm_prob: 0.1 54 | mtl_per_batch: false 55 | task_specific_layer: false 56 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/conformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(1,1)" 10 | subsample: "1_2" 11 | subsample_type: max_pool 12 | enc_type: conv_conformer 13 | conformer_kernel_size: 3 14 | enc_n_layers: 1 15 | transformer_enc_pe_type: relative ### 16 | transformer_enc_d_model: 8 17 | transformer_enc_d_ff: 32 18 | transformer_enc_n_heads: 4 19 | dec_type: transformer 20 | dec_n_layers: 1 21 | transformer_dec_attn_type: scaled_dot 22 | transformer_dec_pe_type: 1dconv3L 23 | transformer_dec_d_model: 8 24 | transformer_dec_d_ff: 32 25 | transformer_dec_n_heads: 4 26 | tie_embedding: false 27 | ctc_fc_list: "8" 28 | ### optimization 29 | batch_size: 1 30 | optimizer: noam 31 | n_epochs: 4 32 | convert_to_sgd_epoch: 100 33 | print_step: 1 34 | metric: accuracy 35 | lr_factor: 5.0 36 | early_stop_patient_n_epochs: 2 37 | shuffle_bucket: true 38 | sort_stop_epoch: 100 39 | eval_start_epoch: 2 40 | warmup_n_steps: 2 41 | accum_grad_n_steps: 2 42 | ### regularization 43 | clip_grad_norm: 5.0 44 | dropout_in: 0.1 45 | dropout_enc: 0.1 46 | dropout_dec: 0.1 47 | dropout_emb: 0.1 48 | dropout_att: 0.1 49 | weight_decay: 1e-6 50 | lsm_prob: 0.1 51 | ### MTL 52 | ctc_weight: 0.3 53 | ctc_lsm_prob: 0.1 54 | mtl_per_batch: false 55 | task_specific_layer: false 56 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/lcblstm_transducer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 16 14 | enc_n_projs: 8 15 | enc_n_layers: 1 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 16 21 | dec_n_projs: 8 22 | dec_n_layers: 1 23 | dec_bottleneck_dim: 16 24 | emb_dim: 16 25 | tie_embedding: false 26 | ctc_fc_list: "8" 27 | ### optimization 28 | batch_size: 1 29 | optimizer: adam 30 | n_epochs: 4 31 | convert_to_sgd_epoch: 100 32 | print_step: 1 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 2 37 | lr_decay_rate: 0.85 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 2 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 2 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 2 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.1 49 | dropout_enc: 0.1 50 | dropout_dec: 0.1 51 | dropout_emb: 0.1 52 | weight_decay: 1e-6 53 | lsm_prob: 0.1 54 | ### MTL 55 | ctc_weight: 0.3 56 | ctc_lsm_prob: 0.1 57 | mtl_per_batch: false 58 | task_specific_layer: false 59 | -------------------------------------------------------------------------------- 
/examples/ci_test/conf/asr/lstm_ctc.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 16 13 | enc_n_projs: 8 14 | enc_n_layers: 1 15 | subsample_type: drop 16 | dec_type: lstm 17 | ctc_fc_list: "8" 18 | ### optimization 19 | batch_size: 1 20 | optimizer: adam 21 | n_epochs: 4 22 | convert_to_sgd_epoch: 100 23 | print_step: 1 24 | metric: edit_distance 25 | lr: 1e-3 26 | lr_decay_type: always 27 | lr_decay_start_epoch: 2 28 | lr_decay_rate: 0.85 29 | lr_decay_patient_n_epochs: 0 30 | early_stop_patient_n_epochs: 2 31 | sort_stop_epoch: 100 32 | eval_start_epoch: 2 33 | warmup_start_lr: 1e-4 34 | warmup_n_steps: 2 35 | ### initialization 36 | param_init: 0.1 37 | ### regularization 38 | clip_grad_norm: 5.0 39 | dropout_in: 0.1 40 | dropout_enc: 0.1 41 | weight_decay: 1e-6 42 | ### MTL 43 | ctc_weight: 1.0 44 | ctc_lsm_prob: 0.1 45 | mtl_per_batch: false 46 | task_specific_layer: false 47 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/tds_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "3_3_5_5_5_7_7_7_7_7_7" 7 | conv_kernel_sizes: "(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)" 8 | enc_type: tds 9 | attn_type: location 10 | attn_conv_n_channels: 10 11 | attn_conv_width: 201 12 | attn_dim: 16 13 | attn_n_heads: 1 14 | dec_type: lstm 15 | dec_n_units: 16 16 | dec_n_projs: 8 17 | dec_n_layers: 1 18 | dec_bottleneck_dim: 16 19 | emb_dim: 16 20 | tie_embedding: false 21 | ctc_fc_list: "8" 22 | ### optimization 23 | batch_size: 1 24 | optimizer: adam 25 | n_epochs: 4 26 | convert_to_sgd_epoch: 100 27 | print_step: 1 28 | metric: edit_distance 29 | lr: 1e-3 30 | lr_decay_type: always 31 | lr_decay_start_epoch: 2 32 | lr_decay_rate: 0.85 33 | lr_decay_patient_n_epochs: 0 34 | early_stop_patient_n_epochs: 2 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 2 37 | warmup_start_lr: 1e-4 38 | warmup_n_steps: 2 39 | ### initialization 40 | param_init: 0.1 41 | ### regularization 42 | clip_grad_norm: 5.0 43 | dropout_in: 0.1 44 | dropout_enc: 0.1 45 | dropout_dec: 0.1 46 | dropout_emb: 0.1 47 | dropout_att: 0.1 48 | weight_decay: 1e-6 49 | ss_prob: 0.1 50 | lsm_prob: 0.1 51 | ### MTL 52 | ctc_weight: 0.3 53 | ctc_lsm_prob: 0.1 54 | mtl_per_batch: false 55 | task_specific_layer: false 56 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(1,1)" 10 | subsample: "1_2" 11 | subsample_type: max_pool 12 | enc_type: conv_transformer 13 | enc_n_layers: 1 14 | transformer_enc_pe_type: none ### 15 | transformer_enc_d_model: 8 16 | transformer_enc_d_ff: 32 17 | transformer_enc_n_heads: 4 18 | dec_type: transformer 19 | dec_n_layers: 1 20 | transformer_dec_attn_type: scaled_dot 21 | 
transformer_dec_pe_type: 1dconv3L 22 | transformer_dec_d_model: 8 23 | transformer_dec_d_ff: 32 24 | transformer_dec_n_heads: 4 25 | tie_embedding: false 26 | ctc_fc_list: "8" 27 | ### optimization 28 | batch_size: 1 29 | optimizer: noam 30 | n_epochs: 4 31 | convert_to_sgd_epoch: 100 32 | print_step: 1 33 | metric: accuracy 34 | lr_factor: 5.0 35 | early_stop_patient_n_epochs: 2 36 | shuffle_bucket: true 37 | sort_stop_epoch: 100 38 | eval_start_epoch: 2 39 | warmup_n_steps: 2 40 | accum_grad_n_steps: 2 41 | ### regularization 42 | clip_grad_norm: 5.0 43 | dropout_in: 0.1 44 | dropout_enc: 0.1 45 | dropout_dec: 0.1 46 | dropout_emb: 0.1 47 | dropout_att: 0.1 48 | weight_decay: 1e-6 49 | lsm_prob: 0.1 50 | ### MTL 51 | ctc_weight: 0.3 52 | ctc_lsm_prob: 0.1 53 | mtl_per_batch: false 54 | task_specific_layer: false 55 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer_2mtl.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 2 12 | enc_n_layers_sub1: 1 13 | transformer_enc_pe_type: none ### 14 | transformer_enc_d_model: 8 15 | transformer_enc_d_ff: 32 16 | transformer_enc_n_heads: 4 17 | dec_type: transformer 18 | dec_n_layers: 2 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_pe_type: 1dconv3L 21 | transformer_dec_d_model: 8 22 | transformer_dec_d_ff: 32 23 | transformer_dec_n_heads: 4 24 | tie_embedding: false 25 | ctc_fc_list: "8" 26 | dec_config_sub1: 27 | dec_type: transformer 28 | dec_n_layers: 2 29 | ctc_fc_list: "8" 30 | ### optimization 31 | batch_size: 1 32 | optimizer: noam 33 | n_epochs: 4 34 | convert_to_sgd_epoch: 100 35 | print_step: 1 36 | metric: accuracy 37 | lr_factor: 5.0 38 | early_stop_patient_n_epochs: 2 39 | shuffle_bucket: true 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 2 42 | warmup_n_steps: 2 43 | accum_grad_n_steps: 2 44 | ### regularization 45 | clip_grad_norm: 5.0 46 | dropout_in: 0.1 47 | dropout_enc: 0.1 48 | dropout_dec: 0.1 49 | dropout_emb: 0.1 50 | dropout_att: 0.1 51 | weight_decay: 1e-6 52 | lsm_prob: 0.1 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_weight_sub1: 0.1 56 | ctc_lsm_prob: 0.1 57 | sub1_weight: 0.2 58 | mtl_per_batch: false 59 | task_specific_layer: true 60 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer_ctc.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 1 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 8 14 | transformer_enc_d_ff: 32 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | ctc_fc_list: "8" 18 | ### optimization 19 | batch_size: 1 20 | optimizer: noam 21 | n_epochs: 4 22 | convert_to_sgd_epoch: 100 23 | print_step: 1 24 | metric: edit_distance 25 | lr_factor: 5.0 26 | early_stop_patient_n_epochs: 2 27 | shuffle_bucket: true 28 | sort_stop_epoch: 100 29 | eval_start_epoch: 2 30 | warmup_n_steps: 2 31 | accum_grad_n_steps: 2 32 
| ### regularization 33 | clip_grad_norm: 5.0 34 | dropout_in: 0.1 35 | dropout_enc: 0.1 36 | dropout_dec: 0.1 37 | dropout_emb: 0.1 38 | dropout_att: 0.1 39 | weight_decay: 1e-6 40 | lsm_prob: 0.1 41 | ### MTL 42 | ctc_weight: 1.0 43 | ctc_lsm_prob: 0.1 44 | mtl_per_batch: false 45 | task_specific_layer: false 46 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 1 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 8 14 | transformer_enc_d_ff: 32 15 | transformer_enc_n_heads: 4 16 | attn_type: location 17 | attn_conv_n_channels: 10 18 | attn_conv_width: 201 19 | attn_dim: 16 20 | attn_n_heads: 1 21 | dec_type: lstm 22 | dec_n_units: 16 23 | dec_n_projs: 8 24 | dec_n_layers: 1 25 | dec_bottleneck_dim: 16 26 | emb_dim: 16 27 | tie_embedding: false 28 | ctc_fc_list: "8" 29 | ### optimization 30 | batch_size: 16000 31 | batch_size_type: frame 32 | optimizer: noam 33 | n_epochs: 4 34 | convert_to_sgd_epoch: 100 35 | print_step: 1 36 | metric: accuracy 37 | lr_factor: 5.0 38 | early_stop_patient_n_epochs: 2 39 | shuffle_bucket: true 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 2 42 | warmup_n_steps: 2 43 | accum_grad_n_steps: 2 44 | ### regularization 45 | clip_grad_norm: 5.0 46 | dropout_in: 0.1 47 | dropout_enc: 0.1 48 | dropout_dec: 0.1 49 | dropout_emb: 0.1 50 | dropout_att: 0.1 51 | weight_decay: 1e-6 52 | ss_prob: 0.1 53 | lsm_prob: 0.1 54 | ### MTL 55 | ctc_weight: 0.3 56 | ctc_lsm_prob: 0.1 57 | mtl_per_batch: false 58 | task_specific_layer: false 59 | -------------------------------------------------------------------------------- /examples/ci_test/conf/data/adaptive_spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # mask 2 | freq_width: 27 3 | n_freq_masks: 2 4 | time_width_upper: 1.0 5 | 6 | adaptive_number_ratio: 0.04 7 | adaptive_size_ratio: 0.04 8 | max_n_time_masks: 20 9 | -------------------------------------------------------------------------------- /examples/ci_test/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # mask 2 | freq_width: 27 3 | n_freq_masks: 2 4 | time_width: 100 5 | n_time_masks: 2 6 | time_width_upper: 1.0 7 | -------------------------------------------------------------------------------- /examples/ci_test/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/ci_test/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 16 4 | n_projs: 8 5 | n_layers: 2 6 | emb_dim: 16 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 1 13 | bptt: 10 14 | optimizer: adam 15 | n_epochs: 4 16 | 
convert_to_sgd_epoch: 100 17 | print_step: 1 18 | lr: 1e-3 19 | lr_decay_start_epoch: 2 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 2 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.1 30 | dropout_hidden: 0.1 31 | dropout_out: 0.1 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/ci_test/conf/lm/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 2 4 | transformer_d_model: 8 5 | transformer_d_ff: 32 6 | transformer_n_heads: 4 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 1 10 | bptt: 10 11 | mem_len: 10 12 | optimizer: noam 13 | n_epochs: 4 14 | convert_to_sgd_epoch: 100 15 | print_step: 1 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 2 18 | eval_start_epoch: 1 19 | warmup_n_steps: 2 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.1 25 | dropout_out: 0.1 26 | dropout_att: 0.1 27 | dropout_layer: 0.1 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/ci_test/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 2 4 | transformer_pe_type: add 5 | transformer_d_model: 8 6 | transformer_d_ff: 32 7 | transformer_n_heads: 4 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 1 11 | bptt: 10 12 | optimizer: noam 13 | n_epochs: 4 14 | convert_to_sgd_epoch: 100 15 | print_step: 1 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 2 18 | eval_start_epoch: 1 19 | warmup_n_steps: 2 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.1 25 | dropout_out: 0.1 26 | dropout_att: 0.1 27 | dropout_layer: 0.1 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/ci_test/ctc_forced_align.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | model= 7 | gpu= 8 | stdout=false 9 | n_threads=1 10 | eval_set="train" 11 | cmd_coverage="coverage run -a" 12 | 13 | ### path to save preprocessed data 14 | data=./data 15 | 16 | batch_size=1 17 | n_average=2 # for Transformer 18 | 19 | . ./cmd.sh 20 | . ./path.sh 21 | .
utils/parse_options.sh 22 | 23 | set -e 24 | set -u 25 | set -o pipefail 26 | 27 | if [ -z ${gpu} ]; then 28 | # CPU 29 | n_gpus=0 30 | export OMP_NUM_THREADS=${n_threads} 31 | else 32 | n_gpus=$(echo ${gpu} | tr "," "\n" | wc -l) 33 | fi 34 | 35 | for set in ${eval_set}; do 36 | recog_dir=$(dirname ${model})/align_${set} 37 | if [ ${n_average} != 1 ]; then 38 | recog_dir=${recog_dir}_average${n_average} 39 | fi 40 | mkdir -p ${recog_dir} 41 | 42 | CUDA_VISIBLE_DEVICES=${gpu} ${cmd_coverage} ${NEURALSP_ROOT}/neural_sp/bin/asr/ctc_forced_align.py \ 43 | --recog_n_gpus ${n_gpus} \ 44 | --recog_sets ${data}/dataset/${set}_char.tsv \ 45 | --recog_dir ${recog_dir} \ 46 | --recog_model ${model} \ 47 | --recog_batch_size ${batch_size} \ 48 | --recog_n_average ${n_average} \ 49 | --recog_stdout ${stdout} || exit 1; 50 | done 51 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/spk2utt: -------------------------------------------------------------------------------- 1 | LDC93S1 LDC93S1-1 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/text: -------------------------------------------------------------------------------- 1 | LDC93S1-1 she had your dark suit in greasy wash water all year 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/text.phone: -------------------------------------------------------------------------------- 1 | LDC93S1-1 h# sh ix hv eh dcl jh ih dcl d ah kcl k s ux q en gcl g r ix s ix w ao sh epi w ao dx axr ao l y ih axr h# 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/utt2spk: -------------------------------------------------------------------------------- 1 | LDC93S1-1 LDC93S1 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/wav.scp: -------------------------------------------------------------------------------- 1 | LDC93S1-1 cat ./sample/LDC93S1.wav | 2 | -------------------------------------------------------------------------------- /examples/ci_test/local/download_sample.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # install TIMIT samples (publicly available) 4 | mkdir -p $(pwd)/sample 5 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.phn 6 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.txt 7 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.wav 8 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.wrd 9 | -------------------------------------------------------------------------------- /examples/ci_test/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" 
&& exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/ci_test/plot_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | model= 7 | gpu= 8 | stdout=false 9 | n_threads=1 10 | eval_set="train" 11 | cmd_coverage="coverage run -a" 12 | 13 | ### path to save preprocessed data 14 | data=./data 15 | 16 | batch_size=1 17 | n_average=2 # for Transformer 18 | 19 | . ./cmd.sh 20 | . ./path.sh 21 | . utils/parse_options.sh 22 | 23 | set -e 24 | set -u 25 | set -o pipefail 26 | 27 | if [ -z ${gpu} ]; then 28 | # CPU 29 | n_gpus=0 30 | export OMP_NUM_THREADS=${n_threads} 31 | else 32 | n_gpus=$(echo ${gpu} | tr "," "\n" | wc -l) 33 | fi 34 | 35 | for set in ${eval_set}; do 36 | recog_dir=$(dirname ${model})/plot_${set} 37 | if [ ${n_average} != 1 ]; then 38 | recog_dir=${recog_dir}_average${n_average} 39 | fi 40 | mkdir -p ${recog_dir} 41 | 42 | CUDA_VISIBLE_DEVICES=${gpu} ${cmd_coverage} ${NEURALSP_ROOT}/neural_sp/bin/asr/plot_ctc.py \ 43 | --recog_n_gpus ${n_gpus} \ 44 | --recog_sets ${data}/dataset/${set}_char.tsv \ 45 | --recog_dir ${recog_dir} \ 46 | --recog_model ${model} \ 47 | --recog_batch_size ${batch_size} \ 48 | --recog_n_average ${n_average} \ 49 | --recog_stdout ${stdout} || exit 1; 50 | done 51 | -------------------------------------------------------------------------------- /examples/ci_test/steps: -------------------------------------------------------------------------------- 1 | ../wsj/s5/steps -------------------------------------------------------------------------------- /examples/ci_test/utils: -------------------------------------------------------------------------------- 1 | ../wsj/s5/utils -------------------------------------------------------------------------------- /examples/csj/README.txt: -------------------------------------------------------------------------------- 1 | About the Corpus of Spontaneous Japanese: 2 | The Corpus of Spontaneous Japanese (CSJ) is a database of spoken 3 | Japanese developed by Japan's national priority area research 4 | project "Spontaneous Speech: Corpus and Processing Technology". 5 | It contains about 650 hours of speech consisting of approximately 6 | 7.5 million words that were provided by more than 1,400 speakers. 7 | For more details about the corpus, please visit the website of the 8 | National Institute for Japanese Language (NINJAL). It is available 9 | from the Institute. 10 | http://www.ninjal.ac.jp/english/products/csj/ 11 | http://pj.ninjal.ac.jp/corpus_center/csj/ 12 | 13 | Meta-parameter tuning based on evolution strategy: 14 | The meta-parameters of the system contained in conf/config_opt were 15 | automatically tuned using an evolution strategy.
For details, 16 | please refer to the following paper: 17 | Takafumi Moriya, Tomohiro Tanaka, Takahiro Shinozaki, Shinji Watanabe, 18 | and Kevin Duh, "Automation of System Building for State-of-the-art 19 | Large Vocabulary Speech Recognition Using Evolution Strategy," Proc. 20 | IEEE 2015 Automatic Speech Recognition and Understanding Workshop 21 | (ASRU), 2015. 22 | 23 | 24 | Each subdirectory of this directory contains the 25 | scripts for a sequence of experiments. 26 | s5: This is the current recommended recipe. 27 | The recipe supports the third and fourth editions of CSJ. -------------------------------------------------------------------------------- /examples/csj/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/las/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 512 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 1024 26 | dec_n_projs: 0 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 1024 ### this is effective 29 | emb_dim: 512 30 | tie_embedding: false 31 | ctc_fc_list: "512" 32 | ### optimization 33 | batch_size: 30 34 | optimizer: adam 35 | n_epochs: 25 36 | convert_to_sgd_epoch: 100 37 | print_step: 800 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 10 42 | lr_decay_rate: 0.85 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 5 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 1 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 0 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.0 54 | dropout_enc: 0.4 55 | dropout_dec: 0.4
56 | dropout_emb: 0.4 57 | dropout_att: 0.0 58 | weight_decay: 1e-6 59 | ss_prob: 0.2 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/las/lcblstm_las_chunk4040.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 512 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 1024 26 | dec_n_projs: 0 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 1024 ### this is effective 29 | emb_dim: 512 30 | tie_embedding: false 31 | ctc_fc_list: "512" 32 | ### optimization 33 | batch_size: 20 34 | optimizer: adam 35 | n_epochs: 25 36 | convert_to_sgd_epoch: 100 37 | print_step: 800 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 10 42 | lr_decay_rate: 0.85 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 5 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 1 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 0 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.0 54 | dropout_enc: 0.4 55 | dropout_dec: 0.4 56 | dropout_emb: 0.4 57 | dropout_att: 0.0 58 | weight_decay: 1e-6 59 | ss_prob: 0.2 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/las/lstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | attn_type: location 17 | attn_conv_n_channels: 10 18 | attn_conv_width: 201 19 | attn_dim: 512 20 | attn_n_heads: 1 21 | dec_type: lstm 22 | dec_n_units: 1024 23 | dec_n_projs: 0 24 | dec_n_layers: 1 25 | dec_bottleneck_dim: 1024 ### this is effective 26 | emb_dim: 512 27 | tie_embedding: false 28 | ctc_fc_list: "512" 29 | ### optimization 30 | batch_size: 30 31 | optimizer: adam 32 | n_epochs: 25 33 | convert_to_sgd_epoch: 100 34 | print_step: 800 35 | metric: edit_distance 36 | lr: 1e-3 37 | lr_decay_type: always 38 | lr_decay_start_epoch: 10 39 | lr_decay_rate: 0.85 40 | lr_decay_patient_n_epochs: 0 41 | early_stop_patient_n_epochs: 5 42 | sort_stop_epoch: 100 43 | eval_start_epoch: 1 44 | warmup_start_lr: 1e-4 45 | warmup_n_steps: 0 46 | ### initialization 47 | param_init: 0.1 48 | ### regularization 49 | clip_grad_norm: 5.0 50 | dropout_in: 0.0 51 | dropout_enc: 0.4 52 | dropout_dec: 0.4 53 | dropout_emb: 0.4 54 | 
dropout_att: 0.0 55 | weight_decay: 1e-6 56 | ss_prob: 0.2 57 | lsm_prob: 0.1 58 | ### MTL 59 | ctc_weight: 0.3 60 | ctc_lsm_prob: 0.1 61 | mtl_per_batch: false 62 | task_specific_layer: false 63 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/transformer/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 40 29 | convert_to_sgd_epoch: 100 30 | print_step: 400 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 25->20 3 | print_step: 800 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 3 | print_step: 800 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_pretrain_F13_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1600 # 800->1600 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 13 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1600 # 800->1600 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # 
mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1600 # 800->1600 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 2400 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 3 | print_step: 1200 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 64 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 30 16 | convert_to_sgd_epoch: 30 17 | print_step: 50 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/lm/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 12 4 | transformer_d_model: 512 5 | transformer_d_ff: 2048 6 | transformer_n_heads: 8 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 24 10 | bptt: 200 11 | mem_len: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 4 ### 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | 
dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 12 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/csj/s5/local/csj_prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Making dictionary using CSJ data with morpheme analysis. 4 | # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013) 5 | 6 | # To be run from one directory above this script. 7 | 8 | . ./path.sh 9 | 10 | #check existing directories 11 | [ $# != 0 ] && echo "Usage: local/csj_data_prep.sh" && exit 1; 12 | 13 | srcdir=${data}/local/train_${datasize} 14 | dir=${data}/local/dict_nosp 15 | mkdir -p $dir 16 | srcdict=$srcdir/lexicon.txt 17 | 18 | # assume csj_data_prep.sh was done already. 19 | [ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1; 20 | 21 | #(2a) Dictionary preparation: 22 | # Pre-processing (Upper-case, remove comments) 23 | cat $srcdict > $dir/lexicon1.txt || exit 1; 24 | 25 | cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ 26 | grep -v sp > $dir/nonsilence_phones.txt || exit 1; 27 | 28 | #( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt 29 | ( echo sp ; echo spn ; ) > $dir/silence_phones.txt 30 | 31 | echo sp > $dir/optional_silence.txt 32 | 33 | # No "extra questions" in the input to this setup, as we don't 34 | # have stress or tone. 35 | echo -n >$dir/extra_questions.txt 36 | 37 | # Add to the lexicon the silences, noises etc. 38 | ( echo ' sp' ; echo ' spn'; ) | cat - $dir/lexicon1.txt > $dir/lexicon2.txt || exit 1; 39 | 40 | 41 | pushd $dir >&/dev/null 42 | ln -sf lexicon2.txt lexicon.txt 43 | popd >&/dev/null 44 | 45 | echo Prepared input dictionary and phone-sets for CSJ phase 1. 46 | -------------------------------------------------------------------------------- /examples/csj/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/csj/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/csj/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/laborotv/s5/README.md: -------------------------------------------------------------------------------- 1 | #### Conformer LAS large + SpecAugment 2 | - conf: `conf/asr/conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml` 3 | - decoding parameters 4 | - epoch 40 5 | - beam width: 10 6 | - lm_weight: 0.0 7 | - length norm: true 8 | 9 | ##### WER 10 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 11 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 12 | |dev_4k|4000|57637|93.6|4.7|1.7|3.2|**9.7**|48.6| 13 | |dev|12000|153743|91.5|6.4|2.0|4.0|**12.5**|53.5| 14 | 15 | ##### CER 16 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 17 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 18 | |dev_4k|4000|101224|95.3|3.0|1.7|3.1|**7.8**|46.2| 19 | |dev|12000|273004|93.8|4.0|2.2|3.9|**10.1**|50.9| 20 | |tedx-jp-10k|10000|191708|90.2|5.0|4.8|2.6|**12.4**|64.8| 21 | -------------------------------------------------------------------------------- /examples/laborotv/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/laborotv/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/laborotv/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 64 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 200 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/laborotv/s5/local/laborotv_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Data preparation for LaboroTVSpeech 4 | 5 | . ./path.sh 6 | set -e # exit on error 7 | 8 | if [[ $# -ne 1 ]]; then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | CORPUS_DIR=$1 14 | 15 | # Data 16 | for x in train dev; do 17 | echo "$0: Making data/${x} ..." 18 | mkdir -p ${data}/${x} 19 | perl -pe 's/,/ /' ${CORPUS_DIR}/data/${x}/text.csv >${data}/${x}/text 20 | cut -d',' -f1 ${CORPUS_DIR}/data/${x}/text.csv | 21 | awk -v dir=${CORPUS_DIR}/data/${x}/wav/ "{print dir\$1\".wav\"}" | 22 | sort | 23 | perl -pe 's,(.*/)([^/]*)(\.wav),\2 \1\2\3,g' \ 24 | >${data}/${x}/wav.scp 25 | 26 | # Make a dumb utt2spk and spk2utt, 27 | # where each utterance corresponds to a unique speaker. 28 | awk '{print $1,$1_spk}' ${data}/${x}/text >${data}/${x}/utt2spk 29 | utils/utt2spk_to_spk2utt.pl ${data}/${x}/utt2spk >${data}/${x}/spk2utt 30 | 31 | utils/data/get_utt2dur.sh ${data}/${x} 32 | 33 | utils/fix_data_dir.sh ${data}/${x} 34 | utils/validate_data_dir.sh --no-feats ${data}/${x} 35 | done 36 | 37 | echo "$0: done preparing data directories" 38 | -------------------------------------------------------------------------------- /examples/laborotv/s5/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Prepare dict_nosp/ from lexicon.txt 4 | # This is a simplified version of egs/csj/s5/local/csj_prepare_dict.sh 5 | 6 | . 
./path.sh 7 | set -e # exit on error 8 | 9 | if [[ $# -ne 2 ]]; then 10 | echo "Usage: $0 " 11 | exit 1 12 | fi 13 | 14 | lexicon=$1 15 | dir=$2 16 | 17 | mkdir -p $dir 18 | 19 | cat $lexicon | 20 | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | 21 | grep -v sp >$dir/nonsilence_phones.txt || exit 1 22 | 23 | ( 24 | echo sp 25 | echo spn 26 | ) >$dir/silence_phones.txt 27 | 28 | echo sp >$dir/optional_silence.txt 29 | 30 | # No "extra questions" in the input to this setup, as we don't 31 | # have stress or tone. 32 | echo -n >$dir/extra_questions.txt 33 | 34 | # Add to the lexicon the silences, noises etc. 35 | ( 36 | echo ' sp' 37 | echo ' spn' 38 | ) | cat - $lexicon >$dir/lexicon.txt || exit 1 39 | 40 | sort $dir/lexicon.txt -uo $dir/lexicon.txt 41 | 42 | echo "$0: Done preparing $dir" 43 | -------------------------------------------------------------------------------- /examples/laborotv/s5/local/remove_pos.py: -------------------------------------------------------------------------------- 1 | ../../../csj/s5/local/remove_pos.py -------------------------------------------------------------------------------- /examples/laborotv/s5/local/tedx-jp-10k_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Build the TEDxJP-10K dataset 5 | 6 | . ./path.sh 7 | set -e # exit on error 8 | 9 | #videos_csv="local/tedx-jp/tedx-jp-10k.csv" 10 | #all_to_10k_utt_map="local/tedx-jp/all_to_10k_utt_map.txt" 11 | 12 | . utils/parse_options.sh 13 | 14 | if [[ $# -ne 1 ]]; then 15 | echo "Usage: $0 " 16 | echo "This script does preprocessing of the TEDx-JP-10K dataset." 17 | echo " should contain segments, spk2utt, text, utt2spk, wavlist.txt, and wav/." 18 | exit 1 19 | fi 20 | 21 | RAW_DATA_DIR=$1 22 | dst_dir="${data}/tedx-jp-10k" 23 | 24 | mkdir -p ${dst_dir} 25 | 26 | # Copy necessary files to data directory 27 | echo "$0: Copying segments, spk2utt, text and utt2spk to $dst_dir." 28 | cp ${RAW_DATA_DIR}/{segments,spk2utt,text,utt2spk} ${dst_dir} 29 | 30 | echo "$0: Creating wav.scp from wavlist.txt" 31 | rm -f ${dst_dir}/wav.scp 32 | touch ${dst_dir}/wav.scp 33 | while read line; do 34 | id=$(cut -d' ' -f 1 <<<${line}) 35 | filepath=${RAW_DATA_DIR}/wav/$(cut -d' ' -f 2 <<<${line}) 36 | echo "${id} sox \"${filepath}\" -c 1 -r 16000 -t wav - |" >> ${dst_dir}/wav.scp 37 | done < ${RAW_DATA_DIR}/wavlist.txt 38 | utils/data/validate_data_dir.sh --no-feats ${dst_dir} 39 | 40 | echo "$0: Done preprocessing TEDxJP-10K dataset (${dst_dir})" 41 | -------------------------------------------------------------------------------- /examples/laborotv/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/laborotv/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/laborotv/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/language_model/ptb/RESULTS: -------------------------------------------------------------------------------- 1 | lstm1024H0P2L_emb1024_adam_lr0.001_bs20_bptt30_tie_residual_glu_1tokens 2 | % PPL 3 | valid (baseline): 87.99 4 | valid (cache size: 100): 79.58 5 | valid (cache size: 500): 77.36 6 | test (baseline): 86.06 7 | test (cache size: 100): 79.12 8 | test (cache size: 500): 76.94 9 | -------------------------------------------------------------------------------- /examples/language_model/ptb/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/language_model/ptb/conf/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | tie_embedding: true 8 | residual: true 9 | use_glu: true 10 | # optimization 11 | batch_size: 20 12 | bptt: 30 13 | optimizer: adam 14 | n_epochs: 50 15 | convert_to_sgd_epoch: 50 16 | print_step: 100 17 | lr: 1e-3 18 | lr_decay_start_epoch: 10 19 | lr_decay_rate: 0.9 20 | lr_decay_patient_n_epochs: 0 21 | lr_decay_type: always 22 | early_stop_patient_n_epochs: 10 23 | eval_start_epoch: 1 24 | # initialization 25 | param_init: 0.05 26 | # regularization 27 | # clip_grad_norm: 0.1 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.65 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | -------------------------------------------------------------------------------- /examples/language_model/ptb/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/language_model/ptb/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/language_model/ptb/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/language_model/wikitext2/RESULTS: -------------------------------------------------------------------------------- 1 | lstm1024H0P2L_emb1024_adam_lr0.001_bs20_bptt30_tie_residual_glu_1tokens 2 | % PPL 3 | valid (baseline): 104.53 4 | valid (cache size: 100): 90.86 5 | valid (cache size: 2000): 76.10 6 | test (baseline): 98.73 7 | test (cache size: 100): 85.87 8 | test (cache size: 2000): 72.77 9 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/gcnn.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: gated_conv_custom 3 | kernel_size: 4 4 | n_units: 1024 5 | n_projs: 512 6 | n_layers: 6 7 | emb_dim: 300 8 | tie_embedding: false 9 | # optimization 10 | batch_size: 50 11 | bptt: 200 12 | optimizer: nesterov 13 | n_epochs: 100 14 | convert_to_sgd_epoch: 100 15 | print_step: 100 16 | lr: 2.0 17 | lr_decay_start_epoch: 10 18 | lr_decay_rate: 0.75 19 | lr_decay_patient_n_epochs: 0 20 | # lr_decay_type: epoch 21 | lr_decay_type: metric 22 | early_stop_patient_n_epochs: 20 23 | eval_start_epoch: 1 24 | # initialization 25 | param_init: 0.05 26 | # regularization 27 | clip_grad_norm: 0.1 28 | dropout_in: 0.2 29 | dropout_hidden: 0.5 30 | dropout_out: 0.0 31 | weight_decay: 1e-6 32 | adaptive_softmax: true 33 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | tie_embedding: true 8 | residual: true 9 | use_glu: true 10 | # optimization 11 | batch_size: 20 12 | bptt: 30 13 | optimizer: adam 14 | n_epochs: 50 15 | convert_to_sgd_epoch: 100 16 | print_step: 200 17 | lr: 1e-3 18 | lr_decay_start_epoch: 10 19 | lr_decay_rate: 0.9 20 | lr_decay_patient_n_epochs: 0 21 | lr_decay_type: always 22 | early_stop_patient_n_epochs: 10 23 | eval_start_epoch: 1 24 | # initialization 25 | param_init: 0.05 26 | # regularization 27 | clip_grad_norm: 0.1 28 | dropout_in: 0.2 29 | dropout_hidden: 0.5 30 | dropout_out: 0.0 31 | weight_decay: 1e-6 32 | lsm_prob: 0.0 33 | adaptive_softmax: false 34 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 12 4 | transformer_d_model: 512 5 | transformer_d_ff: 2048 6 | transformer_n_heads: 8 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 24 10 | bptt: 200 11 | mem_len: 200 12 | optimizer: noam 13 | n_epochs: 50 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 4 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 6 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | 
transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 50 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.0 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/language_model/wikitext2/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/librispeech/README.txt: -------------------------------------------------------------------------------- 1 | 2 | The LibriSpeech corpus is a large (1000 hour) corpus of English read speech 3 | derived from audiobooks in the LibriVox project, sampled at 16kHz. The 4 | accents are various and not marked, but the majority are US English. It is 5 | available for download for free at http://www.openslr.org/12/. It was prepared 6 | as a speech recognition corpus by Vassil Panayotov. 7 | 8 | The recipe is in s5/ 9 | 10 | -------------------------------------------------------------------------------- /examples/librispeech/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. 
Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transducer/blstm_transducer_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 15 29 | optimizer: adam 30 | n_epochs: 30 31 | convert_to_sgd_epoch: 100 32 | print_step: 1000 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.85 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 0 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transducer/lcblstm_rnnt_chunk4040_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 30 31 | convert_to_sgd_epoch: 100 32 | print_step: 1000 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | 
lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.85 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 0 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transducer/lstm_rnnt_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | dec_type: lstm_transducer 17 | dec_n_units: 1024 18 | dec_n_projs: 0 19 | dec_n_layers: 2 20 | dec_bottleneck_dim: 512 21 | emb_dim: 512 22 | tie_embedding: false 23 | ctc_fc_list: "512" 24 | ### optimization 25 | batch_size: 15 26 | optimizer: adam 27 | n_epochs: 35 # 20->35 28 | convert_to_sgd_epoch: 100 29 | print_step: 1000 30 | metric: edit_distance 31 | lr: 1e-3 32 | lr_decay_type: always 33 | lr_decay_start_epoch: 10 34 | lr_decay_rate: 0.85 ### 0.8->0.85 35 | lr_decay_patient_n_epochs: 0 36 | early_stop_patient_n_epochs: 5 37 | sort_stop_epoch: 100 38 | eval_start_epoch: 1 39 | warmup_start_lr: 1e-4 40 | warmup_n_steps: 4000 ### this is important 41 | ### initialization 42 | param_init: 0.1 43 | ### regularization 44 | clip_grad_norm: 5.0 45 | dropout_in: 0.0 46 | dropout_enc: 0.4 47 | dropout_dec: 0.4 48 | dropout_emb: 0.4 49 | weight_decay: 1e-6 50 | ### MTL 51 | ctc_weight: 0.3 52 | ctc_lsm_prob: 0.1 53 | mtl_per_batch: false 54 | task_specific_layer: false 55 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 
44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer_512dmodel_8H.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 512 ### 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 8 ### 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 512 ### 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 8 ### 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer_768dmodel_3072dff_8H.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 768 ### 14 | transformer_enc_d_ff: 3072 ### 15 | transformer_enc_n_heads: 8 ### 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 768 ### 21 | transformer_dec_d_ff: 3072 ### 22 | transformer_dec_n_heads: 8 ### 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | 
dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer_subsample8.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 30->20 3 | print_step: 2000 # 1000->2000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 30->60 3 | print_step: 2000 # 1000->2000 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_pretrain_F13_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 4000 # 2000->4000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 13 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- 
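Note on the SpecAugment fields that recur in the data configs above and below (freq_width, n_freq_masks, time_width, n_time_masks, time_width_upper): the sketch that follows shows one plausible reading of how these values drive the masking, assuming the standard SpecAugment semantics (freq_width = max mel bins per frequency mask, time_width = max frames per time mask, time_width_upper = cap on mask length as a fraction of the utterance). It is an illustration only, not neural_sp's actual implementation, and the interpretation of time_width_upper is an assumption.

# Hypothetical illustration of how the SpecAugment fields in these configs could be
# applied to a (time, freq) log-mel matrix. NOT neural_sp's implementation; a minimal
# sketch assuming the usual SpecAugment meaning of each field.
import numpy as np

def spec_augment(x, freq_width=27, n_freq_masks=2, time_width=100,
                 n_time_masks=2, time_width_upper=1.0, seed=None):
    """x: np.ndarray of shape (n_frames, n_mels). Returns a masked copy."""
    rng = np.random.default_rng(seed)
    x = x.copy()
    n_frames, n_mels = x.shape
    # Frequency masks: each zeroes out up to `freq_width` consecutive mel bins.
    for _ in range(n_freq_masks):
        f = rng.integers(0, freq_width + 1)
        f0 = rng.integers(0, max(1, n_mels - f))
        x[:, f0:f0 + f] = 0.0
    # Time masks: width capped by both `time_width` frames and (assumed meaning)
    # a fraction `time_width_upper` of the utterance length.
    max_t = min(time_width, int(n_frames * time_width_upper))
    for _ in range(n_time_masks):
        t = rng.integers(0, max_t + 1)
        t0 = rng.integers(0, max(1, n_frames - t))
        x[t0:t0 + t, :] = 0.0
    return x

# Example: mask a random 800-frame, 80-bin log-mel matrix with the F27/T100 setting.
# feats = spec_augment(np.random.randn(800, 80), seed=0)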
/examples/librispeech/s5/conf/data/spec_augment_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 4000 # 2000->4000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 4000 # 2000->4000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 # 25->50 3 | print_step: 6000 # 1000->6000 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_speed_perturb_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 6000 # 1000->6000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.85 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 100 17 | print_step: 2000 18 | lr: 1e-3 19 | lr_decay_start_epoch: 5 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.0 30 | dropout_hidden: 0.0 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.0 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/lm/rnnlm_6L.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | 
n_projs: 0 5 | n_layers: 6 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 100 17 | print_step: 2000 18 | lr: 1e-3 19 | lr_decay_start_epoch: 5 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.0 30 | dropout_hidden: 0.0 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.0 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/librispeech/s5/local/format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test time language model(G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh 10 | 11 | if [ $# -ne 1 ]; then 12 | echo "Usage: $0 " 13 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 14 | echo ", where:" 15 | echo " is the directory in which the language model is stored/downloaded" 16 | exit 1 17 | fi 18 | 19 | lm_dir=$1 20 | 21 | lexicon=$DATA/local/lang_tmp/lexiconp.txt 22 | 23 | # This loop was taken verbatim from wsj_format_data.sh, and I'm leaving it in place in 24 | # case we decide to add more language models at some point 25 | for lm_suffix in tgpr; do 26 | test=$DATA/lang_test_${lm_suffix} 27 | mkdir -p $test 28 | for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones topo oov.txt oov.int; do 29 | cp -r $DATA/lang/$f $test 30 | done 31 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 32 | arpa2fst --disambig-symbol=#0 \ 33 | --read-symbol-table=$test/words.txt - $test/G.fst 34 | 35 | utils/validate_lang.pl $test || exit 1; 36 | done 37 | 38 | echo "Succeeded in formatting data." 39 | 40 | exit 0 41 | -------------------------------------------------------------------------------- /examples/librispeech/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/librispeech/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/librispeech/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/swbd/README.txt: -------------------------------------------------------------------------------- 1 | About the Switchboard corpus 2 | 3 | This is conversational telephone speech collected as 2-channel, 8kHz-sampled 4 | data. We are using just the Switchboard-1 Phase 1 training data. 5 | The catalog number LDC97S62 (Switchboard-1 Release 2) corresponds, we believe, 6 | to what we have. We also use the Mississippi State transcriptions, which 7 | we download separately from 8 | http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz 9 | 10 | We are using the eval2000 a.k.a. hub5'00 evaluation data. The acoustics are 11 | LDC2002S09 and the text is LDC2002T43. 12 | 13 | We are also using the RT'03 test set, available as LDC2007S10. Note: not 14 | all parts of the recipe test with this. 15 | 16 | About the Fisher corpus for language modeling 17 | 18 | We use Fisher English training speech transcripts for language modeling, if 19 | they are available. The catalog number for part 1 transcripts is LDC2004T19, 20 | and LDC2005T19 for part 2. 21 | 22 | Each subdirectory of this directory contains the 23 | scripts for a sequence of experiments. 24 | 25 | s5: This is slightly out of date, please see s5c 26 | 27 | s5b: This is (somewhat less) out of date, please see s5c 28 | 29 | s5c: This is the current recipe. 30 | -------------------------------------------------------------------------------- /examples/swbd/s5c/RESULTS: -------------------------------------------------------------------------------- 1 | # swbd 300h 2 | | no LM | RNNLM | 3 | | SWBD | CH | SWBD | CH | 4 | BPE10k attention | 11.8 | 23.1 | 10.9 | 22.6 | 5 | BPE10k attention + SpecAugment | 9.4 | 19.1 | 9.1 | 18.8 | 6 | 7 | 8 | # swbd+fisher 2000h 9 | | no LM | RNNLM | 10 | | SWBD | CH | SWBD | CH | 11 | BPE34k attention | 7.8 | 13.8 | N/A | N/A | 12 | -------------------------------------------------------------------------------- /examples/swbd/s5c/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. 
Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/asr/blstm_las_fisher_swbd.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_2_2_2_1" 11 | enc_type: blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 6 16 | subsample_type: drop 17 | attn_type: location 18 | attn_conv_n_channels: 10 19 | attn_conv_width: 201 20 | attn_dim: 512 21 | attn_n_heads: 1 22 | dec_type: lstm 23 | dec_n_units: 1024 24 | dec_n_projs: 0 25 | dec_n_layers: 1 26 | dec_bottleneck_dim: 1024 ### this is effective 27 | emb_dim: 512 28 | tie_embedding: false 29 | ctc_fc_list: "512" 30 | ### optimization 31 | batch_size: 50 32 | optimizer: adam 33 | n_epochs: 25 34 | convert_to_sgd_epoch: 100 35 | print_step: 1000 36 | metric: edit_distance 37 | lr: 5e-4 38 | lr_decay_type: always 39 | lr_decay_start_epoch: 10 40 | lr_decay_rate: 0.85 41 | lr_decay_patient_n_epochs: 0 42 | early_stop_patient_n_epochs: 5 43 | sort_stop_epoch: 100 44 | eval_start_epoch: 1 45 | warmup_start_lr: 1e-4 46 | warmup_n_steps: 4000 47 | ### initialization 48 | param_init: 0.1 49 | ### regularization 50 | clip_grad_norm: 5.0 51 | dropout_in: 0.0 52 | dropout_enc: 0.4 53 | dropout_dec: 0.4 54 | dropout_emb: 0.4 55 | dropout_att: 0.0 56 | weight_decay: 1e-6 57 | ss_prob: 0.2 58 | lsm_prob: 0.1 59 | ### MTL 60 | ctc_weight: 0.0 61 | ctc_lsm_prob: 0.1 62 | mtl_per_batch: false 63 | task_specific_layer: false 64 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | 
convert_to_sgd_epoch: 100 30 | print_step: 1200 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 70 57 | n_time_masks: 2 58 | time_width_upper: 0.2 59 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/asr/transformer_fisher_swbd.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 512 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 8 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 512 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 8 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 70 57 | n_time_masks: 2 58 | time_width_upper: 0.2 59 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 # 25->50 3 | print_step: 400 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 70 11 | n_time_masks: 2 12 | time_width_upper: 0.2 13 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 70 11 | n_time_masks: 2 12 | time_width_upper: 0.2 13 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/speed_perturb.yaml: 
-------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 3 | print_step: 600 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/speed_perturb_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 15 3 | print_step: 1200 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=8000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 200 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.95 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 10 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | # contextualization 36 | serialize: false ### 37 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/lm/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 12 4 | transformer_d_model: 512 5 | transformer_d_ff: 2048 6 | transformer_n_heads: 8 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 24 10 | bptt: 200 11 | mem_len: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 400 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 4 ### 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | # contextualization 32 | serialize: false ### 33 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 12 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 400 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 
| warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | # contextualization 32 | serialize: false ### 33 | -------------------------------------------------------------------------------- /examples/swbd/s5c/local/MSU_single_letter.txt: -------------------------------------------------------------------------------- 1 | A ey 2 | B b iy 3 | C s iy 4 | D d iy 5 | E iy 6 | F eh f 7 | G jh iy 8 | H ey ch 9 | I ay 10 | J jh ey 11 | K k ey 12 | L eh l 13 | M eh m 14 | N eh n 15 | O ow 16 | P p iy 17 | Q k y uw 18 | R aa r 19 | S eh s 20 | T t iy 21 | U y uw 22 | V v iy 23 | W d ah b ax l y uw 24 | X eh k s 25 | Y w ay 26 | Z z iy 27 | -------------------------------------------------------------------------------- /examples/swbd/s5c/local/swbd1_data_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Switchboard-1 training data preparation customized for Edinburgh 4 | # Author: Arnab Ghoshal (Jan 2013) 5 | 6 | # To be run from one directory above this script. 7 | 8 | ## The input is some directory containing the switchboard-1 release 2 9 | ## corpus (LDC97S62). Note: we don't make many assumptions about how 10 | ## you unpacked this. We are just doing a "find" command to locate 11 | ## the .sph files. 12 | 13 | . ./path.sh 14 | 15 | #check existing directories 16 | if [ $# != 1 ]; then 17 | echo "Usage: swbd1_data_download.sh /path/to/SWBD" 18 | exit 1; 19 | fi 20 | 21 | SWBD_DIR=$1 22 | 23 | dir=${data}/local/train_swbd 24 | mkdir -p $dir 25 | 26 | # Audio data directory check 27 | if [ ! -d $SWBD_DIR ]; then 28 | echo "Error: run.sh requires a directory argument" 29 | exit 1; 30 | fi 31 | 32 | # Trans directory check 33 | if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then 34 | ( 35 | cd $dir; 36 | if [ ! -d swb_ms98_transcriptions ]; then 37 | echo " *** Downloading trascriptions and dictionary ***" 38 | wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || 39 | wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz 40 | tar -xf switchboard_word_alignments.tar.gz 41 | fi 42 | ) 43 | else 44 | echo "Directory with transcriptions exists, skipping downloading" 45 | [ -f $dir/swb_ms98_transcriptions ] \ 46 | || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ 47 | fi 48 | -------------------------------------------------------------------------------- /examples/swbd/s5c/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/swbd/s5c/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/swbd/s5c/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/las/lstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | attn_type: location 17 | attn_conv_n_channels: 10 18 | attn_conv_width: 201 19 | attn_dim: 512 20 | attn_n_heads: 1 21 | dec_type: lstm 22 | dec_n_units: 1024 23 | dec_n_projs: 0 24 | dec_n_layers: 1 25 | dec_bottleneck_dim: 1024 ### this is effective 26 | emb_dim: 512 27 | tie_embedding: false 28 | ctc_fc_list: "512" 29 | ### optimization 30 | batch_size: 30 31 | optimizer: adam 32 | n_epochs: 35 # 20->35 33 | convert_to_sgd_epoch: 100 34 | print_step: 600 # 200->600 35 | metric: edit_distance 36 | lr: 1e-3 37 | lr_decay_type: always 38 | lr_decay_start_epoch: 10 39 | lr_decay_rate: 0.85 ### 0.8->0.85 40 | lr_decay_patient_n_epochs: 0 41 | early_stop_patient_n_epochs: 5 42 | sort_stop_epoch: 100 43 | eval_start_epoch: 1 44 | warmup_start_lr: 1e-4 45 | warmup_n_steps: 4000 46 | ### initialization 47 | param_init: 0.1 48 | ### regularization 49 | clip_grad_norm: 5.0 50 | dropout_in: 0.0 51 | dropout_enc: 0.4 52 | dropout_dec: 0.4 53 | dropout_emb: 0.4 54 | dropout_att: 0.0 55 | weight_decay: 1e-6 56 | ss_prob: 0.2 57 | lsm_prob: 0.1 58 | ### MTL 59 | ctc_weight: 0.3 60 | ctc_lsm_prob: 0.1 61 | mtl_per_batch: false 62 | task_specific_layer: false 63 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/lstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | conv_batch_norm: false 11 | subsample: "1_1_1_1_1" 12 | enc_type: conv_lstm 13 | enc_n_units: 1024 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | attn_type: location 18 | attn_conv_n_channels: 10 19 | attn_conv_width: 201 20 | attn_dim: 512 21 | attn_n_heads: 1 22 | dec_type: lstm 23 | dec_n_units: 1024 24 | dec_n_projs: 0 25 | dec_n_layers: 1 26 | dec_bottleneck_dim: 1024 ### this is effective 27 | emb_dim: 512 28 | tie_embedding: false 29 | ctc_fc_list: "512" 30 | ### optimization 31 | batch_size: 30 32 | optimizer: adam 33 | n_epochs: 35 # 20->35 34 | convert_to_sgd_epoch: 100 35 | print_step: 600 # 200->600 36 | metric: edit_distance 37 | lr: 1e-3 38 | lr_decay_type: always 39 | lr_decay_start_epoch: 10 40 | lr_decay_rate: 0.85 ### 41 | lr_decay_patient_n_epochs: 0 42 | early_stop_patient_n_epochs: 5 43 | sort_stop_epoch: 100 44 | eval_start_epoch: 1 45 | warmup_start_lr: 1e-4 46 | warmup_n_steps: 4000 47 | ### initialization 48 | param_init: 0.1 49 | ### 
regularization 50 | clip_grad_norm: 5.0 51 | dropout_in: 0.0 52 | dropout_enc: 0.4 53 | dropout_dec: 0.4 54 | dropout_emb: 0.4 55 | dropout_att: 0.0 56 | weight_decay: 1e-6 57 | ss_prob: 0.2 58 | lsm_prob: 0.1 59 | ### MTL 60 | ctc_weight: 0.3 61 | ctc_lsm_prob: 0.1 62 | mtl_per_batch: false 63 | task_specific_layer: false 64 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/blstm_rnnt_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 15 29 | optimizer: adam 30 | n_epochs: 20 # 25->20 31 | convert_to_sgd_epoch: 100 32 | print_step: 600 # 200->600 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.8 ### 0.85->0.8 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/lcblstm_rnnt_40_20_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 20 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 20 # 25->20 31 | convert_to_sgd_epoch: 100 32 | print_step: 600 # 200->600 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.8 ### 0.85->0.8 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | 
weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/lcblstm_rnnt_40_40_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 20 # 25->20 31 | convert_to_sgd_epoch: 100 32 | print_step: 600 # 200->600 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.8 ### 0.85->0.8 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/lstm_rnnt_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | dec_type: lstm_transducer 17 | dec_n_units: 1024 18 | dec_n_projs: 0 19 | dec_n_layers: 2 20 | dec_bottleneck_dim: 512 21 | emb_dim: 512 22 | tie_embedding: false 23 | ctc_fc_list: "512" 24 | ### optimization 25 | batch_size: 15 26 | optimizer: adam 27 | n_epochs: 35 # 20->35 28 | convert_to_sgd_epoch: 100 29 | print_step: 600 # 200->600 30 | metric: edit_distance 31 | lr: 1e-3 32 | lr_decay_type: always 33 | lr_decay_start_epoch: 10 34 | lr_decay_rate: 0.85 ### 0.8->0.85 35 | lr_decay_patient_n_epochs: 0 36 | early_stop_patient_n_epochs: 5 37 | sort_stop_epoch: 100 38 | eval_start_epoch: 1 39 | warmup_start_lr: 1e-4 40 | warmup_n_steps: 4000 41 | ### initialization 42 | param_init: 0.1 43 | ### regularization 44 | clip_grad_norm: 5.0 45 | dropout_in: 0.0 46 | dropout_enc: 0.4 47 | dropout_dec: 0.4 48 | dropout_emb: 0.4 49 | weight_decay: 1e-6 50 | ### MTL 51 | ctc_weight: 0.3 52 | ctc_lsm_prob: 0.1 53 | mtl_per_batch: false 54 | task_specific_layer: false 55 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/pretrain.yaml: 
-------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 15 # 20->15 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb_pretrain_F13_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 13 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 60 16 | convert_to_sgd_epoch: 100 17 | print_step: 400 18 | lr: 1e-3 19 | lr_decay_start_epoch: 5 20 | lr_decay_rate: 0.95 ### 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.2 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | 
lsm_prob: 0.0 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Johns Hopkins University (author: Daniel Povey) 6 | # Apache 2.0 7 | 8 | mkdir -p ${db} 9 | 10 | cd ${db} ### Note: the rest of this script is executed from the directory '${db}'. 11 | 12 | # TED-LIUM database: 13 | if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then 14 | if [ ! -e TEDLIUM_release2 ]; then 15 | ln -sf /export/corpora5/TEDLIUM_release2 16 | fi 17 | echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release2" 18 | else 19 | if [ ! -e TEDLIUM_release2 ]; then 20 | echo "$0: downloading TEDLIUM_release2 data (it won't re-download if it was already downloaded.)" 21 | # the following command won't re-get it if it's already there 22 | # because of the --continue switch. 23 | wget --continue http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz || exit 1 24 | tar xf "TEDLIUM_release2.tar.gz" 25 | else 26 | echo "$0: not downloading or un-tarring TEDLIUM_release2 because it already exists." 27 | fi 28 | fi 29 | 30 | 31 | num_sph=$(find TEDLIUM_release2/ -name '*.sph' | wc -l) 32 | if [ "$num_sph" != 1514 ]; then 33 | echo "$0: expected to find 1514 .sph files in the directory ${db}/TEDLIUM_release2, found $num_sph" 34 | exit 1 35 | fi 36 | 37 | exit 0 38 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # Apache 2.0 5 | 6 | if [ -f path.sh ]; then . ./path.sh; fi 7 | 8 | 9 | small_arpa_lm=${data}/local/local_lm/data/arpa/4gram_small.arpa.gz 10 | big_arpa_lm=${data}/local/local_lm/data/arpa/4gram_big.arpa.gz 11 | 12 | for f in $small_arpa_lm $big_arpa_lm ${data}/lang_nosp/words.txt; do 13 | [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 14 | done 15 | 16 | 17 | set -e 18 | 19 | if [ -f ${data}/lang_nosp/G.fst ] && [ ${data}/lang_nosp/G.fst -nt $small_arpa_lm ]; then 20 | echo "$0: not regenerating ${data}/lang_nosp/G.fst as it already exists and " 21 | echo ".. is newer than the source LM." 22 | else 23 | arpa2fst --disambig-symbol=#0 --read-symbol-table=${data}/lang_nosp/words.txt \ 24 | "gunzip -c $small_arpa_lm|" ${data}/lang_nosp/G.fst 25 | echo "$0: Checking how stochastic G is (the first of these numbers should be small):" 26 | fstisstochastic ${data}/lang_nosp/G.fst || true 27 | utils/validate_lang.pl --skip-determinization-check ${data}/lang_nosp 28 | fi 29 | 30 | 31 | 32 | if [ -f ${data}/lang_nosp_rescore/G.carpa ] && [ ${data}/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ 33 | [ ${data}/lang_nosp_rescore/G.carpa -nt ${data}/lang_nosp/words.txt ]; then 34 | echo "$0: not regenerating ${data}/lang_nosp_rescore/ as it seems to already be up to date."
35 | else 36 | utils/build_const_arpa_lm.sh $big_arpa_lm ${data}/lang_nosp ${data}/lang_nosp_rescore || exit 1; 37 | fi 38 | 39 | exit 0; 40 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/join_suffix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2016 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | 8 | from __future__ import print_function 9 | import sys 10 | from codecs import open 11 | 12 | # This script joins together pairs of split-up words like "you 're" -> "you're". 13 | # The TEDLIUM transcripts are normalized in a way that's not traditional for 14 | # speech recognition. 15 | 16 | for line in sys.stdin: 17 | items = line.split() 18 | new_items = [] 19 | i = 1 20 | while i < len(items): 21 | if i < len(items) - 1 and items[i+1][0] == '\'': 22 | new_items.append(items[i] + items[i+1]) 23 | i = i + 1 24 | else: 25 | new_items.append(items[i]) 26 | i = i + 1 27 | print(items[0] + ' ' + ' '.join(new_items)) 28 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Daniel Galvez 6 | # 2016 Vincent Nguyen 7 | # Apache 2.0 8 | # 9 | 10 | dir=${data}/local/dict_nosp 11 | mkdir -p $dir 12 | 13 | srcdict=${db}/TEDLIUM_release2/TEDLIUM.152k.dic 14 | 15 | [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 16 | 17 | # Join dicts and fix some troubles 18 | cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \ 19 | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt 20 | 21 | cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ 22 | grep -v SIL | sort > $dir/nonsilence_phones.txt 23 | 24 | ( echo SIL; echo NSN ) > $dir/silence_phones.txt 25 | 26 | echo SIL > $dir/optional_silence.txt 27 | 28 | # No "extra questions" in the input to this setup, as we don't 29 | # have stress or tone. 30 | echo -n >$dir/extra_questions.txt 31 | 32 | # Add to the lexicon the silences, noises etc. 33 | # Typically, you would use "<UNK> NSN" here, but the Cantab Research language models 34 | # use <unk> instead of <UNK> to represent out of vocabulary words. 35 | echo '<unk> NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt 36 | 37 | # Check that the dict dir is okay! 38 | utils/validate_dict_dir.pl $dir || exit 1 39 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/ted_download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2018 David Snyder 4 | # Apache 2.0 5 | # 6 | # This script downloads pre-built language models trained on the Cantab-Tedlium 7 | # text data and Tedlium acoustic training data. If you want to build these 8 | # models yourself, run the script local/ted_train_lm.sh.
9 | 10 | set -e 11 | 12 | echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" 13 | wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P ${data}/local/local_lm/data/arpa || exit 1 14 | wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P ${data}/local/local_lm/data/arpa || exit 1 15 | 16 | exit 0 17 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$KALDI_ROOT/tools/sph2pipe_v2.5/:$TOOL/sentencepiece/build/src:$PATH 9 | export PATH=$TOOL/mwerSegmenter/:$TOOL/moses/scripts/tokenizer/:$TOOL/moses/scripts/generic/:$PATH 10 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 11 | . $KALDI_ROOT/tools/config/common_path.sh 12 | export LC_ALL=C 13 | 14 | ### Python 15 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 16 | export PYTHONDONTWRITEBYTECODE=1 17 | export OMP_NUM_THREADS=1 18 | 19 | ### CUDA 20 | CUDAROOT=/usr/local/cuda 21 | NCCL_ROOT=/usr/local/nccl 22 | export CPATH=$NCCL_ROOT/include:$CPATH 23 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 24 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 25 | export CUDA_HOME=$CUDAROOT 26 | export CUDA_PATH=$CUDAROOT 27 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 400 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.2 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.0 34 | backward: false 35 | adaptive_softmax: false 36 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 3 | convert_to_sgd_epoch: 50 4 | print_step: 400 5 | lr_decay_start_epoch: 15 6 | lr_decay_rate: 0.9 7 | 8 | # mask 9 | freq_width: 27 10 | n_freq_masks: 2 11 | time_width: 100 12 | n_time_masks: 2 13 | time_width_upper: 1.0 14 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 3 | convert_to_sgd_epoch: 40 4 | print_step: 1200 5 | lr_decay_start_epoch: 7 6 | lr_decay_rate: 0.875 7 | 8 | # mask 9 | freq_width: 27 10 | n_freq_masks: 2 11 | time_width: 100 12 | n_time_masks: 2 13 | time_width_upper: 1.0 14 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 3 | convert_to_sgd_epoch: 15 4 | print_step: 600 5 | lr_decay_start_epoch: 5 6 | lr_decay_rate: 0.8 7 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Nickolay V. 
Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Johns Hopkins University (author: Daniel Povey) 6 | # Apache 2.0 7 | 8 | mkdir -p ${db} 9 | 10 | cd ${db} ### Note: the rest of this script is executed from the directory '${db}'. 11 | 12 | # TED-LIUM database: 13 | if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then 14 | if [ ! -e TEDLIUM_release-3 ]; then 15 | ln -sf /export/corpora5/TEDLIUM_release-3 16 | fi 17 | echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3" 18 | else 19 | if [ ! -e TEDLIUM_release-3 ]; then 20 | echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)" 21 | # the following command won't re-get it if it's already there 22 | # because of the --continue switch. 23 | wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1 24 | 25 | echo "$0: extracting TEDLIUM_release-3 data" 26 | tar xf "TEDLIUM_release-3.tgz" 27 | else 28 | echo "$0: not downloading or un-tarring TEDLIUM_release-3 because it already exists." 29 | fi 30 | fi 31 | 32 | 33 | num_sph=$(find TEDLIUM_release-3/data -name '*.sph' | wc -l) 34 | if [ "$num_sph" != 2351 ]; then 35 | echo "$0: expected to find 2351 .sph files in the directory ${db}/TEDLIUM_release-3, found $num_sph" 36 | exit 1 37 | fi 38 | 39 | exit 0 40 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # Apache 2.0 5 | 6 | if [ -f path.sh ]; then . path.sh; fi 7 | 8 | 9 | small_arpa_lm=${data}/local/local_lm/${data}/arpa/4gram_small.arpa.gz 10 | big_arpa_lm=${data}/local/local_lm/${data}/arpa/4gram_big.arpa.gz 11 | 12 | for f in $small_arpa_lm $big_arpa_lm ${data}/lang_nosp/words.txt; do 13 | [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 14 | done 15 | 16 | 17 | set -e 18 | 19 | if [ -f ${data}/lang_nosp/G.fst ] && [ ${data}/lang_nosp/G.fst -nt $small_arpa_lm ]; then 20 | echo "$0: not regenerating ${data}/lang_nosp/G.fst as it already exists and " 21 | echo ".. is newer than the source LM." 22 | else 23 | arpa2fst --disambig-symbol=#0 --read-symbol-table=${data}/lang_nosp/words.txt \ 24 | "gunzip -c $small_arpa_lm|" ${data}/lang_nosp/G.fst 25 | echo "$0: Checking how stochastic G is (the first of these numbers should be small):" 26 | fstisstochastic ${data}/lang_nosp/G.fst || true 27 | utils/validate_lang.pl --skip-determinization-check ${data}/lang_nosp 28 | fi 29 | 30 | 31 | 32 | if [ -f ${data}/lang_nosp_rescore/G.carpa ] && [ ${data}/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ 33 | [ ${data}/lang_nosp_rescore/G.carpa -nt ${data}/lang_nosp/words.txt ]; then 34 | echo "$0: not regenerating ${data}/lang_nosp_rescore/ as it seems to already be up to date." 35 | else 36 | utils/build_const_arpa_lm.sh $big_arpa_lm ${data}/lang_nosp ${data}/lang_nosp_rescore || exit 1; 37 | fi 38 | 39 | exit 0; 40 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/join_suffix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2014 Nickolay V.
Shmyrev 4 | # 2016 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | 8 | from __future__ import print_function 9 | import sys 10 | from codecs import open 11 | 12 | # This script joins together pairs of split-up words like "you 're" -> "you're". 13 | # The TEDLIUM transcripts are normalized in a way that's not traditional for 14 | # speech recognition. 15 | 16 | for line in sys.stdin: 17 | items = line.split() 18 | new_items = [] 19 | i = 1 20 | while i < len(items): 21 | if i < len(items) - 1 and items[i+1][0] == '\'': 22 | new_items.append(items[i] + items[i+1]) 23 | i = i + 1 24 | else: 25 | new_items.append(items[i]) 26 | i = i + 1 27 | print(items[0] + ' ' + ' '.join(new_items)) 28 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Daniel Galvez 6 | # 2016 Vincent Nguyen 7 | # Apache 2.0 8 | # 9 | 10 | dir=${data}/local/dict_nosp 11 | mkdir -p $dir 12 | 13 | srcdict=${db}/TEDLIUM_release-3/TEDLIUM.152k.dic 14 | 15 | [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 16 | 17 | # Join dicts and fix some troubles 18 | cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \ 19 | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt 20 | 21 | cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ 22 | grep -v SIL | sort > $dir/nonsilence_phones.txt 23 | 24 | ( echo SIL; echo NSN ) > $dir/silence_phones.txt 25 | 26 | echo SIL > $dir/optional_silence.txt 27 | 28 | # No "extra questions" in the input to this setup, as we don't 29 | # have stress or tone. 30 | echo -n >$dir/extra_questions.txt 31 | 32 | # Add to the lexicon the silences, noises etc. 33 | # Typically, you would use "<UNK> NSN" here, but the Cantab Research language models 34 | # use <unk> instead of <UNK> to represent out of vocabulary words. 35 | echo '<unk> NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt 36 | 37 | # Check that the dict dir is okay! 38 | utils/validate_dict_dir.pl $dir || exit 1 39 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/ted_download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2018 David Snyder 4 | # Apache 2.0 5 | # 6 | # This script downloads pre-built language models trained on the Cantab-Tedlium 7 | # text data and Tedlium acoustic training data. If you want to build these 8 | # models yourself, run the script local/ted_train_lm.sh. 9 | 10 | set -e 11 | 12 | echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" 13 | wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P ${data}/local/local_lm/${data}/arpa || exit 1 14 | wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P ${data}/local/local_lm/${data}/arpa || exit 1 15 | 16 | exit 0 17 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../..
2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/timit/s5/RESULTS.md: -------------------------------------------------------------------------------- 1 | # Use caution when comparing these results with other published results. 2 | Training Set : 3696 sentences 3 | Dev Set : 400 sentences 4 | Test Set : 192 sentences Core Test Set (different from Full 1680 sent. set) 5 | Language Model : no 6 | Phone mapping : Training with 61 phonemes, for testing mapped to 39 phonemes 7 | 8 | 9 | ### BLSTM-CTC 10 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 11 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 12 | |dev|Sum/Avg|400|15334|80.7|15.0|4.3|2.3|21.7|99.3| 13 | |test|Sum/Avg|192|7333|79.6|15.4|5.0|2.4|22.8|99.5| 14 | 15 | ### Transformer + SpecAugment 16 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 17 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 18 | |dev|Sum/Avg|400|15334|81.3|15.4|3.3|2.9|**21.7**|99.8| 19 | |test|Sum/Avg|192|7333|80.2|15.9|4.0|3.3|**23.1**|100.0| 20 | 21 | ### Transformer + SpecAugment + relative positional encoding (encoder) 22 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 23 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 24 | |dev|Sum/Avg|400|15334|82.3|14.6|3.0|2.7|**20.4**|99.5| 25 | |test|Sum/Avg|192|7333|81.7|15.0|3.3|3.1|**21.4**|98.4| 26 | -------------------------------------------------------------------------------- /examples/timit/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. 
Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/blstm_ctc.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | input_noise_std: 0 6 | conv_in_channel: 3 7 | conv_channels: "32_32" 8 | conv_kernel_sizes: "(3,3)_(3,3)" 9 | conv_strides: "(1,1)_(1,1)" 10 | conv_poolings: "(1,1)_(1,1)" 11 | subsample: "1_1_1_1_1" 12 | enc_type: blstm 13 | bidirectional_sum_fwd_bwd: false 14 | enc_n_units: 256 15 | enc_n_projs: 0 16 | enc_n_layers: 5 17 | subsample_type: drop 18 | ctc_fc_list: "" 19 | ### optimization 20 | batch_size: 32 21 | optimizer: adam 22 | n_epochs: 100 23 | convert_to_sgd_epoch: 90 24 | print_step: 20 25 | metric: edit_distance 26 | lr: 1e-3 27 | lr_decay_type: always 28 | lr_decay_start_epoch: 20 29 | lr_decay_rate: 0.97 30 | lr_decay_patient_n_epochs: 0 31 | early_stop_patient_n_epochs: 20 32 | sort_stop_epoch: 100 33 | eval_start_epoch: 1 34 | warmup_start_lr: 1e-4 35 | warmup_n_steps: 0 36 | ### initialization 37 | param_init: 0.1 38 | ### regularization 39 | clip_grad_norm: 5.0 40 | dropout_in: 0.2 41 | dropout_enc: 0.5 42 | weight_decay: 1e-6 43 | ### MTL 44 | ctc_weight: 1.0 45 | ctc_lsm_prob: 0.0 46 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | input_noise_std: 0 6 | conv_in_channel: 3 7 | conv_channels: "32_32" 8 | conv_kernel_sizes: "(3,3)_(3,3)" 9 | conv_strides: "(1,1)_(1,1)" 10 | conv_poolings: "(1,1)_(1,1)" 11 | subsample: "1_1_1_1_1" 12 | enc_type: blstm 13 | bidirectional_sum_fwd_bwd: false 14 | enc_n_units: 256 15 | enc_n_projs: 0 16 | enc_n_layers: 5 17 | subsample_type: drop 18 | attn_type: location 19 | attn_conv_n_channels: 10 20 | attn_conv_width: 201 21 | attn_dim: 256 22 | attn_n_heads: 1 23 | dec_type: lstm 24 | dec_n_units: 256 25 | dec_n_projs: 0 26 | dec_n_layers: 1 27 | dec_bottleneck_dim: 256 28 | emb_dim: 256 29 | tie_embedding: false 30 | ctc_fc_list: "" 31 | ### optimization 32 | batch_size: 32 33 | optimizer: adam 34 | n_epochs: 100 35 | convert_to_sgd_epoch: 90 36 | print_step: 20 37 | metric: edit_distance 38 | lr: 1e-3 39 | lr_decay_type: always 40 | lr_decay_start_epoch: 20 41 | lr_decay_rate: 0.97 42 | lr_decay_patient_n_epochs: 0 43 | early_stop_patient_n_epochs: 20 44 | sort_stop_epoch: 100 45 | eval_start_epoch: 20 46 | warmup_start_lr: 1e-4 47 | warmup_n_steps: 0 48 | ### initialization 49 | param_init: 0.1 50 | ### regularization 51 | clip_grad_norm: 5.0 52 | dropout_in: 
0.2 53 | dropout_enc: 0.5 54 | dropout_dec: 0.2 55 | dropout_emb: 0.2 56 | dropout_att: 0.0 57 | weight_decay: 1e-6 58 | ss_prob: 0.0 59 | lsm_prob: 0.0 60 | ### MTL 61 | ctc_weight: 0.0 62 | ctc_lsm_prob: 0.0 63 | mtl_per_batch: false 64 | task_specific_layer: false 65 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/dev_spk.list: -------------------------------------------------------------------------------- 1 | faks0 2 | fdac1 3 | fjem0 4 | mgwt0 5 | mjar0 6 | mmdb1 7 | mmdm2 8 | mpdf0 9 | fcmh0 10 | fkms0 11 | mbdg0 12 | mbwm0 13 | mcsh0 14 | fadg0 15 | fdms0 16 | fedw0 17 | mgjf0 18 | mglb0 19 | mrtk0 20 | mtaa0 21 | mtdt0 22 | mthc0 23 | mwjg0 24 | fnmr0 25 | frew0 26 | fsem0 27 | mbns0 28 | mmjr0 29 | mdls0 30 | mdlf0 31 | mdvc0 32 | mers0 33 | fmah0 34 | fdrw0 35 | mrcs0 36 | mrjm4 37 | fcal1 38 | mmwh0 39 | fjsj0 40 | majc0 41 | mjsw0 42 | mreb0 43 | fgjd0 44 | fjmg0 45 | mroa0 46 | mteb0 47 | mjfc0 48 | mrjr0 49 | fmml0 50 | mrws1 51 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=40 6 | --use-energy=true 7 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/phones.60-48-39.map: -------------------------------------------------------------------------------- 1 | aa aa aa 2 | ae ae ae 3 | ah ah ah 4 | ao ao aa 5 | aw aw aw 6 | ax ax ah 7 | ax-h ax ah 8 | axr er er 9 | ay ay ay 10 | b b b 11 | bcl vcl sil 12 | ch ch ch 13 | d d d 14 | dcl vcl sil 15 | dh dh dh 16 | dx dx dx 17 | eh eh eh 18 | el el l 19 | em m m 20 | en en n 21 | eng ng ng 22 | epi epi sil 23 | er er er 24 | ey ey ey 25 | f f f 26 | g g g 27 | gcl vcl sil 28 | h# sil sil 29 | hh hh hh 30 | hv hh hh 31 | ih ih ih 32 | ix ix ih 33 | iy iy iy 34 | jh jh jh 35 | k k k 36 | kcl cl sil 37 | l l l 38 | m m m 39 | n n n 40 | ng ng ng 41 | nx n n 42 | ow ow ow 43 | oy oy oy 44 | p p p 45 | pau sil sil 46 | pcl cl sil 47 | q 48 | r r r 49 | s s s 50 | sh sh sh 51 | t t t 52 | tcl cl sil 53 | th th th 54 | uh uh uh 55 | uw uw uw 56 | ux uw uw 57 | v v v 58 | w w w 59 | y y y 60 | z z z 61 | zh zh sh 62 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/rnn_transducer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | input_noise_std: 0 6 | conv_in_channel: 3 7 | conv_channels: "32_32" 8 | conv_kernel_sizes: "(3,3)_(3,3)" 9 | conv_strides: "(1,1)_(1,1)" 10 | conv_poolings: "(1,1)_(1,1)" 11 | subsample: "1_1_1_1_1" 12 | enc_type: blstm 13 | bidirectional_sum_fwd_bwd: true 14 | enc_n_units: 256 15 | enc_n_projs: 0 16 | enc_n_layers: 5 17 | subsample_type: drop 18 | dec_type: lstm_transducer 19 | dec_n_units: 256 20 | dec_n_projs: 0 21 | dec_n_layers: 1 22 | dec_bottleneck_dim: 256 23 | emb_dim: 256 24 | tie_embedding: false 25 | ctc_fc_list: "" 26 | ### optimization 27 | batch_size: 32 28 | optimizer: adam 29 | n_epochs: 100 30 | convert_to_sgd_epoch: 90 31 | print_step: 20 32 | metric: edit_distance 33 | lr: 1e-3 34 | lr_decay_type: always 35 | lr_decay_start_epoch: 20 36 | lr_decay_rate: 0.97 37 | lr_decay_patient_n_epochs: 0 38 
| early_stop_patient_n_epochs: 20 39 | sort_stop_epoch: 100 40 | eval_start_epoch: 20 41 | warmup_start_lr: 1e-4 42 | warmup_n_steps: 0 43 | ### initialization 44 | param_init: 0.1 45 | ### regularization 46 | clip_grad_norm: 5.0 47 | dropout_in: 0.2 48 | dropout_enc: 0.5 49 | dropout_dec: 0.2 50 | dropout_emb: 0.2 51 | weight_decay: 1e-6 52 | ### MTL 53 | ctc_weight: 0.0 54 | ctc_lsm_prob: 0.0 55 | mtl_per_batch: false 56 | task_specific_layer: false 57 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/test_spk.list: -------------------------------------------------------------------------------- 1 | mdab0 2 | mwbt0 3 | felc0 4 | mtas1 5 | mwew0 6 | fpas0 7 | mjmp0 8 | mlnt0 9 | fpkt0 10 | mlll0 11 | mtls0 12 | fjlm0 13 | mbpm0 14 | mklt0 15 | fnlp0 16 | mcmj0 17 | mjdh0 18 | fmgd0 19 | mgrt0 20 | mnjm0 21 | fdhc0 22 | mjln0 23 | mpam0 24 | fmld0 25 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | conv_in_channel: 3 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_attn_type: scaled_dot 19 | transformer_dec_pe_type: 1dconv3L ### this is effective 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 250 29 | convert_to_sgd_epoch: 1000 30 | print_step: 20 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 1000 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 2 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.2 42 | dropout_enc: 0.5 43 | dropout_dec: 0.5 44 | dropout_emb: 0.2 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/transformer_relative.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | conv_in_channel: 3 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: relative ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_attn_type: scaled_dot 19 | transformer_dec_pe_type: 1dconv3L ### this is effective 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: 
false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 200 29 | convert_to_sgd_epoch: 1000 30 | print_step: 20 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 1000 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 2 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.2 42 | dropout_enc: 0.5 43 | dropout_dec: 0.5 44 | dropout_emb: 0.2 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/timit/s5/local/plot_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | model= 7 | gpu= 8 | stdout=false 9 | n_threads=1 10 | 11 | ### path to save preprocessed data 12 | data=/n/work2/inaguma/corpus/timit 13 | 14 | batch_size=1 15 | 16 | . ./cmd.sh 17 | . ./path.sh 18 | . utils/parse_options.sh 19 | 20 | set -e 21 | set -u 22 | set -o pipefail 23 | 24 | if [ -z ${gpu} ]; then 25 | # CPU 26 | n_gpus=0 27 | export OMP_NUM_THREADS=${n_threads} 28 | else 29 | n_gpus=$(echo ${gpu} | tr "," "\n" | wc -l) 30 | fi 31 | 32 | for set in dev test; do 33 | recog_dir=$(dirname ${model})/plot_${set} 34 | mkdir -p ${recog_dir} 35 | 36 | CUDA_VISIBLE_DEVICES=${gpu} ${NEURALSP_ROOT}/neural_sp/bin/asr/plot_ctc.py \ 37 | --recog_n_gpus ${n_gpus} \ 38 | --recog_sets ${data}/dataset/${set}.csv \ 39 | --recog_dir ${recog_dir} \ 40 | --recog_model ${model} \ 41 | --recog_batch_size ${batch_size} \ 42 | --recog_stdout ${stdout} || exit 1; 43 | done 44 | -------------------------------------------------------------------------------- /examples/timit/s5/local/score_sclite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | .
utils/parse_options.sh 9 | 10 | if [ $# != 1 ]; then 11 | echo "Usage: $0 "; 12 | exit 1; 13 | fi 14 | 15 | decode_dir=$1 16 | phonemap="conf/phones.60-48-39.map" 17 | 18 | # Map reference to 39 phone classes: 19 | cat ${decode_dir}/ref.trn | local/timit_norm_trans.pl -i - -m ${phonemap} -from 60 -to 39 > ${decode_dir}/ref.trn.filt 20 | cat ${decode_dir}/hyp.trn | local/timit_norm_trans.pl -i - -m ${phonemap} -from 60 -to 39 > ${decode_dir}/hyp.trn.filt 21 | 22 | sed -e "s// /g" ${decode_dir}/ref.trn.filt > ${decode_dir}/ref.trn.filt.clean 23 | sed -e "s// /g" ${decode_dir}/hyp.trn.filt > ${decode_dir}/hyp.trn.filt.clean 24 | 25 | sclite -r ${decode_dir}/ref.trn trn -h ${decode_dir}/hyp.trn trn -i rm -o all stdout > ${decode_dir}/result.txt 26 | grep -e Avg -e SPKR -m 2 ${decode_dir}/result.txt 27 | -------------------------------------------------------------------------------- /examples/timit/s5/local/timit_format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013 (Author: Daniel Povey) 4 | # Apache 2.0 5 | 6 | # This script takes data prepared in a corpus-dependent way 7 | # in data/local/, and converts it into the "canonical" form, 8 | # in various subdirectories of data/, e.g. data/lang, data/train, etc. 9 | 10 | . ./path.sh || exit 1; 11 | 12 | echo "Preparing train, dev and test data" 13 | srcdir=${data}/local/data 14 | 15 | for x in train dev test; do 16 | mkdir -p ${data}/$x 17 | cp $srcdir/${x}_wav.scp ${data}/$x/wav.scp || exit 1; 18 | cp $srcdir/$x.text ${data}/$x/text || exit 1; 19 | cp $srcdir/$x.spk2utt ${data}/$x/spk2utt || exit 1; 20 | cp $srcdir/$x.utt2spk ${data}/$x/utt2spk || exit 1; 21 | utils/filter_scp.pl ${data}/$x/spk2utt $srcdir/$x.spk2gender > ${data}/$x/spk2gender || exit 1; 22 | cp $srcdir/${x}.stm ${data}/$x/stm 23 | cp $srcdir/${x}.glm ${data}/$x/glm 24 | utils/validate_data_dir.sh --no-feats ${data}/$x || exit 1 25 | 26 | cp $srcdir/${x}.spk2gender ${data}/$x/spk2gender # added 27 | done 28 | 29 | echo "Succeeded in formatting data." 30 | -------------------------------------------------------------------------------- /examples/timit/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/timit/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/timit/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/wsj/README.txt: -------------------------------------------------------------------------------- 1 | 2 | About the Wall Street Journal corpus: 3 | This is a corpus of read 4 | sentences from the Wall Street Journal, recorded under clean conditions. 5 | The vocabulary is quite large. About 80 hours of training data. 6 | Available from the LDC as either: [ catalog numbers LDC93S6A (WSJ0) and LDC94S13A (WSJ1) ] 7 | or: [ catalog numbers LDC93S6B (WSJ0) and LDC94S13B (WSJ1) ] 8 | The latter option is cheaper and includes only the Sennheiser 9 | microphone data (which is all we use in the example scripts). 10 | 11 | Each subdirectory of this directory contains the 12 | scripts for a sequence of experiments. [note: most of the older 13 | example scripts have been deleted, but are still available at 14 | ^/branches/complete]. 15 | 16 | s5: This is the current recommended recipe. 17 | 18 | -------------------------------------------------------------------------------- /examples/wsj/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/asr/glu_encoder.yaml: -------------------------------------------------------------------------------- 1 | enc_type: gated_conv 2 | conv_channels: "100_100_100_125_125_150_175_200_225_250_250_250_300_300_375" 3 | conv_kernel_sizes: "(13,1)_(3,1)_(4,1)_(5,1)_(6,1)_(7,1)_(8,1)_(9,1)_(10,1)_(11,1)_(12,1)_(13,1)_(14,1)_(15,1)_(21,1)" 4 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/asr/tds_encoder.yaml: -------------------------------------------------------------------------------- 1 | enc_type: tds 2 | conv_channels: "10_10_14_14_14_18_18_18_18_18_18" 3 | conv_kernel_sizes: "(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)" 4 | subsample: "1_1_1_1_1" 5 | clip_grad_norm: 15.0 6 | dropout_enc: 0.2 7 | lsm_prob: 0.05 8 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 120 29 | convert_to_sgd_epoch: 100 30 | print_step: 400 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 2 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 3 | print_step: 400 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 -------------------------------------------------------------------------------- 
/examples/wsj/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 -------------------------------------------------------------------------------- /examples/wsj/s5/conf/data/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 25->20 3 | print_step: 600 # 200->600 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/lm/gated_convlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: gated_conv_14B 3 | emb_dim: 128 4 | tie_embedding: false 5 | # optimization 6 | batch_size: 50 7 | bptt: 200 8 | optimizer: nesterov 9 | n_epochs: 50 10 | convert_to_sgd_epoch: 50 11 | print_step: 400 12 | lr: 1.0 13 | lr_decay_start_epoch: 10 14 | lr_decay_rate: 0.5 15 | lr_decay_patient_n_epochs: 0 16 | lr_decay_type: metric 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | # initialization 20 | param_init: 0.05 21 | # regularization 22 | clip_grad_norm: 0.1 23 | dropout_in: 0.2 24 | dropout_hidden: 0.2 25 | dropout_out: 0.0 26 | weight_decay: 1e-6 27 | lsm_prob: 0.1 28 | backward: false 29 | adaptive_softmax: false 30 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 50 16 | convert_to_sgd_epoch: 50 17 | print_step: 200 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.2 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 12 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 50 14 | convert_to_sgd_epoch: 50 15 | print_step: 200 16 | lr_factor: 10.0 
17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | weight_decay: 1e-6 28 | lsm_prob: 0.0 29 | backward: false 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/add_counts.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | 4 | # Add counts to an oovlist. 5 | # Reads in counts as output by uniq -c, and 6 | # an oovlist, and prints out the counts of the oovlist. 7 | 8 | (@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n"; 9 | 10 | $counts = shift @ARGV; 11 | 12 | open(C, "<$counts") || die "Opening counts file $counts"; 13 | 14 | while(<C>) { 15 | @A = split(" ", $_); 16 | @A == 2 || die "Bad line in counts file: $_"; 17 | ($count, $word) = @A; 18 | $count =~ m:^\d+$: || die "Bad count $A[0]\n"; 19 | $counts{$word} = $count; 20 | } 21 | 22 | while(<>) { 23 | chop; 24 | $w = $_; 25 | $w =~ m:\S+: || die "Bad word $w"; 26 | defined $counts{$w} || die "Word $w not present in counts file"; 27 | print "\t$counts{$w}\t$w\n"; 28 | } 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/count_rules.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # This program takes the output of score_prons.pl and collates 4 | # it for each (rule, destress) pair so that we get the 5 | # counts of right/partial/wrong for each pair. 6 | 7 | # The input is a 7-tuple on each line, like: 8 | # word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong 9 | # 10 | # The output format is a 5-tuple like: 11 | # 12 | # rule;destress;right-count;partial-count;wrong-count 13 | # 14 | 15 | if (@ARGV != 0 && @ARGV != 1) { 16 | die "Usage: count_rules.pl < scored_candidate_prons > rule_counts"; 17 | } 18 | 19 | 20 | while(<>) { 21 | chop; 22 | $line = $_; 23 | my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line); 24 | 25 | my $key = $rulename . ";" . $destress; 26 | 27 | if (!defined $counts{$key}) { 28 | $counts{$key} = [ 0, 0, 0 ]; # new anonymous array. 29 | } 30 | $ref = $counts{$key}; 31 | if ($score eq "right") { 32 | $$ref[0]++; 33 | } elsif ($score eq "partial") { 34 | $$ref[1]++; 35 | } elsif ($score eq "wrong") { 36 | $$ref[2]++; 37 | } else { 38 | die "Bad score $score\n"; 39 | } 40 | } 41 | 42 | while ( my ($key, $value) = each(%counts)) { 43 | print $key . ";" . join(";", @$value) . "\n"; 44 | } 45 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/filter_dict.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | 4 | # This program reads and writes either a dictionary or just a list 5 | # of words, and it removes any words containing ";" or "," as these 6 | # are used in these programs. It will warn about these. 7 | # It will die if the pronunciations have these symbols in.
8 | while(<>) { 9 | chop; 10 | @A = split(" ", $_); 11 | $word = shift @A; 12 | 13 | if ($word =~ m:[;,]:) { 14 | print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ; 15 | } else { 16 | $_ =~ m:[;,]: && die "Phones cannot have ; or , in them."; 17 | print $_ . "\n"; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/reverse_dict.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Used in conjunction with get_rules.pl 4 | # example input line: XANTHE Z AE1 N DH 5 | # example output line: EHTNAX DH N AE1 Z 6 | 7 | while(<>){ 8 | @A = split(" ", $_); 9 | $word = shift @A; 10 | $word = join("", reverse(split("", $word))); # Reverse letters of word. 11 | @A = reverse(@A); # Reverse phones in pron. 12 | unshift @A, $word; 13 | print join(" ", @A) . "\n"; 14 | } 15 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/flist2scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # takes in a file list with lines like 19 | # /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 20 | # and outputs an scp in kaldi format with lines like 21 | # 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 22 | # (the first thing is the utterance-id, which is the same as the basename of the file. 23 | 24 | 25 | while(<>){ 26 | m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_"; 27 | $id = $1; 28 | $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames) 29 | print "$id $_"; 30 | } 31 | -------------------------------------------------------------------------------- /examples/wsj/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/wsj/s5/steps: -------------------------------------------------------------------------------- 1 | ../../../tools/neural_sp/kaldi/egs/wsj/s5/steps -------------------------------------------------------------------------------- /examples/wsj/s5/utils: -------------------------------------------------------------------------------- 1 | ../../../tools/neural_sp/kaldi/egs/wsj/s5/utils -------------------------------------------------------------------------------- /neural_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/__init__.py -------------------------------------------------------------------------------- /neural_sp/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/bin/__init__.py -------------------------------------------------------------------------------- /neural_sp/bin/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/bin/asr/__init__.py -------------------------------------------------------------------------------- /neural_sp/bin/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/bin/lm/__init__.py -------------------------------------------------------------------------------- /neural_sp/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/datasets/__init__.py -------------------------------------------------------------------------------- /neural_sp/datasets/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/datasets/asr/__init__.py -------------------------------------------------------------------------------- /neural_sp/datasets/token_converter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/datasets/token_converter/__init__.py -------------------------------------------------------------------------------- /neural_sp/evaluators/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/evaluators/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/lm/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/lm/build.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Select a language model""" 5 | 6 | 7 | def build_lm(args, save_path=None, wordlm=False, lm_dict_path=None, asr_dict_path=None): 8 | """Select LM class. 9 | 10 | Args: 11 | args (): 12 | save_path (str): 13 | wordlm (bool): 14 | lm_dict_path (dict): 15 | asr_dict_path (dict): 16 | Returns: 17 | lm (): 18 | 19 | """ 20 | if 'gated_conv' in args.lm_type: 21 | from neural_sp.models.lm.gated_convlm import GatedConvLM 22 | lm = GatedConvLM(args, save_path) 23 | elif args.lm_type == 'transformer': 24 | from neural_sp.models.lm.transformerlm import TransformerLM 25 | lm = TransformerLM(args, save_path) 26 | elif args.lm_type == 'transformer_xl': 27 | from neural_sp.models.lm.transformer_xl import TransformerXL 28 | lm = TransformerXL(args, save_path) 29 | else: 30 | from neural_sp.models.lm.rnnlm import RNNLM 31 | lm = RNNLM(args, save_path) 32 | 33 | return lm 34 | -------------------------------------------------------------------------------- /neural_sp/models/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/modules/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/modules/gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Gaussian Error Linear Units (GELU) activation.""" 5 | 6 | import math 7 | import torch 8 | 9 | 10 | # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py 11 | def gelu_accurate(x): 12 | if not hasattr(gelu_accurate, "_a"): 13 | gelu_accurate._a = math.sqrt(2 / math.pi) 14 | return 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) 15 | 16 | 17 | def gelu(x): 18 | if hasattr(torch.nn.functional, 'gelu'): 19 | return torch.nn.functional.gelu(x.float()).type_as(x) 20 | else: 21 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 22 | -------------------------------------------------------------------------------- /neural_sp/models/modules/headdrop.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """HeadDrop regularization.""" 5 | 6 | import random 7 | 8 | random.seed(1) 9 | 10 | 11 | def headdrop(aws, n_heads, dropout): 12 | """HeadDrop regularization. 13 | 14 | Args: 15 | aws (FloatTensor): `[B, H, qlen, klen]` 16 | n_heads (int): number of attention heads 17 | dropout (float): HeadDrop probability 18 | Returns: 19 | aws (FloatTensor): `[B, H, qlen, klen]` 20 | 21 | """ 22 | n_effective_heads = n_heads 23 | head_mask = aws.new_ones(aws.size()).byte() 24 | for h in range(n_heads): 25 | if random.random() < dropout: 26 | head_mask[:, h] = 0 27 | n_effective_heads -= 1 28 | aws = aws.masked_fill_(head_mask == 0, 0) 29 | # Normalization 30 | if n_effective_heads > 0: 31 | aws = aws * (n_heads / n_effective_heads) 32 | return aws 33 | -------------------------------------------------------------------------------- /neural_sp/models/modules/mocha/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/modules/mocha/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/modules/softplus.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Softplus function.""" 5 | 6 | import torch 7 | 8 | 9 | def softplus(x): 10 | if hasattr(torch.nn.functional, 'softplus'): 11 | return torch.nn.functional.softplus(x.float()).type_as(x) 12 | else: 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /neural_sp/models/modules/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Swish activation. 
5 | See details in https://arxiv.org/abs/1710.05941.""" 6 | 7 | import torch 8 | 9 | 10 | class Swish(torch.nn.Module): 11 | def forward(self, x): 12 | return x * torch.sigmoid(x) 13 | -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/__init___.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/__init___.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/decoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/decoders/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/encoders/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/frontends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/frontends/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/frontends/input_noise.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Add Gaussian noise to input features.""" 5 | 6 | import torch 7 | 8 | 9 | def add_input_noise(xs, std): 10 | noise = torch.normal(xs.new_zeros(xs.shape[-1]), std) 11 | xs.data += noise 12 | return xs 13 | -------------------------------------------------------------------------------- /neural_sp/trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/trainers/__init__.py -------------------------------------------------------------------------------- /neural_sp/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Unility functions for general purposes.""" 5 | 6 | from pathlib import Path 7 | 8 | 9 | def mkdir_join(path, *dir_name, rank=0): 10 | """Concatenate root path and 1 or more paths, and make a new directory if the directory does not exist. 11 | Args: 12 | path (str): path to a directory 13 | rank (int): rank of current process group 14 | dir_name (str): a directory name 15 | Returns: 16 | path to the new directory 17 | """ 18 | p = Path(path) 19 | if not p.is_dir() and rank == 0: 20 | p.mkdir() 21 | for i in range(len(dir_name)): 22 | # dir 23 | if i < len(dir_name) - 1: 24 | p = p.joinpath(dir_name[i]) 25 | if not p.is_dir() and rank == 0: 26 | p.mkdir() 27 | elif '.' 
not in dir_name[i]: 28 | p = p.joinpath(dir_name[i]) 29 | if not p.is_dir() and rank == 0: 30 | p.mkdir() 31 | # file 32 | else: 33 | p = p.joinpath(dir_name[i]) 34 | return str(p.absolute()) 35 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --maxfail=3 --durations=10 --cov-config=.coveragerc --cov=neural_sp --cov-report xml 6 | python_files = test/*/test_*.py 7 | testpaths = test 8 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/test/__init__.py -------------------------------------------------------------------------------- /test/decoders/dict.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | ' 5 6 | a 6 7 | b 7 8 | c 8 9 | d 9 10 | -------------------------------------------------------------------------------- /test/encoders/test_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for encoder utility functions.""" 5 | 6 | import importlib 7 | import numpy as np 8 | import pytest 9 | import torch 10 | 11 | from neural_sp.models.torch_utils import np2tensor 12 | from neural_sp.models.torch_utils import pad_list 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "N_l, N_c, N_r", 17 | [ 18 | (96, 64, 32), 19 | (64, 64, 64), 20 | (40, 40, 40), 21 | (40, 40, 20), 22 | ] 23 | ) 24 | def test_chunkwise(N_l, N_c, N_r): 25 | batch_size = 4 26 | xmaxs = [800, 855] 27 | input_dim = 80 28 | device = "cpu" 29 | 30 | module = importlib.import_module('neural_sp.models.seq2seq.encoders.utils') 31 | 32 | for xmax in xmaxs: 33 | xs = np.random.randn(batch_size, xmax, input_dim).astype(np.float32) 34 | xs = pad_list([np2tensor(x, device).float() for x in xs], 0.) 35 | 36 | xs_chunk = module.chunkwise(xs, N_l, N_c, N_r) 37 | 38 | # Extract the center region 39 | xs_chunk = xs_chunk[:, N_l:N_l + N_c] # `[B * n_chunks, N_c, input_dim]` 40 | xs_chunk = xs_chunk.contiguous().view(batch_size, -1, xs_chunk.size(2)) 41 | xs_chunk = xs_chunk[:, :xmax] 42 | 43 | assert xs_chunk.size() == xs.size() 44 | assert torch.equal(xs_chunk, xs) 45 | -------------------------------------------------------------------------------- /test/frontends/test_frame_stacking.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for frame stacking.""" 5 | 6 | import importlib 7 | import math 8 | import numpy as np 9 | import pytest 10 | 11 | from neural_sp.models.torch_utils import np2tensor 12 | from neural_sp.models.torch_utils import pad_list 13 | 14 | 15 | def make_args(**kwargs): 16 | args = dict( 17 | n_stacks=1, 18 | n_skips=1, 19 | ) 20 | args.update(kwargs) 21 | return args 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "args", 26 | [ 27 | ({'n_stacks': 1, 'n_skips': 1}), 28 | ({'n_stacks': 2, 'n_skips': 2}), 29 | ({'n_stacks': 3, 'n_skips': 3}), 30 | ({'n_stacks': 3, 'n_skips': 1}), 31 | ] 32 | ) 33 | def test_forward(args): 34 | args = make_args(**args) 35 | 36 | batch_size = 4 37 | xmax = 40 38 | input_dim = 80 39 | device = "cpu" 40 | 41 | xs = [np.random.randn(xlen, input_dim).astype(np.float32) 42 | for xlen in range(xmax - batch_size, xmax)] 43 | xs_pad = pad_list([np2tensor(x, device).float() for x in xs], 0.) 44 | 45 | module = importlib.import_module('neural_sp.models.seq2seq.frontends.frame_stacking') 46 | 47 | out = [module.stack_frame(x, args['n_stacks'], args['n_skips']) 48 | for x in xs] 49 | out_pad = pad_list([np2tensor(x, device).float() for x in out], 0.) 50 | assert out_pad.size(0) == xs_pad.size(0) 51 | assert out_pad.size(1) == math.ceil(xs_pad.size(1) / args['n_skips']) 52 | assert out_pad.size(2) == xs_pad.size(2) * args['n_stacks'] 53 | -------------------------------------------------------------------------------- /test/frontends/test_input_noise.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for input noise injection.""" 5 | 6 | import numpy as np 7 | 8 | from neural_sp.models.torch_utils import np2tensor 9 | from neural_sp.models.torch_utils import pad_list 10 | from neural_sp.models.seq2seq.frontends.input_noise import add_input_noise 11 | 12 | 13 | def test_forward(): 14 | batch_size = 4 15 | xmax = 40 16 | input_dim = 80 17 | device = "cpu" 18 | 19 | xs = np.random.randn(batch_size, xmax, input_dim).astype(np.float32) 20 | xs = pad_list([np2tensor(x, device).float() for x in xs], 0.) 21 | 22 | out = add_input_noise(xs, std=0.075) 23 | assert out.size() == xs.size() 24 | -------------------------------------------------------------------------------- /test/frontends/test_sequence_summary.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for sequence summary network.""" 5 | 6 | import importlib 7 | import numpy as np 8 | import pytest 9 | import torch 10 | 11 | from neural_sp.models.torch_utils import np2tensor 12 | from neural_sp.models.torch_utils import pad_list 13 | 14 | 15 | def make_args(**kwargs): 16 | args = dict( 17 | input_dim=80, 18 | n_units=64, 19 | n_layers=2, 20 | bottleneck_dim=0, 21 | dropout=0.1, 22 | param_init=0.1, 23 | ) 24 | args.update(kwargs) 25 | return args 26 | 27 | 28 | @pytest.mark.parametrize( 29 | "args", 30 | [ 31 | ({'n_layers': 2, 'bottleneck_dim': 0}), 32 | ({'n_layers': 2, 'bottleneck_dim': 100}), 33 | ({'n_layers': 3, 'bottleneck_dim': 0}), 34 | ({'n_layers': 3, 'bottleneck_dim': 100}), 35 | ] 36 | ) 37 | def test_forward(args): 38 | args = make_args(**args) 39 | 40 | batch_size = 4 41 | xmax = 40 42 | device = "cpu" 43 | 44 | xs = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32) 45 | xlens = torch.IntTensor([len(x) for x in xs]) 46 | xs = pad_list([np2tensor(x, device).float() for x in xs], 0.) 47 | 48 | module = importlib.import_module('neural_sp.models.seq2seq.frontends.sequence_summary') 49 | ssn = module.SequenceSummaryNetwork(**args) 50 | ssn = ssn.to(device) 51 | 52 | out = ssn(xs, xlens) 53 | assert out.size() == xs.size() 54 | -------------------------------------------------------------------------------- /test/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | $CXX -v 6 | 7 | ROOT=$(pwd) 8 | KALDI_ROOT=${ROOT}/tools/kaldi 9 | TOOL=${ROOT}/tools/neural_sp 10 | 11 | # install kaldi (not compiled) 12 | if [ ! -d ${KALDI_ROOT} ]; then 13 | git clone https://github.com/kaldi-asr/kaldi.git ${KALDI_ROOT} 14 | fi 15 | 16 | # download pre-built kaldi binary (copy from espnet) 17 | [ ! -e ubuntu16-featbin.tar.gz ] && wget --tries=3 https://github.com/espnet/kaldi-bin/releases/download/v0.0.1/ubuntu16-featbin.tar.gz 18 | tar -xf ./ubuntu16-featbin.tar.gz 19 | cp featbin/* ${KALDI_ROOT}/src/featbin/ 20 | 21 | cd tools 22 | make PYTORCH_VERSION="${PYTORCH_VERSION}" PYTHON_VERSION="${TRAVIS_PYTHON_VERSION}" TOOL="${TOOL}" KALDI=${KALDI_ROOT} 23 | cd ${ROOT} 24 | 25 | source ${TOOL}/miniconda/bin/activate 26 | 27 | pip install -e ".[test]" # install test dependencies (setup.py) 28 | 29 | # log 30 | pip freeze 31 | -------------------------------------------------------------------------------- /test/modules/test_pointwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for positionwise fully-connected feed-forward neural network (FFN).""" 5 | 6 | import importlib 7 | import pytest 8 | import torch 9 | 10 | 11 | def make_args(**kwargs): 12 | args = dict( 13 | d_model=32, 14 | d_ff=128, 15 | dropout=0.1, 16 | activation='relu', 17 | param_init='', 18 | bottleneck_dim=0, 19 | ) 20 | args.update(kwargs) 21 | return args 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "args", 26 | [ 27 | # activation 28 | ({'activation': 'relu'}), 29 | ({'activation': 'gelu'}), 30 | ({'activation': 'gelu_accurate'}), 31 | ({'activation': 'glu'}), 32 | ({'activation': 'swish'}), 33 | # initialization 34 | ({'param_init': 'xavier_uniform'}), 35 | # bottleneck 36 | ({'bottleneck_dim': 16}), 37 | ] 38 | ) 39 | def test_forward(args): 40 | args = make_args(**args) 41 | 42 | batch_size = 4 43 | max_len = 40 44 | device = "cpu" 45 | 46 | ffn_in = torch.FloatTensor(batch_size, max_len, args['d_model'], device=device) 47 | 48 | module = importlib.import_module('neural_sp.models.modules.positionwise_feed_forward') 49 | ffn = module.PositionwiseFeedForward(**args) 50 | ffn = ffn.to(device) 51 | 52 | ffn_out = ffn(ffn_in) 53 | assert ffn_in.size() == ffn_out.size() 54 | -------------------------------------------------------------------------------- /test/test_python.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source tools/neural_sp/miniconda/bin/activate 4 | 5 | modules="neural_sp test utils setup.py" 6 | pycodestyle -r ${modules} --show-source --show-pep8 --ignore="E501" 7 | 8 | pytest -------------------------------------------------------------------------------- /utils/make_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | feat="" # feats.scp 9 | unit="" 10 | remove_space=false 11 | unk="" 12 | space="" 13 | nlsyms="" 14 | wp_model="" 15 | wp_nbest=1 16 | text= 17 | 18 | . utils/parse_options.sh 19 | 20 | if [ $# != 2 ]; then 21 | echo "Usage: $0 <data> <dict>"; 22 | exit 1; 23 | fi 24 | 25 | data=$1 26 | dict=$2 27 | 28 | if [ -z ${text} ]; then 29 | text=${data}/text 30 | fi 31 | 32 | make_tsv.py --feat ${feat} \ 33 | --utt2num_frames ${data}/utt2num_frames \ 34 | --utt2spk ${data}/utt2spk \ 35 | --text ${text} \ 36 | --dict ${dict} \ 37 | --unit ${unit} \ 38 | --remove_space ${remove_space} \ 39 | --unk ${unk} \ 40 | --space ${space} \ 41 | --nlsyms ${nlsyms} \ 42 | --wp_model ${wp_model} \ 43 | --wp_nbest ${wp_nbest} 44 | -------------------------------------------------------------------------------- /utils/speed_perturb_3way.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | nj=32 9 | speeds="0.9 1.0 1.1" 10 | 11 | . 
utils/parse_options.sh 11 | 12 | 13 | if [ $# != 3 ]; then 14 | echo "Usage: $0 <data> <train_set_original> <train_set>"; 15 | exit 1; 16 | fi 17 | 18 | data=$1 19 | train_set_original=$2 20 | train_set=$3 21 | tmpdir=$(mktemp -d ${data}/${train_set_original}/tmp-XXXXX) 22 | trap 'rm -rf ${tmpdir}' EXIT 23 | 24 | if [ ${train_set_original} = ${train_set} ];then 25 | echo "train_set_original and train_set should be different names" 26 | fi 27 | 28 | for speed in ${speeds}; do 29 | utils/perturb_data_dir_speed.sh ${speed} ${data}/${train_set_original} ${tmpdir}/temp${speed} 30 | done 31 | utils/combine_data.sh --extra-files utt2uniq ${data}/${train_set} ${tmpdir}/temp* 32 | rm -r ${tmpdir}/temp* 33 | steps/make_fbank.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ 34 | ${data}/${train_set} ${data}/log/make_fbank/${train_set} ${data}/fbank 35 | touch ${data}/${train_set}/text.tmp 36 | for speed in ${speeds}; do 37 | awk -v p="sp${speed}-" '{printf("%s %s%s\n", $1, p, $1);}' ${data}/${train_set_original}/utt2spk > ${data}/${train_set}/utt_map 38 | utils/apply_map.pl -f 1 ${data}/${train_set}/utt_map <${data}/${train_set_original}/text >>${data}/${train_set}/text.tmp 39 | done 40 | mv ${data}/${train_set}/text.tmp ${data}/${train_set}/text 41 | utils/fix_data_dir.sh ${data}/${train_set} 42 | -------------------------------------------------------------------------------- /utils/update_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | unit="" 9 | remove_space=false 10 | unk="" 11 | space="" 12 | nlsyms="" 13 | wp_model="" 14 | 15 | . utils/parse_options.sh 16 | 17 | if [ $# != 3 ]; then 18 | echo "Usage: $0 <text> <dict> <tsv>"; 19 | exit 1; 20 | fi 21 | 22 | text=$1 23 | dict=$2 24 | tsv=$3 25 | tmpdir=$(mktemp -d $(dirname ${text})/tmp-XXXXX) 26 | trap 'rm -rf ${tmpdir}' EXIT 27 | 28 | cp ${tsv} ${tmpdir}/tmp.tsv 29 | 30 | # For additional unpaired text 31 | make_tsv.py --text ${text} \ 32 | --dict ${dict} \ 33 | --unit ${unit} \ 34 | --remove_space ${remove_space} \ 35 | --unk ${unk} \ 36 | --space ${space} \ 37 | --nlsyms ${nlsyms} \ 38 | --wp_model ${wp_model} \ 39 | --update >> ${tmpdir}/tmp.tsv 40 | 41 | cat ${tmpdir}/tmp.tsv 42 | 43 | rm -fr ${tmpdir} 44 | --------------------------------------------------------------------------------
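A hedged usage sketch for the two dataset utilities above (utils/make_dataset.sh and utils/update_dataset.sh), assuming a Kaldi-style data directory has already been prepared; the corpus paths, dictionary file, unpaired-text file, and wordpiece model names below are hypothetical placeholders chosen for illustration, not files shipped with the recipes:

# build a baseline tsv from a prepared data directory (options are parsed by utils/parse_options.sh and must precede the positional arguments)
make_dataset.sh --feat data/train/feats.scp --unit wp --wp_model data/local/wp_model \
    data/train data/local/dict.txt > data/dataset/train_wp.tsv

# append additional unpaired text to that tsv; the merged tsv is written to stdout
update_dataset.sh --unit wp --wp_model data/local/wp_model \
    data/local/extra_text data/local/dict.txt data/dataset/train_wp.tsv > data/dataset/train_wp_ext.tsv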