├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── examples ├── aishell │ ├── README.txt │ └── s5 │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln.yaml │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln_2mtl.yaml │ │ │ ├── mma │ │ │ │ ├── lc_transformer_mma_hie_subsample8_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ │ ├── lc_transformer_mma_hie_subsample8_ma4H_ca4H_w16_from4L_96_64_32.yaml │ │ │ │ └── transformer_mma_hie_subsample8_ma4H_ca4H_w16_from4L.yaml │ │ │ ├── mocha │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ └── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ ├── transformer.yaml │ │ │ └── transformer_hie_subsample8.yaml │ │ ├── data │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ ├── spec_augment_speed_perturb_pretrain.yaml │ │ │ └── speed_perturb_pretrain.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ └── rnnlm.yaml │ │ ├── local │ │ ├── aishell_data_prep.sh │ │ ├── download_and_untar.sh │ │ └── plot_attention.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── aishell2 │ ├── README.md │ └── s5 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ └── conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml │ │ └── fbank.conf │ │ ├── local │ │ └── prepare_data.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── ami │ ├── README.txt │ └── s5b │ │ ├── README.txt │ │ ├── cmd.sh │ │ ├── conf │ │ ├── ami_beamformit.cfg │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── blstm_mocha.yaml │ │ │ ├── blstm_rnnt.yaml │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ ├── lcblstm_rnnt_40_40.yaml │ │ │ └── transformer.yaml │ │ ├── data │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ ├── spec_augment_speed_perturb_pretrain_F27_T100.yaml │ │ │ ├── spec_augment_speed_perturb_pretrain_F27_T50.yaml │ │ │ └── speed_perturb.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ └── rnnlm.yaml │ │ ├── local │ │ ├── ami_beamform.sh │ │ ├── ami_download.sh │ │ ├── ami_format_data.sh │ │ ├── ami_ihm_data_prep.sh │ │ ├── ami_ihm_scoring_data_prep.sh │ │ ├── ami_mdm_data_prep.sh │ │ ├── ami_mdm_scoring_data_prep.sh │ │ ├── ami_prepare_dict.sh │ │ ├── ami_sdm_data_prep.sh │ │ ├── ami_sdm_scoring_data_prep.sh │ │ ├── ami_split_segments.pl │ │ ├── ami_text_prep.sh │ │ ├── ami_xml2text.sh │ │ ├── beamformit.sh │ │ ├── convert2stm.pl │ │ ├── english.glm │ │ ├── split_REAMDE.txt │ │ ├── split_dev.orig │ │ ├── split_eval.orig │ │ └── split_train.orig │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── ci_test │ ├── cmd.sh │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── blstm_las_2mtl.yaml │ │ │ ├── blstm_las_2mtl_per_batch.yaml │ │ │ ├── blstm_transformer.yaml │ │ │ ├── conformer.yaml │ │ │ ├── lc_transformer_mma_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ ├── lcblstm_transducer.yaml │ │ │ ├── lstm_ctc.yaml │ │ │ ├── tds_las.yaml │ │ │ ├── transformer.yaml │ │ │ ├── transformer_2mtl.yaml │ │ │ ├── transformer_ctc.yaml │ │ │ └── transformer_las.yaml │ │ ├── data │ │ │ ├── adaptive_spec_augment.yaml │ │ │ └── spec_augment.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ ├── transformer_xl.yaml │ │ │ └── transformerlm.yaml │ ├── ctc_forced_align.sh │ ├── data │ │ └── train │ │ │ ├── spk2utt │ │ │ ├── text │ │ │ ├── text.phone │ │ │ ├── utt2spk │ │ │ └── wav.scp │ ├── local │ │ 
└── download_sample.sh │ ├── path.sh │ ├── plot_attention.sh │ ├── plot_ctc.sh │ ├── run.sh │ ├── run_2mtl.sh │ ├── score.sh │ ├── steps │ └── utils ├── csj │ ├── README.txt │ └── s5 │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── las │ │ │ │ ├── blstm_las.yaml │ │ │ │ ├── blstm_las_2mtl.yaml │ │ │ │ ├── lcblstm_las_chunk4040.yaml │ │ │ │ └── lstm_las.yaml │ │ │ ├── mocha │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_decot16.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_minlt.yaml │ │ │ │ ├── lstm_mocha.yaml │ │ │ │ └── lstm_mocha_ctc_sync.yaml │ │ │ └── transformer │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln.yaml │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml │ │ │ │ ├── transformer.yaml │ │ │ │ └── transformer_hie_subsample8.yaml │ │ ├── data │ │ │ ├── pretrain.yaml │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_pretrain_F13_T50.yaml │ │ │ ├── spec_augment_pretrain_F27_T100.yaml │ │ │ ├── spec_augment_pretrain_F27_T50.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ └── speed_perturb.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ ├── transformer_xl.yaml │ │ │ └── transformerlm.yaml │ │ ├── local │ │ ├── csj_data_prep.sh │ │ ├── csj_eval_data_prep.sh │ │ ├── csj_make_trans │ │ │ ├── csj2kaldi4m.pl │ │ │ ├── csj_autorun.sh │ │ │ ├── csjconnect.pl │ │ │ ├── kana2phone │ │ │ └── vocab2dic.pl │ │ ├── csj_prepare_dict.sh │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── plot_lm_cache.sh │ │ ├── remove_disfluency.py │ │ ├── remove_pos.py │ │ └── score_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── run_streaming.sh │ │ ├── score.sh │ │ ├── score_streaming.sh │ │ ├── steps │ │ └── utils ├── laborotv │ └── s5 │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_ln.yaml │ │ │ └── conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ └── rnnlm.yaml │ │ ├── local │ │ ├── laborotv_data_prep.sh │ │ ├── prepare_dict.sh │ │ ├── remove_pos.py │ │ └── tedx-jp-10k_data_prep.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_plus_csj.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── language_model │ ├── ptb │ │ ├── RESULTS │ │ ├── cmd.sh │ │ ├── conf │ │ │ └── rnnlm.yaml │ │ ├── local │ │ │ └── plot_lm_cache.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score_lm.sh │ │ ├── steps │ │ └── utils │ └── wikitext2 │ │ ├── RESULTS │ │ ├── cmd.sh │ │ ├── conf │ │ ├── gcnn.yaml │ │ ├── rnnlm.yaml │ │ ├── transformer_xl.yaml │ │ └── transformerlm.yaml │ │ ├── local │ │ └── plot_lm_cache.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score_lm.sh │ │ ├── steps │ │ └── utils ├── librispeech │ ├── README.txt │ └── s5 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── mma │ │ │ │ ├── offline │ │ │ │ │ ├── transformer_mma_subsample8_ma4H_ca4H_w16_from4L.yaml │ │ │ │ │ ├── transformer_mma_subsample8_ma4H_ca4H_w16_from4L_512dmodel_8H.yaml │ │ │ │ │ └── transformer_mma_subsample8_ma4H_ca4H_w16_from4L_768dmodel_3072dff_8H.yaml │ │ │ │ └── streaming │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_512dmodel_8H_64_128_64.yaml │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_512dmodel_8H_96_64_32.yaml │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ │ │ ├── 
lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_768dmodel_3072dff_8H_64_128_64.yaml │ │ │ │ │ └── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_96_64_32.yaml │ │ │ ├── mocha │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ │ ├── lstm_mocha.yaml │ │ │ │ ├── lstm_mocha_ctc_sync.yaml │ │ │ │ ├── lstm_mocha_decot12.yaml │ │ │ │ ├── lstm_mocha_decot16.yaml │ │ │ │ ├── lstm_mocha_minlt.yaml │ │ │ │ └── uni_conformer_kernel7_clamp10_hie_subsample8_mocha_ln_stableemit0.2_qua0.2.yaml │ │ │ ├── transducer │ │ │ │ ├── blstm_transducer_bpe1k.yaml │ │ │ │ ├── lcblstm_rnnt_chunk4040_bpe1k.yaml │ │ │ │ └── lstm_rnnt_bpe1k.yaml │ │ │ └── transformer │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_long_ln.yaml │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_long_ln_large.yaml │ │ │ │ ├── transformer.yaml │ │ │ │ ├── transformer_512dmodel_8H.yaml │ │ │ │ ├── transformer_768dmodel_3072dff_8H.yaml │ │ │ │ ├── transformer_subsample8.yaml │ │ │ │ ├── transformer_subsample8_512dmodel_8H.yaml │ │ │ │ └── transformer_subsample8_768dmodel_3072dff_8H.yaml │ │ ├── data │ │ │ ├── pretrain.yaml │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_pretrain_F13_T50.yaml │ │ │ ├── spec_augment_pretrain_F27_T100.yaml │ │ │ ├── spec_augment_pretrain_F27_T50.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ └── spec_augment_speed_perturb_pretrain_F27_T100.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ └── rnnlm_6L.yaml │ │ ├── ctc_forced_align.sh │ │ ├── local │ │ ├── data_prep.sh │ │ ├── download_and_untar.sh │ │ ├── download_lm.sh │ │ ├── format_data.sh │ │ ├── format_lms.sh │ │ ├── g2p.sh │ │ ├── g2p │ │ │ └── train_g2p.sh │ │ ├── lm │ │ │ ├── est-gcc4.7.patch │ │ │ ├── install_festival.sh │ │ │ ├── normalize_text.sh │ │ │ ├── python │ │ │ │ ├── pre_filter.py │ │ │ │ ├── text_post_process.py │ │ │ │ └── text_pre_process.py │ │ │ └── train_lm.sh │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── prepare_dict.sh │ │ ├── prepare_example_data.sh │ │ └── score_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── swbd │ ├── README.txt │ └── s5c │ │ ├── RESULTS │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ ├── blstm_las.yaml │ │ │ ├── blstm_las_2mtl.yaml │ │ │ ├── blstm_las_3mtl.yaml │ │ │ ├── blstm_las_fisher_swbd.yaml │ │ │ ├── blstm_mocha.yaml │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ ├── transformer.yaml │ │ │ └── transformer_fisher_swbd.yaml │ │ ├── data │ │ │ ├── spec_augment.yaml │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ ├── speed_perturb.yaml │ │ │ └── speed_perturb_pretrain.yaml │ │ ├── fbank.conf │ │ └── lm │ │ │ ├── rnnlm.yaml │ │ │ ├── transformer_xl.yaml │ │ │ └── transformerlm.yaml │ │ ├── local │ │ ├── MSU_single_letter.txt │ │ ├── dict.patch │ │ ├── eval2000_data_prep.sh │ │ ├── extend_segments.pl │ │ ├── fisher_data_prep.sh │ │ ├── fisher_map_words.pl │ │ ├── fisher_swbd_prepare_dict.sh │ │ ├── format_acronyms_dict.py │ │ ├── format_acronyms_dict_fisher_swbd.py │ │ ├── map_acronyms_ctm.py │ │ ├── map_acronyms_transcripts.py │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── plot_lm_cache.sh │ │ ├── remove_disfluency.py │ │ ├── rt03_data_prep.sh │ │ ├── score_lm.sh │ │ ├── score_sclite.sh │ │ ├── swbd1_data_download.sh │ │ ├── swbd1_data_prep.sh │ │ ├── swbd1_fix_speakerid.pl │ │ ├── swbd1_map_words.pl │ │ └── swbd1_prepare_dict.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── 
run_2mtl.sh │ │ ├── run_3mtl.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── tedlium │ ├── s5_r2 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ │ ├── asr │ │ │ │ ├── blstm_triggered_attention.yaml │ │ │ │ ├── las │ │ │ │ │ ├── blstm_las.yaml │ │ │ │ │ ├── blstm_las_2mtl.yaml │ │ │ │ │ ├── blstm_las_ctc_sync.yaml │ │ │ │ │ ├── lcblstm_las_chunk4020.yaml │ │ │ │ │ ├── lcblstm_las_chunk4040.yaml │ │ │ │ │ └── lstm_las.yaml │ │ │ │ ├── lcblstm_las_chunk4020.yaml │ │ │ │ ├── lcblstm_las_chunk4040.yaml │ │ │ │ ├── lstm_las.yaml │ │ │ │ ├── mma │ │ │ │ │ ├── offline │ │ │ │ │ │ └── transformer_mma_subsample8_ma4H_ca4H_w16_from4L.yaml │ │ │ │ │ └── streaming │ │ │ │ │ │ ├── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_64_128_64.yaml │ │ │ │ │ │ └── lc_transformer_mma_subsample8_ma4H_ca4H_w16_from4L_96_64_32.yaml │ │ │ │ ├── mocha │ │ │ │ │ ├── blstm_mocha.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4020.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4020_ctc_sync.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4040.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4040_ctc_sync.yaml │ │ │ │ │ ├── lcblstm_mocha_chunk4040_mbr.yaml │ │ │ │ │ ├── lstm_mocha.yaml │ │ │ │ │ ├── lstm_mocha_ctc_sync.yaml │ │ │ │ │ ├── lstm_mocha_decot16.yaml │ │ │ │ │ ├── lstm_mocha_minlt.yaml │ │ │ │ │ ├── lstm_mocha_rsp_enc.yaml │ │ │ │ │ ├── lstm_mocha_stableemit0.1.yaml │ │ │ │ │ ├── uni_conformer_kernel7_clamp10_hie_subsample8_mocha_long_ln.yaml │ │ │ │ │ └── uni_conformer_kernel7_clamp10_hie_subsample8_mocha_long_ln_stableemit0.1.yaml │ │ │ │ ├── transducer │ │ │ │ │ ├── blstm_rnnt_bpe1k.yaml │ │ │ │ │ ├── lcblstm_rnnt_40_20_bpe1k.yaml │ │ │ │ │ ├── lcblstm_rnnt_40_40_bpe1k.yaml │ │ │ │ │ ├── lstm_rnnt_bpe1k.yaml │ │ │ │ │ └── uni_conformer_kernel7_clamp10_hie_subsample8_rnnt_long_ln_bpe1k.yaml │ │ │ │ └── transformer │ │ │ │ │ ├── conformer_kernel15_clamp10_hie_subsample8_las_long_ln.yaml │ │ │ │ │ ├── transformer_hie_subsample8.yaml │ │ │ │ │ └── transformer_hie_subsample8_las_long.yaml │ │ │ ├── data │ │ │ │ ├── pretrain.yaml │ │ │ │ ├── spec_augment_speed_perturb.yaml │ │ │ │ ├── spec_augment_speed_perturb_pretrain_F13_T50.yaml │ │ │ │ ├── spec_augment_speed_perturb_pretrain_F27_T100.yaml │ │ │ │ └── spec_augment_speed_perturb_pretrain_F27_T50.yaml │ │ │ ├── fbank.conf │ │ │ └── lm │ │ │ │ └── rnnlm.yaml │ │ ├── ctc_forced_align.sh │ │ ├── local │ │ │ ├── download_data.sh │ │ │ ├── format_lms.sh │ │ │ ├── join_suffix.py │ │ │ ├── plot_attention.sh │ │ │ ├── prepare_data.sh │ │ │ ├── prepare_dict.sh │ │ │ ├── ted_download_lm.sh │ │ │ └── ted_train_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── run_2mtl.sh │ │ ├── run_streaming.sh │ │ ├── score.sh │ │ ├── score_streaming.sh │ │ ├── steps │ │ └── utils │ └── s5_r3 │ │ ├── cmd.sh │ │ ├── conf │ │ ├── asr │ │ │ └── blstm_las.yaml │ │ ├── fbank.conf │ │ ├── lm │ │ │ └── rnnlm.yaml │ │ ├── spec_augment.yaml │ │ ├── spec_augment_speed_perturb.yaml │ │ └── speed_perturb.yaml │ │ ├── local │ │ ├── download_data.sh │ │ ├── format_lms.sh │ │ ├── join_suffix.py │ │ ├── prepare_data.sh │ │ ├── prepare_dict.sh │ │ ├── ted_download_lm.sh │ │ └── ted_train_lm.sh │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils ├── timit │ ├── README.txt │ └── s5 │ │ ├── RESULTS.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── blstm_ctc.yaml │ │ ├── blstm_las.yaml │ │ ├── dev_spk.list │ │ ├── fbank.conf │ │ ├── phones.60-48-39.map │ │ ├── rnn_transducer.yaml │ │ ├── test_spk.list │ │ ├── transformer.yaml │ │ └── transformer_relative.yaml │ │ ├── local │ │ ├── plot_attention.sh │ │ ├── plot_ctc.sh │ │ ├── 
score_sclite.sh │ │ ├── timit_data_prep.sh │ │ ├── timit_format_data.sh │ │ └── timit_norm_trans.pl │ │ ├── path.sh │ │ ├── run.sh │ │ ├── score.sh │ │ ├── steps │ │ └── utils └── wsj │ ├── README.txt │ └── s5 │ ├── RESULTS │ ├── cmd.sh │ ├── conf │ ├── asr │ │ ├── blstm_las.yaml │ │ ├── glu_encoder.yaml │ │ ├── tds_encoder.yaml │ │ └── transformer.yaml │ ├── data │ │ ├── spec_augment.yaml │ │ ├── spec_augment_speed_perturb.yaml │ │ └── speed_perturb.yaml │ ├── fbank.conf │ └── lm │ │ ├── gated_convlm.yaml │ │ ├── rnnlm.yaml │ │ └── transformerlm.yaml │ ├── local │ ├── append_utterances.sh │ ├── cstr_ndx2flist.pl │ ├── cstr_wsj_data_prep.sh │ ├── cstr_wsj_extend_dict.sh │ ├── dict │ │ ├── add_counts.pl │ │ ├── count_rules.pl │ │ ├── filter_dict.pl │ │ ├── find_acronyms.pl │ │ ├── get_acronym_prons.pl │ │ ├── get_candidate_prons.pl │ │ ├── get_rule_hierarchy.pl │ │ ├── get_rules.pl │ │ ├── limit_candidate_prons.pl │ │ ├── reverse_candidates.pl │ │ ├── reverse_dict.pl │ │ ├── score_prons.pl │ │ ├── score_rules.pl │ │ └── select_candidate_prons.pl │ ├── find_transcripts.pl │ ├── flist2scp.pl │ ├── ndx2flist.pl │ ├── normalize_trans.sh │ ├── normalize_transcript.pl │ ├── plot_attention.sh │ ├── plot_ctc.sh │ ├── score_lm.sh │ ├── wsj_data_prep.sh │ ├── wsj_extend_dict.sh │ ├── wsj_format_data.sh │ ├── wsj_format_local_lms.sh │ └── wsj_prepare_dict.sh │ ├── path.sh │ ├── run.sh │ ├── score.sh │ ├── steps │ └── utils ├── neural_sp ├── __init__.py ├── bin │ ├── __init__.py │ ├── args_asr.py │ ├── args_common.py │ ├── args_lm.py │ ├── asr │ │ ├── __init__.py │ │ ├── ctc_forced_align.py │ │ ├── eval.py │ │ ├── plot_attention.py │ │ ├── plot_ctc.py │ │ └── train.py │ ├── eval_utils.py │ ├── lm │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── plot_cache.py │ │ └── train.py │ ├── model_name.py │ ├── plot_utils.py │ └── train_utils.py ├── datasets │ ├── __init__.py │ ├── alignment.py │ ├── asr │ │ ├── __init__.py │ │ ├── build.py │ │ ├── dataloader.py │ │ ├── dataset.py │ │ └── sampler.py │ ├── lm.py │ ├── token_converter │ │ ├── __init__.py │ │ ├── character.py │ │ ├── phone.py │ │ ├── word.py │ │ └── wordpiece.py │ └── utils.py ├── evaluators │ ├── __init__.py │ ├── accuracy.py │ ├── character.py │ ├── edit_distance.py │ ├── phone.py │ ├── ppl.py │ ├── resolving_unk.py │ ├── word.py │ ├── wordpiece.py │ └── wordpiece_bleu.py ├── models │ ├── __init__.py │ ├── base.py │ ├── criterion.py │ ├── data_parallel.py │ ├── lm │ │ ├── __init__.py │ │ ├── build.py │ │ ├── gated_convlm.py │ │ ├── lm_base.py │ │ ├── rnnlm.py │ │ ├── transformer_xl.py │ │ └── transformerlm.py │ ├── modules │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── causal_conv.py │ │ ├── cif.py │ │ ├── conformer_convolution.py │ │ ├── gelu.py │ │ ├── glu.py │ │ ├── gmm_attention.py │ │ ├── headdrop.py │ │ ├── initialization.py │ │ ├── mocha │ │ │ ├── __init__.py │ │ │ ├── chunk_energy.py │ │ │ ├── hma_test.py │ │ │ ├── hma_train.py │ │ │ ├── mocha.py │ │ │ ├── mocha_test.py │ │ │ ├── mocha_train.py │ │ │ └── monotonic_energy.py │ │ ├── multihead_attention.py │ │ ├── positional_embedding.py │ │ ├── positionwise_feed_forward.py │ │ ├── relative_multihead_attention.py │ │ ├── softplus.py │ │ ├── swish.py │ │ ├── sync_bidir_multihead_attention.py │ │ ├── transformer.py │ │ └── zoneout.py │ ├── seq2seq │ │ ├── __init___.py │ │ ├── decoders │ │ │ ├── __init__.py │ │ │ ├── beam_search.py │ │ │ ├── build.py │ │ │ ├── ctc.py │ │ │ ├── decoder_base.py │ │ │ ├── fwd_bwd_attention.py │ │ │ ├── las.py │ │ │ ├── rnn_transducer.py │ │ │ └── transformer.py │ │ 
├── encoders │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── conformer.py │ │ │ ├── conformer_block.py │ │ │ ├── conformer_block_v2.py │ │ │ ├── conv.py │ │ │ ├── encoder_base.py │ │ │ ├── gated_conv.py │ │ │ ├── rnn.py │ │ │ ├── subsampling.py │ │ │ ├── tds.py │ │ │ ├── transformer.py │ │ │ ├── transformer_block.py │ │ │ └── utils.py │ │ ├── frontends │ │ │ ├── __init__.py │ │ │ ├── frame_stacking.py │ │ │ ├── input_noise.py │ │ │ ├── sequence_summary.py │ │ │ ├── spec_augment.py │ │ │ ├── splicing.py │ │ │ └── streaming.py │ │ └── speech2text.py │ └── torch_utils.py ├── trainers │ ├── __init__.py │ ├── lr_scheduler.py │ ├── optimizer.py │ └── reporter.py └── utils.py ├── setup.cfg ├── setup.py ├── test ├── __init__.py ├── decoders │ ├── dict.txt │ ├── test_las_decoder.py │ ├── test_rnn_transducer_decoder.py │ └── test_transformer_decoder.py ├── encoders │ ├── test_conformer_encoder.py │ ├── test_conv_encoder.py │ ├── test_rnn_encoder.py │ ├── test_rnn_encoder_streaming_chunkwise.py │ ├── test_tds_encoder.py │ ├── test_transformer_encoder.py │ ├── test_transformer_encoder_streaming_chunkwise.py │ └── test_utils.py ├── frontends │ ├── test_frame_stacking.py │ ├── test_input_noise.py │ ├── test_sequence_summary.py │ ├── test_specaugment.py │ ├── test_splicing.py │ └── test_streaming.py ├── install.sh ├── lm │ ├── test_rnnlm.py │ ├── test_transformer_xl_lm.py │ └── test_transformerlm.py ├── modules │ ├── test_attention.py │ ├── test_causal_conv.py │ ├── test_cif.py │ ├── test_conformer_convolution.py │ ├── test_gmm_attention.py │ ├── test_mocha.py │ ├── test_multihead_attention.py │ ├── test_pointwise_feed_forward.py │ ├── test_relative_multihead_attention.py │ └── test_zoneout.py ├── test_python.sh └── test_training.sh ├── tools └── Makefile └── utils ├── compute_oov_rate.py ├── concat_ref.py ├── dump_feat.sh ├── make_dataset.sh ├── make_tsv.py ├── make_vocab.sh ├── map2phone.py ├── speed_perturb_3way.sh ├── text2dict.py ├── trn2ctm.py └── update_dataset.sh /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = neural_sp 4 | 5 | [report] 6 | exclude_lines = 7 | raise ValueError 8 | raise TypeError 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | 12 | ignore_errors = True -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.log 4 | __pycache__ 5 | .dropbox.attr 6 | .ftpconfig 7 | .nfs* 8 | .idea 9 | .pytest_cache 10 | .vscode 11 | *.done 12 | .coverage 13 | coverage.xml 14 | neural_sp.egg-info 15 | wandb 16 | 17 | # CI test 18 | examples/ci_test/data 19 | examples/ci_test/results 20 | examples/ci_test/sample 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | notifications: 2 | email: false 3 | 4 | dist: xenial 5 | 6 | language: python 7 | 8 | os: linux 9 | 10 | python: 11 | - "3.7" 12 | 13 | cache: 14 | - pip 15 | - ccache 16 | 17 | sudo: false 18 | 19 | install: 20 | - travis_retry ./test/install.sh 21 | 22 | script: 23 | - ./test/test_python.sh 24 | - ./test/test_training.sh 25 | 26 | after_success: 27 | - bash <(curl -s https://codecov.io/bash) 28 | 29 | env: 30 | - PYTORCH_VERSION=1.0.0 CC=gcc-7 CXX=g++-7 31 | - PYTORCH_VERSION=1.1.0 CC=gcc-7 CXX=g++-7 32 | - PYTORCH_VERSION=1.3.0 CC=gcc-7 CXX=g++-7 33 | - 
PYTORCH_VERSION=1.4.0 CC=gcc-7 CXX=g++-7 34 | - PYTORCH_VERSION=1.5.0 CC=gcc-7 CXX=g++-7 35 | - PYTORCH_VERSION=1.6.0 CC=gcc-7 CXX=g++-7 36 | - PYTORCH_VERSION=1.7.1 CC=gcc-7 CXX=g++-7 37 | - PYTORCH_VERSION=1.8.1 CC=gcc-7 CXX=g++-7 38 | 39 | addons: 40 | apt: 41 | sources: 42 | - ubuntu-toolchain-r-test 43 | packages: 44 | - cmake 45 | - g++-7 46 | - sox 47 | -------------------------------------------------------------------------------- /examples/aishell/README.txt: -------------------------------------------------------------------------------- 1 | Aishell is an open Chinese Mandarin speech database published by Beijing Shell Shell Technology Co., Ltd. 2 | 3 | 400 people from different accent areas in China were invited to participate in the recording, which was conducted in a quiet indoor environment using a high-fidelity microphone and downsampled to 16kHz. The manual transcription accuracy is above 95%, through professional speech annotation and strict quality inspection. The data is free for academic use. The corpus contains 170 hours of speech, and is divided into training (85%), development (10%) and testing (5%) sets. The development set is used to tune the hyperparameters in training. 4 | 5 | The database can be downloaded from openslr: 6 | http://www.openslr.org/33/ 7 | 8 | This folder contains two subfolders: 9 | s5: a speech recognition recipe 10 | v1: a speaker recognition recipe 11 | 12 | For more details, please visit: 13 | http://www.aishelltech.com/kysjcp 14 | -------------------------------------------------------------------------------- /examples/aishell/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 35 29 | convert_to_sgd_epoch: 100 30 | print_step: 1200 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/data/spec_augment_speed_perturb_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 1 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/data/speed_perturb_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 20->15->20 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 1 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/fbank.conf: 
-------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/aishell/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 64 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 15 16 | convert_to_sgd_epoch: 15 17 | print_step: 50 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.5 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/aishell/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/aishell/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/aishell/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/aishell2/s5/RESULTS.md: -------------------------------------------------------------------------------- 1 | ### Conformer-LAS + SpecAugment (no LM), hierarchical subsample1/8 2 | - conf: `conf/asr/conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml` 3 | - decoding parameters 4 | - epoch: 30 5 | - n_average: 10 6 | - beam width: 10 7 | - lm_weight: 0.0 8 | 9 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 10 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 11 | |test_android|5000|49532|94.0|5.8|0.2|0.1|**6.1**|36.7| 12 | |test_ios|5000|49532|94.6|5.2|0.2|0.1|**5.5**|34.1| 13 | |test_mic|5000|49532|94.3|5.6|0.2|0.1|**5.9**|35.7| 14 | -------------------------------------------------------------------------------- /examples/aishell2/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/aishell2/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/aishell2/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/aishell2/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/aishell2/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/ami/s5b/README.txt: -------------------------------------------------------------------------------- 1 | 2 | This s5b recipe is a streamlined and simplified version of the s5 recipe, with 3 | many components removed. 4 | 5 | Before running run.sh, please run run_prepare_shared.sh. 6 | 7 | Afterwards, you can run: 8 | run.sh --mic ihm # builds system for independent headset microphone 9 | run.sh --mic sdm1 # single distant microphone 10 | run.sh --mic mdm8 # multiple distant microphones + beamforming. 11 | 12 | Note: the sdm1 and mdm8 systems depend on the ihm system, because for 13 | best results we use the IHM alignments to train the neural nets. 14 | Please see RESULTS_* for results. 
15 | 16 | - For information about the database see: http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml 17 | 18 | -------------------------------------------------------------------------------- /examples/ami/s5b/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/ami_beamformit.cfg: -------------------------------------------------------------------------------- 1 | #BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) 2 | 3 | # scrolling size to compute the delays 4 | scroll_size = 250 5 | 6 | # cross correlation computation window size 7 | window_size = 500 8 | 9 | #amount of maximum points for the xcorrelation taken into account 10 | nbest_amount = 4 11 | 12 | #flag whether to apply an automatic noise thresholding 13 | do_noise_threshold = 1 14 | 15 | #Percentage of frames with lower xcorr taken as noisy 16 | noise_percent = 10 17 | 18 | ######## acoustic modelling parameters 19 | 20 | #transition probabilities weight for multichannel decoding 21 | trans_weight_multi = 25 22 | trans_weight_nbest = 25 23 | 24 | ### 25 | 26 | #flag whether to print the features after setting them, or not 27 | print_features = 1 28 | 29 | #flag whether to use the bad frames in the sum process 30 | do_avoid_bad_frames = 1 31 | 32 | #flag to use the best channel (SNR) as a reference 33 | #defined from command line 34 | do_compute_reference = 1 35 | 36 | #flag whether to use a uem file or not (process the whole file) 37 | do_use_uem_file = 0 38 | 39 | #flag whether to use an adaptive weights scheme or fixed weights 40 | do_adapt_weights = 1 41 | 42 | #flag whether to output the sph files or just run the system to create the auxiliary files 43 | do_write_sph_files = 1 44 | 45 | ####directories where to store/retrieve info#### 46 | #channels_file = ./cfg-files/channels 47 | 48 | #show needs to be passed as argument normally, here a default one is given just in case 49 | #show_id = Ttmp 50 | 51 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | 
conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 512 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 1024 26 | dec_n_projs: 0 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 1024 ### this is effective 29 | emb_dim: 512 30 | tie_embedding: false 31 | ctc_fc_list: "512" 32 | ### optimization 33 | batch_size: 30 34 | optimizer: adam 35 | n_epochs: 35 36 | convert_to_sgd_epoch: 100 37 | print_step: 200 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 15 42 | lr_decay_rate: 0.9 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 5 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 1 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 4000 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.0 54 | dropout_enc: 0.4 55 | dropout_dec: 0.4 56 | dropout_emb: 0.4 57 | dropout_att: 0.0 58 | weight_decay: 1e-6 59 | ss_prob: 0.2 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/blstm_rnnt.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 15 29 | optimizer: adam 30 | n_epochs: 35 31 | convert_to_sgd_epoch: 100 32 | print_step: 200 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 15 37 | lr_decay_rate: 0.9 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/lcblstm_rnnt_40_40.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: 
"(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 35 31 | convert_to_sgd_epoch: 100 32 | print_step: 200 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 15 37 | lr_decay_rate: 0.9 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 70 29 | convert_to_sgd_epoch: 100 30 | print_step: 1200 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 70 3 | print_step: 400 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.95 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment_speed_perturb.yaml: 
-------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 35->60 3 | print_step: 1200 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.925 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment_speed_perturb_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 35->60 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.925 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/spec_augment_speed_perturb_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 35->60 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.925 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/data/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 3 | print_step: 600 # 200->600 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.85 6 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/ami/s5b/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 50 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 10 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 34 | backward: false 35 | adaptive_softmax: false 36 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/ami_text_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015, Brno University of Technology (Author: Karel Vesely) 4 | # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski), 2014, Apache 2.0 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "Usage: $0 " 8 | echo " is download space." 
9 | exit 1; 10 | fi 11 | 12 | set -eux 13 | 14 | dir=$1 15 | mkdir -p $dir 16 | 17 | echo "Downloading annotations..." 18 | 19 | amiurl=http://groups.inf.ed.ac.uk/ami 20 | annotver=ami_public_manual_1.6.1 21 | annot="$dir/$annotver" 22 | 23 | logdir=${data}/local/downloads; mkdir -p $logdir/log 24 | [ ! -f $annot.zip ] && wget -nv -O $annot.zip $amiurl/AMICorpusAnnotations/$annotver.zip &> $logdir/log/download_ami_annot.log 25 | 26 | if [ ! -d $dir/annotations ]; then 27 | mkdir -p $dir/annotations 28 | unzip -o -d $dir/annotations $annot.zip &> /dev/null 29 | fi 30 | 31 | [ ! -f "$dir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-metadata.xml not found under $dir/annotations." && exit 1; 32 | 33 | 34 | # extract text from AMI XML annotations, 35 | local/ami_xml2text.sh $dir 36 | 37 | wdir=${data}/local/annotations 38 | [ ! -f $wdir/transcripts1 ] && echo "$0: File $wdir/transcripts1 not found." && exit 1; 39 | 40 | echo "Preprocessing transcripts..." 41 | local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log 42 | 43 | # make final train/dev/eval splits 44 | for dset in train eval dev; do 45 | grep -f local/split_$dset.orig $wdir/transcripts2 > $wdir/$dset.txt 46 | done 47 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/beamformit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) 4 | 5 | . ./path.sh 6 | 7 | nj=$1 8 | job=$2 9 | numch=$3 10 | meetings=$4 11 | sdir=$5 12 | odir=$6 13 | wdir=${data}/local/beamforming 14 | 15 | set -e 16 | set -u 17 | 18 | utils/split_scp.pl -j $nj $((job-1)) $meetings $meetings.$job 19 | 20 | while read line; do 21 | 22 | mkdir -p $odir/$line 23 | BeamformIt -s $line -c $wdir/channels_$numch \ 24 | --config_file `pwd`/conf/ami_beamformit.cfg \ 25 | --source_dir $sdir \ 26 | --result_dir $odir/$line 27 | mkdir -p $odir/$line 28 | mv $odir/$line/${line}.del $odir/$line/${line}_MDM$numch.del 29 | mv $odir/$line/${line}.del2 $odir/$line/${line}_MDM$numch.del2 30 | mv $odir/$line/${line}.info $odir/$line/${line}_MDM$numch.info 31 | mv $odir/$line/${line}.weat $odir/$line/${line}_MDM$numch.weat 32 | mv $odir/$line/${line}.wav $odir/$line/${line}_MDM$numch.wav 33 | #mv $odir/$line/${line}.ovl $odir/$line/${line}_MDM$numch.ovl # Was not created! 34 | 35 | done < $meetings.$job 36 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/english.glm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/examples/ami/s5b/local/english.glm -------------------------------------------------------------------------------- /examples/ami/s5b/local/split_REAMDE.txt: -------------------------------------------------------------------------------- 1 | The splits in this directory follow the official AMI Corpus Full-ASR split 2 | into train, dev and eval sets. 3 | 4 | If for some reason one needs to use a different split, the way to do so is 5 | to create split_*.final versions in this directory and run the recipe. 
6 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/split_dev.orig: -------------------------------------------------------------------------------- 1 | ES2011a 2 | ES2011b 3 | ES2011c 4 | ES2011d 5 | IB4001 6 | IB4002 7 | IB4003 8 | IB4004 9 | IB4010 10 | IB4011 11 | IS1008a 12 | IS1008b 13 | IS1008c 14 | IS1008d 15 | TS3004a 16 | TS3004b 17 | TS3004c 18 | TS3004d 19 | -------------------------------------------------------------------------------- /examples/ami/s5b/local/split_eval.orig: -------------------------------------------------------------------------------- 1 | EN2002a 2 | EN2002b 3 | EN2002c 4 | EN2002d 5 | ES2004a 6 | ES2004b 7 | ES2004c 8 | ES2004d 9 | IS1009a 10 | IS1009b 11 | IS1009c 12 | IS1009d 13 | TS3003a 14 | TS3003b 15 | TS3003c 16 | TS3003d 17 | -------------------------------------------------------------------------------- /examples/ami/s5b/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/ami/s5b/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/ami/s5b/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/ci_test/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 16 14 | enc_n_projs: 8 15 | enc_n_layers: 1 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 16 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 16 26 | dec_n_projs: 8 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 16 29 | emb_dim: 16 30 | tie_embedding: false 31 | ctc_fc_list: "8" 32 | ### optimization 33 | batch_size: 1 34 | optimizer: adam 35 | n_epochs: 4 36 | convert_to_sgd_epoch: 100 37 | print_step: 1 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 2 42 | lr_decay_rate: 0.85 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 2 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 2 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 2 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.1 54 | dropout_enc: 0.1 55 | dropout_dec: 0.1 56 | dropout_emb: 0.1 57 | dropout_att: 0.1 58 | weight_decay: 1e-6 59 | ss_prob: 0.1 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/blstm_transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 16 14 | enc_n_projs: 8 15 | enc_n_layers: 1 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: transformer 20 | dec_n_layers: 1 21 | transformer_dec_attn_type: scaled_dot 22 | transformer_dec_pe_type: 1dconv3L 23 | transformer_dec_d_model: 8 24 | transformer_dec_d_ff: 64 25 | transformer_dec_n_heads: 4 26 | tie_embedding: false 27 | ctc_fc_list: "8" 28 | ### optimization 29 | batch_size: 1 30 | optimizer: noam 31 | n_epochs: 4 32 | convert_to_sgd_epoch: 100 33 | print_step: 1 34 | metric: accuracy 35 | lr_factor: 5.0 36 | early_stop_patient_n_epochs: 2 37 | shuffle_bucket: true 38 | sort_stop_epoch: 100 39 | eval_start_epoch: 2 40 | warmup_n_steps: 2 41 | accum_grad_n_steps: 2 42 | ### regularization 43 | clip_grad_norm: 5.0 44 | dropout_in: 0.1 45 | dropout_enc: 0.1 46 | dropout_dec: 0.1 47 | dropout_emb: 
0.1 48 | dropout_att: 0.1 49 | weight_decay: 1e-6 50 | lsm_prob: 0.1 51 | ### MTL 52 | ctc_weight: 0.3 53 | ctc_lsm_prob: 0.1 54 | mtl_per_batch: false 55 | task_specific_layer: false 56 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/conformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(1,1)" 10 | subsample: "1_2" 11 | subsample_type: max_pool 12 | enc_type: conv_conformer 13 | conformer_kernel_size: 3 14 | enc_n_layers: 1 15 | transformer_enc_pe_type: relative ### 16 | transformer_enc_d_model: 8 17 | transformer_enc_d_ff: 32 18 | transformer_enc_n_heads: 4 19 | dec_type: transformer 20 | dec_n_layers: 1 21 | transformer_dec_attn_type: scaled_dot 22 | transformer_dec_pe_type: 1dconv3L 23 | transformer_dec_d_model: 8 24 | transformer_dec_d_ff: 32 25 | transformer_dec_n_heads: 4 26 | tie_embedding: false 27 | ctc_fc_list: "8" 28 | ### optimization 29 | batch_size: 1 30 | optimizer: noam 31 | n_epochs: 4 32 | convert_to_sgd_epoch: 100 33 | print_step: 1 34 | metric: accuracy 35 | lr_factor: 5.0 36 | early_stop_patient_n_epochs: 2 37 | shuffle_bucket: true 38 | sort_stop_epoch: 100 39 | eval_start_epoch: 2 40 | warmup_n_steps: 2 41 | accum_grad_n_steps: 2 42 | ### regularization 43 | clip_grad_norm: 5.0 44 | dropout_in: 0.1 45 | dropout_enc: 0.1 46 | dropout_dec: 0.1 47 | dropout_emb: 0.1 48 | dropout_att: 0.1 49 | weight_decay: 1e-6 50 | lsm_prob: 0.1 51 | ### MTL 52 | ctc_weight: 0.3 53 | ctc_lsm_prob: 0.1 54 | mtl_per_batch: false 55 | task_specific_layer: false 56 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/lcblstm_transducer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 16 14 | enc_n_projs: 8 15 | enc_n_layers: 1 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 16 21 | dec_n_projs: 8 22 | dec_n_layers: 1 23 | dec_bottleneck_dim: 16 24 | emb_dim: 16 25 | tie_embedding: false 26 | ctc_fc_list: "8" 27 | ### optimization 28 | batch_size: 1 29 | optimizer: adam 30 | n_epochs: 4 31 | convert_to_sgd_epoch: 100 32 | print_step: 1 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 2 37 | lr_decay_rate: 0.85 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 2 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 2 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 2 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.1 49 | dropout_enc: 0.1 50 | dropout_dec: 0.1 51 | dropout_emb: 0.1 52 | weight_decay: 1e-6 53 | lsm_prob: 0.1 54 | ### MTL 55 | ctc_weight: 0.3 56 | ctc_lsm_prob: 0.1 57 | mtl_per_batch: false 58 | task_specific_layer: false 59 | -------------------------------------------------------------------------------- 
/examples/ci_test/conf/asr/lstm_ctc.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 16 13 | enc_n_projs: 8 14 | enc_n_layers: 1 15 | subsample_type: drop 16 | dec_type: lstm 17 | ctc_fc_list: "8" 18 | ### optimization 19 | batch_size: 1 20 | optimizer: adam 21 | n_epochs: 4 22 | convert_to_sgd_epoch: 100 23 | print_step: 1 24 | metric: edit_distance 25 | lr: 1e-3 26 | lr_decay_type: always 27 | lr_decay_start_epoch: 2 28 | lr_decay_rate: 0.85 29 | lr_decay_patient_n_epochs: 0 30 | early_stop_patient_n_epochs: 2 31 | sort_stop_epoch: 100 32 | eval_start_epoch: 2 33 | warmup_start_lr: 1e-4 34 | warmup_n_steps: 2 35 | ### initialization 36 | param_init: 0.1 37 | ### regularization 38 | clip_grad_norm: 5.0 39 | dropout_in: 0.1 40 | dropout_enc: 0.1 41 | weight_decay: 1e-6 42 | ### MTL 43 | ctc_weight: 1.0 44 | ctc_lsm_prob: 0.1 45 | mtl_per_batch: false 46 | task_specific_layer: false 47 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/tds_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "3_3_5_5_5_7_7_7_7_7_7" 7 | conv_kernel_sizes: "(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)_(3,1)" 8 | enc_type: tds 9 | attn_type: location 10 | attn_conv_n_channels: 10 11 | attn_conv_width: 201 12 | attn_dim: 16 13 | attn_n_heads: 1 14 | dec_type: lstm 15 | dec_n_units: 16 16 | dec_n_projs: 8 17 | dec_n_layers: 1 18 | dec_bottleneck_dim: 16 19 | emb_dim: 16 20 | tie_embedding: false 21 | ctc_fc_list: "8" 22 | ### optimization 23 | batch_size: 1 24 | optimizer: adam 25 | n_epochs: 4 26 | convert_to_sgd_epoch: 100 27 | print_step: 1 28 | metric: edit_distance 29 | lr: 1e-3 30 | lr_decay_type: always 31 | lr_decay_start_epoch: 2 32 | lr_decay_rate: 0.85 33 | lr_decay_patient_n_epochs: 0 34 | early_stop_patient_n_epochs: 2 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 2 37 | warmup_start_lr: 1e-4 38 | warmup_n_steps: 2 39 | ### initialization 40 | param_init: 0.1 41 | ### regularization 42 | clip_grad_norm: 5.0 43 | dropout_in: 0.1 44 | dropout_enc: 0.1 45 | dropout_dec: 0.1 46 | dropout_emb: 0.1 47 | dropout_att: 0.1 48 | weight_decay: 1e-6 49 | ss_prob: 0.1 50 | lsm_prob: 0.1 51 | ### MTL 52 | ctc_weight: 0.3 53 | ctc_lsm_prob: 0.1 54 | mtl_per_batch: false 55 | task_specific_layer: false 56 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(1,1)" 10 | subsample: "1_2" 11 | subsample_type: max_pool 12 | enc_type: conv_transformer 13 | enc_n_layers: 1 14 | transformer_enc_pe_type: none ### 15 | transformer_enc_d_model: 8 16 | transformer_enc_d_ff: 32 17 | transformer_enc_n_heads: 4 18 | dec_type: transformer 19 | dec_n_layers: 1 20 | transformer_dec_attn_type: scaled_dot 21 | 
transformer_dec_pe_type: 1dconv3L 22 | transformer_dec_d_model: 8 23 | transformer_dec_d_ff: 32 24 | transformer_dec_n_heads: 4 25 | tie_embedding: false 26 | ctc_fc_list: "8" 27 | ### optimization 28 | batch_size: 1 29 | optimizer: noam 30 | n_epochs: 4 31 | convert_to_sgd_epoch: 100 32 | print_step: 1 33 | metric: accuracy 34 | lr_factor: 5.0 35 | early_stop_patient_n_epochs: 2 36 | shuffle_bucket: true 37 | sort_stop_epoch: 100 38 | eval_start_epoch: 2 39 | warmup_n_steps: 2 40 | accum_grad_n_steps: 2 41 | ### regularization 42 | clip_grad_norm: 5.0 43 | dropout_in: 0.1 44 | dropout_enc: 0.1 45 | dropout_dec: 0.1 46 | dropout_emb: 0.1 47 | dropout_att: 0.1 48 | weight_decay: 1e-6 49 | lsm_prob: 0.1 50 | ### MTL 51 | ctc_weight: 0.3 52 | ctc_lsm_prob: 0.1 53 | mtl_per_batch: false 54 | task_specific_layer: false 55 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer_2mtl.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 2 12 | enc_n_layers_sub1: 1 13 | transformer_enc_pe_type: none ### 14 | transformer_enc_d_model: 8 15 | transformer_enc_d_ff: 32 16 | transformer_enc_n_heads: 4 17 | dec_type: transformer 18 | dec_n_layers: 2 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_pe_type: 1dconv3L 21 | transformer_dec_d_model: 8 22 | transformer_dec_d_ff: 32 23 | transformer_dec_n_heads: 4 24 | tie_embedding: false 25 | ctc_fc_list: "8" 26 | dec_config_sub1: 27 | dec_type: transformer 28 | dec_n_layers: 2 29 | ctc_fc_list: "8" 30 | ### optimization 31 | batch_size: 1 32 | optimizer: noam 33 | n_epochs: 4 34 | convert_to_sgd_epoch: 100 35 | print_step: 1 36 | metric: accuracy 37 | lr_factor: 5.0 38 | early_stop_patient_n_epochs: 2 39 | shuffle_bucket: true 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 2 42 | warmup_n_steps: 2 43 | accum_grad_n_steps: 2 44 | ### regularization 45 | clip_grad_norm: 5.0 46 | dropout_in: 0.1 47 | dropout_enc: 0.1 48 | dropout_dec: 0.1 49 | dropout_emb: 0.1 50 | dropout_att: 0.1 51 | weight_decay: 1e-6 52 | lsm_prob: 0.1 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_weight_sub1: 0.1 56 | ctc_lsm_prob: 0.1 57 | sub1_weight: 0.2 58 | mtl_per_batch: false 59 | task_specific_layer: true 60 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer_ctc.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 1 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 8 14 | transformer_enc_d_ff: 32 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | ctc_fc_list: "8" 18 | ### optimization 19 | batch_size: 1 20 | optimizer: noam 21 | n_epochs: 4 22 | convert_to_sgd_epoch: 100 23 | print_step: 1 24 | metric: edit_distance 25 | lr_factor: 5.0 26 | early_stop_patient_n_epochs: 2 27 | shuffle_bucket: true 28 | sort_stop_epoch: 100 29 | eval_start_epoch: 2 30 | warmup_n_steps: 2 31 | accum_grad_n_steps: 2 32 
| ### regularization 33 | clip_grad_norm: 5.0 34 | dropout_in: 0.1 35 | dropout_enc: 0.1 36 | dropout_dec: 0.1 37 | dropout_emb: 0.1 38 | dropout_att: 0.1 39 | weight_decay: 1e-6 40 | lsm_prob: 0.1 41 | ### MTL 42 | ctc_weight: 1.0 43 | ctc_lsm_prob: 0.1 44 | mtl_per_batch: false 45 | task_specific_layer: false 46 | -------------------------------------------------------------------------------- /examples/ci_test/conf/asr/transformer_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 1 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 8 14 | transformer_enc_d_ff: 32 15 | transformer_enc_n_heads: 4 16 | attn_type: location 17 | attn_conv_n_channels: 10 18 | attn_conv_width: 201 19 | attn_dim: 16 20 | attn_n_heads: 1 21 | dec_type: lstm 22 | dec_n_units: 16 23 | dec_n_projs: 8 24 | dec_n_layers: 1 25 | dec_bottleneck_dim: 16 26 | emb_dim: 16 27 | tie_embedding: false 28 | ctc_fc_list: "8" 29 | ### optimization 30 | batch_size: 16000 31 | batch_size_type: frame 32 | optimizer: noam 33 | n_epochs: 4 34 | convert_to_sgd_epoch: 100 35 | print_step: 1 36 | metric: accuracy 37 | lr_factor: 5.0 38 | early_stop_patient_n_epochs: 2 39 | shuffle_bucket: true 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 2 42 | warmup_n_steps: 2 43 | accum_grad_n_steps: 2 44 | ### regularization 45 | clip_grad_norm: 5.0 46 | dropout_in: 0.1 47 | dropout_enc: 0.1 48 | dropout_dec: 0.1 49 | dropout_emb: 0.1 50 | dropout_att: 0.1 51 | weight_decay: 1e-6 52 | ss_prob: 0.1 53 | lsm_prob: 0.1 54 | ### MTL 55 | ctc_weight: 0.3 56 | ctc_lsm_prob: 0.1 57 | mtl_per_batch: false 58 | task_specific_layer: false 59 | -------------------------------------------------------------------------------- /examples/ci_test/conf/data/adaptive_spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # mask 2 | freq_width: 27 3 | n_freq_masks: 2 4 | time_width_upper: 1.0 5 | 6 | adaptive_number_ratio: 0.04 7 | adaptive_size_ratio: 0.04 8 | max_n_time_masks: 20 9 | -------------------------------------------------------------------------------- /examples/ci_test/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # mask 2 | freq_width: 27 3 | n_freq_masks: 2 4 | time_width: 100 5 | n_time_masks: 2 6 | time_width_upper: 1.0 7 | -------------------------------------------------------------------------------- /examples/ci_test/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/ci_test/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 16 4 | n_projs: 8 5 | n_layers: 2 6 | emb_dim: 16 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 1 13 | bptt: 10 14 | optimizer: adam 15 | n_epochs: 4 16 | 
convert_to_sgd_epoch: 100 17 | print_step: 1 18 | lr: 1e-3 19 | lr_decay_start_epoch: 2 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 2 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.1 30 | dropout_hidden: 0.1 31 | dropout_out: 0.1 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/ci_test/conf/lm/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 2 4 | transformer_d_model: 8 5 | transformer_d_ff: 32 6 | transformer_n_heads: 4 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 1 10 | bptt: 10 11 | mem_len: 10 12 | optimizer: noam 13 | n_epochs: 4 14 | convert_to_sgd_epoch: 100 15 | print_step: 1 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 2 18 | eval_start_epoch: 1 19 | warmup_n_steps: 2 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.1 25 | dropout_out: 0.1 26 | dropout_att: 0.1 27 | dropout_layer: 0.1 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/ci_test/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 2 4 | transformer_pe_type: add 5 | transformer_d_model: 8 6 | transformer_d_ff: 32 7 | transformer_n_heads: 4 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 1 11 | bptt: 10 12 | optimizer: noam 13 | n_epochs: 4 14 | convert_to_sgd_epoch: 100 15 | print_step: 1 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 2 18 | eval_start_epoch: 1 19 | warmup_n_steps: 2 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.1 25 | dropout_out: 0.1 26 | dropout_att: 0.1 27 | dropout_layer: 0.1 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/ci_test/ctc_forced_align.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | model= 7 | gpu= 8 | stdout=false 9 | n_threads=1 10 | eval_set="train" 11 | cmd_coverage="coverage run -a" 12 | 13 | ### path to save preprocessed data 14 | data=./data 15 | 16 | batch_size=1 17 | n_average=2 # for Transformer 18 | 19 | . ./cmd.sh 20 | . ./path.sh 21 | .
utils/parse_options.sh 22 | 23 | set -e 24 | set -u 25 | set -o pipefail 26 | 27 | if [ -z ${gpu} ]; then 28 | # CPU 29 | n_gpus=0 30 | export OMP_NUM_THREADS=${n_threads} 31 | else 32 | n_gpus=$(echo ${gpu} | tr "," "\n" | wc -l) 33 | fi 34 | 35 | for set in ${eval_set}; do 36 | recog_dir=$(dirname ${model})/align_${set} 37 | if [ ${n_average} != 1 ]; then 38 | recog_dir=${recog_dir}_average${n_average} 39 | fi 40 | mkdir -p ${recog_dir} 41 | 42 | CUDA_VISIBLE_DEVICES=${gpu} ${cmd_coverage} ${NEURALSP_ROOT}/neural_sp/bin/asr/ctc_forced_align.py \ 43 | --recog_n_gpus ${n_gpus} \ 44 | --recog_sets ${data}/dataset/${set}_char.tsv \ 45 | --recog_dir ${recog_dir} \ 46 | --recog_model ${model} \ 47 | --recog_batch_size ${batch_size} \ 48 | --recog_n_average ${n_average} \ 49 | --recog_stdout ${stdout} || exit 1; 50 | done 51 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/spk2utt: -------------------------------------------------------------------------------- 1 | LDC93S1 LDC93S1-1 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/text: -------------------------------------------------------------------------------- 1 | LDC93S1-1 she had your dark suit in greasy wash water all year 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/text.phone: -------------------------------------------------------------------------------- 1 | LDC93S1-1 h# sh ix hv eh dcl jh ih dcl d ah kcl k s ux q en gcl g r ix s ix w ao sh epi w ao dx axr ao l y ih axr h# 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/utt2spk: -------------------------------------------------------------------------------- 1 | LDC93S1-1 LDC93S1 2 | -------------------------------------------------------------------------------- /examples/ci_test/data/train/wav.scp: -------------------------------------------------------------------------------- 1 | LDC93S1-1 cat ./sample/LDC93S1.wav | 2 | -------------------------------------------------------------------------------- /examples/ci_test/local/download_sample.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # install TIMIT samples (publicly available) 4 | mkdir -p $(pwd)/sample 5 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.phn 6 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.txt 7 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.wav 8 | wget --no-check-certificate -P $(pwd)/sample https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.wrd 9 | -------------------------------------------------------------------------------- /examples/ci_test/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" 
&& exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/ci_test/plot_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | model= 7 | gpu= 8 | stdout=false 9 | n_threads=1 10 | eval_set="train" 11 | cmd_coverage="coverage run -a" 12 | 13 | ### path to save preprocessed data 14 | data=./data 15 | 16 | batch_size=1 17 | n_average=2 # for Transformer 18 | 19 | . ./cmd.sh 20 | . ./path.sh 21 | . utils/parse_options.sh 22 | 23 | set -e 24 | set -u 25 | set -o pipefail 26 | 27 | if [ -z ${gpu} ]; then 28 | # CPU 29 | n_gpus=0 30 | export OMP_NUM_THREADS=${n_threads} 31 | else 32 | n_gpus=$(echo ${gpu} | tr "," "\n" | wc -l) 33 | fi 34 | 35 | for set in ${eval_set}; do 36 | recog_dir=$(dirname ${model})/plot_${set} 37 | if [ ${n_average} != 1 ]; then 38 | recog_dir=${recog_dir}_average${n_average} 39 | fi 40 | mkdir -p ${recog_dir} 41 | 42 | CUDA_VISIBLE_DEVICES=${gpu} ${cmd_coverage} ${NEURALSP_ROOT}/neural_sp/bin/asr/plot_ctc.py \ 43 | --recog_n_gpus ${n_gpus} \ 44 | --recog_sets ${data}/dataset/${set}_char.tsv \ 45 | --recog_dir ${recog_dir} \ 46 | --recog_model ${model} \ 47 | --recog_batch_size ${batch_size} \ 48 | --recog_n_average ${n_average} \ 49 | --recog_stdout ${stdout} || exit 1; 50 | done 51 | -------------------------------------------------------------------------------- /examples/ci_test/steps: -------------------------------------------------------------------------------- 1 | ../wsj/s5/steps -------------------------------------------------------------------------------- /examples/ci_test/utils: -------------------------------------------------------------------------------- 1 | ../wsj/s5/utils -------------------------------------------------------------------------------- /examples/csj/README.txt: -------------------------------------------------------------------------------- 1 | About the Corpus of Spontaneous Japanese: 2 | The Corpus of Spontaneous Japanese (CSJ) is a database of spoken 3 | Japanese developed by Japan's national priority area research 4 | project "Spontaneous Speech: Corpus and Processing Technology". 5 | It contains about 650 hours of speech consisting of approximately 6 | 7.5 million words that were provided by more than 1,400 speakers. 7 | For more details about the corpus, please visit the website of the 8 | National Institute for Japanese Language (NINJAL). It is available 9 | from the Institute. 10 | http://www.ninjal.ac.jp/english/products/csj/ 11 | http://pj.ninjal.ac.jp/corpus_center/csj/ 12 | 13 | Meta-parameter tuning based on evolution strategy: 14 | The meta-parameters of the system contained in conf/config_opt were 15 | automatically tuned using an evolution strategy.
For details, 16 | please refer to the following paper: 17 | Takafumi Moriya, Tomohiro Tanaka, Takahiro Shinozaki, Shinji Watanabe, 18 | and Kevin Duh, "Automation of System Building for State-of-the-art 19 | Large Vocabulary Speech Recognition Using Evolution Strategy," Proc. 20 | IEEE 2015 Automatic Speech Recognition and Understanding Workshop 21 | (ASRU), 2015. 22 | 23 | 24 | Each subdirectory of this directory contains the 25 | scripts for a sequence of experiments. 26 | s5: This is the current recommended recipe. 27 | The recipe supports the third and fourth editions of CSJ. -------------------------------------------------------------------------------- /examples/csj/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/las/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 512 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 1024 26 | dec_n_projs: 0 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 1024 ### this is effective 29 | emb_dim: 512 30 | tie_embedding: false 31 | ctc_fc_list: "512" 32 | ### optimization 33 | batch_size: 30 34 | optimizer: adam 35 | n_epochs: 25 36 | convert_to_sgd_epoch: 100 37 | print_step: 800 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 10 42 | lr_decay_rate: 0.85 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 5 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 1 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 0 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.0 54 | dropout_enc: 0.4 55 | dropout_dec: 0.4
56 | dropout_emb: 0.4 57 | dropout_att: 0.0 58 | weight_decay: 1e-6 59 | ss_prob: 0.2 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/las/lcblstm_las_chunk4040.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | attn_type: location 20 | attn_conv_n_channels: 10 21 | attn_conv_width: 201 22 | attn_dim: 512 23 | attn_n_heads: 1 24 | dec_type: lstm 25 | dec_n_units: 1024 26 | dec_n_projs: 0 27 | dec_n_layers: 1 28 | dec_bottleneck_dim: 1024 ### this is effective 29 | emb_dim: 512 30 | tie_embedding: false 31 | ctc_fc_list: "512" 32 | ### optimization 33 | batch_size: 20 34 | optimizer: adam 35 | n_epochs: 25 36 | convert_to_sgd_epoch: 100 37 | print_step: 800 38 | metric: edit_distance 39 | lr: 1e-3 40 | lr_decay_type: always 41 | lr_decay_start_epoch: 10 42 | lr_decay_rate: 0.85 43 | lr_decay_patient_n_epochs: 0 44 | early_stop_patient_n_epochs: 5 45 | sort_stop_epoch: 100 46 | eval_start_epoch: 1 47 | warmup_start_lr: 1e-4 48 | warmup_n_steps: 0 49 | ### initialization 50 | param_init: 0.1 51 | ### regularization 52 | clip_grad_norm: 5.0 53 | dropout_in: 0.0 54 | dropout_enc: 0.4 55 | dropout_dec: 0.4 56 | dropout_emb: 0.4 57 | dropout_att: 0.0 58 | weight_decay: 1e-6 59 | ss_prob: 0.2 60 | lsm_prob: 0.1 61 | ### MTL 62 | ctc_weight: 0.3 63 | ctc_lsm_prob: 0.1 64 | mtl_per_batch: false 65 | task_specific_layer: false 66 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/las/lstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | attn_type: location 17 | attn_conv_n_channels: 10 18 | attn_conv_width: 201 19 | attn_dim: 512 20 | attn_n_heads: 1 21 | dec_type: lstm 22 | dec_n_units: 1024 23 | dec_n_projs: 0 24 | dec_n_layers: 1 25 | dec_bottleneck_dim: 1024 ### this is effective 26 | emb_dim: 512 27 | tie_embedding: false 28 | ctc_fc_list: "512" 29 | ### optimization 30 | batch_size: 30 31 | optimizer: adam 32 | n_epochs: 25 33 | convert_to_sgd_epoch: 100 34 | print_step: 800 35 | metric: edit_distance 36 | lr: 1e-3 37 | lr_decay_type: always 38 | lr_decay_start_epoch: 10 39 | lr_decay_rate: 0.85 40 | lr_decay_patient_n_epochs: 0 41 | early_stop_patient_n_epochs: 5 42 | sort_stop_epoch: 100 43 | eval_start_epoch: 1 44 | warmup_start_lr: 1e-4 45 | warmup_n_steps: 0 46 | ### initialization 47 | param_init: 0.1 48 | ### regularization 49 | clip_grad_norm: 5.0 50 | dropout_in: 0.0 51 | dropout_enc: 0.4 52 | dropout_dec: 0.4 53 | dropout_emb: 0.4 54 | 
dropout_att: 0.0 55 | weight_decay: 1e-6 56 | ss_prob: 0.2 57 | lsm_prob: 0.1 58 | ### MTL 59 | ctc_weight: 0.3 60 | ctc_lsm_prob: 0.1 61 | mtl_per_batch: false 62 | task_specific_layer: false 63 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/asr/transformer/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 40 29 | convert_to_sgd_epoch: 100 30 | print_step: 400 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 25->20 3 | print_step: 800 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 3 | print_step: 800 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_pretrain_F13_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1600 # 800->1600 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 13 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1600 # 800->1600 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # 
mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1600 # 800->1600 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 2400 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/data/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 3 | print_step: 1200 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 64 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 30 16 | convert_to_sgd_epoch: 30 17 | print_step: 50 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/lm/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 12 4 | transformer_d_model: 512 5 | transformer_d_ff: 2048 6 | transformer_n_heads: 8 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 24 10 | bptt: 200 11 | mem_len: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 4 ### 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | 
dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/csj/s5/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 12 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/csj/s5/local/csj_prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Making dictionary using CSJ data with morpheme analysis. 4 | # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013) 5 | 6 | # To be run from one directory above this script. 7 | 8 | . ./path.sh 9 | 10 | #check existing directories 11 | [ $# != 0 ] && echo "Usage: local/csj_data_prep.sh" && exit 1; 12 | 13 | srcdir=${data}/local/train_${datasize} 14 | dir=${data}/local/dict_nosp 15 | mkdir -p $dir 16 | srcdict=$srcdir/lexicon.txt 17 | 18 | # assume csj_data_prep.sh was done already. 19 | [ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1; 20 | 21 | #(2a) Dictionary preparation: 22 | # Pre-processing (Upper-case, remove comments) 23 | cat $srcdict > $dir/lexicon1.txt || exit 1; 24 | 25 | cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ 26 | grep -v sp > $dir/nonsilence_phones.txt || exit 1; 27 | 28 | #( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt 29 | ( echo sp ; echo spn ; ) > $dir/silence_phones.txt 30 | 31 | echo sp > $dir/optional_silence.txt 32 | 33 | # No "extra questions" in the input to this setup, as we don't 34 | # have stress or tone. 35 | echo -n >$dir/extra_questions.txt 36 | 37 | # Add to the lexicon the silences, noises etc. 38 | ( echo ' sp' ; echo ' spn'; ) | cat - $dir/lexicon1.txt > $dir/lexicon2.txt || exit 1; 39 | 40 | 41 | pushd $dir >&/dev/null 42 | ln -sf lexicon2.txt lexicon.txt 43 | popd >&/dev/null 44 | 45 | echo Prepared input dictionary and phone-sets for CSJ phase 1. 46 | -------------------------------------------------------------------------------- /examples/csj/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/csj/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/csj/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/laborotv/s5/README.md: -------------------------------------------------------------------------------- 1 | #### Conformer LAS large + SpecAugment 2 | - conf: `conf/asr/conformer_kernel15_clamp10_hie_subsample8_las_ln_large.yaml` 3 | - decoding parameters 4 | - epoch 40 5 | - beam width: 10 6 | - lm_weight: 0.0 7 | - length norm: true 8 | 9 | ##### WER 10 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 11 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 12 | |dev_4k|4000|57637|93.6|4.7|1.7|3.2|**9.7**|48.6| 13 | |dev|12000|153743|91.5|6.4|2.0|4.0|**12.5**|53.5| 14 | 15 | ##### CER 16 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 17 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 18 | |dev_4k|4000|101224|95.3|3.0|1.7|3.1|**7.8**|46.2| 19 | |dev|12000|273004|93.8|4.0|2.2|3.9|**10.1**|50.9| 20 | |tedx-jp-10k|10000|191708|90.2|5.0|4.8|2.6|**12.4**|64.8| 21 | -------------------------------------------------------------------------------- /examples/laborotv/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/laborotv/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/laborotv/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 64 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 200 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/laborotv/s5/local/laborotv_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Data preparation for LaboroTVSpeech 4 | 5 | . ./path.sh 6 | set -e # exit on error 7 | 8 | if [[ $# -ne 1 ]]; then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | CORPUS_DIR=$1 14 | 15 | # Data 16 | for x in train dev; do 17 | echo "$0: Making data/${x} ..." 18 | mkdir -p ${data}/${x} 19 | perl -pe 's/,/ /' ${CORPUS_DIR}/data/${x}/text.csv >${data}/${x}/text 20 | cut -d',' -f1 ${CORPUS_DIR}/data/${x}/text.csv | 21 | awk -v dir=${CORPUS_DIR}/data/${x}/wav/ "{print dir\$1\".wav\"}" | 22 | sort | 23 | perl -pe 's,(.*/)([^/]*)(\.wav),\2 \1\2\3,g' \ 24 | >${data}/${x}/wav.scp 25 | 26 | # Make a dumb utt2spk and spk2utt, 27 | # where each utterance corresponds to a unique speaker. 28 | awk '{print $1,$1_spk}' ${data}/${x}/text >${data}/${x}/utt2spk 29 | utils/utt2spk_to_spk2utt.pl ${data}/${x}/utt2spk >${data}/${x}/spk2utt 30 | 31 | utils/data/get_utt2dur.sh ${data}/${x} 32 | 33 | utils/fix_data_dir.sh ${data}/${x} 34 | utils/validate_data_dir.sh --no-feats ${data}/${x} 35 | done 36 | 37 | echo "$0: done preparing data directories" 38 | -------------------------------------------------------------------------------- /examples/laborotv/s5/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Prepare dict_nosp/ from lexicon.txt 4 | # This is a simplified version of egs/csj/s5/local/csj_prepare_dict.sh 5 | 6 | . 
./path.sh 7 | set -e # exit on error 8 | 9 | if [[ $# -ne 2 ]]; then 10 | echo "Usage: $0 " 11 | exit 1 12 | fi 13 | 14 | lexicon=$1 15 | dir=$2 16 | 17 | mkdir -p $dir 18 | 19 | cat $lexicon | 20 | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | 21 | grep -v sp >$dir/nonsilence_phones.txt || exit 1 22 | 23 | ( 24 | echo sp 25 | echo spn 26 | ) >$dir/silence_phones.txt 27 | 28 | echo sp >$dir/optional_silence.txt 29 | 30 | # No "extra questions" in the input to this setup, as we don't 31 | # have stress or tone. 32 | echo -n >$dir/extra_questions.txt 33 | 34 | # Add to the lexicon the silences, noises etc. 35 | ( 36 | echo ' sp' 37 | echo ' spn' 38 | ) | cat - $lexicon >$dir/lexicon.txt || exit 1 39 | 40 | sort $dir/lexicon.txt -uo $dir/lexicon.txt 41 | 42 | echo "$0: Done preparing $dir" 43 | -------------------------------------------------------------------------------- /examples/laborotv/s5/local/remove_pos.py: -------------------------------------------------------------------------------- 1 | ../../../csj/s5/local/remove_pos.py -------------------------------------------------------------------------------- /examples/laborotv/s5/local/tedx-jp-10k_data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Build the TEDxJP-10K dataset 5 | 6 | . ./path.sh 7 | set -e # exit on error 8 | 9 | #videos_csv="local/tedx-jp/tedx-jp-10k.csv" 10 | #all_to_10k_utt_map="local/tedx-jp/all_to_10k_utt_map.txt" 11 | 12 | . utils/parse_options.sh 13 | 14 | if [[ $# -ne 1 ]]; then 15 | echo "Usage: $0 " 16 | echo "This script does preprocessing of the TEDx-JP-10K dataset." 17 | echo " should contain segments, spk2utt, text, utt2spk, wavlist.txt, and wav/." 18 | exit 1 19 | fi 20 | 21 | RAW_DATA_DIR=$1 22 | dst_dir="${data}/tedx-jp-10k" 23 | 24 | mkdir -p ${dst_dir} 25 | 26 | # Copy necessary files to data directory 27 | echo "$0: Copying segments, spk2utt, text and utt2spk to $dst_dir." 28 | cp ${RAW_DATA_DIR}/{segments,spk2utt,text,utt2spk} ${dst_dir} 29 | 30 | echo "$0: Creating wav.scp from wavlist.txt" 31 | rm -f ${dst_dir}/wav.scp 32 | touch ${dst_dir}/wav.scp 33 | while read line; do 34 | id=$(cut -d' ' -f 1 <<<${line}) 35 | filepath=${RAW_DATA_DIR}/wav/$(cut -d' ' -f 2 <<<${line}) 36 | echo "${id} sox \"${filepath}\" -c 1 -r 16000 -t wav - |" >> ${dst_dir}/wav.scp 37 | done < ${RAW_DATA_DIR}/wavlist.txt 38 | utils/data/validate_data_dir.sh --no-feats ${dst_dir} 39 | 40 | echo "$0: Done preprocessing TEDxJP-10K dataset (${dst_dir})" 41 | -------------------------------------------------------------------------------- /examples/laborotv/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/laborotv/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/laborotv/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/language_model/ptb/RESULTS: -------------------------------------------------------------------------------- 1 | lstm1024H0P2L_emb1024_adam_lr0.001_bs20_bptt30_tie_residual_glu_1tokens 2 | % PPL 3 | valid (baseline): 87.99 4 | valid (cache size: 100): 79.58 5 | valid (cache size: 500): 77.36 6 | test (baseline): 86.06 7 | test (cache size: 100): 79.12 8 | test (cache size: 500): 76.94 9 | -------------------------------------------------------------------------------- /examples/language_model/ptb/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/language_model/ptb/conf/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | tie_embedding: true 8 | residual: true 9 | use_glu: true 10 | # optimization 11 | batch_size: 20 12 | bptt: 30 13 | optimizer: adam 14 | n_epochs: 50 15 | convert_to_sgd_epoch: 50 16 | print_step: 100 17 | lr: 1e-3 18 | lr_decay_start_epoch: 10 19 | lr_decay_rate: 0.9 20 | lr_decay_patient_n_epochs: 0 21 | lr_decay_type: always 22 | early_stop_patient_n_epochs: 10 23 | eval_start_epoch: 1 24 | # initialization 25 | param_init: 0.05 26 | # regularization 27 | # clip_grad_norm: 0.1 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.65 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | -------------------------------------------------------------------------------- /examples/language_model/ptb/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/language_model/ptb/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/language_model/ptb/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/language_model/wikitext2/RESULTS: -------------------------------------------------------------------------------- 1 | lstm1024H0P2L_emb1024_adam_lr0.001_bs20_bptt30_tie_residual_glu_1tokens 2 | % PPL 3 | valid (baseline): 104.53 4 | valid (cache size: 100): 90.86 5 | valid (cache size: 2000): 76.10 6 | test (baseline): 98.73 7 | test (cache size: 100): 85.87 8 | test (cache size: 2000): 72.77 9 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/gcnn.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: gated_conv_custom 3 | kernel_size: 4 4 | n_units: 1024 5 | n_projs: 512 6 | n_layers: 6 7 | emb_dim: 300 8 | tie_embedding: false 9 | # optimization 10 | batch_size: 50 11 | bptt: 200 12 | optimizer: nesterov 13 | n_epochs: 100 14 | convert_to_sgd_epoch: 100 15 | print_step: 100 16 | lr: 2.0 17 | lr_decay_start_epoch: 10 18 | lr_decay_rate: 0.75 19 | lr_decay_patient_n_epochs: 0 20 | # lr_decay_type: epoch 21 | lr_decay_type: metric 22 | early_stop_patient_n_epochs: 20 23 | eval_start_epoch: 1 24 | # initialization 25 | param_init: 0.05 26 | # regularization 27 | clip_grad_norm: 0.1 28 | dropout_in: 0.2 29 | dropout_hidden: 0.5 30 | dropout_out: 0.0 31 | weight_decay: 1e-6 32 | adaptive_softmax: true 33 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | tie_embedding: true 8 | residual: true 9 | use_glu: true 10 | # optimization 11 | batch_size: 20 12 | bptt: 30 13 | optimizer: adam 14 | n_epochs: 50 15 | convert_to_sgd_epoch: 100 16 | print_step: 200 17 | lr: 1e-3 18 | lr_decay_start_epoch: 10 19 | lr_decay_rate: 0.9 20 | lr_decay_patient_n_epochs: 0 21 | lr_decay_type: always 22 | early_stop_patient_n_epochs: 10 23 | eval_start_epoch: 1 24 | # initialization 25 | param_init: 0.05 26 | # regularization 27 | clip_grad_norm: 0.1 28 | dropout_in: 0.2 29 | dropout_hidden: 0.5 30 | dropout_out: 0.0 31 | weight_decay: 1e-6 32 | lsm_prob: 0.0 33 | adaptive_softmax: false 34 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 12 4 | transformer_d_model: 512 5 | transformer_d_ff: 2048 6 | transformer_n_heads: 8 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 24 10 | bptt: 200 11 | mem_len: 200 12 | optimizer: noam 13 | n_epochs: 50 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 4 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/conf/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 6 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | 
transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 50 14 | convert_to_sgd_epoch: 100 15 | print_step: 200 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.0 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/language_model/wikitext2/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/language_model/wikitext2/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/librispeech/README.txt: -------------------------------------------------------------------------------- 1 | 2 | The LibriSpeech corpus is a large (1000 hour) corpus of English read speech 3 | derived from audiobooks in the LibriVox project, sampled at 16kHz. The 4 | accents are various and not marked, but the majority are US English. It is 5 | available for download for free at http://www.openslr.org/12/. It was prepared 6 | as a speech recognition corpus by Vassil Panayotov. 7 | 8 | The recipe is in s5/ 9 | 10 | -------------------------------------------------------------------------------- /examples/librispeech/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. 
Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transducer/blstm_transducer_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 15 29 | optimizer: adam 30 | n_epochs: 30 31 | convert_to_sgd_epoch: 100 32 | print_step: 1000 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.85 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 0 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transducer/lcblstm_rnnt_chunk4040_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 30 31 | convert_to_sgd_epoch: 100 32 | print_step: 1000 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | 
lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.85 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 0 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transducer/lstm_rnnt_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | dec_type: lstm_transducer 17 | dec_n_units: 1024 18 | dec_n_projs: 0 19 | dec_n_layers: 2 20 | dec_bottleneck_dim: 512 21 | emb_dim: 512 22 | tie_embedding: false 23 | ctc_fc_list: "512" 24 | ### optimization 25 | batch_size: 15 26 | optimizer: adam 27 | n_epochs: 35 # 20->35 28 | convert_to_sgd_epoch: 100 29 | print_step: 1000 30 | metric: edit_distance 31 | lr: 1e-3 32 | lr_decay_type: always 33 | lr_decay_start_epoch: 10 34 | lr_decay_rate: 0.85 ### 0.8->0.85 35 | lr_decay_patient_n_epochs: 0 36 | early_stop_patient_n_epochs: 5 37 | sort_stop_epoch: 100 38 | eval_start_epoch: 1 39 | warmup_start_lr: 1e-4 40 | warmup_n_steps: 4000 ### this is important 41 | ### initialization 42 | param_init: 0.1 43 | ### regularization 44 | clip_grad_norm: 5.0 45 | dropout_in: 0.0 46 | dropout_enc: 0.4 47 | dropout_dec: 0.4 48 | dropout_emb: 0.4 49 | weight_decay: 1e-6 50 | ### MTL 51 | ctc_weight: 0.3 52 | ctc_lsm_prob: 0.1 53 | mtl_per_batch: false 54 | task_specific_layer: false 55 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 
44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer_512dmodel_8H.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 512 ### 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 8 ### 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 512 ### 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 8 ### 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer_768dmodel_3072dff_8H.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 768 ### 14 | transformer_enc_d_ff: 3072 ### 15 | transformer_enc_n_heads: 8 ### 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 768 ### 21 | transformer_dec_d_ff: 3072 ### 22 | transformer_dec_n_heads: 8 ### 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | 
dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/asr/transformer/transformer_subsample8.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: none ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 30->20 3 | print_step: 2000 # 1000->2000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 60 # 30->60 3 | print_step: 2000 # 1000->2000 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_pretrain_F13_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 4000 # 2000->4000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 13 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- 
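Note on the SpecAugment fields that recur in the data configs above and below (freq_width, n_freq_masks, time_width, n_time_masks, time_width_upper): the sketch that follows shows one plausible reading of how these values drive the masking, assuming the standard SpecAugment semantics (freq_width = max mel bins per frequency mask, time_width = max frames per time mask, time_width_upper = cap on mask length as a fraction of the utterance). It is an illustration only, not neural_sp's actual implementation, and the interpretation of time_width_upper is an assumption.

# Hypothetical illustration of how the SpecAugment fields in these configs could be
# applied to a (time, freq) log-mel matrix. NOT neural_sp's implementation; a minimal
# sketch assuming the usual SpecAugment meaning of each field.
import numpy as np

def spec_augment(x, freq_width=27, n_freq_masks=2, time_width=100,
                 n_time_masks=2, time_width_upper=1.0, seed=None):
    """x: np.ndarray of shape (n_frames, n_mels). Returns a masked copy."""
    rng = np.random.default_rng(seed)
    x = x.copy()
    n_frames, n_mels = x.shape
    # Frequency masks: each zeroes out up to `freq_width` consecutive mel bins.
    for _ in range(n_freq_masks):
        f = rng.integers(0, freq_width + 1)
        f0 = rng.integers(0, max(1, n_mels - f))
        x[:, f0:f0 + f] = 0.0
    # Time masks: width capped by both `time_width` frames and (assumed meaning)
    # a fraction `time_width_upper` of the utterance length.
    max_t = min(time_width, int(n_frames * time_width_upper))
    for _ in range(n_time_masks):
        t = rng.integers(0, max_t + 1)
        t0 = rng.integers(0, max(1, n_frames - t))
        x[t0:t0 + t, :] = 0.0
    return x

# Example: mask a random 800-frame, 80-bin log-mel matrix with the F27/T100 setting.
# feats = spec_augment(np.random.randn(800, 80), seed=0)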
/examples/librispeech/s5/conf/data/spec_augment_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 4000 # 2000->4000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 4000 # 2000->4000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 # 25->50 3 | print_step: 6000 # 1000->6000 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/data/spec_augment_speed_perturb_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 6000 # 1000->6000 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.85 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 100 17 | print_step: 2000 18 | lr: 1e-3 19 | lr_decay_start_epoch: 5 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.0 30 | dropout_hidden: 0.0 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.0 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/librispeech/s5/conf/lm/rnnlm_6L.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | 
n_projs: 0 5 | n_layers: 6 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 100 17 | print_step: 2000 18 | lr: 1e-3 19 | lr_decay_start_epoch: 5 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.0 30 | dropout_hidden: 0.0 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.0 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/librispeech/s5/local/format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test time language model(G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh 10 | 11 | if [ $# -ne 1 ]; then 12 | echo "Usage: $0 " 13 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 14 | echo ", where:" 15 | echo " is the directory in which the language model is stored/downloaded" 16 | exit 1 17 | fi 18 | 19 | lm_dir=$1 20 | 21 | lexicon=$DATA/local/lang_tmp/lexiconp.txt 22 | 23 | # This loop was taken verbatim from wsj_format_data.sh, and I'm leaving it in place in 24 | # case we decide to add more language models at some point 25 | for lm_suffix in tgpr; do 26 | test=$DATA/lang_test_${lm_suffix} 27 | mkdir -p $test 28 | for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones topo oov.txt oov.int; do 29 | cp -r $DATA/lang/$f $test 30 | done 31 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 32 | arpa2fst --disambig-symbol=#0 \ 33 | --read-symbol-table=$test/words.txt - $test/G.fst 34 | 35 | utils/validate_lang.pl $test || exit 1; 36 | done 37 | 38 | echo "Succeeded in formatting data." 39 | 40 | exit 0 41 | -------------------------------------------------------------------------------- /examples/librispeech/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/librispeech/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/librispeech/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/swbd/README.txt: -------------------------------------------------------------------------------- 1 | About the Switchboard corpus 2 | 3 | This is conversational telephone speech collected as 2-channel, 8kHz-sampled 4 | data. We are using just the Switchboard-1 Phase 1 training data. 5 | The catalog number LDC97S62 (Switchboard-1 Release 2) corresponds, we believe, 6 | to what we have. We also use the Mississippi State transcriptions, which 7 | we download separately from 8 | http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz 9 | 10 | We are using the eval2000 a.k.a. hub5'00 evaluation data. The acoustics are 11 | LDC2002S09 and the text is LDC2002T43. 12 | 13 | We are also using the RT'03 test set, available as LDC2007S10. Note: not 14 | all parts of the recipe test with this. 15 | 16 | About the Fisher corpus for language modeling 17 | 18 | We use Fisher English training speech transcripts for language modeling, if 19 | they are available. The catalog number for part 1 transcripts is LDC2004T19, 20 | and LDC2005T19 for part 2. 21 | 22 | Each subdirectory of this directory contains the 23 | scripts for a sequence of experiments. 24 | 25 | s5: This is slightly out of date, please see s5c 26 | 27 | s5b: This is (somewhat less) out of date, please see s5c 28 | 29 | s5c: This is the current recipe. 30 | -------------------------------------------------------------------------------- /examples/swbd/s5c/RESULTS: -------------------------------------------------------------------------------- 1 | # swbd 300h 2 | | no LM | RNNLM | 3 | | SWBD | CH | SWBD | CH | 4 | BPE10k attention | 11.8 | 23.1 | 10.9 | 22.6 | 5 | BPE10k attention + SpecAugment | 9.4 | 19.1 | 9.1 | 18.8 | 6 | 7 | 8 | # swbd+fisher 2000h 9 | | no LM | RNNLM | 10 | | SWBD | CH | SWBD | CH | 11 | BPE34k attention | 7.8 | 13.8 | N/A | N/A | 12 | -------------------------------------------------------------------------------- /examples/swbd/s5c/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. 
Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/asr/blstm_las_fisher_swbd.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_2_2_2_1" 11 | enc_type: blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 6 16 | subsample_type: drop 17 | attn_type: location 18 | attn_conv_n_channels: 10 19 | attn_conv_width: 201 20 | attn_dim: 512 21 | attn_n_heads: 1 22 | dec_type: lstm 23 | dec_n_units: 1024 24 | dec_n_projs: 0 25 | dec_n_layers: 1 26 | dec_bottleneck_dim: 1024 ### this is effective 27 | emb_dim: 512 28 | tie_embedding: false 29 | ctc_fc_list: "512" 30 | ### optimization 31 | batch_size: 50 32 | optimizer: adam 33 | n_epochs: 25 34 | convert_to_sgd_epoch: 100 35 | print_step: 1000 36 | metric: edit_distance 37 | lr: 5e-4 38 | lr_decay_type: always 39 | lr_decay_start_epoch: 10 40 | lr_decay_rate: 0.85 41 | lr_decay_patient_n_epochs: 0 42 | early_stop_patient_n_epochs: 5 43 | sort_stop_epoch: 100 44 | eval_start_epoch: 1 45 | warmup_start_lr: 1e-4 46 | warmup_n_steps: 4000 47 | ### initialization 48 | param_init: 0.1 49 | ### regularization 50 | clip_grad_norm: 5.0 51 | dropout_in: 0.0 52 | dropout_enc: 0.4 53 | dropout_dec: 0.4 54 | dropout_emb: 0.4 55 | dropout_att: 0.0 56 | weight_decay: 1e-6 57 | ss_prob: 0.2 58 | lsm_prob: 0.1 59 | ### MTL 60 | ctc_weight: 0.0 61 | ctc_lsm_prob: 0.1 62 | mtl_per_batch: false 63 | task_specific_layer: false 64 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | 
convert_to_sgd_epoch: 100 30 | print_step: 1200 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 70 57 | n_time_masks: 2 58 | time_width_upper: 0.2 59 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/asr/transformer_fisher_swbd.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 512 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 8 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 512 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 8 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 50 29 | convert_to_sgd_epoch: 100 30 | print_step: 6000 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 8 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 70 57 | n_time_masks: 2 58 | time_width_upper: 0.2 59 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 # 25->50 3 | print_step: 400 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 70 11 | n_time_masks: 2 12 | time_width_upper: 0.2 13 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 70 11 | n_time_masks: 2 12 | time_width_upper: 0.2 13 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/speed_perturb.yaml: 
-------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 3 | print_step: 600 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/data/speed_perturb_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 15 3 | print_step: 1200 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=8000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 200 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.95 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 10 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.5 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | # contextualization 36 | serialize: false ### 37 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/lm/transformer_xl.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer_xl 3 | n_layers: 12 4 | transformer_d_model: 512 5 | transformer_d_ff: 2048 6 | transformer_n_heads: 8 7 | tie_embedding: true 8 | # optimization 9 | batch_size: 24 10 | bptt: 200 11 | mem_len: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 400 16 | lr_factor: 1.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 4 ### 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | # contextualization 32 | serialize: false ### 33 | -------------------------------------------------------------------------------- /examples/swbd/s5c/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 12 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 40 14 | convert_to_sgd_epoch: 100 15 | print_step: 400 16 | lr_factor: 10.0 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 
| warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | dropout_layer: 0.0 28 | weight_decay: 1e-6 29 | lsm_prob: 0.1 ### 30 | adaptive_softmax: false 31 | # contextualization 32 | serialize: false ### 33 | -------------------------------------------------------------------------------- /examples/swbd/s5c/local/MSU_single_letter.txt: -------------------------------------------------------------------------------- 1 | A ey 2 | B b iy 3 | C s iy 4 | D d iy 5 | E iy 6 | F eh f 7 | G jh iy 8 | H ey ch 9 | I ay 10 | J jh ey 11 | K k ey 12 | L eh l 13 | M eh m 14 | N eh n 15 | O ow 16 | P p iy 17 | Q k y uw 18 | R aa r 19 | S eh s 20 | T t iy 21 | U y uw 22 | V v iy 23 | W d ah b ax l y uw 24 | X eh k s 25 | Y w ay 26 | Z z iy 27 | -------------------------------------------------------------------------------- /examples/swbd/s5c/local/swbd1_data_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Switchboard-1 training data preparation customized for Edinburgh 4 | # Author: Arnab Ghoshal (Jan 2013) 5 | 6 | # To be run from one directory above this script. 7 | 8 | ## The input is some directory containing the switchboard-1 release 2 9 | ## corpus (LDC97S62). Note: we don't make many assumptions about how 10 | ## you unpacked this. We are just doing a "find" command to locate 11 | ## the .sph files. 12 | 13 | . ./path.sh 14 | 15 | #check existing directories 16 | if [ $# != 1 ]; then 17 | echo "Usage: swbd1_data_download.sh /path/to/SWBD" 18 | exit 1; 19 | fi 20 | 21 | SWBD_DIR=$1 22 | 23 | dir=${data}/local/train_swbd 24 | mkdir -p $dir 25 | 26 | # Audio data directory check 27 | if [ ! -d $SWBD_DIR ]; then 28 | echo "Error: run.sh requires a directory argument" 29 | exit 1; 30 | fi 31 | 32 | # Trans directory check 33 | if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then 34 | ( 35 | cd $dir; 36 | if [ ! -d swb_ms98_transcriptions ]; then 37 | echo " *** Downloading trascriptions and dictionary ***" 38 | wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || 39 | wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz 40 | tar -xf switchboard_word_alignments.tar.gz 41 | fi 42 | ) 43 | else 44 | echo "Directory with transcriptions exists, skipping downloading" 45 | [ -f $dir/swb_ms98_transcriptions ] \ 46 | || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ 47 | fi 48 | -------------------------------------------------------------------------------- /examples/swbd/s5c/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/swbd/s5c/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/swbd/s5c/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/las/lstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | attn_type: location 17 | attn_conv_n_channels: 10 18 | attn_conv_width: 201 19 | attn_dim: 512 20 | attn_n_heads: 1 21 | dec_type: lstm 22 | dec_n_units: 1024 23 | dec_n_projs: 0 24 | dec_n_layers: 1 25 | dec_bottleneck_dim: 1024 ### this is effective 26 | emb_dim: 512 27 | tie_embedding: false 28 | ctc_fc_list: "512" 29 | ### optimization 30 | batch_size: 30 31 | optimizer: adam 32 | n_epochs: 35 # 20->35 33 | convert_to_sgd_epoch: 100 34 | print_step: 600 # 200->600 35 | metric: edit_distance 36 | lr: 1e-3 37 | lr_decay_type: always 38 | lr_decay_start_epoch: 10 39 | lr_decay_rate: 0.85 ### 0.8->0.85 40 | lr_decay_patient_n_epochs: 0 41 | early_stop_patient_n_epochs: 5 42 | sort_stop_epoch: 100 43 | eval_start_epoch: 1 44 | warmup_start_lr: 1e-4 45 | warmup_n_steps: 4000 46 | ### initialization 47 | param_init: 0.1 48 | ### regularization 49 | clip_grad_norm: 5.0 50 | dropout_in: 0.0 51 | dropout_enc: 0.4 52 | dropout_dec: 0.4 53 | dropout_emb: 0.4 54 | dropout_att: 0.0 55 | weight_decay: 1e-6 56 | ss_prob: 0.2 57 | lsm_prob: 0.1 58 | ### MTL 59 | ctc_weight: 0.3 60 | ctc_lsm_prob: 0.1 61 | mtl_per_batch: false 62 | task_specific_layer: false 63 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/lstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | conv_batch_norm: false 11 | subsample: "1_1_1_1_1" 12 | enc_type: conv_lstm 13 | enc_n_units: 1024 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | attn_type: location 18 | attn_conv_n_channels: 10 19 | attn_conv_width: 201 20 | attn_dim: 512 21 | attn_n_heads: 1 22 | dec_type: lstm 23 | dec_n_units: 1024 24 | dec_n_projs: 0 25 | dec_n_layers: 1 26 | dec_bottleneck_dim: 1024 ### this is effective 27 | emb_dim: 512 28 | tie_embedding: false 29 | ctc_fc_list: "512" 30 | ### optimization 31 | batch_size: 30 32 | optimizer: adam 33 | n_epochs: 35 # 20->35 34 | convert_to_sgd_epoch: 100 35 | print_step: 600 # 200->600 36 | metric: edit_distance 37 | lr: 1e-3 38 | lr_decay_type: always 39 | lr_decay_start_epoch: 10 40 | lr_decay_rate: 0.85 ### 41 | lr_decay_patient_n_epochs: 0 42 | early_stop_patient_n_epochs: 5 43 | sort_stop_epoch: 100 44 | eval_start_epoch: 1 45 | warmup_start_lr: 1e-4 46 | warmup_n_steps: 4000 47 | ### initialization 48 | param_init: 0.1 49 | ### 
regularization 50 | clip_grad_norm: 5.0 51 | dropout_in: 0.0 52 | dropout_enc: 0.4 53 | dropout_dec: 0.4 54 | dropout_emb: 0.4 55 | dropout_att: 0.0 56 | weight_decay: 1e-6 57 | ss_prob: 0.2 58 | lsm_prob: 0.1 59 | ### MTL 60 | ctc_weight: 0.3 61 | ctc_lsm_prob: 0.1 62 | mtl_per_batch: false 63 | task_specific_layer: false 64 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/blstm_rnnt_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: -1 ### offline 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 15 29 | optimizer: adam 30 | n_epochs: 20 # 25->20 31 | convert_to_sgd_epoch: 100 32 | print_step: 600 # 200->600 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.8 ### 0.85->0.8 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/lcblstm_rnnt_40_20_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 20 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 20 # 25->20 31 | convert_to_sgd_epoch: 100 32 | print_step: 600 # 200->600 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.8 ### 0.85->0.8 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | 
weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/lcblstm_rnnt_40_40_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_blstm 12 | bidirectional_sum_fwd_bwd: true 13 | enc_n_units: 512 14 | enc_n_projs: 0 15 | enc_n_layers: 5 16 | subsample_type: drop 17 | lc_chunk_size_left: 40 18 | lc_chunk_size_right: 40 19 | dec_type: lstm_transducer 20 | dec_n_units: 1024 21 | dec_n_projs: 0 22 | dec_n_layers: 2 23 | dec_bottleneck_dim: 512 24 | emb_dim: 512 25 | tie_embedding: false 26 | ctc_fc_list: "512" 27 | ### optimization 28 | batch_size: 10 29 | optimizer: adam 30 | n_epochs: 20 # 25->20 31 | convert_to_sgd_epoch: 100 32 | print_step: 600 # 200->600 33 | metric: edit_distance 34 | lr: 1e-3 35 | lr_decay_type: always 36 | lr_decay_start_epoch: 10 37 | lr_decay_rate: 0.8 ### 0.85->0.8 38 | lr_decay_patient_n_epochs: 0 39 | early_stop_patient_n_epochs: 5 40 | sort_stop_epoch: 100 41 | eval_start_epoch: 1 42 | warmup_start_lr: 1e-4 43 | warmup_n_steps: 4000 44 | ### initialization 45 | param_init: 0.1 46 | ### regularization 47 | clip_grad_norm: 5.0 48 | dropout_in: 0.0 49 | dropout_enc: 0.4 50 | dropout_dec: 0.4 51 | dropout_emb: 0.4 52 | weight_decay: 1e-6 53 | ### MTL 54 | ctc_weight: 0.3 55 | ctc_lsm_prob: 0.1 56 | mtl_per_batch: false 57 | task_specific_layer: false 58 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/asr/transducer/lstm_rnnt_bpe1k.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | subsample: "1_1_1_1_1" 11 | enc_type: conv_lstm 12 | enc_n_units: 1024 13 | enc_n_projs: 0 14 | enc_n_layers: 5 15 | subsample_type: drop 16 | dec_type: lstm_transducer 17 | dec_n_units: 1024 18 | dec_n_projs: 0 19 | dec_n_layers: 2 20 | dec_bottleneck_dim: 512 21 | emb_dim: 512 22 | tie_embedding: false 23 | ctc_fc_list: "512" 24 | ### optimization 25 | batch_size: 15 26 | optimizer: adam 27 | n_epochs: 35 # 20->35 28 | convert_to_sgd_epoch: 100 29 | print_step: 600 # 200->600 30 | metric: edit_distance 31 | lr: 1e-3 32 | lr_decay_type: always 33 | lr_decay_start_epoch: 10 34 | lr_decay_rate: 0.85 ### 0.8->0.85 35 | lr_decay_patient_n_epochs: 0 36 | early_stop_patient_n_epochs: 5 37 | sort_stop_epoch: 100 38 | eval_start_epoch: 1 39 | warmup_start_lr: 1e-4 40 | warmup_n_steps: 4000 41 | ### initialization 42 | param_init: 0.1 43 | ### regularization 44 | clip_grad_norm: 5.0 45 | dropout_in: 0.0 46 | dropout_enc: 0.4 47 | dropout_dec: 0.4 48 | dropout_emb: 0.4 49 | weight_decay: 1e-6 50 | ### MTL 51 | ctc_weight: 0.3 52 | ctc_lsm_prob: 0.1 53 | mtl_per_batch: false 54 | task_specific_layer: false 55 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/pretrain.yaml: 
-------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 15 # 20->15 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb_pretrain_F13_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 13 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb_pretrain_F27_T100.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/data/spec_augment_speed_perturb_pretrain_F27_T50.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 30 # 15->30 3 | print_step: 2400 # 1200->2400 4 | lr_decay_start_epoch: 5 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 50 11 | n_time_masks: 2 12 | time_width_upper: 1.0 13 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 60 16 | convert_to_sgd_epoch: 100 17 | print_step: 400 18 | lr: 1e-3 19 | lr_decay_start_epoch: 5 20 | lr_decay_rate: 0.95 ### 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.2 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | 
lsm_prob: 0.0 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Johns Hopkins University (author: Daniel Povey) 6 | # Apache 2.0 7 | 8 | mkdir -p ${db} 9 | 10 | cd ${db} ### Note: the rest of this script is executed from the directory '${db}'. 11 | 12 | # TED-LIUM database: 13 | if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then 14 | if [ ! -e TEDLIUM_release2 ]; then 15 | ln -sf /export/corpora5/TEDLIUM_release2 16 | fi 17 | echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release2" 18 | else 19 | if [ ! -e TEDLIUM_release2 ]; then 20 | echo "$0: downloading TEDLIUM_release2 data (it won't re-download if it was already downloaded.)" 21 | # the following command won't re-get it if it's already there 22 | # because of the --continue switch. 23 | wget --continue http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz || exit 1 24 | tar xf "TEDLIUM_release2.tar.gz" 25 | else 26 | echo "$0: not downloading or un-tarring TEDLIUM_release2 because it already exists." 27 | fi 28 | fi 29 | 30 | 31 | num_sph=$(find TEDLIUM_release2/ -name '*.sph' | wc -l) 32 | if [ "$num_sph" != 1514 ]; then 33 | echo "$0: expected to find 1514 .sph files in the directory ${db}/TEDLIUM_release2, found $num_sph" 34 | exit 1 35 | fi 36 | 37 | exit 0 38 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # Apache 2.0 5 | 6 | if [ -f path.sh ]; then . ./path.sh; fi 7 | 8 | 9 | small_arpa_lm=${data}/local/local_lm/data/arpa/4gram_small.arpa.gz 10 | big_arpa_lm=${data}/local/local_lm/data/arpa/4gram_big.arpa.gz 11 | 12 | for f in $small_arpa_lm $big_arpa_lm ${data}/lang_nosp/words.txt; do 13 | [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 14 | done 15 | 16 | 17 | set -e 18 | 19 | if [ -f ${data}/lang_nosp/G.fst ] && [ ${data}/lang_nosp/G.fst -nt $small_arpa_lm ]; then 20 | echo "$0: not regenerating ${data}/lang_nosp/G.fst as it already exists and " 21 | echo ".. is newer than the source LM." 22 | else 23 | arpa2fst --disambig-symbol=#0 --read-symbol-table=${data}/lang_nosp/words.txt \ 24 | "gunzip -c $small_arpa_lm|" ${data}/lang_nosp/G.fst 25 | echo "$0: Checking how stochastic G is (the first of these numbers should be small):" 26 | fstisstochastic ${data}/lang_nosp/G.fst || true 27 | utils/validate_lang.pl --skip-determinization-check ${data}/lang_nosp 28 | fi 29 | 30 | 31 | 32 | if [ -f ${data}/lang_nosp_rescore/G.carpa ] && [ ${data}/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ 33 | [ ${data}/lang_nosp_rescore/G.carpa -nt ${data}/lang_nosp/words.txt ]; then 34 | echo "$0: not regenerating ${data}/lang_nosp_rescore/ as it seems to already be up to date."
35 | else 36 | utils/build_const_arpa_lm.sh $big_arpa_lm ${data}/lang_nosp ${data}/lang_nosp_rescore || exit 1; 37 | fi 38 | 39 | exit 0; 40 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/join_suffix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2016 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | 8 | from __future__ import print_function 9 | import sys 10 | from codecs import open 11 | 12 | # This script joins together pairs of split-up words like "you 're" -> "you're". 13 | # The TEDLIUM transcripts are normalized in a way that's not traditional for 14 | # speech recognition. 15 | 16 | for line in sys.stdin: 17 | items = line.split() 18 | new_items = [] 19 | i = 1 20 | while i < len(items): 21 | if i < len(items) - 1 and items[i+1][0] == '\'': 22 | new_items.append(items[i] + items[i+1]) 23 | i = i + 1 24 | else: 25 | new_items.append(items[i]) 26 | i = i + 1 27 | print(items[0] + ' ' + ' '.join(new_items)) 28 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Daniel Galvez 6 | # 2016 Vincent Nguyen 7 | # Apache 2.0 8 | # 9 | 10 | dir=${data}/local/dict_nosp 11 | mkdir -p $dir 12 | 13 | srcdict=${db}/TEDLIUM_release2/TEDLIUM.152k.dic 14 | 15 | [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 16 | 17 | # Join dicts and fix some troubles 18 | cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \ 19 | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt 20 | 21 | cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ 22 | grep -v SIL | sort > $dir/nonsilence_phones.txt 23 | 24 | ( echo SIL; echo NSN ) > $dir/silence_phones.txt 25 | 26 | echo SIL > $dir/optional_silence.txt 27 | 28 | # No "extra questions" in the input to this setup, as we don't 29 | # have stress or tone. 30 | echo -n >$dir/extra_questions.txt 31 | 32 | # Add to the lexicon the silences, noises etc. 33 | # Typically, you would use "<UNK> NSN" here, but the Cantab Research language models 34 | # use <unk> instead of <UNK> to represent out of vocabulary words. 35 | echo '<unk> NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt 36 | 37 | # Check that the dict dir is okay! 38 | utils/validate_dict_dir.pl $dir || exit 1 39 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/local/ted_download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2018 David Snyder 4 | # Apache 2.0 5 | # 6 | # This script downloads pre-built language models trained on the Cantab-Tedlium 7 | # text data and Tedlium acoustic training data. If you want to build these 8 | # models yourself, run the script local/ted_train_lm.sh.
9 | 10 | set -e 11 | 12 | echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" 13 | wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P ${data}/local/local_lm/data/arpa || exit 1 14 | wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P ${data}/local/local_lm/data/arpa || exit 1 15 | 16 | exit 0 17 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$KALDI_ROOT/tools/sph2pipe_v2.5/:$TOOL/sentencepiece/build/src:$PATH 9 | export PATH=$TOOL/mwerSegmenter/:$TOOL/moses/scripts/tokenizer/:$TOOL/moses/scripts/generic/:$PATH 10 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 11 | . $KALDI_ROOT/tools/config/common_path.sh 12 | export LC_ALL=C 13 | 14 | ### Python 15 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 16 | export PYTHONDONTWRITEBYTECODE=1 17 | export OMP_NUM_THREADS=1 18 | 19 | ### CUDA 20 | CUDAROOT=/usr/local/cuda 21 | NCCL_ROOT=/usr/local/nccl 22 | export CPATH=$NCCL_ROOT/include:$CPATH 23 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 24 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 25 | export CUDA_HOME=$CUDAROOT 26 | export CUDA_PATH=$CUDAROOT 27 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/tedlium/s5_r2/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 2 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 40 16 | convert_to_sgd_epoch: 40 17 | print_step: 400 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.2 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.0 34 | backward: false 35 | adaptive_softmax: false 36 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 3 | convert_to_sgd_epoch: 50 4 | print_step: 400 5 | lr_decay_start_epoch: 15 6 | lr_decay_rate: 0.9 7 | 8 | # mask 9 | freq_width: 27 10 | n_freq_masks: 2 11 | time_width: 100 12 | n_time_masks: 2 13 | time_width_upper: 1.0 14 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 3 | convert_to_sgd_epoch: 40 4 | print_step: 1200 5 | lr_decay_start_epoch: 7 6 | lr_decay_rate: 0.875 7 | 8 | # mask 9 | freq_width: 27 10 | n_freq_masks: 2 11 | time_width: 100 12 | n_time_masks: 2 13 | time_width_upper: 1.0 14 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/conf/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 3 | convert_to_sgd_epoch: 15 4 | print_step: 600 5 | lr_decay_start_epoch: 5 6 | lr_decay_rate: 0.8 7 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Nickolay V. 
Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Johns Hopkins University (author: Daniel Povey) 6 | # Apache 2.0 7 | 8 | mkdir -p ${db} 9 | 10 | cd ${db} ### Note: the rest of this script is executed from the directory '${db}'. 11 | 12 | # TED-LIUM database: 13 | if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then 14 | if [ ! -e TEDLIUM_release-3 ]; then 15 | ln -sf /export/corpora5/TEDLIUM_release-3 16 | fi 17 | echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3" 18 | else 19 | if [ ! -e TEDLIUM_release-3 ]; then 20 | echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)" 21 | # the following command won't re-get it if it's already there 22 | # because of the --continue switch. 23 | wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1 24 | 25 | echo "$0: extracting TEDLIUM_release-3 data" 26 | tar xf "TEDLIUM_release-3.tgz" 27 | else 28 | echo "$0: not downloading or un-tarring TEDLIUM_release-3 because it already exists." 29 | fi 30 | fi 31 | 32 | 33 | num_sph=$(find TEDLIUM_release-3/data -name '*.sph' | wc -l) 34 | if [ "$num_sph" != 2351 ]; then 35 | echo "$0: expected to find 2351 .sph files in the directory ${db}/TEDLIUM_release-3, found $num_sph" 36 | exit 1 37 | fi 38 | 39 | exit 0 40 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # Apache 2.0 5 | 6 | if [ -f path.sh ]; then . path.sh; fi 7 | 8 | 9 | small_arpa_lm=${data}/local/local_lm/${data}/arpa/4gram_small.arpa.gz 10 | big_arpa_lm=${data}/local/local_lm/${data}/arpa/4gram_big.arpa.gz 11 | 12 | for f in $small_arpa_lm $big_arpa_lm ${data}/lang_nosp/words.txt; do 13 | [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 14 | done 15 | 16 | 17 | set -e 18 | 19 | if [ -f ${data}/lang_nosp/G.fst ] && [ ${data}/lang_nosp/G.fst -nt $small_arpa_lm ]; then 20 | echo "$0: not regenerating ${data}/lang_nosp/G.fst as it already exists and " 21 | echo ".. is newer than the source LM." 22 | else 23 | arpa2fst --disambig-symbol=#0 --read-symbol-table=${data}/lang_nosp/words.txt \ 24 | "gunzip -c $small_arpa_lm|" ${data}/lang_nosp/G.fst 25 | echo "$0: Checking how stochastic G is (the first of these numbers should be small):" 26 | fstisstochastic ${data}/lang_nosp/G.fst || true 27 | utils/validate_lang.pl --skip-determinization-check ${data}/lang_nosp 28 | fi 29 | 30 | 31 | 32 | if [ -f ${data}/lang_nosp_rescore/G.carpa ] && [ ${data}/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ 33 | [ ${data}/lang_nosp_rescore/G.carpa -nt ${data}/lang_nosp/words.txt ]; then 34 | echo "$0: not regenerating ${data}/lang_nosp_rescore/ as it seems to already be up to date." 35 | else 36 | utils/build_const_arpa_lm.sh $big_arpa_lm ${data}/lang_nosp ${data}/lang_nosp_rescore || exit 1; 37 | fi 38 | 39 | exit 0; 40 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/join_suffix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2014 Nickolay V.
Shmyrev 4 | # 2016 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | 8 | from __future__ import print_function 9 | import sys 10 | from codecs import open 11 | 12 | # This script joins together pairs of split-up words like "you 're" -> "you're". 13 | # The TEDLIUM transcripts are normalized in a way that's not traditional for 14 | # speech recognition. 15 | 16 | for line in sys.stdin: 17 | items = line.split() 18 | new_items = [] 19 | i = 1 20 | while i < len(items): 21 | if i < len(items) - 1 and items[i+1][0] == '\'': 22 | new_items.append(items[i] + items[i+1]) 23 | i = i + 1 24 | else: 25 | new_items.append(items[i]) 26 | i = i + 1 27 | print(items[0] + ' ' + ' '.join(new_items)) 28 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2014 Nickolay V. Shmyrev 4 | # 2014 Brno University of Technology (Author: Karel Vesely) 5 | # 2016 Daniel Galvez 6 | # 2016 Vincent Nguyen 7 | # Apache 2.0 8 | # 9 | 10 | dir=${data}/local/dict_nosp 11 | mkdir -p $dir 12 | 13 | srcdict=${db}/TEDLIUM_release-3/TEDLIUM.152k.dic 14 | 15 | [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 16 | 17 | # Join dicts and fix some troubles 18 | cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \ 19 | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt 20 | 21 | cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ 22 | grep -v SIL | sort > $dir/nonsilence_phones.txt 23 | 24 | ( echo SIL; echo NSN ) > $dir/silence_phones.txt 25 | 26 | echo SIL > $dir/optional_silence.txt 27 | 28 | # No "extra questions" in the input to this setup, as we don't 29 | # have stress or tone. 30 | echo -n >$dir/extra_questions.txt 31 | 32 | # Add to the lexicon the silences, noises etc. 33 | # Typically, you would use "<UNK> NSN" here, but the Cantab Research language models 34 | # use <unk> instead of <UNK> to represent out of vocabulary words. 35 | echo '<unk> NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt 36 | 37 | # Check that the dict dir is okay! 38 | utils/validate_dict_dir.pl $dir || exit 1 39 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/local/ted_download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2018 David Snyder 4 | # Apache 2.0 5 | # 6 | # This script downloads pre-built language models trained on the Cantab-Tedlium 7 | # text data and Tedlium acoustic training data. If you want to build these 8 | # models yourself, run the script local/ted_train_lm.sh. 9 | 10 | set -e 11 | 12 | echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" 13 | wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P ${data}/local/local_lm/${data}/arpa || exit 1 14 | wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P ${data}/local/local_lm/${data}/arpa || exit 1 15 | 16 | exit 0 17 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../..
2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . $KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/tedlium/s5_r3/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/timit/s5/RESULTS.md: -------------------------------------------------------------------------------- 1 | # Use caution when comparing these results with other published results. 2 | Training Set : 3696 sentences 3 | Dev Set : 400 sentences 4 | Test Set : 192 sentences Core Test Set (different from Full 1680 sent. set) 5 | Language Model : no 6 | Phone mapping : Training with 61 phonemes, for testing mapped to 39 phonemes 7 | 8 | 9 | ### BLSTM-CTC 10 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 11 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 12 | |dev|Sum/Avg|400|15334|80.7|15.0|4.3|2.3|21.7|99.3| 13 | |test|Sum/Avg|192|7333|79.6|15.4|5.0|2.4|22.8|99.5| 14 | 15 | ### Transformer + SpecAugment 16 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 17 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 18 | |dev|Sum/Avg|400|15334|81.3|15.4|3.3|2.9|**21.7**|99.8| 19 | |test|Sum/Avg|192|7333|80.2|15.9|4.0|3.3|**23.1**|100.0| 20 | 21 | ### Transformer + SpecAugment + relative positional encoding (encoder) 22 | | Eval Set | # Snt | # Wrd | Corr | Sub | Del | Ins | Err | S.Err | 23 | | -------- | ----- | ----- | ---- | --- | --- | --- | --- | ----- | 24 | |dev|Sum/Avg|400|15334|82.3|14.6|3.0|2.7|**20.4**|99.5| 25 | |test|Sum/Avg|192|7333|81.7|15.0|3.3|3.1|**21.4**|98.4| 26 | -------------------------------------------------------------------------------- /examples/timit/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. 
Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/blstm_ctc.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | input_noise_std: 0 6 | conv_in_channel: 3 7 | conv_channels: "32_32" 8 | conv_kernel_sizes: "(3,3)_(3,3)" 9 | conv_strides: "(1,1)_(1,1)" 10 | conv_poolings: "(1,1)_(1,1)" 11 | subsample: "1_1_1_1_1" 12 | enc_type: blstm 13 | bidirectional_sum_fwd_bwd: false 14 | enc_n_units: 256 15 | enc_n_projs: 0 16 | enc_n_layers: 5 17 | subsample_type: drop 18 | ctc_fc_list: "" 19 | ### optimization 20 | batch_size: 32 21 | optimizer: adam 22 | n_epochs: 100 23 | convert_to_sgd_epoch: 90 24 | print_step: 20 25 | metric: edit_distance 26 | lr: 1e-3 27 | lr_decay_type: always 28 | lr_decay_start_epoch: 20 29 | lr_decay_rate: 0.97 30 | lr_decay_patient_n_epochs: 0 31 | early_stop_patient_n_epochs: 20 32 | sort_stop_epoch: 100 33 | eval_start_epoch: 1 34 | warmup_start_lr: 1e-4 35 | warmup_n_steps: 0 36 | ### initialization 37 | param_init: 0.1 38 | ### regularization 39 | clip_grad_norm: 5.0 40 | dropout_in: 0.2 41 | dropout_enc: 0.5 42 | weight_decay: 1e-6 43 | ### MTL 44 | ctc_weight: 1.0 45 | ctc_lsm_prob: 0.0 46 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/blstm_las.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | input_noise_std: 0 6 | conv_in_channel: 3 7 | conv_channels: "32_32" 8 | conv_kernel_sizes: "(3,3)_(3,3)" 9 | conv_strides: "(1,1)_(1,1)" 10 | conv_poolings: "(1,1)_(1,1)" 11 | subsample: "1_1_1_1_1" 12 | enc_type: blstm 13 | bidirectional_sum_fwd_bwd: false 14 | enc_n_units: 256 15 | enc_n_projs: 0 16 | enc_n_layers: 5 17 | subsample_type: drop 18 | attn_type: location 19 | attn_conv_n_channels: 10 20 | attn_conv_width: 201 21 | attn_dim: 256 22 | attn_n_heads: 1 23 | dec_type: lstm 24 | dec_n_units: 256 25 | dec_n_projs: 0 26 | dec_n_layers: 1 27 | dec_bottleneck_dim: 256 28 | emb_dim: 256 29 | tie_embedding: false 30 | ctc_fc_list: "" 31 | ### optimization 32 | batch_size: 32 33 | optimizer: adam 34 | n_epochs: 100 35 | convert_to_sgd_epoch: 90 36 | print_step: 20 37 | metric: edit_distance 38 | lr: 1e-3 39 | lr_decay_type: always 40 | lr_decay_start_epoch: 20 41 | lr_decay_rate: 0.97 42 | lr_decay_patient_n_epochs: 0 43 | early_stop_patient_n_epochs: 20 44 | sort_stop_epoch: 100 45 | eval_start_epoch: 20 46 | warmup_start_lr: 1e-4 47 | warmup_n_steps: 0 48 | ### initialization 49 | param_init: 0.1 50 | ### regularization 51 | clip_grad_norm: 5.0 52 | dropout_in: 
0.2 53 | dropout_enc: 0.5 54 | dropout_dec: 0.2 55 | dropout_emb: 0.2 56 | dropout_att: 0.0 57 | weight_decay: 1e-6 58 | ss_prob: 0.0 59 | lsm_prob: 0.0 60 | ### MTL 61 | ctc_weight: 0.0 62 | ctc_lsm_prob: 0.0 63 | mtl_per_batch: false 64 | task_specific_layer: false 65 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/dev_spk.list: -------------------------------------------------------------------------------- 1 | faks0 2 | fdac1 3 | fjem0 4 | mgwt0 5 | mjar0 6 | mmdb1 7 | mmdm2 8 | mpdf0 9 | fcmh0 10 | fkms0 11 | mbdg0 12 | mbwm0 13 | mcsh0 14 | fadg0 15 | fdms0 16 | fedw0 17 | mgjf0 18 | mglb0 19 | mrtk0 20 | mtaa0 21 | mtdt0 22 | mthc0 23 | mwjg0 24 | fnmr0 25 | frew0 26 | fsem0 27 | mbns0 28 | mmjr0 29 | mdls0 30 | mdlf0 31 | mdvc0 32 | mers0 33 | fmah0 34 | fdrw0 35 | mrcs0 36 | mrjm4 37 | fcal1 38 | mmwh0 39 | fjsj0 40 | majc0 41 | mjsw0 42 | mreb0 43 | fgjd0 44 | fjmg0 45 | mroa0 46 | mteb0 47 | mjfc0 48 | mrjr0 49 | fmml0 50 | mrws1 51 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=40 6 | --use-energy=true 7 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/phones.60-48-39.map: -------------------------------------------------------------------------------- 1 | aa aa aa 2 | ae ae ae 3 | ah ah ah 4 | ao ao aa 5 | aw aw aw 6 | ax ax ah 7 | ax-h ax ah 8 | axr er er 9 | ay ay ay 10 | b b b 11 | bcl vcl sil 12 | ch ch ch 13 | d d d 14 | dcl vcl sil 15 | dh dh dh 16 | dx dx dx 17 | eh eh eh 18 | el el l 19 | em m m 20 | en en n 21 | eng ng ng 22 | epi epi sil 23 | er er er 24 | ey ey ey 25 | f f f 26 | g g g 27 | gcl vcl sil 28 | h# sil sil 29 | hh hh hh 30 | hv hh hh 31 | ih ih ih 32 | ix ix ih 33 | iy iy iy 34 | jh jh jh 35 | k k k 36 | kcl cl sil 37 | l l l 38 | m m m 39 | n n n 40 | ng ng ng 41 | nx n n 42 | ow ow ow 43 | oy oy oy 44 | p p p 45 | pau sil sil 46 | pcl cl sil 47 | q 48 | r r r 49 | s s s 50 | sh sh sh 51 | t t t 52 | tcl cl sil 53 | th th th 54 | uh uh uh 55 | uw uw uw 56 | ux uw uw 57 | v v v 58 | w w w 59 | y y y 60 | z z z 61 | zh zh sh 62 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/rnn_transducer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | input_noise_std: 0 6 | conv_in_channel: 3 7 | conv_channels: "32_32" 8 | conv_kernel_sizes: "(3,3)_(3,3)" 9 | conv_strides: "(1,1)_(1,1)" 10 | conv_poolings: "(1,1)_(1,1)" 11 | subsample: "1_1_1_1_1" 12 | enc_type: blstm 13 | bidirectional_sum_fwd_bwd: true 14 | enc_n_units: 256 15 | enc_n_projs: 0 16 | enc_n_layers: 5 17 | subsample_type: drop 18 | dec_type: lstm_transducer 19 | dec_n_units: 256 20 | dec_n_projs: 0 21 | dec_n_layers: 1 22 | dec_bottleneck_dim: 256 23 | emb_dim: 256 24 | tie_embedding: false 25 | ctc_fc_list: "" 26 | ### optimization 27 | batch_size: 32 28 | optimizer: adam 29 | n_epochs: 100 30 | convert_to_sgd_epoch: 90 31 | print_step: 20 32 | metric: edit_distance 33 | lr: 1e-3 34 | lr_decay_type: always 35 | lr_decay_start_epoch: 20 36 | lr_decay_rate: 0.97 37 | lr_decay_patient_n_epochs: 0 38 
| early_stop_patient_n_epochs: 20 39 | sort_stop_epoch: 100 40 | eval_start_epoch: 20 41 | warmup_start_lr: 1e-4 42 | warmup_n_steps: 0 43 | ### initialization 44 | param_init: 0.1 45 | ### regularization 46 | clip_grad_norm: 5.0 47 | dropout_in: 0.2 48 | dropout_enc: 0.5 49 | dropout_dec: 0.2 50 | dropout_emb: 0.2 51 | weight_decay: 1e-6 52 | ### MTL 53 | ctc_weight: 0.0 54 | ctc_lsm_prob: 0.0 55 | mtl_per_batch: false 56 | task_specific_layer: false 57 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/test_spk.list: -------------------------------------------------------------------------------- 1 | mdab0 2 | mwbt0 3 | felc0 4 | mtas1 5 | mwew0 6 | fpas0 7 | mjmp0 8 | mlnt0 9 | fpkt0 10 | mlll0 11 | mtls0 12 | fjlm0 13 | mbpm0 14 | mklt0 15 | fnlp0 16 | mcmj0 17 | mjdh0 18 | fmgd0 19 | mgrt0 20 | mnjm0 21 | fdhc0 22 | mjln0 23 | mpam0 24 | fmld0 25 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | conv_in_channel: 3 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_attn_type: scaled_dot 19 | transformer_dec_pe_type: 1dconv3L ### this is effective 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 250 29 | convert_to_sgd_epoch: 1000 30 | print_step: 20 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 1000 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 2 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.2 42 | dropout_enc: 0.5 43 | dropout_dec: 0.5 44 | dropout_emb: 0.2 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/timit/s5/conf/transformer_relative.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 2000 5 | conv_in_channel: 3 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: relative ### 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_attn_type: scaled_dot 19 | transformer_dec_pe_type: 1dconv3L ### this is effective 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: 
false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 200 29 | convert_to_sgd_epoch: 1000 30 | print_step: 20 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 1000 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 2 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.2 42 | dropout_enc: 0.5 43 | dropout_dec: 0.5 44 | dropout_emb: 0.2 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/timit/s5/local/plot_ctc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | model= 7 | gpu= 8 | stdout=false 9 | n_threads=1 10 | 11 | ### path to save preprocessed data 12 | data=/n/work2/inaguma/corpus/timit 13 | 14 | batch_size=1 15 | 16 | . ./cmd.sh 17 | . ./path.sh 18 | . utils/parse_options.sh 19 | 20 | set -e 21 | set -u 22 | set -o pipefail 23 | 24 | if [ -z ${gpu} ]; then 25 | # CPU 26 | n_gpus=0 27 | export OMP_NUM_THREADS=${n_threads} 28 | else 29 | n_gpus=$(echo ${gpu} | tr "," "\n" | wc -l) 30 | fi 31 | 32 | for set in dev test; do 33 | recog_dir=$(dirname ${model})/plot_${set} 34 | mkdir -p ${recog_dir} 35 | 36 | CUDA_VISIBLE_DEVICES=${gpu} ${NEURALSP_ROOT}/neural_sp/bin/asr/plot_ctc.py \ 37 | --recog_n_gpus ${n_gpus} \ 38 | --recog_sets ${data}/dataset/${set}.csv \ 39 | --recog_dir ${recog_dir} \ 40 | --recog_model ${model} \ 41 | --recog_batch_size ${batch_size} \ 42 | --recog_stdout ${stdout} || exit 1; 43 | done 44 | -------------------------------------------------------------------------------- /examples/timit/s5/local/score_sclite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | .
utils/parse_options.sh 9 | 10 | if [ $# != 1 ]; then 11 | echo "Usage: $0 "; 12 | exit 1; 13 | fi 14 | 15 | decode_dir=$1 16 | phonemap="conf/phones.60-48-39.map" 17 | 18 | # Map reference to 39 phone classes: 19 | cat ${decode_dir}/ref.trn | local/timit_norm_trans.pl -i - -m ${phonemap} -from 60 -to 39 > ${decode_dir}/ref.trn.filt 20 | cat ${decode_dir}/hyp.trn | local/timit_norm_trans.pl -i - -m ${phonemap} -from 60 -to 39 > ${decode_dir}/hyp.trn.filt 21 | 22 | sed -e "s// /g" ${decode_dir}/ref.trn.filt > ${decode_dir}/ref.trn.filt.clean 23 | sed -e "s// /g" ${decode_dir}/hyp.trn.filt > ${decode_dir}/hyp.trn.filt.clean 24 | 25 | sclite -r ${decode_dir}/ref.trn trn -h ${decode_dir}/hyp.trn trn -i rm -o all stdout > ${decode_dir}/result.txt 26 | grep -e Avg -e SPKR -m 2 ${decode_dir}/result.txt 27 | -------------------------------------------------------------------------------- /examples/timit/s5/local/timit_format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013 (Author: Daniel Povey) 4 | # Apache 2.0 5 | 6 | # This script takes data prepared in a corpus-dependent way 7 | # in data/local/, and converts it into the "canonical" form, 8 | # in various subdirectories of data/, e.g. data/lang, data/train, etc. 9 | 10 | . ./path.sh || exit 1; 11 | 12 | echo "Preparing train, dev and test data" 13 | srcdir=${data}/local/data 14 | 15 | for x in train dev test; do 16 | mkdir -p ${data}/$x 17 | cp $srcdir/${x}_wav.scp ${data}/$x/wav.scp || exit 1; 18 | cp $srcdir/$x.text ${data}/$x/text || exit 1; 19 | cp $srcdir/$x.spk2utt ${data}/$x/spk2utt || exit 1; 20 | cp $srcdir/$x.utt2spk ${data}/$x/utt2spk || exit 1; 21 | utils/filter_scp.pl ${data}/$x/spk2utt $srcdir/$x.spk2gender > ${data}/$x/spk2gender || exit 1; 22 | cp $srcdir/${x}.stm ${data}/$x/stm 23 | cp $srcdir/${x}.glm ${data}/$x/glm 24 | utils/validate_data_dir.sh --no-feats ${data}/$x || exit 1 25 | 26 | cp $srcdir/${x}.spk2gender ${data}/$x/spk2gender # added 27 | done 28 | 29 | echo "Succeeded in formatting data." 30 | -------------------------------------------------------------------------------- /examples/timit/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/timit/s5/steps: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/steps -------------------------------------------------------------------------------- /examples/timit/s5/utils: -------------------------------------------------------------------------------- 1 | ../../wsj/s5/utils -------------------------------------------------------------------------------- /examples/wsj/README.txt: -------------------------------------------------------------------------------- 1 | 2 | About the Wall Street Journal corpus: 3 | This is a corpus of read 4 | sentences from the Wall Street Journal, recorded under clean conditions. 5 | The vocabulary is quite large. About 80 hours of training data. 6 | Available from the LDC as either: [ catalog numbers LDC93S6A (WSJ0) and LDC94S13A (WSJ1) ] 7 | or: [ catalog numbers LDC93S6B (WSJ0) and LDC94S13B (WSJ1) ] 8 | The latter option is cheaper and includes only the Sennheiser 9 | microphone data (which is all we use in the example scripts). 10 | 11 | Each subdirectory of this directory contains the 12 | scripts for a sequence of experiments. [note: most of the older 13 | example scripts have been deleted, but are still available at 14 | ^/branches/complete]. 15 | 16 | s5: This is the current recommended recipe. 17 | 18 | -------------------------------------------------------------------------------- /examples/wsj/s5/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
12 | 13 | # Kyoto University setup 14 | export train_cmd="run.pl --mem 2G" 15 | export cuda_cmd="run.pl --mem 2G --gpu 1" 16 | export decode_cmd="run.pl --mem 4G" 17 | 18 | # JHU setup 19 | # export train_cmd="queue.pl --mem 2G" 20 | # export cuda_cmd="queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 21 | # export decode_cmd="queue.pl --mem 4G" 22 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/asr/glu_encoder.yaml: -------------------------------------------------------------------------------- 1 | enc_type: gated_conv 2 | conv_channels: "100_100_100_125_125_150_175_200_225_250_250_250_300_300_375" 3 | conv_kernel_sizes: "(13,1)_(3,1)_(4,1)_(5,1)_(6,1)_(7,1)_(8,1)_(9,1)_(10,1)_(11,1)_(12,1)_(13,1)_(14,1)_(15,1)_(21,1)" 4 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/asr/tds_encoder.yaml: -------------------------------------------------------------------------------- 1 | enc_type: tds 2 | conv_channels: "10_10_14_14_14_18_18_18_18_18_18" 3 | conv_kernel_sizes: "(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)_(21,1)" 4 | subsample: "1_1_1_1_1" 5 | clip_grad_norm: 15.0 6 | dropout_enc: 0.2 7 | lsm_prob: 0.05 8 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/asr/transformer.yaml: -------------------------------------------------------------------------------- 1 | ### topology 2 | n_stacks: 1 3 | n_skips: 1 4 | max_n_frames: 1600 5 | conv_in_channel: 1 6 | conv_channels: "32_32" 7 | conv_kernel_sizes: "(3,3)_(3,3)" 8 | conv_strides: "(1,1)_(1,1)" 9 | conv_poolings: "(2,2)_(2,2)" 10 | enc_type: conv_transformer 11 | enc_n_layers: 12 12 | transformer_enc_pe_type: add 13 | transformer_enc_d_model: 256 14 | transformer_enc_d_ff: 2048 15 | transformer_enc_n_heads: 4 16 | dec_type: transformer 17 | dec_n_layers: 6 18 | transformer_dec_pe_type: 1dconv3L ### this is effective 19 | transformer_dec_attn_type: scaled_dot 20 | transformer_dec_d_model: 256 21 | transformer_dec_d_ff: 2048 22 | transformer_dec_n_heads: 4 23 | tie_embedding: false 24 | ctc_fc_list: "512" 25 | ### optimization 26 | batch_size: 32 27 | optimizer: noam 28 | n_epochs: 120 29 | convert_to_sgd_epoch: 100 30 | print_step: 400 31 | metric: accuracy 32 | lr_factor: 5.0 33 | early_stop_patient_n_epochs: 5 34 | shuffle_bucket: true ### this is important 35 | sort_stop_epoch: 100 36 | eval_start_epoch: 1 37 | warmup_n_steps: 25000 38 | accum_grad_n_steps: 2 39 | ### regularization 40 | clip_grad_norm: 5.0 41 | dropout_in: 0.0 42 | dropout_enc: 0.1 43 | dropout_dec: 0.1 44 | dropout_emb: 0.1 45 | dropout_att: 0.0 46 | weight_decay: 1e-6 47 | lsm_prob: 0.1 48 | ### MTL 49 | ctc_weight: 0.3 50 | ctc_lsm_prob: 0.1 51 | mtl_per_batch: false 52 | task_specific_layer: false 53 | # SpecAugment 54 | freq_width: 27 55 | n_freq_masks: 2 56 | time_width: 100 57 | n_time_masks: 2 58 | time_width_upper: 1.0 59 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/data/spec_augment.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 50 3 | print_step: 400 4 | lr_decay_start_epoch: 20 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 -------------------------------------------------------------------------------- 
/examples/wsj/s5/conf/data/spec_augment_speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 40 # 20->40 3 | print_step: 1200 # 600->1200 4 | lr_decay_start_epoch: 15 5 | lr_decay_rate: 0.9 6 | 7 | # mask 8 | freq_width: 27 9 | n_freq_masks: 2 10 | time_width: 100 11 | n_time_masks: 2 12 | time_width_upper: 1.0 -------------------------------------------------------------------------------- /examples/wsj/s5/conf/data/speed_perturb.yaml: -------------------------------------------------------------------------------- 1 | # optimization 2 | n_epochs: 20 # 25->20 3 | print_step: 600 # 200->600 4 | lr_decay_start_epoch: 10 5 | lr_decay_rate: 0.8 6 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --window-type=hamming # disable Dans window, use the standard 2 | --htk-compat=true # try to make it compatible with HTK 3 | 4 | --sample-frequency=16000 5 | --num-mel-bins=80 6 | --use-energy=false 7 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/lm/gated_convlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: gated_conv_14B 3 | emb_dim: 128 4 | tie_embedding: false 5 | # optimization 6 | batch_size: 50 7 | bptt: 200 8 | optimizer: nesterov 9 | n_epochs: 50 10 | convert_to_sgd_epoch: 50 11 | print_step: 400 12 | lr: 1.0 13 | lr_decay_start_epoch: 10 14 | lr_decay_rate: 0.5 15 | lr_decay_patient_n_epochs: 0 16 | lr_decay_type: metric 17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | # initialization 20 | param_init: 0.05 21 | # regularization 22 | clip_grad_norm: 0.1 23 | dropout_in: 0.2 24 | dropout_hidden: 0.2 25 | dropout_out: 0.0 26 | weight_decay: 1e-6 27 | lsm_prob: 0.1 28 | backward: false 29 | adaptive_softmax: false 30 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/lm/rnnlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: lstm 3 | n_units: 1024 4 | n_projs: 0 5 | n_layers: 4 ### 6 | emb_dim: 1024 7 | n_units_null_context: 0 8 | tie_embedding: true 9 | residual: true 10 | use_glu: true 11 | # optimization 12 | batch_size: 128 13 | bptt: 200 14 | optimizer: adam 15 | n_epochs: 50 16 | convert_to_sgd_epoch: 50 17 | print_step: 200 18 | lr: 1e-3 19 | lr_decay_start_epoch: 10 20 | lr_decay_rate: 0.9 21 | lr_decay_patient_n_epochs: 0 22 | lr_decay_type: always 23 | early_stop_patient_n_epochs: 5 24 | eval_start_epoch: 1 25 | # initialization 26 | param_init: 0.05 27 | # regularization 28 | clip_grad_norm: 1.0 29 | dropout_in: 0.2 30 | dropout_hidden: 0.2 31 | dropout_out: 0.0 32 | weight_decay: 1e-6 33 | lsm_prob: 0.1 ### 34 | adaptive_softmax: false 35 | -------------------------------------------------------------------------------- /examples/wsj/s5/conf/lm/transformerlm.yaml: -------------------------------------------------------------------------------- 1 | # topology 2 | lm_type: transformer 3 | n_layers: 12 4 | transformer_pe_type: add 5 | transformer_d_model: 512 6 | transformer_d_ff: 2048 7 | transformer_n_heads: 8 8 | tie_embedding: true 9 | # optimization 10 | batch_size: 32 11 | bptt: 200 12 | optimizer: noam 13 | n_epochs: 50 14 | convert_to_sgd_epoch: 50 15 | print_step: 200 16 | lr_factor: 10.0 
17 | early_stop_patient_n_epochs: 5 18 | eval_start_epoch: 1 19 | warmup_n_steps: 4000 20 | accum_grad_n_steps: 2 21 | # regularization 22 | clip_grad_norm: 1.0 23 | dropout_in: 0.1 24 | dropout_hidden: 0.3 25 | dropout_out: 0.0 26 | dropout_att: 0.1 27 | weight_decay: 1e-6 28 | lsm_prob: 0.0 29 | backward: false 30 | adaptive_softmax: false 31 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/add_counts.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | 4 | # Add counts to an oovlist. 5 | # Reads in counts as output by uniq -c, and 6 | # an oovlist, and prints out the counts of the oovlist. 7 | 8 | (@ARGV == 1 || @ARGV == 2) || die "Usage: add_counts.pl count_file [oovlist]\n"; 9 | 10 | $counts = shift @ARGV; 11 | 12 | open(C, "<$counts") || die "Opening counts file $counts"; 13 | 14 | while(<C>) { 15 | @A = split(" ", $_); 16 | @A == 2 || die "Bad line in counts file: $_"; 17 | ($count, $word) = @A; 18 | $count =~ m:^\d+$: || die "Bad count $A[0]\n"; 19 | $counts{$word} = $count; 20 | } 21 | 22 | while(<>) { 23 | chop; 24 | $w = $_; 25 | $w =~ m:\S+: || die "Bad word $w"; 26 | defined $counts{$w} || die "Word $w not present in counts file"; 27 | print "\t$counts{$w}\t$w\n"; 28 | } 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/count_rules.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # This program takes the output of score_prons.pl and collates 4 | # it for each (rule, destress) pair so that we get the 5 | # counts of right/partial/wrong for each pair. 6 | 7 | # The input is a 7-tuple on each line, like: 8 | # word;pron;base-word;base-pron;rule-name;de-stress;right|partial|wrong 9 | # 10 | # The output format is a 5-tuple like: 11 | # 12 | # rule;destress;right-count;partial-count;wrong-count 13 | # 14 | 15 | if (@ARGV != 0 && @ARGV != 1) { 16 | die "Usage: count_rules.pl < scored_candidate_prons > rule_counts"; 17 | } 18 | 19 | 20 | while(<>) { 21 | chop; 22 | $line = $_; 23 | my ($word, $pron, $baseword, $basepron, $rulename, $destress, $score) = split(";", $line); 24 | 25 | my $key = $rulename . ";" . $destress; 26 | 27 | if (!defined $counts{$key}) { 28 | $counts{$key} = [ 0, 0, 0 ]; # new anonymous array. 29 | } 30 | $ref = $counts{$key}; 31 | if ($score eq "right") { 32 | $$ref[0]++; 33 | } elsif ($score eq "partial") { 34 | $$ref[1]++; 35 | } elsif ($score eq "wrong") { 36 | $$ref[2]++; 37 | } else { 38 | die "Bad score $score\n"; 39 | } 40 | } 41 | 42 | while ( my ($key, $value) = each(%counts)) { 43 | print $key . ";" . join(";", @$value) . "\n"; 44 | } 45 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/filter_dict.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | 4 | # This program reads and writes either a dictionary or just a list 5 | # of words, and it removes any words containing ";" or "," as these 6 | # are used in these programs. It will warn about these. 7 | # It will die if the pronunciations have these symbols in.
8 | while(<>) { 9 | chop; 10 | @A = split(" ", $_); 11 | $word = shift @A; 12 | 13 | if ($word =~ m:[;,]:) { 14 | print STDERR "Omitting line $_ since it has one of the banned characters ; or ,\n" ; 15 | } else { 16 | $_ =~ m:[;,]: && die "Phones cannot have ; or , in them."; 17 | print $_ . "\n"; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/dict/reverse_dict.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Used in conjunction with get_rules.pl 4 | # example input line: XANTHE Z AE1 N DH 5 | # example output line: EHTNAX DH N AE1 Z 6 | 7 | while(<>){ 8 | @A = split(" ", $_); 9 | $word = shift @A; 10 | $word = join("", reverse(split("", $word))); # Reverse letters of word. 11 | @A = reverse(@A); # Reverse phones in pron. 12 | unshift @A, $word; 13 | print join(" ", @A) . "\n"; 14 | } 15 | -------------------------------------------------------------------------------- /examples/wsj/s5/local/flist2scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # takes in a file list with lines like 19 | # /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 20 | # and outputs an scp in kaldi format with lines like 21 | # 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 22 | # (the first thing is the utterance-id, which is the same as the basename of the file. 23 | 24 | 25 | while(<>){ 26 | m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_"; 27 | $id = $1; 28 | $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames) 29 | print "$id $_"; 30 | } 31 | -------------------------------------------------------------------------------- /examples/wsj/s5/path.sh: -------------------------------------------------------------------------------- 1 | export NEURALSP_ROOT=$PWD/../../.. 2 | export KALDI_ROOT=$NEURALSP_ROOT/tools/kaldi 3 | export TOOL=$NEURALSP_ROOT/tools/neural_sp 4 | export CONDA=$TOOL/miniconda 5 | 6 | # Kaldi 7 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 8 | export PATH=$NEURALSP_ROOT/utils:$PWD/utils/:$KALDI_ROOT/tools/sctk/bin/:$TOOL/sentencepiece/build/src:$PATH 9 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 10 | . 
$KALDI_ROOT/tools/config/common_path.sh 11 | export LC_ALL=C 12 | 13 | ### Python 14 | source $CONDA/etc/profile.d/conda.sh && conda deactivate && conda activate 15 | export PYTHONDONTWRITEBYTECODE=1 16 | export OMP_NUM_THREADS=1 17 | 18 | ### CUDA 19 | CUDAROOT=/usr/local/cuda 20 | NCCL_ROOT=/usr/local/nccl 21 | export CPATH=$NCCL_ROOT/include:$CPATH 22 | export LD_LIBRARY_PATH=$NCCL_ROOT/lib/:$CUDAROOT/lib64:$LD_LIBRARY_PATH 23 | export LIBRARY_PATH=$NCCL_ROOT/lib/:$LIBRARY_PATH 24 | export CUDA_HOME=$CUDAROOT 25 | export CUDA_PATH=$CUDAROOT 26 | -------------------------------------------------------------------------------- /examples/wsj/s5/steps: -------------------------------------------------------------------------------- 1 | ../../../tools/neural_sp/kaldi/egs/wsj/s5/steps -------------------------------------------------------------------------------- /examples/wsj/s5/utils: -------------------------------------------------------------------------------- 1 | ../../../tools/neural_sp/kaldi/egs/wsj/s5/utils -------------------------------------------------------------------------------- /neural_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/__init__.py -------------------------------------------------------------------------------- /neural_sp/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/bin/__init__.py -------------------------------------------------------------------------------- /neural_sp/bin/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/bin/asr/__init__.py -------------------------------------------------------------------------------- /neural_sp/bin/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/bin/lm/__init__.py -------------------------------------------------------------------------------- /neural_sp/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/datasets/__init__.py -------------------------------------------------------------------------------- /neural_sp/datasets/asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/datasets/asr/__init__.py -------------------------------------------------------------------------------- /neural_sp/datasets/token_converter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/datasets/token_converter/__init__.py -------------------------------------------------------------------------------- /neural_sp/evaluators/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/evaluators/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/lm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/lm/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/lm/build.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Select a language model""" 5 | 6 | 7 | def build_lm(args, save_path=None, wordlm=False, lm_dict_path=None, asr_dict_path=None): 8 | """Select LM class. 9 | 10 | Args: 11 | args (): 12 | save_path (str): 13 | wordlm (bool): 14 | lm_dict_path (dict): 15 | asr_dict_path (dict): 16 | Returns: 17 | lm (): 18 | 19 | """ 20 | if 'gated_conv' in args.lm_type: 21 | from neural_sp.models.lm.gated_convlm import GatedConvLM 22 | lm = GatedConvLM(args, save_path) 23 | elif args.lm_type == 'transformer': 24 | from neural_sp.models.lm.transformerlm import TransformerLM 25 | lm = TransformerLM(args, save_path) 26 | elif args.lm_type == 'transformer_xl': 27 | from neural_sp.models.lm.transformer_xl import TransformerXL 28 | lm = TransformerXL(args, save_path) 29 | else: 30 | from neural_sp.models.lm.rnnlm import RNNLM 31 | lm = RNNLM(args, save_path) 32 | 33 | return lm 34 | -------------------------------------------------------------------------------- /neural_sp/models/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/modules/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/modules/gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Gaussian Error Linear Units (GELU) activation.""" 5 | 6 | import math 7 | import torch 8 | 9 | 10 | # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py 11 | def gelu_accurate(x): 12 | if not hasattr(gelu_accurate, "_a"): 13 | gelu_accurate._a = math.sqrt(2 / math.pi) 14 | return 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) 15 | 16 | 17 | def gelu(x): 18 | if hasattr(torch.nn.functional, 'gelu'): 19 | return torch.nn.functional.gelu(x.float()).type_as(x) 20 | else: 21 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 22 | -------------------------------------------------------------------------------- /neural_sp/models/modules/headdrop.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """HeadDrop regularization.""" 5 | 6 | import random 7 | 8 | random.seed(1) 9 | 10 | 11 | def headdrop(aws, n_heads, dropout): 12 | """HeadDrop regularization. 13 | 14 | Args: 15 | aws (FloatTensor): `[B, H, qlen, klen]` 16 | n_heads (int): number of attention heads 17 | dropout (float): HeadDrop probability 18 | Returns: 19 | aws (FloatTensor): `[B, H, qlen, klen]` 20 | 21 | """ 22 | n_effective_heads = n_heads 23 | head_mask = aws.new_ones(aws.size()).byte() 24 | for h in range(n_heads): 25 | if random.random() < dropout: 26 | head_mask[:, h] = 0 27 | n_effective_heads -= 1 28 | aws = aws.masked_fill_(head_mask == 0, 0) 29 | # Normalization 30 | if n_effective_heads > 0: 31 | aws = aws * (n_heads / n_effective_heads) 32 | return aws 33 | -------------------------------------------------------------------------------- /neural_sp/models/modules/mocha/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/modules/mocha/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/modules/softplus.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Softplus function.""" 5 | 6 | import torch 7 | 8 | 9 | def softplus(x): 10 | if hasattr(torch.nn.functional, 'softplus'): 11 | return torch.nn.functional.softplus(x.float()).type_as(x) 12 | else: 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /neural_sp/models/modules/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Swish activation. 
5 | See details in https://arxiv.org/abs/1710.05941.""" 6 | 7 | import torch 8 | 9 | 10 | class Swish(torch.nn.Module): 11 | def forward(self, x): 12 | return x * torch.sigmoid(x) 13 | -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/__init___.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/__init___.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/decoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/decoders/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/encoders/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/frontends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/models/seq2seq/frontends/__init__.py -------------------------------------------------------------------------------- /neural_sp/models/seq2seq/frontends/input_noise.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Add Gaussian noise to input features.""" 5 | 6 | import torch 7 | 8 | 9 | def add_input_noise(xs, std): 10 | noise = torch.normal(xs.new_zeros(xs.shape[-1]), std) 11 | xs.data += noise 12 | return xs 13 | -------------------------------------------------------------------------------- /neural_sp/trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/neural_sp/trainers/__init__.py -------------------------------------------------------------------------------- /neural_sp/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Unility functions for general purposes.""" 5 | 6 | from pathlib import Path 7 | 8 | 9 | def mkdir_join(path, *dir_name, rank=0): 10 | """Concatenate root path and 1 or more paths, and make a new directory if the directory does not exist. 11 | Args: 12 | path (str): path to a directory 13 | rank (int): rank of current process group 14 | dir_name (str): a directory name 15 | Returns: 16 | path to the new directory 17 | """ 18 | p = Path(path) 19 | if not p.is_dir() and rank == 0: 20 | p.mkdir() 21 | for i in range(len(dir_name)): 22 | # dir 23 | if i < len(dir_name) - 1: 24 | p = p.joinpath(dir_name[i]) 25 | if not p.is_dir() and rank == 0: 26 | p.mkdir() 27 | elif '.' 
not in dir_name[i]: 28 | p = p.joinpath(dir_name[i]) 29 | if not p.is_dir() and rank == 0: 30 | p.mkdir() 31 | # file 32 | else: 33 | p = p.joinpath(dir_name[i]) 34 | return str(p.absolute()) 35 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --maxfail=3 --durations=10 --cov-config=.coveragerc --cov=neural_sp --cov-report xml 6 | python_files = test/*/test_*.py 7 | testpaths = test 8 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hirofumi0810/neural_sp/b91877c6d2a11f06026480ab422176274d88cbf2/test/__init__.py -------------------------------------------------------------------------------- /test/decoders/dict.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | ' 5 6 | a 6 7 | b 7 8 | c 8 9 | d 9 10 | -------------------------------------------------------------------------------- /test/encoders/test_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for encoder utility functions.""" 5 | 6 | import importlib 7 | import numpy as np 8 | import pytest 9 | import torch 10 | 11 | from neural_sp.models.torch_utils import np2tensor 12 | from neural_sp.models.torch_utils import pad_list 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "N_l, N_c, N_r", 17 | [ 18 | (96, 64, 32), 19 | (64, 64, 64), 20 | (40, 40, 40), 21 | (40, 40, 20), 22 | ] 23 | ) 24 | def test_chunkwise(N_l, N_c, N_r): 25 | batch_size = 4 26 | xmaxs = [800, 855] 27 | input_dim = 80 28 | device = "cpu" 29 | 30 | module = importlib.import_module('neural_sp.models.seq2seq.encoders.utils') 31 | 32 | for xmax in xmaxs: 33 | xs = np.random.randn(batch_size, xmax, input_dim).astype(np.float32) 34 | xs = pad_list([np2tensor(x, device).float() for x in xs], 0.) 35 | 36 | xs_chunk = module.chunkwise(xs, N_l, N_c, N_r) 37 | 38 | # Extract the center region 39 | xs_chunk = xs_chunk[:, N_l:N_l + N_c] # `[B * n_chunks, N_c, input_dim]` 40 | xs_chunk = xs_chunk.contiguous().view(batch_size, -1, xs_chunk.size(2)) 41 | xs_chunk = xs_chunk[:, :xmax] 42 | 43 | assert xs_chunk.size() == xs.size() 44 | assert torch.equal(xs_chunk, xs) 45 | -------------------------------------------------------------------------------- /test/frontends/test_frame_stacking.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for frame stacking.""" 5 | 6 | import importlib 7 | import math 8 | import numpy as np 9 | import pytest 10 | 11 | from neural_sp.models.torch_utils import np2tensor 12 | from neural_sp.models.torch_utils import pad_list 13 | 14 | 15 | def make_args(**kwargs): 16 | args = dict( 17 | n_stacks=1, 18 | n_skips=1, 19 | ) 20 | args.update(kwargs) 21 | return args 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "args", 26 | [ 27 | ({'n_stacks': 1, 'n_skips': 1}), 28 | ({'n_stacks': 2, 'n_skips': 2}), 29 | ({'n_stacks': 3, 'n_skips': 3}), 30 | ({'n_stacks': 3, 'n_skips': 1}), 31 | ] 32 | ) 33 | def test_forward(args): 34 | args = make_args(**args) 35 | 36 | batch_size = 4 37 | xmax = 40 38 | input_dim = 80 39 | device = "cpu" 40 | 41 | xs = [np.random.randn(xlen, input_dim).astype(np.float32) 42 | for xlen in range(xmax - batch_size, xmax)] 43 | xs_pad = pad_list([np2tensor(x, device).float() for x in xs], 0.) 44 | 45 | module = importlib.import_module('neural_sp.models.seq2seq.frontends.frame_stacking') 46 | 47 | out = [module.stack_frame(x, args['n_stacks'], args['n_skips']) 48 | for x in xs] 49 | out_pad = pad_list([np2tensor(x, device).float() for x in out], 0.) 50 | assert out_pad.size(0) == xs_pad.size(0) 51 | assert out_pad.size(1) == math.ceil(xs_pad.size(1) / args['n_skips']) 52 | assert out_pad.size(2) == xs_pad.size(2) * args['n_stacks'] 53 | -------------------------------------------------------------------------------- /test/frontends/test_input_noise.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for input noise injection.""" 5 | 6 | import numpy as np 7 | 8 | from neural_sp.models.torch_utils import np2tensor 9 | from neural_sp.models.torch_utils import pad_list 10 | from neural_sp.models.seq2seq.frontends.input_noise import add_input_noise 11 | 12 | 13 | def test_forward(): 14 | batch_size = 4 15 | xmax = 40 16 | input_dim = 80 17 | device = "cpu" 18 | 19 | xs = np.random.randn(batch_size, xmax, input_dim).astype(np.float32) 20 | xs = pad_list([np2tensor(x, device).float() for x in xs], 0.) 21 | 22 | out = add_input_noise(xs, std=0.075) 23 | assert out.size() == xs.size() 24 | -------------------------------------------------------------------------------- /test/frontends/test_sequence_summary.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for sequence summary network.""" 5 | 6 | import importlib 7 | import numpy as np 8 | import pytest 9 | import torch 10 | 11 | from neural_sp.models.torch_utils import np2tensor 12 | from neural_sp.models.torch_utils import pad_list 13 | 14 | 15 | def make_args(**kwargs): 16 | args = dict( 17 | input_dim=80, 18 | n_units=64, 19 | n_layers=2, 20 | bottleneck_dim=0, 21 | dropout=0.1, 22 | param_init=0.1, 23 | ) 24 | args.update(kwargs) 25 | return args 26 | 27 | 28 | @pytest.mark.parametrize( 29 | "args", 30 | [ 31 | ({'n_layers': 2, 'bottleneck_dim': 0}), 32 | ({'n_layers': 2, 'bottleneck_dim': 100}), 33 | ({'n_layers': 3, 'bottleneck_dim': 0}), 34 | ({'n_layers': 3, 'bottleneck_dim': 100}), 35 | ] 36 | ) 37 | def test_forward(args): 38 | args = make_args(**args) 39 | 40 | batch_size = 4 41 | xmax = 40 42 | device = "cpu" 43 | 44 | xs = np.random.randn(batch_size, xmax, args['input_dim']).astype(np.float32) 45 | xlens = torch.IntTensor([len(x) for x in xs]) 46 | xs = pad_list([np2tensor(x, device).float() for x in xs], 0.) 47 | 48 | module = importlib.import_module('neural_sp.models.seq2seq.frontends.sequence_summary') 49 | ssn = module.SequenceSummaryNetwork(**args) 50 | ssn = ssn.to(device) 51 | 52 | out = ssn(xs, xlens) 53 | assert out.size() == xs.size() 54 | -------------------------------------------------------------------------------- /test/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | $CXX -v 6 | 7 | ROOT=$(pwd) 8 | KALDI_ROOT=${ROOT}/tools/kaldi 9 | TOOL=${ROOT}/tools/neural_sp 10 | 11 | # install kaldi (not compiled) 12 | if [ ! -d ${KALDI_ROOT} ]; then 13 | git clone https://github.com/kaldi-asr/kaldi.git ${KALDI_ROOT} 14 | fi 15 | 16 | # download pre-built kaldi binary (copy from espnet) 17 | [ ! -e ubuntu16-featbin.tar.gz ] && wget --tries=3 https://github.com/espnet/kaldi-bin/releases/download/v0.0.1/ubuntu16-featbin.tar.gz 18 | tar -xf ./ubuntu16-featbin.tar.gz 19 | cp featbin/* ${KALDI_ROOT}/src/featbin/ 20 | 21 | cd tools 22 | make PYTORCH_VERSION="${PYTORCH_VERSION}" PYTHON_VERSION="${TRAVIS_PYTHON_VERSION}" TOOL="${TOOL}" KALDI=${KALDI_ROOT} 23 | cd ${ROOT} 24 | 25 | source ${TOOL}/miniconda/bin/activate 26 | 27 | pip install -e ".[test]" # install test dependencies (setup.py) 28 | 29 | # log 30 | pip freeze 31 | -------------------------------------------------------------------------------- /test/modules/test_pointwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test for positionwise fully-connected feed-forward neural network (FFN).""" 5 | 6 | import importlib 7 | import pytest 8 | import torch 9 | 10 | 11 | def make_args(**kwargs): 12 | args = dict( 13 | d_model=32, 14 | d_ff=128, 15 | dropout=0.1, 16 | activation='relu', 17 | param_init='', 18 | bottleneck_dim=0, 19 | ) 20 | args.update(kwargs) 21 | return args 22 | 23 | 24 | @pytest.mark.parametrize( 25 | "args", 26 | [ 27 | # activation 28 | ({'activation': 'relu'}), 29 | ({'activation': 'gelu'}), 30 | ({'activation': 'gelu_accurate'}), 31 | ({'activation': 'glu'}), 32 | ({'activation': 'swish'}), 33 | # initialization 34 | ({'param_init': 'xavier_uniform'}), 35 | # bottleneck 36 | ({'bottleneck_dim': 16}), 37 | ] 38 | ) 39 | def test_forward(args): 40 | args = make_args(**args) 41 | 42 | batch_size = 4 43 | max_len = 40 44 | device = "cpu" 45 | 46 | ffn_in = torch.FloatTensor(batch_size, max_len, args['d_model'], device=device) 47 | 48 | module = importlib.import_module('neural_sp.models.modules.positionwise_feed_forward') 49 | ffn = module.PositionwiseFeedForward(**args) 50 | ffn = ffn.to(device) 51 | 52 | ffn_out = ffn(ffn_in) 53 | assert ffn_in.size() == ffn_out.size() 54 | -------------------------------------------------------------------------------- /test/test_python.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source tools/neural_sp/miniconda/bin/activate 4 | 5 | modules="neural_sp test utils setup.py" 6 | pycodestyle -r ${modules} --show-source --show-pep8 --ignore="E501" 7 | 8 | pytest -------------------------------------------------------------------------------- /utils/make_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | feat="" # feats.scp 9 | unit="" 10 | remove_space=false 11 | unk="" 12 | space="" 13 | nlsyms="" 14 | wp_model="" 15 | wp_nbest=1 16 | text= 17 | 18 | . utils/parse_options.sh 19 | 20 | if [ $# != 2 ]; then 21 | echo "Usage: $0 <data> <dict>"; 22 | exit 1; 23 | fi 24 | 25 | data=$1 26 | dict=$2 27 | 28 | if [ -z ${text} ]; then 29 | text=${data}/text 30 | fi 31 | 32 | make_tsv.py --feat ${feat} \ 33 | --utt2num_frames ${data}/utt2num_frames \ 34 | --utt2spk ${data}/utt2spk \ 35 | --text ${text} \ 36 | --dict ${dict} \ 37 | --unit ${unit} \ 38 | --remove_space ${remove_space} \ 39 | --unk ${unk} \ 40 | --space ${space} \ 41 | --nlsyms ${nlsyms} \ 42 | --wp_model ${wp_model} \ 43 | --wp_nbest ${wp_nbest} 44 | -------------------------------------------------------------------------------- /utils/speed_perturb_3way.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | nj=32 9 | speeds="0.9 1.0 1.1" 10 | 11 | . 
utils/parse_options.sh 11 | 12 | 13 | if [ $# != 3 ]; then 14 | echo "Usage: $0 <data> <train_set_original> <train_set>"; 15 | exit 1; 16 | fi 17 | 18 | data=$1 19 | train_set_original=$2 20 | train_set=$3 21 | tmpdir=$(mktemp -d ${data}/${train_set_original}/tmp-XXXXX) 22 | trap 'rm -rf ${tmpdir}' EXIT 23 | 24 | if [ ${train_set_original} = ${train_set} ];then 25 | echo "train_set_original and train_set should be different names" 26 | fi 27 | 28 | for speed in ${speeds}; do 29 | utils/perturb_data_dir_speed.sh ${speed} ${data}/${train_set_original} ${tmpdir}/temp${speed} 30 | done 31 | utils/combine_data.sh --extra-files utt2uniq ${data}/${train_set} ${tmpdir}/temp* 32 | rm -r ${tmpdir}/temp* 33 | steps/make_fbank.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ 34 | ${data}/${train_set} ${data}/log/make_fbank/${train_set} ${data}/fbank 35 | touch ${data}/${train_set}/text.tmp 36 | for speed in ${speeds}; do 37 | awk -v p="sp${speed}-" '{printf("%s %s%s\n", $1, p, $1);}' ${data}/${train_set_original}/utt2spk > ${data}/${train_set}/utt_map 38 | utils/apply_map.pl -f 1 ${data}/${train_set}/utt_map <${data}/${train_set_original}/text >>${data}/${train_set}/text.tmp 39 | done 40 | mv ${data}/${train_set}/text.tmp ${data}/${train_set}/text 41 | utils/fix_data_dir.sh ${data}/${train_set} 42 | -------------------------------------------------------------------------------- /utils/update_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Kyoto University (Hirofumi Inaguma) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | . ./path.sh 7 | 8 | unit="" 9 | remove_space=false 10 | unk="" 11 | space="" 12 | nlsyms="" 13 | wp_model="" 14 | 15 | . utils/parse_options.sh 16 | 17 | if [ $# != 3 ]; then 18 | echo "Usage: $0 <text> <dict> <tsv>"; 19 | exit 1; 20 | fi 21 | 22 | text=$1 23 | dict=$2 24 | tsv=$3 25 | tmpdir=$(mktemp -d $(dirname ${text})/tmp-XXXXX) 26 | trap 'rm -rf ${tmpdir}' EXIT 27 | 28 | cp ${tsv} ${tmpdir}/tmp.tsv 29 | 30 | # For additional unpaired text 31 | make_tsv.py --text ${text} \ 32 | --dict ${dict} \ 33 | --unit ${unit} \ 34 | --remove_space ${remove_space} \ 35 | --unk ${unk} \ 36 | --space ${space} \ 37 | --nlsyms ${nlsyms} \ 38 | --wp_model ${wp_model} \ 39 | --update >> ${tmpdir}/tmp.tsv 40 | 41 | cat ${tmpdir}/tmp.tsv 42 | 43 | rm -fr ${tmpdir} 44 | --------------------------------------------------------------------------------
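A hedged usage sketch for the two dataset utilities above (utils/make_dataset.sh and utils/update_dataset.sh), assuming a Kaldi-style data directory has already been prepared; the corpus paths, dictionary file, unpaired-text file, and wordpiece model names below are hypothetical placeholders chosen for illustration, not files shipped with the recipes:

# build a baseline tsv from a prepared data directory (options are parsed by utils/parse_options.sh and must precede the positional arguments)
make_dataset.sh --feat data/train/feats.scp --unit wp --wp_model data/local/wp_model \
    data/train data/local/dict.txt > data/dataset/train_wp.tsv

# append additional unpaired text to that tsv; the merged tsv is written to stdout
update_dataset.sh --unit wp --wp_model data/local/wp_model \
    data/local/extra_text data/local/dict.txt data/dataset/train_wp.tsv > data/dataset/train_wp_ext.tsv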