├── asr1
│   ├── conf
│   │   ├── pitch.conf
│   │   ├── fbank.conf
│   │   ├── mfcc.conf
│   │   ├── decode_rnn.yaml
│   │   ├── decode_transformer.yaml
│   │   ├── decode_pytorch_transformer_large.yaml
│   │   ├── specaug.yaml
│   │   ├── lm.yaml
│   │   ├── train_pytorch_transformer_large_ngpu4.yaml
│   │   ├── train_rnn.yaml
│   │   ├── train_transformer.yaml
│   │   └── train_pytorch_conformer_lr5.yaml
│   ├── setup_experiment.sh
│   ├── path.sh
│   ├── cmd.sh
│   ├── recog_wav.sh
│   ├── local
│   │   ├── data_prep.py
│   │   └── .ipynb_checkpoints
│   │       └── data_prep-checkpoint.py
│   ├── run.sh
│   └── .ipynb_checkpoints
│       └── run-checkpoint.sh
└── README.md

--------------------------------------------------------------------------------
/asr1/conf/pitch.conf:
--------------------------------------------------------------------------------
--sample-frequency=16000

--------------------------------------------------------------------------------
/asr1/conf/fbank.conf:
--------------------------------------------------------------------------------
--sample-frequency=16000
--num-mel-bins=80

--------------------------------------------------------------------------------
/asr1/conf/mfcc.conf:
--------------------------------------------------------------------------------
--use-energy=false       # only non-default option.
--sample-frequency=16000

--------------------------------------------------------------------------------
/asr1/conf/decode_rnn.yaml:
--------------------------------------------------------------------------------
lm-weight: 1.0
beam-size: 30
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.3

--------------------------------------------------------------------------------
/asr1/conf/decode_transformer.yaml:
--------------------------------------------------------------------------------
batchsize: 0
beam-size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.3
lm-weight: 1.0

--------------------------------------------------------------------------------
/asr1/conf/decode_pytorch_transformer_large.yaml:
--------------------------------------------------------------------------------
batchsize: 0
beam-size: 60
ctc-weight: 0.4
lm-weight: 0.6
maxlenratio: 0.0
minlenratio: 0.0
penalty: 0.0

--------------------------------------------------------------------------------
/asr1/conf/specaug.yaml:
--------------------------------------------------------------------------------
process:
    # these two masking processes implement SpecAugment (time warping is omitted)
    - type: "freq_mask"
      F: 30
      n_mask: 2
      inplace: true
      replace_with_zero: false
    - type: "time_mask"
      T: 40
      n_mask: 2
      inplace: true
      replace_with_zero: false

--------------------------------------------------------------------------------
/asr1/setup_experiment.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ $# -ne 1 ]; then
    echo >&2 "Usage: ./setup_experiment.sh <expname>"
    exit 1;
fi

expname=$1
cd ..
mkdir ${expname}
cd ${expname}

cp ../asr1/{cmd,path,run,recog_wav}.sh .
cp -P ../asr1/steps .
cp -P ../asr1/utils .
ln -s ../asr1/local .
ln -s ../asr1/conf .
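
A minimal usage sketch for setup_experiment.sh (the experiment name "my_exp" and the recipe path are hypothetical):

    $ cd egs/<corpus>/asr1
    $ ./setup_experiment.sh my_exp
    $ cd ../my_exp && ./run.sh

The driver scripts (cmd.sh, path.sh, run.sh, recog_wav.sh) are copied so they can be edited per experiment; steps and utils are copied with -P so Kaldi-style symlinks are preserved rather than dereferenced; and local and conf are symlinked so data preparation and configuration stay shared across experiments.
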
--------------------------------------------------------------------------------
/asr1/conf/lm.yaml:
--------------------------------------------------------------------------------
layer: 2         # 2 for character LMs
unit: 650        # 650 for character LMs
opt: adam        # adam for character LMs
sortagrad: 0     # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
batchsize: 1024  # 1024 for character LMs
epoch: 100       # number of epochs
patience: 0
maxlen: 150      # 150 for character LMs

--------------------------------------------------------------------------------
/asr1/conf/train_pytorch_transformer_large_ngpu4.yaml:
--------------------------------------------------------------------------------
# This configuration requires 4 GPUs with 12 GB of memory each
accum-grad: 4
adim: 512
aheads: 8
backend: pytorch
batch-bins: 15000000
dlayers: 6
dropout-rate: 0.1
dunits: 2048
elayers: 12
epochs: 120
eunits: 2048
grad-clip: 5
lsm-weight: 0.1
model-module: espnet.nets.pytorch_backend.e2e_asr_transformer:E2E
mtlalpha: 0.3
opt: noam
patience: 0
sortagrad: 0
transformer-attn-dropout-rate: 0.0
transformer-init: pytorch
transformer-input-layer: conv2d
transformer-length-normalized-loss: false
transformer-lr: 10.0
transformer-warmup-steps: 25000

--------------------------------------------------------------------------------
/asr1/path.sh:
--------------------------------------------------------------------------------
MAIN_ROOT=$PWD/../../..
KALDI_ROOT=$MAIN_ROOT/tools/kaldi

[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C

export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5/:$PWD:$PATH

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build
if [ -e $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh ]; then
    source $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate
else
    source $MAIN_ROOT/tools/venv/bin/activate
fi
export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH

export OMP_NUM_THREADS=1

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
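
In a typical ESPnet1 recipe these pieces come together at the top of run.sh: path.sh puts the Kaldi and ESPnet binaries on PATH and activates the Python environment, cmd.sh selects the job dispatcher, and the YAML files are handed to the training and decoding entry points. A hedged sketch (the experiment directory and data JSON paths are illustrative, and further required arguments such as --dict are omitted):

    . ./path.sh
    . ./cmd.sh
    ${cuda_cmd} exp/train_demo/train.log \
        asr_train.py --ngpu 1 \
        --config conf/train_transformer.yaml \
        --preprocess-conf conf/specaug.yaml \
        --train-json dump/train/data.json \
        --valid-json dump/dev/data.json \
        --outdir exp/train_demo/results
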
--------------------------------------------------------------------------------
/asr1/conf/train_rnn.yaml:
--------------------------------------------------------------------------------
# network architecture
# encoder related
etype: vggblstmp       # encoder architecture type
elayers: 6
eunits: 320
eprojs: 320
subsample: "1_2_2_1_1" # per-layer frame subsampling factors (each layer keeps every nth frame)
# decoder related
dlayers: 3
dunits: 300
# attention related
atype: location
adim: 320
awin: 5
aheads: 4
aconv-chans: 10
aconv-filts: 100

# hybrid CTC/attention
mtlalpha: 0.2

# label smoothing
lsm-type: unigram
lsm-weight: 0.05

# minibatch related
batch-size: 30
maxlen-in: 800  # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: adadelta
epochs: 40
patience: 5

# scheduled sampling option
sampling-probability: 0.0

# Report CER & WER
report-cer: true
report-wer: true

--------------------------------------------------------------------------------
/asr1/conf/train_transformer.yaml:
--------------------------------------------------------------------------------
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512  # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 10
epochs: 160
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
transformer-input-layer: conv2d # type of the input (frontend) layer
transformer-lr: 10.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# Report CER & WER
report-cer: true
report-wer: true
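
A note on the optimizer settings above: with opt: noam, transformer-lr is a scale factor rather than an absolute learning rate. To the best of my understanding of the ESPnet implementation, the effective rate follows the Noam schedule

    lr(step) = transformer-lr * adim^(-0.5) * min(step^(-0.5), step * warmup^(-1.5))

so the rate warms up roughly linearly for transformer-warmup-steps updates and then decays with the inverse square root of the step count. accum-grad: 2 additionally accumulates gradients over two minibatches per optimizer step, emulating a batch twice as large.
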
--------------------------------------------------------------------------------
/asr1/conf/train_pytorch_conformer_lr5.yaml:
--------------------------------------------------------------------------------
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.2

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 64
maxlen-in: 512  # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
opt: noam
accum-grad: 8
grad-clip: 5
patience: 0
epochs: 100
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d # type of the input (frontend) layer
transformer-lr: 5.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
transformer-encoder-activation-type: swish
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31

--------------------------------------------------------------------------------
/asr1/cmd.sh:
--------------------------------------------------------------------------------
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time
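
In standard ESPnet recipes, cmd.sh goes on to export the dispatch variables that the run.sh stages use; a typical local-execution default (an assumption about the truncated file, not content shown above) looks like:

    export train_cmd="run.pl --mem 2G"
    export cuda_cmd="run.pl --gpu 1"
    export decode_cmd="run.pl --mem 4G"

With run.pl every job runs on the local machine; swapping in queue.pl or slurm.pl sends the same jobs to an SGE or Slurm cluster without touching the rest of the recipe.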