├── README.md
├── librilight
├── README.md
├── cmd.sh
├── conf
│ ├── decode.config
│ ├── fbank.conf
│ ├── gpu.conf
│ ├── mfcc.conf
│ ├── mfcc_hires.conf
│ ├── online_cmvn.conf
│ ├── online_pitch.conf
│ ├── queue_no_k20.conf
│ └── spec.conf
├── decode_nnet_pytorch.sh
├── local
│ ├── data_prep.sh
│ ├── download_and_untar.sh
│ ├── download_lm.sh
│ ├── format_lms.sh
│ ├── prepare_dict.sh
│ ├── prepare_librilight.sh
│ ├── prepare_librilight_dataset.sh
│ ├── prepare_test.sh
│ ├── prepare_unlabeled_tgt.py
│ ├── score.sh
│ ├── subset_dataset.sh
│ └── train_async_parallel2.sh
├── path.sh
├── run.sh
├── steps
├── train_nnet_pytorch.sh
└── utils
├── librispeech
├── README
├── cmd.sh
├── conf
│ ├── decode.config
│ ├── fbank.conf
│ ├── gpu.conf
│ ├── mfcc.conf
│ ├── mfcc_hires.conf
│ ├── online_cmvn.conf
│ ├── online_pitch.conf
│ ├── queue_no_k20.conf
│ └── spec.conf
├── decode.sh
├── local
│ ├── data_prep.sh
│ ├── download_lm.sh
│ ├── format_lms.sh
│ ├── prepare_dict.sh
│ ├── prepare_test.sh
│ ├── score.sh
│ └── subset_dataset.sh
├── path.sh
├── run-blstm.sh
├── run-wrn.sh
├── run.sh
├── steps
└── utils
├── librispeech100
├── cmd.sh
├── conf
│ ├── decode.config
│ ├── fbank.conf
│ ├── gpu.conf
│ ├── mfcc.conf
│ ├── mfcc_hires.conf
│ ├── online_cmvn.conf
│ ├── online_pitch.conf
│ ├── queue_no_k20.conf
│ └── spec.conf
├── decode.sh
├── decorrupt.sh
├── generate.sh
├── local
│ ├── data_prep.sh
│ ├── decode_nnet_pytorch.sh
│ ├── download_and_untar.sh
│ ├── download_lm.sh
│ ├── format_lms.sh
│ ├── prepare_dict.sh
│ ├── prepare_librilight.sh
│ ├── prepare_librilight_dataset.sh
│ ├── prepare_test.sh
│ ├── prepare_unlabeled_tgt.py
│ ├── score.sh
│ ├── split_memmap_data.sh
│ ├── subset_dataset.sh
│ └── train_async_parallel.sh
├── path.sh
├── run-semisup-wrn-scratch.sh
├── run-semisup-wrn.sh
├── run-tdnn.sh
├── run-wrn.sh
├── run.sh
├── steps
└── utils
├── nnet_pytorch
├── INSTALL_PYCHAIN
├── IterationTypes.py
├── LRScheduler.py
├── __init__.py
├── batch_generators.py
├── data_utils.py
├── datasets
│ ├── HybridASR.py
│ ├── NnetPytorchDataset.py
│ ├── __init__.py
│ └── data_utils.py
├── decode.py
├── decorrupt.py
├── generate.py
├── generate_conditional_from_buffer.py
├── models
│ ├── BLSTM.py
│ ├── Resnet.py
│ ├── TDNN.py
│ ├── WideResnet.py
│ └── __init__.py
├── objectives
│ ├── AcceleratedSGLD.py
│ ├── CrossEntropy.py
│ ├── CrossEntropy_EBM.py
│ ├── L2.py
│ ├── LFMMI.py
│ ├── LFMMIOnly.py
│ ├── LFMMI_EBM.py
│ ├── SGLD.py
│ ├── SGLDAdam.py
│ ├── SGLDSampler.py
│ ├── SemisupLFMMI.py
│ ├── __init__.py
│ └── optimizer.py
├── train.py
└── utils
│ ├── average_models.py
│ ├── combine_models.py
│ ├── decode_nnet_pytorch.sh
│ ├── memmap_data.py
│ ├── prepare_unlabeled_tgt.py
│ ├── show_decorruption.py
│ ├── show_sampling.py
│ ├── split_memmap_data.sh
│ └── train_async_parallel.sh
└── tools
├── Makefile
├── pychain_patch.diff
└── requirements.txt

--------------------------------------------------------------------------------
/librilight/README.md:
--------------------------------------------------------------------------------
To run this example, the commands in the following files must first be configured
for the computing environment on which the code is running.

librilight/cmd.sh -- contains commands for training and decoding. These commands
may need to be modified for new computing clusters.

librilight/conf/gpu.conf -- GPU configuration options that may also need to be changed.
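For example, to run everything on a single local machine with no grid engine (a
minimal sketch; run.pl ships with Kaldi), every queue.pl command in cmd.sh could
be replaced with run.pl:

    export train_cmd=run.pl
    export decode_cmd=run.pl
    export mkgraph_cmd=run.pl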
The CUDA_VISIBLE_DEVICES environment variable is set internally in the code.
Users should modify this line in the following files. It is indicated by
many commented lines before and after with a note:

1. train.py
2. decode.py
3. generate_conditional_from_buffer.py


Furthermore, the Librispeech corpus (unlabeled data) will need to be downloaded.
Place the path to the Librispeech data in the variable unlabeled_data found in the
run.sh script (first line of code).

--------------------------------------------------------------------------------
/librilight/cmd.sh:
--------------------------------------------------------------------------------
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
export mkgraph_cmd="queue.pl --mem 8G"

--------------------------------------------------------------------------------
/librilight/conf/decode.config:
--------------------------------------------------------------------------------
# empty config, just use the defaults.

--------------------------------------------------------------------------------
/librilight/conf/fbank.conf:
--------------------------------------------------------------------------------
--sample-frequency=16000
--num-mel-bins=80

--------------------------------------------------------------------------------
/librilight/conf/gpu.conf:
--------------------------------------------------------------------------------
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l 'hostname=b1[12345678]*|c0[12345789]*|c1[12356789]*|c2[123456789]*,gpu=$0' -q g.q
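# Example (illustrative): a job submitted as "queue.pl --gpu 1 ..." matches the
# "option gpu=*" line above; queue.pl substitutes the value for $0, adding
# "-l 'hostname=...,gpu=1' -q g.q" to the qsub options.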
--------------------------------------------------------------------------------
/librilight/conf/mfcc.conf:
--------------------------------------------------------------------------------
--use-energy=false   # only non-default option.

--------------------------------------------------------------------------------
/librilight/conf/mfcc_hires.conf:
--------------------------------------------------------------------------------
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--num-mel-bins=40    # similar to Google's setup.
--num-ceps=40        # there is no dimensionality reduction.
--low-freq=20        # low cutoff frequency for mel bins... this is high-bandwidth data, so
                     # there might be some information at the low end.
--high-freq=-400     # high cutoff frequency, relative to Nyquist of 8000 (=7600)

--------------------------------------------------------------------------------
/librilight/conf/online_cmvn.conf:
--------------------------------------------------------------------------------
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

--------------------------------------------------------------------------------
/librilight/conf/online_pitch.conf:
--------------------------------------------------------------------------------
## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats,
## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given
## to programs like online2-wav-nnet2-latgen-faster.
## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that
## are the same as those which will be generated in online decoding; this enables us to train
## in a way that's compatible with online decoding.
##

## most of these options relate to the post-processing rather than the pitch
## extraction itself.
--add-raw-log-pitch=true   ## this is intended for input to neural nets, so our
                           ## approach is "throw everything in and see what
                           ## sticks".
--normalization-left-context=75
--normalization-right-context=50   # We're removing some of the right-context
                                   # for the normalization. Would normally be 75.
                                   #
                                   # Note: our changes to the (left,right) context
                                   # from the defaults of (75,75) to (75,50) will
                                   # almost certainly worsen results, but will
                                   # reduce latency.
--frames-per-chunk=10   ## relates to offline simulation of online decoding; 1
                        ## would be equivalent to getting in samples one by
                        ## one.
--simulate-first-pass-online=true   ## this makes the online-pitch-extraction code
                                    ## output the 'first-pass' features, which
                                    ## are less accurate than the final ones, and
                                    ## which are the only features the neural-net
                                    ## decoding would ever see (since we can't
                                    ## afford to do lattice rescoring in the
                                    ## neural-net code).

--------------------------------------------------------------------------------
/librilight/conf/queue_no_k20.conf:
--------------------------------------------------------------------------------
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'

--------------------------------------------------------------------------------
/librilight/conf/spec.conf:
--------------------------------------------------------------------------------
--preemphasis-coefficient=0.0
--remove-dc-offset=false
--round-to-power-of-two=false
--window-type=hanning

--------------------------------------------------------------------------------
/librilight/decode_nnet_pytorch.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. ./path.sh

batchsize=512
skip_datadump=false
checkpoint=final.mdl
prior_scale=1.0
prior_floor=-20.0
prior_name="priors"
min_active=200
max_active=7000
max_mem=50000000
lattice_beam=8.0
beam=15.0
acoustic_scale=0.1
post_decode_acwt=10.0 # 10.0 for chain systems, 1.0 for non-chain
mean_var="(True, True)"

min_lmwt=6
max_lmwt=18
nj=80
stage=0

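# Example invocation (hypothetical model and graph paths; the four positional
# arguments are <data> <pytorch_model> <graphdir> <odir>):
#   ./decode_nnet_pytorch.sh --checkpoint final.mdl --nj 80 \
#     data/dev_clean_fbank exp/wrn exp/chain_wrn/tree/graph_tgsmall exp/wrn/decode_dev_clean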
. ./utils/parse_options.sh
if [ $# -ne 4 ]; then
  echo "Usage: ./decode_nnet_pytorch.sh <data> <pytorch_model> <graphdir> <odir>"
  echo " --batchsize ${batchsize} --skip-datadump ${skip_datadump}"
  echo " --checkpoint ${checkpoint} --prior-scale ${prior_scale} --prior-floor ${prior_floor} --prior-name ${prior_name}"
  echo " --min-active ${min_active} --max-active ${max_active}"
  echo " --max-mem ${max_mem} --lattice-beam ${lattice_beam}"
  echo " --beam ${beam} --acoustic-scale ${acoustic_scale} --post-decode-acwt ${post_decode_acwt}"
  echo " --nj ${nj}"
  exit 1;
fi

data=$1
pytorch_model=$2
graphdir=$3
odir=$4

# We assume the transition model (final.mdl) is 1 level above the graphdir
amdir=`dirname ${graphdir}`
trans_mdl=${amdir}/final.mdl
words_file=${graphdir}/words.txt
hclg=${graphdir}/HCLG.fst

# decode.py always gets --skip-datadump; when skip_datadump=false we first dump
# the features to a memory-mapped file, which decode.py then reads directly.
skip_datadump_opts="--skip-datadump"
if ! $skip_datadump; then
  memmap_data.py ${data}/feats.scp ${data}/feats.scp.dat
fi

mkdir -p ${odir}/log

decode_cmd="utils/queue.pl --mem 2G -l hostname=b0*" # The 'a' machines are just too slow
if [ $stage -le 0 ]; then
  segments=${data}/segments
  if [ ! -f ${data}/segments ]; then
    echo "No segments file found. 
Assuming wav.scp is indexed by utterance" 63 | segments=${data}/wav.scp 64 | fi 65 | 66 | ${decode_cmd} JOB=1:${nj} ${odir}/log/decode.JOB.log \ 67 | ./utils/split_scp.pl -j ${nj} \$\[JOB -1\] ${segments} \|\ 68 | decode.py ${skip_datadump_opts} \ 69 | --datadir ${data} \ 70 | --modeldir ${pytorch_model} \ 71 | --dumpdir ${odir} \ 72 | --checkpoint ${checkpoint} \ 73 | --prior-scale ${prior_scale} \ 74 | --prior-floor ${prior_floor} \ 75 | --prior-name ${prior_name} \ 76 | --words-file ${words_file} \ 77 | --trans-mdl ${trans_mdl} \ 78 | --hclg ${hclg} \ 79 | --min-active ${min_active} \ 80 | --max-active ${max_active} \ 81 | --lattice-beam ${lattice_beam} \ 82 | --beam ${beam} \ 83 | --acoustic-scale ${acoustic_scale} \ 84 | --post-decode-acwt ${post_decode_acwt} \ 85 | --job JOB \ 86 | --utt-subset /dev/stdin \ 87 | --batchsize ${batchsize} 88 | fi 89 | 90 | if [ $stage -le 1 ]; then 91 | ./local/score.sh --cmd "$decode_cmd" \ 92 | --min-lmwt ${min_lmwt} --max-lmwt ${max_lmwt} --word-ins-penalty 0.0 \ 93 | ${data} ${graphdir} ${odir} 94 | fi 95 | -------------------------------------------------------------------------------- /librilight/local/data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | if [ "$#" -ne 2 ]; then 8 | echo "Usage: $0 " 9 | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" 10 | exit 1 11 | fi 12 | 13 | src=$1 14 | dst=$2 15 | 16 | # all utterances are FLAC compressed 17 | if ! which flac >&/dev/null; then 18 | echo "Please install 'flac' on ALL worker nodes!" 19 | exit 1 20 | fi 21 | 22 | spk_file=$src/../SPEAKERS.TXT 23 | 24 | mkdir -p $dst || exit 1; 25 | 26 | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1; 27 | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; 28 | 29 | 30 | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp 31 | trans=$dst/text; [[ -f "$trans" ]] && rm $trans 32 | utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk 33 | spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender 34 | 35 | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do 36 | reader=$(basename $reader_dir) 37 | if ! [ $reader -eq $reader ]; then # not integer. 38 | echo "$0: unexpected subdirectory name $reader" 39 | exit 1; 40 | fi 41 | 42 | reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') 43 | if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then 44 | echo "Unexpected gender: '$reader_gender'" 45 | exit 1; 46 | fi 47 | 48 | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do 49 | chapter=$(basename $chapter_dir) 50 | if ! [ "$chapter" -eq "$chapter" ]; then 51 | echo "$0: unexpected chapter-subdirectory name $chapter" 52 | exit 1; 53 | fi 54 | 55 | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ 56 | awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 57 | 58 | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt 59 | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 60 | cat $chapter_trans >>$trans 61 | 62 | # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered 63 | # to be a different speaker. 
This is done for simplicity and because we want 64 | # e.g. the CMVN to be calculated per-chapter 65 | awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ 66 | <$chapter_trans >>$utt2spk || exit 1 67 | 68 | # reader -> gender map (again using per-chapter granularity) 69 | echo "${reader}-${chapter} $reader_gender" >>$spk2gender 70 | done 71 | done 72 | 73 | spk2utt=$dst/spk2utt 74 | utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 75 | 76 | ntrans=$(wc -l <$trans) 77 | nutt2spk=$(wc -l <$utt2spk) 78 | ! [ "$ntrans" -eq "$nutt2spk" ] && \ 79 | echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; 80 | 81 | utils/validate_data_dir.sh --no-feats $dst || exit 1; 82 | 83 | echo "$0: successfully prepared data in $dst" 84 | 85 | exit 0 86 | -------------------------------------------------------------------------------- /librilight/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 4 | # 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 5 | # Apache 2.0 6 | 7 | remove_archive=false 8 | 9 | if [ "$1" == --remove-archive ]; then 10 | remove_archive=true 11 | shift 12 | fi 13 | 14 | if [ $# -ne 3 ]; then 15 | echo "Usage: $0 [--remove-archive] " 16 | echo "e.g.: $0 /export/a05/dgalvez/ www.openslr.org/resources/31 dev-clean-2" 17 | echo "With --remove-archive it will remove the archive after successfully un-tarring it." 18 | echo " can be one of: dev-clean-2, test-clean-5, dev-other, test-other," 19 | echo " train-clean-100, train-clean-360, train-other-500." 20 | fi 21 | 22 | data=$1 23 | url=$2 24 | part=$3 25 | 26 | if [ ! -d "$data" ]; then 27 | echo "$0: no such directory $data" 28 | exit 1; 29 | fi 30 | 31 | data=$(readlink -f $data) 32 | 33 | part_ok=false 34 | list="dev-clean-2 train-clean-5" 35 | for x in $list; do 36 | if [ "$part" == $x ]; then part_ok=true; fi 37 | done 38 | if ! $part_ok; then 39 | echo "$0: expected to be one of $list, but got '$part'" 40 | exit 1; 41 | fi 42 | 43 | if [ -z "$url" ]; then 44 | echo "$0: empty URL base." 45 | exit 1; 46 | fi 47 | 48 | if [ -f $data/LibriSpeech/$part/.complete ]; then 49 | echo "$0: data part $part was already successfully extracted, nothing to do." 50 | exit 0; 51 | fi 52 | 53 | 54 | #sizes="126046265 332747356" 55 | sizes="126046265 332954390" 56 | 57 | if [ -f $data/$part.tar.gz ]; then 58 | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') 59 | size_ok=false 60 | for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done 61 | if ! $size_ok; then 62 | echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" 63 | echo "does not equal the size of one of the archives." 64 | rm $data/$part.tar.gz 65 | else 66 | echo "$data/$part.tar.gz exists and appears to be complete." 67 | fi 68 | fi 69 | 70 | if [ ! -f $data/$part.tar.gz ]; then 71 | if ! which wget >/dev/null; then 72 | echo "$0: wget is not installed." 73 | exit 1; 74 | fi 75 | full_url=$url/$part.tar.gz 76 | echo "$0: downloading data from $full_url. This may take some time, please be patient." 77 | 78 | cd $data 79 | if ! wget --no-check-certificate $full_url; then 80 | echo "$0: error executing wget $full_url" 81 | exit 1; 82 | fi 83 | cd - 84 | fi 85 | 86 | cd $data 87 | 88 | if ! 
tar -xvzf $part.tar.gz; then 89 | echo "$0: error un-tarring archive $data/$part.tar.gz" 90 | exit 1; 91 | fi 92 | 93 | touch $data/LibriSpeech/$part/.complete 94 | 95 | echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" 96 | 97 | if $remove_archive; then 98 | echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 99 | rm $data/$part.tar.gz 100 | fi 101 | -------------------------------------------------------------------------------- /librilight/local/download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2017 Daniel Povey 5 | # Apache 2.0 6 | 7 | if [ $# -ne "3" ]; then 8 | echo "Usage: $0 /dev/null | awk '{print $1}' || stat '-f %z' $f) 43 | if [[ "$fsize" -eq "$expect_size" ]]; then 44 | echo "'$fname' already exists and appears to be complete" 45 | return 0 46 | else 47 | echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." 48 | fi 49 | fi 50 | wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { 51 | echo "Error while trying to download $fname!" 52 | return 1 53 | } 54 | f=$dst_dir/$fname 55 | # In the following statement, the first version works on linux, and the part after '||' 56 | # works on Linux. 57 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 58 | [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } 59 | return 0 60 | } 61 | 62 | mkdir -p $dst_dir $local_dir 63 | 64 | for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \ 65 | librispeech-vocab.txt librispeech-lexicon.txt; do 66 | check_and_download $f || exit 1 67 | done 68 | 69 | dst_dir=$(readlink -f $dst_dir) 70 | ln -sf $dst_dir/3-gram.pruned.1e-7.arpa.gz $local_dir/lm_tgmed.arpa.gz 71 | ln -sf $dst_dir/3-gram.pruned.3e-7.arpa.gz $local_dir/lm_tgsmall.arpa.gz 72 | ln -sf $dst_dir/3-gram.arpa.gz $local_dir/lm_tglarge.arpa.gz 73 | ln -sf $dst_dir/4-gram.arpa.gz $local_dir/lm_fglarge.arpa.gz 74 | ln -sf $dst_dir/librispeech-lexicon.txt $local_dir/librispeech-lexicon.txt 75 | ln -sf $dst_dir/librispeech-vocab.txt $local_dir/librispeech-vocab.txt 76 | exit 0 77 | -------------------------------------------------------------------------------- /librilight/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test time language model(G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh || exit 1; 10 | 11 | # begin configuration section 12 | src_dir=data/lang 13 | # end configuration section 14 | 15 | . utils/parse_options.sh || exit 1; 16 | 17 | set -e 18 | 19 | if [ $# -ne 1 ]; then 20 | echo "Usage: $0 " 21 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 22 | echo ", where:" 23 | echo " is the directory in which the language model is stored/downloaded" 24 | echo "Options:" 25 | echo " --src-dir # source lang directory, default data/lang" 26 | exit 1 27 | fi 28 | 29 | lm_dir=$1 30 | 31 | if [ ! -d $lm_dir ]; then 32 | echo "$0: expected source LM directory $lm_dir to exist" 33 | exit 1; 34 | fi 35 | if [ ! -f $src_dir/words.txt ]; then 36 | echo "$0: expected $src_dir/words.txt to exist." 
37 | exit 1; 38 | fi 39 | 40 | 41 | tmpdir=data/local/lm_tmp.$$ 42 | trap "rm -r $tmpdir" EXIT 43 | 44 | mkdir -p $tmpdir 45 | 46 | for lm_suffix in tgsmall tgmed; do 47 | # tglarge is prepared by a separate command, called from run.sh; we don't 48 | # want to compile G.fst for tglarge, as it takes a while. 49 | test=${src_dir}_test_${lm_suffix} 50 | mkdir -p $test 51 | cp -r ${src_dir}/* $test 52 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 53 | arpa2fst --disambig-symbol=#0 \ 54 | --read-symbol-table=$test/words.txt - $test/G.fst 55 | utils/validate_lang.pl --skip-determinization-check $test || exit 1; 56 | done 57 | 58 | echo "Succeeded in formatting data." 59 | 60 | exit 0 61 | -------------------------------------------------------------------------------- /librilight/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the dictionary and auto-generates the pronunciations for the words, 7 | # that are in our vocabulary but not in CMUdict 8 | 9 | stage=0 10 | nj=4 # number of parallel Sequitur G2P jobs, we would like to use 11 | cmd=run.pl 12 | 13 | 14 | . utils/parse_options.sh || exit 1; 15 | . ./path.sh || exit 1 16 | 17 | 18 | if [ $# -ne 3 ]; then 19 | echo "Usage: $0 [options] " 20 | echo "e.g.: /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" 21 | echo "Options:" 22 | echo " --cmd '' # script to launch jobs with, default: run.pl" 23 | echo " --nj # number of jobs to run, default: 4." 24 | exit 1 25 | fi 26 | 27 | lm_dir=$1 28 | g2p_model_dir=$2 29 | dst_dir=$3 30 | 31 | vocab=$lm_dir/librispeech-vocab.txt 32 | [ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; 33 | 34 | # this file is either a copy of the lexicon we download from openslr.org/11 or is 35 | # created by the G2P steps below 36 | lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt 37 | 38 | cmudict_dir=$dst_dir/cmudict 39 | cmudict_plain=$dst_dir/cmudict.0.7a.plain 40 | 41 | mkdir -p $dst_dir || exit 1; 42 | 43 | if [ $stage -le 0 ]; then 44 | echo "Downloading and preparing CMUdict" 45 | if [ ! -s $cmudict_dir/cmudict.0.7a ]; then 46 | svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; 47 | fi 48 | echo "Removing the pronunciation variant markers ..." 49 | grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ 50 | perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ 51 | > $cmudict_plain || exit 1; 52 | fi 53 | 54 | 55 | if [ $stage -le 1 ]; then 56 | # check if we have Sequitur G2P is installed 57 | if [ ! -f "$sequitur" ]; then 58 | if ! 
which swig >&/dev/null; then 59 | echo "Please install 'swig' and then run $KALDI_ROOT/tools/extra/install_sequitur.sh" 60 | exit 1 61 | else 62 | echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extra/install_sequitur.sh" 63 | pushd $KALDI_ROOT/tools 64 | extras/install_sequitur.sh || exit 1 65 | popd 66 | fi 67 | fi 68 | [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; } 69 | 70 | g2p_dir=$dst_dir/g2p 71 | auto_vocab_prefix="$g2p_dir/vocab_autogen" 72 | auto_lexicon_prefix="$g2p_dir/lexicon_autogen" 73 | 74 | mkdir -p $g2p_dir/log 75 | auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") 76 | awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ 77 | sort | tee $g2p_dir/vocab_autogen.full |\ 78 | utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 79 | echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." 80 | $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ 81 | local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 82 | g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) 83 | g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) 84 | [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } 85 | sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt 86 | sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt 87 | echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" 88 | fi 89 | 90 | if [ $stage -le 2 ]; then 91 | echo "Combining the CMUdict pronunciations with the autogenerated ones ..." 92 | awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ 93 | cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 94 | raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) 95 | vocab_size=$(wc -l <$vocab) 96 | [[ "$vocab_size" -eq "$raw_lex_size" ]] || { 97 | echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; 98 | exit 1; } 99 | echo "Combined lexicon saved to '$lexicon_raw_nosil'" 100 | fi 101 | 102 | # The copy operation below is necessary, if we skip the g2p stages(e.g. using --stage 3) 103 | if [[ ! -s "$lexicon_raw_nosil" ]]; then 104 | cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 105 | fi 106 | 107 | if [ $stage -le 3 ]; then 108 | silence_phones=$dst_dir/silence_phones.txt 109 | optional_silence=$dst_dir/optional_silence.txt 110 | nonsil_phones=$dst_dir/nonsilence_phones.txt 111 | extra_questions=$dst_dir/extra_questions.txt 112 | 113 | echo "Preparing phone lists and clustering questions" 114 | (echo SIL; echo SPN;) > $silence_phones 115 | echo SIL > $optional_silence 116 | # nonsilence phones; on each line is a list of phones that correspond 117 | # really to the same base phone. 118 | awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ 119 | sort -u |\ 120 | perl -e 'while(<>){ 121 | chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; 122 | $phones_of{$1} .= "$_ "; } 123 | foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ 124 | > $nonsil_phones || exit 1; 125 | # A few extra questions that will be added to those obtained by automatically clustering 126 | # the "real" phones. These ask about stress; there's also one for silence. 
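  # For illustration (not generated output): extra_questions.txt then contains
  # one line with the silence phones, e.g. "SIL SPN", plus one line per stress
  # marker grouping all phones that share it, e.g. "AA1 AE1 AH1 ...".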
  cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1;
  cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
    $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
    >> $extra_questions || exit 1;
  echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones"
  echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence"
  echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones"
  echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions"
fi

if [ $stage -le 4 ]; then
  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\
  cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt
  echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi

exit 0

--------------------------------------------------------------------------------
/librilight/local/prepare_librilight.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. ./path.sh
. ./cmd.sh

if [ $# -ne 1 ]; then
  echo "Usage: ./local/prepare_librilight.sh <data-dir>"
  exit 1;
fi

data=$1
# Get librilight set
wget https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz
tar -xvf librispeech_finetuning.tgz && mv librispeech_finetuning ${data}

# The following are the data subsets:
#   1h/{0..5}/{clean,other}
#   9h/{clean,other}
#
# In each of these subsets there are speaker directories named with a speaker-id.
# Inside each directory are more directories corresponding to a recording-id.
# Within each speaker-id/recording-id subdirectory are the .flac audio files
# corresponding to speech utterances, as well as a .trans.txt file that has
# the transcription.

# (sanity check: list the extracted .flac files)
find -L $data -name "*.flac"

for part in 1h/{0..5}/{clean,other} 9h/{clean,other}; do
  dataname=$(echo ${part} | sed 's/\//_/g')
  ./local/prepare_librilight_dataset.sh ${data}/${part} data/train_${dataname}
done

./utils/combine_data.sh \
  data/train_10h data/train_1h_{0..5}_{clean,other} data/train_9h_{clean,other}
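# Each subset path maps to a Kaldi datadir by replacing "/" with "_", e.g. the
# subset 1h/0/clean becomes data/train_1h_0_clean, and the combined 10h set ends
# up in data/train_10h. A quick sanity check (illustrative):
#   wc -l data/train_10h/{wav.scp,text,utt2spk}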
--------------------------------------------------------------------------------
/librilight/local/prepare_librilight_dataset.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. ./path.sh
. ./cmd.sh

if [ $# -ne 2 ]; then
  echo "Usage: ./local/prepare_librilight_dataset.sh <data-subset-dir> <kaldi-datadir>"
  exit 1;
fi

data=$1
kaldi_data=$2

data=$(./utils/make_absolute.sh ${data})
mkdir -p $kaldi_data
files=( `find -L ${data} -name "*.flac"` )

for f in ${files[@]}; do
  fname=`basename $f`
  fname=${fname%%.flac}
  echo "${fname} flac -c -d -s ${f} |"
done | sort > ${kaldi_data}/wav.scp

paste -d' ' <(awk '{print $1}' ${kaldi_data}/wav.scp) \
            <(awk '{print $1}' ${kaldi_data}/wav.scp | cut -d'-' -f1) \
            > ${kaldi_data}/utt2spk

./utils/utt2spk_to_spk2utt.pl ${kaldi_data}/utt2spk > ${kaldi_data}/spk2utt

cat `find -L ${data} -name "*.trans.txt"` | sort > ${kaldi_data}/text
exit 0;

--------------------------------------------------------------------------------
/librilight/local/prepare_test.sh:
--------------------------------------------------------------------------------
#!/bin/bash

data=/export/a15/vpanayotov/data
subsampling=4

. ./cmd.sh
. ./path.sh

. ./utils/parse_options.sh

set -euo pipefail

for part in dev-clean dev-other test-clean test-other; do
  echo "-------------- Making ${part} ----------------------"
  dataname=$(echo ${part} | sed s/-/_/g)
  local/data_prep.sh $data/LibriSpeech/${part} data/${dataname}
  ./utils/copy_data_dir.sh data/${dataname} data/${dataname}_fbank
  ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 \
    data/${dataname}_fbank exp/make_fbank/${dataname} fbank
  ./utils/fix_data_dir.sh data/${dataname}_fbank
  ./steps/compute_cmvn_stats.sh data/${dataname}_fbank
  ./utils/fix_data_dir.sh data/${dataname}_fbank

  memmap_data.py data/${dataname}_fbank/feats.scp data/${dataname}_fbank/feats.scp.dat
  python local/prepare_unlabeled_tgt.py --subsample ${subsampling} \
    data/${dataname}_fbank/utt2num_frames > data/${dataname}_fbank/pdfid.${subsampling}.tgt
done

exit 0;

--------------------------------------------------------------------------------
/librilight/local/prepare_unlabeled_tgt.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#-*- coding: utf-8 -*-
# Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
# Apache 2.0

from __future__ import print_function
import argparse
import sys
import os


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('utt2num_frames',
        help='Kaldi utt2num_frames file (utterance-id and frame count per line)',
        type=str
    )
    parser.add_argument('--subsample', type=int, default=1)

    args = parser.parse_args()

    # Emit one unsupervised (-1) target per subsampled frame.
    with open(args.utt2num_frames, 'r') as f:
        for l in f:
            utt, frames = l.strip().split(None, 1)
            print(utt, end='')
            num_frames = len(range(0, int(frames), args.subsample))
            print(' -1' * num_frames)

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/librilight/local/score.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#           2014  Guoguo Chen
# Apache 2.0

[ -f ./path.sh ] && . ./path.sh

# begin configuration section.
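# Example invocation (hypothetical directories), matching the three positional
# arguments <data> <lang-or-graph> <decode-dir> below:
#   local/score.sh --cmd run.pl --min-lmwt 6 --max-lmwt 18 \
#     data/dev_clean_fbank exp/chain_wrn/tree/graph_tgsmall exp/wrn/decode_dev_clean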
9 | cmd=run.pl 10 | stage=0 11 | decode_mbr=true 12 | word_ins_penalty=0.0,0.5,1.0 13 | min_lmwt=7 14 | max_lmwt=17 15 | iter=final 16 | #end configuration section. 17 | 18 | [ -f ./path.sh ] && . ./path.sh 19 | . parse_options.sh || exit 1; 20 | 21 | if [ $# -ne 3 ]; then 22 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " 23 | echo " Options:" 24 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 25 | echo " --stage (0|1|2) # start scoring script from part-way through." 26 | echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." 27 | echo " --min_lmwt # minumum LM-weight for lattice rescoring " 28 | echo " --max_lmwt # maximum LM-weight for lattice rescoring " 29 | exit 1; 30 | fi 31 | 32 | data=$1 33 | lang_or_graph=$2 34 | dir=$3 35 | 36 | symtab=$lang_or_graph/words.txt 37 | 38 | for f in $symtab $dir/lat.1.gz $data/text; do 39 | [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; 40 | done 41 | 42 | mkdir -p $dir/scoring/log 43 | 44 | cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt 45 | 46 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 47 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ 48 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 49 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 50 | lattice-best-path --word-symbol-table=$symtab \ 51 | ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; 52 | done 53 | 54 | # Note: the double level of quoting for the sed command 55 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 56 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ 57 | cat $dir/scoring/LMWT.$wip.tra \| \ 58 | utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ 59 | compute-wer --text --mode=present \ 60 | ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; 61 | done 62 | 63 | exit 0; 64 | -------------------------------------------------------------------------------- /librilight/local/subset_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 4 | # Apache 2.0 5 | 6 | # The following commands were used to generate the mini_librispeech dataset: 7 | # 8 | # Note that data generation is random. This could be fixed by 9 | # providing a seed argument to the shuf program. 10 | 11 | if [ "$#" -ne 3 ]; then 12 | echo "Usage: $0 " 13 | echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ 14 | /export/a05/dgalvez/LibriSpeech/train-clean-5 5" 15 | exit 1 16 | fi 17 | 18 | src_dir=$1 19 | dest_dir=$2 20 | dest_num_hours=$3 21 | 22 | src=$(basename $src_dir) 23 | dest=$(basename $dest_dir) 24 | librispeech_dir=$(dirname $src_dir) 25 | 26 | # TODO: Possibly improve this to ensure gender balance and speaker 27 | # balance. 
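# The number of chapters to sample is chosen proportionally below:
#   n = (dest_num_hours * src_num_chapters) / src_num_hours
# e.g. with illustrative numbers, a 5-hour target drawn from a 100-hour source
# with 600 chapters selects (5 * 600) / 100 = 30 chapters, assuming roughly
# equal chapter lengths.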
# TODO: Use actual time values instead of assuming the average chapter length,
#       to make sure we get $dest_num_hours of data.
src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \
  python -c '
from __future__ import print_function
from sys import stdin
minutes_str = stdin.read().split()
print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))')
src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \
  awk -F'|' '{ print $1 }' | sort -u | wc -l)
mkdir -p data/subset_tmp
grep "$src" $librispeech_dir/CHAPTERS.TXT | \
  awk -F'|' '{ print $1 }' | \
  shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \
  data/subset_tmp/${dest}_chapter_id_list.txt

while read -r chapter_id || [[ -n "$chapter_id" ]]; do
  chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d)
  speaker_id=$(basename $(dirname $chapter_dir))
  mkdir -p $dest_dir/$speaker_id/
  cp -r $chapter_dir $dest_dir/$speaker_id/
done < data/subset_tmp/${dest}_chapter_id_list.txt

--------------------------------------------------------------------------------
/librilight/path.sh:
--------------------------------------------------------------------------------
export ROOT=`pwd`/../tools
export KALDI_ROOT=${ROOT}/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/openfst/bin:`pwd`/../nnet_pytorch:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C

export OPENFST_PATH=${ROOT}/openfst #/PATH/TO/OPENFST
export LD_LIBRARY_ORIG=${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${OPENFST_PATH}/lib:${LD_LIBRARY_PATH}
#export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64

export PYTHONPATH=${PYTHONPATH}:`pwd`/../nnet_pytorch/
export PYTHONUNBUFFERED=1
source ${ROOT}/NeurIPS2020/bin/activate

--------------------------------------------------------------------------------
/librilight/steps:
--------------------------------------------------------------------------------
../tools/kaldi/egs/wsj/s5/steps

--------------------------------------------------------------------------------
/librilight/utils:
--------------------------------------------------------------------------------
../tools/kaldi/egs/wsj/s5/utils

--------------------------------------------------------------------------------
/librispeech/README:
--------------------------------------------------------------------------------
This recipe trains a hybrid ASR model on the 960h
Librispeech data. The training pipeline is almost identical
to the Kaldi Librispeech recipe, except that we don't do
speed perturbation or use i-vectors here, and we use
nnet_pytorch for acoustic model training instead
of Kaldi's nnet3. Using a 6-layer BLSTM (41M params)
with very little hyperparameter tuning, and training on
4 GPUs for 2 days, we were able to obtain a WER of 4.46%
on the dev-clean subset (with 4-gram LM rescoring).
10 | 11 | To run the training pipeline: `./run.sh` 12 | To run decoding with the trained model: `./decode.sh` 13 | -------------------------------------------------------------------------------- /librispeech/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | export train_cmd="queue.pl --mem 2G -l hostname=!b02*" 14 | export decode_cmd="queue.pl --mem 4G -l hostname=!b02*\&!c06*\&!c23*\&!c24*\&!c25*\&!c27*" 15 | export mkgraph_cmd="queue.pl --mem 8G" 16 | -------------------------------------------------------------------------------- /librispeech/conf/decode.config: -------------------------------------------------------------------------------- 1 | # empty config, just use the defaults. 2 | -------------------------------------------------------------------------------- /librispeech/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | --num-mel-bins=80 3 | -------------------------------------------------------------------------------- /librispeech/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=c0*|c1*|c2[0126]*,gpu=$0' -q g.q 11 | -------------------------------------------------------------------------------- /librispeech/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false # only non-default option. 2 | -------------------------------------------------------------------------------- /librispeech/conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --num-mel-bins=40 # similar to Google's setup. 7 | --num-ceps=40 # there is no dimensionality reduction. 8 | --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so 9 | # there might be some information at the low end. 
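# (A negative --high-freq is interpreted relative to the Nyquist frequency: with
#  16 kHz audio the Nyquist frequency is 8000 Hz, so -400 gives a 7600 Hz cutoff,
#  as noted below.)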
--high-freq=-400     # high cutoff frequency, relative to Nyquist of 8000 (=7600)

--------------------------------------------------------------------------------
/librispeech/conf/online_cmvn.conf:
--------------------------------------------------------------------------------
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

--------------------------------------------------------------------------------
/librispeech/conf/online_pitch.conf:
--------------------------------------------------------------------------------
## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats,
## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given
## to programs like online2-wav-nnet2-latgen-faster.
## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that
## are the same as those which will be generated in online decoding; this enables us to train
## in a way that's compatible with online decoding.
##

## most of these options relate to the post-processing rather than the pitch
## extraction itself.
--add-raw-log-pitch=true   ## this is intended for input to neural nets, so our
                           ## approach is "throw everything in and see what
                           ## sticks".
--normalization-left-context=75
--normalization-right-context=50   # We're removing some of the right-context
                                   # for the normalization. Would normally be 75.
                                   #
                                   # Note: our changes to the (left,right) context
                                   # from the defaults of (75,75) to (75,50) will
                                   # almost certainly worsen results, but will
                                   # reduce latency.
--frames-per-chunk=10   ## relates to offline simulation of online decoding; 1
                        ## would be equivalent to getting in samples one by
                        ## one.
--simulate-first-pass-online=true   ## this makes the online-pitch-extraction code
                                    ## output the 'first-pass' features, which
                                    ## are less accurate than the final ones, and
                                    ## which are the only features the neural-net
                                    ## decoding would ever see (since we can't
                                    ## afford to do lattice rescoring in the
                                    ## neural-net code).

--------------------------------------------------------------------------------
/librispeech/conf/queue_no_k20.conf:
--------------------------------------------------------------------------------
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'

--------------------------------------------------------------------------------
/librispeech/conf/spec.conf:
--------------------------------------------------------------------------------
--preemphasis-coefficient=0.0
--remove-dc-offset=false
--round-to-power-of-two=false
--window-type=hanning

--------------------------------------------------------------------------------
/librispeech/decode.sh:
--------------------------------------------------------------------------------
#!/bin/bash

speech_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data

. ./cmd.sh
. ./path.sh

stage=0
subsampling=4
chaindir=exp/chain_blstm
model_dirname=blstm
checkpoint=240_300.mdl
acwt=1.0
testsets="dev_clean dev_other test_clean test_other"
decode_nj=80

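# Example: decode a single test set with a different checkpoint (illustrative;
# flag names are derived from the variables above by utils/parse_options.sh):
#   ./decode.sh --testsets "dev_clean" --checkpoint 160_220.mdl --chaindir exp/chain_wrn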
. ./utils/parse_options.sh

set -euo pipefail

tree=${chaindir}/tree
post_decode_acwt=`echo ${acwt} | awk '{print 10*$1}'`  # 10x the acoustic scale,
                                                       # as usual for chain-model lattices

# Make the decoding graph if it does not exist
if [ ! -f ${tree}/graph_tgsmall/HCLG.fst ]; then
  ./utils/mkgraph.sh --self-loop-scale 1.0 \
    data/lang_test_tgsmall ${tree} ${tree}/graph_tgsmall
fi

## Prepare the test sets if not already done
if [ ! 
-f data/dev_clean_fbank/mapped/feats.dat.1 ]; then 32 | ./local/prepare_test.sh --subsampling ${subsampling} --data ${speech_data} 33 | fi 34 | 35 | for ds in $testsets; do 36 | decode_nnet_pytorch.sh --min-lmwt 6 \ 37 | --max-lmwt 18 \ 38 | --checkpoint ${checkpoint} \ 39 | --acoustic-scale ${acwt} \ 40 | --post-decode-acwt ${post_decode_acwt} \ 41 | --nj ${decode_nj} \ 42 | data/${ds}_fbank exp/${model_dirname} \ 43 | ${tree}/graph_tgsmall exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds} 44 | 45 | echo ${decode_nj} > exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}/num_jobs 46 | ./steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ 47 | data/lang_test_{tgsmall,fglarge} \ 48 | data/${ds}_fbank exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}{,_fglarge_rescored} 49 | done 50 | 51 | -------------------------------------------------------------------------------- /librispeech/local/data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | if [ "$#" -ne 2 ]; then 8 | echo "Usage: $0 " 9 | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" 10 | exit 1 11 | fi 12 | 13 | src=$1 14 | dst=$2 15 | 16 | # all utterances are FLAC compressed 17 | if ! which flac >&/dev/null; then 18 | echo "Please install 'flac' on ALL worker nodes!" 19 | exit 1 20 | fi 21 | 22 | spk_file=$src/../SPEAKERS.TXT 23 | 24 | mkdir -p $dst || exit 1; 25 | 26 | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1; 27 | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; 28 | 29 | 30 | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp 31 | trans=$dst/text; [[ -f "$trans" ]] && rm $trans 32 | utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk 33 | spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender 34 | 35 | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do 36 | reader=$(basename $reader_dir) 37 | if ! [ $reader -eq $reader ]; then # not integer. 38 | echo "$0: unexpected subdirectory name $reader" 39 | exit 1; 40 | fi 41 | 42 | reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') 43 | if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then 44 | echo "Unexpected gender: '$reader_gender'" 45 | exit 1; 46 | fi 47 | 48 | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do 49 | chapter=$(basename $chapter_dir) 50 | if ! [ "$chapter" -eq "$chapter" ]; then 51 | echo "$0: unexpected chapter-subdirectory name $chapter" 52 | exit 1; 53 | fi 54 | 55 | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ 56 | awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 57 | 58 | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt 59 | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 60 | cat $chapter_trans >>$trans 61 | 62 | # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered 63 | # to be a different speaker. This is done for simplicity and because we want 64 | # e.g. 
the CMVN to be calculated per-chapter 65 | awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ 66 | <$chapter_trans >>$utt2spk || exit 1 67 | 68 | # reader -> gender map (again using per-chapter granularity) 69 | echo "${reader}-${chapter} $reader_gender" >>$spk2gender 70 | done 71 | done 72 | 73 | spk2utt=$dst/spk2utt 74 | utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 75 | 76 | ntrans=$(wc -l <$trans) 77 | nutt2spk=$(wc -l <$utt2spk) 78 | ! [ "$ntrans" -eq "$nutt2spk" ] && \ 79 | echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; 80 | 81 | utils/validate_data_dir.sh --no-feats $dst || exit 1; 82 | 83 | echo "$0: successfully prepared data in $dst" 84 | 85 | exit 0 86 | -------------------------------------------------------------------------------- /librispeech/local/download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2017 Daniel Povey 5 | # Apache 2.0 6 | 7 | if [ $# -ne "3" ]; then 8 | echo "Usage: $0 /dev/null | awk '{print $1}' || stat '-f %z' $f) 42 | if [[ "$fsize" -eq "$expect_size" ]]; then 43 | echo "'$fname' already exists and appears to be complete" 44 | return 0 45 | else 46 | echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." 47 | fi 48 | fi 49 | wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { 50 | echo "Error while trying to download $fname!" 51 | return 1 52 | } 53 | f=$dst_dir/$fname 54 | # In the following statement, the first version works on linux, and the part after '||' 55 | # works on Linux. 56 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 57 | [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } 58 | return 0 59 | } 60 | 61 | mkdir -p $dst_dir $local_dir 62 | 63 | for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz \ 64 | librispeech-vocab.txt librispeech-lexicon.txt; do 65 | check_and_download $f || exit 1 66 | done 67 | 68 | dst_dir=$(readlink -f $dst_dir) 69 | ln -sf $dst_dir/3-gram.pruned.1e-7.arpa.gz $local_dir/lm_tgmed.arpa.gz 70 | ln -sf $dst_dir/3-gram.pruned.3e-7.arpa.gz $local_dir/lm_tgsmall.arpa.gz 71 | ln -sf $dst_dir/3-gram.arpa.gz $local_dir/lm_tglarge.arpa.gz 72 | ln -sf $dst_dir/librispeech-lexicon.txt $local_dir/librispeech-lexicon.txt 73 | ln -sf $dst_dir/librispeech-vocab.txt $local_dir/librispeech-vocab.txt 74 | exit 0 75 | -------------------------------------------------------------------------------- /librispeech/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test time language model(G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh || exit 1; 10 | 11 | # begin configuration section 12 | src_dir=data/lang 13 | # end configuration section 14 | 15 | . utils/parse_options.sh || exit 1; 16 | 17 | set -e 18 | 19 | if [ $# -ne 1 ]; then 20 | echo "Usage: $0 " 21 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 22 | echo ", where:" 23 | echo " is the directory in which the language model is stored/downloaded" 24 | echo "Options:" 25 | echo " --src-dir # source lang directory, default data/lang" 26 | exit 1 27 | fi 28 | 29 | lm_dir=$1 30 | 31 | if [ ! 
-d $lm_dir ]; then 32 | echo "$0: expected source LM directory $lm_dir to exist" 33 | exit 1; 34 | fi 35 | if [ ! -f $src_dir/words.txt ]; then 36 | echo "$0: expected $src_dir/words.txt to exist." 37 | exit 1; 38 | fi 39 | 40 | 41 | tmpdir=data/local/lm_tmp.$$ 42 | trap "rm -r $tmpdir" EXIT 43 | 44 | mkdir -p $tmpdir 45 | 46 | for lm_suffix in tgsmall tgmed; do 47 | # tglarge is prepared by a separate command, called from run.sh; we don't 48 | # want to compile G.fst for tglarge, as it takes a while. 49 | test=${src_dir}_test_${lm_suffix} 50 | mkdir -p $test 51 | cp -r ${src_dir}/* $test 52 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 53 | arpa2fst --disambig-symbol=#0 \ 54 | --read-symbol-table=$test/words.txt - $test/G.fst 55 | utils/validate_lang.pl --skip-determinization-check $test || exit 1; 56 | done 57 | 58 | echo "Succeeded in formatting data." 59 | 60 | exit 0 61 | -------------------------------------------------------------------------------- /librispeech/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the dictionary and auto-generates the pronunciations for the words, 7 | # that are in our vocabulary but not in CMUdict 8 | 9 | stage=0 10 | nj=4 # number of parallel Sequitur G2P jobs, we would like to use 11 | cmd=run.pl 12 | 13 | 14 | . utils/parse_options.sh || exit 1; 15 | . ./path.sh || exit 1 16 | 17 | 18 | if [ $# -ne 3 ]; then 19 | echo "Usage: $0 [options] " 20 | echo "e.g.: /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" 21 | echo "Options:" 22 | echo " --cmd '' # script to launch jobs with, default: run.pl" 23 | echo " --nj # number of jobs to run, default: 4." 24 | exit 1 25 | fi 26 | 27 | lm_dir=$1 28 | g2p_model_dir=$2 29 | dst_dir=$3 30 | 31 | vocab=$lm_dir/librispeech-vocab.txt 32 | [ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; 33 | 34 | # this file is either a copy of the lexicon we download from openslr.org/11 or is 35 | # created by the G2P steps below 36 | lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt 37 | 38 | cmudict_dir=$dst_dir/cmudict 39 | cmudict_plain=$dst_dir/cmudict.0.7a.plain 40 | 41 | mkdir -p $dst_dir || exit 1; 42 | 43 | if [ $stage -le 0 ]; then 44 | echo "Downloading and preparing CMUdict" 45 | if [ ! -s $cmudict_dir/cmudict.0.7a ]; then 46 | svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; 47 | fi 48 | echo "Removing the pronunciation variant markers ..." 49 | grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ 50 | perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ 51 | > $cmudict_plain || exit 1; 52 | fi 53 | 54 | 55 | if [ $stage -le 1 ]; then 56 | # check if we have Sequitur G2P is installed 57 | if [ ! -f "$sequitur" ]; then 58 | if ! 
which swig >&/dev/null; then 59 | echo "Please install 'swig' and then run $KALDI_ROOT/tools/extra/install_sequitur.sh" 60 | exit 1 61 | else 62 | echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extra/install_sequitur.sh" 63 | pushd $KALDI_ROOT/tools 64 | extras/install_sequitur.sh || exit 1 65 | popd 66 | fi 67 | fi 68 | [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; } 69 | 70 | g2p_dir=$dst_dir/g2p 71 | auto_vocab_prefix="$g2p_dir/vocab_autogen" 72 | auto_lexicon_prefix="$g2p_dir/lexicon_autogen" 73 | 74 | mkdir -p $g2p_dir/log 75 | auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") 76 | awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ 77 | sort | tee $g2p_dir/vocab_autogen.full |\ 78 | utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 79 | echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." 80 | $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ 81 | local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 82 | g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) 83 | g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) 84 | [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } 85 | sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt 86 | sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt 87 | echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" 88 | fi 89 | 90 | if [ $stage -le 2 ]; then 91 | echo "Combining the CMUdict pronunciations with the autogenerated ones ..." 92 | awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ 93 | cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 94 | raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) 95 | vocab_size=$(wc -l <$vocab) 96 | [[ "$vocab_size" -eq "$raw_lex_size" ]] || { 97 | echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; 98 | exit 1; } 99 | echo "Combined lexicon saved to '$lexicon_raw_nosil'" 100 | fi 101 | 102 | # The copy operation below is necessary, if we skip the g2p stages(e.g. using --stage 3) 103 | if [[ ! -s "$lexicon_raw_nosil" ]]; then 104 | cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 105 | fi 106 | 107 | if [ $stage -le 3 ]; then 108 | silence_phones=$dst_dir/silence_phones.txt 109 | optional_silence=$dst_dir/optional_silence.txt 110 | nonsil_phones=$dst_dir/nonsilence_phones.txt 111 | extra_questions=$dst_dir/extra_questions.txt 112 | 113 | echo "Preparing phone lists and clustering questions" 114 | (echo SIL; echo SPN;) > $silence_phones 115 | echo SIL > $optional_silence 116 | # nonsilence phones; on each line is a list of phones that correspond 117 | # really to the same base phone. 118 | awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ 119 | sort -u |\ 120 | perl -e 'while(<>){ 121 | chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; 122 | $phones_of{$1} .= "$_ "; } 123 | foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ 124 | > $nonsil_phones || exit 1; 125 | # A few extra questions that will be added to those obtained by automatically clustering 126 | # the "real" phones. These ask about stress; there's also one for silence. 
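# For illustration (assuming the standard CMUdict phone set): the awk line
# below emits a single question listing the silence phones ("SIL SPN"), and
# the perl one-liner groups the non-silence phones by their stress digit, so
# one generated question might look like "AA1 AE1 AH1 ... ZH1" (all phones
# with primary stress).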
127 | cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; 128 | cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) { 129 | $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ 130 | >> $extra_questions || exit 1; 131 | echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" 132 | echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" 133 | echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" 134 | echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" 135 | fi 136 | 137 | if [ $stage -le 4 ]; then 138 | (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\ 139 | cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt 140 | echo "Lexicon text file saved as: $dst_dir/lexicon.txt" 141 | fi 142 | 143 | exit 0 144 | -------------------------------------------------------------------------------- /librispeech/local/prepare_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | data=/export/a15/vpanayotov/data 4 | subsampling=4 5 | 6 | . ./cmd.sh 7 | . ./path.sh 8 | 9 | . ./utils/parse_options.sh 10 | 11 | set -euo pipefail 12 | 13 | for part in dev-clean dev-other test-clean test-other; do 14 | echo "-------------- Making ${part} ----------------------" 15 | dataname=$(echo ${part} | sed s/-/_/g) 16 | local/data_prep.sh $data/LibriSpeech/${part} data/${dataname} 17 | ./utils/copy_data_dir.sh data/${dataname} data/${dataname}_fbank 18 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 \ 19 | data/${dataname}_fbank exp/make_fbank/${dataname} fbank 20 | ./utils/fix_data_dir.sh data/${dataname}_fbank 21 | ./steps/compute_cmvn_stats.sh data/${dataname}_fbank 22 | ./utils/fix_data_dir.sh data/${dataname}_fbank 23 | 24 | memmap_data.py data/${dataname}_fbank/feats.scp data/${dataname}_fbank/feats.scp.dat 25 | python local/prepare_unlabeled_tgt.py --subsample ${subsampling} \ 26 | data/${dataname}_fbank/utt2num_frames > data/${dataname}_fbank/pdfid.${subsampling}.tgt 27 | done 28 | 29 | exit 0; 30 | 31 | 32 | -------------------------------------------------------------------------------- /librispeech/local/score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 3 | # 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | [ -f ./path.sh ] && . ./path.sh 7 | 8 | # begin configuration section. 9 | cmd=run.pl 10 | stage=0 11 | decode_mbr=true 12 | word_ins_penalty=0.0,0.5,1.0 13 | min_lmwt=7 14 | max_lmwt=17 15 | iter=final 16 | #end configuration section. 17 | 18 | [ -f ./path.sh ] && . ./path.sh 19 | . parse_options.sh || exit 1; 20 | 21 | if [ $# -ne 3 ]; then 22 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" 23 | echo " Options:" 24 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 25 | echo " --stage (0|1|2) # start scoring script from part-way through." 26 | echo " --decode_mbr (true/false) # minimum Bayes risk decoding (confusion network)." 27 | echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring " 28 | echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring " 29 | exit 1; 30 | fi 31 | 32 | data=$1 33 | lang_or_graph=$2 34 | dir=$3 35 | 36 | symtab=$lang_or_graph/words.txt 37 | 38 | for f in $symtab $dir/lat.1.gz $data/text; do 39 | [ !
-f $f ] && echo "score.sh: no such file $f" && exit 1; 40 | done 41 | 42 | mkdir -p $dir/scoring/log 43 | 44 | cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt 45 | 46 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 47 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ 48 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 49 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 50 | lattice-best-path --word-symbol-table=$symtab \ 51 | ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; 52 | done 53 | 54 | # Note: the double level of quoting for the sed command 55 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 56 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ 57 | cat $dir/scoring/LMWT.$wip.tra \| \ 58 | utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ 59 | compute-wer --text --mode=present \ 60 | ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; 61 | done 62 | 63 | exit 0; 64 | -------------------------------------------------------------------------------- /librispeech/local/subset_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 4 | # Apache 2.0 5 | 6 | # The following commands were used to generate the mini_librispeech dataset: 7 | # 8 | # Note that data generation is random. This could be fixed by 9 | # providing a seed argument to the shuf program. 10 | 11 | if [ "$#" -ne 3 ]; then 12 | echo "Usage: $0 <src-dir> <dest-dir> <dest-num-hours>" 13 | echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ 14 | /export/a05/dgalvez/LibriSpeech/train-clean-5 5" 15 | exit 1 16 | fi 17 | 18 | src_dir=$1 19 | dest_dir=$2 20 | dest_num_hours=$3 21 | 22 | src=$(basename $src_dir) 23 | dest=$(basename $dest_dir) 24 | librispeech_dir=$(dirname $src_dir) 25 | 26 | # TODO: Possibly improve this to ensure gender balance and speaker 27 | # balance. 28 | # TODO: Use actual time values instead of this chapter-count estimate, to make sure we get $dest_num_hours of data 29 | src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \ 30 | python -c ' 31 | from __future__ import print_function 32 | from sys import stdin 33 | minutes_str = stdin.read().split() 34 | print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))') 35 | src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 36 | awk -F'|' '{ print $1 }' | sort -u | wc -l) 37 | mkdir -p data/subset_tmp 38 | grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 39 | awk -F'|' '{ print $1 }' | \ 40 | shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \ 41 | data/subset_tmp/${dest}_chapter_id_list.txt 42 | 43 | while read -r chapter_id || [[ -n "$chapter_id" ]]; do 44 | chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d) 45 | speaker_id=$(basename $(dirname $chapter_dir)) 46 | mkdir -p $dest_dir/$speaker_id/ 47 | cp -r $chapter_dir $dest_dir/$speaker_id/ 48 | done < data/subset_tmp/${dest}_chapter_id_list.txt 49 | -------------------------------------------------------------------------------- /librispeech/path.sh: -------------------------------------------------------------------------------- 1 | export ROOT=`pwd`/../tools 2 | export KALDI_ROOT=${ROOT}/kaldi 3 | [ -f $KALDI_ROOT/tools/env.sh ] && .
$KALDI_ROOT/tools/env.sh 4 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/openfst/bin:`pwd`/../nnet_pytorch:$PWD:$PATH:`pwd`/../nnet_pytorch/utils/ 5 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 6 | . $KALDI_ROOT/tools/config/common_path.sh 7 | export LC_ALL=C 8 | 9 | export OPENFST_PATH=${ROOT}/openfst #/PATH/TO/OPENFST 10 | export LD_LIBRARY_ORIG=${LD_LIBRARY_PATH} 11 | export LD_LIBRARY_PATH=${OPENFST_PATH}/lib:${LD_LIBRARY_PATH} 12 | #export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64 13 | 14 | export PYTHONPATH=${PYTHONPATH}:`pwd`/../nnet_pytorch/:`pwd`/../nnet_pytorch/utils/ 15 | export PYTHONUNBUFFERED=1 16 | source ${ROOT}/NeurIPS2020/bin/activate 17 | 18 | export LC_ALL=C 19 | 20 | -------------------------------------------------------------------------------- /librispeech/run-blstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=3 8 | traindir=data/train_960 9 | feat_affix=_fbank 10 | chaindir=exp/chain_blstm 11 | num_leaves=7000 12 | model_dirname=blstm 13 | batches_per_epoch=500 14 | num_epochs=300 15 | train_nj=4 16 | resume= 17 | num_split=20 # number of splits for memory-mapped data for training 18 | average=true 19 | 20 | . ./utils/parse_options.sh 21 | 22 | set -euo pipefail 23 | 24 | tree=${chaindir}/tree 25 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 26 | trainname=`basename ${traindir}` 27 | 28 | 29 | if [ $stage -le 1 ]; then 30 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 31 | lang=data/lang_chain 32 | cp -r data/lang $lang 33 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 34 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 35 | # Use our special topology... note that later on may have to tune this 36 | # topology. 37 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 38 | 39 | steps/nnet3/chain/build_tree.sh \ 40 | --frame-subsampling-factor ${subsampling} \ 41 | --context-opts "--context-width=2 --central-position=1" \ 42 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 43 | $lang exp/tri5b_ali_${trainname} ${tree} 44 | 45 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 46 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 47 | 48 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 49 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 50 | 51 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 52 | fi 53 | 54 | if [ $stage -le 2 ]; then 55 | echo "Dumping memory mapped features ..." 56 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 57 | fi 58 | 59 | # Multigpu training of Chain-WideResNet with optimizer state averaging 60 | if [ $stage -le 3 ]; then 61 | resume_opts= 62 | if [ ! 
-z $resume ]; then 63 | resume_opts="--resume ${resume}" 64 | fi 65 | 66 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 67 | train_async_parallel.sh ${resume_opts} \ 68 | --gpu true \ 69 | --objective LFMMI \ 70 | --denom-graph ${chaindir}/den.fst \ 71 | --num-pdfs ${num_pdfs} \ 72 | --subsample ${subsampling} \ 73 | --model ChainBLSTM \ 74 | --hdim 1024 \ 75 | --num-layers 6 \ 76 | --dropout 0.2 \ 77 | --prefinal-dim 512 \ 78 | --warmup 20000 \ 79 | --decay 1e-07 \ 80 | --xent 0.1 \ 81 | --l2 0.0001 \ 82 | --weight-decay 1e-07 \ 83 | --lr 0.0002 \ 84 | --batches-per-epoch ${batches_per_epoch} \ 85 | --num-epochs ${num_epochs} \ 86 | --validation-spks 0 \ 87 | --nj ${train_nj} \ 88 | "[ \ 89 | {\ 90 | 'data': '${traindir}${feat_affix}', \ 91 | 'tgt': '${targets}', \ 92 | 'batchsize': 32, 'chunk_width': 140, \ 93 | 'left_context': 10, 'right_context': 5, \ 94 | 'mean_norm': True, 'var_norm': 'norm' 95 | }\ 96 | ]" \ 97 | `dirname ${chaindir}`/${model_dirname} 98 | fi 99 | 100 | # Average the last 60 epochs 101 | if $average; then 102 | echo "Averaging the last few epochs ..." 103 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 240 300 104 | fi 105 | -------------------------------------------------------------------------------- /librispeech/run-wrn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=4 8 | traindir=data/train_960 9 | feat_affix=_fbank 10 | chaindir=exp/chain_wrn 11 | num_leaves=7000 12 | model_dirname=wrn 13 | batches_per_epoch=500 14 | num_epochs=300 15 | train_nj=4 16 | resume= 17 | num_split=20 # number of splits for memory-mapped data for training 18 | average=true 19 | 20 | . ./utils/parse_options.sh 21 | 22 | set -euo pipefail 23 | 24 | tree=${chaindir}/tree 25 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 26 | trainname=`basename ${traindir}` 27 | 28 | if [ $stage -le 1 ]; then 29 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 30 | lang=data/lang_chain 31 | cp -r data/lang $lang 32 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 33 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 34 | # Use our special topology... note that later on may have to tune this 35 | # topology. 36 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 37 | 38 | steps/nnet3/chain/build_tree.sh \ 39 | --frame-subsampling-factor ${subsampling} \ 40 | --context-opts "--context-width=2 --central-position=1" \ 41 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 42 | $lang exp/tri5b_ali_${trainname} ${tree} 43 | 44 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 45 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 46 | 47 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 48 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 49 | 50 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 51 | fi 52 | 53 | if [ $stage -le 2 ]; then 54 | echo "Dumping memory mapped features ..." 55 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 56 | fi 57 | 58 | # Multigpu training of Chain-WideResNet with optimizer state averaging 59 | if [ $stage -le 3 ]; then 60 | resume_opts= 61 | if [ ! 
-z $resume ]; then 62 | resume_opts="--resume ${resume}" 63 | fi 64 | 65 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 66 | ./local/train_async_parallel.sh ${resume_opts} \ 67 | --gpu true \ 68 | --objective LFMMI \ 69 | --denom-graph ${chaindir}/den.fst \ 70 | --num-pdfs ${num_pdfs} \ 71 | --subsample ${subsampling} \ 72 | --model ChainWideResnet \ 73 | --depth 28 \ 74 | --width 10 \ 75 | --warmup 20000 \ 76 | --decay 1e-05 \ 77 | --xent 0.05 \ 78 | --l2 0.0001 \ 79 | --weight-decay 1e-08 \ 80 | --lr 0.0002 \ 81 | --batches-per-epoch 500 \ 82 | --num-epochs 300 \ 83 | --validation-spks 0 \ 84 | --nj 4 \ 85 | "[ \ 86 | {\ 87 | 'data': '${traindir}${feat_affix}', \ 88 | 'tgt': '${targets}', \ 89 | 'batchsize': 32, 'chunk_width': 140, \ 90 | 'left_context': 10, 'right_context': 5, \ 91 | 'mean_norm': True, 'var_norm': 'norm' 92 | }\ 93 | ]" \ 94 | `dirname ${chaindir}`/${model_dirname} 95 | fi 96 | 97 | # Average the last 60 epochs 98 | if $average; then 99 | echo "Averaging the last few epochs ..." 100 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 240 300 101 | fi 102 | 103 | -------------------------------------------------------------------------------- /librispeech/steps: -------------------------------------------------------------------------------- 1 | ../tools/kaldi/egs/wsj/s5/steps -------------------------------------------------------------------------------- /librispeech/utils: -------------------------------------------------------------------------------- 1 | ../tools/kaldi/egs/wsj/s5/utils -------------------------------------------------------------------------------- /librispeech100/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | export train_cmd="queue.pl --mem 2G -l hostname='!b02*&!c24*&!c23*&!c27*&!c26*&!c25*&!a*'" 14 | export decode_cmd="queue.pl --mem 4G -l hostname='!b02*&!c06*&!c23*&!c24*&!c25*&!c26*&!c27*&!a*'" 15 | export mkgraph_cmd="queue.pl --mem 8G" 16 | -------------------------------------------------------------------------------- /librispeech100/conf/decode.config: -------------------------------------------------------------------------------- 1 | # empty config, just use the defaults. 
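# If anything does need overriding, Kaldi-style option lines (one per line,
# e.g. --beam=15.0 or --lattice-beam=8.0) could be added here in the same
# style as the other conf/*.conf files; which flags are honored depends on
# the decoding binary this config is passed to.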
2 | -------------------------------------------------------------------------------- /librispeech100/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | --num-mel-bins=64 3 | -------------------------------------------------------------------------------- /librispeech100/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=c0[023456789]*|c1[012456789]*|c2[0126]*,gpu=$0' -q g.q 11 | -------------------------------------------------------------------------------- /librispeech100/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false # only non-default option. 2 | -------------------------------------------------------------------------------- /librispeech100/conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --num-mel-bins=40 # similar to Google's setup. 7 | --num-ceps=40 # there is no dimensionality reduction. 8 | --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so 9 | # there might be some information at the low end. 10 | --high-freq=-400 # high cutoff frequency, relative to the Nyquist of 8000 (=7600) 11 | -------------------------------------------------------------------------------- /librispeech100/conf/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /librispeech100/conf/online_pitch.conf: -------------------------------------------------------------------------------- 1 | ## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats, 2 | ## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given 3 | ## to programs like online2-wav-nnet2-latgen-faster. 4 | ## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that 5 | ## are the same as those which will be generated in online decoding; this enables us to train 6 | ## in a way that's compatible with online decoding. 7 | ## 8 | 9 | ## most of these options relate to the post-processing rather than the pitch 10 | ## extraction itself. 11 | --add-raw-log-pitch=true ## this is intended for input to neural nets, so our 12 | ## approach is "throw everything in and see what 13 | ## sticks". 14 | --normalization-left-context=75 15 | --normalization-right-context=50 # We're removing some of the right-context 16 | # for the normalization. Would normally be 75.
17 | # 18 | # Note: our changes to the (left,right) context 19 | # from the defaults of (75,75) to (75,50) will 20 | # almost certainly worsen results, but will 21 | # reduce latency. 22 | --frames-per-chunk=10 ## relates to offline simulation of online decoding; 1 23 | ## would be equivalent to getting in samples one by 24 | ## one. 25 | --simulate-first-pass-online=true ## this makes the online-pitch-extraction code 26 | ## output the 'first-pass' features, which 27 | ## are less accurate than the final ones, and 28 | ## which are the only features the neural-net 29 | ## decoding would ever see (since we can't 30 | ## afford to do lattice rescoring in the 31 | ## neural-net code). 32 | -------------------------------------------------------------------------------- /librispeech100/conf/queue_no_k20.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 -q all.q 10 | option gpu=* -l gpu=$0 -q g.q 11 | default allow_k20=true 12 | option allow_k20=true 13 | option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' 14 | -------------------------------------------------------------------------------- /librispeech100/conf/spec.conf: -------------------------------------------------------------------------------- 1 | --preemphasis-coefficient=0.0 2 | --remove-dc-offset=false 3 | --round-to-power-of-two=false 4 | --window-type=hanning 5 | -------------------------------------------------------------------------------- /librispeech100/decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | speech_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 4 | 5 | . ./cmd.sh 6 | . ./path.sh 7 | 8 | stage=1 9 | subsampling=4 10 | chaindir=exp/chain 11 | model_dirname=model1 12 | checkpoint=180_220.mdl 13 | acwt=1.0 14 | testsets="dev_clean dev_other test_clean test_other" 15 | feat_affix="_fbank" 16 | decode_nj=80 17 | 18 | . ./utils/parse_options.sh 19 | 20 | tree=${chaindir}/tree 21 | post_decode_acwt=`echo ${acwt} | awk '{print 10*$1}'` 22 | 23 | # Prepare the test sets if not already done 24 | if [ $stage -le 0 ]; then 25 | if [ ! -f data/${testsets%% *}${feat_affix}/mapped/feats.dat.1 ]; then 26 | ./local/prepare_test.sh --subsampling ${subsampling} \ 27 | --testsets "${testsets}" \ 28 | --data ${speech_data} \ 29 | --feat-affix ${feat_affix} 30 | fi 31 | fi 32 | 33 | # Make the decoding graph if it does not exist 34 | if [ !
-f ${tree}/graph_tgsmall/HCLG.fst ]; then 35 | ./utils/mkgraph.sh --self-loop-scale 1.0 \ 36 | data/lang_test_tgsmall ${tree} ${tree}/graph_tgsmall 37 | fi 38 | 39 | for ds in $testsets; do 40 | decode_nnet_pytorch.sh --min-lmwt 6 \ 41 | --max-lmwt 18 \ 42 | --checkpoint ${checkpoint} \ 43 | --acoustic-scale ${acwt} \ 44 | --post-decode-acwt ${post_decode_acwt} \ 45 | --nj ${decode_nj} \ 46 | data/${ds}${feat_affix} exp/${model_dirname} \ 47 | ${tree}/graph_tgsmall exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds} 48 | 49 | echo ${decode_nj} > exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}/num_jobs 50 | ./steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ 51 | data/lang_test_{tgsmall,fglarge} \ 52 | data/${ds}${feat_affix} exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}{,_fglarge_rescored} 53 | done 54 | 55 | -------------------------------------------------------------------------------- /librispeech100/decorrupt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | idim=64 4 | chunk_width=100 5 | left_context=10 6 | right_context=5 7 | batchsize=32 8 | perturb="none" 9 | num_steps= 10 | 11 | . ./utils/parse_options.sh 12 | 13 | if [ $# -ne 3 ]; then 14 | echo "Usage: ./decorrupt.sh <datadir> <modeldir> <checkpoint>" 15 | exit 1; 16 | fi 17 | 18 | data=$1 19 | model=$2 20 | checkpoint=$3 21 | 22 | odir=${model}/decorrupt_${checkpoint} 23 | mkdir -p ${odir} 24 | 25 | num_steps_opts="" 26 | if [ ! -z $num_steps ]; then 27 | num_steps_opts="--num-steps ${num_steps}" 28 | fi 29 | 30 | train_cmd="utils/retry.pl utils/queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 31 | 32 | ${train_cmd} ${odir}/log decorrupt.py --gpu \ 33 | --datadir ${data} \ 34 | --modeldir ${model} \ 35 | --checkpoint ${checkpoint} \ 36 | --dumpdir ${odir} \ 37 | --idim ${idim} \ 38 | --chunk-width ${chunk_width} \ 39 | --left-context ${left_context} \ 40 | --right-context ${right_context} \ 41 | --batchsize ${batchsize} \ 42 | --perturb ${perturb} \ 43 | ${num_steps_opts} 44 | -------------------------------------------------------------------------------- /librispeech100/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | . ./cmd.sh 4 | 5 | stage=0 6 | subsampling=4 7 | chaindir=exp/chain_wrn 8 | model_dirname=wrn_semisup 9 | checkpoint=20.mdl 10 | top_k=10 11 | target="2697 2697 2697 2697 2697" 12 | left=10 13 | right=5 14 | chunk_width=20 15 | idim=80 16 | gpu=false 17 | 18 | . ./utils/parse_options.sh 19 | 20 | tree=${chaindir}/tree 21 | 22 | # Generation 23 | modeldir=`dirname ${chaindir}`/${model_dirname} 24 | gen_dir=${modeldir}/generate_cond_${checkpoint} 25 | mkdir -p ${gen_dir} 26 | 27 | gpu_opts= 28 | if $gpu; then 29 | gpu_opts="--gpu" 30 | generate_cmd="./utils/queue.pl --mem 2G --gpu 1 --config conf/gpu.conf ${gen_dir}/log" 31 | else 32 | generate_cmd="./utils/queue.pl --mem 2G ${gen_dir}/log" 33 | fi 34 | 35 | target_opts= 36 | if [ !
-z "$target" ]; then 37 | echo "Target: ${target}" 38 | target_opts="--target ${target}" 39 | generate_cmd="./utils/queue.pl --mem 2G --gpu 1 --config conf/gpu.conf ${gen_dir}/log" 40 | gpu_opts="--gpu" 41 | else 42 | gpu_opts= 43 | fi 44 | 45 | ${generate_cmd} generate_conditional_from_buffer.py \ 46 | ${gpu_opts} \ 47 | ${target_opts} \ 48 | --idim ${idim} \ 49 | --modeldir ${modeldir} --modelname ${checkpoint} \ 50 | --dumpdir ${gen_dir} --batchsize 32 \ 51 | --left-context ${left} --right-context ${right} --chunk-width ${chunk_width} \ 52 | --top-k ${top_k} 53 | -------------------------------------------------------------------------------- /librispeech100/local/data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | if [ "$#" -ne 2 ]; then 8 | echo "Usage: $0 " 9 | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" 10 | exit 1 11 | fi 12 | 13 | src=$1 14 | dst=$2 15 | 16 | # all utterances are FLAC compressed 17 | if ! which flac >&/dev/null; then 18 | echo "Please install 'flac' on ALL worker nodes!" 19 | exit 1 20 | fi 21 | 22 | spk_file=$src/../SPEAKERS.TXT 23 | 24 | mkdir -p $dst || exit 1; 25 | 26 | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1; 27 | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; 28 | 29 | 30 | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp 31 | trans=$dst/text; [[ -f "$trans" ]] && rm $trans 32 | utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk 33 | spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender 34 | 35 | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do 36 | reader=$(basename $reader_dir) 37 | if ! [ $reader -eq $reader ]; then # not integer. 38 | echo "$0: unexpected subdirectory name $reader" 39 | exit 1; 40 | fi 41 | 42 | reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') 43 | if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then 44 | echo "Unexpected gender: '$reader_gender'" 45 | exit 1; 46 | fi 47 | 48 | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do 49 | chapter=$(basename $chapter_dir) 50 | if ! [ "$chapter" -eq "$chapter" ]; then 51 | echo "$0: unexpected chapter-subdirectory name $chapter" 52 | exit 1; 53 | fi 54 | 55 | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ 56 | awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 57 | 58 | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt 59 | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 60 | cat $chapter_trans >>$trans 61 | 62 | # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered 63 | # to be a different speaker. This is done for simplicity and because we want 64 | # e.g. 
the CMVN to be calculated per-chapter 65 | awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ 66 | <$chapter_trans >>$utt2spk || exit 1 67 | 68 | # reader -> gender map (again using per-chapter granularity) 69 | echo "${reader}-${chapter} $reader_gender" >>$spk2gender 70 | done 71 | done 72 | 73 | spk2utt=$dst/spk2utt 74 | utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 75 | 76 | ntrans=$(wc -l <$trans) 77 | nutt2spk=$(wc -l <$utt2spk) 78 | ! [ "$ntrans" -eq "$nutt2spk" ] && \ 79 | echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; 80 | 81 | utils/validate_data_dir.sh --no-feats $dst || exit 1; 82 | 83 | echo "$0: successfully prepared data in $dst" 84 | 85 | exit 0 86 | -------------------------------------------------------------------------------- /librispeech100/local/decode_nnet_pytorch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | 5 | batchsize=512 6 | checkpoint=final.mdl 7 | prior_scale=1.0 8 | prior_floor=-20.0 9 | prior_name="priors" 10 | min_active=200 11 | max_active=7000 12 | max_mem=50000000 13 | lattice_beam=8.0 14 | beam=15.0 15 | acoustic_scale=0.1 16 | post_decode_acwt=10.0 # 10.0 for chain systems, 1.0 for non-chain 17 | mean_var="(True, True)" 18 | 19 | min_lmwt=6 20 | max_lmwt=18 21 | nj=80 22 | stage=0 23 | 24 | . ./utils/parse_options.sh 25 | if [ $# -ne 4 ]; then 26 | echo "Usage: ./decode_nnet_pytorch.sh <data> <pytorch-model-dir> <graph-dir> <decode-dir>" 27 | echo " --batchsize ${batchsize} " 28 | echo " --checkpoint ${checkpoint} --prior-scale ${prior_scale} --prior-floor ${prior_floor} --prior-name ${prior_name}" 29 | echo " --min-active ${min_active} --max-active ${max_active}" 30 | echo " --max-mem ${max_mem} --lattice-beam ${lattice_beam}" 31 | echo " --beam ${beam} --acoustic-scale ${acoustic_scale} --post-decode-acwt ${post_decode_acwt}" 32 | echo " --nj ${nj}" 33 | exit 1; 34 | fi 35 | 36 | data=$1 37 | pytorch_model=$2 38 | graphdir=$3 39 | odir=$4 40 | 41 | # We assume the acoustic model (final.mdl, with the transition model) is 1 level above the graphdir 42 | amdir=`dirname ${graphdir}` 43 | trans_mdl=${amdir}/final.mdl 44 | words_file=${graphdir}/words.txt 45 | hclg=${graphdir}/HCLG.fst 46 | 47 | mkdir -p ${odir}/log 48 | 49 | decode_cmd="utils/queue.pl --mem 2G -l hostname='!b02*&!a*&!c06*&!c23*&!c24*&!c25*&!c26*&!c27*'" # The 'a' machines are just too slow 50 | if [ $stage -le 0 ]; then 51 | segments=${data}/segments 52 | if [ ! -f ${data}/segments ]; then 53 | echo "No segments file found.
Assuming wav.scp is indexed by utterance" 54 | segments=${data}/wav.scp 55 | fi 56 | 57 | ${decode_cmd} JOB=1:${nj} ${odir}/log/decode.JOB.log \ 58 | ./utils/split_scp.pl -j ${nj} \$\[JOB -1\] ${segments} \|\ 59 | decode.py --datadir ${data} \ 60 | --modeldir ${pytorch_model} \ 61 | --dumpdir ${odir} \ 62 | --checkpoint ${checkpoint} \ 63 | --prior-scale ${prior_scale} \ 64 | --prior-floor ${prior_floor} \ 65 | --prior-name ${prior_name} \ 66 | --words-file ${words_file} \ 67 | --trans-mdl ${trans_mdl} \ 68 | --hclg ${hclg} \ 69 | --min-active ${min_active} \ 70 | --max-active ${max_active} \ 71 | --lattice-beam ${lattice_beam} \ 72 | --beam ${beam} \ 73 | --acoustic-scale ${acoustic_scale} \ 74 | --post-decode-acwt ${post_decode_acwt} \ 75 | --job JOB \ 76 | --utt-subset /dev/stdin \ 77 | --batchsize ${batchsize} 78 | fi 79 | 80 | if [ $stage -le 1 ]; then 81 | ./local/score.sh --cmd "$decode_cmd" \ 82 | --min-lmwt ${min_lmwt} --max-lmwt ${max_lmwt} --word-ins-penalty 0.0 \ 83 | ${data} ${graphdir} ${odir} 84 | fi 85 | -------------------------------------------------------------------------------- /librispeech100/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 4 | # 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 5 | # Apache 2.0 6 | 7 | remove_archive=false 8 | 9 | if [ "$1" == --remove-archive ]; then 10 | remove_archive=true 11 | shift 12 | fi 13 | 14 | if [ $# -ne 3 ]; then 15 | echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>" 16 | echo "e.g.: $0 /export/a05/dgalvez/ www.openslr.org/resources/31 dev-clean-2" 17 | echo "With --remove-archive it will remove the archive after successfully un-tarring it." 18 | echo " <corpus-part> can be one of: dev-clean-2, test-clean-5, dev-other, test-other," 19 | echo " train-clean-100, train-clean-360, train-other-500." 20 | fi 21 | 22 | data=$1 23 | url=$2 24 | part=$3 25 | 26 | if [ ! -d "$data" ]; then 27 | echo "$0: no such directory $data" 28 | exit 1; 29 | fi 30 | 31 | data=$(readlink -f $data) 32 | 33 | part_ok=false 34 | list="dev-clean-2 train-clean-5" 35 | for x in $list; do 36 | if [ "$part" == $x ]; then part_ok=true; fi 37 | done 38 | if ! $part_ok; then 39 | echo "$0: expected <corpus-part> to be one of $list, but got '$part'" 40 | exit 1; 41 | fi 42 | 43 | if [ -z "$url" ]; then 44 | echo "$0: empty URL base." 45 | exit 1; 46 | fi 47 | 48 | if [ -f $data/LibriSpeech/$part/.complete ]; then 49 | echo "$0: data part $part was already successfully extracted, nothing to do." 50 | exit 0; 51 | fi 52 | 53 | 54 | #sizes="126046265 332747356" 55 | sizes="126046265 332954390" 56 | 57 | if [ -f $data/$part.tar.gz ]; then 58 | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') 59 | size_ok=false 60 | for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done 61 | if ! $size_ok; then 62 | echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" 63 | echo "does not equal the size of one of the archives." 64 | rm $data/$part.tar.gz 65 | else 66 | echo "$data/$part.tar.gz exists and appears to be complete." 67 | fi 68 | fi 69 | 70 | if [ ! -f $data/$part.tar.gz ]; then 71 | if ! which wget >/dev/null; then 72 | echo "$0: wget is not installed." 73 | exit 1; 74 | fi 75 | full_url=$url/$part.tar.gz 76 | echo "$0: downloading data from $full_url. This may take some time, please be patient." 77 | 78 | cd $data 79 | if !
wget --no-check-certificate $full_url; then 80 | echo "$0: error executing wget $full_url" 81 | exit 1 82 | fi 83 | cd - 84 | fi 85 | 86 | cd $data 87 | 88 | if ! tar -xvzf $part.tar.gz; then 89 | echo "$0: error un-tarring archive $data/$part.tar.gz" 90 | exit 1 91 | fi 92 | 93 | touch $data/LibriSpeech/$part/.complete 94 | 95 | echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" 96 | 97 | if $remove_archive; then 98 | echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 99 | rm $data/$part.tar.gz 100 | fi 101 | -------------------------------------------------------------------------------- /librispeech100/local/download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2017 Daniel Povey 5 | # Apache 2.0 6 | 7 | if [ $# -ne "3" ]; then 8 | echo "Usage: $0 <base-url> <dst-dir> <local-dir>" 42 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 43 | if [[ "$fsize" -eq "$expect_size" ]]; then 44 | echo "'$fname' already exists and appears to be complete" 45 | return 0 46 | else 47 | echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." 48 | fi 49 | fi 50 | wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { 51 | echo "Error while trying to download $fname!" 52 | return 1 53 | } 54 | f=$dst_dir/$fname 55 | # In the following statement, the first version works on Linux, and the part after '||' 56 | # works on macOS (BSD stat). 57 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 58 | [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } 59 | return 0 60 | } 61 | 62 | mkdir -p $dst_dir $local_dir 63 | 64 | for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \ 65 | librispeech-vocab.txt librispeech-lexicon.txt; do 66 | check_and_download $f || exit 1 67 | done 68 | 69 | dst_dir=$(readlink -f $dst_dir) 70 | ln -sf $dst_dir/3-gram.pruned.1e-7.arpa.gz $local_dir/lm_tgmed.arpa.gz 71 | ln -sf $dst_dir/3-gram.pruned.3e-7.arpa.gz $local_dir/lm_tgsmall.arpa.gz 72 | ln -sf $dst_dir/3-gram.arpa.gz $local_dir/lm_tglarge.arpa.gz 73 | ln -sf $dst_dir/4-gram.arpa.gz $local_dir/lm_fglarge.arpa.gz 74 | ln -sf $dst_dir/librispeech-lexicon.txt $local_dir/librispeech-lexicon.txt 75 | ln -sf $dst_dir/librispeech-vocab.txt $local_dir/librispeech-vocab.txt 76 | exit 0 77 | -------------------------------------------------------------------------------- /librispeech100/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test-time language model (G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh || exit 1; 10 | 11 | # begin configuration section 12 | src_dir=data/lang 13 | # end configuration section 14 | 15 | . utils/parse_options.sh || exit 1; 16 | 17 | set -e 18 | 19 | if [ $# -ne 1 ]; then 20 | echo "Usage: $0 <lm-dir>" 21 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 22 | echo ", where:" 23 | echo " <lm-dir> is the directory in which the language model is stored/downloaded" 24 | echo "Options:" 25 | echo " --src-dir <dir> # source lang directory, default data/lang" 26 | exit 1 27 | fi 28 | 29 | lm_dir=$1 30 | 31 | if [ ! -d $lm_dir ]; then 32 | echo "$0: expected source LM directory $lm_dir to exist" 33 | exit 1; 34 | fi 35 | if [ !
-f $src_dir/words.txt ]; then 36 | echo "$0: expected $src_dir/words.txt to exist." 37 | exit 1; 38 | fi 39 | 40 | 41 | tmpdir=data/local/lm_tmp.$$ 42 | trap "rm -r $tmpdir" EXIT 43 | 44 | mkdir -p $tmpdir 45 | 46 | for lm_suffix in tgsmall tgmed; do 47 | # tglarge is prepared by a separate command, called from run.sh; we don't 48 | # want to compile G.fst for tglarge, as it takes a while. 49 | test=${src_dir}_test_${lm_suffix} 50 | mkdir -p $test 51 | cp -r ${src_dir}/* $test 52 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 53 | arpa2fst --disambig-symbol=#0 \ 54 | --read-symbol-table=$test/words.txt - $test/G.fst 55 | utils/validate_lang.pl --skip-determinization-check $test || exit 1; 56 | done 57 | 58 | echo "Succeeded in formatting data." 59 | 60 | exit 0 61 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the dictionary and auto-generates the pronunciations for the words 7 | # that are in our vocabulary but not in CMUdict 8 | 9 | stage=0 10 | nj=4 # number of parallel Sequitur G2P jobs we would like to use 11 | cmd=run.pl 12 | 13 | 14 | . utils/parse_options.sh || exit 1; 15 | . ./path.sh || exit 1 16 | 17 | 18 | if [ $# -ne 3 ]; then 19 | echo "Usage: $0 [options] <lm-dir> <g2p-model-dir> <dst-dir>" 20 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" 21 | echo "Options:" 22 | echo " --cmd '<command>' # script to launch jobs with, default: run.pl" 23 | echo " --nj <nj> # number of jobs to run, default: 4." 24 | exit 1 25 | fi 26 | 27 | lm_dir=$1 28 | g2p_model_dir=$2 29 | dst_dir=$3 30 | 31 | vocab=$lm_dir/librispeech-vocab.txt 32 | [ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; 33 | 34 | # this file is either a copy of the lexicon we download from openslr.org/11 or is 35 | # created by the G2P steps below 36 | lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt 37 | 38 | cmudict_dir=$dst_dir/cmudict 39 | cmudict_plain=$dst_dir/cmudict.0.7a.plain 40 | 41 | mkdir -p $dst_dir || exit 1; 42 | 43 | if [ $stage -le 0 ]; then 44 | echo "Downloading and preparing CMUdict" 45 | if [ ! -s $cmudict_dir/cmudict.0.7a ]; then 46 | svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; 47 | fi 48 | echo "Removing the pronunciation variant markers ..." 49 | grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ 50 | perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ 51 | > $cmudict_plain || exit 1; 52 | fi 53 | 54 | 55 | if [ $stage -le 1 ]; then 56 | # check if Sequitur G2P is installed 57 | if [ ! -f "$sequitur" ]; then 58 | if !
which swig >&/dev/null; then 59 | echo "Please install 'swig' and then run $KALDI_ROOT/tools/extra/install_sequitur.sh" 60 | exit 1 61 | else 62 | echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extra/install_sequitur.sh" 63 | pushd $KALDI_ROOT/tools 64 | extras/install_sequitur.sh || exit 1 65 | popd 66 | fi 67 | fi 68 | [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; } 69 | 70 | g2p_dir=$dst_dir/g2p 71 | auto_vocab_prefix="$g2p_dir/vocab_autogen" 72 | auto_lexicon_prefix="$g2p_dir/lexicon_autogen" 73 | 74 | mkdir -p $g2p_dir/log 75 | auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") 76 | awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ 77 | sort | tee $g2p_dir/vocab_autogen.full |\ 78 | utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 79 | echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." 80 | $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ 81 | local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 82 | g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) 83 | g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) 84 | [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } 85 | sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt 86 | sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt 87 | echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" 88 | fi 89 | 90 | if [ $stage -le 2 ]; then 91 | echo "Combining the CMUdict pronunciations with the autogenerated ones ..." 92 | awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ 93 | cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 94 | raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) 95 | vocab_size=$(wc -l <$vocab) 96 | [[ "$vocab_size" -eq "$raw_lex_size" ]] || { 97 | echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; 98 | exit 1; } 99 | echo "Combined lexicon saved to '$lexicon_raw_nosil'" 100 | fi 101 | 102 | # The copy operation below is necessary, if we skip the g2p stages(e.g. using --stage 3) 103 | if [[ ! -s "$lexicon_raw_nosil" ]]; then 104 | cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 105 | fi 106 | 107 | if [ $stage -le 3 ]; then 108 | silence_phones=$dst_dir/silence_phones.txt 109 | optional_silence=$dst_dir/optional_silence.txt 110 | nonsil_phones=$dst_dir/nonsilence_phones.txt 111 | extra_questions=$dst_dir/extra_questions.txt 112 | 113 | echo "Preparing phone lists and clustering questions" 114 | (echo SIL; echo SPN;) > $silence_phones 115 | echo SIL > $optional_silence 116 | # nonsilence phones; on each line is a list of phones that correspond 117 | # really to the same base phone. 118 | awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ 119 | sort -u |\ 120 | perl -e 'while(<>){ 121 | chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; 122 | $phones_of{$1} .= "$_ "; } 123 | foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ 124 | > $nonsil_phones || exit 1; 125 | # A few extra questions that will be added to those obtained by automatically clustering 126 | # the "real" phones. These ask about stress; there's also one for silence. 
127 | cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; 128 | cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) { 129 | $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ 130 | >> $extra_questions || exit 1; 131 | echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" 132 | echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" 133 | echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" 134 | echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" 135 | fi 136 | 137 | if [ $stage -le 4 ]; then 138 | (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\ 139 | cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt 140 | echo "Lexicon text file saved as: $dst_dir/lexicon.txt" 141 | fi 142 | 143 | exit 0 144 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_librilight.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | . ./cmd.sh 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "Usage: ./local/prepare_librilight.sh <data-dir>" 8 | exit 1; 9 | fi 10 | 11 | data=$1 12 | # Get librilight set 13 | wget https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz 14 | tar -xvf librispeech_finetuning.tgz && mv librispeech_finetuning ${data} 15 | 16 | # The following are the data subsets: 17 | # 1h/{0..5}/{clean,other} 18 | # 9h/{clean,other} 19 | # 20 | # In each of these subsets there are speaker directories named with a speaker-id. 21 | # Inside each directory are more directories corresponding to a recording-id. 22 | # Within each speaker-id/recording-id subdirectory are the .flac audio files 23 | # corresponding to speech utterances, as well as a .trans.txt file that has 24 | # the transcription. 25 | 26 | find -L $data -name "*.flac" # sanity check: list the extracted .flac files 27 | 28 | for part in 1h/{0..5}/{clean,other} 9h/{clean,other}; do 29 | dataname=$(echo ${part} | sed 's/\//_/g') 30 | ./local/prepare_librilight_dataset.sh ${data}/${part} data/train_${dataname} 31 | done 32 | 33 | ./utils/combine_data.sh \ 34 | data/train_10h data/train_1h_{0..5}_{clean,other} data/train_9h_{clean,other} 35 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_librilight_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | .
./cmd.sh 5 | 6 | if [ $# -ne 2 ]; then 7 | echo "Usage: ./local/prepare_librilight_dataset.sh <data> <kaldi-data-dir>" 8 | exit 1; 9 | fi 10 | 11 | data=$1 12 | kaldi_data=$2 13 | 14 | data=$(./utils/make_absolute.sh ${data}) 15 | mkdir -p $kaldi_data 16 | files=( `find -L ${data} -name "*.flac"` ) 17 | 18 | for f in ${files[@]}; do 19 | fname=`basename $f` 20 | fname=${fname%%.flac} 21 | echo "${fname} flac -c -d -s ${f} |" 22 | done | sort > ${kaldi_data}/wav.scp 23 | 24 | paste -d' ' <(awk '{print $1}' ${kaldi_data}/wav.scp) \ 25 | <(awk '{print $1}' ${kaldi_data}/wav.scp | cut -d'-' -f1) \ 26 | > ${kaldi_data}/utt2spk 27 | 28 | ./utils/utt2spk_to_spk2utt.pl ${kaldi_data}/utt2spk > ${kaldi_data}/spk2utt 29 | 30 | cat `find -L ${data} -name "*.trans.txt"` | sort > ${kaldi_data}/text 31 | exit 0; 32 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | . ./cmd.sh 4 | 5 | data="/export/corpora5" 6 | subsampling=4 7 | num_split=20 8 | testsets="dev-clean dev-other test-clean test-other" 9 | feat_affix=_fbank 10 | standard_split=false 11 | 12 | . ./utils/parse_options.sh 13 | 14 | for part in $testsets; do 15 | echo "-------------- Making ${part} ----------------------" 16 | dataname=$(echo ${part} | sed s/-/_/g) 17 | part=$(echo ${part} | sed s/_/-/g) 18 | if $standard_split; then 19 | local/data_prep.sh $data/LibriSpeech/${part} data/${dataname} 20 | else 21 | echo "Assuming the testset ${part} is manually created and exists ..." 22 | fi 23 | ./utils/copy_data_dir.sh data/${dataname} data/${dataname}${feat_affix} 24 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 \ 25 | data/${dataname}${feat_affix} exp/make_fbank/${dataname}${feat_affix} ${feat_affix##_} 26 | ./utils/fix_data_dir.sh data/${dataname}${feat_affix} 27 | ./steps/compute_cmvn_stats.sh data/${dataname}${feat_affix} 28 | ./utils/fix_data_dir.sh data/${dataname}${feat_affix} 29 | 30 | prepare_unlabeled_tgt.py --subsample ${subsampling} \ 31 | data/${dataname}${feat_affix}/utt2num_frames > data/${dataname}${feat_affix}/pdfid.${subsampling}.tgt 32 | split_memmap_data.sh data/${dataname}${feat_affix} data/${dataname}${feat_affix}/pdfid.${subsampling}.tgt $num_split 33 | done 34 | 35 | 36 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_unlabeled_tgt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner) 4 | # Apache 2.0 5 | 6 | from __future__ import print_function 7 | import argparse 8 | import sys 9 | import os 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('utt2num_frames', 15 | help='utt2num_frames file, as produced by Kaldi', 16 | type=str 17 | ) 18 | parser.add_argument('--subsample', type=int, default=1) 19 | 20 | args = parser.parse_args() 21 | 22 | with open(args.utt2num_frames, 'r') as f: 23 | for l in f: 24 | utt, frames = l.strip().split(None, 1) 25 | print(utt, end='') 26 | num_frames = len(range(0, int(frames), args.subsample)) 27 | print(' -1' * num_frames) 28 | 29 | if __name__ == "__main__": 30 | main() 31 | 32 | -------------------------------------------------------------------------------- /librispeech100/local/score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #
Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 3 | # 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | [ -f ./path.sh ] && . ./path.sh 7 | 8 | # begin configuration section. 9 | cmd=run.pl 10 | stage=0 11 | decode_mbr=true 12 | word_ins_penalty=0.0,0.5,1.0 13 | min_lmwt=7 14 | max_lmwt=17 15 | iter=final 16 | #end configuration section. 17 | 18 | [ -f ./path.sh ] && . ./path.sh 19 | . parse_options.sh || exit 1; 20 | 21 | if [ $# -ne 3 ]; then 22 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" 23 | echo " Options:" 24 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 25 | echo " --stage (0|1|2) # start scoring script from part-way through." 26 | echo " --decode_mbr (true/false) # minimum Bayes risk decoding (confusion network)." 27 | echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring " 28 | echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring " 29 | exit 1; 30 | fi 31 | 32 | data=$1 33 | lang_or_graph=$2 34 | dir=$3 35 | 36 | symtab=$lang_or_graph/words.txt 37 | 38 | for f in $symtab $dir/lat.1.gz $data/text; do 39 | [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; 40 | done 41 | 42 | mkdir -p $dir/scoring/log 43 | 44 | cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt 45 | 46 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 47 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ 48 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 49 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 50 | lattice-best-path --word-symbol-table=$symtab \ 51 | ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; 52 | done 53 | 54 | # Note: the double level of quoting for the sed command 55 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 56 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ 57 | cat $dir/scoring/LMWT.$wip.tra \| \ 58 | utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ 59 | compute-wer --text --mode=present \ 60 | ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; 61 | done 62 | 63 | exit 0; 64 | -------------------------------------------------------------------------------- /librispeech100/local/split_memmap_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | . ./cmd.sh 5 | 6 | . ./utils/parse_options.sh 7 | if [ $# -ne 2 ]; then 8 | echo "Usage: ./local/split_memmap_data.sh <datadir> <num-split>" 9 | exit 1; 10 | fi 11 | 12 | datadir=$1 13 | num_split=$2 14 | 15 | dataname=`basename ${datadir}` 16 | mapped_dir=${datadir}/mapped # don't change this path 17 | mkdir -p $mapped_dir 18 | echo "$0: Splitting data into $num_split parts" 19 | # spread the mapped numpy arrays over various machines, as this data-set is quite large. 20 | if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then 21 | utils/create_split_dir.pl /export/b{11,12,13,14}/$USER/kaldi-data/egs/librispeech100/$mapped_dir/storage \ 22 | $mapped_dir/storage 23 | fi 24 | utils/split_data.sh ${datadir} $num_split 25 | for n in $(seq $num_split); do 26 | # the next command does nothing unless $mapped_dir/storage/ exists, see 27 | # utils/create_data_link.pl for more info.
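# (For reference: create_data_link.pl turns feats.dat.$n into a symlink to a
# file under one of the $mapped_dir/storage/ directories set up above by
# create_split_dir.pl, chosen round-robin, so the memory-mapped shards get
# spread across disks; if storage/ was never created it leaves the path alone.)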
28 | utils/create_data_link.pl $mapped_dir/feats.dat.$n 29 | done 30 | $train_cmd JOB=1:$num_split exp/make_fbank/${dataname}/memmap_data.JOB.log \ 31 | memmap_data.py ${datadir}/split${num_split}/JOB/feats.scp $mapped_dir/feats.dat.JOB \ 32 | $mapped_dir/metadata.JOB 33 | echo $num_split > ${datadir}/num_split 34 | 35 | -------------------------------------------------------------------------------- /librispeech100/local/subset_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 4 | # Apache 2.0 5 | 6 | # The following commands were used to generate the mini_librispeech dataset: 7 | # 8 | # Note that data generation is random. This could be fixed by 9 | # providing a seed argument to the shuf program. 10 | 11 | if [ "$#" -ne 3 ]; then 12 | echo "Usage: $0 " 13 | echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ 14 | /export/a05/dgalvez/LibriSpeech/train-clean-5 5" 15 | exit 1 16 | fi 17 | 18 | src_dir=$1 19 | dest_dir=$2 20 | dest_num_hours=$3 21 | 22 | src=$(basename $src_dir) 23 | dest=$(basename $dest_dir) 24 | librispeech_dir=$(dirname $src_dir) 25 | 26 | # TODO: Possibly improve this to ensure gender balance and speaker 27 | # balance. 28 | # TODO: Use actual time values instead of assuming that to make sure we get $dest_num_hours of data 29 | src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \ 30 | python -c ' 31 | from __future__ import print_function 32 | from sys import stdin 33 | minutes_str = stdin.read().split() 34 | print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))') 35 | src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 36 | awk -F'|' '{ print $1 }' | sort -u | wc -l) 37 | mkdir -p data/subset_tmp 38 | grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 39 | awk -F'|' '{ print $1 }' | \ 40 | shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \ 41 | data/subset_tmp/${dest}_chapter_id_list.txt 42 | 43 | while read -r chapter_id || [[ -n "$chapter_id" ]]; do 44 | chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d) 45 | speaker_id=$(basename $(dirname $chapter_dir)) 46 | mkdir -p $dest_dir/$speaker_id/ 47 | cp -r $chapter_dir $dest_dir/$speaker_id/ 48 | done < data/subset_tmp/${dest}_chapter_id_list.txt 49 | -------------------------------------------------------------------------------- /librispeech100/path.sh: -------------------------------------------------------------------------------- 1 | export ROOT=`pwd`/../tools 2 | export KALDI_ROOT=${ROOT}/kaldi 3 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 4 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/openfst/bin:`pwd`/../nnet_pytorch:$PWD:$PATH:`pwd`/../nnet_pytorch/utils/ 5 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 6 | . 
$KALDI_ROOT/tools/config/common_path.sh 7 | export LC_ALL=C 8 | 9 | export OPENFST_PATH=${ROOT}/openfst #/PATH/TO/OPENFST 10 | export LD_LIBRARY_ORIG=${LD_LIBRARY_PATH} 11 | export LD_LIBRARY_PATH=${OPENFST_PATH}/lib:${LD_LIBRARY_PATH} 12 | #export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64 13 | 14 | export PYTHONPATH=${PYTHONPATH}:`pwd`/../nnet_pytorch/:`pwd`/../nnet_pytorch/utils/ 15 | export PYTHONUNBUFFERED=1 16 | source ${ROOT}/NeurIPS2020/bin/activate 17 | 18 | export LC_ALL=C 19 | 20 | -------------------------------------------------------------------------------- /librispeech100/run-semisup-wrn-scratch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is based almost entirely on the Kaldi librispeech recipe 4 | # Change this location to somewhere where you want to put the data. 5 | # This recipe ASSUMES YOU HAVE DOWNLOADED the Librispeech data 6 | unlabeled_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 7 | 8 | . ./cmd.sh 9 | . ./path.sh 10 | 11 | stage=1 12 | subsampling=4 13 | traindir=data/train_100h 14 | unsupdir=data/train_860h 15 | feat_affix=_fbank_64 16 | chaindir=exp/chain 17 | model_dirname=wrn_semisup 18 | batches_per_epoch=250 19 | num_epochs=240 20 | delay=2 21 | train_nj_init=2 22 | train_nj_final=6 23 | ebm_weight=1.0 24 | ebm_type="uncond" 25 | ebm_tgt=data/train_100h_fbank_64/pdfid.4.tgt 26 | sgld_opt=adam 27 | sgld_stepsize=1.0 28 | sgld_maxsteps=50.0 29 | sgld_minsteps=1 30 | sgld_replay=1.0 31 | sgld_noise=0.001 32 | sgld_weight_decay=1e-10 33 | sgld_decay=1e-04 34 | sgld_warmup=15000 35 | sgld_clip=1.0 36 | sgld_init_val=1.5 37 | sgld_epsilon=1e-04 38 | lr=0.0002 39 | xent=0.1 40 | l2=0.0001 41 | leaky_hmm=0.1 42 | l2_energy=0.001 43 | warmup=15000 44 | unsup_num_repeats=1 45 | unsup_batchsize=32 46 | unsup_chunkwidth=50 47 | unsup_left=10 48 | unsup_right=5 49 | mean_norm=True 50 | var_norm=True 51 | perturb="gauss 0.01" 52 | depth=28 53 | width=10 54 | seed=0 55 | resume= 56 | num_split=80 # number of splits for memory-mapped data for training 57 | . ./utils/parse_options.sh 58 | 59 | set -euo pipefail 60 | 61 | tree=${chaindir}/tree 62 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 63 | trainname=`basename ${traindir}` 64 | 65 | # Make the unlabeled data 66 | if [ $stage -le 0 ]; then 67 | for part in train-clean-360 train-other-500; do 68 | local/data_prep.sh $unlabeled_data/LibriSpeech/${part} data/$(echo ${part} | sed s/-/_/g) 69 | done 70 | 71 | ./utils/combine_data.sh data/train_860 data/train_{clean_360,other_500} 72 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 data/train_860 exp/make_fbank/train_860 ${feat_affix##_} 73 | ./utils/fix_data_dir.sh data/train_860 74 | ./steps/compute_cmvn_stats.sh data/train_860 75 | ./utils/fix_data_dir.sh data/train_860 76 | 77 | python prepare_unlabeled_tgt.py --subsample ${subsampling} data/train_860/utt2num_frames > data/train_860/pdfid.${subsampling}.unsup.tgt 78 | split_memmap_data.sh data/train_860 data/train_860/pdfid.${subsampling}.tgt ${num_split} 79 | fi 80 | 81 | 82 | # We use a lower learning rate in order to prevent the model from forgetting 83 | # too much. 84 | if [ $stage -eq 1 ]; then 85 | resume_opts= 86 | if [ ! 
-z $resume ]; then 87 | resume_opts="--resume ${resume}" 88 | fi 89 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 90 | idim=$(feat-to-dim scp:${traindir}${feat_affix}/feats.scp -) 91 | train_async_parallel.sh ${resume_opts} \ 92 | --gpu true \ 93 | --objective SemisupLFMMI \ 94 | --denom-graph ${chaindir}/den.fst \ 95 | --num-pdfs ${num_pdfs} \ 96 | --idim ${idim} \ 97 | --subsample ${subsampling} \ 98 | --model ChainWideResnet \ 99 | --depth ${depth} \ 100 | --width ${width} \ 101 | --warmup ${warmup} \ 102 | --decay 1e-05 \ 103 | --xent ${xent} \ 104 | --l2 ${l2} \ 105 | --leaky-hmm ${leaky_hmm} \ 106 | --weight-decay 1e-07 \ 107 | --lr ${lr} \ 108 | --batches-per-epoch ${batches_per_epoch} \ 109 | --num-epochs ${num_epochs} \ 110 | --validation-spks 0 \ 111 | --sgld-thresh 0.0 \ 112 | --sgld-reinit-p 0.05 \ 113 | --sgld-buffer 10000 \ 114 | --sgld-stepsize ${sgld_stepsize} \ 115 | --sgld-steps ${sgld_minsteps} \ 116 | --sgld-max-steps ${sgld_maxsteps} \ 117 | --sgld-noise ${sgld_noise} \ 118 | --sgld-decay ${sgld_decay} \ 119 | --sgld-real-decay 0.0 \ 120 | --sgld-clip ${sgld_clip} \ 121 | --sgld-warmup ${sgld_warmup} \ 122 | --sgld-optim ${sgld_opt} \ 123 | --sgld-init-val ${sgld_init_val} \ 124 | --sgld-epsilon ${sgld_epsilon} \ 125 | --sgld-replay-correction ${sgld_replay} \ 126 | --l2-energy ${l2_energy} \ 127 | --sgld-weight-decay ${sgld_weight_decay} \ 128 | --delay-updates ${delay} \ 129 | --lfmmi-weight 1.0 \ 130 | --ebm-weight ${ebm_weight} \ 131 | --ebm-type ${ebm_type} \ 132 | --ebm-tgt ${ebm_tgt} \ 133 | --nj-init ${train_nj_init} \ 134 | --nj-final ${train_nj_final} \ 135 | --seed ${seed} \ 136 | "[ \ 137 | {\ 138 | 'data': '${traindir}${feat_affix}', \ 139 | 'tgt': '${targets}', \ 140 | 'batchsize': 32, 'num_repeats': 1, 'chunk_width': 140, \ 141 | 'left_context': 10, 'right_context': 5, \ 142 | 'mean_norm': ${mean_norm}, 'var_norm': ${var_norm}, 'perturb_type': '${perturb}' \ 143 | },\ 144 | {\ 145 | 'data': '${unsupdir}', \ 146 | 'tgt': '${unsupdir}/pdfid.${subsampling}.unsup.tgt', \ 147 | 'batchsize': ${unsup_batchsize}, 'num_repeats': ${unsup_num_repeats}, 'chunk_width': ${unsup_chunkwidth}, \ 148 | 'left_context': ${unsup_left}, 'right_context': ${unsup_right}, \ 149 | 'mean_norm': ${mean_norm}, 'var_norm': ${var_norm}, 'perturb_type': '${perturb}' \ 150 | },\ 151 | ]" \ 152 | `dirname ${chaindir}`/${model_dirname} 153 | fi 154 | 155 | -------------------------------------------------------------------------------- /librispeech100/run-semisup-wrn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is based almost entirely on the Kaldi librispeech recipe 4 | # Change this location to somewhere where you want to put the data. 5 | # This recipe ASSUMES YOU HAVE DOWNLOADED the Librispeech data 6 | unlabeled_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 7 | 8 | . ./cmd.sh 9 | . 
./path.sh 10 | 11 | stage=1 12 | subsampling=4 13 | traindir=data/train_100h 14 | feat_affix=_fbank_64 15 | chaindir=exp/chain 16 | model_dirname=wrn_semisup 17 | batches_per_epoch=1000 18 | num_epochs=240 19 | train_nj=4 20 | lr=0.0001 21 | delay_updates=2 22 | warmup=15000 23 | ebm_weight=1.0 24 | sgld_opt=adam 25 | sgld_stepsize=1.0 26 | sgld_maxsteps=50.0 27 | sgld_minsteps=1 28 | sgld_replay=1.0 29 | sgld_noise=0.001 30 | sgld_weight_decay=1e-10 31 | sgld_decay=1e-04 32 | sgld_warmup=15000 33 | sgld_reinit=0.05 34 | sgld_clip=1.0 35 | l2_energy=0.0001 36 | unsup_batchsize=16 37 | sup_batchsize=16 38 | unsup_chunkwidth=50 39 | unsup_left=10 40 | unsup_right=5 41 | mean_norm=True 42 | resume= 43 | num_split=80 # number of splits for memory-mapped data for training 44 | . ./utils/parse_options.sh 45 | 46 | if [ $# -ne 1 ]; then 47 | echo "Usage: ./run-semisup-wrn.sh " 48 | echo " This script assumes you have trained a seed model first." 49 | echo " Do ./run-wrn.sh for instance." 50 | exit 1; 51 | fi 52 | 53 | init=$1 54 | set -euo pipefail 55 | 56 | [ ! -f ${init} ] && echo "Expected ${init} to exist." && exit 1; 57 | tree=${chaindir}/tree 58 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 59 | trainname=`basename ${traindir}` 60 | 61 | # Make the unlabeled data 62 | if [ $stage -le 0 ]; then 63 | for part in train-clean-360 train-other-500; do 64 | local/data_prep.sh $unlabeled_data/LibriSpeech/${part} data/$(echo ${part} | sed s/-/_/g) 65 | done 66 | 67 | ./utils/combine_data.sh data/train_860 data/train_{clean_360,other_500} 68 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 data/train_860 exp/make_fbank/train_860 ${feat_affix##_} 69 | ./utils/fix_data_dir.sh data/train_860 70 | ./steps/compute_cmvn_stats.sh data/train_860 71 | ./utils/fix_data_dir.sh data/train_860 72 | 73 | python prepare_unlabeled_tgt.py --subsample ${subsampling} data/train_860/utt2num_frames > data/train_860/pdfid.${subsampling}.tgt 74 | split_memmap_data.sh data/train_860 data/train_860/pdfid.${subsampling}.tgt ${num_split} 75 | fi 76 | 77 | 78 | # We use a lower learning rate in order to prevent the model from forgetting 79 | # too much. 80 | if [ $stage -eq 1 ]; then 81 | resume_opts= 82 | if [ ! 
-z $resume ]; then 83 | resume_opts="--resume ${resume}" 84 | fi 85 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 86 | train_async_parallel.sh ${resume_opts} \ 87 | --gpu true \ 88 | --objective SemisupLFMMI \ 89 | --denom-graph ${chaindir}/den.fst \ 90 | --num-pdfs ${num_pdfs} \ 91 | --subsample ${subsampling} \ 92 | --model ChainWideResnet \ 93 | --depth 28 \ 94 | --width 10 \ 95 | --warmup ${warmup} \ 96 | --decay 1e-05 \ 97 | --xent 0.01 \ 98 | --l2 0.0001 \ 99 | --weight-decay 1e-07 \ 100 | --lr ${lr} \ 101 | --batches-per-epoch ${batches_per_epoch} \ 102 | --num-epochs ${num_epochs} \ 103 | --validation-spks 0 \ 104 | --sgld-thresh 0 \ 105 | --sgld-reinit-p ${sgld_reinit} \ 106 | --sgld-buffer 10000 \ 107 | --sgld-stepsize ${sgld_stepsize} \ 108 | --sgld-steps ${sgld_minsteps} \ 109 | --sgld-max-steps ${sgld_maxsteps} \ 110 | --sgld-noise ${sgld_noise} \ 111 | --sgld-decay ${sgld_decay} \ 112 | --sgld-real-decay 0.0 \ 113 | --sgld-clip ${sgld_clip} \ 114 | --sgld-warmup ${sgld_warmup} \ 115 | --sgld-optim ${sgld_opt} \ 116 | --sgld-replay-correction ${sgld_replay} \ 117 | --l2-energy ${l2_energy} \ 118 | --sgld-weight-decay ${sgld_weight_decay} \ 119 | --delay-updates ${delay_updates} \ 120 | --lfmmi-weight 0.1 \ 121 | --ebm-weight ${ebm_weight} \ 122 | --nj ${train_nj} \ 123 | --init ${init} \ 124 | "[ \ 125 | {\ 126 | 'data': '${traindir}${feat_affix}', \ 127 | 'tgt': '${targets}', \ 128 | 'batchsize': ${sup_batchsize}, 'chunk_width': 140, 'num_repeats': 1,\ 129 | 'left_context': 10, 'right_context': 5, \ 130 | 'mean_norm': ${mean_norm}, 'var_norm': 'norm' \ 131 | },\ 132 | {\ 133 | 'data': 'data/train_860', \ 134 | 'tgt': 'data/train_860/pdfid.${subsampling}.tgt', \ 135 | 'batchsize': ${unsup_batchsize}, 'chunk_width': ${unsup_chunkwidth}, 'num_repeats': 1,\ 136 | 'left_context': ${unsup_left}, 'right_context': ${unsup_right}, \ 137 | 'mean_norm': ${mean_norm}, 'var_norm': 'norm' \ 138 | },\ 139 | ]" \ 140 | `dirname ${chaindir}`/${model_dirname} 141 | fi 142 | 143 | -------------------------------------------------------------------------------- /librispeech100/run-tdnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=3 8 | traindir=data/train_100h 9 | feat_affix=_fbank 10 | chaindir=exp/chain_tdnn 11 | num_leaves=3500 12 | model_dirname=tdnn 13 | batches_per_epoch=250 14 | num_epochs=240 15 | train_nj=2 16 | resume= 17 | num_split=20 # number of splits for memory-mapped data for training 18 | average=true 19 | 20 | . ./utils/parse_options.sh 21 | 22 | set -euo pipefail 23 | 24 | tree=${chaindir}/tree 25 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 26 | trainname=`basename ${traindir}` 27 | 28 | if [ $stage -le 1 ]; then 29 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 30 | lang=data/lang_chain 31 | cp -r data/lang $lang 32 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 33 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 34 | # Use our special topology... note that later on may have to tune this 35 | # topology. 
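# What this stage produces: gen_topo.py writes the chain HMM topology to
# $lang/topo; build_tree.sh builds a tree at the subsampled frame rate; a
# phone LM estimated from the tree alignments is compiled into den.fst, the
# denominator graph used by the LFMMI objective; and ali-to-pdf writes the
# frame-level nnet targets, one line per utterance of the form
#   <utt-id> <pdf-id> <pdf-id> ...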
36 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 37 | 38 | steps/nnet3/chain/build_tree.sh \ 39 | --frame-subsampling-factor ${subsampling} \ 40 | --context-opts "--context-width=2 --central-position=1" \ 41 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 42 | $lang exp/tri3_ali_${trainname} ${tree} 43 | 44 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 45 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 46 | 47 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 48 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 49 | 50 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 51 | fi 52 | 53 | if [ $stage -le 2 ]; then 54 | echo "Dumping memory mapped features ..." 55 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 56 | fi 57 | 58 | if [ $stage -le 3 ]; then 59 | resume_opts= 60 | if [ ! -z $resume ]; then 61 | resume_opts="--resume ${resume}" 62 | fi 63 | 64 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 65 | train_async_parallel.sh ${resume_opts} \ 66 | --gpu true \ 67 | --objective LFMMI \ 68 | --denom-graph ${chaindir}/den.fst \ 69 | --num-pdfs ${num_pdfs} \ 70 | --subsample ${subsampling} \ 71 | --model ChainTDNN \ 72 | --hdim 1024 \ 73 | --num-layers 13 \ 74 | --dropout 0.2 \ 75 | --prefinal-dim 192 \ 76 | --warmup 15000 \ 77 | --decay 1e-05 \ 78 | --xent 0.1 \ 79 | --l2 0.0001 \ 80 | --weight-decay 1e-07 \ 81 | --lr 0.0002 \ 82 | --batches-per-epoch ${batches_per_epoch} \ 83 | --num-epochs ${num_epochs} \ 84 | --validation-spks 0 \ 85 | --nj ${train_nj} \ 86 | "[ \ 87 | {\ 88 | 'data': '${traindir}${feat_affix}', \ 89 | 'tgt': '${targets}', \ 90 | 'batchsize': 128, 'chunk_width': 140, \ 91 | 'left_context': 10, 'right_context': 5, \ 92 | 'mean_norm': True, 'var_norm': 'norm' 93 | }\ 94 | ]" \ 95 | `dirname ${chaindir}`/${model_dirname} 96 | fi 97 | 98 | # Average the last 40 epochs 99 | if $average; then 100 | echo "Averaging the last few epochs ..." 101 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 200 240 102 | fi 103 | -------------------------------------------------------------------------------- /librispeech100/run-wrn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=4 8 | traindir=data/train_100h 9 | feat_affix=_fbank 10 | chaindir=exp/chain_wrn 11 | num_leaves=3500 12 | model_dirname=wrn 13 | batches_per_epoch=250 14 | num_epochs=240 15 | train_nj_init=1 16 | train_nj_final=4 17 | perturb="gauss 0.1" 18 | leaky_hmm=0.1 19 | resume= 20 | num_split=20 # number of splits for memory-mapped data for training 21 | average=true 22 | 23 | . ./utils/parse_options.sh 24 | 25 | set -euo pipefail 26 | 27 | tree=${chaindir}/tree 28 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 29 | trainname=`basename ${traindir}` 30 | 31 | if [ $stage -le 1 ]; then 32 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 33 | lang=data/lang_chain 34 | cp -r data/lang $lang 35 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 36 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 37 | # Use our special topology... note that later on may have to tune this 38 | # topology. 
39 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 40 | 41 | steps/nnet3/chain/build_tree.sh \ 42 | --frame-subsampling-factor ${subsampling} \ 43 | --context-opts "--context-width=2 --central-position=1" \ 44 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 45 | $lang exp/tri3_ali_${trainname} ${tree} 46 | 47 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 48 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 49 | 50 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 51 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 52 | 53 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 54 | fi 55 | 56 | if [ $stage -le 2 ]; then 57 | echo "Dumping memory mapped features ..." 58 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 59 | fi 60 | 61 | # Multigpu training of Chain-WideResNet with optimizer state averaging 62 | if [ $stage -le 3 ]; then 63 | resume_opts= 64 | if [ ! -z $resume ]; then 65 | resume_opts="--resume ${resume}" 66 | fi 67 | 68 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 69 | idim=$(feat-to-dim scp:${traindir}${feat_affix}/feats.scp -) 70 | train_async_parallel.sh ${resume_opts} \ 71 | --gpu true \ 72 | --objective LFMMI \ 73 | --denom-graph ${chaindir}/den.fst \ 74 | --num-pdfs ${num_pdfs} \ 75 | --idim ${idim} \ 76 | --subsample ${subsampling} \ 77 | --model ChainWideResnet \ 78 | --depth 28 \ 79 | --width 10 \ 80 | --warmup 15000 \ 81 | --decay 1e-05 \ 82 | --xent 0.1 \ 83 | --l2 0.0001 \ 84 | --leaky-hmm ${leaky_hmm} \ 85 | --weight-decay 1e-07 \ 86 | --lr 0.0001 \ 87 | --batches-per-epoch ${batches_per_epoch} \ 88 | --num-epochs ${num_epochs} \ 89 | --nj-init ${train_nj_init} \ 90 | --nj-final ${train_nj_final} \ 91 | "[ \ 92 | {\ 93 | 'data': '${traindir}${feat_affix}', \ 94 | 'tgt': '${targets}', \ 95 | 'batchsize': 32, 'chunk_width': 140, \ 96 | 'left_context': 10, 'right_context': 5, 'num_repeats': 1, \ 97 | 'mean_norm': True, 'var_norm': True, 'perturb_type': '${perturb}' 98 | }\ 99 | ]" \ 100 | `dirname ${chaindir}`/${model_dirname} 101 | fi 102 | 103 | # Average the last 40 epochs 104 | if $average; then 105 | echo "Averaging the last few epochs ..." 106 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 200 240 107 | fi 108 | -------------------------------------------------------------------------------- /librispeech100/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # DATA-level specifications. 4 | speech_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 5 | data=./corpus 6 | data_url=www.openslr.org/resources/31 7 | lm_url=www.openslr.org/resources/11 8 | 9 | . ./cmd.sh 10 | . ./path.sh 11 | 12 | stage=0 13 | subsampling=4 14 | num_split=20 # number of splits for memory-mapped data for training 15 | 16 | . 
./utils/parse_options.sh
17 |
18 | set -euo pipefail
19 |
20 | mkdir -p $data
21 |
22 |
23 | if [ $stage -le 0 ]; then
24 |   local/download_lm.sh $lm_url $data data/local/lm
25 | fi
26 |
27 | if [ $stage -le 1 ]; then
28 |   # format the data as Kaldi data directories
29 |   local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
30 |     data/local/lm data/local/lm data/local/dict_nosp
31 |
32 |   utils/prepare_lang.sh data/local/dict_nosp \
33 |     "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
34 |
35 |   local/format_lms.sh --src-dir data/lang_nosp data/local/lm
36 | fi
37 |
38 | if [ $stage -le 2 ]; then
39 |   # Get the train-100 subset
40 |   local/data_prep.sh ${speech_data}/LibriSpeech/train-clean-100 data/train_100h
41 |   ./steps/make_mfcc.sh --cmd "$train_cmd" --nj 32 data/train_100h exp/make_mfcc/train_100h mfcc
42 |   ./utils/fix_data_dir.sh data/train_100h
43 |   ./steps/compute_cmvn_stats.sh data/train_100h
44 |   ./utils/fix_data_dir.sh data/train_100h
45 |
46 |   utils/subset_data_dir.sh --shortest data/train_100h 500 data/train_500short
47 |   utils/subset_data_dir.sh data/train_100h 5000 data/train_5k
48 |   utils/subset_data_dir.sh data/train_100h 10000 data/train_10k
49 | fi
50 |
51 | # train a monophone system
52 | if [ $stage -le 3 ]; then
53 |   steps/train_mono.sh --boost-silence 1.25 --nj 15 --cmd "$train_cmd" \
54 |     data/train_500short data/lang_nosp exp/mono
55 |
56 |   steps/align_si.sh --boost-silence 1.25 --nj 15 --cmd "$train_cmd" \
57 |     data/train_5k data/lang_nosp exp/mono exp/mono_ali_train_5k
58 | fi
59 |
60 | # train a first delta + delta-delta triphone system on 5k utterances
61 | if [ $stage -le 4 ]; then
62 |   steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
63 |     2000 10000 data/train_5k data/lang_nosp exp/mono_ali_train_5k exp/tri1
64 |
65 |   steps/align_si.sh --nj 15 --cmd "$train_cmd" \
66 |     data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_train_10k
67 | fi
68 |
69 | # train a second delta + delta-delta triphone system on 10k utterances
70 | if [ $stage -le 5 ]; then
71 |   steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
72 |     2500 15000 data/train_10k data/lang_nosp exp/tri1_ali_train_10k exp/tri1b
73 |
74 |   steps/align_si.sh --nj 20 --cmd "$train_cmd" \
75 |     data/train_100h data/lang_nosp exp/tri1b exp/tri1b_ali_train_100h
76 | fi
77 |
78 | # train an LDA+MLLT system.
79 | if [ $stage -le 6 ]; then
80 |   steps/train_lda_mllt.sh --cmd "$train_cmd" \
81 |     --splice-opts "--left-context=3 --right-context=3" 4200 40000 \
82 |     data/train_100h data/lang_nosp exp/tri1b_ali_train_100h exp/tri2
83 |
84 |   # Align utts using the tri2 model
85 |   steps/align_si.sh --nj 20 --cmd "$train_cmd" --use-graphs true \
86 |     data/train_100h data/lang_nosp exp/tri2 exp/tri2_ali_train_100h
87 | fi
88 |
89 | # Train tri3, which is LDA+MLLT+SAT
90 | if [ $stage -le 7 ]; then
91 |   steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
92 |     data/train_100h data/lang_nosp exp/tri2_ali_train_100h exp/tri3
93 | fi
94 |
95 | # Now we compute the pronunciation and silence probabilities from training data,
96 | # and re-create the lang directory.
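# get_prons.sh counts, from the tri3 alignments, how often each pronunciation
# (and the silence before/after each word) was used; dict_dir_add_pronprobs.sh
# folds those counts into data/local/dict as pronunciation and silence
# probabilities before the lang directory is rebuilt below.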
97 | if [ $stage -le 8 ]; then
98 |   steps/get_prons.sh --cmd "$train_cmd" \
99 |     data/train_100h data/lang_nosp exp/tri3
100 |   utils/dict_dir_add_pronprobs.sh --max-normalize true \
101 |     data/local/dict_nosp \
102 |     exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \
103 |     exp/tri3/pron_bigram_counts_nowb.txt data/local/dict
104 |
105 |   utils/prepare_lang.sh data/local/dict \
106 |     "<UNK>" data/local/lang_tmp data/lang
107 |
108 |   local/format_lms.sh --src-dir data/lang data/local/lm
109 |
110 |   # Larger 3-gram LM rescoring
111 |   #utils/build_const_arpa_lm.sh \
112 |   #  data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
113 |
114 |   # 4-gram LM rescoring
115 |   utils/build_const_arpa_lm.sh \
116 |     data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
117 |
118 |   steps/align_fmllr.sh --nj 5 --cmd "$train_cmd" \
119 |     data/train_100h data/lang exp/tri3 exp/tri3_ali_train_100h
120 | fi
121 |
122 | if [ $stage -le 10 ]; then
123 |   traindir=data/train_100h
124 |   feat_affix=_fbank_64
125 |   echo "Making features for nnet training ..."
126 |   ./utils/copy_data_dir.sh ${traindir} ${traindir}${feat_affix}
127 |   ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 ${traindir}${feat_affix}
128 |   ./utils/fix_data_dir.sh ${traindir}${feat_affix}
129 |   ./steps/compute_cmvn_stats.sh ${traindir}${feat_affix}
130 |   ./utils/fix_data_dir.sh ${traindir}${feat_affix}
131 | fi
132 |
-------------------------------------------------------------------------------- /librispeech100/steps: --------------------------------------------------------------------------------
1 | ../tools/kaldi/egs/wsj/s5/steps
-------------------------------------------------------------------------------- /librispeech100/utils: --------------------------------------------------------------------------------
1 | ../tools/kaldi/egs/wsj/s5/utils
-------------------------------------------------------------------------------- /nnet_pytorch/INSTALL_PYCHAIN: --------------------------------------------------------------------------------
1 | Install kaldi tools;
2 | cd openfst;
3 | make clean;
4 | ./configure --special-flags;
5 | make; make install;
6 |
7 | export OPENFST_PATH=/path/to/openfst;
8 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64;
9 |
10 | cd pychain;
11 | cd openfst_binding; python setup.py install;
12 | cd ../pytorch_binding; python setup.py install
-------------------------------------------------------------------------------- /nnet_pytorch/IterationTypes.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2020
4 | # Apache 2.0
5 |
6 | import numpy as np
7 | import torch
8 | import torch.nn.functional as F
9 | import datasets
10 | import sys
11 |
12 |
13 | def train_epoch(args, generator, model, objective, optim, lr_sched, device='cpu'):
14 |     total_loss = 0.0
15 |     move_to = datasets.DATASETS[args.datasetname].move_to
16 |     dataset_args = eval(args.datasets)
17 |     total_num_batches = sum(
18 |         [args.batches_per_epoch * ds['num_repeats'] for ds in dataset_args]
19 |     )
20 |     total_num_updates = total_num_batches // args.delay_updates
21 |
22 |     for i, b in enumerate(generator, 1):
23 |         b = move_to(b, device)
24 |         loss, correct = objective(model, b)
25 |         if isinstance(loss, int):
26 |             continue
27 |         print(
28 |             "Iter: ", int(i / args.delay_updates), " of ", total_num_updates,
29 |             "Loss: ", loss.data.item(),
30 |             "LR: ", lr_sched.curr_lr, end=' '
31 |         )
32 |         if correct is not None:
33 |             print(" Acc: ", float(correct.data.item()) / (b.target.view(-1).size(0)), end='')
34 |         print()
35 |         total_loss += loss.data.item()
36 |         loss.backward()
37 |         del b
38 |         # Mimics multigpu training with large batches on a single gpu
39 |         if ((i % args.delay_updates) == 0):
40 |             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_thresh)
41 |             optim.step()
42 |             optim.zero_grad()
43 |             lr_sched.step(1.0)
44 |     return total_loss / args.batches_per_epoch
45 |
46 |
47 | def validate(args, generator, model, device='cpu'):
48 |     model.eval()
49 |     move_to = datasets.DATASETS[args.datasetname].move_to
50 |     with torch.no_grad():
51 |         correct = 0.0
52 |         avg_loss = 0.0
53 |         num_tokens = 0.0
54 |         for i, b in enumerate(generator):
55 |             b = move_to(b, device)
56 |             output = model(b)[0]
57 |             lprobs = F.log_softmax(output, dim=-1)
58 |             lprobs = lprobs.view(-1, lprobs.size(-1))
59 |             lprobs = lprobs[:b.target.view(-1).size(0), :]
60 |             loss = F.nll_loss(lprobs, b.target.view(-1), reduction='sum')
61 |             avg_loss += loss.data.item()
62 |             correct += torch.sum(lprobs.argmax(1) == b.target.view(-1))
63 |             num_tokens += lprobs.size(0)
64 |         avg_loss /= num_tokens
65 |         correct = 0 if num_tokens == 0 else float(correct.data.item()) / num_tokens
66 |         print()
67 |     model.train()
68 |     return avg_loss, correct
69 |
70 |
71 | def decode_dataset(args, generator, model, device='cpu'):
72 |     move_to = datasets.DATASETS[args.datasetname].move_to
73 |     for i, b in enumerate(generator):
74 |         uttname = b.metadata['name'][0]
75 |         b = move_to(b, device)
76 |         model_output = model(b)
77 |         if 'CrossEntropy' in args.objective:
78 |             # XENT system: emit log-posteriors from a softmax
79 |             lprobs = F.log_softmax(
80 |                 model_output[0], dim=-1
81 |             ).view(-1, model_output[0].size(-1))
82 |         else:
83 |             # Chain system (LFMMI variants): clamp the raw network outputs
84 |             output = model_output[0].clamp(-30, 30)
85 |             lprobs = output.contiguous().view(-1, output.size(2))
86 |
87 |         yield uttname, lprobs.detach().cpu().numpy()
88 |
89 |
90 | def decorrupt_dataset(args, generator, model, objective, device='cpu'):
91 |     move_to = datasets.DATASETS[args.datasetname].move_to
92 |     for i, b in enumerate(generator):
93 |         uttname = b.metadata['name'][0]
94 |         b = move_to(b, device)
95 |         for sgld_iter, decorrupted in enumerate(objective.decorrupt(model, b, num_steps=args.num_steps)):
96 |             yield uttname, sgld_iter, decorrupted.contiguous().view(-1, decorrupted.size(2)).detach().cpu().numpy()
97 |
98 |
99 | def evaluate_energies(args, generator, model, device='cpu'):
100 |     move_to = datasets.DATASETS[args.datasetname].move_to
101 |     for i, b in enumerate(generator, 1):
102 |         b = move_to(b, device)
103 |         model_output = model(b)
104 |         yield model_output.data.item()
105 |
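The delay-updates logic in train_epoch above is ordinary gradient accumulation: backward() is called once per minibatch so gradients sum in the .grad buffers, and only every args.delay_updates batches is a single clipped optimizer step taken, mimicking the larger effective batch of multi-GPU training. A minimal self-contained sketch of the same pattern (the toy model and data are illustrative only, not part of nnet_pytorch):

import torch
import torch.nn.functional as F

def train_accumulated(model, optim, batches, delay_updates=2, grad_thresh=5.0):
    optim.zero_grad()
    for i, (x, y) in enumerate(batches, 1):
        loss = F.mse_loss(model(x), y)
        loss.backward()  # gradients accumulate across minibatches
        if i % delay_updates == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_thresh)
            optim.step()      # one update per delay_updates minibatches
            optim.zero_grad()

model = torch.nn.Linear(8, 4)
optim = torch.optim.SGD(model.parameters(), lr=0.01)
batches = [(torch.randn(16, 8), torch.randn(16, 4)) for _ in range(6)]
train_accumulated(model, optim, batches)

Note that train_epoch sums unscaled losses across the accumulated minibatches; dividing each loss by delay_updates would instead average them, which amounts to the same thing up to a rescaled learning rate.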
-------------------------------------------------------------------------------- /nnet_pytorch/LRScheduler.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | # Copyright 2020 3 | # Apache 2.0 4 | 5 | from __future__ import print_function 6 | import sys 7 | import os 8 | import math 9 | 10 | 11 | class LRScheduler(object): 12 | @staticmethod 13 | def add_args(parser): 14 | parser.add_argument('--warmup', type=int, default=0) 15 | parser.add_argument('--decay', type=float, default=0.0) 16 | parser.add_argument('--fixed', type=int, default=0) 17 | parser.add_argument('--min-lr', type=float, default=1e-09) 18 | 19 | def __init__(self, optimizer, args): 20 | self.optimizer = optimizer 21 | self.warmup = args.warmup 22 | self.fixed = args.fixed 23 | self.decay = args.decay 24 | self.min_lr = args.min_lr 25 | 26 | self.num_warmup_updates = 0 27 | self.num_fixed_updates = 0 28 | self.num_decay_updates = 0 29 | self.lr = self.optimizer.param_groups[0]['lr'] 30 | if self.warmup > 0: 31 | self.set_lr(args.min_lr) 32 | self.curr_lr = args.min_lr 33 | else: 34 | self.curr_lr = self.lr 35 | 36 | def step(self, num_new_updates): 37 | if self.warmup > 0 and self.num_warmup_updates < self.warmup: 38 | self.num_warmup_updates += num_new_updates 39 | slope = (self.lr - self.min_lr) / float(self.warmup) 40 | new_lr = self.min_lr + slope * self.num_warmup_updates 41 | elif self.fixed > 0 and self.num_fixed_updates < self.fixed: 42 | self.num_fixed_updates += num_new_updates 43 | new_lr = self.lr 44 | else: 45 | self.num_decay_updates += num_new_updates 46 | factor = math.exp(-self.decay * self.num_decay_updates) 47 | new_lr = self.lr * factor 48 | self.set_lr(new_lr) 49 | self.curr_lr = new_lr 50 | 51 | def set_lr(self, lr): 52 | for param_group in self.optimizer.param_groups: 53 | param_group['lr'] = lr 54 | 55 | def state_dict(self): 56 | return { 57 | 'warmup': self.warmup, 58 | 'fixed': self.fixed, 59 | 'decay': self.decay, 60 | 'warmup_updates': self.num_warmup_updates, 61 | 'fixed_updates': self.num_fixed_updates, 62 | 'decay_updates': self.num_decay_updates, 63 | 'lr': self.lr, 64 | 'curr_lr': self.curr_lr, 65 | 'min_lr': self.min_lr, 66 | } 67 | 68 | def load_state_dict(self, state_dict): 69 | self.warmup = state_dict['warmup'] 70 | self.fixed = state_dict['fixed'] 71 | self.decay = state_dict['decay'] 72 | self.num_warmup_updates = state_dict['warmup_updates'] 73 | self.num_fixed_updates = state_dict['fixed_updates'] 74 | self.num_decay_updates = state_dict['decay_updates'] 75 | self.lr = state_dict['lr'] 76 | self.curr_lr = state_dict['curr_lr'] 77 | self.min_lr = state_dict['min_lr'] 78 | -------------------------------------------------------------------------------- /nnet_pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m-wiesner/nnet_pytorch/086bc45cf2f1a12197f29033a1e129f6c8b55b03/nnet_pytorch/__init__.py -------------------------------------------------------------------------------- /nnet_pytorch/batch_generators.py: -------------------------------------------------------------------------------- 1 | # Batch generators for training and inference 2 | 3 | def batches(dataset, n): 4 | for b in range(n): 5 | yield dataset.minibatch() 6 | 7 | def multiset_batches(sets, genfun, *args): 8 | ''' 9 | Alternating round-robin batches. 
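    With two sets and genfun=batches, the yield order is set1-batch1,
    set2-batch1, set1-batch2, set2-batch2, ...; a None produced by any
    generator is skipped rather than yielded, and iteration stops once the
    shortest generator is exhausted (zip semantics).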
10 | ''' 11 | # We assume the generators are of equal length 12 | for set_batches_n in zip(*[genfun(s, *args) for s in sets]): 13 | for b in set_batches_n: 14 | if b is not None: 15 | yield b 16 | 17 | def evaluation_batches(dataset): 18 | return dataset.evaluation_batches() 19 | 20 | -------------------------------------------------------------------------------- /nnet_pytorch/datasets/NnetPytorchDataset.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | class NnetPytorchDataset(object): 5 | 6 | Minibatch = namedtuple('Minibatch', ['input', 'target', 'metadata']) 7 | 8 | @staticmethod 9 | def add_args(parser): 10 | pass 11 | 12 | @classmethod 13 | def build_dataset(cls, args): 14 | raise NotImplementedError 15 | 16 | def __init__(self): 17 | pass 18 | 19 | 20 | def __len__(self): 21 | ''' 22 | Returns the total number of elements in the dataset 23 | ''' 24 | raise NotImplementedError 25 | 26 | def size(self, idx): 27 | ''' 28 | Returns some notion of size of an individual element 29 | of the dataset. 30 | ''' 31 | raise NotImplementedError( 32 | "This function should return the size of an individual element of " 33 | "the dataset." 34 | ) 35 | 36 | 37 | def minibatch(self): 38 | ''' 39 | This is effectively the collater. It defines how multiple elements 40 | of a dataset are aggregated or collated together for neural network 41 | training. 42 | ''' 43 | raise NotImplementedError( 44 | "This function should return an object that groups together " 45 | "different elements of the dataset for neural network training." 46 | ) 47 | 48 | def evaluation_batches(self): 49 | ''' 50 | This yields batches of the evaluation set. 51 | ''' 52 | raise NotImplementedError( 53 | "This function should yield batches of the eval data." 54 | ) 55 | 56 | 57 | def __getitem__(self, idx): 58 | raise NotImplementedError( 59 | "This function is used to return a formatted inputs and outputs " 60 | "for a single element from the dataset. self.minibatch() " 61 | "should make repeated calls to __getitem__ when forming " 62 | "minibatches. The argument idx can be any hashable object." 63 | ) 64 | 65 | 66 | def move_to(self, b, device): 67 | pass 68 | 69 | -------------------------------------------------------------------------------- /nnet_pytorch/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import importlib 4 | 5 | 6 | modules = glob.glob( 7 | os.path.sep.join( 8 | [os.path.dirname(__file__), '*.py'] 9 | ) 10 | ) 11 | 12 | for f in modules: 13 | if os.path.isfile(f) and '__init__.py' not in f and 'data_utils' not in f \ 14 | and 'batch_generators' not in f: 15 | module_name, ext = os.path.splitext(f) 16 | if ext == '.py': 17 | module = importlib.import_module('datasets.' + os.path.basename(module_name)) 18 | 19 | DATASETS = { 20 | 'HybridASR': HybridASR.HybridAsrDataset, 21 | } 22 | 23 | -------------------------------------------------------------------------------- /nnet_pytorch/datasets/data_utils.py: -------------------------------------------------------------------------------- 1 | import kaldi_io 2 | import numpy as np 3 | import subprocess 4 | import torch 5 | import random 6 | 7 | 8 | def memmap_feats(feats_scp, f_memmapped, utt_list, dtype=np.float32): 9 | ''' 10 | Maps the feats.scp file from kaldi to a memory mapped numpy object. 11 | This allows for fast i/o when creating window minibatches from slices 12 | of training data. 
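    Frames of each utterance are written contiguously, so a training window
    can later be read as a single slice f[offset + start : offset + end]
    without loading the rest of the archive.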
13 | 14 | input args: feats_scp, f_memmapped 15 | output: 16 | utt_lens = {'utt_n': # utt_n frames, ... } 17 | offsets = {'utt_n': utt_n offset in memory mapped numpy file} 18 | data_shape = (#frames, feature_dimension) 19 | ''' 20 | # First get the total lengths of each utterance 21 | p = subprocess.Popen( 22 | ['feat-to-len', 'scp:' + feats_scp, 'ark,t:-'], 23 | stdout=subprocess.PIPE 24 | ) 25 | out = p.communicate() 26 | utt_lens = {} 27 | for l in out[0].split(b'\n'): 28 | if l.strip() != b'': 29 | utt_id, utt_len = l.strip().split(None, 1) 30 | utt_lens[utt_id] = int(utt_len) 31 | # Next get the dimension of the features 32 | p = subprocess.Popen(['feat-to-dim', 'scp:' + feats_scp, '-'], 33 | stdout=subprocess.PIPE 34 | ) 35 | out = p.communicate() 36 | dim = int(out[0]) 37 | # Set Data Shape 38 | data_shape = (sum(utt_lens.values()), dim) 39 | # Set up memmapped features 40 | f = np.memmap(f_memmapped, mode='w+', dtype=dtype, shape=data_shape) 41 | offsets = {} 42 | offset = 0 43 | for i, (k, m) in enumerate(kaldi_io.read_mat_scp(feats_scp)): 44 | print('Utterance ', i, ' : ', k) 45 | if k not in utt_list: 46 | continue; 47 | m = m.astype(dtype) 48 | offsets[k.encode()] = offset 49 | new_offset = offset + utt_lens[k.encode()] 50 | f[offset:new_offset, :] = m 51 | offset = new_offset 52 | print() 53 | del f 54 | return utt_lens, offsets, data_shape 55 | 56 | 57 | def get_targets(f_targets): 58 | ''' 59 | Retrieve the targets (pdfids) corresponding to each input utterance 60 | input args: 61 | f_targets -- file pointer to the targets 62 | 63 | Format of f_targets: 64 | utt1 pdf11 pdf12 pdf13 ... 65 | utt2 pdf21 pdf22 ... 66 | utt3 ... 67 | ... 68 | output: 69 | utts = {'utt1': [pdf1, pdf2, ...], 'utt2': [pdf1, pdf1, ...]} 70 | ''' 71 | utts = {} 72 | for l in f_targets: 73 | utt_id, tgts = l.strip().split(None, 1) 74 | if utt_id not in utts: 75 | utts[utt_id.encode()] = [] 76 | for t in tgts.split(): 77 | utts[utt_id.encode()].append(int(t)) 78 | return utts 79 | 80 | 81 | def load_cmvn(filename): 82 | ''' 83 | Load the cmvn file. Requires filename. 84 | ''' 85 | gen = kaldi_io.read_mat_scp(filename) 86 | spk2cmvn = {} 87 | for k, m in gen: 88 | total = m[0, -1] 89 | spk2cmvn[k] = {'mu': m[0, :-1] / total, 'var': m[1, :-1] / total} 90 | return spk2cmvn 91 | 92 | def load_ivectors(filename): 93 | ''' 94 | Load the ivectors into a dictionary. 95 | Input argument may be an ark or scp file. 96 | ''' 97 | ivectors = {} 98 | for key, vec in kaldi_io.read_vec_flt_scp(filename): 99 | ivectors[key] = np.array(vec) 100 | return ivectors 101 | 102 | 103 | def load_utt2spk(f): 104 | ''' 105 | Load the utt2spk file. Requires an open file pointer. 106 | ''' 107 | utt2spk = {} 108 | for l in f: 109 | utt, spk = l.strip().split(None, 1) 110 | utt2spk[utt.encode()] = spk 111 | spk2utt = {} 112 | for u, s in utt2spk.items(): 113 | if s not in spk2utt: 114 | spk2utt[s] = [] 115 | spk2utt[s].append(u) 116 | return utt2spk, spk2utt 117 | 118 | 119 | def load_segments(f): 120 | ''' 121 | Load the segments file. Requires an open file pointer. 122 | ''' 123 | audio_to_segments = {} 124 | for l in f: 125 | utt, audio, start, end = l.strip().split() 126 | if audio not in audio_to_segments: 127 | audio_to_segments[audio] = [] 128 | audio_to_segments[audio].append((utt, start, end)) 129 | return audio_to_segments 130 | 131 | 132 | def load_utt_subset(f): 133 | ''' 134 | Load the subset of utterances from file point f. Use a kaldi segments 135 | file for the file f for example. 
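    Only the first whitespace-separated field of each line (the utterance id)
    is kept, encoded to bytes so the ids match the keys produced by
    memmap_feats and get_targets above.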
136 | ''' 137 | utt_subset = [] 138 | for l in f: 139 | utt_subset.append(l.strip().split(None, 1)[0].encode()) 140 | return utt_subset 141 | 142 | 143 | def perturb(x, perturb_type='none'): 144 | if perturb_type == 'none': 145 | pass 146 | elif perturb_type == 'salt_pepper': 147 | x *= torch.FloatTensor(x.size()).random_(0, 2).to(x.dtype) 148 | elif perturb_type == 'time_mask': 149 | width=4 150 | start = random.randint(0, x.size(1) - width) 151 | end = start + width 152 | mask = (torch.arange(x.size(1)) >= start) * (torch.arange(x.size(1)) < end) 153 | mask = mask[:, None].expand(x.size()) 154 | x[mask] = 0.0 155 | elif perturb_type == 'freq_mask': 156 | width=10 157 | start = random.randint(0, x.size(0) - width) 158 | end = start + width 159 | mask = (torch.arange(x.size(-1)) >= start) * (torch.arange(x.size(-1)) < end) 160 | mask = mask[None, :].expand(x.size()) 161 | x[mask] = 0.0 162 | elif perturb_type.startswith('gauss'): 163 | std = float(perturb_type.split()[1]) 164 | x += std * torch.randn_like(x) 165 | elif perturb_type.startswith('rand'): 166 | maxval = float(perturb_type.split()[1]) 167 | x.uniform_(-maxval, maxval) 168 | #return x 169 | -------------------------------------------------------------------------------- /nnet_pytorch/generate.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | # Copyright 2020 4 | # Apache 2.0 5 | 6 | import os 7 | import argparse 8 | import json 9 | import subprocess 10 | import numpy as np 11 | import torch 12 | import models 13 | import objectives 14 | from collections import namedtuple 15 | from data_utils import move_to 16 | 17 | 18 | Samples = namedtuple('Samples', ['input', 'target', 'metadata']) 19 | 20 | 21 | def main(): 22 | args = parse_arguments() 23 | print(args) 24 | 25 | # Reserve the GPU if used in decoding. 26 | if args.gpu: 27 | # USER will need to set CUDA_VISIBLE_DEVICES here 28 | cvd = subprocess.check_output(["/usr/local/bin/free-gpu", "-n", "1"]).decode().strip() 29 | os.environ['CUDA_VISIBLE_DEVICES'] = cvd 30 | 31 | device = torch.device('cuda' if args.gpu else 'cpu') 32 | reserve_variable = torch.ones(1).to(device) 33 | 34 | # Load experiment configurations so that decoding uses the same parameters 35 | # as training 36 | conf = json.load(open(args.modeldir + '/conf.1.json')) 37 | 38 | # Build the model and send to the device (cpu or gpu). Generally cpu. 
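    # The objective is rebuilt and restored along with the model because
    # sampling objectives carry state of their own (e.g. an SGLD replay
    # buffer) that generation depends on.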
39 | objective = objectives.OBJECTIVES[conf['objective']].build_objective(conf) 40 | objective.to(device) 41 | model = models.MODELS[conf['model']].build_model(conf) 42 | model.to(device) 43 | 44 | mdl = torch.load( 45 | os.path.sep.join([args.modeldir, args.modelname]), 46 | map_location=device 47 | ) 48 | objective.load_state_dict(mdl['objective']) 49 | model.load_state_dict(mdl['model']) 50 | 51 | cw = args.chunk_width 52 | cw += args.left_context + args.right_context 53 | 54 | samples = objective.generate_from_model( 55 | model, 56 | bs=args.batchsize, 57 | cw=cw, 58 | dim=args.idim, 59 | left_context=args.left_context, right_context=args.right_context, 60 | device=device, 61 | target=args.target, 62 | ) 63 | 64 | for i, s in enumerate(samples): 65 | np.save('{}/samples.{}'.format(args.dumpdir, i), s.cpu().data.numpy()) 66 | 67 | 68 | def parse_arguments(): 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--modeldir', help='model directory used for generated') 71 | parser.add_argument('--dumpdir', help='dump results here') 72 | parser.add_argument('--modelname', default='final.mdl') 73 | parser.add_argument('--gpu', action='store_true', help='Tun on gpu. This ' 74 | 'can be very slow on cpu' 75 | ) 76 | parser.add_argument('--idim', type=int, default=64, 77 | help='The input dimension of features' 78 | ) 79 | parser.add_argument('--chunk-width', type=int, default=50, 80 | help='The width of the speech chunk. The target sequence will be ' 81 | 'length chunk_width / subsample' 82 | ) 83 | parser.add_argument('--left-context', type=int, default=10, 84 | help='extra left context on the input features' 85 | ) 86 | parser.add_argument('--right-context', type=int, default=5, 87 | help='extra right context on the input features' 88 | ) 89 | parser.add_argument('--batchsize', type=int, default=32, 90 | help='number of sample to generate (just 1 minibatch)', 91 | ) 92 | parser.add_argument('--target', nargs='+', type=int, default=None) 93 | 94 | # Args specific to different components 95 | args, leftover = parser.parse_known_args() 96 | conf = json.load(open(args.modeldir + '/conf.1.json')) 97 | models.MODELS[conf['model']].add_args(parser) 98 | parser.parse_args(leftover, namespace=args) 99 | return args 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | 105 | -------------------------------------------------------------------------------- /nnet_pytorch/models/BLSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from collections import namedtuple 4 | import numpy as np 5 | 6 | 7 | class BLSTM(torch.nn.Module): 8 | ''' 9 | Bidirectional LSTM model 10 | ''' 11 | @staticmethod 12 | def add_args(parser): 13 | parser.add_argument('--blstm-hdim', type=int, default=512) 14 | parser.add_argument('--blstm-num-layers', type=int, default=4) 15 | parser.add_argument('--blstm-dropout', type=float, default=0.1) 16 | parser.add_argument('--blstm-prefinal-dim', type=int, default=256) 17 | 18 | @classmethod 19 | def build_model(cls, conf): 20 | model = BLSTM( 21 | conf['idim'], conf['num_targets'], 22 | odims=[conf['blstm_hdim'] for i in range(conf['blstm_num_layers'])], 23 | dropout=conf['blstm_dropout'], 24 | prefinal_affine_dim=conf['blstm_prefinal_dim'], 25 | subsample=conf['subsample'], 26 | batch_norm_dropout=True 27 | ) 28 | return model 29 | 30 | def __init__( 31 | self, idim, odim, 32 | odims=[512, 512, 512, 512, 512, 512], 33 | prefinal_affine_dim=512, 34 | nonlin=F.relu, 
dropout=0.1, subsample=1, batch_norm_dropout=True 35 | ): 36 | super().__init__() 37 | 38 | # Proper BLSTM layers 39 | self.batch_norm_dropout = batch_norm_dropout 40 | self.dropout = dropout 41 | self.nonlin = nonlin 42 | self.subsample = subsample 43 | self.blstm = torch.nn.ModuleList() 44 | self.batchnorm = torch.nn.ModuleList() 45 | 46 | next_input_dim = idim 47 | for cur_odim in odims: 48 | self.blstm.append( 49 | torch.nn.LSTM( 50 | next_input_dim, cur_odim//2, 1, 51 | batch_first=True, bidirectional=True 52 | ) 53 | ) 54 | self.batchnorm.append( 55 | torch.nn.BatchNorm1d(cur_odim, eps=1e-03, affine=False) 56 | ) 57 | next_input_dim = cur_odim 58 | 59 | # Last few layers 60 | self.prefinal_affine = torch.nn.Linear( 61 | next_input_dim, prefinal_affine_dim, 62 | ) 63 | self.batchnorm.append( 64 | torch.nn.BatchNorm1d( 65 | prefinal_affine_dim, eps=1e-03, affine=False 66 | ) 67 | ) 68 | self.final_affine = torch.nn.Linear( 69 | prefinal_affine_dim, odim, 70 | ) 71 | 72 | def forward(self, sample): 73 | xs_pad = sample.input 74 | left_context = sample.metadata['left_context'] 75 | right_context = sample.metadata['right_context'] 76 | 77 | # Basic pattern is (blstm, relu, batchnorm, dropout) x num_layers 78 | for blstm, batchnorm in zip(self.blstm, self.batchnorm[:-1]): 79 | xs_pad = blstm(xs_pad)[0].transpose(0,1) 80 | xs_pad = self.nonlin(xs_pad) 81 | if not self.batch_norm_dropout: 82 | xs_pad = batchnorm(xs_pad) 83 | xs_pad = F.dropout(xs_pad, p=self.dropout, training=self.training) 84 | 85 | # A few final layers 86 | end_idx = xs_pad.size(1) if right_context == 0 else -right_context 87 | output2 = xs_pad[:, left_context:end_idx:self.subsample, :] 88 | xs_pad = self.nonlin(self.prefinal_affine(xs_pad)) 89 | if not self.batch_norm_dropout: 90 | xs_pad = self.batchnorm[-1](xs_pad) 91 | 92 | # This is basically just glue 93 | output = self.final_affine(xs_pad) 94 | return ( 95 | output[:, left_context:end_idx:self.subsample, :], 96 | output2, 97 | ) 98 | 99 | 100 | class ChainBLSTM(BLSTM): 101 | @classmethod 102 | def build_model(cls, conf): 103 | model = ChainBLSTM( 104 | conf['idim'], conf['num_targets'], 105 | odims=[conf['blstm_hdim'] for i in range(conf['blstm_num_layers'])], 106 | dropout=conf['blstm_dropout'], 107 | prefinal_affine_dim=conf['blstm_prefinal_dim'], 108 | subsample=conf['subsample'], 109 | batch_norm_dropout=True 110 | ) 111 | return model 112 | 113 | def __init__( 114 | self, idim, odim, 115 | odims=[512, 512, 512, 512, 512, 512], 116 | prefinal_affine_dim=512, 117 | nonlin=F.relu, dropout=0.1, subsample=1, batch_norm_dropout=True 118 | ): 119 | super().__init__( 120 | idim, odim, odims, prefinal_affine_dim, 121 | nonlin, dropout, subsample 122 | ) 123 | self.prefinal_xent = torch.nn.Linear( 124 | odims[-1], 125 | prefinal_affine_dim, 126 | ) 127 | self.xent_batchnorm = torch.nn.BatchNorm1d( 128 | prefinal_affine_dim, 129 | eps=1e-03, affine=False 130 | ) 131 | self.xent_layer = torch.nn.Linear(prefinal_affine_dim, odim) 132 | 133 | def forward(self, xs_pad): 134 | output, xs_pad = super().forward(xs_pad) 135 | if self.training: 136 | xs_pad = self.nonlin(self.prefinal_xent(xs_pad)) 137 | if not self.batch_norm_dropout: 138 | xs_pad = self.xent_batchnorm(xs_pad) 139 | xs_pad = self.xent_layer(xs_pad) 140 | return output, xs_pad 141 | 142 | -------------------------------------------------------------------------------- /nnet_pytorch/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 
import importlib 4 | 5 | 6 | modules = glob.glob( 7 | os.path.sep.join( 8 | [os.path.dirname(__file__), '*.py'] 9 | ) 10 | ) 11 | 12 | for f in modules: 13 | if os.path.isfile(f) and '__init__.py' not in f and 'norms.py' not in f: 14 | module_name, ext = os.path.splitext(f) 15 | if ext == '.py': 16 | module = importlib.import_module('models.' + os.path.basename(module_name)) 17 | 18 | MODELS = { 19 | 'TDNN': TDNN.TDNN, 20 | 'ChainTDNN': TDNN.ChainTDNN, 21 | 'Resnet': Resnet.SpeechResnet, 22 | 'ChainResnet': Resnet.ChainSpeechResnet, 23 | 'WideResnet': WideResnet.SpeechResnet, 24 | 'ChainWideResnet': WideResnet.ChainSpeechResnet, 25 | 'BLSTM': BLSTM.BLSTM, 26 | 'ChainBLSTM': BLSTM.ChainBLSTM 27 | } 28 | 29 | def build_model(modelname, conf): 30 | return MODELS[modelname].build_model(conf) 31 | -------------------------------------------------------------------------------- /nnet_pytorch/objectives/AcceleratedSGLD.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | from .optimizer import Optimizer, required 4 | 5 | 6 | class AcceleratedSGLD(Optimizer): 7 | r"""Implements stochastic gradient descent (optionally with momentum). 8 | 9 | Nesterov momentum is based on the formula from 10 | `On the importance of initialization and momentum in deep learning`__. 11 | 12 | Args: 13 | params (iterable): iterable of parameters to optimize or dicts defining 14 | parameter groups 15 | lr (float): learning rate 16 | momentum (float, optional): momentum factor (default: 0) 17 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 18 | dampening (float, optional): dampening for momentum (default: 0) 19 | nesterov (bool, optional): enables Nesterov momentum (default: False) 20 | 21 | Example: 22 | >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) 23 | >>> optimizer.zero_grad() 24 | >>> loss_fn(model(input), target).backward() 25 | >>> optimizer.step() 26 | 27 | __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf 28 | 29 | .. note:: 30 | The implementation of SGD with Momentum/Nesterov subtly differs from 31 | Sutskever et. al. and implementations in some other frameworks. 32 | 33 | Considering the specific case of Momentum, the update can be written as 34 | 35 | .. math:: 36 | v = \rho * v + g \\ 37 | p = p - lr * v 38 | 39 | where p, g, v and :math:`\rho` denote the parameters, gradient, 40 | velocity, and momentum respectively. 41 | 42 | This is in contrast to Sutskever et. al. and 43 | other frameworks which employ an update of the form 44 | 45 | .. math:: 46 | v = \rho * v + lr * g \\ 47 | p = p - v 48 | 49 | The Nesterov version is analogously modified. 
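    Relative to plain SGD, step() below rescales the learning rate by a
    factor chosen so that the objective is pushed toward a target value
    finalval (overshooting it by rel_overshoot so the gradient stays useful),
    divides the update by a per-sample replay correction, and adds Langevin
    noise whose standard deviation shrinks with that same correction.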
50 | """ 51 | 52 | def __init__(self, params, finalval, lr=required, momentum=0, dampening=0, 53 | weight_decay=0, nesterov=False, stepscale=1.0, noise=0.005, 54 | rel_overshoot=0.1, epsilon=0.00005, 55 | ): 56 | if lr is not required and lr < 0.0: 57 | raise ValueError("Invalid learning rate: {}".format(lr)) 58 | if momentum < 0.0: 59 | raise ValueError("Invalid momentum value: {}".format(momentum)) 60 | if weight_decay < 0.0: 61 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 62 | 63 | defaults = dict(lr=lr, momentum=momentum, dampening=dampening, 64 | weight_decay=weight_decay, nesterov=nesterov) 65 | if nesterov and (momentum <= 0 or dampening != 0): 66 | raise ValueError("Nesterov momentum requires a momentum and zero dampening") 67 | super(AcceleratedSGLD, self).__init__(params, defaults) 68 | self.noise = noise 69 | self.stepscale = stepscale 70 | # Shoot for 10% better (helps with gradient) 71 | if finalval >= 0: 72 | self.final_val = finalval * (1 - rel_overshoot) 73 | else: 74 | self.final_val = finalval * (1 + rel_overshoot) 75 | self.epsilon = epsilon 76 | 77 | def __setstate__(self, state): 78 | super(SGD, self).__setstate__(state) 79 | for group in self.param_groups: 80 | group.setdefault('nesterov', False) 81 | 82 | def langevin_noise(self, x, std=1.0): 83 | return self.noise * torch.randn_like(x).mul_(std) 84 | 85 | def step(self, startval=None, numsteps=None): 86 | """Performs a single optimization step. 87 | 88 | Arguments: 89 | closure (callable, optional): A closure that reevaluates the model 90 | and returns the loss. 91 | """ 92 | loss = None 93 | for group in self.param_groups: 94 | weight_decay = group['weight_decay'] 95 | for p in group['params']: 96 | if p.grad is None: 97 | continue 98 | grad_norm = max(self.epsilon, (p.grad.data ** 2.0).sum()) 99 | #print("Grad Norm: ", grad_norm.data.item()) 100 | if grad_norm <= self.epsilon: 101 | print("Small Grad Norm!!") 102 | grad_norm = self.epsilon 103 | #opt_lr = abs(self.final_val - startval)/grad_norm 104 | opt_lr = (self.final_val - startval)/grad_norm 105 | #print("Final Value: ", self.final_val, " -- Opt LR: ", opt_lr) 106 | # When we are below the requested value, we can just descend at 107 | # at a normal pace ... 
108 |                 opt_lr = self.epsilon / grad_norm if opt_lr > 0 else -opt_lr
109 |                 d_p = p.grad.data
110 |                 if weight_decay != 0:
111 |                     d_p.add_(p.data, alpha=weight_decay)
112 |
113 |                 replay_correction = numsteps[:, None, None] ** self.stepscale
114 |                 langevin_std = 1.0 / replay_correction
115 |
116 |                 self.state[p]['update'] = self.langevin_noise(p.data, std=langevin_std).add_(
117 |                     d_p.div_(replay_correction),
118 |                     alpha=-group['lr'] * opt_lr,
119 |                 )
120 |                 p.data.add_(self.state[p]['update'])
121 |                 self.state[p]['opt_lr'] = opt_lr
122 |         return loss
123 |
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/CrossEntropy.py: --------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | import torch
3 |
4 |
5 | class CrossEntropy(torch.nn.Module):
6 |     @staticmethod
7 |     def add_args(parser):
8 |         pass
9 |
10 |     @classmethod
11 |     def build_objective(cls, conf):
12 |         return CrossEntropy()
13 |
14 |     @classmethod
15 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
16 |         return s1
17 |
18 |     def __init__(self):
19 |         super(CrossEntropy, self).__init__()
20 |
21 |     def forward(self, model, sample, precomputed=None):
22 |         if precomputed is not None:
23 |             output = precomputed
24 |         else:
25 |             output = model(sample)[0]
26 |
27 |         lprobs = F.log_softmax(output, dim=-1)
28 |         lprobs = lprobs.view(-1, lprobs.size(-1))
29 |         loss = F.nll_loss(lprobs, sample.target.view(-1), reduction='mean')
30 |         correct = torch.sum(lprobs.argmax(1) == sample.target.view(-1))
31 |         return loss, correct
32 |
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/L2.py: --------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | import torch
3 |
4 |
5 | class L2(torch.nn.Module):
6 |     @staticmethod
7 |     def add_args(parser):
8 |         pass
9 |
10 |     @classmethod
11 |     def build_objective(cls, conf):
12 |         return L2()
13 |
14 |     @classmethod
15 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
16 |         return s1
17 |
18 |     def __init__(self):
19 |         super(L2, self).__init__()
20 |
21 |     def forward(self, model, sample, precomputed=None):
22 |         if precomputed is not None:
23 |             x = precomputed
24 |         else:
25 |             x = model(sample)[0]
26 |
27 |         loss = ((x ** 2).sum()) / (x.size(0) * x.size(1))
28 |         return loss, None
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/LFMMI.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from .L2 import L2
5 | from .CrossEntropy import CrossEntropy
6 | from .LFMMIOnly import ChainLoss as LFMMI
7 |
8 |
9 | class ChainLoss(nn.Module):
10 |     @staticmethod
11 |     def add_args(parser):
12 |         parser.add_argument('--xent-reg', type=float, default=0.2)
13 |         parser.add_argument('--l2-reg', type=float, default=0.00025)
14 |         for m in [L2, CrossEntropy, LFMMI]:
15 |             m.add_args(parser)
16 |
17 |     @classmethod
18 |     def build_objective(cls, conf):
19 |         return ChainLoss(
20 |             conf['denom_graph'],
21 |             xent_reg=conf['xent_reg'],
22 |             l2_reg=conf['l2_reg'],
23 |             leaky_hmm=conf.get('leaky_hmm', 0.1),
24 |         )
25 |
26 |     @classmethod
27 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
28 |         return s1
29 |
30 |     def __init__(
31 |         self, den_graph,
32 |         xent_reg=0.2, l2_reg=0.00025, avg=True, leaky_hmm=0.1,
33 |     ):
34 | super(ChainLoss, self).__init__() 35 | self.lfmmi = LFMMI(den_graph, leaky_hmm=leaky_hmm) 36 | self.xent = CrossEntropy() 37 | self.l2 = L2() 38 | 39 | self.l2_reg = l2_reg 40 | self.xent_reg = xent_reg 41 | 42 | def forward(self, model, sample, precomputed=None): 43 | if precomputed is not None: 44 | chain_output = precomputed 45 | else: 46 | chain_output = model(sample) 47 | 48 | losses = [] 49 | correct = None 50 | # LFMMI 51 | loss_lfmmi, _ = self.lfmmi( 52 | model, 53 | sample, 54 | precomputed=chain_output[0], 55 | ) 56 | print('LFMMI: {}'.format(loss_lfmmi.data.item()), end=' ') 57 | losses.append(loss_lfmmi) 58 | # XENT 59 | if self.xent_reg > 0: 60 | loss_xent, correct = self.xent( 61 | model, 62 | sample, 63 | precomputed=chain_output[1], 64 | ) 65 | loss_xent *= self.xent_reg 66 | print('XENT: {}'.format(loss_xent.data.item()), end=' ') 67 | losses.append(loss_xent) 68 | 69 | # L2 70 | if self.l2_reg > 0: 71 | loss_l2, _ = self.l2( 72 | model, 73 | sample, 74 | precomputed=chain_output[0], 75 | ) 76 | loss_l2 *= self.l2_reg 77 | print('L2: {}'.format(loss_l2.data.item()), end=' ') 78 | losses.append(loss_l2) 79 | 80 | loss = sum(losses) 81 | return loss, correct 82 | 83 | 84 | -------------------------------------------------------------------------------- /nnet_pytorch/objectives/LFMMIOnly.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | # Copyright 2020 4 | # Apache 2.0 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from .pychain.pychain.graph import ChainGraphBatch, ChainGraph 10 | import pychain_C 11 | import simplefst 12 | from .pychain.pychain.chain import ChainFunction 13 | 14 | 15 | class NumeratorFunction(torch.autograd.Function): 16 | @staticmethod 17 | def forward(ctx, input, targets): 18 | input = input.clamp(-30, 30) 19 | output = input.gather(2, targets.unsqueeze(2)).sum() 20 | B = input.size(0) 21 | num_grad = torch.zeros_like(input) 22 | num_grad.scatter_(2, targets.unsqueeze(2), 1.0) 23 | ctx.save_for_backward(num_grad) 24 | return output 25 | 26 | @staticmethod 27 | def backward(ctx, objf_grad): 28 | num_grad, = ctx.saved_tensors 29 | num_grad = torch.mul(num_grad, objf_grad) 30 | return num_grad, None 31 | 32 | 33 | class ChainLoss(nn.Module): 34 | @staticmethod 35 | def add_args(parser): 36 | parser.add_argument('--denom-graph', required=True) 37 | parser.add_argument('--leaky-hmm', type=float, default=0.1) 38 | 39 | @classmethod 40 | def build_objective(cls, conf): 41 | return ChainLoss( 42 | conf['denom_graph'], 43 | avg=True, 44 | leaky_hmm=conf.get('leaky_hmm', 0.1) 45 | ) 46 | 47 | @classmethod 48 | def add_state_dict(cls, s1, s2, fraction, iteration=None): 49 | return s1 50 | 51 | def __init__(self, den_graph, avg=True, leaky_hmm=0.1): 52 | super(ChainLoss, self).__init__() 53 | self.den_graph = ChainGraph( 54 | fst=simplefst.StdVectorFst.read(den_graph), 55 | ) 56 | self.avg = avg 57 | self.leaky_hmm = leaky_hmm 58 | 59 | def forward(self, model, sample, precomputed=None): 60 | B = sample.input.size(0) # batchsize 61 | den_graphs = ChainGraphBatch(self.den_graph, B) 62 | 63 | # Check if we are using precomputed values 64 | if precomputed is not None: 65 | x = precomputed 66 | else: 67 | x = model(sample)[0] 68 | 69 | T = x.size(1) # Length 70 | x_lengths = torch.LongTensor([T] * B).to(x.device) 71 | den_objf = ChainFunction.apply(x, x_lengths, den_graphs, self.leaky_hmm) 72 | num_objf = NumeratorFunction.apply(x, 
sample.target)
73 |         loss = -(num_objf - den_objf)
74 |         if self.avg:
75 |             loss /= (B * T)
76 |         return loss, None
77 | 
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/SGLD.py: --------------------------------------------------------------------------------
1 | import torch
2 | from .optimizer import Optimizer, required
3 | 
4 | 
5 | class SGLD(Optimizer):
6 |     r"""Implements SGLD: stochastic gradient descent (optionally with momentum) with Langevin noise added to each update.
7 | 
8 |     Nesterov momentum is based on the formula from
9 |     `On the importance of initialization and momentum in deep learning`__.
10 | 
11 |     Args:
12 |         params (iterable): iterable of parameters to optimize or dicts defining
13 |             parameter groups
14 |         lr (float): learning rate
15 |         momentum (float, optional): momentum factor (default: 0)
16 |         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
17 |         dampening (float, optional): dampening for momentum (default: 0)
18 |         nesterov (bool, optional): enables Nesterov momentum (default: False)
19 | 
20 |     Example:
21 |         >>> optimizer = SGLD(model.parameters(), lr=0.1, momentum=0.9)
22 |         >>> optimizer.zero_grad()
23 |         >>> loss_fn(model(input), target).backward()
24 |         >>> optimizer.step()
25 | 
26 |     __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
27 | 
28 |     .. note::
29 |         The implementation of SGD with Momentum/Nesterov subtly differs from
30 |         Sutskever et al. and implementations in some other frameworks.
31 | 
32 |         Considering the specific case of Momentum, the update can be written as
33 | 
34 |         .. math::
35 |                   v = \rho * v + g \\
36 |                   p = p - lr * v
37 | 
38 |         where p, g, v and :math:`\rho` denote the parameters, gradient,
39 |         velocity, and momentum respectively.
40 | 
41 |         This is in contrast to Sutskever et al. and
42 |         other frameworks which employ an update of the form
43 | 
44 |         .. math::
45 |              v = \rho * v + lr * g \\
46 |              p = p - v
47 | 
48 |         The Nesterov version is analogously modified.
49 |     """
50 | 
51 |     def __init__(self, params, lr=required, momentum=0, dampening=0,
52 |                  weight_decay=0, nesterov=False, clamp=1.0, stepscale=1.0, noise=0.005):
53 |         if lr is not required and lr < 0.0:
54 |             raise ValueError("Invalid learning rate: {}".format(lr))
55 |         if momentum < 0.0:
56 |             raise ValueError("Invalid momentum value: {}".format(momentum))
57 |         if weight_decay < 0.0:
58 |             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
59 | 
60 |         defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
61 |                         weight_decay=weight_decay, nesterov=nesterov)
62 |         if nesterov and (momentum <= 0 or dampening != 0):
63 |             raise ValueError("Nesterov momentum requires a momentum and zero dampening")
64 |         super(SGLD, self).__init__(params, defaults)
65 |         self.noise = noise
66 |         self.stepscale = stepscale
67 |         self.clamp = clamp
68 | 
69 |     def __setstate__(self, state):
70 |         super(SGLD, self).__setstate__(state)
71 |         for group in self.param_groups:
72 |             group.setdefault('nesterov', False)
73 | 
74 |     def langevin_noise(self, x, std=1.0):
75 |         return self.noise * torch.randn_like(x).mul_(std)
76 | 
77 |     def step(self, numsteps=None, closure=None):
78 |         """Performs a single optimization step.
79 | 
80 |         Arguments:
81 |             closure (callable, optional): A closure that reevaluates the model
82 |                 and returns the loss.
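  |             numsteps (Tensor): per-sample replay-buffer step counts; the
  |                 gradient term below is divided by numsteps**stepscale, so
  |                 samples that have been refined for longer take smaller
  |                 steps. (Description inferred from the code; this argument
  |                 was previously undocumented.)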
83 | """ 84 | loss = None 85 | if closure is not None: 86 | loss = closure() 87 | 88 | for group in self.param_groups: 89 | weight_decay = group['weight_decay'] 90 | momentum = group['momentum'] 91 | dampening = group['dampening'] 92 | nesterov = group['nesterov'] 93 | 94 | for p in group['params']: 95 | if p.grad is None: 96 | continue 97 | d_p = p.grad.data 98 | if weight_decay != 0: 99 | d_p.add_(p.data, alpha=weight_decay) 100 | if momentum != 0: 101 | param_state = self.state[p] 102 | if 'momentum_buffer' not in param_state: 103 | buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() 104 | else: 105 | buf = param_state['momentum_buffer'] 106 | buf.mul_(momentum).add_(d_p, alpha=1-dampening) 107 | if nesterov: 108 | d_p = d_p.add(momentum, buf) 109 | else: 110 | d_p = buf 111 | replay_correction = numsteps[:, None, None] ** self.stepscale 112 | langevin_std = 1.0 #/ replay_correction 113 | p.data.add_( 114 | self.langevin_noise(p.data, std=langevin_std).add_( 115 | d_p.div_(replay_correction), 116 | alpha=-group['lr'], 117 | ) 118 | ) 119 | 120 | return loss 121 | -------------------------------------------------------------------------------- /nnet_pytorch/objectives/SGLDAdam.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from .optimizer import Optimizer 4 | 5 | 6 | class SGLDAdam(Optimizer): 7 | r"""Implements Adam algorithm. 8 | 9 | It has been proposed in `Adam: A Method for Stochastic Optimization`_. 10 | 11 | Arguments: 12 | params (iterable): iterable of parameters to optimize or dicts defining 13 | parameter groups 14 | lr (float, optional): learning rate (default: 1e-3) 15 | betas (Tuple[float, float], optional): coefficients used for computing 16 | running averages of gradient and its square (default: (0.9, 0.999)) 17 | eps (float, optional): term added to the denominator to improve 18 | numerical stability (default: 1e-8) 19 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 20 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 21 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 22 | (default: False) 23 | 24 | .. _Adam\: A Method for Stochastic Optimization: 25 | https://arxiv.org/abs/1412.6980 26 | .. 
26 |     .. _On the Convergence of Adam and Beyond:
27 |         https://openreview.net/forum?id=ryQu7f-RZ
28 |     """
29 | 
30 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
31 |                  weight_decay=0, amsgrad=False, noise=0.005, stepscale=1.0):
32 |         if not 0.0 <= lr:
33 |             raise ValueError("Invalid learning rate: {}".format(lr))
34 |         if not 0.0 <= eps:
35 |             raise ValueError("Invalid epsilon value: {}".format(eps))
36 |         if not 0.0 <= betas[0] < 1.0:
37 |             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
38 |         if not 0.0 <= betas[1] < 1.0:
39 |             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
40 |         if not 0.0 <= weight_decay:
41 |             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
42 |         defaults = dict(lr=lr, betas=betas, eps=eps,
43 |                         weight_decay=weight_decay, amsgrad=amsgrad)
44 |         super(SGLDAdam, self).__init__(params, defaults)
45 |         self.noise = noise
46 |         self.stepscale = stepscale
47 | 
48 |     def __setstate__(self, state):
49 |         super(SGLDAdam, self).__setstate__(state)
50 |         for group in self.param_groups:
51 |             group.setdefault('amsgrad', False)
52 | 
53 |     def langevin_noise(self, x, std=1.0):
54 |         return self.noise * torch.randn_like(x).mul_(std)
55 | 
56 |     @torch.no_grad()
57 |     def step(self, numsteps=None, closure=None):
58 |         """Performs a single optimization step.
59 | 
60 |         Arguments:
61 |             closure (callable, optional): A closure that reevaluates the model
62 |                 and returns the loss.
63 |         """
64 |         loss = None
65 |         if closure is not None:
66 |             with torch.enable_grad():
67 |                 loss = closure()
68 | 
69 |         for group in self.param_groups:
70 |             for p in group['params']:
71 |                 if p.grad is None:
72 |                     continue
73 |                 grad = p.grad
74 |                 if grad.is_sparse:
75 |                     raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
76 |                 amsgrad = group['amsgrad']
77 | 
78 |                 state = self.state[p]
79 | 
80 |                 # State initialization
81 |                 if len(state) == 0:
82 |                     state['step'] = 0
83 |                     # Exponential moving average of gradient values
84 |                     state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
85 |                     # Exponential moving average of squared gradient values
86 |                     state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
87 |                     if amsgrad:
88 |                         # Maintains max of all exp. moving avg. of sq. grad. values
89 |                         state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
90 | 
91 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
92 |                 if amsgrad:
93 |                     max_exp_avg_sq = state['max_exp_avg_sq']
94 |                 beta1, beta2 = group['betas']
95 | 
96 |                 state['step'] += 1
97 |                 bias_correction1 = 1 - beta1 ** state['step']
98 |                 bias_correction2 = 1 - beta2 ** state['step']
99 | 
100 |                 if group['weight_decay'] != 0:
101 |                     grad = grad.add(p, alpha=group['weight_decay'])
102 | 
103 |                 # Decay the first and second moment running average coefficient
104 |                 exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
105 |                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
106 |                 if amsgrad:
107 |                     # Maintains the maximum of all 2nd moment running avg. till now
108 |                     torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
109 |                     # Use the max. for normalizing running avg. of gradient
110 |                     denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
111 |                 else:
112 |                     denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
113 | 
114 |                 step_size = group['lr'] / bias_correction1
115 |                 replay_correction = numsteps[:, None, None] ** self.stepscale
116 |                 langevin_std = 1.0  #/ replay_correction
117 | 
118 |                 p.add_(self.langevin_noise(p.data, std=langevin_std))
119 |                 p.addcdiv_(exp_avg, denom.mul_(replay_correction), value=-step_size)
120 | 
121 |         return loss
122 | 
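  | # A minimal usage sketch (added; all names here are hypothetical): `feats`
  | # is a batch of buffer samples being refined by Langevin dynamics, and
  | # `counts` records how many SGLD steps each sample has already taken.
  | #
  | #   feats = torch.randn(8, 100, 80, requires_grad=True)
  | #   opt = SGLDAdam([feats], lr=1e-3, noise=0.005, stepscale=1.0)
  | #   energy(feats).sum().backward()  # `energy`: some scalar model score
  | #   opt.step(numsteps=counts)       # counts: FloatTensor of shape (8,)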
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/SemisupLFMMI.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from .L2 import L2
5 | from .CrossEntropy import CrossEntropy
6 | from .LFMMI_EBM import SequenceEBMLoss as SeqEBM
7 | 
8 | 
9 | class ChainLoss(nn.Module):
10 |     @staticmethod
11 |     def add_args(parser):
12 |         parser.add_argument('--xent-reg', type=float, default=0.2)
13 |         parser.add_argument('--l2-reg', type=float, default=0.00025)
14 |         for m in [L2, CrossEntropy, SeqEBM]:
15 |             m.add_args(parser)
16 | 
17 |     @classmethod
18 |     def build_objective(cls, conf):
19 |         seq_ebm = SeqEBM.build_objective(conf)
20 |         return ChainLoss(
21 |             seq_ebm,
22 |             xent_reg=conf['xent_reg'],
23 |             l2_reg=conf['l2_reg'],
24 |         )
25 | 
26 |     @classmethod
27 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
28 |         return {
29 |             'seq_ebm': SeqEBM.add_state_dict(
30 |                 s1['seq_ebm'], s2['seq_ebm'], fraction, iteration=iteration,
31 |             ),
32 |         }
33 | 
34 |     def __init__(
35 |         self, seq_ebm, xent_reg=0.2, l2_reg=0.00025,
36 |     ):
37 |         super(ChainLoss, self).__init__()
38 |         self.seq_ebm = seq_ebm
39 |         self.xent = CrossEntropy()
40 |         self.l2 = L2()
41 | 
42 |         self.l2_reg = l2_reg
43 |         self.xent_reg = xent_reg
44 | 
45 |     def forward(self, model, sample):
46 |         is_unsup = sample.target[0, 0] == -1
47 |         chain_output = model(sample)
48 |         losses = []
49 |         correct = None
50 |         # SeqEBM
51 |         loss_seqebm, _ = self.seq_ebm(
52 |             model,
53 |             sample,
54 |             precomputed=chain_output[0],
55 |         )
56 |         losses.append(loss_seqebm)
57 | 
58 |         # XENT
59 |         if not is_unsup and self.xent_reg > 0:
60 |             loss_xent, correct = self.xent(
61 |                 model,
62 |                 sample,
63 |                 precomputed=chain_output[1],
64 |             )
65 |             loss_xent *= self.xent_reg
66 |             print('XENT: {}'.format(loss_xent.data.item()), end=' ')
67 |             losses.append(loss_xent)
68 | 
69 |         # L2
70 |         if self.l2_reg > 0 and not is_unsup:
71 |             loss_l2, _ = self.l2(
72 |                 model,
73 |                 sample,
74 |                 precomputed=chain_output[0],
75 |             )
76 |             loss_l2 *= self.l2_reg
77 |             print('L2: {}'.format(loss_l2.data.item()), end=' ')
78 |             losses.append(loss_l2)
79 | 
80 |         loss = sum(losses)
81 |         return loss, correct
82 | 
83 |     def state_dict(self):
84 |         return {
85 |             'seq_ebm': self.seq_ebm.state_dict()
86 |         }
87 | 
88 |     def load_state_dict(self, state_dict):
89 |         # state_dict() above serializes only the seq_ebm component, so load
90 |         # only that here; calling super().load_state_dict() on this dict
91 |         # would fail with unexpected keys.
92 |         self.seq_ebm.load_state_dict(state_dict['seq_ebm'])
93 | 
94 |     def generate_from_buffer(self):
95 |         return self.seq_ebm.generate_from_buffer()
96 | 
97 |     def generate_from_model(self, model, **kwargs):
98 |         return self.seq_ebm.generate_from_model(model, **kwargs)
99 | 
100 |     def decorrupt(self, model, sample, num_steps=None):
101 |         return self.seq_ebm.decorrupt(model, sample, num_steps)
102 | 
103 | 
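  | # Note (added comment): unlabeled batches are marked by target values of
  | # -1 (see utils/prepare_unlabeled_tgt.py); for those batches only the EBM
  | # term contributes, as guarded by `is_unsup` above.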
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/__init__.py: --------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import importlib
4 | 
5 | 
6 | modules = glob.glob(
7 |     os.path.sep.join(
8 |         [os.path.dirname(__file__), '*.py']
9 |     )
10 | )
11 | 
12 | for f in modules:
13 |     if os.path.isfile(f) and '__init__.py' not in f:
14 |         module_name, ext = os.path.splitext(f)
15 |         if ext == '.py':
16 |             # importing a submodule binds its name on this package module
17 |             module = importlib.import_module('objectives.' + os.path.basename(module_name))
18 | 
19 | OBJECTIVES = {
20 |     'CrossEntropy': CrossEntropy.CrossEntropy,
21 |     'LFMMI': LFMMI.ChainLoss,
22 |     'SemisupLFMMI': SemisupLFMMI.ChainLoss,
23 |     'LFMMI_EBM': LFMMI_EBM.SequenceEBMLoss,
24 |     'CrossEntropy_EBM': CrossEntropy_EBM.EBMLoss,
25 |     'LFMMINum': LFMMIOnly.NumeratorFunction,
26 |     # only objectives whose modules exist in this directory are registered;
27 |     # entries for the absent LFMMI_MCE and SemisupMCE modules would raise
28 |     # NameError at import time.
29 | }
30 | 
31 | def build_objective(objectivename, conf):
32 |     return OBJECTIVES[objectivename].build_objective(conf)
-------------------------------------------------------------------------------- /nnet_pytorch/utils/average_models.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2020
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import json
11 | import torch
12 | import models
13 | 
14 | def main():
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument('modeldir',
17 |         help='Output model directory',
18 |         type=str,
19 |     )
20 |     parser.add_argument('idim', type=int)
21 |     parser.add_argument('start',
22 |         help='Start epoch model to average',
23 |         type=int,
24 |     )
25 |     parser.add_argument('end',
26 |         help='End epoch model to average',
27 |         type=int,
28 |     )
29 |     #parser.add_argument('--weights',
30 |     #    help='Weights for each model',
31 |     #)
32 |     args = parser.parse_args()
33 | 
34 |     conf = json.load(open(args.modeldir + '/conf.1.json'))
35 |     conf['idim'] = args.idim
36 |     new_model = models.MODELS[conf['model']].build_model(conf)
37 |     new_dict = new_model.state_dict()
38 |     for name, param in new_dict.items():
39 |         if len(param.size()) > 0:
40 |             param.mul_(0.0)
41 | 
42 |     fraction = 1.0 / (args.end - args.start + 1)
43 |     for m in range(args.start, args.end + 1):
44 |         state_dict = torch.load(
45 |             args.modeldir + '/{}.mdl'.format(m),
46 |             map_location=torch.device('cpu')
47 |         )
48 |         for name, p in state_dict['model'].items():
49 |             if name in new_dict:
50 |                 if len(p.size()) != 0:
51 |                     new_dict[name].add_(p, alpha=fraction)
52 |                 else:
53 |                     new_dict[name] = (p * fraction).type(new_dict[name].dtype)
54 |     torch.save(
55 |         {'model': new_dict},
56 |         args.modeldir + '/{}_{}.mdl'.format(args.start, args.end)
57 |     )
58 | 
59 | if __name__ == "__main__":
60 |     main()
61 | 
62 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/combine_models.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import models
11 | import objectives
12 | import torch
13 | import json
14 | from itertools import chain
15 | from copy import deepcopy
16 | import math
17 | from LRScheduler import LRScheduler
18 | from collections import defaultdict
19 | # from torch._six import container_abcs  # unused; deprecated in newer torch
20 | 
21 | 
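  | # combine_models.py averages several saved checkpoints into one: model
  | # weights are averaged directly, optimizer state is merged in
  | # update_opt_state_dict() below, and objective state is delegated to the
  | # objective's own add_state_dict().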
22 | def main():
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument('omodel', help='path to output model', type=str,)
25 |     parser.add_argument('conf', type=str)
26 |     parser.add_argument('--save-models', action='store_true')
27 |     parser.add_argument('--models', nargs='+', type=str, help='paths to models')
28 |     args = parser.parse_args()
29 | 
30 |     conf = json.load(open(args.conf))
31 |     new_model = models.MODELS[conf['model']].build_model(conf)
32 |     objective = objectives.OBJECTIVES[conf['objective']].build_objective(conf)
33 | 
34 |     params = list(
35 |         filter(
36 |             lambda p: p.requires_grad,
37 |             chain(new_model.parameters(), objective.parameters()),
38 |         )
39 |     )
40 | 
41 |     optimizers = {
42 |         'sgd': torch.optim.SGD(params, lr=conf['lr'], momentum=0.0),
43 |         'adadelta': torch.optim.Adadelta(params, lr=conf['lr']),
44 |         'adam': torch.optim.Adam(params, lr=conf['lr'], weight_decay=conf['weight_decay']),
45 |     }
46 | 
47 |     optimizer = optimizers[conf['optim']]
48 |     opt_state_dict = optimizer.state_dict()
49 | 
50 | 
51 |     new_mdl_dict = new_model.state_dict()
52 |     new_optim_dict = optimizer.state_dict()
53 |     new_objective_dict = objective.state_dict()
54 | 
55 |     for name, param in new_mdl_dict.items():
56 |         if len(param.size()) > 0:
57 |             param.mul_(0.0)
58 | 
59 |     fraction = 1.0 / (len(args.models))
60 |     for i, m in enumerate(args.models):
61 |         print("Combining Model ", i, " ...")
62 |         state_dict = torch.load(m, map_location=torch.device('cpu'))
63 |         if i == 0 and 'buffer' in state_dict:
64 |             new_buffer = torch.FloatTensor(
65 |                 state_dict['buffer'].cpu().size(0),
66 |                 state_dict['buffer'].cpu().size(1),
67 |                 state_dict['buffer'].cpu().size(2),
68 |             )
69 |             new_buffer_numsteps = torch.zeros(state_dict['buffer'].cpu().size(0))
70 | 
71 |         #----------------------- Model -------------------------
72 |         # To combine models, we just average the weights
73 |         for name, p in state_dict['model'].items():
74 |             if name in new_mdl_dict:
75 |                 if len(p.size()) != 0:
76 |                     new_mdl_dict[name].add_(p, alpha=fraction)
77 |                 else:
78 |                     new_mdl_dict[name] = (p * fraction).type(new_mdl_dict[name].dtype)
79 | 
80 |         #--------------------- Objectives ---------------------
81 |         # Combining objectives is harder: we average parameter weights where
82 |         # applicable, but for some models, such as the EBM models, we have
83 |         # to specify how to combine things like the sampling buffer. That
84 |         # combination is model specific and should therefore be written as
85 |         # a method in the objective's class. For now we have just done it
86 |         # here though.
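  |         # A minimal sketch of the weight averaging above, for two
  |         # hypothetical checkpoints a.mdl and b.mdl with the same
  |         # architecture (illustration only, not part of the original code):
  |         #
  |         #   sd_a = torch.load('a.mdl', map_location='cpu')['model']
  |         #   sd_b = torch.load('b.mdl', map_location='cpu')['model']
  |         #   avg = {k: 0.5 * sd_a[k] + 0.5 * sd_b[k] for k in sd_a}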
87 |         update_opt_state_dict(new_optim_dict, state_dict['optimizer'], fraction)
88 |         new_objective_dict = objective.add_state_dict(
89 |             new_objective_dict, state_dict['objective'],
90 |             fraction, iteration=i,
91 |         )
92 | 
93 |     new_state_dict = {
94 |         'model': new_mdl_dict,
95 |         'objective': new_objective_dict,
96 |         'optimizer': new_optim_dict,  # averaged optimizer state from update_opt_state_dict()
97 |         'lr_sched': state_dict['lr_sched'],
98 |         'epoch': state_dict['epoch'],
99 |     }
100 | 
101 |     torch.save(
102 |         new_state_dict,
103 |         args.omodel,
104 |     )
105 | 
106 |     if not args.save_models:
107 |         for m in args.models:
108 |             os.remove(m)
109 | 
110 | 
111 | def update_opt_state_dict(state_dict1, state_dict2, fraction):
112 |     '''
113 |         Update state_dict1 with state_dict2, where the merged values are
114 |         val1 + fraction * val2
115 |     '''
116 |     groups2 = state_dict2['param_groups']
117 |     groups1 = state_dict1['param_groups']
118 | 
119 |     if len(groups1) != len(groups2):
120 |         raise ValueError("state dict has a different number of parameter groups")
121 | 
122 |     param_lens = (len(g['params']) for g in groups1)
123 |     saved_lens = (len(g['params']) for g in groups2)
124 |     if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)):
125 |         raise ValueError("loaded state dict contains a parameter group that "
126 |                          "doesn't match the size of the optimizer's group")
127 | 
128 |     id_map = {p: old_id for old_id, p in
129 |               zip(chain(*(g['params'] for g in groups1)),
130 |                   chain(*(g['params'] for g in groups2)))}
131 | 
132 |     for k, v in state_dict2['state'].items():
133 |         if k in id_map:
134 |             param = id_map[k]
135 |             if param in state_dict1['state']:
136 |                 for p_name, p in v.items():
137 |                     if isinstance(p, torch.Tensor):
138 |                         state_dict1['state'][param][p_name] += fraction * p
139 |             else:
140 |                 state_dict1['state'][param] = {key: fraction * val for key, val in v.items()}
141 |         else:
142 |             for p_name, p in v.items():
143 |                 if isinstance(p, torch.Tensor):
144 |                     state_dict1['state'][k][p_name] = fraction * p
145 | 
146 | 
147 | if __name__ == "__main__":
148 |     main()
149 | 
150 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/decode_nnet_pytorch.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | . ./path.sh
4 | 
5 | batchsize=512
6 | checkpoint=final.mdl
7 | prior_scale=1.0
8 | prior_floor=-20.0
9 | prior_name="priors"
10 | min_active=200
11 | max_active=7000
12 | max_mem=50000000
13 | lattice_beam=8.0
14 | beam=15.0
15 | acoustic_scale=1.0
16 | post_decode_acwt=10.0 # 10.0 for chain systems, 1.0 for non-chain
17 | 
18 | min_lmwt=6
19 | max_lmwt=18
20 | nj=80
21 | stage=0
22 | 
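  | # Example invocation (a sketch; the paths below are hypothetical):
  | #   ./decode_nnet_pytorch.sh --batchsize 256 \
  | #       data/test_clean exp/wrn exp/chain/graph exp/wrn/decode_test_clean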
23 | . ./utils/parse_options.sh
24 | if [ $# -ne 4 ]; then
25 |   echo "Usage: ./decode_nnet_pytorch.sh <data> <pytorch_model> <graphdir> <odir>"
26 |   echo " --batchsize ${batchsize} "
27 |   echo " --checkpoint ${checkpoint} --prior-scale ${prior_scale} --prior-floor ${prior_floor} --prior-name ${prior_name}"
28 |   echo " --min-active ${min_active} --max-active ${max_active}"
29 |   echo " --max-mem ${max_mem} --lattice-beam ${lattice_beam}"
30 |   echo " --beam ${beam} --acoustic-scale ${acoustic_scale} --post-decode-acwt ${post_decode_acwt}"
31 |   echo " --nj ${nj}"
32 |   exit 1;
33 | fi
34 | 
35 | data=$1
36 | pytorch_model=$2
37 | graphdir=$3
38 | odir=$4
39 | 
40 | # We assume the acoustic model (trans.mdl) is 1 level above the graphdir
41 | amdir=`dirname ${graphdir}`
42 | trans_mdl=${amdir}/final.mdl
43 | words_file=${graphdir}/words.txt
44 | hclg=${graphdir}/HCLG.fst
45 | 
46 | mkdir -p ${odir}/log
47 | 
48 | decode_cmd="utils/queue.pl --mem 6G -l hostname='!b02*&!a*&!c06*&!c23*&!c24*&!c25*&!c26*&!c27*'" # The 'a' machines are just too slow
49 | if [ $stage -le 0 ]; then
50 |   segments=${data}/segments
51 |   if [ ! -f ${data}/segments ]; then
52 |     echo "No segments file found. Assuming wav.scp is indexed by utterance"
53 |     segments=${data}/wav.scp
54 |   fi
55 | 
56 |   ${decode_cmd} JOB=1:${nj} ${odir}/log/decode.JOB.log \
57 |     ./utils/split_scp.pl -j ${nj} \$\[JOB -1\] ${segments} \|\
58 |     decode.py --datadir ${data} \
59 |       --modeldir ${pytorch_model} \
60 |       --dumpdir ${odir} \
61 |       --checkpoint ${checkpoint} \
62 |       --prior-scale ${prior_scale} \
63 |       --prior-floor ${prior_floor} \
64 |       --prior-name ${prior_name} \
65 |       --words-file ${words_file} \
66 |       --trans-mdl ${trans_mdl} \
67 |       --hclg ${hclg} \
68 |       --min-active ${min_active} \
69 |       --max-active ${max_active} \
70 |       --lattice-beam ${lattice_beam} \
71 |       --beam ${beam} \
72 |       --acoustic-scale ${acoustic_scale} \
73 |       --post-decode-acwt ${post_decode_acwt} \
74 |       --job JOB \
75 |       --utt-subset /dev/stdin \
76 |       --batchsize ${batchsize}
77 | fi
78 | 
79 | if [ $stage -le 1 ]; then
80 |   ./local/score.sh --cmd "$decode_cmd" \
81 |     --min-lmwt ${min_lmwt} --max-lmwt ${max_lmwt} --word-ins-penalty 0.0 \
82 |     ${data} ${graphdir} ${odir}
83 | fi
-------------------------------------------------------------------------------- /nnet_pytorch/utils/memmap_data.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2020
4 | # Apache 2.0
5 | 
6 | from datasets.data_utils import memmap_feats
7 | import pickle
8 | import argparse
9 | 
10 | 
11 | def main():
12 |     parser = argparse.ArgumentParser(
13 |         description='Takes Kaldi features, converts them to numpy objects and '
14 |                     'stores a memory-mapped version for efficient access in training.'
15 |     )
16 |     parser.add_argument('feats_scp')
17 |     parser.add_argument('feats_scp_mapped')
18 |     parser.add_argument('metadata')
19 |     parser.add_argument('--utt-list', default=None)
20 | 
21 |     args = parser.parse_args()
22 |     utt_list = []
23 |     if args.utt_list is not None:
24 |         with open(args.utt_list, 'r') as f:
25 |             for line in f:
26 |                 utt_list.append(line.strip().split(None, 1)[0])
27 |     utt_lengths, offsets, data_shape = memmap_feats(
28 |         args.feats_scp, args.feats_scp_mapped, utt_list
29 |     )
30 |     with open(args.metadata + '.pkl', 'wb') as f:
31 |         pickle.dump([utt_lengths, offsets, data_shape], f)
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     main()
36 | 
37 | 
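  | # Sketch of reading the dump back (added; file names, dtype, and mode are
  | # assumptions, not taken from the original source):
  | #   import numpy as np, pickle
  | #   lengths, offsets, shape = pickle.load(open('metadata.1.pkl', 'rb'))
  | #   feats = np.memmap('feats.dat.1', dtype='float32', mode='r', shape=shape)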
-------------------------------------------------------------------------------- /nnet_pytorch/utils/prepare_unlabeled_tgt.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | 
11 | 
12 | def main():
13 |     parser = argparse.ArgumentParser()
14 |     parser.add_argument('utt2num_frames',
15 |         help='Kaldi utt2num_frames file (utterance id and frame count per line)',
16 |         type=str
17 |     )
18 |     parser.add_argument('--subsample', type=int, default=1)
19 | 
20 |     args = parser.parse_args()
21 | 
22 |     with open(args.utt2num_frames, 'r') as f:
23 |         for l in f:
24 |             utt, frames = l.strip().split(None, 1)
25 |             print(utt, end='')
26 |             # emit one -1 target per (subsampled) frame to mark the
27 |             # utterance as unlabeled
28 |             num_frames = len(range(0, int(frames), args.subsample))
29 |             print(' -1' * num_frames)
30 | 
31 | if __name__ == "__main__":
32 |     main()
33 | 
34 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/show_decorruption.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import glob
11 | import imageio
12 | from matplotlib import pyplot as plt
13 | import numpy as np
14 | 
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument('idir')
19 |     parser.add_argument('ogif')
20 |     parser.add_argument('name', type=str)
21 |     args = parser.parse_args()
22 | 
23 |     files = glob.glob('{}/{}.*.npy'.format(args.idir, args.name))
24 |     files = sorted(files, key=lambda x: int(x.split('.')[-2]))
25 | 
26 |     images = []
27 |     for f in files:
28 |         fname = os.path.basename(f)
29 |         print(fname)
30 |         out = np.load(f)
31 |         plt.imshow(np.flipud(out.T))
32 |         plt.colorbar()
33 |         plt.savefig(args.idir + "/" + fname + ".png")
34 |         images.append(imageio.imread(args.idir + "/" + fname + ".png"))
35 |         plt.clf()
36 |     imageio.mimsave(args.ogif, images, duration=0.1)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 
42 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/show_sampling.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import glob
11 | import imageio
12 | from matplotlib import pyplot as plt
13 | import numpy as np
14 | 
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument('idir')
19 |     parser.add_argument('ogif')
20 |     parser.add_argument('index', type=int)
21 |     args = parser.parse_args()
22 | 
23 |     files = glob.glob(args.idir + "/samples*.npy")
24 |     files = sorted(files, key=lambda x: int(x.split('.')[-2]))
25 | 
26 |     images = []
27 |     for f in files:
28 |         fname = os.path.basename(f)
29 |         print(fname)
30 |         out = np.load(f)
31 |         plt.imshow(np.flipud(out[args.index, :, :].T))
32 |         plt.colorbar()
33 |         plt.savefig(args.idir + "/" + fname + ".png")
34 |         images.append(imageio.imread(args.idir + "/" + fname + ".png"))
35 |         plt.clf()
36 |     imageio.mimsave(args.ogif, images, duration=0.1)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 
42 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/split_memmap_data.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | . ./path.sh
4 | . ./cmd.sh
5 | 
6 | . ./utils/parse_options.sh
7 | if [ $# -ne 3 ]; then
8 |   echo "Usage: split_memmap_data.sh <datadir> <targets> <num_split>"
9 |   exit 1;
10 | fi
11 | 
12 | datadir=$1
13 | targets=$2
14 | num_split=$3
15 | 
16 | dataname=`basename ${datadir}`
17 | mapped_dir=${datadir}/mapped # don't change this path
18 | mkdir -p $mapped_dir
19 | echo "$0: Splitting data in $num_split parts"
20 | # spread the mapped numpy arrays over various machines, as this data-set is quite large.
21 | if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
22 |   utils/create_split_dir.pl /export/b{11,12,13,14}/$USER/kaldi-data/egs/${dataname}_$(date +'%m_%d_%H_%M')/$mapped_dir/storage \
23 |     $mapped_dir/storage
24 | fi
25 | utils/split_data.sh ${datadir} $num_split
26 | for n in $(seq $num_split); do
27 |   # the next command does nothing unless $mapped_feats_dir/storage/ exists, see
28 |   # utils/create_data_link.pl for more info.
29 |   utils/create_data_link.pl $mapped_dir/feats.dat.$n
30 | done
31 | $train_cmd JOB=1:$num_split exp/make_fbank/${dataname}/memmap_data.JOB.log \
32 |   memmap_data.py --utt-list ${targets} ${datadir}/split${num_split}/JOB/feats.scp $mapped_dir/feats.dat.JOB \
33 |   $mapped_dir/metadata.JOB
34 | echo $num_split > ${datadir}/num_split
35 | 
-------------------------------------------------------------------------------- /tools/Makefile: --------------------------------------------------------------------------------
1 | PYTHON_DIR = `pwd`/NeurIPS2020/bin
2 | CXX ?= g++
3 | 
4 | WGET ?= wget
5 | 
6 | # Note: OpenFst requires a relatively recent C++ compiler with C++11 support,
7 | # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3.
8 | OPENFST_VERSION ?= 1.7.5
9 | 
10 | # Default features configured for OpenFst; can be overridden on the make command line.
11 | OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-ngram-fsts
12 | 
13 | CPPFLAGS ?= -D_GLIBCXX_USE_CXX11_ABI=0
14 | CXXFLAGS ?= -D_GLIBCXX_USE_CXX11_ABI=0
15 | 
16 | all: kaldi pychain
17 | 
18 | kaldi:
19 | 	git clone https://github.com/kaldi-asr/kaldi.git
20 | 	cd kaldi/tools; $(MAKE) all
21 | 	cd kaldi/src; ./configure --shared; $(MAKE) depend; $(MAKE) all
22 | 
23 | venv: requirements.txt
24 | 	test -d NeurIPS2020 || python3 -m venv NeurIPS2020
25 | 	. ./NeurIPS2020/bin/activate; pip install -r requirements.txt
26 | 
27 | clean: openfst_cleaned
28 | 	rm -rf pychain
29 | 
30 | openfst_cleaned:
31 | 	$(MAKE) -C openfst-$(OPENFST_VERSION) clean
32 | 
33 | .PHONY: openfst # so target will be made even though "openfst" exists.
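  | # Typical entry points: `make all` builds Kaldi and pychain; the openfst
  | # targets below fetch and build OpenFst $(OPENFST_VERSION) locally.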
34 | openfst: openfst_compiled openfst-$(OPENFST_VERSION)/lib 35 | -rm -f openfst 36 | -ln -s openfst-$(OPENFST_VERSION) openfst 37 | 38 | .PHONY: openfst_compiled 39 | openfst_compiled: openfst-$(OPENFST_VERSION)/Makefile 40 | $(MAKE) -C openfst-$(OPENFST_VERSION) install MAKEOVERRIDES= 41 | 42 | openfst-$(OPENFST_VERSION)/lib: | openfst-$(OPENFST_VERSION)/Makefile 43 | -cd openfst-$(OPENFST_VERSION) && [ -d lib64 ] && [ ! -d lib ] && ln -s lib64 lib 44 | 45 | # Add the -O flag to CXXFLAGS on cygwin as it can fix the compilation error 46 | # "file too big". 47 | ifeq ($(OSTYPE),cygwin) 48 | # Note: OSTYPE path is probably dead for latest cygwin64 (installed on 2016/11/11). 49 | openfst_add_CXXFLAGS = -O -Wa,-mbig-obj 50 | else ifeq ($(OS),Windows_NT) 51 | # This new OS path is confirmed working on Windows 10 / Cygwin64. 52 | openfst_add_CXXFLAGS = -O -Wa,-mbig-obj 53 | else 54 | openfst_add_CXXFLAGS = 55 | endif 56 | 57 | openfst-$(OPENFST_VERSION)/Makefile: openfst-$(OPENFST_VERSION) 58 | cd openfst-$(OPENFST_VERSION)/ && \ 59 | ./configure --prefix=`pwd` $(OPENFST_CONFIGURE) CXX="$(CXX)" CPPFLAGS="$(CPPFLAGS)" CXXFLAGS="$(CXXFLAGS) $(openfst_add_CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" 60 | 61 | openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz 62 | tar xozf openfst-$(OPENFST_VERSION).tar.gz 63 | 64 | openfst-$(OPENFST_VERSION).tar.gz: 65 | $(WGET) -T 10 -t 1 http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ 66 | $(WGET) -T 10 -t 3 https://www.openslr.org/resources/2/openfst-$(OPENFST_VERSION).tar.gz; 67 | 68 | .PHONY: pychain 69 | pychain: openfst venv 70 | test -d ../nnet_pytorch/objectives/pychain || . ./NeurIPS2020/bin/activate && \ 71 | export OPENFST_PATH=`pwd`/openfst && \ 72 | export LD_LIBRARY_PATH=`pwd`/openfst/lib:$$LD_LIBRARY_PATH && \ 73 | export PATH=$(PYTHON_DIR):$$PATH && \ 74 | cd ../nnet_pytorch/objectives && \ 75 | git clone --single-branch --branch master https://github.com/YiwenShaoStephen/pychain.git && cd pychain && \ 76 | cp ../../../tools/pychain_patch.diff . && \ 77 | git apply pychain_patch.diff && \ 78 | cd openfst_binding && python3 setup.py install && \ 79 | cd ../pytorch_binding && python3 setup.py install; 80 | -------------------------------------------------------------------------------- /tools/pychain_patch.diff: -------------------------------------------------------------------------------- 1 | diff --git a/__init__.py b/__init__.py 2 | new file mode 100644 3 | index 0000000..e69de29 4 | diff --git a/pychain/__init__.py b/pychain/__init__.py 5 | index 890d65b..84a54e6 100644 6 | --- a/pychain/__init__.py 7 | +++ b/pychain/__init__.py 8 | @@ -1,2 +1 @@ 9 | -from .loss import * 10 | from .graph import * 11 | diff --git a/pychain/chain.py b/pychain/chain.py 12 | new file mode 100644 13 | index 0000000..17c0b64 14 | --- /dev/null 15 | +++ b/pychain/chain.py 16 | @@ -0,0 +1,83 @@ 17 | +# Copyright 2019 Yiwen Shao 18 | +# 2020 Yiming Wang 19 | + 20 | +# Licensed under the Apache License, Version 2.0 (the "License"); 21 | +# you may not use this file except in compliance with the License. 22 | +# You may obtain a copy of the License at 23 | + 24 | +# http://www.apache.org/licenses/LICENSE-2.0 25 | + 26 | +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 27 | +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 28 | +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 29 | +# MERCHANTABLITY OR NON-INFRINGEMENT. 
30 | +# See the Apache 2 License for the specific language governing permissions and 31 | +# limitations under the License. 32 | + 33 | +import torch 34 | +import torch.nn as nn 35 | +from .graph import ChainGraphBatch 36 | +import pychain_C 37 | + 38 | + 39 | +class ChainFunction(torch.autograd.Function): 40 | + @staticmethod 41 | + def forward(ctx, input, input_lengths, graphs, leaky_coefficient=1e-5): 42 | + input = input.contiguous().clamp(-30, 30) # clamp for both the denominator and the numerator 43 | + B = input.size(0) 44 | + if B != graphs.batch_size: 45 | + raise ValueError( 46 | + "input batch size ({}) does not equal to graph batch size ({})" 47 | + .format(B, graphs.batch_size) 48 | + ) 49 | + packed_data = torch.nn.utils.rnn.pack_padded_sequence( 50 | + input, input_lengths, batch_first=True, 51 | + ) 52 | + batch_sizes = packed_data.batch_sizes 53 | + input_lengths = input_lengths.cpu() 54 | + if not graphs.log_domain: # usually for the denominator 55 | + exp_input = input.exp() 56 | + objf, input_grad, ok = pychain_C.forward_backward( 57 | + graphs.forward_transitions, 58 | + graphs.forward_transition_indices, 59 | + graphs.forward_transition_probs, 60 | + graphs.backward_transitions, 61 | + graphs.backward_transition_indices, 62 | + graphs.backward_transition_probs, 63 | + graphs.leaky_probs, 64 | + graphs.initial_probs, 65 | + graphs.final_probs, 66 | + graphs.start_state, 67 | + exp_input, 68 | + batch_sizes, 69 | + input_lengths, 70 | + graphs.num_states, 71 | + leaky_coefficient, 72 | + ) 73 | + else: # usually for the numerator 74 | + objf, log_probs_grad, ok = pychain_C.forward_backward_log_domain( 75 | + graphs.forward_transitions, 76 | + graphs.forward_transition_indices, 77 | + graphs.forward_transition_probs, 78 | + graphs.backward_transitions, 79 | + graphs.backward_transition_indices, 80 | + graphs.backward_transition_probs, 81 | + graphs.initial_probs, 82 | + graphs.final_probs, 83 | + graphs.start_state, 84 | + input, 85 | + batch_sizes, 86 | + input_lengths, 87 | + graphs.num_states, 88 | + ) 89 | + input_grad = log_probs_grad.exp() 90 | + 91 | + ctx.save_for_backward(input_grad) 92 | + return objf.sum() 93 | + 94 | + @staticmethod 95 | + def backward(ctx, objf_grad): 96 | + input_grad, = ctx.saved_tensors 97 | + input_grad = torch.mul(input_grad, objf_grad) 98 | + 99 | + return input_grad, None, None, None 100 | diff --git a/pytorch_binding/src/chain-computation.cc b/pytorch_binding/src/chain-computation.cc 101 | index d53a03f..36edf10 100644 102 | --- a/pytorch_binding/src/chain-computation.cc 103 | +++ b/pytorch_binding/src/chain-computation.cc 104 | @@ -226,7 +226,8 @@ torch::Tensor ChainComputation::ComputeTotLogLike() { 105 | // as alpha_frame_log_tot is padded with 0.0, the sum below is fine 106 | tot_log_prob_.copy_(alpha_frame_log_tot.sum(1) + last_frame_alpha_dash_sum.log()); // B 107 | tot_prob_.copy_(tot_log_prob_.exp()); // B 108 | - return tot_log_prob_.sum(); 109 | + //return tot_log_prob_.sum(); 110 | + return tot_log_prob_; 111 | } 112 | 113 | void ChainComputation::BetaDashLastFrame() { 114 | -------------------------------------------------------------------------------- /tools/requirements.txt: -------------------------------------------------------------------------------- 1 | kaldi-io==0.9.4 2 | numpy==1.18.5 3 | torch==1.5.0 4 | --------------------------------------------------------------------------------