├── README.md
├── librilight
├── README.md
├── cmd.sh
├── conf
│ ├── decode.config
│ ├── fbank.conf
│ ├── gpu.conf
│ ├── mfcc.conf
│ ├── mfcc_hires.conf
│ ├── online_cmvn.conf
│ ├── online_pitch.conf
│ ├── queue_no_k20.conf
│ └── spec.conf
├── decode_nnet_pytorch.sh
├── local
│ ├── data_prep.sh
│ ├── download_and_untar.sh
│ ├── download_lm.sh
│ ├── format_lms.sh
│ ├── prepare_dict.sh
│ ├── prepare_librilight.sh
│ ├── prepare_librilight_dataset.sh
│ ├── prepare_test.sh
│ ├── prepare_unlabeled_tgt.py
│ ├── score.sh
│ ├── subset_dataset.sh
│ └── train_async_parallel2.sh
├── path.sh
├── run.sh
├── steps
├── train_nnet_pytorch.sh
└── utils
├── librispeech
├── README
├── cmd.sh
├── conf
│ ├── decode.config
│ ├── fbank.conf
│ ├── gpu.conf
│ ├── mfcc.conf
│ ├── mfcc_hires.conf
│ ├── online_cmvn.conf
│ ├── online_pitch.conf
│ ├── queue_no_k20.conf
│ └── spec.conf
├── decode.sh
├── local
│ ├── data_prep.sh
│ ├── download_lm.sh
│ ├── format_lms.sh
│ ├── prepare_dict.sh
│ ├── prepare_test.sh
│ ├── score.sh
│ └── subset_dataset.sh
├── path.sh
├── run-blstm.sh
├── run-wrn.sh
├── run.sh
├── steps
└── utils
├── librispeech100
├── cmd.sh
├── conf
│ ├── decode.config
│ ├── fbank.conf
│ ├── gpu.conf
│ ├── mfcc.conf
│ ├── mfcc_hires.conf
│ ├── online_cmvn.conf
│ ├── online_pitch.conf
│ ├── queue_no_k20.conf
│ └── spec.conf
├── decode.sh
├── decorrupt.sh
├── generate.sh
├── local
│ ├── data_prep.sh
│ ├── decode_nnet_pytorch.sh
│ ├── download_and_untar.sh
│ ├── download_lm.sh
│ ├── format_lms.sh
│ ├── prepare_dict.sh
│ ├── prepare_librilight.sh
│ ├── prepare_librilight_dataset.sh
│ ├── prepare_test.sh
│ ├── prepare_unlabeled_tgt.py
│ ├── score.sh
│ ├── split_memmap_data.sh
│ ├── subset_dataset.sh
│ └── train_async_parallel.sh
├── path.sh
├── run-semisup-wrn-scratch.sh
├── run-semisup-wrn.sh
├── run-tdnn.sh
├── run-wrn.sh
├── run.sh
├── steps
└── utils
├── nnet_pytorch
├── INSTALL_PYCHAIN
├── IterationTypes.py
├── LRScheduler.py
├── __init__.py
├── batch_generators.py
├── data_utils.py
├── datasets
│ ├── HybridASR.py
│ ├── NnetPytorchDataset.py
│ ├── __init__.py
│ └── data_utils.py
├── decode.py
├── decorrupt.py
├── generate.py
├── generate_conditional_from_buffer.py
├── models
│ ├── BLSTM.py
│ ├── Resnet.py
│ ├── TDNN.py
│ ├── WideResnet.py
│ └── __init__.py
├── objectives
│ ├── AcceleratedSGLD.py
│ ├── CrossEntropy.py
│ ├── CrossEntropy_EBM.py
│ ├── L2.py
│ ├── LFMMI.py
│ ├── LFMMIOnly.py
│ ├── LFMMI_EBM.py
│ ├── SGLD.py
│ ├── SGLDAdam.py
│ ├── SGLDSampler.py
│ ├── SemisupLFMMI.py
│ ├── __init__.py
│ └── optimizer.py
├── train.py
└── utils
│ ├── average_models.py
│ ├── combine_models.py
│ ├── decode_nnet_pytorch.sh
│ ├── memmap_data.py
│ ├── prepare_unlabeled_tgt.py
│ ├── show_decorruption.py
│ ├── show_sampling.py
│ ├── split_memmap_data.sh
│ └── train_async_parallel.sh
└── tools
├── Makefile
├── pychain_patch.diff
└── requirements.txt

--------------------------------------------------------------------------------
/librilight/README.md:
--------------------------------------------------------------------------------
To run this example, the commands in the following files must first be configured
for the computing environment on which the code is running.

librilight/cmd.sh -- contains commands for training and decoding. These commands
may need to be modified for new computing clusters.

librilight/conf/gpu.conf -- GPU configuration options that may also need to be changed.
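For example, to run everything on a single local machine with no grid engine (a
minimal sketch; run.pl ships with Kaldi), every queue.pl command in cmd.sh could
be replaced with run.pl:

    export train_cmd=run.pl
    export decode_cmd=run.pl
    export mkgraph_cmd=run.pl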
The CUDA_VISIBLE_DEVICES environment variable is set internally in the code.
Users should modify this line in the following files. It is indicated by
many commented lines before and after with a note:

1. train.py
2. decode.py
3. generate_conditional_from_buffer.py


Furthermore, the Librispeech corpus (unlabeled data) will need to be downloaded.
Place the path to the Librispeech data in the variable unlabeled_data found in the
run.sh script (first line of code).

--------------------------------------------------------------------------------
/librilight/cmd.sh:
--------------------------------------------------------------------------------
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
export mkgraph_cmd="queue.pl --mem 8G"

--------------------------------------------------------------------------------
/librilight/conf/decode.config:
--------------------------------------------------------------------------------
# empty config, just use the defaults.

--------------------------------------------------------------------------------
/librilight/conf/fbank.conf:
--------------------------------------------------------------------------------
--sample-frequency=16000
--num-mel-bins=80

--------------------------------------------------------------------------------
/librilight/conf/gpu.conf:
--------------------------------------------------------------------------------
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l 'hostname=b1[12345678]*|c0[12345789]*|c1[12356789]*|c2[123456789]*,gpu=$0' -q g.q
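# Example (illustrative): a job submitted as "queue.pl --gpu 1 ..." matches the
# "option gpu=*" line above; queue.pl substitutes the value for $0, adding
# "-l 'hostname=...,gpu=1' -q g.q" to the qsub options.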
--------------------------------------------------------------------------------
/librilight/conf/mfcc.conf:
--------------------------------------------------------------------------------
--use-energy=false   # only non-default option.

--------------------------------------------------------------------------------
/librilight/conf/mfcc_hires.conf:
--------------------------------------------------------------------------------
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false   # use average of log energy, not energy.
--num-mel-bins=40    # similar to Google's setup.
--num-ceps=40        # there is no dimensionality reduction.
--low-freq=20        # low cutoff frequency for mel bins... this is high-bandwidth data, so
                     # there might be some information at the low end.
--high-freq=-400     # high cutoff frequency, relative to Nyquist of 8000 (=7600)

--------------------------------------------------------------------------------
/librilight/conf/online_cmvn.conf:
--------------------------------------------------------------------------------
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

--------------------------------------------------------------------------------
/librilight/conf/online_pitch.conf:
--------------------------------------------------------------------------------
## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats,
## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given
## to programs like online2-wav-nnet2-latgen-faster.
## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that
## are the same as those which will be generated in online decoding; this enables us to train
## in a way that's compatible with online decoding.
##

## most of these options relate to the post-processing rather than the pitch
## extraction itself.
--add-raw-log-pitch=true   ## this is intended for input to neural nets, so our
                           ## approach is "throw everything in and see what
                           ## sticks".
--normalization-left-context=75
--normalization-right-context=50   # We're removing some of the right-context
                                   # for the normalization. Would normally be 75.
                                   #
                                   # Note: our changes to the (left,right) context
                                   # from the defaults of (75,75) to (75,50) will
                                   # almost certainly worsen results, but will
                                   # reduce latency.
--frames-per-chunk=10   ## relates to offline simulation of online decoding; 1
                        ## would be equivalent to getting in samples one by
                        ## one.
--simulate-first-pass-online=true   ## this makes the online-pitch-extraction code
                                    ## output the 'first-pass' features, which
                                    ## are less accurate than the final ones, and
                                    ## which are the only features the neural-net
                                    ## decoding would ever see (since we can't
                                    ## afford to do lattice rescoring in the
                                    ## neural-net code).

--------------------------------------------------------------------------------
/librilight/conf/queue_no_k20.conf:
--------------------------------------------------------------------------------
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'

--------------------------------------------------------------------------------
/librilight/conf/spec.conf:
--------------------------------------------------------------------------------
--preemphasis-coefficient=0.0
--remove-dc-offset=false
--round-to-power-of-two=false
--window-type=hanning

--------------------------------------------------------------------------------
/librilight/decode_nnet_pytorch.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. ./path.sh

batchsize=512
skip_datadump=false
checkpoint=final.mdl
prior_scale=1.0
prior_floor=-20.0
prior_name="priors"
min_active=200
max_active=7000
max_mem=50000000
lattice_beam=8.0
beam=15.0
acoustic_scale=0.1
post_decode_acwt=10.0 # 10.0 for chain systems, 1.0 for non-chain
mean_var="(True, True)"

min_lmwt=6
max_lmwt=18
nj=80
stage=0

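# Example invocation (hypothetical model and graph paths; the four positional
# arguments are <data> <pytorch_model> <graphdir> <odir>):
#   ./decode_nnet_pytorch.sh --checkpoint final.mdl --nj 80 \
#     data/dev_clean_fbank exp/wrn exp/chain_wrn/tree/graph_tgsmall exp/wrn/decode_dev_clean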
. ./utils/parse_options.sh
if [ $# -ne 4 ]; then
  echo "Usage: ./decode_nnet_pytorch.sh <data> <pytorch_model> <graphdir> <odir>"
  echo " --batchsize ${batchsize} --skip-datadump ${skip_datadump}"
  echo " --checkpoint ${checkpoint} --prior-scale ${prior_scale} --prior-floor ${prior_floor} --prior-name ${prior_name}"
  echo " --min-active ${min_active} --max-active ${max_active}"
  echo " --max-mem ${max_mem} --lattice-beam ${lattice_beam}"
  echo " --beam ${beam} --acoustic-scale ${acoustic_scale} --post-decode-acwt ${post_decode_acwt}"
  echo " --nj ${nj}"
  exit 1;
fi

data=$1
pytorch_model=$2
graphdir=$3
odir=$4

# We assume the transition model (final.mdl) is 1 level above the graphdir
amdir=`dirname ${graphdir}`
trans_mdl=${amdir}/final.mdl
words_file=${graphdir}/words.txt
hclg=${graphdir}/HCLG.fst

# decode.py always gets --skip-datadump; when skip_datadump=false we first dump
# the features to a memory-mapped file, which decode.py then reads directly.
skip_datadump_opts="--skip-datadump"
if ! $skip_datadump; then
  memmap_data.py ${data}/feats.scp ${data}/feats.scp.dat
fi

mkdir -p ${odir}/log

decode_cmd="utils/queue.pl --mem 2G -l hostname=b0*" # The 'a' machines are just too slow
if [ $stage -le 0 ]; then
  segments=${data}/segments
  if [ ! -f ${data}/segments ]; then
    echo "No segments file found. 
Assuming wav.scp is indexed by utterance" 63 | segments=${data}/wav.scp 64 | fi 65 | 66 | ${decode_cmd} JOB=1:${nj} ${odir}/log/decode.JOB.log \ 67 | ./utils/split_scp.pl -j ${nj} \$\[JOB -1\] ${segments} \|\ 68 | decode.py ${skip_datadump_opts} \ 69 | --datadir ${data} \ 70 | --modeldir ${pytorch_model} \ 71 | --dumpdir ${odir} \ 72 | --checkpoint ${checkpoint} \ 73 | --prior-scale ${prior_scale} \ 74 | --prior-floor ${prior_floor} \ 75 | --prior-name ${prior_name} \ 76 | --words-file ${words_file} \ 77 | --trans-mdl ${trans_mdl} \ 78 | --hclg ${hclg} \ 79 | --min-active ${min_active} \ 80 | --max-active ${max_active} \ 81 | --lattice-beam ${lattice_beam} \ 82 | --beam ${beam} \ 83 | --acoustic-scale ${acoustic_scale} \ 84 | --post-decode-acwt ${post_decode_acwt} \ 85 | --job JOB \ 86 | --utt-subset /dev/stdin \ 87 | --batchsize ${batchsize} 88 | fi 89 | 90 | if [ $stage -le 1 ]; then 91 | ./local/score.sh --cmd "$decode_cmd" \ 92 | --min-lmwt ${min_lmwt} --max-lmwt ${max_lmwt} --word-ins-penalty 0.0 \ 93 | ${data} ${graphdir} ${odir} 94 | fi 95 | -------------------------------------------------------------------------------- /librilight/local/data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | if [ "$#" -ne 2 ]; then 8 | echo "Usage: $0 " 9 | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" 10 | exit 1 11 | fi 12 | 13 | src=$1 14 | dst=$2 15 | 16 | # all utterances are FLAC compressed 17 | if ! which flac >&/dev/null; then 18 | echo "Please install 'flac' on ALL worker nodes!" 19 | exit 1 20 | fi 21 | 22 | spk_file=$src/../SPEAKERS.TXT 23 | 24 | mkdir -p $dst || exit 1; 25 | 26 | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1; 27 | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; 28 | 29 | 30 | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp 31 | trans=$dst/text; [[ -f "$trans" ]] && rm $trans 32 | utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk 33 | spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender 34 | 35 | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do 36 | reader=$(basename $reader_dir) 37 | if ! [ $reader -eq $reader ]; then # not integer. 38 | echo "$0: unexpected subdirectory name $reader" 39 | exit 1; 40 | fi 41 | 42 | reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') 43 | if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then 44 | echo "Unexpected gender: '$reader_gender'" 45 | exit 1; 46 | fi 47 | 48 | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do 49 | chapter=$(basename $chapter_dir) 50 | if ! [ "$chapter" -eq "$chapter" ]; then 51 | echo "$0: unexpected chapter-subdirectory name $chapter" 52 | exit 1; 53 | fi 54 | 55 | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ 56 | awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 57 | 58 | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt 59 | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 60 | cat $chapter_trans >>$trans 61 | 62 | # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered 63 | # to be a different speaker. 
This is done for simplicity and because we want 64 | # e.g. the CMVN to be calculated per-chapter 65 | awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ 66 | <$chapter_trans >>$utt2spk || exit 1 67 | 68 | # reader -> gender map (again using per-chapter granularity) 69 | echo "${reader}-${chapter} $reader_gender" >>$spk2gender 70 | done 71 | done 72 | 73 | spk2utt=$dst/spk2utt 74 | utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 75 | 76 | ntrans=$(wc -l <$trans) 77 | nutt2spk=$(wc -l <$utt2spk) 78 | ! [ "$ntrans" -eq "$nutt2spk" ] && \ 79 | echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; 80 | 81 | utils/validate_data_dir.sh --no-feats $dst || exit 1; 82 | 83 | echo "$0: successfully prepared data in $dst" 84 | 85 | exit 0 86 | -------------------------------------------------------------------------------- /librilight/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 4 | # 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 5 | # Apache 2.0 6 | 7 | remove_archive=false 8 | 9 | if [ "$1" == --remove-archive ]; then 10 | remove_archive=true 11 | shift 12 | fi 13 | 14 | if [ $# -ne 3 ]; then 15 | echo "Usage: $0 [--remove-archive] " 16 | echo "e.g.: $0 /export/a05/dgalvez/ www.openslr.org/resources/31 dev-clean-2" 17 | echo "With --remove-archive it will remove the archive after successfully un-tarring it." 18 | echo " can be one of: dev-clean-2, test-clean-5, dev-other, test-other," 19 | echo " train-clean-100, train-clean-360, train-other-500." 20 | fi 21 | 22 | data=$1 23 | url=$2 24 | part=$3 25 | 26 | if [ ! -d "$data" ]; then 27 | echo "$0: no such directory $data" 28 | exit 1; 29 | fi 30 | 31 | data=$(readlink -f $data) 32 | 33 | part_ok=false 34 | list="dev-clean-2 train-clean-5" 35 | for x in $list; do 36 | if [ "$part" == $x ]; then part_ok=true; fi 37 | done 38 | if ! $part_ok; then 39 | echo "$0: expected to be one of $list, but got '$part'" 40 | exit 1; 41 | fi 42 | 43 | if [ -z "$url" ]; then 44 | echo "$0: empty URL base." 45 | exit 1; 46 | fi 47 | 48 | if [ -f $data/LibriSpeech/$part/.complete ]; then 49 | echo "$0: data part $part was already successfully extracted, nothing to do." 50 | exit 0; 51 | fi 52 | 53 | 54 | #sizes="126046265 332747356" 55 | sizes="126046265 332954390" 56 | 57 | if [ -f $data/$part.tar.gz ]; then 58 | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') 59 | size_ok=false 60 | for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done 61 | if ! $size_ok; then 62 | echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" 63 | echo "does not equal the size of one of the archives." 64 | rm $data/$part.tar.gz 65 | else 66 | echo "$data/$part.tar.gz exists and appears to be complete." 67 | fi 68 | fi 69 | 70 | if [ ! -f $data/$part.tar.gz ]; then 71 | if ! which wget >/dev/null; then 72 | echo "$0: wget is not installed." 73 | exit 1; 74 | fi 75 | full_url=$url/$part.tar.gz 76 | echo "$0: downloading data from $full_url. This may take some time, please be patient." 77 | 78 | cd $data 79 | if ! wget --no-check-certificate $full_url; then 80 | echo "$0: error executing wget $full_url" 81 | exit 1; 82 | fi 83 | cd - 84 | fi 85 | 86 | cd $data 87 | 88 | if ! 
tar -xvzf $part.tar.gz; then 89 | echo "$0: error un-tarring archive $data/$part.tar.gz" 90 | exit 1; 91 | fi 92 | 93 | touch $data/LibriSpeech/$part/.complete 94 | 95 | echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" 96 | 97 | if $remove_archive; then 98 | echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 99 | rm $data/$part.tar.gz 100 | fi 101 | -------------------------------------------------------------------------------- /librilight/local/download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2017 Daniel Povey 5 | # Apache 2.0 6 | 7 | if [ $# -ne "3" ]; then 8 | echo "Usage: $0 /dev/null | awk '{print $1}' || stat '-f %z' $f) 43 | if [[ "$fsize" -eq "$expect_size" ]]; then 44 | echo "'$fname' already exists and appears to be complete" 45 | return 0 46 | else 47 | echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." 48 | fi 49 | fi 50 | wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { 51 | echo "Error while trying to download $fname!" 52 | return 1 53 | } 54 | f=$dst_dir/$fname 55 | # In the following statement, the first version works on linux, and the part after '||' 56 | # works on Linux. 57 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 58 | [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } 59 | return 0 60 | } 61 | 62 | mkdir -p $dst_dir $local_dir 63 | 64 | for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \ 65 | librispeech-vocab.txt librispeech-lexicon.txt; do 66 | check_and_download $f || exit 1 67 | done 68 | 69 | dst_dir=$(readlink -f $dst_dir) 70 | ln -sf $dst_dir/3-gram.pruned.1e-7.arpa.gz $local_dir/lm_tgmed.arpa.gz 71 | ln -sf $dst_dir/3-gram.pruned.3e-7.arpa.gz $local_dir/lm_tgsmall.arpa.gz 72 | ln -sf $dst_dir/3-gram.arpa.gz $local_dir/lm_tglarge.arpa.gz 73 | ln -sf $dst_dir/4-gram.arpa.gz $local_dir/lm_fglarge.arpa.gz 74 | ln -sf $dst_dir/librispeech-lexicon.txt $local_dir/librispeech-lexicon.txt 75 | ln -sf $dst_dir/librispeech-vocab.txt $local_dir/librispeech-vocab.txt 76 | exit 0 77 | -------------------------------------------------------------------------------- /librilight/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test time language model(G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh || exit 1; 10 | 11 | # begin configuration section 12 | src_dir=data/lang 13 | # end configuration section 14 | 15 | . utils/parse_options.sh || exit 1; 16 | 17 | set -e 18 | 19 | if [ $# -ne 1 ]; then 20 | echo "Usage: $0 " 21 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 22 | echo ", where:" 23 | echo " is the directory in which the language model is stored/downloaded" 24 | echo "Options:" 25 | echo " --src-dir # source lang directory, default data/lang" 26 | exit 1 27 | fi 28 | 29 | lm_dir=$1 30 | 31 | if [ ! -d $lm_dir ]; then 32 | echo "$0: expected source LM directory $lm_dir to exist" 33 | exit 1; 34 | fi 35 | if [ ! -f $src_dir/words.txt ]; then 36 | echo "$0: expected $src_dir/words.txt to exist." 
37 | exit 1; 38 | fi 39 | 40 | 41 | tmpdir=data/local/lm_tmp.$$ 42 | trap "rm -r $tmpdir" EXIT 43 | 44 | mkdir -p $tmpdir 45 | 46 | for lm_suffix in tgsmall tgmed; do 47 | # tglarge is prepared by a separate command, called from run.sh; we don't 48 | # want to compile G.fst for tglarge, as it takes a while. 49 | test=${src_dir}_test_${lm_suffix} 50 | mkdir -p $test 51 | cp -r ${src_dir}/* $test 52 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 53 | arpa2fst --disambig-symbol=#0 \ 54 | --read-symbol-table=$test/words.txt - $test/G.fst 55 | utils/validate_lang.pl --skip-determinization-check $test || exit 1; 56 | done 57 | 58 | echo "Succeeded in formatting data." 59 | 60 | exit 0 61 | -------------------------------------------------------------------------------- /librilight/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the dictionary and auto-generates the pronunciations for the words, 7 | # that are in our vocabulary but not in CMUdict 8 | 9 | stage=0 10 | nj=4 # number of parallel Sequitur G2P jobs, we would like to use 11 | cmd=run.pl 12 | 13 | 14 | . utils/parse_options.sh || exit 1; 15 | . ./path.sh || exit 1 16 | 17 | 18 | if [ $# -ne 3 ]; then 19 | echo "Usage: $0 [options] " 20 | echo "e.g.: /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" 21 | echo "Options:" 22 | echo " --cmd '' # script to launch jobs with, default: run.pl" 23 | echo " --nj # number of jobs to run, default: 4." 24 | exit 1 25 | fi 26 | 27 | lm_dir=$1 28 | g2p_model_dir=$2 29 | dst_dir=$3 30 | 31 | vocab=$lm_dir/librispeech-vocab.txt 32 | [ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; 33 | 34 | # this file is either a copy of the lexicon we download from openslr.org/11 or is 35 | # created by the G2P steps below 36 | lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt 37 | 38 | cmudict_dir=$dst_dir/cmudict 39 | cmudict_plain=$dst_dir/cmudict.0.7a.plain 40 | 41 | mkdir -p $dst_dir || exit 1; 42 | 43 | if [ $stage -le 0 ]; then 44 | echo "Downloading and preparing CMUdict" 45 | if [ ! -s $cmudict_dir/cmudict.0.7a ]; then 46 | svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; 47 | fi 48 | echo "Removing the pronunciation variant markers ..." 49 | grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ 50 | perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ 51 | > $cmudict_plain || exit 1; 52 | fi 53 | 54 | 55 | if [ $stage -le 1 ]; then 56 | # check if we have Sequitur G2P is installed 57 | if [ ! -f "$sequitur" ]; then 58 | if ! 
which swig >&/dev/null; then 59 | echo "Please install 'swig' and then run $KALDI_ROOT/tools/extra/install_sequitur.sh" 60 | exit 1 61 | else 62 | echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extra/install_sequitur.sh" 63 | pushd $KALDI_ROOT/tools 64 | extras/install_sequitur.sh || exit 1 65 | popd 66 | fi 67 | fi 68 | [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; } 69 | 70 | g2p_dir=$dst_dir/g2p 71 | auto_vocab_prefix="$g2p_dir/vocab_autogen" 72 | auto_lexicon_prefix="$g2p_dir/lexicon_autogen" 73 | 74 | mkdir -p $g2p_dir/log 75 | auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") 76 | awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ 77 | sort | tee $g2p_dir/vocab_autogen.full |\ 78 | utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 79 | echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." 80 | $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ 81 | local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 82 | g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) 83 | g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) 84 | [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } 85 | sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt 86 | sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt 87 | echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" 88 | fi 89 | 90 | if [ $stage -le 2 ]; then 91 | echo "Combining the CMUdict pronunciations with the autogenerated ones ..." 92 | awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ 93 | cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 94 | raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) 95 | vocab_size=$(wc -l <$vocab) 96 | [[ "$vocab_size" -eq "$raw_lex_size" ]] || { 97 | echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; 98 | exit 1; } 99 | echo "Combined lexicon saved to '$lexicon_raw_nosil'" 100 | fi 101 | 102 | # The copy operation below is necessary, if we skip the g2p stages(e.g. using --stage 3) 103 | if [[ ! -s "$lexicon_raw_nosil" ]]; then 104 | cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 105 | fi 106 | 107 | if [ $stage -le 3 ]; then 108 | silence_phones=$dst_dir/silence_phones.txt 109 | optional_silence=$dst_dir/optional_silence.txt 110 | nonsil_phones=$dst_dir/nonsilence_phones.txt 111 | extra_questions=$dst_dir/extra_questions.txt 112 | 113 | echo "Preparing phone lists and clustering questions" 114 | (echo SIL; echo SPN;) > $silence_phones 115 | echo SIL > $optional_silence 116 | # nonsilence phones; on each line is a list of phones that correspond 117 | # really to the same base phone. 118 | awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ 119 | sort -u |\ 120 | perl -e 'while(<>){ 121 | chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; 122 | $phones_of{$1} .= "$_ "; } 123 | foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ 124 | > $nonsil_phones || exit 1; 125 | # A few extra questions that will be added to those obtained by automatically clustering 126 | # the "real" phones. These ask about stress; there's also one for silence. 
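  # For illustration (not generated output): extra_questions.txt then contains
  # one line with the silence phones, e.g. "SIL SPN", plus one line per stress
  # marker grouping all phones that share it, e.g. "AA1 AE1 AH1 ...".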
  cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1;
  cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
    $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
    >> $extra_questions || exit 1;
  echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones"
  echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence"
  echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones"
  echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions"
fi

if [ $stage -le 4 ]; then
  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\
  cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt
  echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
fi

exit 0

--------------------------------------------------------------------------------
/librilight/local/prepare_librilight.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. ./path.sh
. ./cmd.sh

if [ $# -ne 1 ]; then
  echo "Usage: ./local/prepare_librilight.sh <data-dir>"
  exit 1;
fi

data=$1
# Get librilight set
wget https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz
tar -xvf librispeech_finetuning.tgz && mv librispeech_finetuning ${data}

# The following are the data subsets:
#   1h/{0..5}/{clean,other}
#   9h/{clean,other}
#
# In each of these subsets there are speaker directories named with a speaker-id.
# Inside each directory are more directories corresponding to a recording-id.
# Within each speaker-id/recording-id subdirectory are the .flac audio files
# corresponding to speech utterances, as well as a .trans.txt file that has
# the transcription.

# (sanity check: list the extracted .flac files)
find -L $data -name "*.flac"

for part in 1h/{0..5}/{clean,other} 9h/{clean,other}; do
  dataname=$(echo ${part} | sed 's/\//_/g')
  ./local/prepare_librilight_dataset.sh ${data}/${part} data/train_${dataname}
done

./utils/combine_data.sh \
  data/train_10h data/train_1h_{0..5}_{clean,other} data/train_9h_{clean,other}
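# Each subset path maps to a Kaldi datadir by replacing "/" with "_", e.g. the
# subset 1h/0/clean becomes data/train_1h_0_clean, and the combined 10h set ends
# up in data/train_10h. A quick sanity check (illustrative):
#   wc -l data/train_10h/{wav.scp,text,utt2spk}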
--------------------------------------------------------------------------------
/librilight/local/prepare_librilight_dataset.sh:
--------------------------------------------------------------------------------
#!/bin/bash

. ./path.sh
. ./cmd.sh

if [ $# -ne 2 ]; then
  echo "Usage: ./local/prepare_librilight_dataset.sh <data-subset-dir> <kaldi-datadir>"
  exit 1;
fi

data=$1
kaldi_data=$2

data=$(./utils/make_absolute.sh ${data})
mkdir -p $kaldi_data
files=( `find -L ${data} -name "*.flac"` )

for f in ${files[@]}; do
  fname=`basename $f`
  fname=${fname%%.flac}
  echo "${fname} flac -c -d -s ${f} |"
done | sort > ${kaldi_data}/wav.scp

paste -d' ' <(awk '{print $1}' ${kaldi_data}/wav.scp) \
            <(awk '{print $1}' ${kaldi_data}/wav.scp | cut -d'-' -f1) \
            > ${kaldi_data}/utt2spk

./utils/utt2spk_to_spk2utt.pl ${kaldi_data}/utt2spk > ${kaldi_data}/spk2utt

cat `find -L ${data} -name "*.trans.txt"` | sort > ${kaldi_data}/text
exit 0;

--------------------------------------------------------------------------------
/librilight/local/prepare_test.sh:
--------------------------------------------------------------------------------
#!/bin/bash

data=/export/a15/vpanayotov/data
subsampling=4

. ./cmd.sh
. ./path.sh

. ./utils/parse_options.sh

set -euo pipefail

for part in dev-clean dev-other test-clean test-other; do
  echo "-------------- Making ${part} ----------------------"
  dataname=$(echo ${part} | sed s/-/_/g)
  local/data_prep.sh $data/LibriSpeech/${part} data/${dataname}
  ./utils/copy_data_dir.sh data/${dataname} data/${dataname}_fbank
  ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 \
    data/${dataname}_fbank exp/make_fbank/${dataname} fbank
  ./utils/fix_data_dir.sh data/${dataname}_fbank
  ./steps/compute_cmvn_stats.sh data/${dataname}_fbank
  ./utils/fix_data_dir.sh data/${dataname}_fbank

  memmap_data.py data/${dataname}_fbank/feats.scp data/${dataname}_fbank/feats.scp.dat
  python local/prepare_unlabeled_tgt.py --subsample ${subsampling} \
    data/${dataname}_fbank/utt2num_frames > data/${dataname}_fbank/pdfid.${subsampling}.tgt
done

exit 0;

--------------------------------------------------------------------------------
/librilight/local/prepare_unlabeled_tgt.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#-*- coding: utf-8 -*-
# Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
# Apache 2.0

from __future__ import print_function
import argparse
import sys
import os


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('utt2num_frames',
        help='Kaldi utt2num_frames file (utterance-id and frame count per line)',
        type=str
    )
    parser.add_argument('--subsample', type=int, default=1)

    args = parser.parse_args()

    # Emit one unsupervised (-1) target per subsampled frame.
    with open(args.utt2num_frames, 'r') as f:
        for l in f:
            utt, frames = l.strip().split(None, 1)
            print(utt, end='')
            num_frames = len(range(0, int(frames), args.subsample))
            print(' -1' * num_frames)

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/librilight/local/score.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#           2014  Guoguo Chen
# Apache 2.0

[ -f ./path.sh ] && . ./path.sh

# begin configuration section.
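# Example invocation (hypothetical directories), matching the three positional
# arguments <data> <lang-or-graph> <decode-dir> below:
#   local/score.sh --cmd run.pl --min-lmwt 6 --max-lmwt 18 \
#     data/dev_clean_fbank exp/chain_wrn/tree/graph_tgsmall exp/wrn/decode_dev_clean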
9 | cmd=run.pl 10 | stage=0 11 | decode_mbr=true 12 | word_ins_penalty=0.0,0.5,1.0 13 | min_lmwt=7 14 | max_lmwt=17 15 | iter=final 16 | #end configuration section. 17 | 18 | [ -f ./path.sh ] && . ./path.sh 19 | . parse_options.sh || exit 1; 20 | 21 | if [ $# -ne 3 ]; then 22 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " 23 | echo " Options:" 24 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 25 | echo " --stage (0|1|2) # start scoring script from part-way through." 26 | echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." 27 | echo " --min_lmwt # minumum LM-weight for lattice rescoring " 28 | echo " --max_lmwt # maximum LM-weight for lattice rescoring " 29 | exit 1; 30 | fi 31 | 32 | data=$1 33 | lang_or_graph=$2 34 | dir=$3 35 | 36 | symtab=$lang_or_graph/words.txt 37 | 38 | for f in $symtab $dir/lat.1.gz $data/text; do 39 | [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; 40 | done 41 | 42 | mkdir -p $dir/scoring/log 43 | 44 | cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt 45 | 46 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 47 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ 48 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 49 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 50 | lattice-best-path --word-symbol-table=$symtab \ 51 | ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; 52 | done 53 | 54 | # Note: the double level of quoting for the sed command 55 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 56 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ 57 | cat $dir/scoring/LMWT.$wip.tra \| \ 58 | utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ 59 | compute-wer --text --mode=present \ 60 | ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; 61 | done 62 | 63 | exit 0; 64 | -------------------------------------------------------------------------------- /librilight/local/subset_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 4 | # Apache 2.0 5 | 6 | # The following commands were used to generate the mini_librispeech dataset: 7 | # 8 | # Note that data generation is random. This could be fixed by 9 | # providing a seed argument to the shuf program. 10 | 11 | if [ "$#" -ne 3 ]; then 12 | echo "Usage: $0 " 13 | echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ 14 | /export/a05/dgalvez/LibriSpeech/train-clean-5 5" 15 | exit 1 16 | fi 17 | 18 | src_dir=$1 19 | dest_dir=$2 20 | dest_num_hours=$3 21 | 22 | src=$(basename $src_dir) 23 | dest=$(basename $dest_dir) 24 | librispeech_dir=$(dirname $src_dir) 25 | 26 | # TODO: Possibly improve this to ensure gender balance and speaker 27 | # balance. 
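# The number of chapters to sample is chosen proportionally below:
#   n = (dest_num_hours * src_num_chapters) / src_num_hours
# e.g. with illustrative numbers, a 5-hour target drawn from a 100-hour source
# with 600 chapters selects (5 * 600) / 100 = 30 chapters, assuming roughly
# equal chapter lengths.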
# TODO: Use actual time values instead of assuming the average chapter length,
#       to make sure we get $dest_num_hours of data.
src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \
  python -c '
from __future__ import print_function
from sys import stdin
minutes_str = stdin.read().split()
print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))')
src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \
  awk -F'|' '{ print $1 }' | sort -u | wc -l)
mkdir -p data/subset_tmp
grep "$src" $librispeech_dir/CHAPTERS.TXT | \
  awk -F'|' '{ print $1 }' | \
  shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \
  data/subset_tmp/${dest}_chapter_id_list.txt

while read -r chapter_id || [[ -n "$chapter_id" ]]; do
  chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d)
  speaker_id=$(basename $(dirname $chapter_dir))
  mkdir -p $dest_dir/$speaker_id/
  cp -r $chapter_dir $dest_dir/$speaker_id/
done < data/subset_tmp/${dest}_chapter_id_list.txt

--------------------------------------------------------------------------------
/librilight/path.sh:
--------------------------------------------------------------------------------
export ROOT=`pwd`/../tools
export KALDI_ROOT=${ROOT}/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/openfst/bin:`pwd`/../nnet_pytorch:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C

export OPENFST_PATH=${ROOT}/openfst #/PATH/TO/OPENFST
export LD_LIBRARY_ORIG=${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${OPENFST_PATH}/lib:${LD_LIBRARY_PATH}
#export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64

export PYTHONPATH=${PYTHONPATH}:`pwd`/../nnet_pytorch/
export PYTHONUNBUFFERED=1
source ${ROOT}/NeurIPS2020/bin/activate

--------------------------------------------------------------------------------
/librilight/steps:
--------------------------------------------------------------------------------
../tools/kaldi/egs/wsj/s5/steps

--------------------------------------------------------------------------------
/librilight/utils:
--------------------------------------------------------------------------------
../tools/kaldi/egs/wsj/s5/utils

--------------------------------------------------------------------------------
/librispeech/README:
--------------------------------------------------------------------------------
This recipe trains a hybrid ASR model on the 960h
Librispeech data. The training pipeline is almost identical
to the Kaldi Librispeech recipe, except that we don't do
speed perturbation or use i-vectors here, and we use
nnet_pytorch for acoustic model training instead
of Kaldi's nnet3. Using a 6-layer BLSTM (41M params)
with very little hyperparameter tuning, and training on
4 GPUs for 2 days, we were able to obtain a WER of 4.46%
on the dev-clean subset (with 4-gram LM rescoring).
10 | 11 | To run the training pipeline: `./run.sh` 12 | To run decoding with the trained model: `./decode.sh` 13 | -------------------------------------------------------------------------------- /librispeech/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | export train_cmd="queue.pl --mem 2G -l hostname=!b02*" 14 | export decode_cmd="queue.pl --mem 4G -l hostname=!b02*\&!c06*\&!c23*\&!c24*\&!c25*\&!c27*" 15 | export mkgraph_cmd="queue.pl --mem 8G" 16 | -------------------------------------------------------------------------------- /librispeech/conf/decode.config: -------------------------------------------------------------------------------- 1 | # empty config, just use the defaults. 2 | -------------------------------------------------------------------------------- /librispeech/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | --num-mel-bins=80 3 | -------------------------------------------------------------------------------- /librispeech/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=c0*|c1*|c2[0126]*,gpu=$0' -q g.q 11 | -------------------------------------------------------------------------------- /librispeech/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false # only non-default option. 2 | -------------------------------------------------------------------------------- /librispeech/conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --num-mel-bins=40 # similar to Google's setup. 7 | --num-ceps=40 # there is no dimensionality reduction. 8 | --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so 9 | # there might be some information at the low end. 
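# (A negative --high-freq is interpreted relative to the Nyquist frequency: with
#  16 kHz audio the Nyquist frequency is 8000 Hz, so -400 gives a 7600 Hz cutoff,
#  as noted below.)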
--high-freq=-400     # high cutoff frequency, relative to Nyquist of 8000 (=7600)

--------------------------------------------------------------------------------
/librispeech/conf/online_cmvn.conf:
--------------------------------------------------------------------------------
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

--------------------------------------------------------------------------------
/librispeech/conf/online_pitch.conf:
--------------------------------------------------------------------------------
## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats,
## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given
## to programs like online2-wav-nnet2-latgen-faster.
## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that
## are the same as those which will be generated in online decoding; this enables us to train
## in a way that's compatible with online decoding.
##

## most of these options relate to the post-processing rather than the pitch
## extraction itself.
--add-raw-log-pitch=true   ## this is intended for input to neural nets, so our
                           ## approach is "throw everything in and see what
                           ## sticks".
--normalization-left-context=75
--normalization-right-context=50   # We're removing some of the right-context
                                   # for the normalization. Would normally be 75.
                                   #
                                   # Note: our changes to the (left,right) context
                                   # from the defaults of (75,75) to (75,50) will
                                   # almost certainly worsen results, but will
                                   # reduce latency.
--frames-per-chunk=10   ## relates to offline simulation of online decoding; 1
                        ## would be equivalent to getting in samples one by
                        ## one.
--simulate-first-pass-online=true   ## this makes the online-pitch-extraction code
                                    ## output the 'first-pass' features, which
                                    ## are less accurate than the final ones, and
                                    ## which are the only features the neural-net
                                    ## decoding would ever see (since we can't
                                    ## afford to do lattice rescoring in the
                                    ## neural-net code).

--------------------------------------------------------------------------------
/librispeech/conf/queue_no_k20.conf:
--------------------------------------------------------------------------------
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0          # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1  # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'

--------------------------------------------------------------------------------
/librispeech/conf/spec.conf:
--------------------------------------------------------------------------------
--preemphasis-coefficient=0.0
--remove-dc-offset=false
--round-to-power-of-two=false
--window-type=hanning

--------------------------------------------------------------------------------
/librispeech/decode.sh:
--------------------------------------------------------------------------------
#!/bin/bash

speech_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data

. ./cmd.sh
. ./path.sh

stage=0
subsampling=4
chaindir=exp/chain_blstm
model_dirname=blstm
checkpoint=240_300.mdl
acwt=1.0
testsets="dev_clean dev_other test_clean test_other"
decode_nj=80

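# Example: decode a single test set with a different checkpoint (illustrative;
# flag names are derived from the variables above by utils/parse_options.sh):
#   ./decode.sh --testsets "dev_clean" --checkpoint 160_220.mdl --chaindir exp/chain_wrn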
. ./utils/parse_options.sh

set -euo pipefail

tree=${chaindir}/tree
post_decode_acwt=`echo ${acwt} | awk '{print 10*$1}'`  # 10x the acoustic scale,
                                                       # as usual for chain-model lattices

# Make the decoding graph if it does not exist
if [ ! -f ${tree}/graph_tgsmall/HCLG.fst ]; then
  ./utils/mkgraph.sh --self-loop-scale 1.0 \
    data/lang_test_tgsmall ${tree} ${tree}/graph_tgsmall
fi

## Prepare the test sets if not already done
if [ ! 
-f data/dev_clean_fbank/mapped/feats.dat.1 ]; then 32 | ./local/prepare_test.sh --subsampling ${subsampling} --data ${speech_data} 33 | fi 34 | 35 | for ds in $testsets; do 36 | decode_nnet_pytorch.sh --min-lmwt 6 \ 37 | --max-lmwt 18 \ 38 | --checkpoint ${checkpoint} \ 39 | --acoustic-scale ${acwt} \ 40 | --post-decode-acwt ${post_decode_acwt} \ 41 | --nj ${decode_nj} \ 42 | data/${ds}_fbank exp/${model_dirname} \ 43 | ${tree}/graph_tgsmall exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds} 44 | 45 | echo ${decode_nj} > exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}/num_jobs 46 | ./steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ 47 | data/lang_test_{tgsmall,fglarge} \ 48 | data/${ds}_fbank exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}{,_fglarge_rescored} 49 | done 50 | 51 | -------------------------------------------------------------------------------- /librispeech/local/data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | if [ "$#" -ne 2 ]; then 8 | echo "Usage: $0 " 9 | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" 10 | exit 1 11 | fi 12 | 13 | src=$1 14 | dst=$2 15 | 16 | # all utterances are FLAC compressed 17 | if ! which flac >&/dev/null; then 18 | echo "Please install 'flac' on ALL worker nodes!" 19 | exit 1 20 | fi 21 | 22 | spk_file=$src/../SPEAKERS.TXT 23 | 24 | mkdir -p $dst || exit 1; 25 | 26 | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1; 27 | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; 28 | 29 | 30 | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp 31 | trans=$dst/text; [[ -f "$trans" ]] && rm $trans 32 | utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk 33 | spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender 34 | 35 | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do 36 | reader=$(basename $reader_dir) 37 | if ! [ $reader -eq $reader ]; then # not integer. 38 | echo "$0: unexpected subdirectory name $reader" 39 | exit 1; 40 | fi 41 | 42 | reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') 43 | if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then 44 | echo "Unexpected gender: '$reader_gender'" 45 | exit 1; 46 | fi 47 | 48 | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do 49 | chapter=$(basename $chapter_dir) 50 | if ! [ "$chapter" -eq "$chapter" ]; then 51 | echo "$0: unexpected chapter-subdirectory name $chapter" 52 | exit 1; 53 | fi 54 | 55 | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ 56 | awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 57 | 58 | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt 59 | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 60 | cat $chapter_trans >>$trans 61 | 62 | # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered 63 | # to be a different speaker. This is done for simplicity and because we want 64 | # e.g. 
the CMVN to be calculated per-chapter 65 | awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ 66 | <$chapter_trans >>$utt2spk || exit 1 67 | 68 | # reader -> gender map (again using per-chapter granularity) 69 | echo "${reader}-${chapter} $reader_gender" >>$spk2gender 70 | done 71 | done 72 | 73 | spk2utt=$dst/spk2utt 74 | utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 75 | 76 | ntrans=$(wc -l <$trans) 77 | nutt2spk=$(wc -l <$utt2spk) 78 | ! [ "$ntrans" -eq "$nutt2spk" ] && \ 79 | echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; 80 | 81 | utils/validate_data_dir.sh --no-feats $dst || exit 1; 82 | 83 | echo "$0: successfully prepared data in $dst" 84 | 85 | exit 0 86 | -------------------------------------------------------------------------------- /librispeech/local/download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2017 Daniel Povey 5 | # Apache 2.0 6 | 7 | if [ $# -ne "3" ]; then 8 | echo "Usage: $0 /dev/null | awk '{print $1}' || stat '-f %z' $f) 42 | if [[ "$fsize" -eq "$expect_size" ]]; then 43 | echo "'$fname' already exists and appears to be complete" 44 | return 0 45 | else 46 | echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." 47 | fi 48 | fi 49 | wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { 50 | echo "Error while trying to download $fname!" 51 | return 1 52 | } 53 | f=$dst_dir/$fname 54 | # In the following statement, the first version works on linux, and the part after '||' 55 | # works on Linux. 56 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 57 | [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } 58 | return 0 59 | } 60 | 61 | mkdir -p $dst_dir $local_dir 62 | 63 | for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz \ 64 | librispeech-vocab.txt librispeech-lexicon.txt; do 65 | check_and_download $f || exit 1 66 | done 67 | 68 | dst_dir=$(readlink -f $dst_dir) 69 | ln -sf $dst_dir/3-gram.pruned.1e-7.arpa.gz $local_dir/lm_tgmed.arpa.gz 70 | ln -sf $dst_dir/3-gram.pruned.3e-7.arpa.gz $local_dir/lm_tgsmall.arpa.gz 71 | ln -sf $dst_dir/3-gram.arpa.gz $local_dir/lm_tglarge.arpa.gz 72 | ln -sf $dst_dir/librispeech-lexicon.txt $local_dir/librispeech-lexicon.txt 73 | ln -sf $dst_dir/librispeech-vocab.txt $local_dir/librispeech-vocab.txt 74 | exit 0 75 | -------------------------------------------------------------------------------- /librispeech/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test time language model(G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh || exit 1; 10 | 11 | # begin configuration section 12 | src_dir=data/lang 13 | # end configuration section 14 | 15 | . utils/parse_options.sh || exit 1; 16 | 17 | set -e 18 | 19 | if [ $# -ne 1 ]; then 20 | echo "Usage: $0 " 21 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 22 | echo ", where:" 23 | echo " is the directory in which the language model is stored/downloaded" 24 | echo "Options:" 25 | echo " --src-dir # source lang directory, default data/lang" 26 | exit 1 27 | fi 28 | 29 | lm_dir=$1 30 | 31 | if [ ! 
-d $lm_dir ]; then 32 | echo "$0: expected source LM directory $lm_dir to exist" 33 | exit 1; 34 | fi 35 | if [ ! -f $src_dir/words.txt ]; then 36 | echo "$0: expected $src_dir/words.txt to exist." 37 | exit 1; 38 | fi 39 | 40 | 41 | tmpdir=data/local/lm_tmp.$$ 42 | trap "rm -r $tmpdir" EXIT 43 | 44 | mkdir -p $tmpdir 45 | 46 | for lm_suffix in tgsmall tgmed; do 47 | # tglarge is prepared by a separate command, called from run.sh; we don't 48 | # want to compile G.fst for tglarge, as it takes a while. 49 | test=${src_dir}_test_${lm_suffix} 50 | mkdir -p $test 51 | cp -r ${src_dir}/* $test 52 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 53 | arpa2fst --disambig-symbol=#0 \ 54 | --read-symbol-table=$test/words.txt - $test/G.fst 55 | utils/validate_lang.pl --skip-determinization-check $test || exit 1; 56 | done 57 | 58 | echo "Succeeded in formatting data." 59 | 60 | exit 0 61 | -------------------------------------------------------------------------------- /librispeech/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the dictionary and auto-generates the pronunciations for the words, 7 | # that are in our vocabulary but not in CMUdict 8 | 9 | stage=0 10 | nj=4 # number of parallel Sequitur G2P jobs, we would like to use 11 | cmd=run.pl 12 | 13 | 14 | . utils/parse_options.sh || exit 1; 15 | . ./path.sh || exit 1 16 | 17 | 18 | if [ $# -ne 3 ]; then 19 | echo "Usage: $0 [options] " 20 | echo "e.g.: /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" 21 | echo "Options:" 22 | echo " --cmd '' # script to launch jobs with, default: run.pl" 23 | echo " --nj # number of jobs to run, default: 4." 24 | exit 1 25 | fi 26 | 27 | lm_dir=$1 28 | g2p_model_dir=$2 29 | dst_dir=$3 30 | 31 | vocab=$lm_dir/librispeech-vocab.txt 32 | [ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; 33 | 34 | # this file is either a copy of the lexicon we download from openslr.org/11 or is 35 | # created by the G2P steps below 36 | lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt 37 | 38 | cmudict_dir=$dst_dir/cmudict 39 | cmudict_plain=$dst_dir/cmudict.0.7a.plain 40 | 41 | mkdir -p $dst_dir || exit 1; 42 | 43 | if [ $stage -le 0 ]; then 44 | echo "Downloading and preparing CMUdict" 45 | if [ ! -s $cmudict_dir/cmudict.0.7a ]; then 46 | svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; 47 | fi 48 | echo "Removing the pronunciation variant markers ..." 49 | grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ 50 | perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ 51 | > $cmudict_plain || exit 1; 52 | fi 53 | 54 | 55 | if [ $stage -le 1 ]; then 56 | # check if we have Sequitur G2P is installed 57 | if [ ! -f "$sequitur" ]; then 58 | if ! 
which swig >&/dev/null; then 59 | echo "Please install 'swig' and then run $KALDI_ROOT/tools/extra/install_sequitur.sh" 60 | exit 1 61 | else 62 | echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extra/install_sequitur.sh" 63 | pushd $KALDI_ROOT/tools 64 | extras/install_sequitur.sh || exit 1 65 | popd 66 | fi 67 | fi 68 | [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; } 69 | 70 | g2p_dir=$dst_dir/g2p 71 | auto_vocab_prefix="$g2p_dir/vocab_autogen" 72 | auto_lexicon_prefix="$g2p_dir/lexicon_autogen" 73 | 74 | mkdir -p $g2p_dir/log 75 | auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") 76 | awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ 77 | sort | tee $g2p_dir/vocab_autogen.full |\ 78 | utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 79 | echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." 80 | $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ 81 | local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 82 | g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) 83 | g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) 84 | [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } 85 | sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt 86 | sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt 87 | echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" 88 | fi 89 | 90 | if [ $stage -le 2 ]; then 91 | echo "Combining the CMUdict pronunciations with the autogenerated ones ..." 92 | awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ 93 | cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 94 | raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) 95 | vocab_size=$(wc -l <$vocab) 96 | [[ "$vocab_size" -eq "$raw_lex_size" ]] || { 97 | echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; 98 | exit 1; } 99 | echo "Combined lexicon saved to '$lexicon_raw_nosil'" 100 | fi 101 | 102 | # The copy operation below is necessary, if we skip the g2p stages(e.g. using --stage 3) 103 | if [[ ! -s "$lexicon_raw_nosil" ]]; then 104 | cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 105 | fi 106 | 107 | if [ $stage -le 3 ]; then 108 | silence_phones=$dst_dir/silence_phones.txt 109 | optional_silence=$dst_dir/optional_silence.txt 110 | nonsil_phones=$dst_dir/nonsilence_phones.txt 111 | extra_questions=$dst_dir/extra_questions.txt 112 | 113 | echo "Preparing phone lists and clustering questions" 114 | (echo SIL; echo SPN;) > $silence_phones 115 | echo SIL > $optional_silence 116 | # nonsilence phones; on each line is a list of phones that correspond 117 | # really to the same base phone. 118 | awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ 119 | sort -u |\ 120 | perl -e 'while(<>){ 121 | chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; 122 | $phones_of{$1} .= "$_ "; } 123 | foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ 124 | > $nonsil_phones || exit 1; 125 | # A few extra questions that will be added to those obtained by automatically clustering 126 | # the "real" phones. These ask about stress; there's also one for silence. 
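# For illustration (assuming the standard CMUdict phone set): the awk line
# below emits a single question listing the silence phones ("SIL SPN"), and
# the perl one-liner groups the non-silence phones by their stress digit, so
# one generated question might look like "AA1 AE1 AH1 ... ZH1" (all phones
# with primary stress).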
127 | cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; 128 | cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) { 129 | $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ 130 | >> $extra_questions || exit 1; 131 | echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" 132 | echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" 133 | echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" 134 | echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" 135 | fi 136 | 137 | if [ $stage -le 4 ]; then 138 | (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\ 139 | cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt 140 | echo "Lexicon text file saved as: $dst_dir/lexicon.txt" 141 | fi 142 | 143 | exit 0 144 | -------------------------------------------------------------------------------- /librispeech/local/prepare_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | data=/export/a15/vpanayotov/data 4 | subsampling=4 5 | 6 | . ./cmd.sh 7 | . ./path.sh 8 | 9 | . ./utils/parse_options.sh 10 | 11 | set -euo pipefail 12 | 13 | for part in dev-clean dev-other test-clean test-other; do 14 | echo "-------------- Making ${part} ----------------------" 15 | dataname=$(echo ${part} | sed s/-/_/g) 16 | local/data_prep.sh $data/LibriSpeech/${part} data/${dataname} 17 | ./utils/copy_data_dir.sh data/${dataname} data/${dataname}_fbank 18 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 \ 19 | data/${dataname}_fbank exp/make_fbank/${dataname} fbank 20 | ./utils/fix_data_dir.sh data/${dataname}_fbank 21 | ./steps/compute_cmvn_stats.sh data/${dataname}_fbank 22 | ./utils/fix_data_dir.sh data/${dataname}_fbank 23 | 24 | memmap_data.py data/${dataname}_fbank/feats.scp data/${dataname}_fbank/feats.scp.dat 25 | python local/prepare_unlabeled_tgt.py --subsample ${subsampling} \ 26 | data/${dataname}_fbank/utt2num_frames > data/${dataname}_fbank/pdfid.${subsampling}.tgt 27 | done 28 | 29 | exit 0; 30 | 31 | 32 | -------------------------------------------------------------------------------- /librispeech/local/score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 3 | # 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | [ -f ./path.sh ] && . ./path.sh 7 | 8 | # begin configuration section. 9 | cmd=run.pl 10 | stage=0 11 | decode_mbr=true 12 | word_ins_penalty=0.0,0.5,1.0 13 | min_lmwt=7 14 | max_lmwt=17 15 | iter=final 16 | #end configuration section. 17 | 18 | [ -f ./path.sh ] && . ./path.sh 19 | . parse_options.sh || exit 1; 20 | 21 | if [ $# -ne 3 ]; then 22 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" 23 | echo " Options:" 24 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 25 | echo " --stage (0|1|2) # start scoring script from part-way through." 26 | echo " --decode_mbr (true/false) # minimum Bayes risk decoding (confusion network)." 27 | echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring " 28 | echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring " 29 | exit 1; 30 | fi 31 | 32 | data=$1 33 | lang_or_graph=$2 34 | dir=$3 35 | 36 | symtab=$lang_or_graph/words.txt 37 | 38 | for f in $symtab $dir/lat.1.gz $data/text; do 39 | [ !
-f $f ] && echo "score.sh: no such file $f" && exit 1; 40 | done 41 | 42 | mkdir -p $dir/scoring/log 43 | 44 | cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt 45 | 46 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 47 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ 48 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 49 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 50 | lattice-best-path --word-symbol-table=$symtab \ 51 | ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; 52 | done 53 | 54 | # Note: the double level of quoting for the sed command 55 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 56 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ 57 | cat $dir/scoring/LMWT.$wip.tra \| \ 58 | utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ 59 | compute-wer --text --mode=present \ 60 | ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; 61 | done 62 | 63 | exit 0; 64 | -------------------------------------------------------------------------------- /librispeech/local/subset_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 4 | # Apache 2.0 5 | 6 | # The following commands were used to generate the mini_librispeech dataset: 7 | # 8 | # Note that data generation is random. This could be fixed by 9 | # providing a seed argument to the shuf program. 10 | 11 | if [ "$#" -ne 3 ]; then 12 | echo "Usage: $0 <src-dir> <dest-dir> <dest-num-hours>" 13 | echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ 14 | /export/a05/dgalvez/LibriSpeech/train-clean-5 5" 15 | exit 1 16 | fi 17 | 18 | src_dir=$1 19 | dest_dir=$2 20 | dest_num_hours=$3 21 | 22 | src=$(basename $src_dir) 23 | dest=$(basename $dest_dir) 24 | librispeech_dir=$(dirname $src_dir) 25 | 26 | # TODO: Possibly improve this to ensure gender balance and speaker 27 | # balance. 28 | # TODO: Use actual time values instead of this chapter-count estimate, to make sure we get $dest_num_hours of data 29 | src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \ 30 | python -c ' 31 | from __future__ import print_function 32 | from sys import stdin 33 | minutes_str = stdin.read().split() 34 | print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))') 35 | src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 36 | awk -F'|' '{ print $1 }' | sort -u | wc -l) 37 | mkdir -p data/subset_tmp 38 | grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 39 | awk -F'|' '{ print $1 }' | \ 40 | shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \ 41 | data/subset_tmp/${dest}_chapter_id_list.txt 42 | 43 | while read -r chapter_id || [[ -n "$chapter_id" ]]; do 44 | chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d) 45 | speaker_id=$(basename $(dirname $chapter_dir)) 46 | mkdir -p $dest_dir/$speaker_id/ 47 | cp -r $chapter_dir $dest_dir/$speaker_id/ 48 | done < data/subset_tmp/${dest}_chapter_id_list.txt 49 | -------------------------------------------------------------------------------- /librispeech/path.sh: -------------------------------------------------------------------------------- 1 | export ROOT=`pwd`/../tools 2 | export KALDI_ROOT=${ROOT}/kaldi 3 | [ -f $KALDI_ROOT/tools/env.sh ] && .
$KALDI_ROOT/tools/env.sh 4 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/openfst/bin:`pwd`/../nnet_pytorch:$PWD:$PATH:`pwd`/../nnet_pytorch/utils/ 5 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 6 | . $KALDI_ROOT/tools/config/common_path.sh 7 | export LC_ALL=C 8 | 9 | export OPENFST_PATH=${ROOT}/openfst #/PATH/TO/OPENFST 10 | export LD_LIBRARY_ORIG=${LD_LIBRARY_PATH} 11 | export LD_LIBRARY_PATH=${OPENFST_PATH}/lib:${LD_LIBRARY_PATH} 12 | #export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64 13 | 14 | export PYTHONPATH=${PYTHONPATH}:`pwd`/../nnet_pytorch/:`pwd`/../nnet_pytorch/utils/ 15 | export PYTHONUNBUFFERED=1 16 | source ${ROOT}/NeurIPS2020/bin/activate 17 | 18 | export LC_ALL=C 19 | 20 | -------------------------------------------------------------------------------- /librispeech/run-blstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=3 8 | traindir=data/train_960 9 | feat_affix=_fbank 10 | chaindir=exp/chain_blstm 11 | num_leaves=7000 12 | model_dirname=blstm 13 | batches_per_epoch=500 14 | num_epochs=300 15 | train_nj=4 16 | resume= 17 | num_split=20 # number of splits for memory-mapped data for training 18 | average=true 19 | 20 | . ./utils/parse_options.sh 21 | 22 | set -euo pipefail 23 | 24 | tree=${chaindir}/tree 25 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 26 | trainname=`basename ${traindir}` 27 | 28 | 29 | if [ $stage -le 1 ]; then 30 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 31 | lang=data/lang_chain 32 | cp -r data/lang $lang 33 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 34 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 35 | # Use our special topology... note that later on may have to tune this 36 | # topology. 37 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 38 | 39 | steps/nnet3/chain/build_tree.sh \ 40 | --frame-subsampling-factor ${subsampling} \ 41 | --context-opts "--context-width=2 --central-position=1" \ 42 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 43 | $lang exp/tri5b_ali_${trainname} ${tree} 44 | 45 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 46 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 47 | 48 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 49 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 50 | 51 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 52 | fi 53 | 54 | if [ $stage -le 2 ]; then 55 | echo "Dumping memory mapped features ..." 56 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 57 | fi 58 | 59 | # Multigpu training of Chain-WideResNet with optimizer state averaging 60 | if [ $stage -le 3 ]; then 61 | resume_opts= 62 | if [ ! 
-z $resume ]; then 63 | resume_opts="--resume ${resume}" 64 | fi 65 | 66 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 67 | train_async_parallel.sh ${resume_opts} \ 68 | --gpu true \ 69 | --objective LFMMI \ 70 | --denom-graph ${chaindir}/den.fst \ 71 | --num-pdfs ${num_pdfs} \ 72 | --subsample ${subsampling} \ 73 | --model ChainBLSTM \ 74 | --hdim 1024 \ 75 | --num-layers 6 \ 76 | --dropout 0.2 \ 77 | --prefinal-dim 512 \ 78 | --warmup 20000 \ 79 | --decay 1e-07 \ 80 | --xent 0.1 \ 81 | --l2 0.0001 \ 82 | --weight-decay 1e-07 \ 83 | --lr 0.0002 \ 84 | --batches-per-epoch ${batches_per_epoch} \ 85 | --num-epochs ${num_epochs} \ 86 | --validation-spks 0 \ 87 | --nj ${train_nj} \ 88 | "[ \ 89 | {\ 90 | 'data': '${traindir}${feat_affix}', \ 91 | 'tgt': '${targets}', \ 92 | 'batchsize': 32, 'chunk_width': 140, \ 93 | 'left_context': 10, 'right_context': 5, \ 94 | 'mean_norm': True, 'var_norm': 'norm' 95 | }\ 96 | ]" \ 97 | `dirname ${chaindir}`/${model_dirname} 98 | fi 99 | 100 | # Average the last 60 epochs 101 | if $average; then 102 | echo "Averaging the last few epochs ..." 103 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 240 300 104 | fi 105 | -------------------------------------------------------------------------------- /librispeech/run-wrn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=4 8 | traindir=data/train_960 9 | feat_affix=_fbank 10 | chaindir=exp/chain_wrn 11 | num_leaves=7000 12 | model_dirname=wrn 13 | batches_per_epoch=500 14 | num_epochs=300 15 | train_nj=4 16 | resume= 17 | num_split=20 # number of splits for memory-mapped data for training 18 | average=true 19 | 20 | . ./utils/parse_options.sh 21 | 22 | set -euo pipefail 23 | 24 | tree=${chaindir}/tree 25 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 26 | trainname=`basename ${traindir}` 27 | 28 | if [ $stage -le 1 ]; then 29 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 30 | lang=data/lang_chain 31 | cp -r data/lang $lang 32 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 33 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 34 | # Use our special topology... note that later on may have to tune this 35 | # topology. 36 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 37 | 38 | steps/nnet3/chain/build_tree.sh \ 39 | --frame-subsampling-factor ${subsampling} \ 40 | --context-opts "--context-width=2 --central-position=1" \ 41 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 42 | $lang exp/tri5b_ali_${trainname} ${tree} 43 | 44 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 45 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 46 | 47 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 48 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 49 | 50 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 51 | fi 52 | 53 | if [ $stage -le 2 ]; then 54 | echo "Dumping memory mapped features ..." 55 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 56 | fi 57 | 58 | # Multigpu training of Chain-WideResNet with optimizer state averaging 59 | if [ $stage -le 3 ]; then 60 | resume_opts= 61 | if [ ! 
-z $resume ]; then 62 | resume_opts="--resume ${resume}" 63 | fi 64 | 65 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 66 | ./local/train_async_parallel.sh ${resume_opts} \ 67 | --gpu true \ 68 | --objective LFMMI \ 69 | --denom-graph ${chaindir}/den.fst \ 70 | --num-pdfs ${num_pdfs} \ 71 | --subsample ${subsampling} \ 72 | --model ChainWideResnet \ 73 | --depth 28 \ 74 | --width 10 \ 75 | --warmup 20000 \ 76 | --decay 1e-05 \ 77 | --xent 0.05 \ 78 | --l2 0.0001 \ 79 | --weight-decay 1e-08 \ 80 | --lr 0.0002 \ 81 | --batches-per-epoch 500 \ 82 | --num-epochs 300 \ 83 | --validation-spks 0 \ 84 | --nj 4 \ 85 | "[ \ 86 | {\ 87 | 'data': '${traindir}${feat_affix}', \ 88 | 'tgt': '${targets}', \ 89 | 'batchsize': 32, 'chunk_width': 140, \ 90 | 'left_context': 10, 'right_context': 5, \ 91 | 'mean_norm': True, 'var_norm': 'norm' 92 | }\ 93 | ]" \ 94 | `dirname ${chaindir}`/${model_dirname} 95 | fi 96 | 97 | # Average the last 60 epochs 98 | if $average; then 99 | echo "Averaging the last few epochs ..." 100 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 240 300 101 | fi 102 | 103 | -------------------------------------------------------------------------------- /librispeech/steps: -------------------------------------------------------------------------------- 1 | ../tools/kaldi/egs/wsj/s5/steps -------------------------------------------------------------------------------- /librispeech/utils: -------------------------------------------------------------------------------- 1 | ../tools/kaldi/egs/wsj/s5/utils -------------------------------------------------------------------------------- /librispeech100/cmd.sh: -------------------------------------------------------------------------------- 1 | # you can change cmd.sh depending on what type of queue you are using. 2 | # If you have no queueing system and want to run on a local machine, you 3 | # can change all instances 'queue.pl' to run.pl (but be careful and run 4 | # commands one by one: most recipes will exhaust the memory on your 5 | # machine). queue.pl works with GridEngine (qsub). slurm.pl works 6 | # with slurm. Different queues are configured differently, with different 7 | # queue names and different ways of specifying things like memory; 8 | # to account for these differences you can create and edit the file 9 | # conf/queue.conf to match your queue's configuration. Search for 10 | # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, 11 | # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 12 | 13 | export train_cmd="queue.pl --mem 2G -l hostname='!b02*&!c24*&!c23*&!c27*&!c26*&!c25*&!a*'" 14 | export decode_cmd="queue.pl --mem 4G -l hostname='!b02*&!c06*&!c23*&!c24*&!c25*&!c26*&!c27*&!a*'" 15 | export mkgraph_cmd="queue.pl --mem 8G" 16 | -------------------------------------------------------------------------------- /librispeech100/conf/decode.config: -------------------------------------------------------------------------------- 1 | # empty config, just use the defaults. 
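# If anything does need overriding, Kaldi-style option lines (one per line,
# e.g. --beam=15.0 or --lattice-beam=8.0) could be added here in the same
# style as the other conf/*.conf files; which flags are honored depends on
# the decoding binary this config is passed to.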
2 | -------------------------------------------------------------------------------- /librispeech100/conf/fbank.conf: -------------------------------------------------------------------------------- 1 | --sample-frequency=16000 2 | --num-mel-bins=64 3 | -------------------------------------------------------------------------------- /librispeech100/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=c0[023456789]*|c1[012456789]*|c2[0126]*,gpu=$0' -q g.q 11 | -------------------------------------------------------------------------------- /librispeech100/conf/mfcc.conf: -------------------------------------------------------------------------------- 1 | --use-energy=false # only non-default option. 2 | -------------------------------------------------------------------------------- /librispeech100/conf/mfcc_hires.conf: -------------------------------------------------------------------------------- 1 | # config for high-resolution MFCC features, intended for neural network training 2 | # Note: we keep all cepstra, so it has the same info as filterbank features, 3 | # but MFCC is more easily compressible (because less correlated) which is why 4 | # we prefer this method. 5 | --use-energy=false # use average of log energy, not energy. 6 | --num-mel-bins=40 # similar to Google's setup. 7 | --num-ceps=40 # there is no dimensionality reduction. 8 | --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so 9 | # there might be some information at the low end. 10 | --high-freq=-400 # high cutoff frequency, relative to the Nyquist of 8000 (=7600) 11 | -------------------------------------------------------------------------------- /librispeech100/conf/online_cmvn.conf: -------------------------------------------------------------------------------- 1 | # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh 2 | -------------------------------------------------------------------------------- /librispeech100/conf/online_pitch.conf: -------------------------------------------------------------------------------- 1 | ## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats, 2 | ## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given 3 | ## to programs like online2-wav-nnet2-latgen-faster. 4 | ## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that 5 | ## are the same as those which will be generated in online decoding; this enables us to train 6 | ## in a way that's compatible with online decoding. 7 | ## 8 | 9 | ## most of these options relate to the post-processing rather than the pitch 10 | ## extraction itself. 11 | --add-raw-log-pitch=true ## this is intended for input to neural nets, so our 12 | ## approach is "throw everything in and see what 13 | ## sticks". 14 | --normalization-left-context=75 15 | --normalization-right-context=50 # We're removing some of the right-context 16 | # for the normalization. Would normally be 75.
17 | # 18 | # Note: our changes to the (left,right) context 19 | # from the defaults of (75,75) to (75,50) will 20 | # almost certainly worsen results, but will 21 | # reduce latency. 22 | --frames-per-chunk=10 ## relates to offline simulation of online decoding; 1 23 | ## would be equivalent to getting in samples one by 24 | ## one. 25 | --simulate-first-pass-online=true ## this makes the online-pitch-extraction code 26 | ## output the 'first-pass' features, which 27 | ## are less accurate than the final ones, and 28 | ## which are the only features the neural-net 29 | ## decoding would ever see (since we can't 30 | ## afford to do lattice rescoring in the 31 | ## neural-net code). 32 | -------------------------------------------------------------------------------- /librispeech100/conf/queue_no_k20.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 -q all.q 10 | option gpu=* -l gpu=$0 -q g.q 11 | default allow_k20=true 12 | option allow_k20=true 13 | option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' 14 | -------------------------------------------------------------------------------- /librispeech100/conf/spec.conf: -------------------------------------------------------------------------------- 1 | --preemphasis-coefficient=0.0 2 | --remove-dc-offset=false 3 | --round-to-power-of-two=false 4 | --window-type=hanning 5 | -------------------------------------------------------------------------------- /librispeech100/decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | speech_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 4 | 5 | . ./cmd.sh 6 | . ./path.sh 7 | 8 | stage=1 9 | subsampling=4 10 | chaindir=exp/chain 11 | model_dirname=model1 12 | checkpoint=180_220.mdl 13 | acwt=1.0 14 | testsets="dev_clean dev_other test_clean test_other" 15 | feat_affix="_fbank" 16 | decode_nj=80 17 | 18 | . ./utils/parse_options.sh 19 | 20 | tree=${chaindir}/tree 21 | post_decode_acwt=`echo ${acwt} | awk '{print 10*$1}'` 22 | 23 | # Prepare the test sets if not already done 24 | if [ $stage -le 0 ]; then 25 | if [ ! -f data/${testsets%% *}${feat_affix}/mapped/feats.dat.1 ]; then 26 | ./local/prepare_test.sh --subsampling ${subsampling} \ 27 | --testsets "${testsets}" \ 28 | --data ${speech_data} \ 29 | --feat-affix ${feat_affix} 30 | fi 31 | fi 32 | 33 | # Make the decoding graph if it does not exist 34 | if [ !
-f ${tree}/graph_tgsmall/HCLG.fst ]; then 35 | ./utils/mkgraph.sh --self-loop-scale 1.0 \ 36 | data/lang_test_tgsmall ${tree} ${tree}/graph_tgsmall 37 | fi 38 | 39 | for ds in $testsets; do 40 | decode_nnet_pytorch.sh --min-lmwt 6 \ 41 | --max-lmwt 18 \ 42 | --checkpoint ${checkpoint} \ 43 | --acoustic-scale ${acwt} \ 44 | --post-decode-acwt ${post_decode_acwt} \ 45 | --nj ${decode_nj} \ 46 | data/${ds}${feat_affix} exp/${model_dirname} \ 47 | ${tree}/graph_tgsmall exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds} 48 | 49 | echo ${decode_nj} > exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}/num_jobs 50 | ./steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ 51 | data/lang_test_{tgsmall,fglarge} \ 52 | data/${ds}${feat_affix} exp/${model_dirname}/decode_${checkpoint}_graph_${acwt}_${ds}{,_fglarge_rescored} 53 | done 54 | 55 | -------------------------------------------------------------------------------- /librispeech100/decorrupt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | idim=64 4 | chunk_width=100 5 | left_context=10 6 | right_context=5 7 | batchsize=32 8 | perturb="none" 9 | num_steps= 10 | 11 | . ./utils/parse_options.sh 12 | 13 | if [ $# -ne 3 ]; then 14 | echo "Usage: ./decorrupt.sh <datadir> <modeldir> <checkpoint>" 15 | exit 1; 16 | fi 17 | 18 | data=$1 19 | model=$2 20 | checkpoint=$3 21 | 22 | odir=${model}/decorrupt_${checkpoint} 23 | mkdir -p ${odir} 24 | 25 | num_steps_opts="" 26 | if [ ! -z $num_steps ]; then 27 | num_steps_opts="--num-steps ${num_steps}" 28 | fi 29 | 30 | train_cmd="utils/retry.pl utils/queue.pl --mem 2G --gpu 1 --config conf/gpu.conf" 31 | 32 | ${train_cmd} ${odir}/log decorrupt.py --gpu \ 33 | --datadir ${data} \ 34 | --modeldir ${model} \ 35 | --checkpoint ${checkpoint} \ 36 | --dumpdir ${odir} \ 37 | --idim ${idim} \ 38 | --chunk-width ${chunk_width} \ 39 | --left-context ${left_context} \ 40 | --right-context ${right_context} \ 41 | --batchsize ${batchsize} \ 42 | --perturb ${perturb} \ 43 | ${num_steps_opts} 44 | -------------------------------------------------------------------------------- /librispeech100/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | . ./cmd.sh 4 | 5 | stage=0 6 | subsampling=4 7 | chaindir=exp/chain_wrn 8 | model_dirname=wrn_semisup 9 | checkpoint=20.mdl 10 | top_k=10 11 | target="2697 2697 2697 2697 2697" 12 | left=10 13 | right=5 14 | chunk_width=20 15 | idim=80 16 | gpu=false 17 | 18 | . ./utils/parse_options.sh 19 | 20 | tree=${chaindir}/tree 21 | 22 | # Generation 23 | modeldir=`dirname ${chaindir}`/${model_dirname} 24 | gen_dir=${modeldir}/generate_cond_${checkpoint} 25 | mkdir -p ${gen_dir} 26 | 27 | gpu_opts= 28 | if $gpu; then 29 | gpu_opts="--gpu" 30 | generate_cmd="./utils/queue.pl --mem 2G --gpu 1 --config conf/gpu.conf ${gen_dir}/log" 31 | else 32 | generate_cmd="./utils/queue.pl --mem 2G ${gen_dir}/log" 33 | fi 34 | 35 | target_opts= 36 | if [ !
-z "$target" ]; then 37 | echo "Target: ${target}" 38 | target_opts="--target ${target}" 39 | generate_cmd="./utils/queue.pl --mem 2G --gpu 1 --config conf/gpu.conf ${gen_dir}/log" 40 | gpu_opts="--gpu" 41 | else 42 | gpu_opts= 43 | fi 44 | 45 | ${generate_cmd} generate_conditional_from_buffer.py \ 46 | ${gpu_opts} \ 47 | ${target_opts} \ 48 | --idim ${idim} \ 49 | --modeldir ${modeldir} --modelname ${checkpoint} \ 50 | --dumpdir ${gen_dir} --batchsize 32 \ 51 | --left-context ${left} --right-context ${right} --chunk-width ${chunk_width} \ 52 | --top-k ${top_k} 53 | -------------------------------------------------------------------------------- /librispeech100/local/data_prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2014 Johns Hopkins University (author: Daniel Povey) 5 | # Apache 2.0 6 | 7 | if [ "$#" -ne 2 ]; then 8 | echo "Usage: $0 " 9 | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" 10 | exit 1 11 | fi 12 | 13 | src=$1 14 | dst=$2 15 | 16 | # all utterances are FLAC compressed 17 | if ! which flac >&/dev/null; then 18 | echo "Please install 'flac' on ALL worker nodes!" 19 | exit 1 20 | fi 21 | 22 | spk_file=$src/../SPEAKERS.TXT 23 | 24 | mkdir -p $dst || exit 1; 25 | 26 | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1; 27 | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1; 28 | 29 | 30 | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp 31 | trans=$dst/text; [[ -f "$trans" ]] && rm $trans 32 | utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk 33 | spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender 34 | 35 | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do 36 | reader=$(basename $reader_dir) 37 | if ! [ $reader -eq $reader ]; then # not integer. 38 | echo "$0: unexpected subdirectory name $reader" 39 | exit 1; 40 | fi 41 | 42 | reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') 43 | if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then 44 | echo "Unexpected gender: '$reader_gender'" 45 | exit 1; 46 | fi 47 | 48 | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do 49 | chapter=$(basename $chapter_dir) 50 | if ! [ "$chapter" -eq "$chapter" ]; then 51 | echo "$0: unexpected chapter-subdirectory name $chapter" 52 | exit 1; 53 | fi 54 | 55 | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ 56 | awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 57 | 58 | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt 59 | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 60 | cat $chapter_trans >>$trans 61 | 62 | # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered 63 | # to be a different speaker. This is done for simplicity and because we want 64 | # e.g. 
the CMVN to be calculated per-chapter 65 | awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ 66 | <$chapter_trans >>$utt2spk || exit 1 67 | 68 | # reader -> gender map (again using per-chapter granularity) 69 | echo "${reader}-${chapter} $reader_gender" >>$spk2gender 70 | done 71 | done 72 | 73 | spk2utt=$dst/spk2utt 74 | utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 75 | 76 | ntrans=$(wc -l <$trans) 77 | nutt2spk=$(wc -l <$utt2spk) 78 | ! [ "$ntrans" -eq "$nutt2spk" ] && \ 79 | echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; 80 | 81 | utils/validate_data_dir.sh --no-feats $dst || exit 1; 82 | 83 | echo "$0: successfully prepared data in $dst" 84 | 85 | exit 0 86 | -------------------------------------------------------------------------------- /librispeech100/local/decode_nnet_pytorch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | 5 | batchsize=512 6 | checkpoint=final.mdl 7 | prior_scale=1.0 8 | prior_floor=-20.0 9 | prior_name="priors" 10 | min_active=200 11 | max_active=7000 12 | max_mem=50000000 13 | lattice_beam=8.0 14 | beam=15.0 15 | acoustic_scale=0.1 16 | post_decode_acwt=10.0 # 10.0 for chain systems, 1.0 for non-chain 17 | mean_var="(True, True)" 18 | 19 | min_lmwt=6 20 | max_lmwt=18 21 | nj=80 22 | stage=0 23 | 24 | . ./utils/parse_options.sh 25 | if [ $# -ne 4 ]; then 26 | echo "Usage: ./decode_nnet_pytorch.sh <data> <pytorch-model-dir> <graph-dir> <decode-dir>" 27 | echo " --batchsize ${batchsize} " 28 | echo " --checkpoint ${checkpoint} --prior-scale ${prior_scale} --prior-floor ${prior_floor} --prior-name ${prior_name}" 29 | echo " --min-active ${min_active} --max-active ${max_active}" 30 | echo " --max-mem ${max_mem} --lattice-beam ${lattice_beam}" 31 | echo " --beam ${beam} --acoustic-scale ${acoustic_scale} --post-decode-acwt ${post_decode_acwt}" 32 | echo " --nj ${nj}" 33 | exit 1; 34 | fi 35 | 36 | data=$1 37 | pytorch_model=$2 38 | graphdir=$3 39 | odir=$4 40 | 41 | # We assume the acoustic model (final.mdl, with the transition model) is 1 level above the graphdir 42 | amdir=`dirname ${graphdir}` 43 | trans_mdl=${amdir}/final.mdl 44 | words_file=${graphdir}/words.txt 45 | hclg=${graphdir}/HCLG.fst 46 | 47 | mkdir -p ${odir}/log 48 | 49 | decode_cmd="utils/queue.pl --mem 2G -l hostname='!b02*&!a*&!c06*&!c23*&!c24*&!c25*&!c26*&!c27*'" # The 'a' machines are just too slow 50 | if [ $stage -le 0 ]; then 51 | segments=${data}/segments 52 | if [ ! -f ${data}/segments ]; then 53 | echo "No segments file found.
Assuming wav.scp is indexed by utterance" 54 | segments=${data}/wav.scp 55 | fi 56 | 57 | ${decode_cmd} JOB=1:${nj} ${odir}/log/decode.JOB.log \ 58 | ./utils/split_scp.pl -j ${nj} \$\[JOB -1\] ${segments} \|\ 59 | decode.py --datadir ${data} \ 60 | --modeldir ${pytorch_model} \ 61 | --dumpdir ${odir} \ 62 | --checkpoint ${checkpoint} \ 63 | --prior-scale ${prior_scale} \ 64 | --prior-floor ${prior_floor} \ 65 | --prior-name ${prior_name} \ 66 | --words-file ${words_file} \ 67 | --trans-mdl ${trans_mdl} \ 68 | --hclg ${hclg} \ 69 | --min-active ${min_active} \ 70 | --max-active ${max_active} \ 71 | --lattice-beam ${lattice_beam} \ 72 | --beam ${beam} \ 73 | --acoustic-scale ${acoustic_scale} \ 74 | --post-decode-acwt ${post_decode_acwt} \ 75 | --job JOB \ 76 | --utt-subset /dev/stdin \ 77 | --batchsize ${batchsize} 78 | fi 79 | 80 | if [ $stage -le 1 ]; then 81 | ./local/score.sh --cmd "$decode_cmd" \ 82 | --min-lmwt ${min_lmwt} --max-lmwt ${max_lmwt} --word-ins-penalty 0.0 \ 83 | ${data} ${graphdir} ${odir} 84 | fi 85 | -------------------------------------------------------------------------------- /librispeech100/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 4 | # 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 5 | # Apache 2.0 6 | 7 | remove_archive=false 8 | 9 | if [ "$1" == --remove-archive ]; then 10 | remove_archive=true 11 | shift 12 | fi 13 | 14 | if [ $# -ne 3 ]; then 15 | echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>" 16 | echo "e.g.: $0 /export/a05/dgalvez/ www.openslr.org/resources/31 dev-clean-2" 17 | echo "With --remove-archive it will remove the archive after successfully un-tarring it." 18 | echo " <corpus-part> can be one of: dev-clean-2, test-clean-5, dev-other, test-other," 19 | echo " train-clean-100, train-clean-360, train-other-500." 20 | fi 21 | 22 | data=$1 23 | url=$2 24 | part=$3 25 | 26 | if [ ! -d "$data" ]; then 27 | echo "$0: no such directory $data" 28 | exit 1; 29 | fi 30 | 31 | data=$(readlink -f $data) 32 | 33 | part_ok=false 34 | list="dev-clean-2 train-clean-5" 35 | for x in $list; do 36 | if [ "$part" == $x ]; then part_ok=true; fi 37 | done 38 | if ! $part_ok; then 39 | echo "$0: expected <corpus-part> to be one of $list, but got '$part'" 40 | exit 1; 41 | fi 42 | 43 | if [ -z "$url" ]; then 44 | echo "$0: empty URL base." 45 | exit 1; 46 | fi 47 | 48 | if [ -f $data/LibriSpeech/$part/.complete ]; then 49 | echo "$0: data part $part was already successfully extracted, nothing to do." 50 | exit 0; 51 | fi 52 | 53 | 54 | #sizes="126046265 332747356" 55 | sizes="126046265 332954390" 56 | 57 | if [ -f $data/$part.tar.gz ]; then 58 | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') 59 | size_ok=false 60 | for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done 61 | if ! $size_ok; then 62 | echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" 63 | echo "does not equal the size of one of the archives." 64 | rm $data/$part.tar.gz 65 | else 66 | echo "$data/$part.tar.gz exists and appears to be complete." 67 | fi 68 | fi 69 | 70 | if [ ! -f $data/$part.tar.gz ]; then 71 | if ! which wget >/dev/null; then 72 | echo "$0: wget is not installed." 73 | exit 1; 74 | fi 75 | full_url=$url/$part.tar.gz 76 | echo "$0: downloading data from $full_url. This may take some time, please be patient." 77 | 78 | cd $data 79 | if !
wget --no-check-certificate $full_url; then 80 | echo "$0: error executing wget $full_url" 81 | exit 1 82 | fi 83 | cd - 84 | fi 85 | 86 | cd $data 87 | 88 | if ! tar -xvzf $part.tar.gz; then 89 | echo "$0: error un-tarring archive $data/$part.tar.gz" 90 | exit 1 91 | fi 92 | 93 | touch $data/LibriSpeech/$part/.complete 94 | 95 | echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" 96 | 97 | if $remove_archive; then 98 | echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 99 | rm $data/$part.tar.gz 100 | fi 101 | -------------------------------------------------------------------------------- /librispeech100/local/download_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # 2017 Daniel Povey 5 | # Apache 2.0 6 | 7 | if [ $# -ne "3" ]; then 8 | echo "Usage: $0 <base-url> <dst-dir> <local-dir>" 42 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 43 | if [[ "$fsize" -eq "$expect_size" ]]; then 44 | echo "'$fname' already exists and appears to be complete" 45 | return 0 46 | else 47 | echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." 48 | fi 49 | fi 50 | wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { 51 | echo "Error while trying to download $fname!" 52 | return 1 53 | } 54 | f=$dst_dir/$fname 55 | # In the following statement, the first version works on Linux, and the part after '||' 56 | # works on macOS (BSD stat). 57 | fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) 58 | [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } 59 | return 0 60 | } 61 | 62 | mkdir -p $dst_dir $local_dir 63 | 64 | for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \ 65 | librispeech-vocab.txt librispeech-lexicon.txt; do 66 | check_and_download $f || exit 1 67 | done 68 | 69 | dst_dir=$(readlink -f $dst_dir) 70 | ln -sf $dst_dir/3-gram.pruned.1e-7.arpa.gz $local_dir/lm_tgmed.arpa.gz 71 | ln -sf $dst_dir/3-gram.pruned.3e-7.arpa.gz $local_dir/lm_tgsmall.arpa.gz 72 | ln -sf $dst_dir/3-gram.arpa.gz $local_dir/lm_tglarge.arpa.gz 73 | ln -sf $dst_dir/4-gram.arpa.gz $local_dir/lm_fglarge.arpa.gz 74 | ln -sf $dst_dir/librispeech-lexicon.txt $local_dir/librispeech-lexicon.txt 75 | ln -sf $dst_dir/librispeech-vocab.txt $local_dir/librispeech-vocab.txt 76 | exit 0 77 | -------------------------------------------------------------------------------- /librispeech100/local/format_lms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the test-time language model (G) transducers 7 | # (adapted from wsj/s5/local/wsj_format_data.sh) 8 | 9 | . ./path.sh || exit 1; 10 | 11 | # begin configuration section 12 | src_dir=data/lang 13 | # end configuration section 14 | 15 | . utils/parse_options.sh || exit 1; 16 | 17 | set -e 18 | 19 | if [ $# -ne 1 ]; then 20 | echo "Usage: $0 <lm-dir>" 21 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm" 22 | echo ", where:" 23 | echo " <lm-dir> is the directory in which the language model is stored/downloaded" 24 | echo "Options:" 25 | echo " --src-dir <dir> # source lang directory, default data/lang" 26 | exit 1 27 | fi 28 | 29 | lm_dir=$1 30 | 31 | if [ ! -d $lm_dir ]; then 32 | echo "$0: expected source LM directory $lm_dir to exist" 33 | exit 1; 34 | fi 35 | if [ !
-f $src_dir/words.txt ]; then 36 | echo "$0: expected $src_dir/words.txt to exist." 37 | exit 1; 38 | fi 39 | 40 | 41 | tmpdir=data/local/lm_tmp.$$ 42 | trap "rm -r $tmpdir" EXIT 43 | 44 | mkdir -p $tmpdir 45 | 46 | for lm_suffix in tgsmall tgmed; do 47 | # tglarge is prepared by a separate command, called from run.sh; we don't 48 | # want to compile G.fst for tglarge, as it takes a while. 49 | test=${src_dir}_test_${lm_suffix} 50 | mkdir -p $test 51 | cp -r ${src_dir}/* $test 52 | gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ 53 | arpa2fst --disambig-symbol=#0 \ 54 | --read-symbol-table=$test/words.txt - $test/G.fst 55 | utils/validate_lang.pl --skip-determinization-check $test || exit 1; 56 | done 57 | 58 | echo "Succeeded in formatting data." 59 | 60 | exit 0 61 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2014 Vassil Panayotov 4 | # Apache 2.0 5 | 6 | # Prepares the dictionary and auto-generates the pronunciations for the words 7 | # that are in our vocabulary but not in CMUdict 8 | 9 | stage=0 10 | nj=4 # number of parallel Sequitur G2P jobs we would like to use 11 | cmd=run.pl 12 | 13 | 14 | . utils/parse_options.sh || exit 1; 15 | . ./path.sh || exit 1 16 | 17 | 18 | if [ $# -ne 3 ]; then 19 | echo "Usage: $0 [options] <lm-dir> <g2p-model-dir> <dst-dir>" 20 | echo "e.g.: $0 /export/a15/vpanayotov/data/lm /export/a15/vpanayotov/data/g2p data/local/dict" 21 | echo "Options:" 22 | echo " --cmd '<command>' # script to launch jobs with, default: run.pl" 23 | echo " --nj <nj> # number of jobs to run, default: 4." 24 | exit 1 25 | fi 26 | 27 | lm_dir=$1 28 | g2p_model_dir=$2 29 | dst_dir=$3 30 | 31 | vocab=$lm_dir/librispeech-vocab.txt 32 | [ ! -f $vocab ] && echo "$0: vocabulary file not found at $vocab" && exit 1; 33 | 34 | # this file is either a copy of the lexicon we download from openslr.org/11 or is 35 | # created by the G2P steps below 36 | lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt 37 | 38 | cmudict_dir=$dst_dir/cmudict 39 | cmudict_plain=$dst_dir/cmudict.0.7a.plain 40 | 41 | mkdir -p $dst_dir || exit 1; 42 | 43 | if [ $stage -le 0 ]; then 44 | echo "Downloading and preparing CMUdict" 45 | if [ ! -s $cmudict_dir/cmudict.0.7a ]; then 46 | svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1; 47 | fi 48 | echo "Removing the pronunciation variant markers ..." 49 | grep -v ';;;' $cmudict_dir/cmudict.0.7a | \ 50 | perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ 51 | > $cmudict_plain || exit 1; 52 | fi 53 | 54 | 55 | if [ $stage -le 1 ]; then 56 | # check if Sequitur G2P is installed 57 | if [ ! -f "$sequitur" ]; then 58 | if !
which swig >&/dev/null; then 59 | echo "Please install 'swig' and then run $KALDI_ROOT/tools/extra/install_sequitur.sh" 60 | exit 1 61 | else 62 | echo "Sequitur G2P not found- running $KALDI_ROOT/tools/extra/install_sequitur.sh" 63 | pushd $KALDI_ROOT/tools 64 | extras/install_sequitur.sh || exit 1 65 | popd 66 | fi 67 | fi 68 | [[ -f "$sequitur" ]] || { echo "Still can't find Sequitur G2P- check your path.sh"; exit 1; } 69 | 70 | g2p_dir=$dst_dir/g2p 71 | auto_vocab_prefix="$g2p_dir/vocab_autogen" 72 | auto_lexicon_prefix="$g2p_dir/lexicon_autogen" 73 | 74 | mkdir -p $g2p_dir/log 75 | auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") 76 | awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ 77 | sort | tee $g2p_dir/vocab_autogen.full |\ 78 | utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 79 | echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." 80 | $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ 81 | local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 82 | g2p_vocab_size=$(wc -l <$g2p_dir/vocab_autogen.full) 83 | g2p_lex_size=$(wc -l < <(cat $auto_lexicon_prefix.*)) 84 | [[ "$g2p_vocab_size" -eq "$g2p_lex_size" ]] || { echo "Unexpected G2P error"; exit 1; } 85 | sort <(cat $auto_vocab_prefix.*) >$dst_dir/vocab_autogen.txt 86 | sort <(cat $auto_lexicon_prefix.*) >$dst_dir/lexicon_autogen.txt 87 | echo "$(wc -l <$g2p_dir/vocab_autogen.full) pronunciations autogenerated OK" 88 | fi 89 | 90 | if [ $stage -le 2 ]; then 91 | echo "Combining the CMUdict pronunciations with the autogenerated ones ..." 92 | awk 'NR==FNR{a[$1]=1; next} ($1 in a)' $vocab $cmudict_plain |\ 93 | cat - $dst_dir/lexicon_autogen.txt | sort >$lexicon_raw_nosil || exit 1 94 | raw_lex_size=$(cat $lexicon_raw_nosil | awk '{print $1}' | sort -u | wc -l) 95 | vocab_size=$(wc -l <$vocab) 96 | [[ "$vocab_size" -eq "$raw_lex_size" ]] || { 97 | echo "Inconsistent lexicon($raw_lex_size) vs vocabulary($vocab_size) size!"; 98 | exit 1; } 99 | echo "Combined lexicon saved to '$lexicon_raw_nosil'" 100 | fi 101 | 102 | # The copy operation below is necessary, if we skip the g2p stages(e.g. using --stage 3) 103 | if [[ ! -s "$lexicon_raw_nosil" ]]; then 104 | cp $lm_dir/librispeech-lexicon.txt $lexicon_raw_nosil || exit 1 105 | fi 106 | 107 | if [ $stage -le 3 ]; then 108 | silence_phones=$dst_dir/silence_phones.txt 109 | optional_silence=$dst_dir/optional_silence.txt 110 | nonsil_phones=$dst_dir/nonsilence_phones.txt 111 | extra_questions=$dst_dir/extra_questions.txt 112 | 113 | echo "Preparing phone lists and clustering questions" 114 | (echo SIL; echo SPN;) > $silence_phones 115 | echo SIL > $optional_silence 116 | # nonsilence phones; on each line is a list of phones that correspond 117 | # really to the same base phone. 118 | awk '{for (i=2; i<=NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i}}' $lexicon_raw_nosil |\ 119 | sort -u |\ 120 | perl -e 'while(<>){ 121 | chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; 122 | $phones_of{$1} .= "$_ "; } 123 | foreach $list (values %phones_of) {print $list . "\n"; } ' | sort \ 124 | > $nonsil_phones || exit 1; 125 | # A few extra questions that will be added to those obtained by automatically clustering 126 | # the "real" phones. These ask about stress; there's also one for silence. 
127 | cat $silence_phones| awk '{printf("%s ", $1);} END{printf "\n";}' > $extra_questions || exit 1; 128 | cat $nonsil_phones | perl -e 'while(<>){ foreach $p (split(" ", $_)) { 129 | $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ 130 | >> $extra_questions || exit 1; 131 | echo "$(wc -l <$silence_phones) silence phones saved to: $silence_phones" 132 | echo "$(wc -l <$optional_silence) optional silence saved to: $optional_silence" 133 | echo "$(wc -l <$nonsil_phones) non-silence phones saved to: $nonsil_phones" 134 | echo "$(wc -l <$extra_questions) extra triphone clustering-related questions saved to: $extra_questions" 135 | fi 136 | 137 | if [ $stage -le 4 ]; then 138 | (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |\ 139 | cat - $lexicon_raw_nosil | sort | uniq >$dst_dir/lexicon.txt 140 | echo "Lexicon text file saved as: $dst_dir/lexicon.txt" 141 | fi 142 | 143 | exit 0 144 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_librilight.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | . ./cmd.sh 5 | 6 | if [ $# -ne 1 ]; then 7 | echo "Usage: ./local/prepare_librilight.sh <data-dir>" 8 | exit 1; 9 | fi 10 | 11 | data=$1 12 | # Get librilight set 13 | wget https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz 14 | tar -xvf librispeech_finetuning.tgz && mv librispeech_finetuning ${data} 15 | 16 | # The following are the data subsets: 17 | # 1h/{0..5}/{clean,other} 18 | # 9h/{clean,other} 19 | # 20 | # In each of these subsets there are speaker directories named with a speaker-id. 21 | # Inside each directory are more directories corresponding to a recording-id. 22 | # Within each speaker-id/recording-id subdirectory are the .flac audio files 23 | # corresponding to speech utterances, as well as a .trans.txt file that has 24 | # the transcription. 25 | 26 | find -L $data -name "*.flac" # sanity check: list the extracted .flac files 27 | 28 | for part in 1h/{0..5}/{clean,other} 9h/{clean,other}; do 29 | dataname=$(echo ${part} | sed 's/\//_/g') 30 | ./local/prepare_librilight_dataset.sh ${data}/${part} data/train_${dataname} 31 | done 32 | 33 | ./utils/combine_data.sh \ 34 | data/train_10h data/train_1h_{0..5}_{clean,other} data/train_9h_{clean,other} 35 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_librilight_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | .
./cmd.sh 5 | 6 | if [ $# -ne 2 ]; then 7 | echo "Usage: ./local/prepare_librilight_dataset.sh <data> <kaldi-data-dir>" 8 | exit 1; 9 | fi 10 | 11 | data=$1 12 | kaldi_data=$2 13 | 14 | data=$(./utils/make_absolute.sh ${data}) 15 | mkdir -p $kaldi_data 16 | files=( `find -L ${data} -name "*.flac"` ) 17 | 18 | for f in ${files[@]}; do 19 | fname=`basename $f` 20 | fname=${fname%%.flac} 21 | echo "${fname} flac -c -d -s ${f} |" 22 | done | sort > ${kaldi_data}/wav.scp 23 | 24 | paste -d' ' <(awk '{print $1}' ${kaldi_data}/wav.scp) \ 25 | <(awk '{print $1}' ${kaldi_data}/wav.scp | cut -d'-' -f1) \ 26 | > ${kaldi_data}/utt2spk 27 | 28 | ./utils/utt2spk_to_spk2utt.pl ${kaldi_data}/utt2spk > ${kaldi_data}/spk2utt 29 | 30 | cat `find -L ${data} -name "*.trans.txt"` | sort > ${kaldi_data}/text 31 | exit 0; 32 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ./path.sh 3 | . ./cmd.sh 4 | 5 | data="/export/corpora5" 6 | subsampling=4 7 | num_split=20 8 | testsets="dev-clean dev-other test-clean test-other" 9 | feat_affix=_fbank 10 | standard_split=false 11 | 12 | . ./utils/parse_options.sh 13 | 14 | for part in $testsets; do 15 | echo "-------------- Making ${part} ----------------------" 16 | dataname=$(echo ${part} | sed s/-/_/g) 17 | part=$(echo ${part} | sed s/_/-/g) 18 | if $standard_split; then 19 | local/data_prep.sh $data/LibriSpeech/${part} data/${dataname} 20 | else 21 | echo "Assuming the testset ${part} is manually created and exists ..." 22 | fi 23 | ./utils/copy_data_dir.sh data/${dataname} data/${dataname}${feat_affix} 24 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 \ 25 | data/${dataname}${feat_affix} exp/make_fbank/${dataname}${feat_affix} ${feat_affix##_} 26 | ./utils/fix_data_dir.sh data/${dataname}${feat_affix} 27 | ./steps/compute_cmvn_stats.sh data/${dataname}${feat_affix} 28 | ./utils/fix_data_dir.sh data/${dataname}${feat_affix} 29 | 30 | prepare_unlabeled_tgt.py --subsample ${subsampling} \ 31 | data/${dataname}${feat_affix}/utt2num_frames > data/${dataname}${feat_affix}/pdfid.${subsampling}.tgt 32 | split_memmap_data.sh data/${dataname}${feat_affix} data/${dataname}${feat_affix}/pdfid.${subsampling}.tgt $num_split 33 | done 34 | 35 | 36 | -------------------------------------------------------------------------------- /librispeech100/local/prepare_unlabeled_tgt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner) 4 | # Apache 2.0 5 | 6 | from __future__ import print_function 7 | import argparse 8 | import sys 9 | import os 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('utt2num_frames', 15 | help='utt2num_frames file, as produced by Kaldi', 16 | type=str 17 | ) 18 | parser.add_argument('--subsample', type=int, default=1) 19 | 20 | args = parser.parse_args() 21 | 22 | with open(args.utt2num_frames, 'r') as f: 23 | for l in f: 24 | utt, frames = l.strip().split(None, 1) 25 | print(utt, end='') 26 | num_frames = len(range(0, int(frames), args.subsample)) 27 | print(' -1' * num_frames) 28 | 29 | if __name__ == "__main__": 30 | main() 31 | 32 | -------------------------------------------------------------------------------- /librispeech100/local/score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #
Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 3 | # 2014 Guoguo Chen 4 | # Apache 2.0 5 | 6 | [ -f ./path.sh ] && . ./path.sh 7 | 8 | # begin configuration section. 9 | cmd=run.pl 10 | stage=0 11 | decode_mbr=true 12 | word_ins_penalty=0.0,0.5,1.0 13 | min_lmwt=7 14 | max_lmwt=17 15 | iter=final 16 | #end configuration section. 17 | 18 | [ -f ./path.sh ] && . ./path.sh 19 | . parse_options.sh || exit 1; 20 | 21 | if [ $# -ne 3 ]; then 22 | echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" 23 | echo " Options:" 24 | echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 25 | echo " --stage (0|1|2) # start scoring script from part-way through." 26 | echo " --decode_mbr (true/false) # minimum Bayes risk decoding (confusion network)." 27 | echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring " 28 | echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring " 29 | exit 1; 30 | fi 31 | 32 | data=$1 33 | lang_or_graph=$2 34 | dir=$3 35 | 36 | symtab=$lang_or_graph/words.txt 37 | 38 | for f in $symtab $dir/lat.1.gz $data/text; do 39 | [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; 40 | done 41 | 42 | mkdir -p $dir/scoring/log 43 | 44 | cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt 45 | 46 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 47 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ 48 | lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ 49 | lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ 50 | lattice-best-path --word-symbol-table=$symtab \ 51 | ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; 52 | done 53 | 54 | # Note: the double level of quoting for the sed command 55 | for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do 56 | $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ 57 | cat $dir/scoring/LMWT.$wip.tra \| \ 58 | utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ 59 | compute-wer --text --mode=present \ 60 | ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; 61 | done 62 | 63 | exit 0; 64 | -------------------------------------------------------------------------------- /librispeech100/local/split_memmap_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./path.sh 4 | . ./cmd.sh 5 | 6 | . ./utils/parse_options.sh 7 | if [ $# -ne 2 ]; then 8 | echo "Usage: ./local/split_memmap_data.sh <datadir> <num-split>" 9 | exit 1; 10 | fi 11 | 12 | datadir=$1 13 | num_split=$2 14 | 15 | dataname=`basename ${datadir}` 16 | mapped_dir=${datadir}/mapped # don't change this path 17 | mkdir -p $mapped_dir 18 | echo "$0: Splitting data into $num_split parts" 19 | # spread the mapped numpy arrays over various machines, as this data-set is quite large. 20 | if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then 21 | utils/create_split_dir.pl /export/b{11,12,13,14}/$USER/kaldi-data/egs/librispeech100/$mapped_dir/storage \ 22 | $mapped_dir/storage 23 | fi 24 | utils/split_data.sh ${datadir} $num_split 25 | for n in $(seq $num_split); do 26 | # the next command does nothing unless $mapped_dir/storage/ exists, see 27 | # utils/create_data_link.pl for more info.
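# (For reference: create_data_link.pl turns feats.dat.$n into a symlink to a
# file under one of the $mapped_dir/storage/ directories set up above by
# create_split_dir.pl, chosen round-robin, so the memory-mapped shards get
# spread across disks; if storage/ was never created it leaves the path alone.)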
28 | utils/create_data_link.pl $mapped_dir/feats.dat.$n 29 | done 30 | $train_cmd JOB=1:$num_split exp/make_fbank/${dataname}/memmap_data.JOB.log \ 31 | memmap_data.py ${datadir}/split${num_split}/JOB/feats.scp $mapped_dir/feats.dat.JOB \ 32 | $mapped_dir/metadata.JOB 33 | echo $num_split > ${datadir}/num_split 34 | 35 | -------------------------------------------------------------------------------- /librispeech100/local/subset_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 4 | # Apache 2.0 5 | 6 | # The following commands were used to generate the mini_librispeech dataset: 7 | # 8 | # Note that data generation is random. This could be fixed by 9 | # providing a seed argument to the shuf program. 10 | 11 | if [ "$#" -ne 3 ]; then 12 | echo "Usage: $0 " 13 | echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ 14 | /export/a05/dgalvez/LibriSpeech/train-clean-5 5" 15 | exit 1 16 | fi 17 | 18 | src_dir=$1 19 | dest_dir=$2 20 | dest_num_hours=$3 21 | 22 | src=$(basename $src_dir) 23 | dest=$(basename $dest_dir) 24 | librispeech_dir=$(dirname $src_dir) 25 | 26 | # TODO: Possibly improve this to ensure gender balance and speaker 27 | # balance. 28 | # TODO: Use actual time values instead of assuming that to make sure we get $dest_num_hours of data 29 | src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \ 30 | python -c ' 31 | from __future__ import print_function 32 | from sys import stdin 33 | minutes_str = stdin.read().split() 34 | print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))') 35 | src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 36 | awk -F'|' '{ print $1 }' | sort -u | wc -l) 37 | mkdir -p data/subset_tmp 38 | grep "$src" $librispeech_dir/CHAPTERS.TXT | \ 39 | awk -F'|' '{ print $1 }' | \ 40 | shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \ 41 | data/subset_tmp/${dest}_chapter_id_list.txt 42 | 43 | while read -r chapter_id || [[ -n "$chapter_id" ]]; do 44 | chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d) 45 | speaker_id=$(basename $(dirname $chapter_dir)) 46 | mkdir -p $dest_dir/$speaker_id/ 47 | cp -r $chapter_dir $dest_dir/$speaker_id/ 48 | done < data/subset_tmp/${dest}_chapter_id_list.txt 49 | -------------------------------------------------------------------------------- /librispeech100/path.sh: -------------------------------------------------------------------------------- 1 | export ROOT=`pwd`/../tools 2 | export KALDI_ROOT=${ROOT}/kaldi 3 | [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 4 | export PATH=$PWD/utils/:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/openfst/bin:`pwd`/../nnet_pytorch:$PWD:$PATH:`pwd`/../nnet_pytorch/utils/ 5 | [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 6 | . 
$KALDI_ROOT/tools/config/common_path.sh 7 | export LC_ALL=C 8 | 9 | export OPENFST_PATH=${ROOT}/openfst #/PATH/TO/OPENFST 10 | export LD_LIBRARY_ORIG=${LD_LIBRARY_PATH} 11 | export LD_LIBRARY_PATH=${OPENFST_PATH}/lib:${LD_LIBRARY_PATH} 12 | #export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64 13 | 14 | export PYTHONPATH=${PYTHONPATH}:`pwd`/../nnet_pytorch/:`pwd`/../nnet_pytorch/utils/ 15 | export PYTHONUNBUFFERED=1 16 | source ${ROOT}/NeurIPS2020/bin/activate 17 | 18 | export LC_ALL=C 19 | 20 | -------------------------------------------------------------------------------- /librispeech100/run-semisup-wrn-scratch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is based almost entirely on the Kaldi librispeech recipe 4 | # Change this location to somewhere where you want to put the data. 5 | # This recipe ASSUMES YOU HAVE DOWNLOADED the Librispeech data 6 | unlabeled_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 7 | 8 | . ./cmd.sh 9 | . ./path.sh 10 | 11 | stage=1 12 | subsampling=4 13 | traindir=data/train_100h 14 | unsupdir=data/train_860h 15 | feat_affix=_fbank_64 16 | chaindir=exp/chain 17 | model_dirname=wrn_semisup 18 | batches_per_epoch=250 19 | num_epochs=240 20 | delay=2 21 | train_nj_init=2 22 | train_nj_final=6 23 | ebm_weight=1.0 24 | ebm_type="uncond" 25 | ebm_tgt=data/train_100h_fbank_64/pdfid.4.tgt 26 | sgld_opt=adam 27 | sgld_stepsize=1.0 28 | sgld_maxsteps=50.0 29 | sgld_minsteps=1 30 | sgld_replay=1.0 31 | sgld_noise=0.001 32 | sgld_weight_decay=1e-10 33 | sgld_decay=1e-04 34 | sgld_warmup=15000 35 | sgld_clip=1.0 36 | sgld_init_val=1.5 37 | sgld_epsilon=1e-04 38 | lr=0.0002 39 | xent=0.1 40 | l2=0.0001 41 | leaky_hmm=0.1 42 | l2_energy=0.001 43 | warmup=15000 44 | unsup_num_repeats=1 45 | unsup_batchsize=32 46 | unsup_chunkwidth=50 47 | unsup_left=10 48 | unsup_right=5 49 | mean_norm=True 50 | var_norm=True 51 | perturb="gauss 0.01" 52 | depth=28 53 | width=10 54 | seed=0 55 | resume= 56 | num_split=80 # number of splits for memory-mapped data for training 57 | . ./utils/parse_options.sh 58 | 59 | set -euo pipefail 60 | 61 | tree=${chaindir}/tree 62 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 63 | trainname=`basename ${traindir}` 64 | 65 | # Make the unlabeled data 66 | if [ $stage -le 0 ]; then 67 | for part in train-clean-360 train-other-500; do 68 | local/data_prep.sh $unlabeled_data/LibriSpeech/${part} data/$(echo ${part} | sed s/-/_/g) 69 | done 70 | 71 | ./utils/combine_data.sh data/train_860 data/train_{clean_360,other_500} 72 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 data/train_860 exp/make_fbank/train_860 ${feat_affix##_} 73 | ./utils/fix_data_dir.sh data/train_860 74 | ./steps/compute_cmvn_stats.sh data/train_860 75 | ./utils/fix_data_dir.sh data/train_860 76 | 77 | python prepare_unlabeled_tgt.py --subsample ${subsampling} data/train_860/utt2num_frames > data/train_860/pdfid.${subsampling}.unsup.tgt 78 | split_memmap_data.sh data/train_860 data/train_860/pdfid.${subsampling}.tgt ${num_split} 79 | fi 80 | 81 | 82 | # We use a lower learning rate in order to prevent the model from forgetting 83 | # too much. 84 | if [ $stage -eq 1 ]; then 85 | resume_opts= 86 | if [ ! 
-z $resume ]; then 87 | resume_opts="--resume ${resume}" 88 | fi 89 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 90 | idim=$(feat-to-dim scp:${traindir}${feat_affix}/feats.scp -) 91 | train_async_parallel.sh ${resume_opts} \ 92 | --gpu true \ 93 | --objective SemisupLFMMI \ 94 | --denom-graph ${chaindir}/den.fst \ 95 | --num-pdfs ${num_pdfs} \ 96 | --idim ${idim} \ 97 | --subsample ${subsampling} \ 98 | --model ChainWideResnet \ 99 | --depth ${depth} \ 100 | --width ${width} \ 101 | --warmup ${warmup} \ 102 | --decay 1e-05 \ 103 | --xent ${xent} \ 104 | --l2 ${l2} \ 105 | --leaky-hmm ${leaky_hmm} \ 106 | --weight-decay 1e-07 \ 107 | --lr ${lr} \ 108 | --batches-per-epoch ${batches_per_epoch} \ 109 | --num-epochs ${num_epochs} \ 110 | --validation-spks 0 \ 111 | --sgld-thresh 0.0 \ 112 | --sgld-reinit-p 0.05 \ 113 | --sgld-buffer 10000 \ 114 | --sgld-stepsize ${sgld_stepsize} \ 115 | --sgld-steps ${sgld_minsteps} \ 116 | --sgld-max-steps ${sgld_maxsteps} \ 117 | --sgld-noise ${sgld_noise} \ 118 | --sgld-decay ${sgld_decay} \ 119 | --sgld-real-decay 0.0 \ 120 | --sgld-clip ${sgld_clip} \ 121 | --sgld-warmup ${sgld_warmup} \ 122 | --sgld-optim ${sgld_opt} \ 123 | --sgld-init-val ${sgld_init_val} \ 124 | --sgld-epsilon ${sgld_epsilon} \ 125 | --sgld-replay-correction ${sgld_replay} \ 126 | --l2-energy ${l2_energy} \ 127 | --sgld-weight-decay ${sgld_weight_decay} \ 128 | --delay-updates ${delay} \ 129 | --lfmmi-weight 1.0 \ 130 | --ebm-weight ${ebm_weight} \ 131 | --ebm-type ${ebm_type} \ 132 | --ebm-tgt ${ebm_tgt} \ 133 | --nj-init ${train_nj_init} \ 134 | --nj-final ${train_nj_final} \ 135 | --seed ${seed} \ 136 | "[ \ 137 | {\ 138 | 'data': '${traindir}${feat_affix}', \ 139 | 'tgt': '${targets}', \ 140 | 'batchsize': 32, 'num_repeats': 1, 'chunk_width': 140, \ 141 | 'left_context': 10, 'right_context': 5, \ 142 | 'mean_norm': ${mean_norm}, 'var_norm': ${var_norm}, 'perturb_type': '${perturb}' \ 143 | },\ 144 | {\ 145 | 'data': '${unsupdir}', \ 146 | 'tgt': '${unsupdir}/pdfid.${subsampling}.unsup.tgt', \ 147 | 'batchsize': ${unsup_batchsize}, 'num_repeats': ${unsup_num_repeats}, 'chunk_width': ${unsup_chunkwidth}, \ 148 | 'left_context': ${unsup_left}, 'right_context': ${unsup_right}, \ 149 | 'mean_norm': ${mean_norm}, 'var_norm': ${var_norm}, 'perturb_type': '${perturb}' \ 150 | },\ 151 | ]" \ 152 | `dirname ${chaindir}`/${model_dirname} 153 | fi 154 | 155 | -------------------------------------------------------------------------------- /librispeech100/run-semisup-wrn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is based almost entirely on the Kaldi librispeech recipe 4 | # Change this location to somewhere where you want to put the data. 5 | # This recipe ASSUMES YOU HAVE DOWNLOADED the Librispeech data 6 | unlabeled_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 7 | 8 | . ./cmd.sh 9 | . 
./path.sh 10 | 11 | stage=1 12 | subsampling=4 13 | traindir=data/train_100h 14 | feat_affix=_fbank_64 15 | chaindir=exp/chain 16 | model_dirname=wrn_semisup 17 | batches_per_epoch=1000 18 | num_epochs=240 19 | train_nj=4 20 | lr=0.0001 21 | delay_updates=2 22 | warmup=15000 23 | ebm_weight=1.0 24 | sgld_opt=adam 25 | sgld_stepsize=1.0 26 | sgld_maxsteps=50.0 27 | sgld_minsteps=1 28 | sgld_replay=1.0 29 | sgld_noise=0.001 30 | sgld_weight_decay=1e-10 31 | sgld_decay=1e-04 32 | sgld_warmup=15000 33 | sgld_reinit=0.05 34 | sgld_clip=1.0 35 | l2_energy=0.0001 36 | unsup_batchsize=16 37 | sup_batchsize=16 38 | unsup_chunkwidth=50 39 | unsup_left=10 40 | unsup_right=5 41 | mean_norm=True 42 | resume= 43 | num_split=80 # number of splits for memory-mapped data for training 44 | . ./utils/parse_options.sh 45 | 46 | if [ $# -ne 1 ]; then 47 | echo "Usage: ./run-semisup-wrn.sh " 48 | echo " This script assumes you have trained a seed model first." 49 | echo " Do ./run-wrn.sh for instance." 50 | exit 1; 51 | fi 52 | 53 | init=$1 54 | set -euo pipefail 55 | 56 | [ ! -f ${init} ] && echo "Expected ${init} to exist." && exit 1; 57 | tree=${chaindir}/tree 58 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 59 | trainname=`basename ${traindir}` 60 | 61 | # Make the unlabeled data 62 | if [ $stage -le 0 ]; then 63 | for part in train-clean-360 train-other-500; do 64 | local/data_prep.sh $unlabeled_data/LibriSpeech/${part} data/$(echo ${part} | sed s/-/_/g) 65 | done 66 | 67 | ./utils/combine_data.sh data/train_860 data/train_{clean_360,other_500} 68 | ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 data/train_860 exp/make_fbank/train_860 ${feat_affix##_} 69 | ./utils/fix_data_dir.sh data/train_860 70 | ./steps/compute_cmvn_stats.sh data/train_860 71 | ./utils/fix_data_dir.sh data/train_860 72 | 73 | python prepare_unlabeled_tgt.py --subsample ${subsampling} data/train_860/utt2num_frames > data/train_860/pdfid.${subsampling}.tgt 74 | split_memmap_data.sh data/train_860 data/train_860/pdfid.${subsampling}.tgt ${num_split} 75 | fi 76 | 77 | 78 | # We use a lower learning rate in order to prevent the model from forgetting 79 | # too much. 80 | if [ $stage -eq 1 ]; then 81 | resume_opts= 82 | if [ ! 
-z $resume ]; then 83 | resume_opts="--resume ${resume}" 84 | fi 85 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 86 | train_async_parallel.sh ${resume_opts} \ 87 | --gpu true \ 88 | --objective SemisupLFMMI \ 89 | --denom-graph ${chaindir}/den.fst \ 90 | --num-pdfs ${num_pdfs} \ 91 | --subsample ${subsampling} \ 92 | --model ChainWideResnet \ 93 | --depth 28 \ 94 | --width 10 \ 95 | --warmup ${warmup} \ 96 | --decay 1e-05 \ 97 | --xent 0.01 \ 98 | --l2 0.0001 \ 99 | --weight-decay 1e-07 \ 100 | --lr ${lr} \ 101 | --batches-per-epoch ${batches_per_epoch} \ 102 | --num-epochs ${num_epochs} \ 103 | --validation-spks 0 \ 104 | --sgld-thresh 0 \ 105 | --sgld-reinit-p ${sgld_reinit} \ 106 | --sgld-buffer 10000 \ 107 | --sgld-stepsize ${sgld_stepsize} \ 108 | --sgld-steps ${sgld_minsteps} \ 109 | --sgld-max-steps ${sgld_maxsteps} \ 110 | --sgld-noise ${sgld_noise} \ 111 | --sgld-decay ${sgld_decay} \ 112 | --sgld-real-decay 0.0 \ 113 | --sgld-clip ${sgld_clip} \ 114 | --sgld-warmup ${sgld_warmup} \ 115 | --sgld-optim ${sgld_opt} \ 116 | --sgld-replay-correction ${sgld_replay} \ 117 | --l2-energy ${l2_energy} \ 118 | --sgld-weight-decay ${sgld_weight_decay} \ 119 | --delay-updates ${delay_updates} \ 120 | --lfmmi-weight 0.1 \ 121 | --ebm-weight ${ebm_weight} \ 122 | --nj ${train_nj} \ 123 | --init ${init} \ 124 | "[ \ 125 | {\ 126 | 'data': '${traindir}${feat_affix}', \ 127 | 'tgt': '${targets}', \ 128 | 'batchsize': ${sup_batchsize}, 'chunk_width': 140, 'num_repeats': 1,\ 129 | 'left_context': 10, 'right_context': 5, \ 130 | 'mean_norm': ${mean_norm}, 'var_norm': 'norm' \ 131 | },\ 132 | {\ 133 | 'data': 'data/train_860', \ 134 | 'tgt': 'data/train_860/pdfid.${subsampling}.tgt', \ 135 | 'batchsize': ${unsup_batchsize}, 'chunk_width': ${unsup_chunkwidth}, 'num_repeats': 1,\ 136 | 'left_context': ${unsup_left}, 'right_context': ${unsup_right}, \ 137 | 'mean_norm': ${mean_norm}, 'var_norm': 'norm' \ 138 | },\ 139 | ]" \ 140 | `dirname ${chaindir}`/${model_dirname} 141 | fi 142 | 143 | -------------------------------------------------------------------------------- /librispeech100/run-tdnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=3 8 | traindir=data/train_100h 9 | feat_affix=_fbank 10 | chaindir=exp/chain_tdnn 11 | num_leaves=3500 12 | model_dirname=tdnn 13 | batches_per_epoch=250 14 | num_epochs=240 15 | train_nj=2 16 | resume= 17 | num_split=20 # number of splits for memory-mapped data for training 18 | average=true 19 | 20 | . ./utils/parse_options.sh 21 | 22 | set -euo pipefail 23 | 24 | tree=${chaindir}/tree 25 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 26 | trainname=`basename ${traindir}` 27 | 28 | if [ $stage -le 1 ]; then 29 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 30 | lang=data/lang_chain 31 | cp -r data/lang $lang 32 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 33 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 34 | # Use our special topology... note that later on may have to tune this 35 | # topology. 
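# What this stage produces: gen_topo.py writes the chain HMM topology to
# $lang/topo; build_tree.sh builds a tree at the subsampled frame rate; a
# phone LM estimated from the tree alignments is compiled into den.fst, the
# denominator graph used by the LFMMI objective; and ali-to-pdf writes the
# frame-level nnet targets, one line per utterance of the form
#   <utt-id> <pdf-id> <pdf-id> ...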
36 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 37 | 38 | steps/nnet3/chain/build_tree.sh \ 39 | --frame-subsampling-factor ${subsampling} \ 40 | --context-opts "--context-width=2 --central-position=1" \ 41 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 42 | $lang exp/tri3_ali_${trainname} ${tree} 43 | 44 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 45 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 46 | 47 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 48 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 49 | 50 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 51 | fi 52 | 53 | if [ $stage -le 2 ]; then 54 | echo "Dumping memory mapped features ..." 55 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 56 | fi 57 | 58 | if [ $stage -le 3 ]; then 59 | resume_opts= 60 | if [ ! -z $resume ]; then 61 | resume_opts="--resume ${resume}" 62 | fi 63 | 64 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 65 | train_async_parallel.sh ${resume_opts} \ 66 | --gpu true \ 67 | --objective LFMMI \ 68 | --denom-graph ${chaindir}/den.fst \ 69 | --num-pdfs ${num_pdfs} \ 70 | --subsample ${subsampling} \ 71 | --model ChainTDNN \ 72 | --hdim 1024 \ 73 | --num-layers 13 \ 74 | --dropout 0.2 \ 75 | --prefinal-dim 192 \ 76 | --warmup 15000 \ 77 | --decay 1e-05 \ 78 | --xent 0.1 \ 79 | --l2 0.0001 \ 80 | --weight-decay 1e-07 \ 81 | --lr 0.0002 \ 82 | --batches-per-epoch ${batches_per_epoch} \ 83 | --num-epochs ${num_epochs} \ 84 | --validation-spks 0 \ 85 | --nj ${train_nj} \ 86 | "[ \ 87 | {\ 88 | 'data': '${traindir}${feat_affix}', \ 89 | 'tgt': '${targets}', \ 90 | 'batchsize': 128, 'chunk_width': 140, \ 91 | 'left_context': 10, 'right_context': 5, \ 92 | 'mean_norm': True, 'var_norm': 'norm' 93 | }\ 94 | ]" \ 95 | `dirname ${chaindir}`/${model_dirname} 96 | fi 97 | 98 | # Average the last 40 epochs 99 | if $average; then 100 | echo "Averaging the last few epochs ..." 101 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 200 240 102 | fi 103 | -------------------------------------------------------------------------------- /librispeech100/run-wrn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./cmd.sh 4 | . ./path.sh 5 | 6 | stage=0 7 | subsampling=4 8 | traindir=data/train_100h 9 | feat_affix=_fbank 10 | chaindir=exp/chain_wrn 11 | num_leaves=3500 12 | model_dirname=wrn 13 | batches_per_epoch=250 14 | num_epochs=240 15 | train_nj_init=1 16 | train_nj_final=4 17 | perturb="gauss 0.1" 18 | leaky_hmm=0.1 19 | resume= 20 | num_split=20 # number of splits for memory-mapped data for training 21 | average=true 22 | 23 | . ./utils/parse_options.sh 24 | 25 | set -euo pipefail 26 | 27 | tree=${chaindir}/tree 28 | targets=${traindir}${feat_affix}/pdfid.${subsampling}.tgt 29 | trainname=`basename ${traindir}` 30 | 31 | if [ $stage -le 1 ]; then 32 | echo "Creating Chain Topology, Denominator Graph, and nnet Targets ..." 33 | lang=data/lang_chain 34 | cp -r data/lang $lang 35 | silphonelist=$(cat $lang/phones/silence.csl) || exit 1; 36 | nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; 37 | # Use our special topology... note that later on may have to tune this 38 | # topology. 
39 | steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo 40 | 41 | steps/nnet3/chain/build_tree.sh \ 42 | --frame-subsampling-factor ${subsampling} \ 43 | --context-opts "--context-width=2 --central-position=1" \ 44 | --cmd "$train_cmd" ${num_leaves} ${traindir} \ 45 | $lang exp/tri3_ali_${trainname} ${tree} 46 | 47 | ali-to-phones ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark:- |\ 48 | chain-est-phone-lm --num-extra-lm-states=2000 ark:- ${chaindir}/phone_lm.fst 49 | 50 | chain-make-den-fst ${tree}/tree ${tree}/final.mdl \ 51 | ${chaindir}/phone_lm.fst ${chaindir}/den.fst ${chaindir}/normalization.fst 52 | 53 | ali-to-pdf ${tree}/final.mdl ark:"gunzip -c ${tree}/ali.*.gz |" ark,t:${targets} 54 | fi 55 | 56 | if [ $stage -le 2 ]; then 57 | echo "Dumping memory mapped features ..." 58 | split_memmap_data.sh ${traindir}${feat_affix} ${targets} ${num_split} 59 | fi 60 | 61 | # Multigpu training of Chain-WideResNet with optimizer state averaging 62 | if [ $stage -le 3 ]; then 63 | resume_opts= 64 | if [ ! -z $resume ]; then 65 | resume_opts="--resume ${resume}" 66 | fi 67 | 68 | num_pdfs=$(tree-info ${tree}/tree | grep 'num-pdfs' | cut -d' ' -f2) 69 | idim=$(feat-to-dim scp:${traindir}${feat_affix}/feats.scp -) 70 | train_async_parallel.sh ${resume_opts} \ 71 | --gpu true \ 72 | --objective LFMMI \ 73 | --denom-graph ${chaindir}/den.fst \ 74 | --num-pdfs ${num_pdfs} \ 75 | --idim ${idim} \ 76 | --subsample ${subsampling} \ 77 | --model ChainWideResnet \ 78 | --depth 28 \ 79 | --width 10 \ 80 | --warmup 15000 \ 81 | --decay 1e-05 \ 82 | --xent 0.1 \ 83 | --l2 0.0001 \ 84 | --leaky-hmm ${leaky_hmm} \ 85 | --weight-decay 1e-07 \ 86 | --lr 0.0001 \ 87 | --batches-per-epoch ${batches_per_epoch} \ 88 | --num-epochs ${num_epochs} \ 89 | --nj-init ${train_nj_init} \ 90 | --nj-final ${train_nj_final} \ 91 | "[ \ 92 | {\ 93 | 'data': '${traindir}${feat_affix}', \ 94 | 'tgt': '${targets}', \ 95 | 'batchsize': 32, 'chunk_width': 140, \ 96 | 'left_context': 10, 'right_context': 5, 'num_repeats': 1, \ 97 | 'mean_norm': True, 'var_norm': True, 'perturb_type': '${perturb}' 98 | }\ 99 | ]" \ 100 | `dirname ${chaindir}`/${model_dirname} 101 | fi 102 | 103 | # Average the last 40 epochs 104 | if $average; then 105 | echo "Averaging the last few epochs ..." 106 | average_models.py `dirname ${chaindir}`/${model_dirname} 80 200 240 107 | fi 108 | -------------------------------------------------------------------------------- /librispeech100/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # DATA-level specifications. 4 | speech_data=/export/corpora5 #/PATH/TO/LIBRISPEECH/data 5 | data=./corpus 6 | data_url=www.openslr.org/resources/31 7 | lm_url=www.openslr.org/resources/11 8 | 9 | . ./cmd.sh 10 | . ./path.sh 11 | 12 | stage=0 13 | subsampling=4 14 | num_split=20 # number of splits for memory-mapped data for training 15 | 16 | . 
./utils/parse_options.sh
17 |
18 | set -euo pipefail
19 |
20 | mkdir -p $data
21 |
22 |
23 | if [ $stage -le 0 ]; then
24 |   local/download_lm.sh $lm_url $data data/local/lm
25 | fi
26 |
27 | if [ $stage -le 1 ]; then
28 |   # format the data as Kaldi data directories
29 |   local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \
30 |     data/local/lm data/local/lm data/local/dict_nosp
31 |
32 |   utils/prepare_lang.sh data/local/dict_nosp \
33 |     "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
34 |
35 |   local/format_lms.sh --src-dir data/lang_nosp data/local/lm
36 | fi
37 |
38 | if [ $stage -le 2 ]; then
39 |   # Get the train-100 subset
40 |   local/data_prep.sh ${speech_data}/LibriSpeech/train-clean-100 data/train_100h
41 |   ./steps/make_mfcc.sh --cmd "$train_cmd" --nj 32 data/train_100h exp/make_mfcc/train_100h mfcc
42 |   ./utils/fix_data_dir.sh data/train_100h
43 |   ./steps/compute_cmvn_stats.sh data/train_100h
44 |   ./utils/fix_data_dir.sh data/train_100h
45 |
46 |   utils/subset_data_dir.sh --shortest data/train_100h 500 data/train_500short
47 |   utils/subset_data_dir.sh data/train_100h 5000 data/train_5k
48 |   utils/subset_data_dir.sh data/train_100h 10000 data/train_10k
49 | fi
50 |
51 | # train a monophone system
52 | if [ $stage -le 3 ]; then
53 |   steps/train_mono.sh --boost-silence 1.25 --nj 15 --cmd "$train_cmd" \
54 |     data/train_500short data/lang_nosp exp/mono
55 |
56 |   steps/align_si.sh --boost-silence 1.25 --nj 15 --cmd "$train_cmd" \
57 |     data/train_5k data/lang_nosp exp/mono exp/mono_ali_train_5k
58 | fi
59 |
60 | # train a first delta + delta-delta triphone system on 5k utterances
61 | if [ $stage -le 4 ]; then
62 |   steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
63 |     2000 10000 data/train_5k data/lang_nosp exp/mono_ali_train_5k exp/tri1
64 |
65 |   steps/align_si.sh --nj 15 --cmd "$train_cmd" \
66 |     data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_train_10k
67 | fi
68 |
69 | # train a second delta + delta-delta triphone system on 10k utterances
70 | if [ $stage -le 5 ]; then
71 |   steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
72 |     2500 15000 data/train_10k data/lang_nosp exp/tri1_ali_train_10k exp/tri1b
73 |
74 |   steps/align_si.sh --nj 20 --cmd "$train_cmd" \
75 |     data/train_100h data/lang_nosp exp/tri1b exp/tri1b_ali_train_100h
76 | fi
77 |
78 | # train an LDA+MLLT system.
79 | if [ $stage -le 6 ]; then
80 |   steps/train_lda_mllt.sh --cmd "$train_cmd" \
81 |     --splice-opts "--left-context=3 --right-context=3" 4200 40000 \
82 |     data/train_100h data/lang_nosp exp/tri1b_ali_train_100h exp/tri2
83 |
84 |   # Align utts using the tri2 model
85 |   steps/align_si.sh --nj 20 --cmd "$train_cmd" --use-graphs true \
86 |     data/train_100h data/lang_nosp exp/tri2 exp/tri2_ali_train_100h
87 | fi
88 |
89 | # Train tri3, which is LDA+MLLT+SAT
90 | if [ $stage -le 7 ]; then
91 |   steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
92 |     data/train_100h data/lang_nosp exp/tri2_ali_train_100h exp/tri3
93 | fi
94 |
95 | # Now we compute the pronunciation and silence probabilities from training data,
96 | # and re-create the lang directory.
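# get_prons.sh counts, from the tri3 alignments, how often each pronunciation
# (and the silence before/after each word) was used; dict_dir_add_pronprobs.sh
# folds those counts into data/local/dict as pronunciation and silence
# probabilities before the lang directory is rebuilt below.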
97 | if [ $stage -le 8 ]; then
98 |   steps/get_prons.sh --cmd "$train_cmd" \
99 |     data/train_100h data/lang_nosp exp/tri3
100 |   utils/dict_dir_add_pronprobs.sh --max-normalize true \
101 |     data/local/dict_nosp \
102 |     exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \
103 |     exp/tri3/pron_bigram_counts_nowb.txt data/local/dict
104 |
105 |   utils/prepare_lang.sh data/local/dict \
106 |     "<UNK>" data/local/lang_tmp data/lang
107 |
108 |   local/format_lms.sh --src-dir data/lang data/local/lm
109 |
110 |   # Larger 3-gram LM rescoring
111 |   #utils/build_const_arpa_lm.sh \
112 |   #  data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge
113 |
114 |   # 4-gram LM rescoring
115 |   utils/build_const_arpa_lm.sh \
116 |     data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge
117 |
118 |   steps/align_fmllr.sh --nj 5 --cmd "$train_cmd" \
119 |     data/train_100h data/lang exp/tri3 exp/tri3_ali_train_100h
120 | fi
121 |
122 | if [ $stage -le 10 ]; then
123 |   traindir=data/train_100h
124 |   feat_affix=_fbank_64
125 |   echo "Making features for nnet training ..."
126 |   ./utils/copy_data_dir.sh ${traindir} ${traindir}${feat_affix}
127 |   ./steps/make_fbank.sh --cmd "$train_cmd" --nj 32 ${traindir}${feat_affix}
128 |   ./utils/fix_data_dir.sh ${traindir}${feat_affix}
129 |   ./steps/compute_cmvn_stats.sh ${traindir}${feat_affix}
130 |   ./utils/fix_data_dir.sh ${traindir}${feat_affix}
131 | fi
132 |
-------------------------------------------------------------------------------- /librispeech100/steps: --------------------------------------------------------------------------------
1 | ../tools/kaldi/egs/wsj/s5/steps
-------------------------------------------------------------------------------- /librispeech100/utils: --------------------------------------------------------------------------------
1 | ../tools/kaldi/egs/wsj/s5/utils
-------------------------------------------------------------------------------- /nnet_pytorch/INSTALL_PYCHAIN: --------------------------------------------------------------------------------
1 | Install kaldi tools;
2 | cd openfst;
3 | make clean;
4 | ./configure --special-flags;
5 | make; make install;
6 |
7 | export OPENFST_PATH=/path/to/openfst;
8 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${OPENFST_PATH}/lib:/usr/local/cuda/lib64;
9 |
10 | cd pychain;
11 | cd openfst_binding; python setup.py install;
12 | cd ../pytorch_binding; python setup.py install
-------------------------------------------------------------------------------- /nnet_pytorch/IterationTypes.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2020
4 | # Apache 2.0
5 |
6 | import numpy as np
7 | import torch
8 | import torch.nn.functional as F
9 | import datasets
10 | import sys
11 |
12 |
13 | def train_epoch(args, generator, model, objective, optim, lr_sched, device='cpu'):
14 |     total_loss = 0.0
15 |     move_to = datasets.DATASETS[args.datasetname].move_to
16 |     dataset_args = eval(args.datasets)
17 |     total_num_batches = sum(
18 |         [args.batches_per_epoch * ds['num_repeats'] for ds in dataset_args]
19 |     )
20 |     total_num_updates = total_num_batches // args.delay_updates
21 |
22 |     for i, b in enumerate(generator, 1):
23 |         b = move_to(b, device)
24 |         loss, correct = objective(model, b)
25 |         if isinstance(loss, int):
26 |             continue
27 |         print(
28 |             "Iter: ", int(i / args.delay_updates), " of ", total_num_updates,
29 |             "Loss: ", loss.data.item(),
30 |             "LR: ", lr_sched.curr_lr, end=' '
31 |         )
32 |         if correct is not None:
33 |             print(" Acc: ", float(correct.data.item()) / (b.target.view(-1).size(0)), end='')
34 |         print()
35 |         total_loss += loss.data.item()
36 |         loss.backward()
37 |         del b
38 |         # Mimics multigpu training with large batches on a single gpu
39 |         if ((i % args.delay_updates) == 0):
40 |             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_thresh)
41 |             optim.step()
42 |             optim.zero_grad()
43 |             lr_sched.step(1.0)
44 |     return total_loss / args.batches_per_epoch
45 |
46 |
47 | def validate(args, generator, model, device='cpu'):
48 |     model.eval()
49 |     move_to = datasets.DATASETS[args.datasetname].move_to
50 |     with torch.no_grad():
51 |         correct = 0.0
52 |         avg_loss = 0.0
53 |         num_tokens = 0.0
54 |         for i, b in enumerate(generator):
55 |             b = move_to(b, device)
56 |             output = model(b)[0]
57 |             lprobs = F.log_softmax(output, dim=-1)
58 |             lprobs = lprobs.view(-1, lprobs.size(-1))
59 |             lprobs = lprobs[:b.target.view(-1).size(0), :]
60 |             loss = F.nll_loss(lprobs, b.target.view(-1), reduction='sum')
61 |             avg_loss += loss.data.item()
62 |             correct += torch.sum(lprobs.argmax(1) == b.target.view(-1))
63 |             num_tokens += lprobs.size(0)
64 |         avg_loss /= num_tokens
65 |         correct = 0 if num_tokens == 0 else float(correct.data.item()) / num_tokens
66 |         print()
67 |     model.train()
68 |     return avg_loss, correct
69 |
70 |
71 | def decode_dataset(args, generator, model, device='cpu'):
72 |     move_to = datasets.DATASETS[args.datasetname].move_to
73 |     for i, b in enumerate(generator):
74 |         uttname = b.metadata['name'][0]
75 |         b = move_to(b, device)
76 |         model_output = model(b)
77 |         if 'CrossEntropy' in args.objective:
78 |             # XENT system: emit log-posteriors from a softmax
79 |             lprobs = F.log_softmax(
80 |                 model_output[0], dim=-1
81 |             ).view(-1, model_output[0].size(-1))
82 |         else:
83 |             # Chain system (LFMMI variants): clamp the raw network outputs
84 |             output = model_output[0].clamp(-30, 30)
85 |             lprobs = output.contiguous().view(-1, output.size(2))
86 |
87 |         yield uttname, lprobs.detach().cpu().numpy()
88 |
89 |
90 | def decorrupt_dataset(args, generator, model, objective, device='cpu'):
91 |     move_to = datasets.DATASETS[args.datasetname].move_to
92 |     for i, b in enumerate(generator):
93 |         uttname = b.metadata['name'][0]
94 |         b = move_to(b, device)
95 |         for sgld_iter, decorrupted in enumerate(objective.decorrupt(model, b, num_steps=args.num_steps)):
96 |             yield uttname, sgld_iter, decorrupted.contiguous().view(-1, decorrupted.size(2)).detach().cpu().numpy()
97 |
98 |
99 | def evaluate_energies(args, generator, model, device='cpu'):
100 |     move_to = datasets.DATASETS[args.datasetname].move_to
101 |     for i, b in enumerate(generator, 1):
102 |         b = move_to(b, device)
103 |         model_output = model(b)
104 |         yield model_output.data.item()
105 |
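The delay-updates logic in train_epoch above is ordinary gradient accumulation: backward() is called once per minibatch so gradients sum in the .grad buffers, and only every args.delay_updates batches is a single clipped optimizer step taken, mimicking the larger effective batch of multi-GPU training. A minimal self-contained sketch of the same pattern (the toy model and data are illustrative only, not part of nnet_pytorch):

import torch
import torch.nn.functional as F

def train_accumulated(model, optim, batches, delay_updates=2, grad_thresh=5.0):
    optim.zero_grad()
    for i, (x, y) in enumerate(batches, 1):
        loss = F.mse_loss(model(x), y)
        loss.backward()  # gradients accumulate across minibatches
        if i % delay_updates == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_thresh)
            optim.step()      # one update per delay_updates minibatches
            optim.zero_grad()

model = torch.nn.Linear(8, 4)
optim = torch.optim.SGD(model.parameters(), lr=0.01)
batches = [(torch.randn(16, 8), torch.randn(16, 4)) for _ in range(6)]
train_accumulated(model, optim, batches)

Note that train_epoch sums unscaled losses across the accumulated minibatches; dividing each loss by delay_updates would instead average them, which amounts to the same thing up to a rescaled learning rate.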
-------------------------------------------------------------------------------- /nnet_pytorch/LRScheduler.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | # Copyright 2020 3 | # Apache 2.0 4 | 5 | from __future__ import print_function 6 | import sys 7 | import os 8 | import math 9 | 10 | 11 | class LRScheduler(object): 12 | @staticmethod 13 | def add_args(parser): 14 | parser.add_argument('--warmup', type=int, default=0) 15 | parser.add_argument('--decay', type=float, default=0.0) 16 | parser.add_argument('--fixed', type=int, default=0) 17 | parser.add_argument('--min-lr', type=float, default=1e-09) 18 | 19 | def __init__(self, optimizer, args): 20 | self.optimizer = optimizer 21 | self.warmup = args.warmup 22 | self.fixed = args.fixed 23 | self.decay = args.decay 24 | self.min_lr = args.min_lr 25 | 26 | self.num_warmup_updates = 0 27 | self.num_fixed_updates = 0 28 | self.num_decay_updates = 0 29 | self.lr = self.optimizer.param_groups[0]['lr'] 30 | if self.warmup > 0: 31 | self.set_lr(args.min_lr) 32 | self.curr_lr = args.min_lr 33 | else: 34 | self.curr_lr = self.lr 35 | 36 | def step(self, num_new_updates): 37 | if self.warmup > 0 and self.num_warmup_updates < self.warmup: 38 | self.num_warmup_updates += num_new_updates 39 | slope = (self.lr - self.min_lr) / float(self.warmup) 40 | new_lr = self.min_lr + slope * self.num_warmup_updates 41 | elif self.fixed > 0 and self.num_fixed_updates < self.fixed: 42 | self.num_fixed_updates += num_new_updates 43 | new_lr = self.lr 44 | else: 45 | self.num_decay_updates += num_new_updates 46 | factor = math.exp(-self.decay * self.num_decay_updates) 47 | new_lr = self.lr * factor 48 | self.set_lr(new_lr) 49 | self.curr_lr = new_lr 50 | 51 | def set_lr(self, lr): 52 | for param_group in self.optimizer.param_groups: 53 | param_group['lr'] = lr 54 | 55 | def state_dict(self): 56 | return { 57 | 'warmup': self.warmup, 58 | 'fixed': self.fixed, 59 | 'decay': self.decay, 60 | 'warmup_updates': self.num_warmup_updates, 61 | 'fixed_updates': self.num_fixed_updates, 62 | 'decay_updates': self.num_decay_updates, 63 | 'lr': self.lr, 64 | 'curr_lr': self.curr_lr, 65 | 'min_lr': self.min_lr, 66 | } 67 | 68 | def load_state_dict(self, state_dict): 69 | self.warmup = state_dict['warmup'] 70 | self.fixed = state_dict['fixed'] 71 | self.decay = state_dict['decay'] 72 | self.num_warmup_updates = state_dict['warmup_updates'] 73 | self.num_fixed_updates = state_dict['fixed_updates'] 74 | self.num_decay_updates = state_dict['decay_updates'] 75 | self.lr = state_dict['lr'] 76 | self.curr_lr = state_dict['curr_lr'] 77 | self.min_lr = state_dict['min_lr'] 78 | -------------------------------------------------------------------------------- /nnet_pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m-wiesner/nnet_pytorch/086bc45cf2f1a12197f29033a1e129f6c8b55b03/nnet_pytorch/__init__.py -------------------------------------------------------------------------------- /nnet_pytorch/batch_generators.py: -------------------------------------------------------------------------------- 1 | # Batch generators for training and inference 2 | 3 | def batches(dataset, n): 4 | for b in range(n): 5 | yield dataset.minibatch() 6 | 7 | def multiset_batches(sets, genfun, *args): 8 | ''' 9 | Alternating round-robin batches. 
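    With two sets and genfun=batches, the yield order is set1-batch1,
    set2-batch1, set1-batch2, set2-batch2, ...; a None produced by any
    generator is skipped rather than yielded, and iteration stops once the
    shortest generator is exhausted (zip semantics).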
10 | ''' 11 | # We assume the generators are of equal length 12 | for set_batches_n in zip(*[genfun(s, *args) for s in sets]): 13 | for b in set_batches_n: 14 | if b is not None: 15 | yield b 16 | 17 | def evaluation_batches(dataset): 18 | return dataset.evaluation_batches() 19 | 20 | -------------------------------------------------------------------------------- /nnet_pytorch/datasets/NnetPytorchDataset.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | class NnetPytorchDataset(object): 5 | 6 | Minibatch = namedtuple('Minibatch', ['input', 'target', 'metadata']) 7 | 8 | @staticmethod 9 | def add_args(parser): 10 | pass 11 | 12 | @classmethod 13 | def build_dataset(cls, args): 14 | raise NotImplementedError 15 | 16 | def __init__(self): 17 | pass 18 | 19 | 20 | def __len__(self): 21 | ''' 22 | Returns the total number of elements in the dataset 23 | ''' 24 | raise NotImplementedError 25 | 26 | def size(self, idx): 27 | ''' 28 | Returns some notion of size of an individual element 29 | of the dataset. 30 | ''' 31 | raise NotImplementedError( 32 | "This function should return the size of an individual element of " 33 | "the dataset." 34 | ) 35 | 36 | 37 | def minibatch(self): 38 | ''' 39 | This is effectively the collater. It defines how multiple elements 40 | of a dataset are aggregated or collated together for neural network 41 | training. 42 | ''' 43 | raise NotImplementedError( 44 | "This function should return an object that groups together " 45 | "different elements of the dataset for neural network training." 46 | ) 47 | 48 | def evaluation_batches(self): 49 | ''' 50 | This yields batches of the evaluation set. 51 | ''' 52 | raise NotImplementedError( 53 | "This function should yield batches of the eval data." 54 | ) 55 | 56 | 57 | def __getitem__(self, idx): 58 | raise NotImplementedError( 59 | "This function is used to return a formatted inputs and outputs " 60 | "for a single element from the dataset. self.minibatch() " 61 | "should make repeated calls to __getitem__ when forming " 62 | "minibatches. The argument idx can be any hashable object." 63 | ) 64 | 65 | 66 | def move_to(self, b, device): 67 | pass 68 | 69 | -------------------------------------------------------------------------------- /nnet_pytorch/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import importlib 4 | 5 | 6 | modules = glob.glob( 7 | os.path.sep.join( 8 | [os.path.dirname(__file__), '*.py'] 9 | ) 10 | ) 11 | 12 | for f in modules: 13 | if os.path.isfile(f) and '__init__.py' not in f and 'data_utils' not in f \ 14 | and 'batch_generators' not in f: 15 | module_name, ext = os.path.splitext(f) 16 | if ext == '.py': 17 | module = importlib.import_module('datasets.' + os.path.basename(module_name)) 18 | 19 | DATASETS = { 20 | 'HybridASR': HybridASR.HybridAsrDataset, 21 | } 22 | 23 | -------------------------------------------------------------------------------- /nnet_pytorch/datasets/data_utils.py: -------------------------------------------------------------------------------- 1 | import kaldi_io 2 | import numpy as np 3 | import subprocess 4 | import torch 5 | import random 6 | 7 | 8 | def memmap_feats(feats_scp, f_memmapped, utt_list, dtype=np.float32): 9 | ''' 10 | Maps the feats.scp file from kaldi to a memory mapped numpy object. 11 | This allows for fast i/o when creating window minibatches from slices 12 | of training data. 
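    Frames of each utterance are written contiguously, so a training window
    can later be read as a single slice f[offset + start : offset + end]
    without loading the rest of the archive.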
13 | 14 | input args: feats_scp, f_memmapped 15 | output: 16 | utt_lens = {'utt_n': # utt_n frames, ... } 17 | offsets = {'utt_n': utt_n offset in memory mapped numpy file} 18 | data_shape = (#frames, feature_dimension) 19 | ''' 20 | # First get the total lengths of each utterance 21 | p = subprocess.Popen( 22 | ['feat-to-len', 'scp:' + feats_scp, 'ark,t:-'], 23 | stdout=subprocess.PIPE 24 | ) 25 | out = p.communicate() 26 | utt_lens = {} 27 | for l in out[0].split(b'\n'): 28 | if l.strip() != b'': 29 | utt_id, utt_len = l.strip().split(None, 1) 30 | utt_lens[utt_id] = int(utt_len) 31 | # Next get the dimension of the features 32 | p = subprocess.Popen(['feat-to-dim', 'scp:' + feats_scp, '-'], 33 | stdout=subprocess.PIPE 34 | ) 35 | out = p.communicate() 36 | dim = int(out[0]) 37 | # Set Data Shape 38 | data_shape = (sum(utt_lens.values()), dim) 39 | # Set up memmapped features 40 | f = np.memmap(f_memmapped, mode='w+', dtype=dtype, shape=data_shape) 41 | offsets = {} 42 | offset = 0 43 | for i, (k, m) in enumerate(kaldi_io.read_mat_scp(feats_scp)): 44 | print('Utterance ', i, ' : ', k) 45 | if k not in utt_list: 46 | continue; 47 | m = m.astype(dtype) 48 | offsets[k.encode()] = offset 49 | new_offset = offset + utt_lens[k.encode()] 50 | f[offset:new_offset, :] = m 51 | offset = new_offset 52 | print() 53 | del f 54 | return utt_lens, offsets, data_shape 55 | 56 | 57 | def get_targets(f_targets): 58 | ''' 59 | Retrieve the targets (pdfids) corresponding to each input utterance 60 | input args: 61 | f_targets -- file pointer to the targets 62 | 63 | Format of f_targets: 64 | utt1 pdf11 pdf12 pdf13 ... 65 | utt2 pdf21 pdf22 ... 66 | utt3 ... 67 | ... 68 | output: 69 | utts = {'utt1': [pdf1, pdf2, ...], 'utt2': [pdf1, pdf1, ...]} 70 | ''' 71 | utts = {} 72 | for l in f_targets: 73 | utt_id, tgts = l.strip().split(None, 1) 74 | if utt_id not in utts: 75 | utts[utt_id.encode()] = [] 76 | for t in tgts.split(): 77 | utts[utt_id.encode()].append(int(t)) 78 | return utts 79 | 80 | 81 | def load_cmvn(filename): 82 | ''' 83 | Load the cmvn file. Requires filename. 84 | ''' 85 | gen = kaldi_io.read_mat_scp(filename) 86 | spk2cmvn = {} 87 | for k, m in gen: 88 | total = m[0, -1] 89 | spk2cmvn[k] = {'mu': m[0, :-1] / total, 'var': m[1, :-1] / total} 90 | return spk2cmvn 91 | 92 | def load_ivectors(filename): 93 | ''' 94 | Load the ivectors into a dictionary. 95 | Input argument may be an ark or scp file. 96 | ''' 97 | ivectors = {} 98 | for key, vec in kaldi_io.read_vec_flt_scp(filename): 99 | ivectors[key] = np.array(vec) 100 | return ivectors 101 | 102 | 103 | def load_utt2spk(f): 104 | ''' 105 | Load the utt2spk file. Requires an open file pointer. 106 | ''' 107 | utt2spk = {} 108 | for l in f: 109 | utt, spk = l.strip().split(None, 1) 110 | utt2spk[utt.encode()] = spk 111 | spk2utt = {} 112 | for u, s in utt2spk.items(): 113 | if s not in spk2utt: 114 | spk2utt[s] = [] 115 | spk2utt[s].append(u) 116 | return utt2spk, spk2utt 117 | 118 | 119 | def load_segments(f): 120 | ''' 121 | Load the segments file. Requires an open file pointer. 122 | ''' 123 | audio_to_segments = {} 124 | for l in f: 125 | utt, audio, start, end = l.strip().split() 126 | if audio not in audio_to_segments: 127 | audio_to_segments[audio] = [] 128 | audio_to_segments[audio].append((utt, start, end)) 129 | return audio_to_segments 130 | 131 | 132 | def load_utt_subset(f): 133 | ''' 134 | Load the subset of utterances from file point f. Use a kaldi segments 135 | file for the file f for example. 
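    Only the first whitespace-separated field of each line (the utterance id)
    is kept, encoded to bytes so the ids match the keys produced by
    memmap_feats and get_targets above.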
136 | ''' 137 | utt_subset = [] 138 | for l in f: 139 | utt_subset.append(l.strip().split(None, 1)[0].encode()) 140 | return utt_subset 141 | 142 | 143 | def perturb(x, perturb_type='none'): 144 | if perturb_type == 'none': 145 | pass 146 | elif perturb_type == 'salt_pepper': 147 | x *= torch.FloatTensor(x.size()).random_(0, 2).to(x.dtype) 148 | elif perturb_type == 'time_mask': 149 | width=4 150 | start = random.randint(0, x.size(1) - width) 151 | end = start + width 152 | mask = (torch.arange(x.size(1)) >= start) * (torch.arange(x.size(1)) < end) 153 | mask = mask[:, None].expand(x.size()) 154 | x[mask] = 0.0 155 | elif perturb_type == 'freq_mask': 156 | width=10 157 | start = random.randint(0, x.size(0) - width) 158 | end = start + width 159 | mask = (torch.arange(x.size(-1)) >= start) * (torch.arange(x.size(-1)) < end) 160 | mask = mask[None, :].expand(x.size()) 161 | x[mask] = 0.0 162 | elif perturb_type.startswith('gauss'): 163 | std = float(perturb_type.split()[1]) 164 | x += std * torch.randn_like(x) 165 | elif perturb_type.startswith('rand'): 166 | maxval = float(perturb_type.split()[1]) 167 | x.uniform_(-maxval, maxval) 168 | #return x 169 | -------------------------------------------------------------------------------- /nnet_pytorch/generate.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | # Copyright 2020 4 | # Apache 2.0 5 | 6 | import os 7 | import argparse 8 | import json 9 | import subprocess 10 | import numpy as np 11 | import torch 12 | import models 13 | import objectives 14 | from collections import namedtuple 15 | from data_utils import move_to 16 | 17 | 18 | Samples = namedtuple('Samples', ['input', 'target', 'metadata']) 19 | 20 | 21 | def main(): 22 | args = parse_arguments() 23 | print(args) 24 | 25 | # Reserve the GPU if used in decoding. 26 | if args.gpu: 27 | # USER will need to set CUDA_VISIBLE_DEVICES here 28 | cvd = subprocess.check_output(["/usr/local/bin/free-gpu", "-n", "1"]).decode().strip() 29 | os.environ['CUDA_VISIBLE_DEVICES'] = cvd 30 | 31 | device = torch.device('cuda' if args.gpu else 'cpu') 32 | reserve_variable = torch.ones(1).to(device) 33 | 34 | # Load experiment configurations so that decoding uses the same parameters 35 | # as training 36 | conf = json.load(open(args.modeldir + '/conf.1.json')) 37 | 38 | # Build the model and send to the device (cpu or gpu). Generally cpu. 
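    # The objective is rebuilt and restored along with the model because
    # sampling objectives carry state of their own (e.g. an SGLD replay
    # buffer) that generation depends on.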
39 | objective = objectives.OBJECTIVES[conf['objective']].build_objective(conf) 40 | objective.to(device) 41 | model = models.MODELS[conf['model']].build_model(conf) 42 | model.to(device) 43 | 44 | mdl = torch.load( 45 | os.path.sep.join([args.modeldir, args.modelname]), 46 | map_location=device 47 | ) 48 | objective.load_state_dict(mdl['objective']) 49 | model.load_state_dict(mdl['model']) 50 | 51 | cw = args.chunk_width 52 | cw += args.left_context + args.right_context 53 | 54 | samples = objective.generate_from_model( 55 | model, 56 | bs=args.batchsize, 57 | cw=cw, 58 | dim=args.idim, 59 | left_context=args.left_context, right_context=args.right_context, 60 | device=device, 61 | target=args.target, 62 | ) 63 | 64 | for i, s in enumerate(samples): 65 | np.save('{}/samples.{}'.format(args.dumpdir, i), s.cpu().data.numpy()) 66 | 67 | 68 | def parse_arguments(): 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--modeldir', help='model directory used for generated') 71 | parser.add_argument('--dumpdir', help='dump results here') 72 | parser.add_argument('--modelname', default='final.mdl') 73 | parser.add_argument('--gpu', action='store_true', help='Tun on gpu. This ' 74 | 'can be very slow on cpu' 75 | ) 76 | parser.add_argument('--idim', type=int, default=64, 77 | help='The input dimension of features' 78 | ) 79 | parser.add_argument('--chunk-width', type=int, default=50, 80 | help='The width of the speech chunk. The target sequence will be ' 81 | 'length chunk_width / subsample' 82 | ) 83 | parser.add_argument('--left-context', type=int, default=10, 84 | help='extra left context on the input features' 85 | ) 86 | parser.add_argument('--right-context', type=int, default=5, 87 | help='extra right context on the input features' 88 | ) 89 | parser.add_argument('--batchsize', type=int, default=32, 90 | help='number of sample to generate (just 1 minibatch)', 91 | ) 92 | parser.add_argument('--target', nargs='+', type=int, default=None) 93 | 94 | # Args specific to different components 95 | args, leftover = parser.parse_known_args() 96 | conf = json.load(open(args.modeldir + '/conf.1.json')) 97 | models.MODELS[conf['model']].add_args(parser) 98 | parser.parse_args(leftover, namespace=args) 99 | return args 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | 105 | -------------------------------------------------------------------------------- /nnet_pytorch/models/BLSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from collections import namedtuple 4 | import numpy as np 5 | 6 | 7 | class BLSTM(torch.nn.Module): 8 | ''' 9 | Bidirectional LSTM model 10 | ''' 11 | @staticmethod 12 | def add_args(parser): 13 | parser.add_argument('--blstm-hdim', type=int, default=512) 14 | parser.add_argument('--blstm-num-layers', type=int, default=4) 15 | parser.add_argument('--blstm-dropout', type=float, default=0.1) 16 | parser.add_argument('--blstm-prefinal-dim', type=int, default=256) 17 | 18 | @classmethod 19 | def build_model(cls, conf): 20 | model = BLSTM( 21 | conf['idim'], conf['num_targets'], 22 | odims=[conf['blstm_hdim'] for i in range(conf['blstm_num_layers'])], 23 | dropout=conf['blstm_dropout'], 24 | prefinal_affine_dim=conf['blstm_prefinal_dim'], 25 | subsample=conf['subsample'], 26 | batch_norm_dropout=True 27 | ) 28 | return model 29 | 30 | def __init__( 31 | self, idim, odim, 32 | odims=[512, 512, 512, 512, 512, 512], 33 | prefinal_affine_dim=512, 34 | nonlin=F.relu, 
dropout=0.1, subsample=1, batch_norm_dropout=True 35 | ): 36 | super().__init__() 37 | 38 | # Proper BLSTM layers 39 | self.batch_norm_dropout = batch_norm_dropout 40 | self.dropout = dropout 41 | self.nonlin = nonlin 42 | self.subsample = subsample 43 | self.blstm = torch.nn.ModuleList() 44 | self.batchnorm = torch.nn.ModuleList() 45 | 46 | next_input_dim = idim 47 | for cur_odim in odims: 48 | self.blstm.append( 49 | torch.nn.LSTM( 50 | next_input_dim, cur_odim//2, 1, 51 | batch_first=True, bidirectional=True 52 | ) 53 | ) 54 | self.batchnorm.append( 55 | torch.nn.BatchNorm1d(cur_odim, eps=1e-03, affine=False) 56 | ) 57 | next_input_dim = cur_odim 58 | 59 | # Last few layers 60 | self.prefinal_affine = torch.nn.Linear( 61 | next_input_dim, prefinal_affine_dim, 62 | ) 63 | self.batchnorm.append( 64 | torch.nn.BatchNorm1d( 65 | prefinal_affine_dim, eps=1e-03, affine=False 66 | ) 67 | ) 68 | self.final_affine = torch.nn.Linear( 69 | prefinal_affine_dim, odim, 70 | ) 71 | 72 | def forward(self, sample): 73 | xs_pad = sample.input 74 | left_context = sample.metadata['left_context'] 75 | right_context = sample.metadata['right_context'] 76 | 77 | # Basic pattern is (blstm, relu, batchnorm, dropout) x num_layers 78 | for blstm, batchnorm in zip(self.blstm, self.batchnorm[:-1]): 79 | xs_pad = blstm(xs_pad)[0].transpose(0,1) 80 | xs_pad = self.nonlin(xs_pad) 81 | if not self.batch_norm_dropout: 82 | xs_pad = batchnorm(xs_pad) 83 | xs_pad = F.dropout(xs_pad, p=self.dropout, training=self.training) 84 | 85 | # A few final layers 86 | end_idx = xs_pad.size(1) if right_context == 0 else -right_context 87 | output2 = xs_pad[:, left_context:end_idx:self.subsample, :] 88 | xs_pad = self.nonlin(self.prefinal_affine(xs_pad)) 89 | if not self.batch_norm_dropout: 90 | xs_pad = self.batchnorm[-1](xs_pad) 91 | 92 | # This is basically just glue 93 | output = self.final_affine(xs_pad) 94 | return ( 95 | output[:, left_context:end_idx:self.subsample, :], 96 | output2, 97 | ) 98 | 99 | 100 | class ChainBLSTM(BLSTM): 101 | @classmethod 102 | def build_model(cls, conf): 103 | model = ChainBLSTM( 104 | conf['idim'], conf['num_targets'], 105 | odims=[conf['blstm_hdim'] for i in range(conf['blstm_num_layers'])], 106 | dropout=conf['blstm_dropout'], 107 | prefinal_affine_dim=conf['blstm_prefinal_dim'], 108 | subsample=conf['subsample'], 109 | batch_norm_dropout=True 110 | ) 111 | return model 112 | 113 | def __init__( 114 | self, idim, odim, 115 | odims=[512, 512, 512, 512, 512, 512], 116 | prefinal_affine_dim=512, 117 | nonlin=F.relu, dropout=0.1, subsample=1, batch_norm_dropout=True 118 | ): 119 | super().__init__( 120 | idim, odim, odims, prefinal_affine_dim, 121 | nonlin, dropout, subsample 122 | ) 123 | self.prefinal_xent = torch.nn.Linear( 124 | odims[-1], 125 | prefinal_affine_dim, 126 | ) 127 | self.xent_batchnorm = torch.nn.BatchNorm1d( 128 | prefinal_affine_dim, 129 | eps=1e-03, affine=False 130 | ) 131 | self.xent_layer = torch.nn.Linear(prefinal_affine_dim, odim) 132 | 133 | def forward(self, xs_pad): 134 | output, xs_pad = super().forward(xs_pad) 135 | if self.training: 136 | xs_pad = self.nonlin(self.prefinal_xent(xs_pad)) 137 | if not self.batch_norm_dropout: 138 | xs_pad = self.xent_batchnorm(xs_pad) 139 | xs_pad = self.xent_layer(xs_pad) 140 | return output, xs_pad 141 | 142 | -------------------------------------------------------------------------------- /nnet_pytorch/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 
import importlib 4 | 5 | 6 | modules = glob.glob( 7 | os.path.sep.join( 8 | [os.path.dirname(__file__), '*.py'] 9 | ) 10 | ) 11 | 12 | for f in modules: 13 | if os.path.isfile(f) and '__init__.py' not in f and 'norms.py' not in f: 14 | module_name, ext = os.path.splitext(f) 15 | if ext == '.py': 16 | module = importlib.import_module('models.' + os.path.basename(module_name)) 17 | 18 | MODELS = { 19 | 'TDNN': TDNN.TDNN, 20 | 'ChainTDNN': TDNN.ChainTDNN, 21 | 'Resnet': Resnet.SpeechResnet, 22 | 'ChainResnet': Resnet.ChainSpeechResnet, 23 | 'WideResnet': WideResnet.SpeechResnet, 24 | 'ChainWideResnet': WideResnet.ChainSpeechResnet, 25 | 'BLSTM': BLSTM.BLSTM, 26 | 'ChainBLSTM': BLSTM.ChainBLSTM 27 | } 28 | 29 | def build_model(modelname, conf): 30 | return MODELS[modelname].build_model(conf) 31 | -------------------------------------------------------------------------------- /nnet_pytorch/objectives/AcceleratedSGLD.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | from .optimizer import Optimizer, required 4 | 5 | 6 | class AcceleratedSGLD(Optimizer): 7 | r"""Implements stochastic gradient descent (optionally with momentum). 8 | 9 | Nesterov momentum is based on the formula from 10 | `On the importance of initialization and momentum in deep learning`__. 11 | 12 | Args: 13 | params (iterable): iterable of parameters to optimize or dicts defining 14 | parameter groups 15 | lr (float): learning rate 16 | momentum (float, optional): momentum factor (default: 0) 17 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 18 | dampening (float, optional): dampening for momentum (default: 0) 19 | nesterov (bool, optional): enables Nesterov momentum (default: False) 20 | 21 | Example: 22 | >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) 23 | >>> optimizer.zero_grad() 24 | >>> loss_fn(model(input), target).backward() 25 | >>> optimizer.step() 26 | 27 | __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf 28 | 29 | .. note:: 30 | The implementation of SGD with Momentum/Nesterov subtly differs from 31 | Sutskever et. al. and implementations in some other frameworks. 32 | 33 | Considering the specific case of Momentum, the update can be written as 34 | 35 | .. math:: 36 | v = \rho * v + g \\ 37 | p = p - lr * v 38 | 39 | where p, g, v and :math:`\rho` denote the parameters, gradient, 40 | velocity, and momentum respectively. 41 | 42 | This is in contrast to Sutskever et. al. and 43 | other frameworks which employ an update of the form 44 | 45 | .. math:: 46 | v = \rho * v + lr * g \\ 47 | p = p - v 48 | 49 | The Nesterov version is analogously modified. 
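    Relative to plain SGD, step() below rescales the learning rate by a
    factor chosen so that the objective is pushed toward a target value
    finalval (overshooting it by rel_overshoot so the gradient stays useful),
    divides the update by a per-sample replay correction, and adds Langevin
    noise whose standard deviation shrinks with that same correction.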
50 | """ 51 | 52 | def __init__(self, params, finalval, lr=required, momentum=0, dampening=0, 53 | weight_decay=0, nesterov=False, stepscale=1.0, noise=0.005, 54 | rel_overshoot=0.1, epsilon=0.00005, 55 | ): 56 | if lr is not required and lr < 0.0: 57 | raise ValueError("Invalid learning rate: {}".format(lr)) 58 | if momentum < 0.0: 59 | raise ValueError("Invalid momentum value: {}".format(momentum)) 60 | if weight_decay < 0.0: 61 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 62 | 63 | defaults = dict(lr=lr, momentum=momentum, dampening=dampening, 64 | weight_decay=weight_decay, nesterov=nesterov) 65 | if nesterov and (momentum <= 0 or dampening != 0): 66 | raise ValueError("Nesterov momentum requires a momentum and zero dampening") 67 | super(AcceleratedSGLD, self).__init__(params, defaults) 68 | self.noise = noise 69 | self.stepscale = stepscale 70 | # Shoot for 10% better (helps with gradient) 71 | if finalval >= 0: 72 | self.final_val = finalval * (1 - rel_overshoot) 73 | else: 74 | self.final_val = finalval * (1 + rel_overshoot) 75 | self.epsilon = epsilon 76 | 77 | def __setstate__(self, state): 78 | super(SGD, self).__setstate__(state) 79 | for group in self.param_groups: 80 | group.setdefault('nesterov', False) 81 | 82 | def langevin_noise(self, x, std=1.0): 83 | return self.noise * torch.randn_like(x).mul_(std) 84 | 85 | def step(self, startval=None, numsteps=None): 86 | """Performs a single optimization step. 87 | 88 | Arguments: 89 | closure (callable, optional): A closure that reevaluates the model 90 | and returns the loss. 91 | """ 92 | loss = None 93 | for group in self.param_groups: 94 | weight_decay = group['weight_decay'] 95 | for p in group['params']: 96 | if p.grad is None: 97 | continue 98 | grad_norm = max(self.epsilon, (p.grad.data ** 2.0).sum()) 99 | #print("Grad Norm: ", grad_norm.data.item()) 100 | if grad_norm <= self.epsilon: 101 | print("Small Grad Norm!!") 102 | grad_norm = self.epsilon 103 | #opt_lr = abs(self.final_val - startval)/grad_norm 104 | opt_lr = (self.final_val - startval)/grad_norm 105 | #print("Final Value: ", self.final_val, " -- Opt LR: ", opt_lr) 106 | # When we are below the requested value, we can just descend at 107 | # at a normal pace ... 
108 |                 opt_lr = self.epsilon / grad_norm if opt_lr > 0 else -opt_lr
109 |                 d_p = p.grad.data
110 |                 if weight_decay != 0:
111 |                     d_p.add_(p.data, alpha=weight_decay)
112 |
113 |                 replay_correction = numsteps[:, None, None] ** self.stepscale
114 |                 langevin_std = 1.0 / replay_correction
115 |
116 |                 self.state[p]['update'] = self.langevin_noise(p.data, std=langevin_std).add_(
117 |                     d_p.div_(replay_correction),
118 |                     alpha=-group['lr'] * opt_lr,
119 |                 )
120 |                 p.data.add_(self.state[p]['update'])
121 |                 self.state[p]['opt_lr'] = opt_lr
122 |         return loss
123 |
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/CrossEntropy.py: --------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | import torch
3 |
4 |
5 | class CrossEntropy(torch.nn.Module):
6 |     @staticmethod
7 |     def add_args(parser):
8 |         pass
9 |
10 |     @classmethod
11 |     def build_objective(cls, conf):
12 |         return CrossEntropy()
13 |
14 |     @classmethod
15 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
16 |         return s1
17 |
18 |     def __init__(self):
19 |         super(CrossEntropy, self).__init__()
20 |
21 |     def forward(self, model, sample, precomputed=None):
22 |         if precomputed is not None:
23 |             output = precomputed
24 |         else:
25 |             output = model(sample)[0]
26 |
27 |         lprobs = F.log_softmax(output, dim=-1)
28 |         lprobs = lprobs.view(-1, lprobs.size(-1))
29 |         loss = F.nll_loss(lprobs, sample.target.view(-1), reduction='mean')
30 |         correct = torch.sum(lprobs.argmax(1) == sample.target.view(-1))
31 |         return loss, correct
32 |
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/L2.py: --------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | import torch
3 |
4 |
5 | class L2(torch.nn.Module):
6 |     @staticmethod
7 |     def add_args(parser):
8 |         pass
9 |
10 |     @classmethod
11 |     def build_objective(cls, conf):
12 |         return L2()
13 |
14 |     @classmethod
15 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
16 |         return s1
17 |
18 |     def __init__(self):
19 |         super(L2, self).__init__()
20 |
21 |     def forward(self, model, sample, precomputed=None):
22 |         if precomputed is not None:
23 |             x = precomputed
24 |         else:
25 |             x = model(sample)[0]
26 |
27 |         loss = ((x ** 2).sum()) / (x.size(0) * x.size(1))
28 |         return loss, None
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/LFMMI.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from .L2 import L2
5 | from .CrossEntropy import CrossEntropy
6 | from .LFMMIOnly import ChainLoss as LFMMI
7 |
8 |
9 | class ChainLoss(nn.Module):
10 |     @staticmethod
11 |     def add_args(parser):
12 |         parser.add_argument('--xent-reg', type=float, default=0.2)
13 |         parser.add_argument('--l2-reg', type=float, default=0.00025)
14 |         for m in [L2, CrossEntropy, LFMMI]:
15 |             m.add_args(parser)
16 |
17 |     @classmethod
18 |     def build_objective(cls, conf):
19 |         return ChainLoss(
20 |             conf['denom_graph'],
21 |             xent_reg=conf['xent_reg'],
22 |             l2_reg=conf['l2_reg'],
23 |             leaky_hmm=conf.get('leaky_hmm', 0.1),
24 |         )
25 |
26 |     @classmethod
27 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
28 |         return s1
29 |
30 |     def __init__(
31 |         self, den_graph,
32 |         xent_reg=0.2, l2_reg=0.00025, avg=True, leaky_hmm=0.1,
33 |     ):
34 | super(ChainLoss, self).__init__() 35 | self.lfmmi = LFMMI(den_graph, leaky_hmm=leaky_hmm) 36 | self.xent = CrossEntropy() 37 | self.l2 = L2() 38 | 39 | self.l2_reg = l2_reg 40 | self.xent_reg = xent_reg 41 | 42 | def forward(self, model, sample, precomputed=None): 43 | if precomputed is not None: 44 | chain_output = precomputed 45 | else: 46 | chain_output = model(sample) 47 | 48 | losses = [] 49 | correct = None 50 | # LFMMI 51 | loss_lfmmi, _ = self.lfmmi( 52 | model, 53 | sample, 54 | precomputed=chain_output[0], 55 | ) 56 | print('LFMMI: {}'.format(loss_lfmmi.data.item()), end=' ') 57 | losses.append(loss_lfmmi) 58 | # XENT 59 | if self.xent_reg > 0: 60 | loss_xent, correct = self.xent( 61 | model, 62 | sample, 63 | precomputed=chain_output[1], 64 | ) 65 | loss_xent *= self.xent_reg 66 | print('XENT: {}'.format(loss_xent.data.item()), end=' ') 67 | losses.append(loss_xent) 68 | 69 | # L2 70 | if self.l2_reg > 0: 71 | loss_l2, _ = self.l2( 72 | model, 73 | sample, 74 | precomputed=chain_output[0], 75 | ) 76 | loss_l2 *= self.l2_reg 77 | print('L2: {}'.format(loss_l2.data.item()), end=' ') 78 | losses.append(loss_l2) 79 | 80 | loss = sum(losses) 81 | return loss, correct 82 | 83 | 84 | -------------------------------------------------------------------------------- /nnet_pytorch/objectives/LFMMIOnly.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | # Copyright 2020 4 | # Apache 2.0 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from .pychain.pychain.graph import ChainGraphBatch, ChainGraph 10 | import pychain_C 11 | import simplefst 12 | from .pychain.pychain.chain import ChainFunction 13 | 14 | 15 | class NumeratorFunction(torch.autograd.Function): 16 | @staticmethod 17 | def forward(ctx, input, targets): 18 | input = input.clamp(-30, 30) 19 | output = input.gather(2, targets.unsqueeze(2)).sum() 20 | B = input.size(0) 21 | num_grad = torch.zeros_like(input) 22 | num_grad.scatter_(2, targets.unsqueeze(2), 1.0) 23 | ctx.save_for_backward(num_grad) 24 | return output 25 | 26 | @staticmethod 27 | def backward(ctx, objf_grad): 28 | num_grad, = ctx.saved_tensors 29 | num_grad = torch.mul(num_grad, objf_grad) 30 | return num_grad, None 31 | 32 | 33 | class ChainLoss(nn.Module): 34 | @staticmethod 35 | def add_args(parser): 36 | parser.add_argument('--denom-graph', required=True) 37 | parser.add_argument('--leaky-hmm', type=float, default=0.1) 38 | 39 | @classmethod 40 | def build_objective(cls, conf): 41 | return ChainLoss( 42 | conf['denom_graph'], 43 | avg=True, 44 | leaky_hmm=conf.get('leaky_hmm', 0.1) 45 | ) 46 | 47 | @classmethod 48 | def add_state_dict(cls, s1, s2, fraction, iteration=None): 49 | return s1 50 | 51 | def __init__(self, den_graph, avg=True, leaky_hmm=0.1): 52 | super(ChainLoss, self).__init__() 53 | self.den_graph = ChainGraph( 54 | fst=simplefst.StdVectorFst.read(den_graph), 55 | ) 56 | self.avg = avg 57 | self.leaky_hmm = leaky_hmm 58 | 59 | def forward(self, model, sample, precomputed=None): 60 | B = sample.input.size(0) # batchsize 61 | den_graphs = ChainGraphBatch(self.den_graph, B) 62 | 63 | # Check if we are using precomputed values 64 | if precomputed is not None: 65 | x = precomputed 66 | else: 67 | x = model(sample)[0] 68 | 69 | T = x.size(1) # Length 70 | x_lengths = torch.LongTensor([T] * B).to(x.device) 71 | den_objf = ChainFunction.apply(x, x_lengths, den_graphs, self.leaky_hmm) 72 | num_objf = NumeratorFunction.apply(x, 
sample.target)
73 |         loss = -(num_objf - den_objf)
74 |         if self.avg:
75 |             loss /= (B * T)
76 |         return loss, None
77 | 
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/SGLD.py: --------------------------------------------------------------------------------
1 | import torch
2 | from .optimizer import Optimizer, required
3 | 
4 | 
5 | class SGLD(Optimizer):
6 |     r"""Implements SGLD: stochastic gradient descent (optionally with momentum) with Langevin noise added to each update.
7 | 
8 |     Nesterov momentum is based on the formula from
9 |     `On the importance of initialization and momentum in deep learning`__.
10 | 
11 |     Args:
12 |         params (iterable): iterable of parameters to optimize or dicts defining
13 |             parameter groups
14 |         lr (float): learning rate
15 |         momentum (float, optional): momentum factor (default: 0)
16 |         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
17 |         dampening (float, optional): dampening for momentum (default: 0)
18 |         nesterov (bool, optional): enables Nesterov momentum (default: False)
19 | 
20 |     Example:
21 |         >>> optimizer = SGLD(model.parameters(), lr=0.1, momentum=0.9)
22 |         >>> optimizer.zero_grad()
23 |         >>> loss_fn(model(input), target).backward()
24 |         >>> optimizer.step()
25 | 
26 |     __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
27 | 
28 |     .. note::
29 |         The implementation of SGD with Momentum/Nesterov subtly differs from
30 |         Sutskever et al. and implementations in some other frameworks.
31 | 
32 |         Considering the specific case of Momentum, the update can be written as
33 | 
34 |         .. math::
35 |                   v = \rho * v + g \\
36 |                   p = p - lr * v
37 | 
38 |         where p, g, v and :math:`\rho` denote the parameters, gradient,
39 |         velocity, and momentum respectively.
40 | 
41 |         This is in contrast to Sutskever et al. and
42 |         other frameworks which employ an update of the form
43 | 
44 |         .. math::
45 |              v = \rho * v + lr * g \\
46 |              p = p - v
47 | 
48 |         The Nesterov version is analogously modified.
49 |     """
50 | 
51 |     def __init__(self, params, lr=required, momentum=0, dampening=0,
52 |                  weight_decay=0, nesterov=False, clamp=1.0, stepscale=1.0, noise=0.005):
53 |         if lr is not required and lr < 0.0:
54 |             raise ValueError("Invalid learning rate: {}".format(lr))
55 |         if momentum < 0.0:
56 |             raise ValueError("Invalid momentum value: {}".format(momentum))
57 |         if weight_decay < 0.0:
58 |             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
59 | 
60 |         defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
61 |                         weight_decay=weight_decay, nesterov=nesterov)
62 |         if nesterov and (momentum <= 0 or dampening != 0):
63 |             raise ValueError("Nesterov momentum requires a momentum and zero dampening")
64 |         super(SGLD, self).__init__(params, defaults)
65 |         self.noise = noise
66 |         self.stepscale = stepscale
67 |         self.clamp = clamp
68 | 
69 |     def __setstate__(self, state):
70 |         super(SGLD, self).__setstate__(state)
71 |         for group in self.param_groups:
72 |             group.setdefault('nesterov', False)
73 | 
74 |     def langevin_noise(self, x, std=1.0):
75 |         return self.noise * torch.randn_like(x).mul_(std)
76 | 
77 |     def step(self, numsteps=None, closure=None):
78 |         """Performs a single optimization step.
79 | 
80 |         Arguments:
81 |             closure (callable, optional): A closure that reevaluates the model
82 |                 and returns the loss.
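  |             numsteps (Tensor): per-sample replay-buffer step counts; the
  |                 gradient term below is divided by numsteps**stepscale, so
  |                 samples that have been refined for longer take smaller
  |                 steps. (Description inferred from the code; this argument
  |                 was previously undocumented.)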
83 | """ 84 | loss = None 85 | if closure is not None: 86 | loss = closure() 87 | 88 | for group in self.param_groups: 89 | weight_decay = group['weight_decay'] 90 | momentum = group['momentum'] 91 | dampening = group['dampening'] 92 | nesterov = group['nesterov'] 93 | 94 | for p in group['params']: 95 | if p.grad is None: 96 | continue 97 | d_p = p.grad.data 98 | if weight_decay != 0: 99 | d_p.add_(p.data, alpha=weight_decay) 100 | if momentum != 0: 101 | param_state = self.state[p] 102 | if 'momentum_buffer' not in param_state: 103 | buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() 104 | else: 105 | buf = param_state['momentum_buffer'] 106 | buf.mul_(momentum).add_(d_p, alpha=1-dampening) 107 | if nesterov: 108 | d_p = d_p.add(momentum, buf) 109 | else: 110 | d_p = buf 111 | replay_correction = numsteps[:, None, None] ** self.stepscale 112 | langevin_std = 1.0 #/ replay_correction 113 | p.data.add_( 114 | self.langevin_noise(p.data, std=langevin_std).add_( 115 | d_p.div_(replay_correction), 116 | alpha=-group['lr'], 117 | ) 118 | ) 119 | 120 | return loss 121 | -------------------------------------------------------------------------------- /nnet_pytorch/objectives/SGLDAdam.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from .optimizer import Optimizer 4 | 5 | 6 | class SGLDAdam(Optimizer): 7 | r"""Implements Adam algorithm. 8 | 9 | It has been proposed in `Adam: A Method for Stochastic Optimization`_. 10 | 11 | Arguments: 12 | params (iterable): iterable of parameters to optimize or dicts defining 13 | parameter groups 14 | lr (float, optional): learning rate (default: 1e-3) 15 | betas (Tuple[float, float], optional): coefficients used for computing 16 | running averages of gradient and its square (default: (0.9, 0.999)) 17 | eps (float, optional): term added to the denominator to improve 18 | numerical stability (default: 1e-8) 19 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 20 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 21 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 22 | (default: False) 23 | 24 | .. _Adam\: A Method for Stochastic Optimization: 25 | https://arxiv.org/abs/1412.6980 26 | .. 
26 |     .. _On the Convergence of Adam and Beyond:
27 |         https://openreview.net/forum?id=ryQu7f-RZ
28 |     """
29 | 
30 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
31 |                  weight_decay=0, amsgrad=False, noise=0.005, stepscale=1.0):
32 |         if not 0.0 <= lr:
33 |             raise ValueError("Invalid learning rate: {}".format(lr))
34 |         if not 0.0 <= eps:
35 |             raise ValueError("Invalid epsilon value: {}".format(eps))
36 |         if not 0.0 <= betas[0] < 1.0:
37 |             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
38 |         if not 0.0 <= betas[1] < 1.0:
39 |             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
40 |         if not 0.0 <= weight_decay:
41 |             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
42 |         defaults = dict(lr=lr, betas=betas, eps=eps,
43 |                         weight_decay=weight_decay, amsgrad=amsgrad)
44 |         super(SGLDAdam, self).__init__(params, defaults)
45 |         self.noise = noise
46 |         self.stepscale = stepscale
47 | 
48 |     def __setstate__(self, state):
49 |         super(SGLDAdam, self).__setstate__(state)
50 |         for group in self.param_groups:
51 |             group.setdefault('amsgrad', False)
52 | 
53 |     def langevin_noise(self, x, std=1.0):
54 |         return self.noise * torch.randn_like(x).mul_(std)
55 | 
56 |     @torch.no_grad()
57 |     def step(self, numsteps=None, closure=None):
58 |         """Performs a single optimization step.
59 | 
60 |         Arguments:
61 |             closure (callable, optional): A closure that reevaluates the model
62 |                 and returns the loss.
63 |         """
64 |         loss = None
65 |         if closure is not None:
66 |             with torch.enable_grad():
67 |                 loss = closure()
68 | 
69 |         for group in self.param_groups:
70 |             for p in group['params']:
71 |                 if p.grad is None:
72 |                     continue
73 |                 grad = p.grad
74 |                 if grad.is_sparse:
75 |                     raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
76 |                 amsgrad = group['amsgrad']
77 | 
78 |                 state = self.state[p]
79 | 
80 |                 # State initialization
81 |                 if len(state) == 0:
82 |                     state['step'] = 0
83 |                     # Exponential moving average of gradient values
84 |                     state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
85 |                     # Exponential moving average of squared gradient values
86 |                     state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
87 |                     if amsgrad:
88 |                         # Maintains max of all exp. moving avg. of sq. grad. values
89 |                         state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
90 | 
91 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
92 |                 if amsgrad:
93 |                     max_exp_avg_sq = state['max_exp_avg_sq']
94 |                 beta1, beta2 = group['betas']
95 | 
96 |                 state['step'] += 1
97 |                 bias_correction1 = 1 - beta1 ** state['step']
98 |                 bias_correction2 = 1 - beta2 ** state['step']
99 | 
100 |                 if group['weight_decay'] != 0:
101 |                     grad = grad.add(p, alpha=group['weight_decay'])
102 | 
103 |                 # Decay the first and second moment running average coefficient
104 |                 exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
105 |                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
106 |                 if amsgrad:
107 |                     # Maintains the maximum of all 2nd moment running avg. till now
108 |                     torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
109 |                     # Use the max. for normalizing running avg. of gradient
110 |                     denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
111 |                 else:
112 |                     denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
113 | 
114 |                 step_size = group['lr'] / bias_correction1
115 |                 replay_correction = numsteps[:, None, None] ** self.stepscale
116 |                 langevin_std = 1.0  #/ replay_correction
117 | 
118 |                 p.add_(self.langevin_noise(p.data, std=langevin_std))
119 |                 p.addcdiv_(exp_avg, denom.mul_(replay_correction), value=-step_size)
120 | 
121 |         return loss
122 | 
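  | # A minimal usage sketch (added; all names here are hypothetical): `feats`
  | # is a batch of buffer samples being refined by Langevin dynamics, and
  | # `counts` records how many SGLD steps each sample has already taken.
  | #
  | #   feats = torch.randn(8, 100, 80, requires_grad=True)
  | #   opt = SGLDAdam([feats], lr=1e-3, noise=0.005, stepscale=1.0)
  | #   energy(feats).sum().backward()  # `energy`: some scalar model score
  | #   opt.step(numsteps=counts)       # counts: FloatTensor of shape (8,)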
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/SemisupLFMMI.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from .L2 import L2
5 | from .CrossEntropy import CrossEntropy
6 | from .LFMMI_EBM import SequenceEBMLoss as SeqEBM
7 | 
8 | 
9 | class ChainLoss(nn.Module):
10 |     @staticmethod
11 |     def add_args(parser):
12 |         parser.add_argument('--xent-reg', type=float, default=0.2)
13 |         parser.add_argument('--l2-reg', type=float, default=0.00025)
14 |         for m in [L2, CrossEntropy, SeqEBM]:
15 |             m.add_args(parser)
16 | 
17 |     @classmethod
18 |     def build_objective(cls, conf):
19 |         seq_ebm = SeqEBM.build_objective(conf)
20 |         return ChainLoss(
21 |             seq_ebm,
22 |             xent_reg=conf['xent_reg'],
23 |             l2_reg=conf['l2_reg'],
24 |         )
25 | 
26 |     @classmethod
27 |     def add_state_dict(cls, s1, s2, fraction, iteration=None):
28 |         return {
29 |             'seq_ebm': SeqEBM.add_state_dict(
30 |                 s1['seq_ebm'], s2['seq_ebm'], fraction, iteration=iteration,
31 |             ),
32 |         }
33 | 
34 |     def __init__(
35 |         self, seq_ebm, xent_reg=0.2, l2_reg=0.00025,
36 |     ):
37 |         super(ChainLoss, self).__init__()
38 |         self.seq_ebm = seq_ebm
39 |         self.xent = CrossEntropy()
40 |         self.l2 = L2()
41 | 
42 |         self.l2_reg = l2_reg
43 |         self.xent_reg = xent_reg
44 | 
45 |     def forward(self, model, sample):
46 |         is_unsup = sample.target[0, 0] == -1
47 |         chain_output = model(sample)
48 |         losses = []
49 |         correct = None
50 |         # SeqEBM
51 |         loss_seqebm, _ = self.seq_ebm(
52 |             model,
53 |             sample,
54 |             precomputed=chain_output[0],
55 |         )
56 |         losses.append(loss_seqebm)
57 | 
58 |         # XENT
59 |         if not is_unsup and self.xent_reg > 0:
60 |             loss_xent, correct = self.xent(
61 |                 model,
62 |                 sample,
63 |                 precomputed=chain_output[1],
64 |             )
65 |             loss_xent *= self.xent_reg
66 |             print('XENT: {}'.format(loss_xent.data.item()), end=' ')
67 |             losses.append(loss_xent)
68 | 
69 |         # L2
70 |         if self.l2_reg > 0 and not is_unsup:
71 |             loss_l2, _ = self.l2(
72 |                 model,
73 |                 sample,
74 |                 precomputed=chain_output[0],
75 |             )
76 |             loss_l2 *= self.l2_reg
77 |             print('L2: {}'.format(loss_l2.data.item()), end=' ')
78 |             losses.append(loss_l2)
79 | 
80 |         loss = sum(losses)
81 |         return loss, correct
82 | 
83 |     def state_dict(self):
84 |         return {
85 |             'seq_ebm': self.seq_ebm.state_dict()
86 |         }
87 | 
88 |     def load_state_dict(self, state_dict):
89 |         # state_dict() above serializes only the seq_ebm component, so load
90 |         # only that here; calling super().load_state_dict() on this dict
91 |         # would fail with unexpected keys.
92 |         self.seq_ebm.load_state_dict(state_dict['seq_ebm'])
93 | 
94 |     def generate_from_buffer(self):
95 |         return self.seq_ebm.generate_from_buffer()
96 | 
97 |     def generate_from_model(self, model, **kwargs):
98 |         return self.seq_ebm.generate_from_model(model, **kwargs)
99 | 
100 |     def decorrupt(self, model, sample, num_steps=None):
101 |         return self.seq_ebm.decorrupt(model, sample, num_steps)
102 | 
103 | 
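  | # Note (added comment): unlabeled batches are marked by target values of
  | # -1 (see utils/prepare_unlabeled_tgt.py); for those batches only the EBM
  | # term contributes, as guarded by `is_unsup` above.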
-------------------------------------------------------------------------------- /nnet_pytorch/objectives/__init__.py: --------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import importlib
4 | 
5 | 
6 | modules = glob.glob(
7 |     os.path.sep.join(
8 |         [os.path.dirname(__file__), '*.py']
9 |     )
10 | )
11 | 
12 | for f in modules:
13 |     if os.path.isfile(f) and '__init__.py' not in f:
14 |         module_name, ext = os.path.splitext(f)
15 |         if ext == '.py':
16 |             # importing a submodule binds its name on this package module
17 |             module = importlib.import_module('objectives.' + os.path.basename(module_name))
18 | 
19 | OBJECTIVES = {
20 |     'CrossEntropy': CrossEntropy.CrossEntropy,
21 |     'LFMMI': LFMMI.ChainLoss,
22 |     'SemisupLFMMI': SemisupLFMMI.ChainLoss,
23 |     'LFMMI_EBM': LFMMI_EBM.SequenceEBMLoss,
24 |     'CrossEntropy_EBM': CrossEntropy_EBM.EBMLoss,
25 |     'LFMMINum': LFMMIOnly.NumeratorFunction,
26 |     # only objectives whose modules exist in this directory are registered;
27 |     # entries for the absent LFMMI_MCE and SemisupMCE modules would raise
28 |     # NameError at import time.
29 | }
30 | 
31 | def build_objective(objectivename, conf):
32 |     return OBJECTIVES[objectivename].build_objective(conf)
-------------------------------------------------------------------------------- /nnet_pytorch/utils/average_models.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2020
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import json
11 | import torch
12 | import models
13 | 
14 | def main():
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument('modeldir',
17 |         help='Output model directory',
18 |         type=str,
19 |     )
20 |     parser.add_argument('idim', type=int)
21 |     parser.add_argument('start',
22 |         help='Start epoch model to average',
23 |         type=int,
24 |     )
25 |     parser.add_argument('end',
26 |         help='End epoch model to average',
27 |         type=int,
28 |     )
29 |     #parser.add_argument('--weights',
30 |     #    help='Weights for each model',
31 |     #)
32 |     args = parser.parse_args()
33 | 
34 |     conf = json.load(open(args.modeldir + '/conf.1.json'))
35 |     conf['idim'] = args.idim
36 |     new_model = models.MODELS[conf['model']].build_model(conf)
37 |     new_dict = new_model.state_dict()
38 |     for name, param in new_dict.items():
39 |         if len(param.size()) > 0:
40 |             param.mul_(0.0)
41 | 
42 |     fraction = 1.0 / (args.end - args.start + 1)
43 |     for m in range(args.start, args.end + 1):
44 |         state_dict = torch.load(
45 |             args.modeldir + '/{}.mdl'.format(m),
46 |             map_location=torch.device('cpu')
47 |         )
48 |         for name, p in state_dict['model'].items():
49 |             if name in new_dict:
50 |                 if len(p.size()) != 0:
51 |                     new_dict[name].add_(p, alpha=fraction)
52 |                 else:
53 |                     new_dict[name] = (p * fraction).type(new_dict[name].dtype)
54 |     torch.save(
55 |         {'model': new_dict},
56 |         args.modeldir + '/{}_{}.mdl'.format(args.start, args.end)
57 |     )
58 | 
59 | if __name__ == "__main__":
60 |     main()
61 | 
62 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/combine_models.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import models
11 | import objectives
12 | import torch
13 | import json
14 | from itertools import chain
15 | from copy import deepcopy
16 | import math
17 | from LRScheduler import LRScheduler
18 | from collections import defaultdict
19 | # from torch._six import container_abcs  # unused; deprecated in newer torch
20 | 
21 | 
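  | # combine_models.py averages several saved checkpoints into one: model
  | # weights are averaged directly, optimizer state is merged in
  | # update_opt_state_dict() below, and objective state is delegated to the
  | # objective's own add_state_dict().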
22 | def main():
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument('omodel', help='path to output model', type=str,)
25 |     parser.add_argument('conf', type=str)
26 |     parser.add_argument('--save-models', action='store_true')
27 |     parser.add_argument('--models', nargs='+', type=str, help='paths to models')
28 |     args = parser.parse_args()
29 | 
30 |     conf = json.load(open(args.conf))
31 |     new_model = models.MODELS[conf['model']].build_model(conf)
32 |     objective = objectives.OBJECTIVES[conf['objective']].build_objective(conf)
33 | 
34 |     params = list(
35 |         filter(
36 |             lambda p: p.requires_grad,
37 |             chain(new_model.parameters(), objective.parameters()),
38 |         )
39 |     )
40 | 
41 |     optimizers = {
42 |         'sgd': torch.optim.SGD(params, lr=conf['lr'], momentum=0.0),
43 |         'adadelta': torch.optim.Adadelta(params, lr=conf['lr']),
44 |         'adam': torch.optim.Adam(params, lr=conf['lr'], weight_decay=conf['weight_decay']),
45 |     }
46 | 
47 |     optimizer = optimizers[conf['optim']]
48 |     opt_state_dict = optimizer.state_dict()
49 | 
50 | 
51 |     new_mdl_dict = new_model.state_dict()
52 |     new_optim_dict = optimizer.state_dict()
53 |     new_objective_dict = objective.state_dict()
54 | 
55 |     for name, param in new_mdl_dict.items():
56 |         if len(param.size()) > 0:
57 |             param.mul_(0.0)
58 | 
59 |     fraction = 1.0 / (len(args.models))
60 |     for i, m in enumerate(args.models):
61 |         print("Combining Model ", i, " ...")
62 |         state_dict = torch.load(m, map_location=torch.device('cpu'))
63 |         if i == 0 and 'buffer' in state_dict:
64 |             new_buffer = torch.FloatTensor(
65 |                 state_dict['buffer'].cpu().size(0),
66 |                 state_dict['buffer'].cpu().size(1),
67 |                 state_dict['buffer'].cpu().size(2),
68 |             )
69 |             new_buffer_numsteps = torch.zeros(state_dict['buffer'].cpu().size(0))
70 | 
71 |         #----------------------- Model -------------------------
72 |         # To combine models, we just average the weights
73 |         for name, p in state_dict['model'].items():
74 |             if name in new_mdl_dict:
75 |                 if len(p.size()) != 0:
76 |                     new_mdl_dict[name].add_(p, alpha=fraction)
77 |                 else:
78 |                     new_mdl_dict[name] = (p * fraction).type(new_mdl_dict[name].dtype)
79 | 
80 |         #--------------------- Objectives ---------------------
81 |         # Combining objectives is harder: we average parameter weights where
82 |         # applicable, but for some models, such as the EBM models, we have
83 |         # to specify how to combine things like the sampling buffer. That
84 |         # combination is model specific and should therefore be written as
85 |         # a method in the objective's class. For now we have just done it
86 |         # here though.
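  |         # A minimal sketch of the weight averaging above, for two
  |         # hypothetical checkpoints a.mdl and b.mdl with the same
  |         # architecture (illustration only, not part of the original code):
  |         #
  |         #   sd_a = torch.load('a.mdl', map_location='cpu')['model']
  |         #   sd_b = torch.load('b.mdl', map_location='cpu')['model']
  |         #   avg = {k: 0.5 * sd_a[k] + 0.5 * sd_b[k] for k in sd_a}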
87 |         update_opt_state_dict(new_optim_dict, state_dict['optimizer'], fraction)
88 |         new_objective_dict = objective.add_state_dict(
89 |             new_objective_dict, state_dict['objective'],
90 |             fraction, iteration=i,
91 |         )
92 | 
93 |     new_state_dict = {
94 |         'model': new_mdl_dict,
95 |         'objective': new_objective_dict,
96 |         'optimizer': new_optim_dict,  # averaged optimizer state from update_opt_state_dict()
97 |         'lr_sched': state_dict['lr_sched'],
98 |         'epoch': state_dict['epoch'],
99 |     }
100 | 
101 |     torch.save(
102 |         new_state_dict,
103 |         args.omodel,
104 |     )
105 | 
106 |     if not args.save_models:
107 |         for m in args.models:
108 |             os.remove(m)
109 | 
110 | 
111 | def update_opt_state_dict(state_dict1, state_dict2, fraction):
112 |     '''
113 |         Update state_dict1 with state_dict2, where the merged values are
114 |         val1 + fraction * val2
115 |     '''
116 |     groups2 = state_dict2['param_groups']
117 |     groups1 = state_dict1['param_groups']
118 | 
119 |     if len(groups1) != len(groups2):
120 |         raise ValueError("state dict has a different number of parameter groups")
121 | 
122 |     param_lens = (len(g['params']) for g in groups1)
123 |     saved_lens = (len(g['params']) for g in groups2)
124 |     if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)):
125 |         raise ValueError("loaded state dict contains a parameter group that "
126 |                          "doesn't match the size of the optimizer's group")
127 | 
128 |     id_map = {p: old_id for old_id, p in
129 |               zip(chain(*(g['params'] for g in groups1)),
130 |                   chain(*(g['params'] for g in groups2)))}
131 | 
132 |     for k, v in state_dict2['state'].items():
133 |         if k in id_map:
134 |             param = id_map[k]
135 |             if param in state_dict1['state']:
136 |                 for p_name, p in v.items():
137 |                     if isinstance(p, torch.Tensor):
138 |                         state_dict1['state'][param][p_name] += fraction * p
139 |             else:
140 |                 state_dict1['state'][param] = {key: fraction * val for key, val in v.items()}
141 |         else:
142 |             for p_name, p in v.items():
143 |                 if isinstance(p, torch.Tensor):
144 |                     state_dict1['state'][k][p_name] = fraction * p
145 | 
146 | 
147 | if __name__ == "__main__":
148 |     main()
149 | 
150 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/decode_nnet_pytorch.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | . ./path.sh
4 | 
5 | batchsize=512
6 | checkpoint=final.mdl
7 | prior_scale=1.0
8 | prior_floor=-20.0
9 | prior_name="priors"
10 | min_active=200
11 | max_active=7000
12 | max_mem=50000000
13 | lattice_beam=8.0
14 | beam=15.0
15 | acoustic_scale=1.0
16 | post_decode_acwt=10.0 # 10.0 for chain systems, 1.0 for non-chain
17 | 
18 | min_lmwt=6
19 | max_lmwt=18
20 | nj=80
21 | stage=0
22 | 
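  | # Example invocation (a sketch; the paths below are hypothetical):
  | #   ./decode_nnet_pytorch.sh --batchsize 256 \
  | #       data/test_clean exp/wrn exp/chain/graph exp/wrn/decode_test_clean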
23 | . ./utils/parse_options.sh
24 | if [ $# -ne 4 ]; then
25 |   echo "Usage: ./decode_nnet_pytorch.sh <data> <pytorch_model> <graphdir> <odir>"
26 |   echo " --batchsize ${batchsize} "
27 |   echo " --checkpoint ${checkpoint} --prior-scale ${prior_scale} --prior-floor ${prior_floor} --prior-name ${prior_name}"
28 |   echo " --min-active ${min_active} --max-active ${max_active}"
29 |   echo " --max-mem ${max_mem} --lattice-beam ${lattice_beam}"
30 |   echo " --beam ${beam} --acoustic-scale ${acoustic_scale} --post-decode-acwt ${post_decode_acwt}"
31 |   echo " --nj ${nj}"
32 |   exit 1;
33 | fi
34 | 
35 | data=$1
36 | pytorch_model=$2
37 | graphdir=$3
38 | odir=$4
39 | 
40 | # We assume the acoustic model (trans.mdl) is 1 level above the graphdir
41 | amdir=`dirname ${graphdir}`
42 | trans_mdl=${amdir}/final.mdl
43 | words_file=${graphdir}/words.txt
44 | hclg=${graphdir}/HCLG.fst
45 | 
46 | mkdir -p ${odir}/log
47 | 
48 | decode_cmd="utils/queue.pl --mem 6G -l hostname='!b02*&!a*&!c06*&!c23*&!c24*&!c25*&!c26*&!c27*'" # The 'a' machines are just too slow
49 | if [ $stage -le 0 ]; then
50 |   segments=${data}/segments
51 |   if [ ! -f ${data}/segments ]; then
52 |     echo "No segments file found. Assuming wav.scp is indexed by utterance"
53 |     segments=${data}/wav.scp
54 |   fi
55 | 
56 |   ${decode_cmd} JOB=1:${nj} ${odir}/log/decode.JOB.log \
57 |     ./utils/split_scp.pl -j ${nj} \$\[JOB -1\] ${segments} \|\
58 |     decode.py --datadir ${data} \
59 |       --modeldir ${pytorch_model} \
60 |       --dumpdir ${odir} \
61 |       --checkpoint ${checkpoint} \
62 |       --prior-scale ${prior_scale} \
63 |       --prior-floor ${prior_floor} \
64 |       --prior-name ${prior_name} \
65 |       --words-file ${words_file} \
66 |       --trans-mdl ${trans_mdl} \
67 |       --hclg ${hclg} \
68 |       --min-active ${min_active} \
69 |       --max-active ${max_active} \
70 |       --lattice-beam ${lattice_beam} \
71 |       --beam ${beam} \
72 |       --acoustic-scale ${acoustic_scale} \
73 |       --post-decode-acwt ${post_decode_acwt} \
74 |       --job JOB \
75 |       --utt-subset /dev/stdin \
76 |       --batchsize ${batchsize}
77 | fi
78 | 
79 | if [ $stage -le 1 ]; then
80 |   ./local/score.sh --cmd "$decode_cmd" \
81 |     --min-lmwt ${min_lmwt} --max-lmwt ${max_lmwt} --word-ins-penalty 0.0 \
82 |     ${data} ${graphdir} ${odir}
83 | fi
-------------------------------------------------------------------------------- /nnet_pytorch/utils/memmap_data.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2020
4 | # Apache 2.0
5 | 
6 | from datasets.data_utils import memmap_feats
7 | import pickle
8 | import argparse
9 | 
10 | 
11 | def main():
12 |     parser = argparse.ArgumentParser(
13 |         description='Takes Kaldi features, converts them to numpy objects and '
14 |                     'stores a memory-mapped version for efficient access in training.'
15 |     )
16 |     parser.add_argument('feats_scp')
17 |     parser.add_argument('feats_scp_mapped')
18 |     parser.add_argument('metadata')
19 |     parser.add_argument('--utt-list', default=None)
20 | 
21 |     args = parser.parse_args()
22 |     utt_list = []
23 |     if args.utt_list is not None:
24 |         with open(args.utt_list, 'r') as f:
25 |             for line in f:
26 |                 utt_list.append(line.strip().split(None, 1)[0])
27 |     utt_lengths, offsets, data_shape = memmap_feats(
28 |         args.feats_scp, args.feats_scp_mapped, utt_list
29 |     )
30 |     with open(args.metadata + '.pkl', 'wb') as f:
31 |         pickle.dump([utt_lengths, offsets, data_shape], f)
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     main()
36 | 
37 | 
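  | # Sketch of reading the dump back (added; file names, dtype, and mode are
  | # assumptions, not taken from the original source):
  | #   import numpy as np, pickle
  | #   lengths, offsets, shape = pickle.load(open('metadata.1.pkl', 'rb'))
  | #   feats = np.memmap('feats.dat.1', dtype='float32', mode='r', shape=shape)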
-------------------------------------------------------------------------------- /nnet_pytorch/utils/prepare_unlabeled_tgt.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | 
11 | 
12 | def main():
13 |     parser = argparse.ArgumentParser()
14 |     parser.add_argument('utt2num_frames',
15 |         help='Kaldi utt2num_frames file (utterance id and frame count per line)',
16 |         type=str
17 |     )
18 |     parser.add_argument('--subsample', type=int, default=1)
19 | 
20 |     args = parser.parse_args()
21 | 
22 |     with open(args.utt2num_frames, 'r') as f:
23 |         for l in f:
24 |             utt, frames = l.strip().split(None, 1)
25 |             print(utt, end='')
26 |             # emit one -1 target per (subsampled) frame to mark the
27 |             # utterance as unlabeled
28 |             num_frames = len(range(0, int(frames), args.subsample))
29 |             print(' -1' * num_frames)
30 | 
31 | if __name__ == "__main__":
32 |     main()
33 | 
34 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/show_decorruption.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import glob
11 | import imageio
12 | from matplotlib import pyplot as plt
13 | import numpy as np
14 | 
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument('idir')
19 |     parser.add_argument('ogif')
20 |     parser.add_argument('name', type=str)
21 |     args = parser.parse_args()
22 | 
23 |     files = glob.glob('{}/{}.*.npy'.format(args.idir, args.name))
24 |     files = sorted(files, key=lambda x: int(x.split('.')[-2]))
25 | 
26 |     images = []
27 |     for f in files:
28 |         fname = os.path.basename(f)
29 |         print(fname)
30 |         out = np.load(f)
31 |         plt.imshow(np.flipud(out.T))
32 |         plt.colorbar()
33 |         plt.savefig(args.idir + "/" + fname + ".png")
34 |         images.append(imageio.imread(args.idir + "/" + fname + ".png"))
35 |         plt.clf()
36 |     imageio.mimsave(args.ogif, images, duration=0.1)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 
42 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/show_sampling.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | # Copyright 2019 Johns Hopkins University (Author: Matthew Wiesner)
4 | # Apache 2.0
5 | 
6 | from __future__ import print_function
7 | import argparse
8 | import sys
9 | import os
10 | import glob
11 | import imageio
12 | from matplotlib import pyplot as plt
13 | import numpy as np
14 | 
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument('idir')
19 |     parser.add_argument('ogif')
20 |     parser.add_argument('index', type=int)
21 |     args = parser.parse_args()
22 | 
23 |     files = glob.glob(args.idir + "/samples*.npy")
24 |     files = sorted(files, key=lambda x: int(x.split('.')[-2]))
25 | 
26 |     images = []
27 |     for f in files:
28 |         fname = os.path.basename(f)
29 |         print(fname)
30 |         out = np.load(f)
31 |         plt.imshow(np.flipud(out[args.index, :, :].T))
32 |         plt.colorbar()
33 |         plt.savefig(args.idir + "/" + fname + ".png")
34 |         images.append(imageio.imread(args.idir + "/" + fname + ".png"))
35 |         plt.clf()
36 |     imageio.mimsave(args.ogif, images, duration=0.1)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 
42 | 
-------------------------------------------------------------------------------- /nnet_pytorch/utils/split_memmap_data.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | . ./path.sh
4 | . ./cmd.sh
5 | 
6 | . ./utils/parse_options.sh
7 | if [ $# -ne 3 ]; then
8 |   echo "Usage: split_memmap_data.sh <datadir> <targets> <num_split>"
9 |   exit 1;
10 | fi
11 | 
12 | datadir=$1
13 | targets=$2
14 | num_split=$3
15 | 
16 | dataname=`basename ${datadir}`
17 | mapped_dir=${datadir}/mapped # don't change this path
18 | mkdir -p $mapped_dir
19 | echo "$0: Splitting data in $num_split parts"
20 | # spread the mapped numpy arrays over various machines, as this data-set is quite large.
21 | if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
22 |   utils/create_split_dir.pl /export/b{11,12,13,14}/$USER/kaldi-data/egs/${dataname}_$(date +'%m_%d_%H_%M')/$mapped_dir/storage \
23 |     $mapped_dir/storage
24 | fi
25 | utils/split_data.sh ${datadir} $num_split
26 | for n in $(seq $num_split); do
27 |   # the next command does nothing unless $mapped_feats_dir/storage/ exists, see
28 |   # utils/create_data_link.pl for more info.
29 |   utils/create_data_link.pl $mapped_dir/feats.dat.$n
30 | done
31 | $train_cmd JOB=1:$num_split exp/make_fbank/${dataname}/memmap_data.JOB.log \
32 |   memmap_data.py --utt-list ${targets} ${datadir}/split${num_split}/JOB/feats.scp $mapped_dir/feats.dat.JOB \
33 |   $mapped_dir/metadata.JOB
34 | echo $num_split > ${datadir}/num_split
35 | 
-------------------------------------------------------------------------------- /tools/Makefile: --------------------------------------------------------------------------------
1 | PYTHON_DIR = `pwd`/NeurIPS2020/bin
2 | CXX ?= g++
3 | 
4 | WGET ?= wget
5 | 
6 | # Note: OpenFst requires a relatively recent C++ compiler with C++11 support,
7 | # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3.
8 | OPENFST_VERSION ?= 1.7.5
9 | 
10 | # Default features configured for OpenFst; can be overridden on the make command line.
11 | OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-ngram-fsts
12 | 
13 | CPPFLAGS ?= -D_GLIBCXX_USE_CXX11_ABI=0
14 | CXXFLAGS ?= -D_GLIBCXX_USE_CXX11_ABI=0
15 | 
16 | all: kaldi pychain
17 | 
18 | kaldi:
19 | 	git clone https://github.com/kaldi-asr/kaldi.git
20 | 	cd kaldi/tools; $(MAKE) all
21 | 	cd kaldi/src; ./configure --shared; $(MAKE) depend; $(MAKE) all
22 | 
23 | venv: requirements.txt
24 | 	test -d NeurIPS2020 || python3 -m venv NeurIPS2020
25 | 	. ./NeurIPS2020/bin/activate; pip install -r requirements.txt
26 | 
27 | clean: openfst_cleaned
28 | 	rm -rf pychain
29 | 
30 | openfst_cleaned:
31 | 	$(MAKE) -C openfst-$(OPENFST_VERSION) clean
32 | 
33 | .PHONY: openfst # so target will be made even though "openfst" exists.
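  | # Typical entry points: `make all` builds Kaldi and pychain; the openfst
  | # targets below fetch and build OpenFst $(OPENFST_VERSION) locally.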
34 | openfst: openfst_compiled openfst-$(OPENFST_VERSION)/lib 35 | -rm -f openfst 36 | -ln -s openfst-$(OPENFST_VERSION) openfst 37 | 38 | .PHONY: openfst_compiled 39 | openfst_compiled: openfst-$(OPENFST_VERSION)/Makefile 40 | $(MAKE) -C openfst-$(OPENFST_VERSION) install MAKEOVERRIDES= 41 | 42 | openfst-$(OPENFST_VERSION)/lib: | openfst-$(OPENFST_VERSION)/Makefile 43 | -cd openfst-$(OPENFST_VERSION) && [ -d lib64 ] && [ ! -d lib ] && ln -s lib64 lib 44 | 45 | # Add the -O flag to CXXFLAGS on cygwin as it can fix the compilation error 46 | # "file too big". 47 | ifeq ($(OSTYPE),cygwin) 48 | # Note: OSTYPE path is probably dead for latest cygwin64 (installed on 2016/11/11). 49 | openfst_add_CXXFLAGS = -O -Wa,-mbig-obj 50 | else ifeq ($(OS),Windows_NT) 51 | # This new OS path is confirmed working on Windows 10 / Cygwin64. 52 | openfst_add_CXXFLAGS = -O -Wa,-mbig-obj 53 | else 54 | openfst_add_CXXFLAGS = 55 | endif 56 | 57 | openfst-$(OPENFST_VERSION)/Makefile: openfst-$(OPENFST_VERSION) 58 | cd openfst-$(OPENFST_VERSION)/ && \ 59 | ./configure --prefix=`pwd` $(OPENFST_CONFIGURE) CXX="$(CXX)" CPPFLAGS="$(CPPFLAGS)" CXXFLAGS="$(CXXFLAGS) $(openfst_add_CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" 60 | 61 | openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz 62 | tar xozf openfst-$(OPENFST_VERSION).tar.gz 63 | 64 | openfst-$(OPENFST_VERSION).tar.gz: 65 | $(WGET) -T 10 -t 1 http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ 66 | $(WGET) -T 10 -t 3 https://www.openslr.org/resources/2/openfst-$(OPENFST_VERSION).tar.gz; 67 | 68 | .PHONY: pychain 69 | pychain: openfst venv 70 | test -d ../nnet_pytorch/objectives/pychain || . ./NeurIPS2020/bin/activate && \ 71 | export OPENFST_PATH=`pwd`/openfst && \ 72 | export LD_LIBRARY_PATH=`pwd`/openfst/lib:$$LD_LIBRARY_PATH && \ 73 | export PATH=$(PYTHON_DIR):$$PATH && \ 74 | cd ../nnet_pytorch/objectives && \ 75 | git clone --single-branch --branch master https://github.com/YiwenShaoStephen/pychain.git && cd pychain && \ 76 | cp ../../../tools/pychain_patch.diff . && \ 77 | git apply pychain_patch.diff && \ 78 | cd openfst_binding && python3 setup.py install && \ 79 | cd ../pytorch_binding && python3 setup.py install; 80 | -------------------------------------------------------------------------------- /tools/pychain_patch.diff: -------------------------------------------------------------------------------- 1 | diff --git a/__init__.py b/__init__.py 2 | new file mode 100644 3 | index 0000000..e69de29 4 | diff --git a/pychain/__init__.py b/pychain/__init__.py 5 | index 890d65b..84a54e6 100644 6 | --- a/pychain/__init__.py 7 | +++ b/pychain/__init__.py 8 | @@ -1,2 +1 @@ 9 | -from .loss import * 10 | from .graph import * 11 | diff --git a/pychain/chain.py b/pychain/chain.py 12 | new file mode 100644 13 | index 0000000..17c0b64 14 | --- /dev/null 15 | +++ b/pychain/chain.py 16 | @@ -0,0 +1,83 @@ 17 | +# Copyright 2019 Yiwen Shao 18 | +# 2020 Yiming Wang 19 | + 20 | +# Licensed under the Apache License, Version 2.0 (the "License"); 21 | +# you may not use this file except in compliance with the License. 22 | +# You may obtain a copy of the License at 23 | + 24 | +# http://www.apache.org/licenses/LICENSE-2.0 25 | + 26 | +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 27 | +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 28 | +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 29 | +# MERCHANTABLITY OR NON-INFRINGEMENT. 
30 | +# See the Apache 2 License for the specific language governing permissions and 31 | +# limitations under the License. 32 | + 33 | +import torch 34 | +import torch.nn as nn 35 | +from .graph import ChainGraphBatch 36 | +import pychain_C 37 | + 38 | + 39 | +class ChainFunction(torch.autograd.Function): 40 | + @staticmethod 41 | + def forward(ctx, input, input_lengths, graphs, leaky_coefficient=1e-5): 42 | + input = input.contiguous().clamp(-30, 30) # clamp for both the denominator and the numerator 43 | + B = input.size(0) 44 | + if B != graphs.batch_size: 45 | + raise ValueError( 46 | + "input batch size ({}) does not equal to graph batch size ({})" 47 | + .format(B, graphs.batch_size) 48 | + ) 49 | + packed_data = torch.nn.utils.rnn.pack_padded_sequence( 50 | + input, input_lengths, batch_first=True, 51 | + ) 52 | + batch_sizes = packed_data.batch_sizes 53 | + input_lengths = input_lengths.cpu() 54 | + if not graphs.log_domain: # usually for the denominator 55 | + exp_input = input.exp() 56 | + objf, input_grad, ok = pychain_C.forward_backward( 57 | + graphs.forward_transitions, 58 | + graphs.forward_transition_indices, 59 | + graphs.forward_transition_probs, 60 | + graphs.backward_transitions, 61 | + graphs.backward_transition_indices, 62 | + graphs.backward_transition_probs, 63 | + graphs.leaky_probs, 64 | + graphs.initial_probs, 65 | + graphs.final_probs, 66 | + graphs.start_state, 67 | + exp_input, 68 | + batch_sizes, 69 | + input_lengths, 70 | + graphs.num_states, 71 | + leaky_coefficient, 72 | + ) 73 | + else: # usually for the numerator 74 | + objf, log_probs_grad, ok = pychain_C.forward_backward_log_domain( 75 | + graphs.forward_transitions, 76 | + graphs.forward_transition_indices, 77 | + graphs.forward_transition_probs, 78 | + graphs.backward_transitions, 79 | + graphs.backward_transition_indices, 80 | + graphs.backward_transition_probs, 81 | + graphs.initial_probs, 82 | + graphs.final_probs, 83 | + graphs.start_state, 84 | + input, 85 | + batch_sizes, 86 | + input_lengths, 87 | + graphs.num_states, 88 | + ) 89 | + input_grad = log_probs_grad.exp() 90 | + 91 | + ctx.save_for_backward(input_grad) 92 | + return objf.sum() 93 | + 94 | + @staticmethod 95 | + def backward(ctx, objf_grad): 96 | + input_grad, = ctx.saved_tensors 97 | + input_grad = torch.mul(input_grad, objf_grad) 98 | + 99 | + return input_grad, None, None, None 100 | diff --git a/pytorch_binding/src/chain-computation.cc b/pytorch_binding/src/chain-computation.cc 101 | index d53a03f..36edf10 100644 102 | --- a/pytorch_binding/src/chain-computation.cc 103 | +++ b/pytorch_binding/src/chain-computation.cc 104 | @@ -226,7 +226,8 @@ torch::Tensor ChainComputation::ComputeTotLogLike() { 105 | // as alpha_frame_log_tot is padded with 0.0, the sum below is fine 106 | tot_log_prob_.copy_(alpha_frame_log_tot.sum(1) + last_frame_alpha_dash_sum.log()); // B 107 | tot_prob_.copy_(tot_log_prob_.exp()); // B 108 | - return tot_log_prob_.sum(); 109 | + //return tot_log_prob_.sum(); 110 | + return tot_log_prob_; 111 | } 112 | 113 | void ChainComputation::BetaDashLastFrame() { 114 | -------------------------------------------------------------------------------- /tools/requirements.txt: -------------------------------------------------------------------------------- 1 | kaldi-io==0.9.4 2 | numpy==1.18.5 3 | torch==1.5.0 4 | --------------------------------------------------------------------------------