├── COMBINE
│   ├── tts1
│   │   ├── cmd.sh
│   │   ├── db.sh
│   │   ├── run.sh
│   │   ├── utils
│   │   ├── path.sh
│   │   ├── multi_tokenize.sh
│   │   ├── combine_cmvn_stats.sh
│   │   ├── combine_train_data.sh
│   │   ├── conf
│   │   │   ├── fbank.yaml
│   │   │   ├── fbank_pitch.yaml
│   │   │   ├── pbs.conf
│   │   │   ├── gpu.conf
│   │   │   ├── queue.conf
│   │   │   └── slurm.conf
│   │   └── local
│   │       └── combine_datasets.py
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── cmd.sh
│       ├── path.sh
│       ├── run.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── combine_train_data.sh
│       ├── multi_tokenize.sh
│       ├── combine_cmvn_stats.sh
│       └── local
│           └── combine_datasets.py
├── TEMPLATE
│   ├── asr1
│   │   ├── utils
│   │   ├── conf
│   │   │   ├── fbank.yaml
│   │   │   ├── fbank_pitch.yaml
│   │   │   ├── pbs.conf
│   │   │   ├── queue.conf
│   │   │   └── slurm.conf
│   │   ├── setup.sh
│   │   ├── path.sh
│   │   ├── db.sh
│   │   └── cmd.sh
│   └── tts1
│       ├── db.sh
│       ├── utils
│       ├── cmd.sh
│       ├── path.sh
│       ├── tts.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       └── setup.sh
├── speech_datasets
│   ├── bin
│   │   ├── __init__.py
│   │   ├── spm_train.py
│   │   ├── combine_cmvn_stats.py
│   │   ├── feat_to_shape.py
│   │   ├── dump.py
│   │   ├── apply_cmvn.py
│   │   └── compute_cmvn_stats.py
│   ├── text
│   │   ├── __init__.py
│   │   └── tokenizers.py
│   ├── transform
│   │   ├── __init__.py
│   │   ├── add_deltas.py
│   │   ├── interface.py
│   │   └── cmvn.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── misc.py
│   │   └── types.py
│   └── __init__.py
├── fisher
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           └── data.sh
├── swbd
│   └── asr1
│       ├── asr.sh
│       ├── cmd.sh
│       ├── db.sh
│       ├── utils
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── local
│       │   ├── MSU_single_letter.txt
│       │   ├── map_acronyms_transcripts.py
│       │   ├── swbd1_map_words.pl
│       │   ├── swbd1_fix_speakerid.pl
│       │   ├── data.sh
│       │   ├── extend_segments.pl
│       │   ├── swbd1_prepare_dict.sh
│       │   ├── rt03_data_prep.sh
│       │   └── format_acronyms_dict.py
│       └── run.sh
├── wsj
│   └── asr1
│       ├── asr.sh
│       ├── cmd.sh
│       ├── db.sh
│       ├── path.sh
│       ├── utils
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── flist2scp.pl
│           ├── wsj_format_data.sh
│           ├── data.sh
│           ├── find_transcripts.pl
│           ├── ndx2flist.pl
│           └── normalize_transcript.pl
├── commonvoice
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── filter_text.py
│           ├── data.sh
│           ├── download_and_untar.sh
│           ├── reduce_data_dir.sh
│           ├── split_tr_dt_et.sh
│           └── data_prep.pl
├── example
│   ├── requirements.txt
│   ├── resources
│   │   ├── global_cmvn_fbank.ark
│   │   ├── librispeech_bpe2000.model
│   │   ├── global_cmvn_fbank_pitch.ark
│   │   ├── fbank.yaml
│   │   └── fbank_pitch.yaml
│   ├── utils.py
│   └── README.md
├── librispeech
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── download_and_untar.sh
│           └── data.sh
├── .gitignore
├── CODEOWNERS
├── SECURITY.md
├── tools
│   ├── install_sph2pipe.sh
│   ├── install_pkgs.sh
│   └── install_anaconda.sh
├── setup.py
├── utils
│   ├── make_absolute.sh
│   ├── spk2utt_to_utt2spk.pl
│   ├── utt2spk_to_spk2utt.pl
│   ├── shuffle_list.pl
│   ├── compute_cmvn_stats.sh
│   ├── remove_dup_utts.sh
│   ├── feat_to_shape.sh
│   ├── subset_data_dir_tr_cv.sh
│   ├── apply_cmvn.sh
│   ├── filter_scp.pl
│   ├── subset_scp.pl
│   ├── apply_map.pl
│   ├── sym2int.pl
│   ├── parse_options.sh
│   ├── combine_data.sh
│   └── dump.sh
├── docker
│   └── Dockerfile
└── Makefile
/COMBINE/tts1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../asr1/cmd.sh -------------------------------------------------------------------------------- /COMBINE/tts1/db.sh: -------------------------------------------------------------------------------- 1 | ../asr1/db.sh -------------------------------------------------------------------------------- /COMBINE/tts1/run.sh: -------------------------------------------------------------------------------- 1 | ../asr1/run.sh -------------------------------------------------------------------------------- /COMBINE/tts1/utils: -------------------------------------------------------------------------------- 1 | ../asr1/utils -------------------------------------------------------------------------------- /TEMPLATE/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../utils -------------------------------------------------------------------------------- /TEMPLATE/tts1/db.sh: -------------------------------------------------------------------------------- 1 | ../asr1/db.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/utils: -------------------------------------------------------------------------------- 1 | ../asr1/utils -------------------------------------------------------------------------------- /speech_datasets/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /COMBINE/tts1/path.sh: -------------------------------------------------------------------------------- 1 | ../asr1/path.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../asr1/cmd.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/path.sh: -------------------------------------------------------------------------------- 1 | ../asr1/path.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/tts.sh: -------------------------------------------------------------------------------- 1 | ../asr1/asr.sh -------------------------------------------------------------------------------- /COMBINE/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /COMBINE/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /fisher/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /fisher/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /swbd/asr1/asr.sh: 
-------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /swbd/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /swbd/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /swbd/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /wsj/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /wsj/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /wsj/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /wsj/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /wsj/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /COMBINE/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /COMBINE/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /COMBINE/asr1/run.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /commonvoice/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /commonvoice/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /example/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.4.0 2 | -------------------------------------------------------------------------------- /fisher/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /fisher/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh 
-------------------------------------------------------------------------------- /fisher/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /librispeech/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /librispeech/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /swbd/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /commonvoice/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /commonvoice/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /commonvoice/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /librispeech/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /librispeech/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /librispeech/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /COMBINE/tts1/multi_tokenize.sh: -------------------------------------------------------------------------------- 1 | ../asr1/multi_tokenize.sh -------------------------------------------------------------------------------- /COMBINE/tts1/combine_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | ../asr1/combine_cmvn_stats.sh -------------------------------------------------------------------------------- /COMBINE/tts1/combine_train_data.sh: -------------------------------------------------------------------------------- 1 | ../asr1/combine_train_data.sh -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .DS_Store 4 | .idea/ 5 | cmake-build-debug/ 6 | -------------------------------------------------------------------------------- /fisher/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /swbd/asr1/conf/fbank.yaml: 
-------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /wsj/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /swbd/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /wsj/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/fbank_pitch.yaml: 
-------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /fisher/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /speech_datasets/text/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub-package""" 2 | from speech_datasets.text.tokenizers import SentencepieceTokenizer 3 | -------------------------------------------------------------------------------- /example/resources/global_cmvn_fbank.ark: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/global_cmvn_fbank.ark -------------------------------------------------------------------------------- /example/resources/librispeech_bpe2000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/librispeech_bpe2000.model -------------------------------------------------------------------------------- /speech_datasets/transform/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize main package.""" 2 | from speech_datasets.transform.transformation import Transformation 3 | -------------------------------------------------------------------------------- /example/resources/global_cmvn_fbank_pitch.ark: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/global_cmvn_fbank_pitch.ark -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing. 
2 | #ECCN:Open Source 3 | -------------------------------------------------------------------------------- /speech_datasets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package & bring general util into this namespace.""" 2 | from speech_datasets.utils.misc import get_root, check_kwargs, dynamic_import, set_deterministic_pytorch 3 | -------------------------------------------------------------------------------- /speech_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize main package.""" 2 | import pkg_resources 3 | from speech_datasets.dataloader import SpeechDataLoader 4 | 5 | try: 6 | __version__ = pkg_resources.get_distribution("speech_datasets").version 7 | except Exception: 8 | __version__ = "(Not installed from setup.py)" 9 | del pkg_resources 10 | -------------------------------------------------------------------------------- /example/resources/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | - type: cmvn 5 | cmvn_type: global 6 | stats: resources/global_cmvn_fbank.ark 7 | norm_vars: True 8 | - type: spec_augment 9 | n_freq_mask: 2 10 | max_freq_width: 27 11 | n_time_mask: 2 12 | max_time_width: 100 13 | max_time_warp: 80 14 | -------------------------------------------------------------------------------- /swbd/asr1/local/MSU_single_letter.txt: -------------------------------------------------------------------------------- 1 | A ey 2 | B b iy 3 | C s iy 4 | D d iy 5 | E iy 6 | F eh f 7 | G jh iy 8 | H ey ch 9 | I ay 10 | J jh ey 11 | K k ey 12 | L eh l 13 | M eh m 14 | N eh n 15 | O ow 16 | P p iy 17 | Q k y uw 18 | R aa r 19 | S eh s 20 | T t iy 21 | U y uw 22 | V v iy 23 | W d ah b ax l y uw 24 | X eh k s 25 | Y w ay 26 | Z z iy 27 | -------------------------------------------------------------------------------- /example/resources/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | - type: cmvn 5 | cmvn_type: global 6 | stats: resources/global_cmvn_fbank_pitch.ark 7 | norm_vars: True 8 | - type: spec_augment 9 | n_freq_mask: 2 10 | max_freq_width: 27 11 | n_time_mask: 2 12 | max_time_width: 100 13 | max_time_warp: 80 14 | -------------------------------------------------------------------------------- /wsj/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | 
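# (added note, not in the original file: in Kaldi-style parallel configs, "default gpu=0" gives every job an implicit gpu option; the bare "option gpu=0" below adds no qsub flags when gpu is 0, and "option gpu=*" handles any other value)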
option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /fisher/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /swbd/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | 
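# Illustrative expansion (added note, assuming Kaldi-style pbs.pl semantics): a job launched as "pbs.pl --mem 4G --gpu 1 exp/log cmd" would be submitted roughly as "qsub -V -v PATH -S /bin/bash -l mem=4G -l ngpus=1" under the rules above.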
-------------------------------------------------------------------------------- /librispeech/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com) 4 | as soon as it is discovered. This library limits its runtime dependencies in 5 | order to reduce the total cost of ownership as much as can be, but all consumers 6 | should remain vigilant and have their security stakeholders review all third-party 7 | products (3PP) like this one and their dependencies. 8 | -------------------------------------------------------------------------------- /speech_datasets/bin/spm_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE 7 | import sys 8 | 9 | import sentencepiece as spm 10 | 11 | 12 | if __name__ == "__main__": 13 | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) 14 | -------------------------------------------------------------------------------- /wsj/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /COMBINE/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /COMBINE/tts1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 
'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /swbd/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /librispeech/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /tools/install_sph2pipe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [ $# != 1 ]; then 5 | echo "Usage: $0 <dir>" 6 | exit 1; 7 | fi 8 | pwd=$PWD 9 | dir=$1 10 | 11 | if [ ! -e sph2pipe_v2.5.tar.gz ]; then 12 | wget --no-check-certificate https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz 13 | fi 14 | 15 | tar xzvf sph2pipe_v2.5.tar.gz -C $dir 16 | rm sph2pipe_v2.5.tar.gz 17 | 18 | cd $dir/sph2pipe_v2.5 19 | gcc -o sph2pipe *.c -lm 20 | cd $pwd 21 | -------------------------------------------------------------------------------- /wsj/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | 
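# (added note: "$0" in these config rules is a placeholder that the Kaldi-style launcher substitutes with the option's value, e.g. name=train yields "-N train"; it is config syntax, not a shell variable)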
option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /fisher/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /swbd/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe 
smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /fisher/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /swbd/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /wsj/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 
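# Illustrative expansion (added note, assuming Kaldi-style slurm.pl semantics): "slurm.pl --mem 8G --gpu 1 exp/log cmd" would map to roughly "sbatch --export=PATH --mem-per-cpu 8G -p gpu --gres=gpu:1" under the rules above.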
15 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 
15 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /fisher/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | srctexts="data/train_fisher/text " 14 | train_sets="train_fisher " 15 | 16 | ./asr.sh \ 17 | --fs 8000 \ 18 | --n_tokens 2000 \ 19 | --token_type bpe \ 20 | --train_sets "${train_sets}" \ 21 | --dev_eval_sets "" \ 22 | --srctexts "${srctexts}" "$@" 23 | -------------------------------------------------------------------------------- /commonvoice/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru 9 | 10 | train_set=valid_train_${lang} 11 | train_dev=valid_dev_${lang} 12 | train_test=valid_test_${lang} 13 | 14 | ./asr.sh \ 15 | --local_data_opts "--lang ${lang}" \ 16 | --fs 16000 \ 17 | --n_tokens 2000 \ 18 | --token_type bpe \ 19 | --feats_type fbank_pitch \ 20 | --train_sets "${train_set}" \ 21 | --dev_eval_sets "${train_dev} ${train_test}" \ 22 | --srctexts "data/${train_set}/text" "$@" 23 | -------------------------------------------------------------------------------- /swbd/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | train_sets="swbd1_train " 14 | dev_set="swbd1_dev" 15 | eval_sets="eval2000 rt03 " 16 | srctexts="data/swbd1_train/text " 17 | 18 | ./asr.sh \ 19 | --fs 8000 \ 20 | --n_tokens 2000 \ 21 | --token_type bpe \ 22 | --train_sets "${train_sets}" \ 23 | --dev_eval_sets "${dev_set} ${eval_sets}" \ 24 | --srctexts "${srctexts}" "$@" 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="speech_datasets", 5 | version="0.1.0", 6 | author="Aadyot Bhatnagar", 7 | author_email="abhatnagar@salesforce.com", 8 | license="Apache-2.0", 9 | packages=find_packages(include=["speech_datasets*"]), 10 | install_requires=[ 11 | "h5py>=2.9.0", 12 | "humanfriendly", 13 | "Kaldiio", 14 | "numpy", 15 | "pillow>=6.1.0", 16 | "PyYAML>=5.1.2", 17 | "ray[tune]", 18 | "resampy", 19 | "scipy", 20 | "sentencepiece<0.1.90,>=0.1.82", 21 | "soundfile>=0.10.2", 22 | "torch>=1.2.0", 23 | "tqdm", 24 | "typeguard>=2.7.0", 25 | ] 26 | ) 27 | -------------------------------------------------------------------------------- /wsj/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'unbound variable', -o ... 'error in pipeline' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | train_set="train_si284 " 9 | dev_set="test_dev93 " 10 | eval_sets="test_eval92 " 11 | 12 | # Even though data/nlsyms.txt is generated, we don't provide it to asr.sh 13 | # because the only non-linguistic symbol it contains is "", which is 14 | # the default value for nlsyms. 15 | ./asr.sh \ 16 | --fs 16000 \ 17 | --n_tokens 75 \ 18 | --token_type bpe \ 19 | --train_sets "${train_set}" \ 20 | --dev_eval_sets "${dev_set} ${eval_sets}" \ 21 | --srctexts "data/train_si284/text data/local/other_text/text" "$@" 22 | -------------------------------------------------------------------------------- /utils/make_absolute.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script replaces the command readlink -f (which is not portable). 4 | # It turns a pathname into an absolute pathname, including following soft links. 5 | target_file=$1 6 | 7 | cd $(dirname $target_file) 8 | target_file=$(basename $target_file) 9 | 10 | # Iterate down a (possible) chain of symlinks 11 | while [ -L "$target_file" ]; do 12 | target_file=$(readlink $target_file) 13 | cd $(dirname $target_file) 14 | target_file=$(basename $target_file) 15 | done 16 | 17 | # Compute the canonicalized name by finding the physical path 18 | # for the directory we're in and appending the target file. 
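# (added example, hypothetical paths: running "utils/make_absolute.sh conf/../data" from /home/user prints "/home/user/data")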
19 | phys_dir=$(pwd -P) 20 | result=$phys_dir/$target_file 21 | echo $result 22 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:latest 2 | RUN apt-get update 3 | RUN apt-get install -y --no-install-recommends \ 4 | apt-utils wget bc gawk vim emacs build-essential locales libfontconfig1 automake \ 5 | sox flac ffmpeg libasound2-dev libsndfile1-dev \ 6 | libfftw3-dev libopenblas-dev libgflags-dev libgoogle-glog-dev gfortran \ 7 | python3 python3-dev python3-pip python3-numpy python3-setuptools 8 | RUN apt update 9 | RUN apt install -y openssh-server openssh-client 10 | 11 | # Default to utf-8 encodings in python 12 | # Can verify in container with: 13 | # python -c 'import locale; print(locale.getpreferredencoding(False))' 14 | RUN locale-gen en_US.UTF-8 15 | ENV LANG en_US.UTF-8 16 | ENV LANGUAGE en_US:en 17 | ENV LC_ALL en_US.UTF-8 18 | -------------------------------------------------------------------------------- /tools/install_pkgs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -euo pipefail 5 | 6 | # This is needed for certain pods (ffmpeg-3 doesn't exist anymore & messes up apt gets) 7 | rm -f /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-3* 8 | apt-get remove libflac8 -y 9 | apt-get update -y 10 | apt-get upgrade -y 11 | apt-get autoremove -y 12 | 13 | # The actual apt installs we need 14 | apt-get install -y apt-utils 15 | apt-get install -y gawk 16 | apt-get install -y build-essential libfontconfig1 automake 17 | apt-get install -y sox flac ffmpeg libasound2-dev libsndfile1-dev 18 | apt-get install -y libfftw3-dev libopenblas-dev libgflags-dev libgoogle-glog-dev 19 | apt-get install -y gfortran python3 20 | apt-get install -y bc 21 | apt-get install -y wget 22 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/filter_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import argparse 7 | import codecs 8 | from io import open 9 | import sys 10 | 11 | 12 | sys.stdin = codecs.getreader("utf-8")(sys.stdin.buffer) 13 | sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer) 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--filter-list", "-f", type=str, help="filter list") 19 | args = parser.parse_args() 20 | 21 | with open(args.filter_list, encoding="utf-8") as f: 22 | fil = [x.rstrip() for x in f] 23 | 24 | for x in sys.stdin: 25 | # extract text parts 26 | text = " ".join(x.rstrip().split()[1:]) 27 | if text in fil: 28 | print(x.split()[0], text) 29 | -------------------------------------------------------------------------------- /fisher/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | . ./path.sh || exit 1 12 | . 
./db.sh || exit 1 13 | 14 | 15 | # Extract & prepare Fisher 16 | for (( i=1; i<=$(echo "${FISHER_TGZ}" | wc -w); i++ )); do 17 | src=$(echo "${FISHER_TGZ}" | cut -d " " -f $i) 18 | dst=$(echo "${FISHER}" | cut -d " " -f $i) 19 | if [ ! -e "${dst}" ]; then 20 | mkdir -p "${dst}" 21 | { 22 | tar xzvf "${src}" -C "${dst}" 23 | } || { 24 | log "Failed to extract FISHER (part $i)" 25 | exit 1 26 | } 27 | fi 28 | done 29 | 30 | # Note: do not quote ${FISHER} -- it should contain 4 directories, and fisher_data_prep.sh needs all 4 31 | log "local/fisher_data_prep.sh ${FISHER}" 32 | local/fisher_data_prep.sh ${FISHER} -------------------------------------------------------------------------------- /utils/spk2utt_to_utt2spk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | while(<>){ 19 | @A = split(" ", $_); 20 | @A > 1 || die "Invalid line in spk2utt file: $_"; 21 | $s = shift @A; 22 | foreach $u ( @A ) { 23 | print "$u $s\n"; 24 | } 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /librispeech/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Get the datasets we want to use based on the command-line args 14 | train_sets="train-clean-100 train-clean-360 train-other-500 " 15 | dev_sets="dev-clean dev-other " 16 | eval_sets="test-clean test-other " 17 | srctexts= 18 | for dset in ${train_sets}; do 19 | srctexts+="data/${dset}/text " 20 | done 21 | 22 | ./asr.sh \ 23 | --fs 16000 \ 24 | --n_tokens 2000 \ 25 | --token_type bpe \ 26 | --train_sets "${train_sets}" \ 27 | --dev_eval_sets "${dev_sets} ${eval_sets}" \ 28 | --srctexts "${srctexts}" \ 29 | --local_data_opts "${eval_sets} ${dev_sets} ${train_sets}" "$@" -------------------------------------------------------------------------------- /example/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import numpy as np 3 | 4 | 5 | def edit_dist(pred: List[int], label: List[int]) -> int: 6 | """Computes the edit distance between a predicted and label sequence.""" 7 | # dists[i, j] = edit_dist(pred[:i], label[:j]) 8 | pred_len, label_len = len(pred), len(label) 9 | dists = np.zeros((pred_len + 1, label_len + 1), dtype=int) 10 | 11 | dists[:, 0] = np.arange(pred_len + 1) 12 | dists[0, :] = np.arange(label_len + 1) 13 | 14 | for i, x in enumerate(pred): 15 | for j, y in enumerate(label): 16 | sub_delta = int(x != y) 17 | ins_delta = 1 18 | del_delta = 1 19 | 20 | substitution = dists[i, j] + sub_delta 21 | insertion = dists[i, j+1] + ins_delta # pred[:i] --> pred[:i+1] 22 | deletion = dists[i+1, j] + del_delta # label[:j] --> label[:j+1] 23 | dists[i+1, j+1] = min(substitution, insertion, deletion) 24 | 25 | return dists[-1, -1].item() 26 | -------------------------------------------------------------------------------- /speech_datasets/transform/add_deltas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from speech_datasets.transform.interface import FuncTrans 4 | 5 | 6 | def delta(feat, window): 7 | assert window > 0 8 | delta_feat = np.zeros_like(feat) 9 | for i in range(1, window + 1): 10 | delta_feat[:-i] += i * feat[i:] 11 | delta_feat[i:] += -i * feat[:-i] 12 | delta_feat[-i:] += i * feat[-1] 13 | delta_feat[:i] += -i * feat[0] 14 | delta_feat /= 2 * sum(i ** 2 for i in range(1, window + 1)) 15 | return delta_feat 16 | 17 | 18 | def add_deltas(x, window=2, order=2): 19 | """ 20 | :param x: Features 21 | :param window: size of the window to use to approximate time derivative computation 22 | :param order: highest order time derivative to compute 23 | :return: Features, concatenated with all the relevant derivatives 24 | """ 25 | feats = [x] 26 | for _ in range(order): 27 | feats.append(delta(feats[-1], window)) 28 | return np.concatenate(feats, axis=1) 29 | 30 | 31 | class AddDeltas(FuncTrans): 32 | _func = add_deltas 33 | __doc__ = add_deltas.__doc__ 34 | -------------------------------------------------------------------------------- /speech_datasets/bin/combine_cmvn_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from speech_datasets.utils.readers import read_cmvn_stats 4 | from speech_datasets.utils.writers import write_cmvn_stats 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser() 9 | 
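# (added comment, not in the original: --cmvn_type selects how the stats are keyed, i.e. one global entry, one entry per speaker, or one per utterance, and should match how the input stats files were computed)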
parser.add_argument("--cmvn_type", choices=["global", "speaker", "utterance"]) 10 | parser.add_argument("--output_file", type=str) 11 | parser.add_argument("cmvn_stats_files", nargs="+") 12 | return parser.parse_args() 13 | 14 | 15 | def combine_cmvn_dicts(stats_dicts): 16 | out_dict = {} 17 | for d in stats_dicts: 18 | for spk, val in d.items(): 19 | if spk not in out_dict: 20 | out_dict[spk] = val 21 | else: 22 | out_dict[spk] += val 23 | return out_dict 24 | 25 | 26 | def main(): 27 | args = parse_args() 28 | out_dict = combine_cmvn_dicts(read_cmvn_stats(path, args.cmvn_type) 29 | for path in args.cmvn_stats_files) 30 | write_cmvn_stats(args.output_file, args.cmvn_type, out_dict) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | help_message=$(cat << EOF 13 | Usage: $0 14 | EOF 15 | ) 16 | 17 | 18 | if [ $# -ne 1 ]; then 19 | log "${help_message}" 20 | log "Error: 1 positional argument is required." 21 | exit 2 22 | fi 23 | 24 | 25 | dir=$1 26 | mkdir -p "${dir}" 27 | 28 | if [ ! -d "${dir}"/../../TEMPLATE ]; then 29 | log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory." 30 | exit 1 31 | fi 32 | 33 | targets="" 34 | 35 | # Copy 36 | for f in conf; do 37 | target="${dir}"/../../TEMPLATE/asr1/"${f}" 38 | cp -r "${target}" "${dir}" 39 | targets+="${dir}/${target} " 40 | done 41 | 42 | 43 | # Symlinks to TEMPLATE & Kaldi 44 | for f in asr.sh cmd.sh path.sh db.sh utils; do 45 | target=../../TEMPLATE/asr1/"${f}" 46 | ln -sf "${target}" "${dir}" 47 | targets+="${dir}/${target} " 48 | done 49 | 50 | 51 | log "Created: ${targets}" 52 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | help_message=$(cat << EOF 13 | Usage: $0 14 | EOF 15 | ) 16 | 17 | 18 | if [ $# -ne 1 ]; then 19 | log "${help_message}" 20 | log "Error: 1 positional argument is required." 21 | exit 2 22 | fi 23 | 24 | 25 | dir=$1 26 | mkdir -p "${dir}" 27 | 28 | if [ ! -d "${dir}"/../../TEMPLATE ]; then 29 | log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory." 
30 | exit 1 31 | fi 32 | 33 | targets="" 34 | 35 | # Copy 36 | for f in conf; do 37 | target="${dir}"/../../TEMPLATE/tts1/"${f}" 38 | cp -r "${target}" "${dir}" 39 | targets+="${dir}/${f} " 40 | done 41 | 42 | 43 | # Symlinks to TEMPLATE & Kaldi 44 | for f in tts.sh cmd.sh path.sh db.sh utils; do 45 | target=../../TEMPLATE/tts1/"${f}" 46 | ln -sf "${target}" "${dir}" 47 | targets+="${dir}/${f} " 48 | done 49 | 50 | 51 | log "Created: ${targets}" 52 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/path.sh: -------------------------------------------------------------------------------- 1 | MAIN_ROOT=$(dirname "$(dirname "${PWD}")") 2 | export LC_ALL=C 3 | 4 | if [ -z "${PS1:-}" ]; then 5 | PS1=__dummy__ 6 | fi 7 | 8 | # Activate local virtual environment for development 9 | error_msg="Virtual environment not set up properly! Navigate to $MAIN_ROOT and run 'make clean all'" 10 | if [ -e $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh ] && [ -e $MAIN_ROOT/tools/conda.done ]; then 11 | VENV_NAME=$(cat "${MAIN_ROOT}/tools/conda.done") 12 | source $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh && conda deactivate 13 | if conda env list | (grep -q -E "${VENV_NAME}\s"); then 14 | conda activate "${VENV_NAME}" 15 | else 16 | echo "${error_msg}" && exit 1 17 | fi 18 | else 19 | echo "${error_msg}" && exit 1 20 | fi 21 | 22 | # Add binary scripts to the path, to allow them to be run easily 23 | export PATH=$MAIN_ROOT/speech_datasets/bin:$PATH 24 | export OMP_NUM_THREADS=1 25 | 26 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C 27 | export PYTHONIOENCODING=UTF-8 28 | 29 | # You need to change or unset NCCL_SOCKET_IFNAME according to your network environment 30 | # https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html#nccl-socket-ifname 31 | export NCCL_SOCKET_IFNAME="^lo,docker,virbr,vmnet,vboxnet" 32 | -------------------------------------------------------------------------------- /wsj/asr1/local/flist2scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # takes in a file list with lines like 19 | # /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 20 | # and outputs an scp in kaldi format with lines like 21 | # 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 22 | # (the first thing is the utterance-id, which is the same as the basename of the file.)
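# Illustrative usage (input comes from stdin or file arguments): local/flist2scp.pl files.flist > wav.scp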
23 | 24 | 25 | while(<>){ 26 | m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_"; 27 | $id = $1; 28 | $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames) 29 | print "$id $_"; 30 | } 31 | 32 | -------------------------------------------------------------------------------- /utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # converts an utt2spk file to a spk2utt file. 18 | # Takes input from the stdin or from a file argument; 19 | # output goes to the standard out. 20 | 21 | if ( @ARGV > 1 ) { 22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; 23 | } 24 | 25 | while(<>){ 26 | @A = split(" ", $_); 27 | @A == 2 || die "Invalid line in utt2spk file: $_"; 28 | ($u,$s) = @A; 29 | if(!$seen_spk{$s}) { 30 | $seen_spk{$s} = 1; 31 | push @spklist, $s; 32 | } 33 | push (@{$spk_hash{$s}}, "$u"); 34 | } 35 | foreach $s (@spklist) { 36 | $l = join(' ',@{$spk_hash{$s}}); 37 | print "$s $l\n"; 38 | } 39 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/db.sh: -------------------------------------------------------------------------------- 1 | # We extract WSJ0_TGZ to WSJ0 and WSJ1_TGZ to WSJ1. Note that the actual data 2 | # is in WSJ0/csr_1_senn and WSJ1/csr_senn 3 | WSJ0_TGZ=/export/data/LDC/csr_1_senn_LDC93S6B.tgz 4 | WSJ1_TGZ=/export/data/LDC/csr_senn_LDC94S13B.tgz 5 | WSJ0=/workspace/LDC93S6B 6 | WSJ1=/workspace/LDC94S13B 7 | 8 | # Extract SWBD1_TGZ to SWBD1 9 | SWBD1_TGZ=/export/data/LDC/swb1_LDC97S62.tgz 10 | SWBD1=/workspace/LDC97S62 11 | 12 | # Filepath i of EVAL2000_TGZ extracts into directory i of EVAL2000. 13 | # First directory must contain the speech data, second directory must contain the transcripts. 
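# (EVAL2000_TGZ and EVAL2000 are whitespace-separated parallel lists; the data scripts index into them with `cut -d " " -f $i`, as in fisher/asr1/local/data.sh.)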
14 | EVAL2000_TGZ="/export/data/LDC/hub5e_00_LDC2002S09.tgz /export/data/LDC/LDC2002T43.tgz" 15 | EVAL2000="/workspace/LDC2002S09/hub5e_00 /workspace/LDC2002T43" 16 | 17 | # Extract RT03_TGZ to RT03 18 | RT03_TGZ=/export/data/LDC/rt_03_LDC2007S10.tgz 19 | RT03=/workspace/LDC2007S10/rt_03 20 | 21 | # filepath i of FISHER_TGZ extracts into directory i of FISHER 22 | # In this case, we extract LDC2004T19 and LDC2005T19 every time, but LDC2004S13 and LDC2005S13 are pre-extracted 23 | FISHER="/workspace/LDC2004T19 /workspace/LDC2005T19 /export/data/LDC/LDC2004S13 /export/data/LDC/LDC2005S13" 24 | FISHER_TGZ="/export/data/LDC/LDC2004T19/fe_03_p1_tran_LDC2004T19.tgz /export/data/LDC/LDC2005T19/LDC2005T19.tgz" 25 | 26 | LIBRISPEECH=/export/data/librispeech 27 | 28 | COMMONVOICE=/export/data/commonvoice 29 | -------------------------------------------------------------------------------- /wsj/asr1/local/wsj_format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 4 | # 2015 Guoguo Chen 5 | # Apache 2.0 6 | 7 | # This script takes data prepared in a corpus-dependent way 8 | # in data/local/, and converts it into the "canonical" form, 9 | # in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, 10 | # data/train_si284, data/train_si84, etc. 11 | 12 | # Don't bother doing train_si84 separately (although we have the file lists 13 | # in data/local/) because it's just the first 7138 utterances in train_si284. 14 | # We'll create train_si84 after doing the feature extraction. 15 | 16 | lang_suffix= 17 | 18 | echo "$0 $@" # Print the command line for logging 19 | . ./path.sh || exit 1; 20 | . utils/parse_options.sh || exit 1; 21 | 22 | echo "Preparing train and test data" 23 | srcdir=data/local/data 24 | 25 | for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do 26 | mkdir -p data/$x 27 | cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; 28 | cp $srcdir/$x.txt data/$x/text || exit 1; 29 | cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; 30 | cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; 31 | utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; 32 | done 33 | 34 | echo "Succeeded in formatting data." 35 | -------------------------------------------------------------------------------- /utils/shuffle_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 
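# Illustrative usage: utils/shuffle_list.pl --srand 777 data/train/wav.scp > wav_shuffled.scp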
17 | 18 | 19 | if ($ARGV[0] eq "--srand") { 20 | $n = $ARGV[1]; 21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; 22 | srand($ARGV[1]); 23 | shift; 24 | shift; 25 | } else { 26 | srand(0); # Gives inconsistent behavior if we don't seed. 27 | } 28 | 29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we 30 | # don't understand. 31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; 32 | print "randomizes the order of lines of input.\n"; 33 | exit(1); 34 | } 35 | 36 | @lines = (); 37 | while (<>) { 38 | push @lines, [ (rand(), $_)] ; 39 | } 40 | 41 | @lines = sort { $a->[0] cmp $b->[0] } @lines; 42 | foreach $l (@lines) { 43 | print $l->[1]; 44 | } 45 | -------------------------------------------------------------------------------- /utils/compute_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Begin configuration section. 5 | nj=4 6 | cmd=utils/run.pl 7 | archive_format=hdf5 8 | cmvn_type=global 9 | spk2utt= 10 | # End configuration section. 11 | 12 | help_message=$(cat << EOF 13 | Usage: $0 [options] <feats-scp> <cmvn-ark> [<logdir>] 14 | e.g.: $0 data/train/feats.scp data/train/cmvn.ark data/train/logs 15 | Options: 16 | --nj <nj> # number of parallel jobs 17 | --cmd (utils/run.pl|utils/queue.pl <opts>) # how to run jobs. 18 | --archive_format <format> # Specify the format of feats file 19 | --cmvn-type <type> # cmvn_type (global or speaker or utterance) 20 | --spk2utt <file> # speaker -> utterance file 21 | EOF 22 | ) 23 | 24 | echo "$0 $*" 1>&2 # Print the command line for logging 25 | . path.sh || exit 1 26 | . utils/parse_options.sh || exit 1; 27 | 28 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then 29 | echo "${help_message}" 1>&2 30 | exit 1; 31 | fi 32 | 33 | scp=$1 34 | cmvnark=$2 35 | data=$(dirname ${scp}) 36 | if [ $# -eq 3 ]; then 37 | logdir=$3 38 | else 39 | logdir=${data}/logs 40 | fi 41 | mkdir -p ${logdir} 42 | 43 | split_scps= 44 | split_cmvn= 45 | for n in $(seq ${nj}); do 46 | split_cmvn+="${logdir}/cmvn.${n}.ark " 47 | split_scps+="${logdir}/feats.${n}.scp " 48 | done 49 | utils/split_scp.pl ${scp} ${split_scps} || exit 1 50 | 51 | 52 | maybe_spk2utt= 53 | if [ -n "${spk2utt}" ] && [ "${cmvn_type}" = speaker ]; then 54 | maybe_spk2utt="--spk2utt ${spk2utt}" 55 | fi 56 | 57 | ${cmd} JOB=1:${nj} ${logdir}/compute_cmvn_stats.JOB.log \ 58 | compute_cmvn_stats.py --filetype ${archive_format} ${maybe_spk2utt} \ 59 | --cmvn-type ${cmvn_type} "scp:${logdir}/feats.JOB.scp" "${logdir}/cmvn.JOB.ark" 60 | 61 | python3 -m speech_datasets.bin.combine_cmvn_stats --cmvn_type ${cmvn_type} \ 62 | --output_file ${cmvnark} ${split_cmvn} || exit 1 63 | 64 | rm -f ${split_scps} ${split_cmvn} 65 | -------------------------------------------------------------------------------- /utils/remove_dup_utts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Remove excess utterances once they appear more than a specified 4 | # number of times with the same transcription, in a data set. 5 | # E.g. useful for removing excess "uh-huh" from training. 6 | 7 | if [ $# != 3 ]; then 8 | echo "Usage: remove_dup_utts.sh max-count <srcdir> <destdir>" 9 | echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup" 10 | echo "This script is used to filter out utterances that have over-represented" 11 | echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of" 12 | echo "any given word-sequence to a specified value.
It's often used to get" 13 | echo "subsets for early stages of training." 14 | exit 1; 15 | fi 16 | 17 | maxcount=$1 18 | srcdir=$2 19 | destdir=$3 20 | mkdir -p $destdir 21 | 22 | [ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1; 23 | 24 | ! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1; 25 | 26 | ! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1; 27 | 28 | cp $srcdir/* $destdir 29 | cat $srcdir/text | \ 30 | perl -e ' 31 | $maxcount = shift @ARGV; 32 | @all = (); 33 | $p1 = 103349; $p2 = 71147; $k = 0; 34 | sub random { # our own random number generator: predictable. 35 | $k = ($k + $p1) % $p2; 36 | return ($k / $p2); 37 | } 38 | while(<>) { 39 | push @all, $_; 40 | @A = split(" ", $_); 41 | shift @A; 42 | $text = join(" ", @A); 43 | $count{$text} ++; 44 | } 45 | foreach $line (@all) { 46 | @A = split(" ", $line); 47 | shift @A; 48 | $text = join(" ", @A); 49 | $n = $count{$text}; 50 | if ($n < $maxcount || random() < ($maxcount / $n)) { 51 | print $line; 52 | } 53 | }' $maxcount >$destdir/text 54 | 55 | echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`" 56 | 57 | echo "Using fix_data_dir.sh to reconcile the other files." 58 | utils/fix_data_dir.sh $destdir 59 | rm -r $destdir/.backup 60 | 61 | exit 0 62 | -------------------------------------------------------------------------------- /swbd/asr1/local/map_acronyms_transcripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2015 Minhua Wu 4 | # Apache 2.0 5 | 6 | # convert acronyms in swbd transcript to fisher convention 7 | # accoring to first two columns in the input acronyms mapping 8 | 9 | import argparse, re 10 | __author__ = 'Minhua Wu' 11 | 12 | parser = argparse.ArgumentParser(description='format acronyms to a._b._c.') 13 | parser.add_argument('-i', '--input', help='Input transcripts', required=True) 14 | parser.add_argument('-o', '--output',help='Output transcripts', required=True) 15 | parser.add_argument('-M', '--Map', help='Input acronyms mapping', required=True) 16 | args = parser.parse_args() 17 | 18 | fin_map = open(args.Map, "r") 19 | dict_acronym = {} 20 | dict_acronym_noi = {} # Mapping of acronyms without I, i 21 | for pair in fin_map: 22 | items = pair.split('\t') 23 | dict_acronym[items[0]] = items[1] 24 | dict_acronym_noi[items[0]] = items[1] 25 | fin_map.close() 26 | del dict_acronym_noi['I'] 27 | del dict_acronym_noi['i'] 28 | 29 | 30 | fin_trans = open(args.input, "r") 31 | fout_trans = open(args.output, "w") 32 | for line in fin_trans: 33 | items = line.split() 34 | L = len(items) 35 | # First pass mapping to map I as part of acronym 36 | for i in range(L): 37 | if items[i] == 'I': 38 | x = 0 39 | while i-1-x >= 0 and re.match(r'^[A-Z]$', items[i-1-x]): 40 | x += 1 41 | 42 | y = 0 43 | while i+1+y < L and re.match(r'^[A-Z]$', items[i+1+y]): 44 | y += 1 45 | 46 | if x+y > 0: 47 | for bias in range(-x, y+1): 48 | items[i+bias] = dict_acronym[items[i+bias]] 49 | 50 | # Second pass mapping (not mapping 'i' and 'I') 51 | for i in range(len(items)): 52 | if items[i] in dict_acronym_noi.keys(): 53 | items[i] = dict_acronym_noi[items[i]] 54 | sentence = ' '.join(items[1:]) 55 | fout_trans.write(items[0] + ' ' + sentence.lower() + '\n') 56 | 57 | fin_trans.close() 58 | fout_trans.close() 59 | -------------------------------------------------------------------------------- /utils/feat_to_shape.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Begin configuration section. 5 | nj=4 6 | cmd=utils/run.pl 7 | verbose=0 8 | archive_format= 9 | preprocess_conf= 10 | # End configuration section. 11 | 12 | help_message=$(cat << EOF 13 | Usage: $0 [options] <input-scp> <output-scp> [<logdir>] 14 | e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/logs 15 | Options: 16 | --nj <nj> # number of parallel jobs 17 | --cmd (utils/run.pl|utils/queue.pl <opts>) # how to run jobs. 18 | --archive_format <format> # Specify the format of feats file 19 | --preprocess-conf <yaml> # Apply preprocessing to feats when creating shape.scp 20 | --verbose <level> # Default: 0 21 | EOF 22 | ) 23 | 24 | echo "$0 $*" 1>&2 # Print the command line for logging 25 | . path.sh || exit 1 26 | . utils/parse_options.sh || exit 1; 27 | 28 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then 29 | echo "${help_message}" 1>&2 30 | exit 1; 31 | fi 32 | 33 | scp=$1 34 | outscp=$2 35 | data=$(dirname ${scp}) 36 | if [ $# -eq 3 ]; then 37 | logdir=$3 38 | else 39 | logdir=${data}/logs 40 | fi 41 | mkdir -p ${logdir} 42 | 43 | nj=$((nj<$(<"${scp}" wc -l)?nj:$(<"${scp}" wc -l))) 44 | split_scps="" 45 | for n in $(seq ${nj}); do 46 | split_scps="${split_scps} ${logdir}/feats.${n}.scp" 47 | done 48 | 49 | utils/split_scp.pl ${scp} ${split_scps} 50 | 51 | if [ -n "${preprocess_conf}" ]; then 52 | preprocess_opt="--preprocess-conf ${preprocess_conf}" 53 | else 54 | preprocess_opt="" 55 | fi 56 | if [ -n "${archive_format}" ]; then 57 | filetype_opt="--filetype ${archive_format}" 58 | else 59 | filetype_opt="" 60 | fi 61 | 62 | ${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ 63 | feat_to_shape.py --verbose ${verbose} ${preprocess_opt} ${filetype_opt} \ 64 | scp:${logdir}/feats.JOB.scp ${logdir}/shape.JOB.scp 65 | 66 | # concatenate the .scp files together. 67 | for n in $(seq ${nj}); do 68 | cat ${logdir}/shape.${n}.scp 69 | done > ${outscp} 70 | 71 | rm -f ${logdir}/feats.*.scp 2>/dev/null 72 | -------------------------------------------------------------------------------- /wsj/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | help_message=$(cat << EOF 12 | Usage: $0 13 | (No options) 14 | EOF 15 | ) 16 | 17 | if [ $# -ne 0 ]; then 18 | log "Error: invalid command line arguments" 19 | log "${help_message}" 20 | exit 1 21 | fi 22 | 23 | . ./path.sh || exit 1 24 | . ./db.sh || exit 1 25 | 26 | other_text=data/local/other_text/text 27 | nlsyms=data/nlsyms.txt 28 | 29 | # Extract WSJ0/WSJ1 raw data if needed 30 | WSJ=("${WSJ0}" "${WSJ1}") 31 | WSJ_TGZ=("${WSJ0_TGZ}" "${WSJ1_TGZ}") 32 | for (( i=0; i<2; i++ )); do 33 | echo ${WSJ[i]} 34 | if [ -z "${WSJ[i]}" ]; then 35 | log "Fill the value of 'WSJ${i}' in db.sh" 36 | exit 1 37 | elif [ ! -d "${WSJ[i]}" ]; then 38 | mkdir -p "${WSJ[i]}" 39 | { 40 | tar xzvf "${WSJ_TGZ[i]}" -C "${WSJ[i]}" 41 | } || { 42 | rm -rf "${WSJ[i]}" 43 | log "Failed to extract WSJ${i}" 44 | exit 1 45 | } 46 | fi 47 | done 48 | 49 | log "local/wsj_data_prep.sh ${WSJ0}/csr_1_senn/??-{?,??}.? ${WSJ1}/csr_senn/??-{?,??}.?" 50 | local/wsj_data_prep.sh "${WSJ0}"/csr_1_senn/??-{?,??}.? "${WSJ1}"/csr_senn/??-{?,??}.?
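# (The ??-{?,??}.? globs expand to the per-disk subdirectories of the WSJ distributions, e.g. 11-1.1 or 13-16.1.)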
51 | log "local/wsj_format_data.sh" 52 | local/wsj_format_data.sh 53 | 54 | log "Create the list of non-linguistic symbols: ${nlsyms}" 55 | cut -f 2- -d" " data/train_si284/text | tr " " "\n" | sort | uniq | grep "<" > ${nlsyms} 56 | cat ${nlsyms} 57 | 58 | log "Prepare text from lng_modl dir: ${WSJ1}/csr_senn/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z -> ${other_text}" 59 | mkdir -p "$(dirname ${other_text})" 60 | 61 | # NOTE(kamo): Give an utterance id to each text & make everything lowercase 62 | # Also remove utterances with non-linguistic symbols, i.e. lines including "<" 63 | zcat ${WSJ1}/csr_senn/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z | \ 64 | grep -v "<" | tr "[:upper:]" "[:lower:]" | \ 65 | awk '{ printf("{wsj}lng_%07d %s\n",NR,$0) } ' > ${other_text} 66 | -------------------------------------------------------------------------------- /COMBINE/asr1/combine_train_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | log() { 4 | local fname=${BASH_SOURCE[1]##*/} 5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 6 | } 7 | 8 | help_message="Usage: $0 [asr.sh options] <dataset1>/<split1> <dataset2>/<split2> <dataset3>/<split3> ..." 9 | 10 | log "$0 $*" 11 | if [ $# -eq 0 ]; then 12 | log "$help_message" 13 | log "Error: at least 1 argument required" 14 | exit 2 15 | fi 16 | 17 | kwargs=() 18 | stage=2 19 | stop_stage=5 20 | while true; do 21 | case "$1" in 22 | --stage) 23 | if [ "$2" -lt 2 ]; then 24 | log "Specify --stage 2 or higher (got --stage $2)." 25 | log "We expect stage 1 to be complete for all datasets given." 26 | exit 2 27 | else 28 | stage=$2 29 | fi 30 | shift 2 31 | ;; 32 | --stop-stage|--stop_stage) 33 | if [ "$2" -gt 5 ]; then 34 | log "Specify --stop-stage 5 or lower (got --stop-stage $2)." 35 | log "Use combine_cmvn_stats.sh to combine CMVN statistics from multiple datasets (stage 5)." 36 | log "Use multi_tokenize.sh to obtain token inventories from multiple datasets (stages 6-7)." 37 | exit 2 38 | else 39 | stop_stage=$2 40 | fi 41 | shift 2 42 | ;; 43 | --*) kwargs+=( "$1" "$2" ); shift 2; ;; 44 | *) break; 45 | esac 46 | done 47 | kwargs+=( --stage "$stage" --stop_stage "$stop_stage" ) 48 | 49 | if [ $# -eq 0 ]; then 50 | log "${help_message}" 51 | log "Error: Please specify dataset splits as positional arguments." 52 | exit 2 53 | fi 54 | 55 | task=$(basename "$(utils/make_absolute.sh "$PWD")") 56 | idx=$(python local/combine_datasets.py --task "${task//1/}" --write_dir true "$@") 57 | datadir="data/${idx}" 58 | for f in wav.scp segments utt2spk text; do 59 | sort "${datadir}/${f}" > "${datadir}/${f}.tmp" 60 | mv "${datadir}/${f}.tmp" "${datadir}/${f}" 61 | done 62 | ./run.sh "${kwargs[@]}" --train_sets "${idx}" 63 | -------------------------------------------------------------------------------- /swbd/asr1/local/swbd1_map_words.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern 4 | # matches case-insensitive --Arnab (Jan 2013) 5 | 6 | if ($ARGV[0] eq "-f") { 7 | shift @ARGV; 8 | $field_spec = shift @ARGV; 9 | if ($field_spec =~ m/^\d+$/) { 10 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 11 | } 12 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 13 | if ($1 ne "") { 14 | $field_begin = $1 - 1; # Change to zero-based indexing.
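# (e.g. "-f 2-" gives $field_begin = 1 and leaves $field_end unset, so every field from the second onward is mapped and field 1, the utterance id, is left alone.)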
15 | } 16 | if ($2 ne "") { 17 | $field_end = $2 - 1; # Change to zero-based indexing. 18 | } 19 | } 20 | if (!defined $field_begin && !defined $field_end) { 21 | die "Bad argument to -f option: $field_spec"; 22 | } 23 | } 24 | 25 | 26 | while (<>) { 27 | @A = split(" ", $_); 28 | for ($n = 0; $n < @A; $n++) { 29 | $a = $A[$n]; 30 | if ( (!defined $field_begin || $n >= $field_begin) 31 | && (!defined $field_end || $n <= $field_end)) { 32 | # e.g. [LAUGHTER-STORY] -> STORY; 33 | $a =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i; 34 | # $1 and $3 relate to preserving trailing "-" 35 | $a =~ s:^\[(.+)/.+\](|\-)$:$1$2:; # e.g. [IT'N/ISN'T] -> IT'N ... note, 36 | # 1st part may include partial-word stuff, which we process further below, 37 | # e.g. [LEM[GUINI]-/LINGUINI] 38 | # the (|\_) at the end is to accept and preserve trailing -'s. 39 | $a =~ s:^(|\-)\[[^][]+\](.+)$:-$2:; # e.g. -[AN]Y , note \047 is quote; 40 | # let the leading - be optional on input, as sometimes omitted. 41 | $a =~ s:^(.+)\[[^][]+\](|\-)$:$1-:; # e.g. AB[SOLUTE]- -> AB-; 42 | # let the trailing - be optional on input, as sometimes omitted. 43 | $a =~ s:([^][]+)\[.+\]$:$1:; # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX- 44 | # which is a mistake in the input. 45 | $a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM 46 | $a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT- 47 | $a =~ s:_\d$::; # e.g. THEM_1 -> THEM 48 | } 49 | $A[$n] = $a; 50 | } 51 | print join(" ", @A) . "\n"; 52 | } 53 | -------------------------------------------------------------------------------- /COMBINE/asr1/multi_tokenize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Tokenization related options from asr.sh 14 | token_type=bpe # Tokenization type (char or bpe). 15 | n_tokens=2000 # The size of the BPE vocabulary. 16 | nlsyms="" # non-linguistic symbols list, separated by a comma 17 | 18 | help_message=$(cat <<EOF 19 | Usage: $0 <dataset1> <dataset2> ... 20 | 21 | Produces a token inventory of the given type for all the datasets provided. 22 | 23 | Options: 24 | --token_type # Tokenization type (char or bpe, default="${token_type}"). 25 | --n_tokens # The maximum number of tokens allowed (default="${n_tokens}"). 26 | --nlsyms # Non-linguistic symbol list for BPE/char, separated by a comma. (default="${nlsyms}"). 27 | EOF 28 | ) 29 | 30 | . ./path.sh || exit 1 31 | . ./cmd.sh || exit 1 32 | 33 | log "$0 $*" 34 | . utils/parse_options.sh || exit 1 35 | if [ $# -eq 0 ]; then 36 | log "${help_message}" 37 | log "Error: Please specify datasets as positional arguments." 38 | exit 2 39 | fi 40 | 41 | workspace=$PWD 42 | task=$(basename "$(utils/make_absolute.sh "$workspace")") 43 | run_args="--token-type ${token_type} --n_tokens ${n_tokens} --nlsyms ${nlsyms} " 44 | 45 | # Compile srctexts from all the relevant datasets 46 | srctexts= 47 | for dset in "$@"; do 48 | log "Concatenating all source texts from dataset $dset..."
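    # Stage 6 of each dataset's run.sh dumps that dataset's training text to ${dset_dir}/dump/srctexts; stage 7 is then run once below on the concatenation of all of them.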
49 | dset_dir="${MAIN_ROOT}/${dset}/${task}" 50 | cd ${dset_dir} 51 | ./run.sh --stage 6 --stop-stage 6 ${run_args} 52 | cd ${workspace} 53 | srctexts+="${dset_dir}/dump/srctexts " 54 | echo "" 55 | done 56 | 57 | # Concatenate all the relevant text data & prepare a token inventory 58 | log "Concatenating all source texts from all datasets..." 59 | mkdir -p dump data 60 | cat $srctexts > dump/srctexts 61 | ./run.sh --stage 7 --stop-stage 7 ${run_args} 62 | 63 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 14 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 15 | 16 | . ./path.sh || exit 1; 17 | . ./cmd.sh || exit 1; 18 | . ./db.sh || exit 1; 19 | 20 | # general configuration 21 | SECONDS=0 22 | lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru 23 | # base url for downloads. 24 | data_url=https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/$lang.tar.gz 25 | 26 | train_set=valid_train_${lang} 27 | train_dev=valid_dev_${lang} 28 | test_set=valid_test_${lang} 29 | 30 | # Ensure that COMMONVOICE data has already been extracted 31 | if [ -z "${COMMONVOICE}" ]; then 32 | log "Fill the value of 'COMMONVOICE' in db.sh" 33 | exit 1 34 | fi 35 | log "Downloading commonvoice dataset" 36 | mkdir -p "${COMMONVOICE}" 37 | local/download_and_untar.sh "${COMMONVOICE}" "${data_url}" "${lang}.tar.gz" 38 | 39 | log "Preparing data for commonvoice" 40 | ### Task dependent. You have to do the following data preparation yourself, 41 | ### but you can utilize Kaldi recipes in most cases. 42 | for part in "validated"; do 43 | # use underscore-separated names in data directories. 44 | local/data_prep.pl "${COMMONVOICE}" ${part} data/"$(echo "${part}_${lang}" | tr - _)" 45 | done 46 | 47 | # Kaldi Version Split 48 | # utils/subset_data_dir_tr_cv.sh data/validated data/valid_train data/valid_test_dev 49 | # utils/subset_data_dir_tr_cv.sh --cv-spk-percent 50 data/valid_test_dev data/valid_test data/valid_dev 50 | 51 | # ESPNet Version (same as voxforge) 52 | # handles duplicated sentences (does not take speaker splits into account): 53 | # filter sentences whose text also appears in the test & dev sets out of the validated set 54 | local/split_tr_dt_et.sh data/validated_${lang} data/${train_set} data/${train_dev} data/${test_set} 55 | 56 | log "Successfully finished.
[elapsed=${SECONDS}s]" 57 | -------------------------------------------------------------------------------- /speech_datasets/utils/misc.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | from os.path import abspath, dirname 4 | import torch 5 | 6 | 7 | def get_root(): 8 | """This file is ROOT/speech_datasets/utils/misc.py, so return ROOT.""" 9 | return dirname(dirname(dirname(abspath(__file__)))) 10 | 11 | 12 | def check_kwargs(func, kwargs, name=None): 13 | """Check that kwargs are valid for func. 14 | 15 | If kwargs are invalid, raise a TypeError, just as Python does by default. 16 | :param function func: function to be validated 17 | :param dict kwargs: keyword arguments for func 18 | :param str name: name used in TypeError (default is func name) 19 | """ 20 | try: 21 | params = inspect.signature(func).parameters 22 | except ValueError: 23 | return 24 | if name is None: 25 | name = func.__name__ 26 | for k in kwargs.keys(): 27 | if k not in params: 28 | raise TypeError(f"{name}() got an unexpected keyword argument '{k}'") 29 | 30 | 31 | def dynamic_import(import_path, alias=None): 32 | """dynamic import module and class 33 | 34 | :param str import_path: syntax 'module_name:class_name' 35 | e.g., 'speech_datasets.transform.add_deltas:AddDeltas' 36 | :param dict alias: shortcut for registered class 37 | :return: imported class 38 | """ 39 | alias = dict() if alias is None else alias 40 | if import_path not in alias and ":" not in import_path: 41 | raise ValueError( 42 | "import_path should be one of {} or " 43 | 'include ":", e.g. "speech_datasets.transform.add_deltas:AddDeltas" : ' 44 | "{}".format(set(alias), import_path) 45 | ) 46 | if ":" not in import_path: 47 | import_path = alias[import_path] 48 | 49 | module_name, objname = import_path.split(":") 50 | m = importlib.import_module(module_name) 51 | return getattr(m, objname) 52 | 53 | 54 | def set_deterministic_pytorch(seed, cudnn_deterministic=True): 55 | """Ensures pytorch produces deterministic results based on the seed.""" 56 | # See https://github.com/pytorch/pytorch/issues/6351 about cudnn.benchmark 57 | torch.manual_seed(seed) 58 | torch.backends.cudnn.deterministic = cudnn_deterministic 59 | torch.backends.cudnn.benchmark = (not cudnn_deterministic) 60 | -------------------------------------------------------------------------------- /wsj/asr1/local/find_transcripts.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | 19 | # This program takes on its standard input a list of utterance 20 | # id's, one for each line. (e.g. 4k0c030a is an utterance id). 21 | # It takes as its command-line argument a file listing the dot files, and 22 | # extracts from the dot files the transcripts for a given 23 | # dataset (represented by a file list).
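# Illustrative example: "echo 4k0c030a | find_transcripts.pl dot_files.flist" looks up utterance 4k0c030a in the dot file for speaker 4k0c03 and prints "4k0c030a <its transcript>".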
24 | # 25 | 26 | @ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts"; 27 | $dot_flist = shift @ARGV; 28 | 29 | open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n"; 30 | while(<L>){ 31 | chop; 32 | m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_"; 33 | $spk = $1; 34 | $spk2dot{$spk} = $_; 35 | } 36 | 37 | 38 | 39 | while(<STDIN>){ 40 | chop; 41 | $uttid = $_; 42 | $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_"; 43 | $spk = $1; 44 | if($spk ne $curspk) { 45 | %utt2trans = (); # Don't keep all the transcripts in memory... 46 | $curspk = $spk; 47 | $dotfile = $spk2dot{$spk}; 48 | defined $dotfile || die "No dot file for speaker $spk\n"; 49 | open(F, "<$dotfile") || die "Error opening dot file $dotfile\n"; 50 | while(<F>) { 51 | $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n"; 52 | $trans = $1; 53 | $utt = $2; 54 | $utt2trans{$utt} = $trans; 55 | } 56 | } 57 | if(!defined $utt2trans{$uttid}) { 58 | print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n"; 59 | } else { 60 | print "$uttid $utt2trans{$uttid}\n"; 61 | } 62 | } 63 | 64 | 65 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Example Code 2 | This directory provides an example which trains a transformer encoder-decoder model on the 3 | `train-clean-100` and `train-clean-360` splits of LibriSpeech, and evaluates the model's 4 | performance on the `dev-clean` split. 5 | 6 | In order to run this example, you must first prepare an environment and install the `speech_datasets` package, 7 | as detailed [here](../README.md#environment-setup). Next, navigate to [librispeech/asr1](../librispeech/asr1) and 8 | invoke 9 | ```shell script 10 | ./run.sh --stage 1 --stop_stage 4 --feats_type <feats_type> 11 | ``` 12 | This will download, prepare, and extract the relevant features for LibriSpeech, and make the dataset usable with 13 | the `speech_datasets` package. Note that this step will take a long time! 14 | 15 | Next, you should navigate to this directory and activate the conda environment by invoking 16 | ``` 17 | source ../tools/venv/bin/activate && conda deactivate && conda activate <venv_name> 18 | ``` 19 | (where `<venv_name>` is the name of the conda virtual environment, `datasets` by default if you did not specify it 20 | when setting up your environment as described [here](../README.md#environment-setup)). Now, you can run 21 | [`main.py`](main.py). If you dumped `--feats_type raw`, then you can run 22 | ``` 23 | python main.py --feats_type <feats_type> 24 | ``` 25 | If you instead dumped `--feats_type fbank` or `--feats_type fbank_pitch`, you can instead run 26 | ``` 27 | python main.py --feats_type <feats_type> --precomputed_feats 28 | ``` 29 | 30 | The `feats_type` argument to `main.py` will specify whether to use the feature computation configuration 31 | [`fbank.yaml`](resources/fbank.yaml) or [`fbank_pitch.yaml`](resources/fbank_pitch.yaml). 32 | Both compute 80-dimensional filterbank features (optionally pitch as well), apply the appropriate cepstral 33 | mean/variance normalization (using the statistics pre-computed in 34 | [`global_cmvn_fbank.ark`](resources/global_cmvn_fbank.ark) or 35 | [`global_cmvn_fbank_pitch.ark`](resources/global_cmvn_fbank_pitch.ark)), and apply spectral augmentation.
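As a rough sketch of what this preprocessing amounts to (this is an illustration, not the actual `main.py`; it assumes a `Transformation` built from one of these configs can be called directly on a feature matrix, the same way `feat_to_shape.py` passes one as a `transform`):

```python
import numpy as np
from speech_datasets.transform import Transformation

# Build the preprocessing pipeline (CMVN, spectral augmentation, etc.) from the
# same YAML config that main.py selects via its feats_type argument.
preprocess = Transformation("resources/fbank.yaml")

feats = np.random.randn(300, 80).astype(np.float32)  # stand-in for a real utterance
processed = preprocess(feats)
print(processed.shape)
```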
36 | 37 | In this example, the data loader will tokenize the text using the provided sentencepiece model 38 | [`librispeech_bpe2000.model`](resources/librispeech_bpe2000.model). See the `main()` function of 39 | [`main.py`](main.py) for a full example. 40 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 14 | # 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 15 | # 2017 Ewald Enzinger 16 | # Apache 2.0 17 | 18 | # Adapted from egs/mini_librispeech/s5/local/download_and_untar.sh (commit 1cd6d2ac3a935009fdc4184cb8a72ddad98fe7d9) 19 | 20 | remove_archive=false 21 | 22 | if [ "${1:-}" == --remove-archive ]; then 23 | remove_archive=true 24 | shift 25 | fi 26 | 27 | if [ $# -ne 3 ]; then 28 | log "Usage: $0 [--remove-archive] <data-dir> <url> <filename>" 29 | log "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz cv_corpus_v1.tar.gz" 30 | log "With --remove-archive it will remove the archive after successfully un-tarring it." 31 | exit 1 32 | fi 33 | 34 | data=$1 35 | url=$2 36 | filename=$3 37 | filepath="$data/$filename" 38 | workspace=$PWD 39 | 40 | if [ ! -d "$data" ]; then 41 | log "$0: no such directory $data" 42 | exit 1; 43 | fi 44 | 45 | if [ -z "$url" ]; then 46 | log "$0: empty URL." 47 | exit 1; 48 | fi 49 | 50 | if [ -f $data/$filename.complete ]; then 51 | log "$0: data was already successfully extracted, nothing to do." 52 | exit 0; 53 | fi 54 | 55 | if [ ! -f $filepath ]; then 56 | if ! which wget >/dev/null; then 57 | log "$0: wget is not installed." 58 | exit 1; 59 | fi 60 | log "$0: downloading data from $url. This may take some time, please be patient." 61 | 62 | if ! wget --no-check-certificate $url -O $filepath; then 63 | log "$0: error executing wget $url" 64 | rm -f $filepath 65 | exit 1; 66 | fi 67 | fi 68 | 69 | cd $data 70 | if ! tar -xzvf $filename; then 71 | log "$0: error un-tarring archive $filepath" 72 | exit 1; 73 | fi 74 | cd $workspace 75 | 76 | touch $data/$filename.complete 77 | 78 | log "$0: Successfully downloaded and un-tarred $filepath" 79 | 80 | if $remove_archive; then 81 | log "$0: removing $filepath file since --remove-archive option was supplied." 82 | rm $filepath 83 | fi 84 | -------------------------------------------------------------------------------- /utils/subset_data_dir_tr_cv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2017 Brno University of Technology (Author: Karel Vesely); 4 | # Apache 2.0 5 | 6 | # This script splits a 'data' directory into two parts: 7 | # - training set with 90% of speakers 8 | # - held-out set with 10% of speakers (cv) 9 | # (to be used in frame cross-entropy training of 'nnet1' models), 10 | 11 | # The script also accepts a list of held-out set speakers by '--cv-spk-list' 12 | # (with perturbed data, we pass the list of speakers externally). 13 | # The remaining set of speakers is the training set.
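# Illustrative usage: utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 data/train data/train_trn data/train_cv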
14 | 15 | cv_spk_percent=10 16 | cv_spk_list= # To be used with perturbed data, 17 | seed=777 18 | cv_utt_percent= # ignored (compatibility), 19 | . utils/parse_options.sh 20 | 21 | if [ $# != 3 ]; then 22 | echo "Usage: $0 [opts] <src-data-dir> <trn-data-dir> <cv-data-dir>" 23 | echo " --cv-spk-percent N (default 10)" 24 | echo " --cv-spk-list <file> (a pre-defined list with cv speakers)" 25 | exit 1; 26 | fi 27 | 28 | set -euo pipefail 29 | 30 | src_data=$1 31 | trn_data=$2 32 | cv_data=$3 33 | 34 | [ ! -r $src_data/spk2utt ] && echo "Missing '$src_data/spk2utt'. Error!" && exit 1 35 | 36 | tmp=$(mktemp -d /tmp/${USER}_XXXXX) 37 | 38 | if [ -z "$cv_spk_list" ]; then 39 | # Select 'cv_spk_percent' speakers randomly, 40 | cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers 41 | n_spk=$(wc -l <$tmp/speakers) 42 | n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ") 43 | # 44 | head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv 45 | tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn 46 | else 47 | # Use pre-defined list of speakers, 48 | cp $cv_spk_list $tmp/speakers_cv 49 | join -v2 <(sort $cv_spk_list) <(awk '{ print $1; }' <$src_data/spk2utt | sort) >$tmp/speakers_trn 50 | fi 51 | 52 | # Sanity checks, 53 | n_spk=$(wc -l <$src_data/spk2utt) 54 | echo "Speakers, src=$n_spk, trn=$(wc -l <$tmp/speakers_trn), cv=$(wc -l <$tmp/speakers_cv)" 55 | overlap=$(join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | wc -l) 56 | [ $overlap != 0 ] && \ 57 | echo "WARNING, speaker overlap detected!" && \ 58 | join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | head && \ 59 | echo '...' 60 | 61 | # Create new data dirs, 62 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_trn $src_data $trn_data 63 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_cv $src_data $cv_data 64 | 65 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Use shell /bin/bash instead of /bin/sh so the source command can be used 2 | SHELL := /bin/bash 3 | # Use the default conda unless a specific install is specified. If there is 4 | # no conda, we will download a fresh one and use it to set up the virtual env. 5 | CONDA := 6 | VENV_NAME := datasets 7 | # The python version installed in the conda setup 8 | PYTHON_VERSION := 3.7.9 9 | # PyTorch version: 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1 (>= 1.2.0 required) 10 | # 1.5.0 and later do not work with PyKaldi... 11 | TORCH_VERSION := 1.4.0 12 | 13 | ifeq ($(CONDA),) 14 | CONDA := $(shell which conda) 15 | endif 16 | ifeq ($(TORCH_VERSION),) 17 | pytorch := pytorch 18 | else 19 | pytorch := pytorch=$(TORCH_VERSION) 20 | endif 21 | 22 | ifneq ($(shell which nvidia-smi),) # 'nvidia-smi' found 23 | CUDA_VERSION := $(shell nvcc --version | grep "release" | sed -E "s/.*release ([0-9.]*).*/\1/") 24 | CONDA_PYTORCH := $(pytorch) cudatoolkit=$(CUDA_VERSION) -c pytorch 25 | else 26 | CUDA_VERSION := 27 | CONDA_PYTORCH := $(pytorch) cpuonly -c pytorch 28 | endif 29 | # Install CPU version of PyKaldi, so we can run feature extraction on CPU while training on GPU 30 | CONDA_PYKALDI := -c pykaldi pykaldi-cpu 31 | 32 | .PHONY: all clean 33 | 34 | all: conda sph2pipe check_install example 35 | 36 | tools/conda.done: 37 | # Only install PyTorch if the PyTorch version is non-empty 38 | tools/install_anaconda.sh $(PYTHON_VERSION) "$(CONDA)" tools/venv $(VENV_NAME) .
"$(CONDA_PYTORCH)" "$(CONDA_PYKALDI)" 39 | @echo $(VENV_NAME) > tools/conda.done 40 | 41 | conda: tools/conda.done 42 | 43 | tools/sph2pipe.done: 44 | tools/install_sph2pipe.sh tools 45 | touch tools/sph2pipe.done 46 | 47 | sph2pipe: tools/sph2pipe.done 48 | 49 | check_install: conda 50 | ifneq ($(strip $(CUDA_VERSION)),) 51 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && python tools/check_install.py 52 | else 53 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && python tools/check_install.py --no-cuda 54 | endif 55 | 56 | example: conda 57 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && pip install -r example/requirements.txt 58 | 59 | clean: clean_conda 60 | rm -rf tools/*.done 61 | 62 | clean_conda: 63 | rm -rf *.egg-info 64 | rm -rf tools/venv 65 | rm -f tools/miniconda.sh 66 | find . -iname "*.pyc" -delete 67 | -------------------------------------------------------------------------------- /speech_datasets/bin/feat_to_shape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import logging 4 | import sys 5 | 6 | from speech_datasets.transform import Transformation 7 | from speech_datasets.utils.readers import file_reader_helper 8 | from speech_datasets.utils.io_utils import get_commandline_args, strtobool 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def get_parser(): 14 | parser = argparse.ArgumentParser( 15 | description="convert feature to its shape", 16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 17 | ) 18 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") 19 | parser.add_argument("--filetype", type=str, default="hdf5", choices=["mat", "hdf5", "sound"], 20 | help="Specify the file format for the rspecifier.") 21 | parser.add_argument("--preprocess-conf", type=str, default=None, 22 | help="The configuration file for the pre-processing") 23 | parser.add_argument("--mem-mapped", type=strtobool, default=False, 24 | help="Whether to use memory-mapped data loaders (where available)") 25 | parser.add_argument("rspecifier", type=str, 26 | help="Read specifier for feats. e.g. ark:some.ark") 27 | parser.add_argument("out", nargs="?", type=argparse.FileType("w"), default=sys.stdout, 28 | help="The output filename. 
" "If omitted, then output to sys.stdout") 29 | return parser 30 | 31 | 32 | def main(): 33 | parser = get_parser() 34 | args = parser.parse_args() 35 | 36 | # logging info 37 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 38 | if args.verbose > 0: 39 | logging.basicConfig(level=logging.INFO, format=logfmt) 40 | else: 41 | logging.basicConfig(level=logging.WARN, format=logfmt) 42 | logger.info(get_commandline_args()) 43 | 44 | if args.preprocess_conf is not None: 45 | preprocessing = Transformation(args.preprocess_conf) 46 | logger.info("Apply preprocessing: {}".format(preprocessing)) 47 | else: 48 | preprocessing = None 49 | 50 | for utt, shape in file_reader_helper( 51 | args.rspecifier, args.filetype, return_shape=True, transform=preprocessing): 52 | shape_str = ",".join(map(str, shape)) # shape is a tuple of ints 53 | args.out.write("{} {}\n".format(utt, shape_str)) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/reduce_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # koried, 10/29/2012 4 | 5 | # Reduce a data set based on a list of turn-ids 6 | 7 | help_message="usage: $0 srcdir turnlist destdir" 8 | 9 | if [ $1 == "--help" ]; then 10 | echo "${help_message}" 11 | exit 0; 12 | fi 13 | 14 | if [ $# != 3 ]; then 15 | echo "${help_message}" 16 | exit 1; 17 | fi 18 | 19 | srcdir=$1 20 | reclist=$2 21 | destdir=$3 22 | 23 | if [ ! -f ${srcdir}/utt2spk ]; then 24 | echo "$0: no such file $srcdir/utt2spk" 25 | exit 1; 26 | fi 27 | 28 | function do_filtering { 29 | # assumes the utt2spk and spk2utt files already exist. 30 | [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp 31 | [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp 32 | [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text 33 | [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames 34 | [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender 35 | [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp 36 | if [ -f ${srcdir}/segments ]; then 37 | utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments 38 | awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. 39 | # The next line would override the command above for wav.scp, which would be incorrect. 
40 | [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp 41 | [ -f ${srcdir}/reco2file_and_channel ] && \ 42 | utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel 43 | 44 | # Filter the STM file for proper sclite scoring (this will also remove the comment lines) 45 | [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm 46 | rm ${destdir}/reco 47 | fi 48 | srcutts=$(wc -l < ${srcdir}/utt2spk) 49 | destutts=$(wc -l < ${destdir}/utt2spk) 50 | echo "Reduced #utt from $srcutts to $destutts" 51 | } 52 | 53 | mkdir -p ${destdir} 54 | 55 | # filter the utt2spk based on the set of recordings 56 | utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk 57 | 58 | utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt 59 | do_filtering; 60 | -------------------------------------------------------------------------------- /speech_datasets/transform/interface.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | import inspect 3 | 4 | from speech_datasets.utils import check_kwargs 5 | 6 | 7 | class TransformInterface(object): 8 | """Transform Interface""" 9 | 10 | @abstractmethod 11 | def __call__(self, x): 12 | raise NotImplementedError("__call__ method is not implemented") 13 | 14 | @classmethod 15 | def add_arguments(cls, parser): 16 | return parser 17 | 18 | def __repr__(self): 19 | return self.__class__.__name__ + "()" 20 | 21 | 22 | class FuncTrans(TransformInterface): 23 | """Functional Transformation 24 | 25 | WARNING: 26 | Builtin or C/C++ functions may not work properly 27 | because this class heavily depends on the `inspect` module. 28 | 29 | Usage: 30 | 31 | >>> def foo_bar(x, a=1, b=2): 32 | ... '''Foo bar 33 | ... :param x: input 34 | ... :param int a: default 1 35 | ... :param int b: default 2 36 | ... ''' 37 | ... return x + a - b 38 | 39 | 40 | >>> class FooBar(FuncTrans): 41 | ... _func = foo_bar 42 | ... __doc__ = foo_bar.__doc__ 43 | """ 44 | 45 | _func = None 46 | 47 | def __init__(self, **kwargs): 48 | self.kwargs = kwargs 49 | check_kwargs(self.func, kwargs) 50 | 51 | def __call__(self, x): 52 | return self.func(x, **self.kwargs) 53 | 54 | @classmethod 55 | def add_arguments(cls, parser): 56 | fname = cls._func.__name__.replace("_", "-") 57 | group = parser.add_argument_group(fname + " transformation setting") 58 | for k, v in cls.default_params().items(): 59 | # TODO(karita): get help and choices from docstring?
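            # e.g. for AddDeltas (_func add_deltas), the "window" parameter is registered as the command-line flag --add-deltas-window.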
60 | attr = k.replace("_", "-") 61 | group.add_argument(f"--{fname}-{attr}", default=v, type=type(v)) 62 | return parser 63 | 64 | @property 65 | def func(self): 66 | return type(self)._func 67 | 68 | @classmethod 69 | def default_params(cls): 70 | try: 71 | d = dict(inspect.signature(cls._func).parameters) 72 | except ValueError: 73 | d = dict() 74 | return { 75 | k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty 76 | } 77 | 78 | def __repr__(self): 79 | params = self.default_params() 80 | params.update(**self.kwargs) 81 | ret = self.__class__.__name__ + "(" 82 | if len(params) == 0: 83 | return ret + ")" 84 | for k, v in params.items(): 85 | ret += "{}={}, ".format(k, v) 86 | return ret[:-2] + ")" 87 | -------------------------------------------------------------------------------- /utils/apply_cmvn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | log() { 4 | local fname=${BASH_SOURCE[1]##*/} 5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 6 | } 7 | 8 | # Copyright 2017 Nagoya University (Tomoki Hayashi) 9 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 10 | 11 | echo "$0 $*" # Print the command line for logging 12 | 13 | cmd=utils/run.pl 14 | nj=$(nproc) 15 | filetype='hdf5' # mat or hdf5 16 | cmvn_type='global' # global or utterance or speaker 17 | help_message="Usage: $0 <feats-scp> <logdir> <dumpdir>" 18 | 19 | . ./path.sh || exit 1 20 | . utils/parse_options.sh || exit 1 21 | 22 | if [ $# != 3 ]; then 23 | log "${help_message}" 24 | exit 2 25 | fi 26 | 27 | scp=$1 28 | logdir=$2 29 | dumpdir=$(utils/make_absolute.sh $3) 30 | 31 | if [ ${filetype} = mat ]; then 32 | ext=ark 33 | elif [ ${filetype} = hdf5 ]; then 34 | ext=h5 35 | else 36 | log "Received --filetype '${filetype}', but only 'mat' and 'hdf5' are valid" 37 | exit 2 38 | fi 39 | 40 | if [ ${cmvn_type} != global ] && [ ${cmvn_type} != utterance ] && [ ${cmvn_type} != speaker ]; then 41 | log "Received --cmvn_type '${cmvn_type}', but only 'global', 'utterance', and 'speaker' are valid"; exit 2 42 | fi 43 | 44 | srcdir=$(dirname "$scp") 45 | cmvnark=$srcdir/cmvn.ark 46 | maybe_utt2spk= 47 | if [ -f $srcdir/utt2spk ]; then 48 | maybe_utt2spk+="--utt2spk $srcdir/utt2spk " 49 | fi 50 | maybe_spk2utt= 51 | if [ -f $srcdir/spk2utt ]; then 52 | maybe_spk2utt+="--spk2utt $srcdir/spk2utt " 53 | fi 54 | 55 | mkdir -p ${logdir} 56 | mkdir -p ${dumpdir} 57 | 58 | # compute CMVN stats 59 | python -m speech_datasets.bin.compute_cmvn_stats \ 60 | --in-filetype ${filetype} ${maybe_spk2utt} \ 61 | --cmvn-type ${cmvn_type} scp:${scp} ${cmvnark} 62 | 63 | echo $cmvn_type > $srcdir/cmvn_type 64 | 65 | # split scp file 66 | split_scps="" 67 | for n in $(seq ${nj}); do 68 | split_scps="$split_scps $logdir/feats.$n.scp" 69 | done 70 | 71 | utils/split_scp.pl ${scp} ${split_scps} || exit 1; 72 | 73 | # apply CMVN to features & dump them 74 | ${cmd} JOB=1:${nj} ${logdir}/apply_cmvn.JOB.log \ 75 | apply_cmvn.py --norm-vars true --in-filetype ${filetype} --out-filetype ${filetype} \ 76 | --cmvn-type ${cmvn_type} ${maybe_utt2spk} ${cmvnark} scp:${logdir}/feats.JOB.scp \ 77 | ark,scp:${dumpdir}/feats.JOB.${ext},${dumpdir}/feats.JOB.scp \ 78 | || exit 1 79 | 80 | # concatenate scp files 81 | for n in $(seq ${nj}); do 82 | cat ${dumpdir}/feats.${n}.scp || exit 1; 83 | done > ${dumpdir}/feats.scp || exit 1 84 | 85 | # remove temp scps 86 | rm ${dumpdir}/feats.*.scp 2>/dev/null 87 | rm ${logdir}/feats.*.scp 2>/dev/null 88 | log "Succeeded in
89 | -------------------------------------------------------------------------------- /swbd/asr1/local/swbd1_fix_speakerid.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | # Author: Peng Qi (pengqi@cs.stanford.edu) 5 | # This script maps Switchboard speaker IDs to the true physical speakers 6 | # and fixes the utterance IDs accordingly. Expected to be run from one 7 | # directory level above. 8 | 9 | sub trim { 10 | (my $s = $_[0]) =~ s/^\s+|\s+$//g; 11 | return $s; 12 | } 13 | 14 | if ($#ARGV != 1) { 15 | print "Usage: swbd1_fix_speakerid.pl <conv.tab> <data-dir>\n"; 16 | print "E.g.: swbd1_fix_speakerid.pl /datasets/SWBD1Transcripts/tables/conv.tab data/train\n"; exit(1); 17 | } 18 | 19 | $tab_file = $ARGV[0]; 20 | $dir = $ARGV[1]; 21 | 22 | %conv_to_spk = (); 23 | 24 | open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n"; 25 | 26 | while (my $line = <$conv_tab>) { 27 | chomp $line; 28 | 29 | my @fields = split "," , $line; 30 | #$fields[0] = trim($fields[0]); 31 | $fields[2] = trim($fields[2]); 32 | $fields[3] = trim($fields[3]); 33 | $conv_to_spk{'{swbd}0' . $fields[0] . '-A'} = $fields[2]; 34 | $conv_to_spk{'{swbd}0' . $fields[0] . '-B'} = $fields[3]; 35 | } 36 | 37 | close($conv_tab); 38 | 39 | # fix utt2spk 40 | 41 | %missingconv = (); 42 | 43 | open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n"; 44 | open(my $utt2spk_new, '>', $dir . '/utt2spk.new'); 45 | 46 | while (my $line = <$utt2spk>) { 47 | chomp $line; 48 | 49 | my @fields = split " " , $line; 50 | my $convid = substr $fields[0], 0, 9; 51 | 52 | if (exists $conv_to_spk{ $convid }) { 53 | my $spkid = $conv_to_spk{ $convid }; 54 | $spkid = "{swbd}" . $spkid; 55 | my $newuttid = $spkid . '-' . (substr $fields[0], 2); 56 | 57 | print $utt2spk_new "$newuttid $spkid\n"; 58 | } else { 59 | my $convid = substr $convid, 3, 4; 60 | $missingconv{$convid} = 1; 61 | 62 | print $utt2spk_new $fields[0]." ".$fields[1]."\n"; 63 | } 64 | } 65 | 66 | close($utt2spk); 67 | close($utt2spk_new); 68 | 69 | foreach my $conv (keys %missingconv) { 70 | print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n"; 71 | } 72 | 73 | # fix segments and text 74 | 75 | foreach my $file ('segments','text') { 76 | open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n"; 77 | open(my $newfile, '>', "$dir/$file.new"); 78 | 79 | while (my $line = <$oldfile>) { 80 | chomp $line; 81 | 82 | my $convid = substr $line, 0, 9; 83 | if (exists $conv_to_spk{$convid}) { 84 | my $spkid = $conv_to_spk{$convid}; 85 | print $newfile "{swbd}$spkid-" . (substr $line, 2) . "\n"; 86 | } else { 87 | print $newfile "$line\n"; 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /wsj/asr1/local/ndx2flist.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # This program takes as its standard input an .ndx file from the WSJ corpus that looks 19 | # like this: 20 | #;; File: tr_s_wv1.ndx, updated 04/26/94 21 | #;; 22 | #;; Index for WSJ0 SI-short Sennheiser training data 23 | #;; Data is read WSJ sentences, Sennheiser mic. 24 | #;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts 25 | #;; per speaker TI) = 7236 utts 26 | #;; 27 | #11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 28 | #11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 29 | #11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 30 | 31 | #and as command-line arguments it takes the names of the WSJ disk locations, e.g.: 32 | #/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc. 33 | # It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with 34 | # /mnt/matylda2/data/WSJ0/11-1.1). 35 | # It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with 36 | # uppercase rather than lower case filenames. 37 | 38 | foreach $fn (@ARGV) { 39 | $fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n"; 40 | $disk_id=$1; 41 | $disk_id =~ tr/-\./__/; # replace - and . with _ so 11-10.1 becomes 11_10_1 42 | $fn =~ s:/$::; # Remove final slash, just in case it is present. 43 | $disk2fn{$disk_id} = $fn; 44 | } 45 | 46 | while(<STDIN>){ 47 | if(m/^;/){ next; } # Comment. Ignore it. 48 | else { 49 | m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; 50 | $disk=$1; 51 | if(!defined $disk2fn{$disk}) { 52 | die "Disk id $disk not found"; 53 | } 54 | $filename = $2; # as a subdirectory of the distributed disk. 55 | if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) { 56 | # The disk 13-16.1 has been uppercased for some reason, on the 57 | # BUT system. This is a fix specifically for that case. 58 | $filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why? 59 | } 60 | print "$disk2fn{$disk}/$filename\n"; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/split_tr_dt_et.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | log() { 4 | local fname=${BASH_SOURCE[1]##*/} 5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 6 | } 7 | 8 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 9 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 10 | 11 | . ./path.sh 12 | 13 | perdt=10 # percent for dev set 14 | peret=10 # percent for eval set 15 | 16 | . 
utils/parse_options.sh 17 | 18 | if [ $# != 4 ]; then 19 | log "Usage: $0 <src-data-dir> <train-data-dir> <dev-data-dir> <eval-data-dir>"; 20 | exit 1; 21 | fi 22 | 23 | sdata=$1 24 | trdata=$2 25 | dtdata=$3 26 | etdata=$4 27 | 28 | tmpdata=$trdata/tmp 29 | mkdir -p $tmpdata 30 | mkdir -p $dtdata 31 | mkdir -p $etdata 32 | 33 | # make a unique prompts file 34 | # some transcripts have multiple spaces and need tr -s " " to remove them 35 | cut -f 2- -d" " $sdata/text | tr -s " " | sort | uniq > $tmpdata/prompts 36 | num_prompt=$(wc -l $tmpdata/prompts | awk '{print $1}') 37 | 38 | num_dt=$(echo "$num_prompt * $perdt / 100" | bc) 39 | num_et=$(echo "$num_prompt * $peret / 100" | bc) 40 | log "number of dev set prompts: $num_dt" 41 | log "number of eval set prompts: $num_et" 42 | 43 | # dt 44 | utils/shuffle_list.pl $tmpdata/prompts | head -n $num_dt > $tmpdata/dt_prompts 45 | # et 46 | utils/shuffle_list.pl $tmpdata/prompts | head -n $(echo "$num_dt + $num_et" | bc) \ 47 | | tail -n $num_et > $tmpdata/et_prompts 48 | # tr 49 | nrest=$(echo "$num_dt + $num_et + 1" | bc) 50 | utils/shuffle_list.pl $tmpdata/prompts | tail -n +$nrest > $tmpdata/tr_prompts 51 | log "number of train set prompts: $(wc -l $tmpdata/tr_prompts | awk '{print $1}')" 52 | 53 | # it takes a very long time when the number of prompts is large 54 | cat $sdata/text | local/filter_text.py -f $tmpdata/dt_prompts | awk '{print $1}' | sort > $tmpdata/dt.ids 55 | log "finished text extraction for dev set #utt = $(wc -l $tmpdata/dt.ids | awk '{print $1}')" 56 | cat $sdata/text | local/filter_text.py -f $tmpdata/et_prompts | awk '{print $1}' | sort > $tmpdata/et.ids 57 | log "finished text extraction for eval set #utt = $(wc -l $tmpdata/et.ids | awk '{print $1}')" 58 | cat $tmpdata/dt.ids $tmpdata/et.ids | sort > $tmpdata/dtet.ids 59 | cat $sdata/text | awk '{print $1}' | sort > $tmpdata/all.ids 60 | diff $tmpdata/all.ids $tmpdata/dtet.ids | awk '/^</{print $2}' > $tmpdata/tr.ids 61 | log "finished text extraction for train set #utt = $(wc -l $tmpdata/tr.ids | awk '{print $1}')" 62 | 63 | log "dev data: $(local/reduce_data_dir.sh $sdata $tmpdata/dt.ids $dtdata)" 64 | utils/fix_data_dir.sh $dtdata 65 | 66 | log "eval data: $(local/reduce_data_dir.sh $sdata $tmpdata/et.ids $etdata)" 67 | utils/fix_data_dir.sh $etdata 68 | 69 | log "train data: $(local/reduce_data_dir.sh $sdata $tmpdata/tr.ids $trdata)" 70 | utils/fix_data_dir.sh $trdata 71 | -------------------------------------------------------------------------------- /tools/install_anaconda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [ -z "${PS1:-}" ]; then 5 | PS1=__dummy__ 6 | fi 7 | CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 8 | 9 | n_required_args=5 10 | if [ $# -lt $n_required_args ] ; then 11 | echo "Usage: $0 <python-version> <conda-bin-or-empty> <venv-dir> <venv-name> <package-root> [conda install args]*" 12 | exit 1; 13 | fi 14 | PYTHON_VERSION="$1" 15 | CONDA="$2" 16 | VENV_DIR="$3" 17 | VENV_NAME="$4" 18 | PACKAGE_ROOT="$5" 19 | shift $n_required_args 20 | 21 | # Download conda if an installation isn't specified 22 | if [ -z "${CONDA}" ]; then 23 | CONDA="${VENV_DIR}/bin/conda" 24 | if [ ! -f "${CONDA}" ]; then 25 | if [ ! -f "${PACKAGE_ROOT}/tools/miniconda.sh" ]; then 26 | wget --tries=3 "${CONDA_URL}" -O "${PACKAGE_ROOT}/tools/miniconda.sh" 27 | fi 28 | if [ ! -d "${VENV_DIR}" ]; then 29 | bash "${PACKAGE_ROOT}/tools/miniconda.sh" -b -p "${VENV_DIR}" 30 | fi 31 | fi 32 | else 33 | ln -sf "$(${CONDA} info --base)" "${VENV_DIR}" 34 | fi 35 | 36 | # Check if environment already exists 37 | if ${CONDA} env list | (! 
grep -q -E "${VENV_NAME}\s"); then 38 | ${CONDA} create -y -n "${VENV_NAME}" "python=${PYTHON_VERSION}" 39 | else 40 | read -r -p "Environment ${VENV_NAME} already exists. Continue setup anyway? (y/n) " choice 41 | case $choice in 42 | y|Y|yes|Yes ) echo "Continuing to set up environment ${VENV_NAME}." ;; 43 | * ) echo "Either pick a different value for VENV_NAME, or remove the ${CONDA} environment ${VENV_NAME} before re-running this script." && exit 1 ;; 44 | esac 45 | fi 46 | 47 | # Activate conda environment & check Python version 48 | source "${VENV_DIR}/etc/profile.d/conda.sh" && conda deactivate && conda activate "${VENV_NAME}" 49 | INSTALLED_PYTHON_VERSION=$(python -V | grep -Eo "[[:digit:].]*") 50 | if [ "${INSTALLED_PYTHON_VERSION}" != "${PYTHON_VERSION}" ]; then 51 | echo "Environment ${VENV_NAME} is Python ${INSTALLED_PYTHON_VERSION}, but Python ${PYTHON_VERSION} was requested." 52 | read -r -p "Continue setup with Python ${INSTALLED_PYTHON_VERSION} anyway? (y/n) " choice 53 | case $choice in 54 | y|Y|yes|Yes ) echo "Continuing to set up environment ${VENV_NAME}." ;; 55 | * ) echo "Either pick a different value for VENV_NAME, or change PYTHON_VERSION to ${INSTALLED_PYTHON_VERSION} before re-running this script." && exit 1 ;; 56 | esac 57 | fi 58 | 59 | conda update -y -n "${VENV_NAME}" -c defaults conda 60 | 61 | # Install any conda dependencies (specified via command line) 62 | while (( "$#" )); do 63 | echo "" 64 | echo "conda install -y -n ${VENV_NAME} $1" 65 | conda install -y -n "${VENV_NAME}" $1 66 | shift 67 | done 68 | 69 | # Install the speech_datasets package in editable mode 70 | pip install -e "${PACKAGE_ROOT}" 71 | -------------------------------------------------------------------------------- /COMBINE/asr1/combine_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | feats_type=fbank # fbank or fbank_pitch are valid 14 | cmvn_type=global # global or speaker or utterance are valid 15 | 16 | help_message=$(cat << EOF 17 | Usage: $0 <dataset1>/<split1> <dataset2>/<split2> <dataset3>/<split3> ... 18 | 19 | Combines CMVN stats for the specified dataset splits (pre-computed by stage 5 of run.sh for each dataset split specified) 20 | into a single file. 21 | 22 | Options: 23 | --feats_type # Feature type (fbank or fbank_pitch) (default=${feats_type}). 24 | --cmvn_type # Type of CMVN stats to compute (global or speaker or utterance) (default=${cmvn_type}). 25 | EOF 26 | ) 27 | 28 | 29 | . ./path.sh || exit 1 30 | . ./cmd.sh || exit 1 31 | 32 | log "$0 $*" 33 | . utils/parse_options.sh || exit 1 34 | if [ $# -eq 0 ]; then 35 | log "${help_message}" 36 | log "Error: Please specify dataset splits as positional arguments." 37 | exit 2 38 | fi 39 | 40 | workspace=$PWD 41 | task=$(basename "$(utils/make_absolute.sh "$workspace")") 42 | 43 | # Get CMVN's from all the relevant dataset splits 44 | cmvns= 45 | for dset in "$@"; do 46 | base=$(echo ${dset} | sed -E "s/\/.*//g") 47 | split=$(echo ${dset} | sed -E "s/.*\///g") 48 | base_dir="${MAIN_ROOT}/${base}/${task}" 49 | dset_dir="${base_dir}/dump/${feats_type}"/${split} 50 | cmvn="${dset_dir}/${cmvn_type}_cmvn.ark" 51 | 52 | if [ ! 
-d ${base_dir} ]; then 53 | log "${base} is not a valid dataset for task ${task//1/}" 54 | exit 1 55 | elif [ "${base}" = "${dset}" ]; then 56 | log "Expected dataset to be specified as <dataset>/<split>, but got ${dset}" 57 | exit 1 58 | elif [ ! -d ${dset_dir} ]; then 59 | log "Either ${split} is not a valid split for dataset ${base}, or" 60 | log "${base_dir}/run.sh has not yet been run with feats_type=${feats_type}" 61 | exit 1 62 | elif [ ! -f ${cmvn} ]; then 63 | log "${cmvn_type} CMVN statistics have not been computed for feats_type=${feats_type} for data split ${dset}." 64 | log "Please run stage 5 of ${base_dir}/run.sh." 65 | exit 1 66 | fi 67 | cmvns+="${cmvn} " 68 | done 69 | 70 | # Combine CMVN's 71 | combo_idx=$(python3 local/combine_datasets.py --task "${task//1/}" --write_dir false "$@") 72 | dumpdir="dump/${feats_type}/no_short/${combo_idx}" 73 | mkdir -p "${dumpdir}" 74 | python3 -m speech_datasets.bin.combine_cmvn_stats --cmvn_type ${cmvn_type} \ 75 | --output_file "${dumpdir}/${cmvn_type}_cmvn.ark" ${cmvns} 76 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/data_prep.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | # Copyright 2017 Ewald Enzinger 4 | # Apache 2.0 5 | # 6 | # Usage: data_prep.pl /export/data/cv_corpus_v1 cv-valid-train valid-train 7 | 8 | if (@ARGV != 3) { 9 | print STDERR "Usage: $0 <corpus-dir> <dataset-name> <out-dir>\n"; 10 | print STDERR "e.g. $0 /export/data/cv_corpus_v1 cv-valid-train valid-train\n"; 11 | exit(1); 12 | } 13 | 14 | # use ffmpeg for mp3 to wav 15 | if (length(`which ffmpeg`) == 0) { 16 | print "Please install 'ffmpeg' on all worker nodes!\n"; 17 | exit 1; 18 | } 19 | 20 | 21 | ($db_base, $dataset, $out_dir) = @ARGV; 22 | mkdir data unless -d data; 23 | mkdir $out_dir unless -d $out_dir; 24 | 25 | open(CSV, "<", "$db_base/$dataset.tsv") or die "cannot open dataset CSV file"; 26 | open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; 27 | open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; 28 | open(TEXT,">", "$out_dir/text") or die "Could not open the output file $out_dir/text"; 29 | open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; 30 | my $header = <CSV>; 31 | while(<CSV>) { 32 | chomp; 33 | ($spkr, $filepath, $text, $upvotes, $downvotes, $age, $gender, $accent) = split("\t", $_); 34 | # speaker comes from commonvoice --> uttId comes from commonvoice 35 | $spkr = "{commonvoice}$spkr"; 36 | if ("$gender" eq "female") { 37 | $gender = "f"; 38 | } else { 39 | # Use male as default if not provided (no reason, just adopting the same default as in voxforge) 40 | $gender = "m"; 41 | } 42 | $uttId = $filepath; 43 | if (-z "$db_base/clips/$filepath") { 44 | print "null file $filepath\n"; 45 | next; 46 | } 47 | $uttId =~ s/\.mp3//g; 48 | $uttId =~ tr/\//-/; 49 | # speaker information should be a prefix of the utterance ID 50 | $uttId = "$spkr-$uttId"; 51 | 52 | # make sure all text is lowercase 53 | $text =~ tr/A-Z/a-z/; 54 | 55 | # get rid of all punctuation besides apostrophes 56 | $text =~ s/[^\w\s']//g; 57 | $text =~ s/(\s)'/$1/g; 58 | $text =~ s/'(\s)/$1/g; 59 | 60 | if (index($text, "{") != -1 and index($text, "}") != -1) { 61 | next; 62 | } 63 | print TEXT "$uttId"," ","$text","\n"; 64 | print GNDR "$spkr"," ","$gender","\n"; 65 | print WAV "$uttId"," ffmpeg -i $db_base/clips/$filepath -f wav -ar 16000 -ab 16 - |\n"; 66 | print SPKR "$uttId"," 
$spkr","\n"; 67 | } 68 | close(SPKR) || die; 69 | close(TEXT) || die; 70 | close(WAV) || die; 71 | close(GNDR) || die; 72 | close(CSV); 73 | 74 | # Use utt2spk to generate spk2utt 75 | if (system( 76 | "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { 77 | die "Error creating spk2utt file in directory $out_dir"; 78 | } 79 | 80 | # Validate the data directory 81 | system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); 82 | if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-feats $out_dir") != 0) { 83 | die "Error validating directory $out_dir"; 84 | } 85 | -------------------------------------------------------------------------------- /speech_datasets/text/tokenizers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import Iterable, List, Union 4 | 5 | import sentencepiece as spm 6 | from typeguard import check_argument_types 7 | 8 | 9 | class AbsTokenizer(ABC): 10 | @abstractmethod 11 | def text2tokens(self, line: str) -> List[str]: 12 | raise NotImplementedError 13 | 14 | @abstractmethod 15 | def tokens2text(self, tokens: Iterable[str]) -> str: 16 | raise NotImplementedError 17 | 18 | @abstractmethod 19 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]: 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def ids2tokens(self, ids: Iterable[int]) -> List[str]: 24 | raise NotImplementedError 25 | 26 | def text2ids(self, line: str) -> List[int]: 27 | return self.tokens2ids(self.text2tokens(line)) 28 | 29 | def ids2text(self, ids: Iterable[int]) -> str: 30 | return self.tokens2text(self.ids2tokens(ids)) 31 | 32 | @abstractmethod 33 | def __len__(self): 34 | raise NotImplementedError 35 | 36 | 37 | class SentencepieceTokenizer(AbsTokenizer): 38 | def __init__(self, model: Union[Path, str], 39 | token_list: Union[Path, str, Iterable[str]] = None): 40 | assert check_argument_types() 41 | self.model = str(model) 42 | self.sp = spm.SentencePieceProcessor() 43 | self.sp.load(self.model) 44 | 45 | if isinstance(token_list, (Path, str)): 46 | char_list = Path(token_list) 47 | with char_list.open("r", encoding="utf-8") as f: 48 | token_list = [line.rstrip() for line in f] 49 | elif token_list is None: 50 | token_list = [self.sp.IdToPiece(i) 51 | for i in range(self.sp.get_piece_size())] 52 | 53 | self.idx2tok = {i: tok for i, tok in enumerate(token_list)} 54 | self.tok2idx = {tok: i for i, tok in enumerate(token_list)} 55 | 56 | def __repr__(self): 57 | return f'{self.__class__.__name__}(model="{self.model}")' 58 | 59 | def __getstate__(self): 60 | state = self.__dict__.copy() 61 | state["sp"] = None 62 | return state 63 | 64 | def __setstate__(self, state): 65 | self.__dict__ = state 66 | self.sp = spm.SentencePieceProcessor() 67 | self.sp.load(self.model) 68 | 69 | def text2tokens(self, line: str) -> List[str]: 70 | return self.sp.EncodeAsPieces(line) 71 | 72 | def tokens2text(self, tokens: Iterable[str]) -> str: 73 | return self.sp.DecodePieces(list(tokens)) 74 | 75 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]: 76 | return [self.tok2idx.get(tok, self.tok2idx[""]) for tok in tokens] 77 | 78 | def ids2tokens(self, ids: Iterable[int]) -> List[str]: 79 | return [self.idx2tok[idx] for idx in ids] 80 | 81 | def __len__(self): 82 | if self.idx2tok is None: 83 | return self.sp.get_piece_size() 84 | else: 85 | return len(self.idx2tok) 86 | -------------------------------------------------------------------------------- 
/utils/filter_scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2012 Microsoft Corporation 3 | # Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # This script takes a list of utterance-ids or any file whose first field 20 | # of each line is an utterance-id, and filters an scp 21 | # file (or any file whose "n-th" field is an utterance id), printing 22 | # out only those lines whose "n-th" field is in id_list. The index of 23 | # the "n-th" field is 1, by default, but can be changed by using 24 | # the -f switch 25 | 26 | $exclude = 0; 27 | $field = 1; 28 | $shifted = 0; 29 | 30 | do { 31 | $shifted=0; 32 | if ($ARGV[0] eq "--exclude") { 33 | $exclude = 1; 34 | shift @ARGV; 35 | $shifted=1; 36 | } 37 | if ($ARGV[0] eq "-f") { 38 | $field = $ARGV[1]; 39 | shift @ARGV; shift @ARGV; 40 | $shifted=1 41 | } 42 | } while ($shifted); 43 | 44 | if(@ARGV < 1 || @ARGV > 2) { 45 | die "Usage: filter_scp.pl [--exclude] [-f <field>] id_list [in.scp] > out.scp \n" . 46 | "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . 47 | "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . 48 | "only the lines that were *not* in id_list.\n" . 49 | "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . 50 | "If your older scripts (written before Oct 2014) stopped working and you used the\n" . 51 | "-f option, add 1 to the argument.\n" . 52 | "See also: utils/subset_scp.pl.\n"; 53 | } 54 | 55 | 56 | $idlist = shift @ARGV; 57 | open(F, "<$idlist") || die "Could not open id-list file $idlist"; 58 | while(<F>) { 59 | @A = split; 60 | @A>=1 || die "Invalid id-list file line $_"; 61 | $seen{$A[0]} = 1; 62 | } 63 | 64 | if ($field == 1) { # Treat this as special case, since it is common. 65 | while(<>) { 66 | $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; 67 | # $1 is what we filter on. 68 | if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { 69 | print $_; 70 | } 71 | } 72 | } else { 73 | while(<>) { 74 | @A = split; 75 | @A > 0 || die "Invalid scp file line $_"; 76 | @A >= $field || die "Invalid scp file line $_"; 77 | if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { 78 | print $_; 79 | } 80 | } 81 | } 82 | 83 | # tests: 84 | # the following should print "foo 1" 85 | # ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) 86 | # the following should print "bar 2".
87 | # ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) 88 | -------------------------------------------------------------------------------- /utils/subset_scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | # Copyright 2010-2011 Microsoft Corporation 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This program selects a subset of N elements in the scp. 19 | 20 | # By default, it selects them evenly from throughout the scp, in order to avoid 21 | # selecting too many from the same speaker. It prints them on the standard 22 | # output. 23 | # With the option --first, it just selects the N first utterances. 24 | # With the option --last, it just selects the N last utterances. 25 | 26 | # Last modified by JHU & HKUST @2013 27 | 28 | 29 | $quiet = 0; 30 | $first = 0; 31 | $last = 0; 32 | 33 | if (@ARGV > 0 && $ARGV[0] eq "--quiet") { 34 | shift; 35 | $quiet = 1; 36 | } 37 | if (@ARGV > 0 && $ARGV[0] eq "--first") { 38 | shift; 39 | $first = 1; 40 | } 41 | if (@ARGV > 0 && $ARGV[0] eq "--last") { 42 | shift; 43 | $last = 1; 44 | } 45 | 46 | if(@ARGV < 2 ) { 47 | die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . 48 | " --quiet causes it to not die if N > num lines in scp.\n" . 49 | " --first and --last make it equivalent to head or tail.\n" . 50 | "See also: filter_scp.pl\n"; 51 | } 52 | 53 | $N = shift @ARGV; 54 | if($N == 0) { 55 | die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; 56 | } 57 | $inscp = shift @ARGV; 58 | open(I, "<$inscp") || die "Opening input scp file $inscp"; 59 | 60 | @F = (); 61 | while(<I>) { 62 | push @F, $_; 63 | } 64 | $numlines = @F; 65 | if($N > $numlines) { 66 | if ($quiet) { 67 | $N = $numlines; 68 | } else { 69 | die "You requested from subset_scp.pl more elements than available: $N > $numlines"; 70 | } 71 | } 72 | 73 | sub select_n { 74 | my ($start,$end,$num_needed) = @_; 75 | my $diff = $end - $start; 76 | if ($num_needed > $diff) { 77 | die "select_n: code error"; 78 | } 79 | if ($diff == 1 ) { 80 | if ($num_needed > 0) { 81 | print $F[$start]; 82 | } 83 | } else { 84 | my $halfdiff = int($diff/2); 85 | my $halfneeded = int($num_needed/2); 86 | select_n($start, $start+$halfdiff, $halfneeded); 87 | select_n($start+$halfdiff, $end, $num_needed - $halfneeded); 88 | } 89 | } 90 | 91 | if ( ! $first && ! $last) { 92 | if ($N > 0) { 93 | select_n(0, $numlines, $N); 94 | } 95 | } else { 96 | if ($first) { # --first option: same as head. 97 | for ($n = 0; $n < $N; $n++) { 98 | print $F[$n]; 99 | } 100 | } else { # --last option: same as tail.
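# e.g. (illustrative, not in the original source): with --last and N=2 on a
# 5-line scp, the loop below starts at $n = 3 and prints $F[3] and $F[4],
# i.e. the last two lines.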
101 | for ($n = @F - $N; $n < @F; $n++) { 102 | print $F[$n]; 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /utils/apply_map.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0. 5 | 6 | # This program is a bit like ./sym2int.pl in that it applies a map 7 | # to things in a file, but it's a bit more general in that it doesn't 8 | # assume the things being mapped to are single tokens, they could 9 | # be sequences of tokens. See the usage message. 10 | 11 | 12 | $permissive = 0; 13 | 14 | for ($x = 0; $x <= 2; $x++) { 15 | 16 | if (@ARGV > 0 && $ARGV[0] eq "-f") { 17 | shift @ARGV; 18 | $field_spec = shift @ARGV; 19 | if ($field_spec =~ m/^\d+$/) { 20 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 21 | } 22 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 23 | if ($1 ne "") { 24 | $field_begin = $1 - 1; # Change to zero-based indexing. 25 | } 26 | if ($2 ne "") { 27 | $field_end = $2 - 1; # Change to zero-based indexing. 28 | } 29 | } 30 | if (!defined $field_begin && !defined $field_end) { 31 | die "Bad argument to -f option: $field_spec"; 32 | } 33 | } 34 | 35 | if (@ARGV > 0 && $ARGV[0] eq '--permissive') { 36 | shift @ARGV; 37 | # Mapping is optional (missing key is printed to output) 38 | $permissive = 1; 39 | } 40 | } 41 | 42 | if(@ARGV != 1) { 43 | print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n"; 44 | print STDERR <<'EOF'; 45 | Usage: apply_map.pl [options] map <input >output 46 | options: [-f <field-range>] [--permissive] 47 | This applies a map to some specified fields of some input text: 48 | For each line in the map file: the first field is the thing we 49 | map from, and the remaining fields are the sequence we map it to. 50 | The -f (field-range) option says which fields of the input file the 51 | map should apply to. 52 | If the --permissive option is supplied, fields which are not present 53 | in the map will be left as they were. 54 | Applies the map 'map' to all input text, where each line of the map 55 | is interpreted as a map from the first field to the list of the other fields 56 | Note: <field-range> can look like 4-5, or 4-, or 5-, or 1; it means the field 57 | range in the input to apply the map to. 58 | e.g.: echo A B | apply_map.pl a.txt 59 | where a.txt is: 60 | A a1 a2 61 | B b 62 | will produce: 63 | a1 a2 b 64 | EOF 65 | exit(1); 66 | } 67 | 68 | ($map_file) = @ARGV; 69 | open(M, "<$map_file") || die "Error opening map file $map_file: $!"; 70 | 71 | while (<M>) { 72 | @A = split(" ", $_); 73 | @A >= 1 || die "apply_map.pl: empty line."; 74 | $i = shift @A; 75 | $o = join(" ", @A); 76 | $map{$i} = $o; 77 | } 78 | 79 | while(<STDIN>) { 80 | @A = split(" ", $_); 81 | for ($x = 0; $x < @A; $x++) { 82 | if ( (!defined $field_begin || $x >= $field_begin) 83 | && (!defined $field_end || $x <= $field_end)) { 84 | $a = $A[$x]; 85 | if (!defined $map{$a}) { 86 | if (!$permissive) { 87 | die "apply_map.pl: undefined key $a in $map_file\n"; 88 | } else { 89 | print STDERR "apply_map.pl: warning! missing key $a in $map_file\n"; 90 | } 91 | } else { 92 | $A[$x] = $map{$a}; 93 | } 94 | } 95 | } 96 | print join(" ", @A) . 
"\n"; 97 | } 98 | -------------------------------------------------------------------------------- /swbd/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | . ./path.sh || exit 1 12 | . ./db.sh || exit 1 13 | 14 | # Extract switchboard-1 15 | if [ -z "${SWBD1}" ]; then 16 | log "Fill the value of 'SWBD1' in db.sh" 17 | exit 1 18 | elif [ ! -e "${SWBD1}" ]; then 19 | mkdir -p "${SWBD1}" 20 | { 21 | tar xzvf ${SWBD1_TGZ} -C "${SWBD1}" 22 | } || { 23 | log "Failed to extract SWBD1" 24 | exit 1 25 | } 26 | fi 27 | 28 | # Download switchboard-1 transcripts if needed 29 | if [ ! -d "${SWBD1}/swb_ms98_transcriptions" ]; then 30 | echo " *** Downloading trascriptions and dictionary ***" 31 | wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || 32 | wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz 33 | tar xzvf switchboard_word_alignments.tar.gz -C "${SWBD1}" 34 | rm switchboard_word_alignments.tar.gz 35 | else 36 | log "Directory with transcriptions exists, skipping downloading." 37 | fi 38 | 39 | # Prepare the dictionary & the rest of the Switchboard-1 data 40 | log "local/swbd1_prepare_dict.sh ${SWBD1}" 41 | local/swbd1_prepare_dict.sh "${SWBD1}" 42 | log "local/swbd1_data_prep.sh ${SWBD1}" 43 | local/swbd1_data_prep.sh "${SWBD1}" 44 | 45 | # Extract & prepare EVAL-2000 46 | if [ "$(echo "${EVAL2000}" | wc -w)" != 2 ]; then 47 | log "Fill the value of 'EVAL2000' in db.sh (2 items required, hub5e_00 and hub5)" 48 | fi 49 | for (( i=1; i<=2; i++ )); do 50 | src=$(echo "${EVAL2000_TGZ}" | cut -d " " -f $i) 51 | dst=$(echo "${EVAL2000}" | cut -d " " -f $i) 52 | # hub5e is in a sub-directory 53 | if [ $i = 1 ]; then 54 | dst=$(dirname "${dst}") 55 | fi 56 | 57 | if [ ! -e "${dst}" ]; then 58 | mkdir -p "${dst}" 59 | { 60 | tar xzvf "${src}" -C "${dst}" 61 | } || { 62 | log "Failed to extract EVAL2000 (part $i)" 63 | exit 1 64 | } 65 | fi 66 | done 67 | 68 | # Note: do not quote ${EVAL2000} -- it should contains 2 directories, and eval2000_data_prep.sh requires 2 arguments 69 | log "local/eval2000_data_prep.sh ${EVAL2000}" 70 | local/eval2000_data_prep.sh ${EVAL2000} 71 | 72 | # Extract & prepare RT-03 73 | if [ -z "${RT03}" ]; then 74 | log "Fill the value of 'RT03' in db.sh" 75 | exit 1 76 | elif [ ! 
-e "${RT03}" ]; then 77 | RT03_BASE="$(dirname "${RT03}")" 78 | mkdir -p "${RT03_BASE}" 79 | { 80 | tar xzvf "${RT03_TGZ}" -C "${RT03_BASE}" 81 | } || { 82 | log "Failed to extract SWBD1" 83 | exit 1 84 | } 85 | fi 86 | 87 | log "local/rt03_data_prep.sh ${RT03}" 88 | local/rt03_data_prep.sh ${RT03} 89 | 90 | # normalize eval2000 and rt03 texts by 91 | # 1) convert upper to lower 92 | # 2) remove tags (%AH) (%HESITATION) (%UH) 93 | # 3) remove 94 | # 4) remove "(" or ")" 95 | for x in eval2000 rt03; do 96 | cp data/${x}/text data/${x}/text.org 97 | paste -d "" \ 98 | <(cut -f 1 -d" " data/${x}/text.org) \ 99 | <(awk '{$1=""; print tolower($0)}' data/${x}/text.org | perl -pe 's| \(\%.*\)||g' \ 100 | | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") | sed -e 's/\s\+/ /g' > data/${x}/text 101 | rm data/${x}/text.org 102 | done 103 | -------------------------------------------------------------------------------- /librispeech/asr1/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 12 | # Apache 2.0 13 | 14 | remove_archive=false 15 | 16 | if [ "$1" == --remove-archive ]; then 17 | remove_archive=true 18 | shift 19 | fi 20 | 21 | if [ $# -ne 3 ]; then 22 | log "Usage: $0 [--remove-archive] " 23 | log "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean" 24 | log "With --remove-archive it will remove the archive after successfully un-tarring it." 25 | log " can be one of: dev-clean, test-clean, dev-other, test-other," 26 | log " train-clean-100, train-clean-360, train-other-500." 27 | exit 1 28 | fi 29 | 30 | data=$1 31 | url=$2 32 | part=$3 33 | 34 | if [ ! -d "$data" ]; then 35 | log "$0: no such directory $data" 36 | exit 1 37 | fi 38 | 39 | part_ok=false 40 | list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500" 41 | for x in $list; do 42 | if [ "$part" == $x ]; then part_ok=true; fi 43 | done 44 | if ! $part_ok; then 45 | log "$0: expected to be one of $list, but got '$part'" 46 | exit 1 47 | fi 48 | 49 | if [ -z "$url" ]; then 50 | log "$0: empty URL base." 51 | exit 1 52 | fi 53 | 54 | if [ -f $data/LibriSpeech/$part/.complete ]; then 55 | log "$0: data part $part was already successfully extracted, nothing to do." 56 | exit 0 57 | fi 58 | 59 | 60 | # sizes of the archive files in bytes. This is some older versions. 61 | sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128" 62 | # sizes_new is the archive file sizes of the final release. Some of these sizes are of 63 | # things we probably won't download. 64 | sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606" 65 | 66 | if [ -f $data/$part.tar.gz ]; then 67 | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') 68 | size_ok=false 69 | for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done 70 | if ! $size_ok; then 71 | log "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" 72 | log "does not equal the size of one of the archives." 73 | rm $data/$part.tar.gz 74 | else 75 | log "$data/$part.tar.gz exists and appears to be complete." 76 | fi 77 | fi 78 | 79 | if [ ! 
-f $data/$part.tar.gz ]; then 80 | if ! which wget >/dev/null; then 81 | log "$0: wget is not installed." 82 | exit 1 83 | fi 84 | full_url=$url/$part.tar.gz 85 | log "$0: downloading data from $full_url. This may take some time, please be patient." 86 | 87 | if ! wget -P $data --no-check-certificate $full_url; then 88 | log "$0: error executing wget $full_url" 89 | exit 1 90 | fi 91 | fi 92 | 93 | if ! tar -C $data -xvzf $data/$part.tar.gz; then 94 | log "$0: error un-tarring archive $data/$part.tar.gz" 95 | exit 1 96 | fi 97 | 98 | touch $data/LibriSpeech/$part/.complete 99 | 100 | log "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" 101 | 102 | if $remove_archive; then 103 | log "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 104 | rm $data/$part.tar.gz 105 | fi 106 | -------------------------------------------------------------------------------- /wsj/asr1/local/normalize_transcript.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # This takes data from the standard input that's unnormalized transcripts in the format 19 | # 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise] 20 | # 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam] 21 | # and outputs normalized transcripts. 22 | # c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc 23 | 24 | @ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2"; 25 | $noise_word = shift @ARGV; 26 | 27 | while(<STDIN>) { 28 | $_ =~ m:^(\S+) (.+): || die "bad line $_"; 29 | $utt = $1; 30 | $trans = $2; 31 | print "{wsj}$utt"; 32 | foreach $w (split (" ",$trans)) { 33 | $w =~ tr:A-Z:a-z:; # Lowercase everything to match the processing of other datasets. 34 | $w =~ s:\\::g; # Remove backslashes. We don't need the quoting. 35 | $w =~ s:^\%percent:percent:; # Normalization for Nov'93 test transcripts. 36 | $w =~ s:^\.point:point:; # Normalization for Nov'93 test transcripts. 37 | $w =~ s:\*(.*)\*:$1:g; # Mispronounced words are enclosed in asterisks; we don't care 38 | if ($w ne "!exclamation-point") { # ! indicates unusual emphasis; we don't care 39 | $w =~ s:!::g; 40 | } 41 | if ($w ne ":colon") { # : indicates a lengthened sound; we don't care 42 | $w =~ s:\:::g; 43 | } 44 | 45 | # Words we don't want to print 46 | if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the preceding word. Delete. 47 | $w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete. 48 | $w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon. 49 | $w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
50 | $w eq "~" || # This is used to indicate truncation of an utterance. Not a word. 51 | $w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much 52 | # point including this in the transcript. 53 | next; 54 | } elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath]. 55 | print " $noise_word"; 56 | } elsif($w =~ m:^\<([\w\'.]+)\>$:) { 57 | # e.g. replace with and. (the <> means verbal deletion of a word).. but it's pronounced. 58 | print " $1"; 59 | } elsif($w eq "--dash") { 60 | print " -dash"; # This is a common issue; the CMU dictionary has it as -DASH. 61 | } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word 62 | print " $1 -DASH"; 63 | } else { 64 | print " $w"; 65 | } 66 | } 67 | print "\n"; 68 | } 69 | -------------------------------------------------------------------------------- /swbd/asr1/local/extend_segments.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | if (@ARGV != 1 || !($ARGV[0] =~ m/^-?\d+\.?\d*$/ && $ARGV[0] >= 0)) { 5 | print STDERR "Usage: extend_segments.pl time-in-seconds segments.extended \n" . 6 | "e.g. extend_segments.pl 0.25 segments.2\n" . 7 | "This command modifies a segments file, with lines like\n" . 8 | " \n" . 9 | "by extending the beginning and end of each segment by a certain\n" . 10 | "length of time. This script makes sure the output segments do not\n" . 11 | "overlap as a result of this time-extension, and that there are no\n" . 12 | "negative times in the output.\n"; 13 | exit 1; 14 | } 15 | 16 | $extend = $ARGV[0]; 17 | 18 | @all_lines = (); 19 | 20 | while () { 21 | chop; 22 | @A = split(" ", $_); 23 | if (@A != 4) { 24 | die "invalid line in segments file: $_"; 25 | } 26 | $line = @all_lines; # current number of lines. 27 | ($utt_id, $reco_id, $start_time, $end_time) = @A; 28 | 29 | push @all_lines, [ $utt_id, $reco_id, $start_time, $end_time ]; # anonymous array. 30 | if (! defined $lines_for_reco{$reco_id}) { 31 | $lines_for_reco{$reco_id} = [ ]; # push new anonymous array. 32 | } 33 | push @{$lines_for_reco{$reco_id}}, $line; 34 | } 35 | 36 | foreach $reco_id (keys %lines_for_reco) { 37 | $ref = $lines_for_reco{$reco_id}; 38 | @line_numbers = sort { ${$all_lines[$a]}[2] <=> ${$all_lines[$b]}[2] } @$ref; 39 | 40 | 41 | { 42 | # handle start of earliest segment as a special case. 43 | $l0 = $line_numbers[0]; 44 | $tstart = ${$all_lines[$l0]}[2] - $extend; 45 | if ($tstart < 0.0) { $tstart = 0.0; } 46 | ${$all_lines[$l0]}[2] = $tstart; 47 | } 48 | { 49 | # handle end of latest segment as a special case. 50 | $lN = $line_numbers[$#line_numbers]; 51 | $tend = ${$all_lines[$lN]}[3] + $extend; 52 | ${$all_lines[$lN]}[3] = $tend; 53 | } 54 | for ($i = 0; $i < $#line_numbers; $i++) { 55 | $ln = $line_numbers[$i]; 56 | $ln1 = $line_numbers[$i+1]; 57 | $tend = ${$all_lines[$ln]}[3]; # end of earlier segment. 58 | $tstart = ${$all_lines[$ln1]}[2]; # start of later segment. 59 | if ($tend > $tstart) { 60 | $utt1 = ${$all_lines[$ln]}[0]; 61 | $utt2 = ${$all_lines[$ln1]}[0]; 62 | print STDERR "Warning: for utterances $utt1 and $utt2, segments " . 
63 | "already overlap; leaving these times unchanged.\n"; 64 | } else { 65 | $my_extend = $extend; 66 | $max_extend = 0.5 * ($tstart - $tend); 67 | if ($my_extend > $max_extend) { $my_extend = $max_extend; } 68 | $tend += $my_extend; 69 | $tstart -= $my_extend; 70 | ${$all_lines[$ln]}[3] = $tend; 71 | ${$all_lines[$ln1]}[2] = $tstart; 72 | } 73 | } 74 | } 75 | 76 | # leave the numbering of the lines unchanged. 77 | for ($l = 0; $l < @all_lines; $l++) { 78 | $ref = $all_lines[$l]; 79 | ($utt_id, $reco_id, $start_time, $end_time) = @$ref; 80 | printf("%s %s %.2f %.2f\n", $utt_id, $reco_id, $start_time, $end_time); 81 | } 82 | 83 | __END__ 84 | 85 | # testing below. 86 | 87 | # ( echo a1 A 0 1; echo a2 A 3 4; echo b1 B 0 1; echo b2 B 2 3 ) | local/extend_segments.pl 1.0 88 | a1 A 0.00 2.00 89 | a2 A 2.00 5.00 90 | b1 B 0.00 1.50 91 | b2 B 1.50 4.00 92 | # ( echo a1 A 0 2; echo a2 A 1 3 ) | local/extend_segments.pl 1.0 93 | Warning: for utterances a1 and a2, segments already overlap; leaving these times unchanged. 94 | a1 A 0.00 2.00 95 | a2 A 1.00 4.00 96 | # ( echo a1 A 0 2; echo a2 A 5 6; echo a3 A 3 4 ) | local/extend_segments.pl 1.0 97 | a1 A 0.00 2.50 98 | a2 A 4.50 7.00 99 | a3 A 2.50 4.50 100 | -------------------------------------------------------------------------------- /speech_datasets/bin/dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 Salesforce Research (Aadyot Bhatnagar) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import argparse 7 | from distutils.util import strtobool 8 | import logging 9 | 10 | import kaldiio 11 | import tqdm 12 | 13 | from speech_datasets.transform import Transformation 14 | from speech_datasets.utils.io_utils import get_commandline_args, consolidate_utt_info 15 | from speech_datasets.utils.types import str_or_none, humanfriendly_or_none 16 | from speech_datasets.utils.writers import file_writer_helper 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser( 23 | description="read .wav files & ") 24 | parser.add_argument("--feature-config", default=None, type=str_or_none, 25 | help="YAML file for feature extraction (if extracting any features)") 26 | parser.add_argument("--text-file", default=None, 27 | help="file mapping utterance ID to transcript") 28 | parser.add_argument("--utt2spk-file", default=None, 29 | help="file mapping utterance ID to speaker ID") 30 | 31 | parser.add_argument("--archive-format", type=str, default="hdf5", choices=["mat", "hdf5"], 32 | help="Specify the file format for output. \"mat\" is the matrix format in kaldi") 33 | parser.add_argument("--sample-frequency", type=humanfriendly_or_none, default=None, 34 | help="If the sampling rate is specified, resample the input.") 35 | parser.add_argument("--compress", type=strtobool, default=False, help="Save in compressed format") 36 | parser.add_argument("--compression-method", type=int, default=2, 37 | help="Specify the method(if mat) or " "gzip-level(if hdf5)") 38 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") 39 | parser.add_argument("--segments", type=str, 40 | help="segments-file format: each line is either" 41 | " " 42 | "e.g. 
call-861225-A-0050-0065 call-861225-A 5.0 6.5") 43 | parser.add_argument("rspecifier", type=str, help="WAV scp file") 44 | parser.add_argument("wspecifier", type=str, help="Write specifier") 45 | 46 | return parser.parse_args() 47 | 48 | 49 | def main(): 50 | args = parse_args() 51 | 52 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 53 | if args.verbose > 0: 54 | logging.basicConfig(level=logging.INFO, format=logfmt) 55 | else: 56 | logging.basicConfig(level=logging.WARN, format=logfmt) 57 | logger.info(get_commandline_args()) 58 | 59 | utt_text_speaker = consolidate_utt_info( 60 | scp=None, text=args.text_file, utt2spk=args.utt2spk_file) 61 | 62 | with kaldiio.ReadHelper( 63 | args.rspecifier, segments=args.segments 64 | ) as reader, file_writer_helper( 65 | args.wspecifier, 66 | filetype=args.archive_format, 67 | compress=args.compress, 68 | compression_method=args.compression_method, 69 | sample_frequency=args.sample_frequency, 70 | transform=Transformation(args.feature_config) 71 | ) as writer: 72 | for utt_id, (rate, wave) in tqdm.tqdm(reader, miniters=100, maxinterval=30): 73 | utt_dict = {"x": wave, "rate": rate} 74 | utt_dict.update(utt_text_speaker.get(utt_id, {})) 75 | try: 76 | writer[utt_id] = utt_dict 77 | except Exception as e: 78 | logger.warning( 79 | f"Failed to process utterance {utt_id} with exception:\n{str(e)}") 80 | continue 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /utils/sym2int.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | $ignore_oov = 0; 19 | 20 | for($x = 0; $x < 2; $x++) { 21 | if ($ARGV[0] eq "--map-oov") { 22 | shift @ARGV; 23 | $map_oov = shift @ARGV; 24 | if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { 25 | # disallow '-f', the empty string and anything ending in words.txt as the 26 | # OOV symbol because these are likely command-line errors. 27 | die "the --map-oov option requires an argument"; 28 | } 29 | } 30 | if ($ARGV[0] eq "-f") { 31 | shift @ARGV; 32 | $field_spec = shift @ARGV; 33 | if ($field_spec =~ m/^\d+$/) { 34 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 35 | } 36 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 37 | if ($1 ne "") { 38 | $field_begin = $1 - 1; # Change to zero-based indexing. 39 | } 40 | if ($2 ne "") { 41 | $field_end = $2 - 1; # Change to zero-based indexing. 
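# Illustrative note, not in the original source: "-f 2-4" yields
# $field_begin = 1 and $field_end = 3, i.e. zero-based indices for
# fields 2 through 4.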
42 | } 43 | } 44 | if (!defined $field_begin && !defined $field_end) { 45 | die "Bad argument to -f option: $field_spec"; 46 | } 47 | } 48 | } 49 | 50 | $symtab = shift @ARGV; 51 | if (!defined $symtab) { 52 | print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . 53 | "options: [--map-oov <oov-symbol>] [-f <field-range>]\n" . 54 | "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n"; exit(1); 55 | } 56 | open(F, "<$symtab") || die "Error opening symbol table file $symtab"; 57 | while(<F>) { 58 | @A = split(" ", $_); 59 | @A == 2 || die "bad line in symbol table file: $_"; 60 | $sym2int{$A[0]} = $A[1] + 0; 61 | } 62 | 63 | if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up 64 | if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } 65 | $map_oov = $sym2int{$map_oov}; 66 | } 67 | 68 | $num_warning = 0; 69 | $max_warning = 20; 70 | 71 | while (<>) { 72 | @A = split(" ", $_); 73 | @B = (); 74 | for ($n = 0; $n < @A; $n++) { 75 | $a = $A[$n]; 76 | if ( (!defined $field_begin || $n >= $field_begin) 77 | && (!defined $field_end || $n <= $field_end)) { 78 | $i = $sym2int{$a}; 79 | if (!defined ($i)) { 80 | if (defined $map_oov) { 81 | if ($num_warning++ < $max_warning) { 82 | print STDERR "sym2int.pl: replacing $a with $map_oov\n"; 83 | if ($num_warning == $max_warning) { 84 | print STDERR "sym2int.pl: not warning for OOVs any more times\n"; 85 | } 86 | } 87 | $i = $map_oov; 88 | } else { 89 | $pos = $n+1; 90 | die "sym2int.pl: undefined symbol $a (in position $pos)\n"; 91 | } 92 | } 93 | $a = $i; 94 | } 95 | push @B, $a; 96 | } 97 | print join(" ", @B); 98 | print "\n"; 99 | } 100 | if ($num_warning > 0) { 101 | print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; 102 | } 103 | 104 | exit(0); 105 | -------------------------------------------------------------------------------- /COMBINE/asr1/local/combine_datasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | 5 | from speech_datasets.utils import get_root 6 | from speech_datasets.utils.io_utils import get_combo_idx 7 | from speech_datasets.utils.types import str2bool 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--task", type=str, choices=["asr", "tts"]) 13 | parser.add_argument("--write_dir", type=str2bool, default=True) 14 | parser.add_argument("datasets", nargs="+", type=str) 15 | args = parser.parse_args() 16 | 17 | # Ensure that all datasets are specified as <dataset>/<split> 18 | datasets = sorted(set(args.datasets)) 19 | dataset_splits = [d.split("/", maxsplit=1) for d in datasets] 20 | assert all(len(d) == 2 for d in dataset_splits), \ 21 | f"All datasets must be specified as <dataset>/<split>, but got " \ 22 | f"{datasets} instead" 23 | 24 | # Verify that all datasets have been prepared 25 | dataset_dirs = [os.path.join(get_root(), ds[0], f"{args.task}1", "data", ds[1]) 26 | for ds in dataset_splits] 27 | assert all(os.path.isdir(d) for d in dataset_dirs), \ 28 | f"Please make sure that all dataset splits are valid, and that all " \ 29 | f"datasets you wish to combine have already been prepared by stage 1 " \ 30 | f"of {args.task}.sh" 31 | 32 | # Get the index of this dataset combination (add to the registry if needed) 33 | idx = get_combo_idx(datasets, args.task) 34 | data_dir = os.path.join(get_root(), "COMBINE", f"{args.task}1", "data") 35 | if idx < 0: 36 | os.makedirs(data_dir, exist_ok=True) 37 | with open(os.path.join(data_dir, "registry.txt"), "a") as 
f: 38 | f.write(" ".join(datasets) + "\n") 39 | idx = get_combo_idx(datasets, args.task) 40 | 41 | if not args.write_dir: 42 | return idx 43 | 44 | # Create a directory for this dataset combo & prepare it 45 | dirname = os.path.join(data_dir, str(idx)) 46 | os.makedirs(dirname, exist_ok=True) 47 | write_segments = any(os.path.isfile(os.path.join(d, "segments")) 48 | for d in dataset_dirs) 49 | with open(os.path.join(dirname, "wav.scp"), "wb") as wav, \ 50 | open(os.path.join(dirname, "text"), "wb") as text, \ 51 | open(os.path.join(dirname, "utt2spk"), "wb") as utt2spk, \ 52 | open(os.path.join(dirname, "segments"), "w") as segments: 53 | for d in dataset_dirs: 54 | 55 | # wav.scp, text, and utt2spk can just be concatenated on 56 | with open(os.path.join(d, "wav.scp"), "rb") as src_wav: 57 | shutil.copyfileobj(src_wav, wav) 58 | with open(os.path.join(d, "text"), "rb") as src_text: 59 | shutil.copyfileobj(src_text, text) 60 | with open(os.path.join(d, "utt2spk"), "rb") as src_utt2spk: 61 | shutil.copyfileobj(src_utt2spk, utt2spk) 62 | 63 | if write_segments: 64 | # If a segments file exists, we can just concatenate it on 65 | if os.path.isfile(os.path.join(d, "segments")): 66 | with open(os.path.join(d, "segments"), "r") as src_segments: 67 | shutil.copyfileobj(src_segments, segments) 68 | 69 | # Otherwise, we need to use wav.scp to create a dummy segments file; 70 | # line format is <utt-id> <recording-id> <start-time> <end-time> 71 | # (<start-time> = 0, <end-time> = -1 means use the whole recording) 72 | else: 73 | with open(os.path.join(d, "wav.scp"), "r") as src_wav: 74 | for line in src_wav: 75 | utt_id, _ = line.rstrip().split(None, maxsplit=1) 76 | segments.write(f"{utt_id} {utt_id} 0.0 -1.0\n") 77 | 78 | return idx 79 | 80 | 81 | if __name__ == "__main__": 82 | combo_idx = main() 83 | print(combo_idx) 84 | -------------------------------------------------------------------------------- /COMBINE/tts1/local/combine_datasets.py: -------------------------------------------------------------------------------- 1 | ../asr1/local/combine_datasets.py
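A hedged invocation sketch for the script above, mirroring how COMBINE/asr1/combine_cmvn_stats.sh calls it (the dataset/split names are illustrative):
python3 local/combine_datasets.py --task asr --write_dir false swbd/train fisher/train
This prints the registry index assigned to the given combination of dataset splits.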
".join(datasets) + "\n") 39 | idx = get_combo_idx(datasets, args.task) 40 | 41 | if not args.write_dir: 42 | return idx 43 | 44 | # Create a directory for this dataset combo & prepare it 45 | dirname = os.path.join(data_dir, str(idx)) 46 | os.makedirs(dirname, exist_ok=True) 47 | write_segments = any(os.path.isfile(os.path.join(d, "segments")) 48 | for d in dataset_dirs) 49 | with open(os.path.join(dirname, "wav.scp"), "wb") as wav, \ 50 | open(os.path.join(dirname, "text"), "wb") as text, \ 51 | open(os.path.join(dirname, "utt2spk"), "wb") as utt2spk, \ 52 | open(os.path.join(dirname, "segments"), "w") as segments: 53 | for d in dataset_dirs: 54 | 55 | # wav.scp, text, and utt2spk can just be concatenated on 56 | with open(os.path.join(d, "wav.scp"), "rb") as src_wav: 57 | shutil.copyfileobj(src_wav, wav) 58 | with open(os.path.join(d, "text"), "rb") as src_text: 59 | shutil.copyfileobj(src_text, text) 60 | with open(os.path.join(d, "utt2spk"), "rb") as src_utt2spk: 61 | shutil.copyfileobj(src_utt2spk, utt2spk) 62 | 63 | if write_segments: 64 | # If a segments file exists, we can just concatenate it on 65 | if os.path.isfile(os.path.join(d, "segments")): 66 | with open(os.path.join(d, "segments"), "r") as src_segments: 67 | shutil.copyfileobj(src_segments, segments) 68 | 69 | # Otherwise, we need to use wav.scp to create a dummy segments 70 | # line format is 71 | # = 0, = -1 means use the whole recording 72 | else: 73 | with open(os.path.join(d, "wav.scp"), "r") as src_wav: 74 | for line in src_wav: 75 | utt_id, _ = line.rstrip().split(None, maxsplit=1) 76 | segments.write(f"{utt_id} {utt_id} 0.0 -1.0\n") 77 | 78 | return idx 79 | 80 | 81 | if __name__ == "__main__": 82 | combo_idx = main() 83 | print(combo_idx) 84 | -------------------------------------------------------------------------------- /speech_datasets/bin/apply_cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from distutils.util import strtobool 4 | import logging 5 | 6 | from speech_datasets.transform import Transformation 7 | from speech_datasets.utils.readers import file_reader_helper 8 | from speech_datasets.utils.io_utils import get_commandline_args 9 | from speech_datasets.utils.writers import file_writer_helper 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser( 16 | description="apply mean-variance normalization to files", 17 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 18 | ) 19 | 20 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") 21 | parser.add_argument("--in-filetype", type=str, default="hdf5", choices=["mat", "hdf5"], 22 | help="Specify the file format for the rspecifier. " 23 | '"mat" is the matrix format in kaldi') 24 | parser.add_argument("--out-filetype", type=str, default="hdf5", choices=["mat", "hdf5"], 25 | help="Specify the file format for the wspecifier. 
" 26 | '"mat" is the matrix format in kaldi') 27 | 28 | parser.add_argument("--norm-means", type=strtobool, default=True, 29 | help="Do mean normalization or not.") 30 | parser.add_argument("--norm-vars", type=strtobool, default=False, 31 | help="Do variance normalization or not.") 32 | parser.add_argument("--reverse", type=strtobool, default=False, 33 | help="Do reverse mode or not") 34 | parser.add_argument("--utt2spk", type=str, default=None, 35 | help="A text file of utterance to speaker map.") 36 | parser.add_argument("--compress", type=strtobool, default=False, 37 | help="Save in compressed format") 38 | parser.add_argument("--compression-method", type=int, default=2, 39 | help="Specify the method (if mat) or gzip-level (if hdf5)") 40 | parser.add_argument("--cmvn-type", type=str, choices=["global", "speaker", "utterance"], 41 | help="Type of CMVN to apply (global, per-speaker, or per-utterance)") 42 | parser.add_argument("stats_file", help="File containing CMVN stats.") 43 | parser.add_argument("rspecifier", type=str, help="Read specifier id, e.g. ark:some.ark") 44 | parser.add_argument("wspecifier", type=str, help="Write specifier id, e.g. ark:some.ark") 45 | 46 | args = parser.parse_args() 47 | if args.cmvn_type == "speaker" and args.utt2spk is None: 48 | raise argparse.ArgumentError( 49 | args.cmvn_type, "If cmvn-type is 'speaker', utt2spk must be provided.") 50 | 51 | return args 52 | 53 | 54 | def main(): 55 | args = parse_args() 56 | 57 | # logging info 58 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 59 | if args.verbose > 0: 60 | logging.basicConfig(level=logging.INFO, format=logfmt) 61 | else: 62 | logging.basicConfig(level=logging.WARN, format=logfmt) 63 | logger.info(get_commandline_args()) 64 | 65 | cmvn = Transformation([{"type": "cmvn", 66 | "stats": args.stats_file, 67 | "cmvn_type": args.cmvn_type, 68 | "norm_means": args.norm_means, 69 | "norm_vars": args.norm_vars, 70 | "utt2spk": args.utt2spk, 71 | "reverse": args.reverse}]) 72 | 73 | with file_writer_helper( 74 | args.wspecifier, 75 | filetype=args.out_filetype, 76 | compress=args.compress, 77 | compression_method=args.compression_method, 78 | ) as writer: 79 | for utt, data in file_reader_helper(args.rspecifier, args.in_filetype, 80 | transform=cmvn, return_dict=True): 81 | writer[utt] = data 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /swbd/asr1/local/swbd1_prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | # Formatting the Mississippi State dictionary for use in Edinburgh. Differs 12 | # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013) 13 | 14 | # To be run from one directory above this script. 15 | 16 | . ./path.sh 17 | 18 | #check existing directories 19 | 20 | if [ $# != 1 ]; then 21 | log "Error: invalid command line arguments" 22 | log "Usage: $0 /path/to/SWBD" 23 | exit 1; 24 | fi 25 | SWBD_DIR=$1 26 | 27 | # Get the original transcriptions & their corresponding dictionary 28 | srcdir=data/local/swbd1 29 | mkdir -p $srcdir 30 | if [ ! 
--------------------------------------------------------------------------------
/swbd/asr1/local/swbd1_prepare_dict.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -u
4 | set -o pipefail
5 | 
6 | log() {
7 |     local fname=${BASH_SOURCE[1]##*/}
8 |     echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 | }
10 | 
11 | # Formatting the Mississippi State dictionary for use in Edinburgh. Differs
12 | # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
13 | 
14 | # To be run from one directory above this script.
15 | 
16 | . ./path.sh
17 | 
18 | # check existing directories
19 | 
20 | if [ $# != 1 ]; then
21 |     log "Error: invalid command line arguments"
22 |     log "Usage: $0 /path/to/SWBD"
23 |     exit 1;
24 | fi
25 | SWBD_DIR=$1
26 | 
27 | # Get the original transcriptions & their corresponding dictionary
28 | srcdir=data/local/swbd1
29 | mkdir -p $srcdir
30 | if [ ! -d $srcdir/swb_ms98_transcriptions ]; then
31 |     ln -sf "${SWBD_DIR}/swb_ms98_transcriptions" $srcdir/
32 | fi
33 | srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
34 | 
35 | # assume some basic data prep was already done on the downloaded data.
36 | [ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1;
37 | 
38 | # copy over the initial dictionary as the base lexicon
39 | dir=data/local/dict_nosp
40 | mkdir -p $dir
41 | install -m +rw $srcdict $dir/lexicon0.txt || exit 1;
42 | log "$(patch <local/dict.patch $dir/lexicon0.txt)"
43 | 
44 | # Dictionary preparation:
45 | # Pre-processing (remove comments and empty lines, then sort)
46 | grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort > $dir/lexicon1.txt || exit 1;
47 | 
48 | cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
49 |     grep -v sil > $dir/nonsilence_phones.txt || exit 1;
50 | 
51 | ( echo sil; echo nsn; ) > $dir/silence_phones.txt
52 | 
53 | echo sil > $dir/optional_silence.txt
54 | 
55 | # No "extra questions" in the input to this setup, as we don't
56 | # have stress or tone.
57 | echo -n > $dir/extra_questions.txt
58 | 
59 | cp local/MSU_single_letter.txt $dir/
60 | # Add to the lexicon the silences, noises etc.
61 | # Add single letter lexicon
62 | # The original swbd lexicon does not have precise single-letter entries,
63 | # e.g. it has no entry for W
64 | ( echo '!sil sil'; echo '<noise> nsn'; echo '<spoken_noise> spn' ) \
65 |     | cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1;
66 | 
67 | # Map the words in the lexicon. That is-- for each word in the lexicon, we map it
68 | # to a new written form. The transformations we do are:
69 | # remove laughter markings, e.g.
70 | # [LAUGHTER-STORY] -> STORY
71 | # Remove partial-words, e.g.
72 | # -[40]1K W AH N K EY
73 | # becomes -1K
74 | # and
75 | # -[AN]Y IY
76 | # becomes
77 | # -Y
78 | # -[A]B[OUT]- B
79 | # becomes
80 | # -B-
81 | # Also, curly braces, which appear to be used for "nonstandard"
82 | # words or non-words, are removed, e.g.
83 | # {WOLMANIZED} W OW L M AX N AY Z D
84 | # -> WOLMANIZED
85 | # Also, mispronounced words, e.g.
86 | # [YEAM/YEAH] Y AE M
87 | # are changed to just e.g. YEAM, i.e. the orthography
88 | # of the mispronounced version.
89 | # Note-- this is only really to be used in training. The main practical
90 | # reason is to avoid having tons of disambiguation symbols, which
91 | # we otherwise would get because there are many partial words with
92 | # the same phone sequences (most problematic: S).
93 | # Also, map
94 | # THEM_1 EH M -> THEM
95 | # so that multiple pronunciations just have alternate entries
96 | # in the lexicon.
97 | local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
98 |     > $dir/lexicon3.txt || exit 1;
99 | 
100 | python local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \
101 |     -L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map
102 | cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map
103 | 
104 | ( echo 'i ay' ) | cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt
105 | 
106 | pushd $dir >&/dev/null
107 | ln -sf lexicon5.txt lexicon.txt # This is the final lexicon.
108 | popd >&/dev/null
109 | log "Prepared input dictionary and phone-sets for Switchboard phase 1."
110 | 
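111 | # For reference, each entry of the final lexicon.txt is "word phone1 phone2 ...",
112 | # one pronunciation per line, all lower-case (e.g. "about ax b aw t"; the
113 | # example entry is illustrative).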
--------------------------------------------------------------------------------
/utils/parse_options.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
4 | #                 Arnab Ghoshal, Karel Vesely
5 | 
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | #  http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15 | # MERCHANTABILITY OR NON-INFRINGEMENT.
16 | # See the Apache 2 License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | 
20 | # Parse command-line options.
21 | # To be sourced by another script (as in ". parse_options.sh").
22 | # Option format is: --option-name arg
23 | # and shell variable "option_name" gets set to value "arg."
24 | # The exception is --help, which takes no arguments, but prints the
25 | # $help_message variable (if defined).
26 | 
27 | 
28 | ###
29 | ### The --config file options have lower priority than command-line
30 | ### options, so we need to import them first...
31 | ###
32 | 
33 | # Now import all the configs specified by command-line, in left-to-right order
34 | for ((argpos=1; argpos<$#; argpos++)); do
35 |   if [ "${!argpos}" == "--config" ]; then
36 |     argpos_plus1=$((argpos+1))
37 |     config=${!argpos_plus1}
38 |     [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
39 |     . $config  # source the config file.
40 |   fi
41 | done
42 | 
43 | 
44 | ###
45 | ### Now we process the command line options
46 | ###
47 | while true; do
48 |   [ -z "${1:-}" ] && break;  # break if there are no arguments
49 |   case "$1" in
50 |     # If the enclosing script is called with --help option, print the help
51 |     # message and exit.  Scripts should put help messages in $help_message
52 |     --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
53 |       else printf "$help_message\n" 1>&2 ; fi;
54 |       exit 0 ;;
55 |     --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
56 |       exit 1 ;;
57 |     # If the first command-line argument begins with "--" (e.g. --foo-bar),
58 |     # then work out the variable name as $name, which will equal "foo_bar".
59 |     --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
60 |       # Next we test whether the variable in question is undefined -- if so it's
61 |       # an invalid option and we die.  Note: $0 evaluates to the name of the
62 |       # enclosing script.
63 |       # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
64 |       # is undefined.  We then have to wrap this test inside "eval" because
65 |       # foo_bar is itself inside a variable ($name).
66 |       eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
67 | 
68 |       oldval="`eval echo \\$$name`";
69 |       # Work out whether we seem to be expecting a Boolean argument.
70 |       if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
71 |         was_bool=true;
72 |       else
73 |         was_bool=false;
74 |       fi
75 | 
76 |       # Set the variable to the right value-- the escaped quotes make it work if
77 |       # the option had spaces, like --cmd "queue.pl -sync y"
78 |       if [ $# -lt 2 ]; then
79 |         echo "$0: no argument provided for option $1" 1>&2
80 |         exit 1;
81 |       else
82 |         eval $name=\"$2\";
83 |       fi
84 | 
85 |       # Check that Boolean-valued arguments are really Boolean.
86 |       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
87 |         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
88 |         exit 1;
89 |       fi
90 |       shift 2;
91 |       ;;
92 |     *) break;
93 |   esac
94 | done
95 | 
96 | 
97 | # Check for an empty argument to the --cmd option, which can easily occur as a
98 | # result of scripting errors.
99 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
100 | 
101 | 
102 | true; # so this script returns exit code 0.
103 | 
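104 | # Example (hypothetical caller): a script that defines defaults
105 | #   stage=0
106 | #   cmd=run.pl
107 | # and then sources this file can be invoked as
108 | #   ./myscript.sh --stage 2 --cmd "queue.pl -sync y"
109 | # which sets $stage to 2 and $cmd to "queue.pl -sync y".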
70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | if [ $# -lt 2 ]; then 79 | echo "$0: no argument provided for option $1" 1>&2 80 | exit 1; 81 | else 82 | eval $name=\"$2\"; 83 | fi 84 | 85 | # Check that Boolean-valued arguments are really Boolean. 86 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 87 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 88 | exit 1; 89 | fi 90 | shift 2; 91 | ;; 92 | *) break; 93 | esac 94 | done 95 | 96 | 97 | # Check for an empty argument to the --cmd option, which can easily occur as a 98 | # result of scripting errors. 99 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 100 | 101 | 102 | true; # so this script returns exit code 0. 103 | -------------------------------------------------------------------------------- /speech_datasets/utils/types.py: -------------------------------------------------------------------------------- 1 | from distutils.util import strtobool 2 | from typing import Optional, Tuple, Union 3 | 4 | import humanfriendly 5 | import numpy as np 6 | from typeguard import check_argument_types 7 | 8 | 9 | class CMVNStats(object): 10 | def __init__(self, count, sum, sum_squares): 11 | self.count = count 12 | self.sum = sum 13 | self.sum_squares = sum_squares 14 | 15 | def __iadd__(self, other): 16 | self.count += other.count 17 | self.sum += other.sum 18 | self.sum_squares += other.sum_squares 19 | return self 20 | 21 | @classmethod 22 | def from_numpy(cls, stats): 23 | stats = np.copy(stats) 24 | assert len(stats) == 2, stats.shape 25 | # If feat has >2 dims, only use the first one for count 26 | count = stats[0, -1].flatten()[0] 27 | return cls(count=count, sum=stats[0, :-1], sum_squares=stats[1, :-1]) 28 | 29 | def to_numpy(self): 30 | shape = (2, self.sum.shape[0] + 1, *self.sum.shape[1:]) 31 | arr = np.empty(shape, dtype=np.float64) 32 | arr[0, :-1] = self.sum 33 | arr[1, :-1] = self.sum_squares 34 | arr[0, -1] = self.count 35 | arr[1, -1] = 0.0 36 | return arr 37 | 38 | 39 | def str2bool(value: str) -> bool: 40 | return bool(strtobool(value)) 41 | 42 | 43 | def int_or_none(value: str) -> Optional[int]: 44 | """int_or_none. 45 | 46 | Examples: 47 | >>> import argparse 48 | >>> parser = argparse.ArgumentParser() 49 | >>> _ = parser.add_argument('--foo', type=int_or_none) 50 | >>> parser.parse_args(['--foo', '456']) 51 | Namespace(foo=456) 52 | >>> parser.parse_args(['--foo', 'none']) 53 | Namespace(foo=None) 54 | >>> parser.parse_args(['--foo', 'null']) 55 | Namespace(foo=None) 56 | >>> parser.parse_args(['--foo', 'nil']) 57 | Namespace(foo=None) 58 | 59 | """ 60 | if value.strip().lower() in ("none", "null", "nil"): 61 | return None 62 | return int(value) 63 | 64 | 65 | def float_or_none(value: str) -> Optional[float]: 66 | """float_or_none. 
67 | 
68 |     Examples:
69 |         >>> import argparse
70 |         >>> parser = argparse.ArgumentParser()
71 |         >>> _ = parser.add_argument('--foo', type=float_or_none)
72 |         >>> parser.parse_args(['--foo', '4.5'])
73 |         Namespace(foo=4.5)
74 |         >>> parser.parse_args(['--foo', 'none'])
75 |         Namespace(foo=None)
76 |         >>> parser.parse_args(['--foo', 'null'])
77 |         Namespace(foo=None)
78 |         >>> parser.parse_args(['--foo', 'nil'])
79 |         Namespace(foo=None)
80 | 
81 |     """
82 |     if value.strip().lower() in ("none", "null", "nil"):
83 |         return None
84 |     return float(value)
85 | 
86 | 
87 | def humanfriendly_or_none(value) -> Optional[float]:
88 |     if value.strip().lower() in ("none", "null", "nil"):
89 |         return None
90 |     return humanfriendly.parse_size(value)
91 | 
92 | 
93 | def str2int_tuple(integers: str) -> Optional[Tuple[int, ...]]:
94 |     """
95 | 
96 |     >>> str2int_tuple('3,4,5')
97 |     (3, 4, 5)
98 | 
99 |     """
100 |     assert check_argument_types()
101 |     if integers.strip() in ("none", "None", "NONE", "null", "Null", "NULL"):
102 |         return None
103 |     return tuple(map(int, integers.strip().split(",")))
104 | 
105 | 
106 | def str_or_int(value: str) -> Union[str, int]:
107 |     try:
108 |         return int(value)
109 |     except ValueError:
110 |         return value
111 | 
112 | 
113 | def str_or_none(value: str) -> Optional[str]:
114 |     """str_or_none.
115 | 
116 |     Examples:
117 |         >>> import argparse
118 |         >>> parser = argparse.ArgumentParser()
119 |         >>> _ = parser.add_argument('--foo', type=str_or_none)
120 |         >>> parser.parse_args(['--foo', 'aaa'])
121 |         Namespace(foo='aaa')
122 |         >>> parser.parse_args(['--foo', 'none'])
123 |         Namespace(foo=None)
124 |         >>> parser.parse_args(['--foo', 'null'])
125 |         Namespace(foo=None)
126 |         >>> parser.parse_args(['--foo', 'nil'])
127 |         Namespace(foo=None)
128 | 
129 |     """
130 |     if value.strip().lower() in ("none", "null", "nil"):
131 |         return None
132 |     return value
133 | 
--------------------------------------------------------------------------------
/speech_datasets/transform/cmvn.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 | 
5 | import numpy as np
6 | 
7 | from speech_datasets.transform.interface import TransformInterface
8 | from speech_datasets.utils import get_root
9 | from speech_datasets.utils.readers import read_cmvn_stats
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | class CMVN(TransformInterface):
15 |     def __init__(self, cmvn_type: str, stats: str = None, norm_means=True,
16 |                  norm_vars=False, utt2spk: str = None, reverse=False,
17 |                  std_floor=1.0e-20):
18 |         self.norm_means = norm_means
19 |         self.norm_vars = norm_vars
20 |         self.reverse = reverse
21 |         self.std_floor = std_floor
22 | 
23 |         assert cmvn_type in ["global", "speaker", "utterance"], cmvn_type
24 |         self.accept_uttid = (cmvn_type != "global")
25 |         self.cmvn_type = cmvn_type
26 |         if cmvn_type != "utterance":
27 |             assert stats is not None, "stats required if cmvn_type != 'utterance'"
28 |             try:
29 |                 self.stats_file = stats
30 |                 stats_dict = read_cmvn_stats(self.stats_file, cmvn_type)
31 |             except FileNotFoundError:
32 |                 self.stats_file = os.path.join(get_root(), stats)
33 |                 stats_dict = read_cmvn_stats(self.stats_file, cmvn_type)
34 |         else:
35 |             if stats is not None:
36 |                 logger.warning("stats file is not used if cmvn_type is 'utterance'")
37 |             self.stats_file = None
38 |             stats_dict = {}
39 | 
40 |         if cmvn_type == "speaker":
41 |             assert utt2spk is not None, "utt2spk required if cmvn_type is 'speaker'"
42 |             self.utt2spk = {}
43 |             with io.open(utt2spk, "r", encoding="utf-8") as f:
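107 | # Worked example of the statistics above (numbers hypothetical): with count=2,
108 | # sum=[4.0] and sum_squares=[10.0], mean = 4/2 = 2.0 and var = 10/2 - 2.0**2 = 1.0,
109 | # so with norm_means and norm_vars enabled a feature value x = 3.0 is mapped to
110 | # (3.0 - 2.0) * (1 / sqrt(1.0)) = 1.0.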
encoding="utf-8") as f: 44 | for line in f: 45 | utt, spk = line.rstrip().split(None, maxsplit=1) 46 | self.utt2spk[utt] = spk 47 | else: 48 | if utt2spk is not None: 49 | logger.warning("utt2spk is only used if cmvn_type is 'speaker'") 50 | self.utt2spk = None 51 | 52 | # Kaldi makes a matrix for CMVN which has a shape of (2, feat_dim + 1), 53 | # and the first vector contains the sum of feats and the second is 54 | # the sum of squares. The last value of the first, i.e. stats[0,-1], 55 | # is the number of samples for this statistics. 56 | self.bias = {} 57 | self.scale = {} 58 | for spk, stats in stats_dict.items(): 59 | # Var[x] = E[x^2] - E[x]^2 60 | mean = stats.sum / stats.count 61 | var = stats.sum_squares / stats.count - mean * mean 62 | std = np.maximum(np.sqrt(var), std_floor) 63 | self.bias[spk] = -mean 64 | self.scale[spk] = 1 / std 65 | 66 | def __repr__(self): 67 | return ( 68 | "{name}(stats_file={stats_file}, " 69 | "norm_means={norm_means}, norm_vars={norm_vars}, " 70 | "reverse={reverse})".format( 71 | name=self.__class__.__name__, 72 | stats_file=self.stats_file, 73 | norm_means=self.norm_means, 74 | norm_vars=self.norm_vars, 75 | reverse=self.reverse, 76 | ) 77 | ) 78 | 79 | def __call__(self, x, uttid=None): 80 | if self.cmvn_type == "global": 81 | bias = self.bias[None] 82 | scale = self.scale[None] 83 | elif self.cmvn_type == "speaker": 84 | spk = self.utt2spk[uttid] 85 | bias = self.bias[spk] 86 | scale = self.scale[spk] 87 | else: # self.cmvn_type == "utterance" 88 | mean = x.mean(axis=0) 89 | mse = (x ** 2).sum(axis=0) / x.shape[0] 90 | bias = -mean 91 | scale = 1 / np.maximum(np.sqrt(mse - mean ** 2), self.std_floor) 92 | 93 | if not self.reverse: 94 | if self.norm_means: 95 | x = np.add(x, bias) 96 | if self.norm_vars: 97 | x = np.multiply(x, scale) 98 | 99 | else: 100 | if self.norm_vars: 101 | x = np.divide(x, scale) 102 | if self.norm_means: 103 | x = np.subtract(x, bias) 104 | 105 | return x 106 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== 2 | # Usage: .pl [options] JOB=1: 3 | # e.g. 4 | # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB 5 | # 6 | # Options: 7 | # --time