├── COMBINE
│   ├── tts1
│   │   ├── cmd.sh
│   │   ├── db.sh
│   │   ├── run.sh
│   │   ├── utils
│   │   ├── path.sh
│   │   ├── multi_tokenize.sh
│   │   ├── combine_cmvn_stats.sh
│   │   ├── combine_train_data.sh
│   │   ├── conf
│   │   │   ├── fbank.yaml
│   │   │   ├── fbank_pitch.yaml
│   │   │   ├── pbs.conf
│   │   │   ├── gpu.conf
│   │   │   ├── queue.conf
│   │   │   └── slurm.conf
│   │   └── local
│   │       └── combine_datasets.py
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── cmd.sh
│       ├── path.sh
│       ├── run.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── combine_train_data.sh
│       ├── multi_tokenize.sh
│       ├── combine_cmvn_stats.sh
│       └── local
│           └── combine_datasets.py
├── TEMPLATE
│   ├── asr1
│   │   ├── utils
│   │   ├── conf
│   │   │   ├── fbank.yaml
│   │   │   ├── fbank_pitch.yaml
│   │   │   ├── pbs.conf
│   │   │   ├── queue.conf
│   │   │   └── slurm.conf
│   │   ├── setup.sh
│   │   ├── path.sh
│   │   ├── db.sh
│   │   └── cmd.sh
│   └── tts1
│       ├── db.sh
│       ├── utils
│       ├── cmd.sh
│       ├── path.sh
│       ├── tts.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       └── setup.sh
├── speech_datasets
│   ├── bin
│   │   ├── __init__.py
│   │   ├── spm_train.py
│   │   ├── combine_cmvn_stats.py
│   │   ├── feat_to_shape.py
│   │   ├── dump.py
│   │   ├── apply_cmvn.py
│   │   └── compute_cmvn_stats.py
│   ├── text
│   │   ├── __init__.py
│   │   └── tokenizers.py
│   ├── transform
│   │   ├── __init__.py
│   │   ├── add_deltas.py
│   │   ├── interface.py
│   │   └── cmvn.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── misc.py
│   │   └── types.py
│   └── __init__.py
├── fisher
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           └── data.sh
├── swbd
│   └── asr1
│       ├── asr.sh
│       ├── cmd.sh
│       ├── db.sh
│       ├── utils
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── local
│       │   ├── MSU_single_letter.txt
│       │   ├── map_acronyms_transcripts.py
│       │   ├── swbd1_map_words.pl
│       │   ├── swbd1_fix_speakerid.pl
│       │   ├── data.sh
│       │   ├── extend_segments.pl
│       │   ├── swbd1_prepare_dict.sh
│       │   ├── rt03_data_prep.sh
│       │   └── format_acronyms_dict.py
│       └── run.sh
├── wsj
│   └── asr1
│       ├── asr.sh
│       ├── cmd.sh
│       ├── db.sh
│       ├── path.sh
│       ├── utils
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── flist2scp.pl
│           ├── wsj_format_data.sh
│           ├── data.sh
│           ├── find_transcripts.pl
│           ├── ndx2flist.pl
│           └── normalize_transcript.pl
├── commonvoice
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── filter_text.py
│           ├── data.sh
│           ├── download_and_untar.sh
│           ├── reduce_data_dir.sh
│           ├── split_tr_dt_et.sh
│           └── data_prep.pl
├── example
│   ├── requirements.txt
│   ├── resources
│   │   ├── global_cmvn_fbank.ark
│   │   ├── librispeech_bpe2000.model
│   │   ├── global_cmvn_fbank_pitch.ark
│   │   ├── fbank.yaml
│   │   └── fbank_pitch.yaml
│   ├── utils.py
│   └── README.md
├── librispeech
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── download_and_untar.sh
│           └── data.sh
├── .gitignore
├── CODEOWNERS
├── SECURITY.md
├── tools
│   ├── install_sph2pipe.sh
│   ├── install_pkgs.sh
│   └── install_anaconda.sh
├── setup.py
├── utils
│   ├── make_absolute.sh
│   ├── spk2utt_to_utt2spk.pl
│   ├── utt2spk_to_spk2utt.pl
│   ├── shuffle_list.pl
│   ├── compute_cmvn_stats.sh
│   ├── remove_dup_utts.sh
│   ├── feat_to_shape.sh
│   ├── subset_data_dir_tr_cv.sh
│   ├── apply_cmvn.sh
│   ├── filter_scp.pl
│   ├── subset_scp.pl
│   ├── apply_map.pl
│   ├── sym2int.pl
│   ├── parse_options.sh
│   ├── combine_data.sh
│   └── dump.sh
├── docker
│   └── Dockerfile
└── Makefile
/COMBINE/tts1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../asr1/cmd.sh -------------------------------------------------------------------------------- /COMBINE/tts1/db.sh: -------------------------------------------------------------------------------- 1 | ../asr1/db.sh -------------------------------------------------------------------------------- /COMBINE/tts1/run.sh: -------------------------------------------------------------------------------- 1 | ../asr1/run.sh -------------------------------------------------------------------------------- /COMBINE/tts1/utils: -------------------------------------------------------------------------------- 1 | ../asr1/utils -------------------------------------------------------------------------------- /TEMPLATE/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../utils -------------------------------------------------------------------------------- /TEMPLATE/tts1/db.sh: -------------------------------------------------------------------------------- 1 | ../asr1/db.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/utils: -------------------------------------------------------------------------------- 1 | ../asr1/utils -------------------------------------------------------------------------------- /speech_datasets/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /COMBINE/tts1/path.sh: -------------------------------------------------------------------------------- 1 | ../asr1/path.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../asr1/cmd.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/path.sh: -------------------------------------------------------------------------------- 1 | ../asr1/path.sh -------------------------------------------------------------------------------- /TEMPLATE/tts1/tts.sh: -------------------------------------------------------------------------------- 1 | ../asr1/asr.sh -------------------------------------------------------------------------------- /COMBINE/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /COMBINE/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /fisher/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /fisher/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /swbd/asr1/asr.sh: 
-------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /swbd/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /swbd/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /swbd/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /wsj/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /wsj/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /wsj/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /wsj/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /wsj/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /COMBINE/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /COMBINE/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /COMBINE/asr1/run.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /commonvoice/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /commonvoice/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /example/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.4.0 2 | -------------------------------------------------------------------------------- /fisher/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /fisher/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh 
-------------------------------------------------------------------------------- /fisher/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /librispeech/asr1/db.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/db.sh -------------------------------------------------------------------------------- /librispeech/asr1/utils: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/utils -------------------------------------------------------------------------------- /swbd/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /commonvoice/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /commonvoice/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /commonvoice/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /librispeech/asr1/asr.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/asr.sh -------------------------------------------------------------------------------- /librispeech/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/cmd.sh -------------------------------------------------------------------------------- /librispeech/asr1/path.sh: -------------------------------------------------------------------------------- 1 | ../../TEMPLATE/asr1/path.sh -------------------------------------------------------------------------------- /COMBINE/tts1/multi_tokenize.sh: -------------------------------------------------------------------------------- 1 | ../asr1/multi_tokenize.sh -------------------------------------------------------------------------------- /COMBINE/tts1/combine_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | ../asr1/combine_cmvn_stats.sh -------------------------------------------------------------------------------- /COMBINE/tts1/combine_train_data.sh: -------------------------------------------------------------------------------- 1 | ../asr1/combine_train_data.sh -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .DS_Store 4 | .idea/ 5 | cmake-build-debug/ 6 | -------------------------------------------------------------------------------- /fisher/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /swbd/asr1/conf/fbank.yaml: 
-------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /wsj/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /swbd/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /wsj/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/fbank_pitch.yaml: 
-------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /fisher/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 8000 4 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | -------------------------------------------------------------------------------- /speech_datasets/text/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub-package""" 2 | from speech_datasets.text.tokenizers import SentencepieceTokenizer 3 | -------------------------------------------------------------------------------- /example/resources/global_cmvn_fbank.ark: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/global_cmvn_fbank.ark -------------------------------------------------------------------------------- /example/resources/librispeech_bpe2000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/librispeech_bpe2000.model -------------------------------------------------------------------------------- /speech_datasets/transform/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize main package.""" 2 | from speech_datasets.transform.transformation import Transformation 3 | -------------------------------------------------------------------------------- /example/resources/global_cmvn_fbank_pitch.ark: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/global_cmvn_fbank_pitch.ark -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing. 
2 | #ECCN:Open Source 3 | -------------------------------------------------------------------------------- /speech_datasets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package & bring general util into this namespace.""" 2 | from speech_datasets.utils.misc import get_root, check_kwargs, dynamic_import, set_deterministic_pytorch 3 | -------------------------------------------------------------------------------- /speech_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize main package.""" 2 | import pkg_resources 3 | from speech_datasets.dataloader import SpeechDataLoader 4 | 5 | try: 6 | __version__ = pkg_resources.get_distribution("speech_datasets").version 7 | except Exception: 8 | __version__ = "(Not installed from setup.py)" 9 | del pkg_resources 10 | -------------------------------------------------------------------------------- /example/resources/fbank.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | - type: cmvn 5 | cmvn_type: global 6 | stats: resources/global_cmvn_fbank.ark 7 | norm_vars: True 8 | - type: spec_augment 9 | n_freq_mask: 2 10 | max_freq_width: 27 11 | n_time_mask: 2 12 | max_time_width: 100 13 | max_time_warp: 80 14 | -------------------------------------------------------------------------------- /swbd/asr1/local/MSU_single_letter.txt: -------------------------------------------------------------------------------- 1 | A ey 2 | B b iy 3 | C s iy 4 | D d iy 5 | E iy 6 | F eh f 7 | G jh iy 8 | H ey ch 9 | I ay 10 | J jh ey 11 | K k ey 12 | L eh l 13 | M eh m 14 | N eh n 15 | O ow 16 | P p iy 17 | Q k y uw 18 | R aa r 19 | S eh s 20 | T t iy 21 | U y uw 22 | V v iy 23 | W d ah b ax l y uw 24 | X eh k s 25 | Y w ay 26 | Z z iy 27 | -------------------------------------------------------------------------------- /example/resources/fbank_pitch.yaml: -------------------------------------------------------------------------------- 1 | - type: fbank_pitch 2 | num_mel_bins: 80 3 | sample_frequency: 16000 4 | - type: cmvn 5 | cmvn_type: global 6 | stats: resources/global_cmvn_fbank_pitch.ark 7 | norm_vars: True 8 | - type: spec_augment 9 | n_freq_mask: 2 10 | max_freq_width: 27 11 | n_time_mask: 2 12 | max_time_width: 100 13 | max_time_warp: 80 14 | -------------------------------------------------------------------------------- /wsj/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | 
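# (added note, not in the original file: in Kaldi-style parallel configs, "default gpu=0" gives every job an implicit gpu option; the bare "option gpu=0" below adds no qsub flags when gpu is 0, and "option gpu=*" handles any other value)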
option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /fisher/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /swbd/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | 
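# Illustrative expansion (added note, assuming Kaldi-style pbs.pl semantics): a job launched as "pbs.pl --mem 4G --gpu 1 exp/log cmd" would be submitted roughly as "qsub -V -v PATH -S /bin/bash -l mem=4G -l ngpus=1" under the rules above.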
-------------------------------------------------------------------------------- /librispeech/asr1/conf/pbs.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -V -v PATH -S /bin/bash 3 | option name=* -N $0 4 | option mem=* -l mem=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -l ncpus=$0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option num_nodes=* -l nodes=$0:ppn=1 9 | default gpu=0 10 | option gpu=0 11 | option gpu=* -l ngpus=$0 12 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com) 4 | as soon as it is discovered. This library limits its runtime dependencies in 5 | order to reduce the total cost of ownership as much as can be, but all consumers 6 | should remain vigilant and have their security stakeholders review all third-party 7 | products (3PP) like this one and their dependencies. 8 | -------------------------------------------------------------------------------- /speech_datasets/bin/spm_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE 7 | import sys 8 | 9 | import sentencepiece as spm 10 | 11 | 12 | if __name__ == "__main__": 13 | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) 14 | -------------------------------------------------------------------------------- /wsj/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /COMBINE/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /COMBINE/tts1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 
'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /swbd/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /librispeech/asr1/conf/gpu.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option mem=* -l mem_free=$0,ram_free=$0 4 | option mem=0 # Do not add anything to qsub_opts 5 | option num_threads=* -pe smp $0 6 | option num_threads=1 # Do not add anything to qsub_opts 7 | option max_jobs_run=* -tc $0 8 | default gpu=0 9 | option gpu=0 10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q -------------------------------------------------------------------------------- /tools/install_sph2pipe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [ $# != 1 ]; then 5 | echo "Usage: $0 <dir>" 6 | exit 1; 7 | fi 8 | pwd=$PWD 9 | dir=$1 10 | 11 | if [ ! -e sph2pipe_v2.5.tar.gz ]; then 12 | wget --no-check-certificate https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz 13 | fi 14 | 15 | tar xzvf sph2pipe_v2.5.tar.gz -C $dir 16 | rm sph2pipe_v2.5.tar.gz 17 | 18 | cd $dir/sph2pipe_v2.5 19 | gcc -o sph2pipe *.c -lm 20 | cd $pwd 21 | -------------------------------------------------------------------------------- /wsj/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | 
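# (added note: "$0" in these config rules is a placeholder that the Kaldi-style launcher substitutes with the option's value, e.g. name=train yields "-N train"; it is config syntax, not a shell variable)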
option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /fisher/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /swbd/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe 
smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/queue.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* 3 | option name=* -N $0 4 | option mem=* -l mem_free=$0,ram_free=$0 5 | option mem=0 # Do not add anything to qsub_opts 6 | option num_threads=* -pe smp $0 7 | option num_threads=1 # Do not add anything to qsub_opts 8 | option max_jobs_run=* -tc $0 9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1 10 | default gpu=0 11 | option gpu=0 12 | option gpu=* -l gpu=$0 -q g.q 13 | -------------------------------------------------------------------------------- /fisher/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /swbd/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /wsj/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 
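# Illustrative expansion (added note, assuming Kaldi-style slurm.pl semantics): "slurm.pl --mem 8G --gpu 1 exp/log cmd" would map to roughly "sbatch --export=PATH --mem-per-cpu 8G -p gpu --gres=gpu:1" under the rules above.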
15 | -------------------------------------------------------------------------------- /COMBINE/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /COMBINE/tts1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 
15 | -------------------------------------------------------------------------------- /commonvoice/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /librispeech/asr1/conf/slurm.conf: -------------------------------------------------------------------------------- 1 | # Default configuration 2 | command sbatch --export=PATH 3 | option name=* --job-name $0 4 | option time=* --time $0 5 | option mem=* --mem-per-cpu $0 6 | option mem=0 # Do not add anything to qsub_opts 7 | option num_threads=* --cpus-per-task $0 8 | option num_threads=1 --cpus-per-task 1 9 | option num_nodes=* --nodes $0 10 | default gpu=0 11 | option gpu=0 -p cpu 12 | option gpu=* -p gpu --gres=gpu:$0 13 | # note: the --max-jobs-run option is supported as a special case 14 | # by slurm.pl and you don't have to handle it in the config file. 15 | -------------------------------------------------------------------------------- /fisher/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | srctexts="data/train_fisher/text " 14 | train_sets="train_fisher " 15 | 16 | ./asr.sh \ 17 | --fs 8000 \ 18 | --n_tokens 2000 \ 19 | --token_type bpe \ 20 | --train_sets "${train_sets}" \ 21 | --dev_eval_sets "" \ 22 | --srctexts "${srctexts}" "$@" 23 | -------------------------------------------------------------------------------- /commonvoice/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru 9 | 10 | train_set=valid_train_${lang} 11 | train_dev=valid_dev_${lang} 12 | train_test=valid_test_${lang} 13 | 14 | ./asr.sh \ 15 | --local_data_opts "--lang ${lang}" \ 16 | --fs 16000 \ 17 | --n_tokens 2000 \ 18 | --token_type bpe \ 19 | --feats_type fbank_pitch \ 20 | --train_sets "${train_set}" \ 21 | --dev_eval_sets "${train_dev} ${train_test}" \ 22 | --srctexts "data/${train_set}/text" "$@" 23 | -------------------------------------------------------------------------------- /swbd/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | train_sets="swbd1_train " 14 | dev_set="swbd1_dev" 15 | eval_sets="eval2000 rt03 " 16 | srctexts="data/swbd1_train/text " 17 | 18 | ./asr.sh \ 19 | --fs 8000 \ 20 | --n_tokens 2000 \ 21 | --token_type bpe \ 22 | --train_sets "${train_sets}" \ 23 | --dev_eval_sets "${dev_set} ${eval_sets}" \ 24 | --srctexts "${srctexts}" "$@" 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="speech_datasets", 5 | version="0.1.0", 6 | author="Aadyot Bhatnagar", 7 | author_email="abhatnagar@salesforce.com", 8 | license="Apache-2.0", 9 | packages=find_packages(include=["speech_datasets*"]), 10 | install_requires=[ 11 | "h5py>=2.9.0", 12 | "humanfriendly", 13 | "Kaldiio", 14 | "numpy", 15 | "pillow>=6.1.0", 16 | "PyYAML>=5.1.2", 17 | "ray[tune]", 18 | "resampy", 19 | "scipy", 20 | "sentencepiece<0.1.90,>=0.1.82", 21 | "soundfile>=0.10.2", 22 | "torch>=1.2.0", 23 | "tqdm", 24 | "typeguard>=2.7.0", 25 | ] 26 | ) 27 | -------------------------------------------------------------------------------- /wsj/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'unbound variable', -o ... 'error in pipeline' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | train_set="train_si284 " 9 | dev_set="test_dev93 " 10 | eval_sets="test_eval92 " 11 | 12 | # Even though data/nlsyms.txt is generated, we don't provide it to asr.sh 13 | # because the only non-linguistic symbol it contains is "", which is 14 | # the default value for nlsyms. 15 | ./asr.sh \ 16 | --fs 16000 \ 17 | --n_tokens 75 \ 18 | --token_type bpe \ 19 | --train_sets "${train_set}" \ 20 | --dev_eval_sets "${dev_set} ${eval_sets}" \ 21 | --srctexts "data/train_si284/text data/local/other_text/text" "$@" 22 | -------------------------------------------------------------------------------- /utils/make_absolute.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script replaces the command readlink -f (which is not portable). 4 | # It turns a pathname into an absolute pathname, including following soft links. 5 | target_file=$1 6 | 7 | cd $(dirname $target_file) 8 | target_file=$(basename $target_file) 9 | 10 | # Iterate down a (possible) chain of symlinks 11 | while [ -L "$target_file" ]; do 12 | target_file=$(readlink $target_file) 13 | cd $(dirname $target_file) 14 | target_file=$(basename $target_file) 15 | done 16 | 17 | # Compute the canonicalized name by finding the physical path 18 | # for the directory we're in and appending the target file. 
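# (added example, hypothetical paths: running "utils/make_absolute.sh conf/../data" from /home/user prints "/home/user/data")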
19 | phys_dir=$(pwd -P) 20 | result=$phys_dir/$target_file 21 | echo $result 22 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:latest 2 | RUN apt-get update 3 | RUN apt-get install -y --no-install-recommends \ 4 | apt-utils wget bc gawk vim emacs build-essential locales libfontconfig1 automake \ 5 | sox flac ffmpeg libasound2-dev libsndfile1-dev \ 6 | libfftw3-dev libopenblas-dev libgflags-dev libgoogle-glog-dev gfortran \ 7 | python3 python3-dev python3-pip python3-numpy python3-setuptools 8 | RUN apt update 9 | RUN apt install -y openssh-server openssh-client 10 | 11 | # Default to utf-8 encodings in python 12 | # Can verify in container with: 13 | # python -c 'import locale; print(locale.getpreferredencoding(False))' 14 | RUN locale-gen en_US.UTF-8 15 | ENV LANG en_US.UTF-8 16 | ENV LANGUAGE en_US:en 17 | ENV LC_ALL en_US.UTF-8 18 | -------------------------------------------------------------------------------- /tools/install_pkgs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -euo pipefail 5 | 6 | # This is needed for certain pods (ffmpeg-3 doesn't exist anymore & messes up apt gets) 7 | rm -f /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-3* 8 | apt-get remove libflac8 -y 9 | apt-get update -y 10 | apt-get upgrade -y 11 | apt-get autoremove -y 12 | 13 | # The actual apt installs we need 14 | apt-get install -y apt-utils 15 | apt-get install -y gawk 16 | apt-get install -y build-essential libfontconfig1 automake 17 | apt-get install -y sox flac ffmpeg libasound2-dev libsndfile1-dev 18 | apt-get install -y libfftw3-dev libopenblas-dev libgflags-dev libgoogle-glog-dev 19 | apt-get install -y gfortran python3 20 | apt-get install -y bc 21 | apt-get install -y wget 22 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/filter_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import argparse 7 | import codecs 8 | from io import open 9 | import sys 10 | 11 | 12 | sys.stdin = codecs.getreader("utf-8")(sys.stdin.buffer) 13 | sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer) 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--filter-list", "-f", type=str, help="filter list") 19 | args = parser.parse_args() 20 | 21 | with open(args.filter_list, encoding="utf-8") as f: 22 | fil = [x.rstrip() for x in f] 23 | 24 | for x in sys.stdin: 25 | # extract text parts 26 | text = " ".join(x.rstrip().split()[1:]) 27 | if text in fil: 28 | print(x.split()[0], text) 29 | -------------------------------------------------------------------------------- /fisher/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | . ./path.sh || exit 1 12 | . 
./db.sh || exit 1 13 | 14 | 15 | # Extract & prepare Fisher 16 | for (( i=1; i<=$(echo "${FISHER_TGZ}" | wc -w); i++ )); do 17 | src=$(echo "${FISHER_TGZ}" | cut -d " " -f $i) 18 | dst=$(echo "${FISHER}" | cut -d " " -f $i) 19 | if [ ! -e "${dst}" ]; then 20 | mkdir -p "${dst}" 21 | { 22 | tar xzvf "${src}" -C "${dst}" 23 | } || { 24 | log "Failed to extract FISHER (part $i)" 25 | exit 1 26 | } 27 | fi 28 | done 29 | 30 | # Note: do not quote ${FISHER} -- it should contain 4 directories, and fisher_data_prep.sh needs all 4 31 | log "local/fisher_data_prep.sh ${FISHER}" 32 | local/fisher_data_prep.sh ${FISHER} -------------------------------------------------------------------------------- /utils/spk2utt_to_utt2spk.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | while(<>){ 19 | @A = split(" ", $_); 20 | @A > 1 || die "Invalid line in spk2utt file: $_"; 21 | $s = shift @A; 22 | foreach $u ( @A ) { 23 | print "$u $s\n"; 24 | } 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /librispeech/asr1/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Get the datasets we want to use based on the command-line args 14 | train_sets="train-clean-100 train-clean-360 train-other-500 " 15 | dev_sets="dev-clean dev-other " 16 | eval_sets="test-clean test-other " 17 | srctexts= 18 | for dset in ${train_sets}; do 19 | srctexts+="data/${dset}/text " 20 | done 21 | 22 | ./asr.sh \ 23 | --fs 16000 \ 24 | --n_tokens 2000 \ 25 | --token_type bpe \ 26 | --train_sets "${train_sets}" \ 27 | --dev_eval_sets "${dev_sets} ${eval_sets}" \ 28 | --srctexts "${srctexts}" \ 29 | --local_data_opts "${eval_sets} ${dev_sets} ${train_sets}" "$@" -------------------------------------------------------------------------------- /example/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import numpy as np 3 | 4 | 5 | def edit_dist(pred: List[int], label: List[int]) -> int: 6 | """Computes the edit distance between a predicted and label sequence.""" 7 | # dists[i, j] = edit_dist(pred[:i], label[:j]) 8 | pred_len, label_len = len(pred), len(label) 9 | dists = np.zeros((pred_len + 1, label_len + 1), dtype=int) 10 | 11 | dists[:, 0] = np.arange(pred_len + 1) 12 | dists[0, :] = np.arange(label_len + 1) 13 | 14 | for i, x in enumerate(pred): 15 | for j, y in enumerate(label): 16 | sub_delta = int(x != y) 17 | ins_delta = 1 18 | del_delta = 1 19 | 20 | substitution = dists[i, j] + sub_delta 21 | insertion = dists[i, j+1] + ins_delta # pred[:i] --> pred[:i+1] 22 | deletion = dists[i+1, j] + del_delta # label[:j] --> label[:j+1] 23 | dists[i+1, j+1] = min(substitution, insertion, deletion) 24 | 25 | return dists[-1, -1].item() 26 | -------------------------------------------------------------------------------- /speech_datasets/transform/add_deltas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from speech_datasets.transform.interface import FuncTrans 4 | 5 | 6 | def delta(feat, window): 7 | assert window > 0 8 | delta_feat = np.zeros_like(feat) 9 | for i in range(1, window + 1): 10 | delta_feat[:-i] += i * feat[i:] 11 | delta_feat[i:] += -i * feat[:-i] 12 | delta_feat[-i:] += i * feat[-1] 13 | delta_feat[:i] += -i * feat[0] 14 | delta_feat /= 2 * sum(i ** 2 for i in range(1, window + 1)) 15 | return delta_feat 16 | 17 | 18 | def add_deltas(x, window=2, order=2): 19 | """ 20 | :param x: Features 21 | :param window: size of the window to use to approximate time derivative computation 22 | :param order: highest order time derivative to compute 23 | :return: Features, concatenated with all the relevant derivatives 24 | """ 25 | feats = [x] 26 | for _ in range(order): 27 | feats.append(delta(feats[-1], window)) 28 | return np.concatenate(feats, axis=1) 29 | 30 | 31 | class AddDeltas(FuncTrans): 32 | _func = add_deltas 33 | __doc__ = add_deltas.__doc__ 34 | -------------------------------------------------------------------------------- /speech_datasets/bin/combine_cmvn_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from speech_datasets.utils.readers import read_cmvn_stats 4 | from speech_datasets.utils.writers import write_cmvn_stats 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser() 9 | 
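# (added comment, not in the original: --cmvn_type selects how the stats are keyed, i.e. one global entry, one entry per speaker, or one per utterance, and should match how the input stats files were computed)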
parser.add_argument("--cmvn_type", choices=["global", "speaker", "utterance"]) 10 | parser.add_argument("--output_file", type=str) 11 | parser.add_argument("cmvn_stats_files", nargs="+") 12 | return parser.parse_args() 13 | 14 | 15 | def combine_cmvn_dicts(stats_dicts): 16 | out_dict = {} 17 | for d in stats_dicts: 18 | for spk, val in d.items(): 19 | if spk not in out_dict: 20 | out_dict[spk] = val 21 | else: 22 | out_dict[spk] += val 23 | return out_dict 24 | 25 | 26 | def main(): 27 | args = parse_args() 28 | out_dict = combine_cmvn_dicts(read_cmvn_stats(path, args.cmvn_type) 29 | for path in args.cmvn_stats_files) 30 | write_cmvn_stats(args.output_file, args.cmvn_type, out_dict) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | help_message=$(cat << EOF 13 | Usage: $0 14 | EOF 15 | ) 16 | 17 | 18 | if [ $# -ne 1 ]; then 19 | log "${help_message}" 20 | log "Error: 1 positional argument is required." 21 | exit 2 22 | fi 23 | 24 | 25 | dir=$1 26 | mkdir -p "${dir}" 27 | 28 | if [ ! -d "${dir}"/../../TEMPLATE ]; then 29 | log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory." 30 | exit 1 31 | fi 32 | 33 | targets="" 34 | 35 | # Copy 36 | for f in conf; do 37 | target="${dir}"/../../TEMPLATE/asr1/"${f}" 38 | cp -r "${target}" "${dir}" 39 | targets+="${dir}/${target} " 40 | done 41 | 42 | 43 | # Symlinks to TEMPLATE & Kaldi 44 | for f in asr.sh cmd.sh path.sh db.sh utils; do 45 | target=../../TEMPLATE/asr1/"${f}" 46 | ln -sf "${target}" "${dir}" 47 | targets+="${dir}/${target} " 48 | done 49 | 50 | 51 | log "Created: ${targets}" 52 | -------------------------------------------------------------------------------- /TEMPLATE/tts1/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode, it will exit on : 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | help_message=$(cat << EOF 13 | Usage: $0 14 | EOF 15 | ) 16 | 17 | 18 | if [ $# -ne 1 ]; then 19 | log "${help_message}" 20 | log "Error: 1 positional argument is required." 21 | exit 2 22 | fi 23 | 24 | 25 | dir=$1 26 | mkdir -p "${dir}" 27 | 28 | if [ ! -d "${dir}"/../../TEMPLATE ]; then 29 | log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory." 
30 | exit 1 31 | fi 32 | 33 | targets="" 34 | 35 | # Copy 36 | for f in conf; do 37 | target="${dir}"/../../TEMPLATE/tts1/"${f}" 38 | cp -r "${target}" "${dir}" 39 | targets+="${dir}/${f} " 40 | done 41 | 42 | 43 | # Symlinks to TEMPLATE & Kaldi 44 | for f in tts.sh cmd.sh path.sh db.sh utils; do 45 | target=../../TEMPLATE/tts1/"${f}" 46 | ln -sf "${target}" "${dir}" 47 | targets+="${dir}/${f} " 48 | done 49 | 50 | 51 | log "Created: ${targets}" 52 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/path.sh: -------------------------------------------------------------------------------- 1 | MAIN_ROOT=$(dirname "$(dirname "${PWD}")") 2 | export LC_ALL=C 3 | 4 | if [ -z "${PS1:-}" ]; then 5 | PS1=__dummy__ 6 | fi 7 | 8 | # Activate local virtual environment for development 9 | error_msg="Virtual environment not set up properly! Navigate to $MAIN_ROOT and run 'make clean all'" 10 | if [ -e $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh ] && [ -e $MAIN_ROOT/tools/conda.done ]; then 11 | VENV_NAME=$(cat "${MAIN_ROOT}/tools/conda.done") 12 | source $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh && conda deactivate 13 | if conda env list | (grep -q -E "${VENV_NAME}\s"); then 14 | conda activate "${VENV_NAME}" 15 | else 16 | echo "${error_msg}" && exit 1 17 | fi 18 | else 19 | echo "${error_msg}" && exit 1 20 | fi 21 | 22 | # Add binary scripts to the path, to allow them to be run easily 23 | export PATH=$MAIN_ROOT/speech_datasets/bin:$PATH 24 | export OMP_NUM_THREADS=1 25 | 26 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C 27 | export PYTHONIOENCODING=UTF-8 28 | 29 | # You need to change or unset NCCL_SOCKET_IFNAME according to your network environment 30 | # https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html#nccl-socket-ifname 31 | export NCCL_SOCKET_IFNAME="^lo,docker,virbr,vmnet,vboxnet" 32 | -------------------------------------------------------------------------------- /wsj/asr1/local/flist2scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # takes in a file list with lines like 19 | # /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 20 | # and outputs an scp in kaldi format with lines like 21 | # 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1 22 | # (the first thing is the utterance-id, which is the same as the basename of the file.)
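# Illustrative usage (input comes from stdin or file arguments): local/flist2scp.pl files.flist > wav.scp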
23 | 24 | 25 | while(<>){ 26 | m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_"; 27 | $id = $1; 28 | $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames) 29 | print "$id $_"; 30 | } 31 | 32 | -------------------------------------------------------------------------------- /utils/utt2spk_to_spk2utt.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # converts an utt2spk file to a spk2utt file. 18 | # Takes input from the stdin or from a file argument; 19 | # output goes to the standard out. 20 | 21 | if ( @ARGV > 1 ) { 22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; 23 | } 24 | 25 | while(<>){ 26 | @A = split(" ", $_); 27 | @A == 2 || die "Invalid line in utt2spk file: $_"; 28 | ($u,$s) = @A; 29 | if(!$seen_spk{$s}) { 30 | $seen_spk{$s} = 1; 31 | push @spklist, $s; 32 | } 33 | push (@{$spk_hash{$s}}, "$u"); 34 | } 35 | foreach $s (@spklist) { 36 | $l = join(' ',@{$spk_hash{$s}}); 37 | print "$s $l\n"; 38 | } 39 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/db.sh: -------------------------------------------------------------------------------- 1 | # We extract WSJ0_TGZ to WSJ0 and WSJ1_TGZ to WSJ1. Note that the actual data 2 | # is in WSJ0/csr_1_senn and WSJ1/csr_senn 3 | WSJ0_TGZ=/export/data/LDC/csr_1_senn_LDC93S6B.tgz 4 | WSJ1_TGZ=/export/data/LDC/csr_senn_LDC94S13B.tgz 5 | WSJ0=/workspace/LDC93S6B 6 | WSJ1=/workspace/LDC94S13B 7 | 8 | # Extract SWBD1_TGZ to SWBD1 9 | SWBD1_TGZ=/export/data/LDC/swb1_LDC97S62.tgz 10 | SWBD1=/workspace/LDC97S62 11 | 12 | # Filepath i of EVAL2000_TGZ extracts into directory i of EVAL2000. 13 | # First directory must contain the speech data, second directory must contain the transcripts. 
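# (EVAL2000_TGZ and EVAL2000 are whitespace-separated parallel lists; the data scripts index into them with `cut -d " " -f $i`, as in fisher/asr1/local/data.sh.)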
14 | EVAL2000_TGZ="/export/data/LDC/hub5e_00_LDC2002S09.tgz /export/data/LDC/LDC2002T43.tgz" 15 | EVAL2000="/workspace/LDC2002S09/hub5e_00 /workspace/LDC2002T43" 16 | 17 | # Extract RT03_TGZ to RT03 18 | RT03_TGZ=/export/data/LDC/rt_03_LDC2007S10.tgz 19 | RT03=/workspace/LDC2007S10/rt_03 20 | 21 | # filepath i of FISHER_TGZ extracts into directory i of FISHER 22 | # In this case, we extract LDC2004T19 and LDC2005T19 every time, but LDC2004S13 and LDC2005S13 are pre-extracted 23 | FISHER="/workspace/LDC2004T19 /workspace/LDC2005T19 /export/data/LDC/LDC2004S13 /export/data/LDC/LDC2005S13" 24 | FISHER_TGZ="/export/data/LDC/LDC2004T19/fe_03_p1_tran_LDC2004T19.tgz /export/data/LDC/LDC2005T19/LDC2005T19.tgz" 25 | 26 | LIBRISPEECH=/export/data/librispeech 27 | 28 | COMMONVOICE=/export/data/commonvoice 29 | -------------------------------------------------------------------------------- /wsj/asr1/local/wsj_format_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 4 | # 2015 Guoguo Chen 5 | # Apache 2.0 6 | 7 | # This script takes data prepared in a corpus-dependent way 8 | # in data/local/, and converts it into the "canonical" form, 9 | # in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, 10 | # data/train_si284, data/train_si84, etc. 11 | 12 | # Don't bother doing train_si84 separately (although we have the file lists 13 | # in data/local/) because it's just the first 7138 utterances in train_si284. 14 | # We'll create train_si84 after doing the feature extraction. 15 | 16 | lang_suffix= 17 | 18 | echo "$0 $@" # Print the command line for logging 19 | . ./path.sh || exit 1; 20 | . utils/parse_options.sh || exit 1; 21 | 22 | echo "Preparing train and test data" 23 | srcdir=data/local/data 24 | 25 | for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do 26 | mkdir -p data/$x 27 | cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; 28 | cp $srcdir/$x.txt data/$x/text || exit 1; 29 | cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; 30 | cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; 31 | utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; 32 | done 33 | 34 | echo "Succeeded in formatting data." 35 | -------------------------------------------------------------------------------- /utils/shuffle_list.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 
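# Illustrative usage: utils/shuffle_list.pl --srand 777 data/train/wav.scp > wav_shuffled.scp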
17 | 18 | 19 | if ($ARGV[0] eq "--srand") { 20 | $n = $ARGV[1]; 21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; 22 | srand($ARGV[1]); 23 | shift; 24 | shift; 25 | } else { 26 | srand(0); # Gives inconsistent behavior if we don't seed. 27 | } 28 | 29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we 30 | # don't understand. 31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; 32 | print "randomizes the order of lines of input.\n"; 33 | exit(1); 34 | } 35 | 36 | @lines = (); 37 | while (<>) { 38 | push @lines, [ (rand(), $_)] ; 39 | } 40 | 41 | @lines = sort { $a->[0] cmp $b->[0] } @lines; 42 | foreach $l (@lines) { 43 | print $l->[1]; 44 | } 45 | -------------------------------------------------------------------------------- /utils/compute_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Begin configuration section. 5 | nj=4 6 | cmd=utils/run.pl 7 | archive_format=hdf5 8 | cmvn_type=global 9 | spk2utt= 10 | # End configuration section. 11 | 12 | help_message=$(cat << EOF 13 | Usage: $0 [options] <feats-scp> <cmvn-ark> [<logdir>] 14 | e.g.: $0 data/train/feats.scp data/train/cmvn.ark data/train/logs 15 | Options: 16 | --nj <nj> # number of parallel jobs 17 | --cmd (utils/run.pl|utils/queue.pl <opts>) # how to run jobs. 18 | --archive_format <format> # Specify the format of feats file 19 | --cmvn-type <type> # cmvn_type (global or speaker or utterance) 20 | --spk2utt <file> # speaker -> utterance file 21 | EOF 22 | ) 23 | 24 | echo "$0 $*" 1>&2 # Print the command line for logging 25 | . path.sh || exit 1 26 | . utils/parse_options.sh || exit 1; 27 | 28 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then 29 | echo "${help_message}" 1>&2 30 | exit 1; 31 | fi 32 | 33 | scp=$1 34 | cmvnark=$2 35 | data=$(dirname ${scp}) 36 | if [ $# -eq 3 ]; then 37 | logdir=$3 38 | else 39 | logdir=${data}/logs 40 | fi 41 | mkdir -p ${logdir} 42 | 43 | split_scps= 44 | split_cmvn= 45 | for n in $(seq ${nj}); do 46 | split_cmvn+="${logdir}/cmvn.${n}.ark " 47 | split_scps+="${logdir}/feats.${n}.scp " 48 | done 49 | utils/split_scp.pl ${scp} ${split_scps} || exit 1 50 | 51 | 52 | maybe_spk2utt= 53 | if [ -n "${spk2utt}" ] && [ "${cmvn_type}" = speaker ]; then 54 | maybe_spk2utt="--spk2utt ${spk2utt}" 55 | fi 56 | 57 | ${cmd} JOB=1:${nj} ${logdir}/compute_cmvn_stats.JOB.log \ 58 | compute_cmvn_stats.py --filetype ${archive_format} ${maybe_spk2utt} \ 59 | --cmvn-type ${cmvn_type} "scp:${logdir}/feats.JOB.scp" "${logdir}/cmvn.JOB.ark" 60 | 61 | python3 -m speech_datasets.bin.combine_cmvn_stats --cmvn_type ${cmvn_type} \ 62 | --output_file ${cmvnark} ${split_cmvn} || exit 1 63 | 64 | rm -f ${split_scps} ${split_cmvn} 65 | -------------------------------------------------------------------------------- /utils/remove_dup_utts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Remove excess utterances once they appear more than a specified 4 | # number of times with the same transcription, in a data set. 5 | # E.g. useful for removing excess "uh-huh" from training. 6 | 7 | if [ $# != 3 ]; then 8 | echo "Usage: remove_dup_utts.sh max-count <srcdir> <destdir>" 9 | echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup" 10 | echo "This script is used to filter out utterances that have over-represented" 11 | echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of" 12 | echo "any given word-sequence to a specified value.
It's often used to get" 13 | echo "subsets for early stages of training." 14 | exit 1; 15 | fi 16 | 17 | maxcount=$1 18 | srcdir=$2 19 | destdir=$3 20 | mkdir -p $destdir 21 | 22 | [ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1; 23 | 24 | ! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1; 25 | 26 | ! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1; 27 | 28 | cp $srcdir/* $destdir 29 | cat $srcdir/text | \ 30 | perl -e ' 31 | $maxcount = shift @ARGV; 32 | @all = (); 33 | $p1 = 103349; $p2 = 71147; $k = 0; 34 | sub random { # our own random number generator: predictable. 35 | $k = ($k + $p1) % $p2; 36 | return ($k / $p2); 37 | } 38 | while(<>) { 39 | push @all, $_; 40 | @A = split(" ", $_); 41 | shift @A; 42 | $text = join(" ", @A); 43 | $count{$text} ++; 44 | } 45 | foreach $line (@all) { 46 | @A = split(" ", $line); 47 | shift @A; 48 | $text = join(" ", @A); 49 | $n = $count{$text}; 50 | if ($n < $maxcount || random() < ($maxcount / $n)) { 51 | print $line; 52 | } 53 | }' $maxcount >$destdir/text 54 | 55 | echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`" 56 | 57 | echo "Using fix_data_dir.sh to reconcile the other files." 58 | utils/fix_data_dir.sh $destdir 59 | rm -r $destdir/.backup 60 | 61 | exit 0 62 | -------------------------------------------------------------------------------- /swbd/asr1/local/map_acronyms_transcripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2015 Minhua Wu 4 | # Apache 2.0 5 | 6 | # convert acronyms in swbd transcript to fisher convention 7 | # accoring to first two columns in the input acronyms mapping 8 | 9 | import argparse, re 10 | __author__ = 'Minhua Wu' 11 | 12 | parser = argparse.ArgumentParser(description='format acronyms to a._b._c.') 13 | parser.add_argument('-i', '--input', help='Input transcripts', required=True) 14 | parser.add_argument('-o', '--output',help='Output transcripts', required=True) 15 | parser.add_argument('-M', '--Map', help='Input acronyms mapping', required=True) 16 | args = parser.parse_args() 17 | 18 | fin_map = open(args.Map, "r") 19 | dict_acronym = {} 20 | dict_acronym_noi = {} # Mapping of acronyms without I, i 21 | for pair in fin_map: 22 | items = pair.split('\t') 23 | dict_acronym[items[0]] = items[1] 24 | dict_acronym_noi[items[0]] = items[1] 25 | fin_map.close() 26 | del dict_acronym_noi['I'] 27 | del dict_acronym_noi['i'] 28 | 29 | 30 | fin_trans = open(args.input, "r") 31 | fout_trans = open(args.output, "w") 32 | for line in fin_trans: 33 | items = line.split() 34 | L = len(items) 35 | # First pass mapping to map I as part of acronym 36 | for i in range(L): 37 | if items[i] == 'I': 38 | x = 0 39 | while i-1-x >= 0 and re.match(r'^[A-Z]$', items[i-1-x]): 40 | x += 1 41 | 42 | y = 0 43 | while i+1+y < L and re.match(r'^[A-Z]$', items[i+1+y]): 44 | y += 1 45 | 46 | if x+y > 0: 47 | for bias in range(-x, y+1): 48 | items[i+bias] = dict_acronym[items[i+bias]] 49 | 50 | # Second pass mapping (not mapping 'i' and 'I') 51 | for i in range(len(items)): 52 | if items[i] in dict_acronym_noi.keys(): 53 | items[i] = dict_acronym_noi[items[i]] 54 | sentence = ' '.join(items[1:]) 55 | fout_trans.write(items[0] + ' ' + sentence.lower() + '\n') 56 | 57 | fin_trans.close() 58 | fout_trans.close() 59 | -------------------------------------------------------------------------------- /utils/feat_to_shape.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Begin configuration section. 5 | nj=4 6 | cmd=utils/run.pl 7 | verbose=0 8 | archive_format= 9 | preprocess_conf= 10 | # End configuration section. 11 | 12 | help_message=$(cat << EOF 13 | Usage: $0 [options] <input-scp> <output-scp> [<logdir>] 14 | e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/logs 15 | Options: 16 | --nj <nj> # number of parallel jobs 17 | --cmd (utils/run.pl|utils/queue.pl <opts>) # how to run jobs. 18 | --archive_format <format> # Specify the format of feats file 19 | --preprocess-conf <yaml> # Apply preprocessing to feats when creating shape.scp 20 | --verbose <level> # Default: 0 21 | EOF 22 | ) 23 | 24 | echo "$0 $*" 1>&2 # Print the command line for logging 25 | . path.sh || exit 1 26 | . utils/parse_options.sh || exit 1; 27 | 28 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then 29 | echo "${help_message}" 1>&2 30 | exit 1; 31 | fi 32 | 33 | scp=$1 34 | outscp=$2 35 | data=$(dirname ${scp}) 36 | if [ $# -eq 3 ]; then 37 | logdir=$3 38 | else 39 | logdir=${data}/logs 40 | fi 41 | mkdir -p ${logdir} 42 | 43 | nj=$((nj<$(<"${scp}" wc -l)?nj:$(<"${scp}" wc -l))) 44 | split_scps="" 45 | for n in $(seq ${nj}); do 46 | split_scps="${split_scps} ${logdir}/feats.${n}.scp" 47 | done 48 | 49 | utils/split_scp.pl ${scp} ${split_scps} 50 | 51 | if [ -n "${preprocess_conf}" ]; then 52 | preprocess_opt="--preprocess-conf ${preprocess_conf}" 53 | else 54 | preprocess_opt="" 55 | fi 56 | if [ -n "${archive_format}" ]; then 57 | filetype_opt="--filetype ${archive_format}" 58 | else 59 | filetype_opt="" 60 | fi 61 | 62 | ${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \ 63 | feat_to_shape.py --verbose ${verbose} ${preprocess_opt} ${filetype_opt} \ 64 | scp:${logdir}/feats.JOB.scp ${logdir}/shape.JOB.scp 65 | 66 | # concatenate the .scp files together. 67 | for n in $(seq ${nj}); do 68 | cat ${logdir}/shape.${n}.scp 69 | done > ${outscp} 70 | 71 | rm -f ${logdir}/feats.*.scp 2>/dev/null 72 | -------------------------------------------------------------------------------- /wsj/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | help_message=$(cat << EOF 12 | Usage: $0 13 | (No options) 14 | EOF 15 | ) 16 | 17 | if [ $# -ne 0 ]; then 18 | log "Error: invalid command line arguments" 19 | log "${help_message}" 20 | exit 1 21 | fi 22 | 23 | . ./path.sh || exit 1 24 | . ./db.sh || exit 1 25 | 26 | other_text=data/local/other_text/text 27 | nlsyms=data/nlsyms.txt 28 | 29 | # Extract WSJ0/WSJ1 raw data if needed 30 | WSJ=("${WSJ0}" "${WSJ1}") 31 | WSJ_TGZ=("${WSJ0_TGZ}" "${WSJ1_TGZ}") 32 | for (( i=0; i<2; i++ )); do 33 | echo ${WSJ[i]} 34 | if [ -z "${WSJ[i]}" ]; then 35 | log "Fill the value of 'WSJ${i}' in db.sh" 36 | exit 1 37 | elif [ ! -d "${WSJ[i]}" ]; then 38 | mkdir -p "${WSJ[i]}" 39 | { 40 | tar xzvf "${WSJ_TGZ[i]}" -C "${WSJ[i]}" 41 | } || { 42 | rm -rf "${WSJ[i]}" 43 | log "Failed to extract WSJ${i}" 44 | exit 1 45 | } 46 | fi 47 | done 48 | 49 | log "local/wsj_data_prep.sh ${WSJ0}/csr_1_senn/??-{?,??}.? ${WSJ1}/csr_senn/??-{?,??}.?" 50 | local/wsj_data_prep.sh "${WSJ0}"/csr_1_senn/??-{?,??}.? "${WSJ1}"/csr_senn/??-{?,??}.?
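# (The ??-{?,??}.? globs expand to the per-disk subdirectories of the WSJ distributions, e.g. 11-1.1 or 13-16.1.)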
51 | log "local/wsj_format_data.sh" 52 | local/wsj_format_data.sh 53 | 54 | log "Create the list of non-linguistic symbols: ${nlsyms}" 55 | cut -f 2- -d" " data/train_si284/text | tr " " "\n" | sort | uniq | grep "<" > ${nlsyms} 56 | cat ${nlsyms} 57 | 58 | log "Prepare text from lng_modl dir: ${WSJ1}/csr_senn/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z -> ${other_text}" 59 | mkdir -p "$(dirname ${other_text})" 60 | 61 | # NOTE(kamo): Give an utterance id to each text & make everything lowercase 62 | # Also remove utterances with non-linguistic symbols, i.e. lines including "<" 63 | zcat ${WSJ1}/csr_senn/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z | \ 64 | grep -v "<" | tr "[:upper:]" "[:lower:]" | \ 65 | awk '{ printf("{wsj}lng_%07d %s\n",NR,$0) } ' > ${other_text} 66 | -------------------------------------------------------------------------------- /COMBINE/asr1/combine_train_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | log() { 4 | local fname=${BASH_SOURCE[1]##*/} 5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 6 | } 7 | 8 | help_message="Usage: $0 [asr.sh options] <dataset1>/<split1> <dataset2>/<split2> <dataset3>/<split3> ..." 9 | 10 | log "$0 $*" 11 | if [ $# -eq 0 ]; then 12 | log "$help_message" 13 | log "Error: at least 1 argument required" 14 | exit 2 15 | fi 16 | 17 | kwargs=() 18 | stage=2 19 | stop_stage=5 20 | while true; do 21 | case "$1" in 22 | --stage) 23 | if [ "$2" -lt 2 ]; then 24 | log "Specify --stage 2 or higher (got --stage $2)." 25 | log "We expect stage 1 to be complete for all datasets given." 26 | exit 2 27 | else 28 | stage=$2 29 | fi 30 | shift 2 31 | ;; 32 | --stop-stage|--stop_stage) 33 | if [ "$2" -gt 5 ]; then 34 | log "Specify --stop-stage 5 or lower (got --stop-stage $2)." 35 | log "Use combine_cmvn_stats.sh to combine CMVN statistics from multiple datasets (stage 5)." 36 | log "Use multi_tokenize.sh to obtain token inventories from multiple datasets (stages 6-7)." 37 | exit 2 38 | else 39 | stop_stage=$2 40 | fi 41 | shift 2 42 | ;; 43 | --*) kwargs+=( "$1" "$2" ); shift 2; ;; 44 | *) break; 45 | esac 46 | done 47 | kwargs+=( --stage "$stage" --stop_stage "$stop_stage" ) 48 | 49 | if [ $# -eq 0 ]; then 50 | log "${help_message}" 51 | log "Error: Please specify dataset splits as positional arguments." 52 | exit 2 53 | fi 54 | 55 | task=$(basename "$(utils/make_absolute.sh "$PWD")") 56 | idx=$(python local/combine_datasets.py --task "${task//1/}" --write_dir true "$@") 57 | datadir="data/${idx}" 58 | for f in wav.scp segments utt2spk text; do 59 | sort "${datadir}/${f}" > "${datadir}/${f}.tmp" 60 | mv "${datadir}/${f}.tmp" "${datadir}/${f}" 61 | done 62 | ./run.sh "${kwargs[@]}" --train_sets "${idx}" 63 | -------------------------------------------------------------------------------- /swbd/asr1/local/swbd1_map_words.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern 4 | # matches case-insensitive --Arnab (Jan 2013) 5 | 6 | if ($ARGV[0] eq "-f") { 7 | shift @ARGV; 8 | $field_spec = shift @ARGV; 9 | if ($field_spec =~ m/^\d+$/) { 10 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 11 | } 12 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 13 | if ($1 ne "") { 14 | $field_begin = $1 - 1; # Change to zero-based indexing.
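# (e.g. "-f 2-" gives $field_begin = 1 and leaves $field_end unset, so every field from the second onward is mapped and field 1, the utterance id, is left alone.)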
15 | } 16 | if ($2 ne "") { 17 | $field_end = $2 - 1; # Change to zero-based indexing. 18 | } 19 | } 20 | if (!defined $field_begin && !defined $field_end) { 21 | die "Bad argument to -f option: $field_spec"; 22 | } 23 | } 24 | 25 | 26 | while (<>) { 27 | @A = split(" ", $_); 28 | for ($n = 0; $n < @A; $n++) { 29 | $a = $A[$n]; 30 | if ( (!defined $field_begin || $n >= $field_begin) 31 | && (!defined $field_end || $n <= $field_end)) { 32 | # e.g. [LAUGHTER-STORY] -> STORY; 33 | $a =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i; 34 | # $1 and $3 relate to preserving trailing "-" 35 | $a =~ s:^\[(.+)/.+\](|\-)$:$1$2:; # e.g. [IT'N/ISN'T] -> IT'N ... note, 36 | # 1st part may include partial-word stuff, which we process further below, 37 | # e.g. [LEM[GUINI]-/LINGUINI] 38 | # the (|\_) at the end is to accept and preserve trailing -'s. 39 | $a =~ s:^(|\-)\[[^][]+\](.+)$:-$2:; # e.g. -[AN]Y , note \047 is quote; 40 | # let the leading - be optional on input, as sometimes omitted. 41 | $a =~ s:^(.+)\[[^][]+\](|\-)$:$1-:; # e.g. AB[SOLUTE]- -> AB-; 42 | # let the trailing - be optional on input, as sometimes omitted. 43 | $a =~ s:([^][]+)\[.+\]$:$1:; # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX- 44 | # which is a mistake in the input. 45 | $a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM 46 | $a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT- 47 | $a =~ s:_\d$::; # e.g. THEM_1 -> THEM 48 | } 49 | $A[$n] = $a; 50 | } 51 | print join(" ", @A) . "\n"; 52 | } 53 | -------------------------------------------------------------------------------- /COMBINE/asr1/multi_tokenize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Tokenization related options from asr.sh 14 | token_type=bpe # Tokenization type (char or bpe). 15 | n_tokens=2000 # The size of the BPE vocabulary. 16 | nlsyms="" # non-linguistic symbols list, separated by a comma 17 | 18 | help_message=$(cat <<EOF 19 | Usage: $0 <dataset1> <dataset2> ... 20 | 21 | Produces a token inventory of the given type for all the datasets provided. 22 | 23 | Options: 24 | --token_type # Tokenization type (char or bpe, default="${token_type}"). 25 | --n_tokens # The maximum number of tokens allowed (default="${n_tokens}"). 26 | --nlsyms # Non-linguistic symbol list for BPE/char, separated by a comma. (default="${nlsyms}"). 27 | EOF 28 | ) 29 | 30 | . ./path.sh || exit 1 31 | . ./cmd.sh || exit 1 32 | 33 | log "$0 $*" 34 | . utils/parse_options.sh || exit 1 35 | if [ $# -eq 0 ]; then 36 | log "${help_message}" 37 | log "Error: Please specify datasets as positional arguments." 38 | exit 2 39 | fi 40 | 41 | workspace=$PWD 42 | task=$(basename "$(utils/make_absolute.sh "$workspace")") 43 | run_args="--token-type ${token_type} --n_tokens ${n_tokens} --nlsyms ${nlsyms} " 44 | 45 | # Compile srctexts from all the relevant datasets 46 | srctexts= 47 | for dset in "$@"; do 48 | log "Concatenating all source texts from dataset $dset..."
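    # Stage 6 of each dataset's run.sh dumps that dataset's training text to ${dset_dir}/dump/srctexts; stage 7 is then run once below on the concatenation of all of them.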
49 | dset_dir="${MAIN_ROOT}/${dset}/${task}" 50 | cd ${dset_dir} 51 | ./run.sh --stage 6 --stop-stage 6 ${run_args} 52 | cd ${workspace} 53 | srctexts+="${dset_dir}/dump/srctexts " 54 | echo "" 55 | done 56 | 57 | # Concatenate all the relevant text data & prepare a token inventory 58 | log "Concatenating all source texts from all datasets..." 59 | mkdir -p dump data 60 | cat $srctexts > dump/srctexts 61 | ./run.sh --stage 7 --stop-stage 7 ${run_args} 62 | 63 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 14 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 15 | 16 | . ./path.sh || exit 1; 17 | . ./cmd.sh || exit 1; 18 | . ./db.sh || exit 1; 19 | 20 | # general configuration 21 | SECONDS=0 22 | lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru 23 | # base url for downloads. 24 | data_url=https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/$lang.tar.gz 25 | 26 | train_set=valid_train_${lang} 27 | train_dev=valid_dev_${lang} 28 | test_set=valid_test_${lang} 29 | 30 | # Ensure that COMMONVOICE data has already been extracted 31 | if [ -z "${COMMONVOICE}" ]; then 32 | log "Fill the value of 'COMMONVOICE' in db.sh" 33 | exit 1 34 | fi 35 | log "Downloading commonvoice dataset" 36 | mkdir -p "${COMMONVOICE}" 37 | local/download_and_untar.sh "${COMMONVOICE}" "${data_url}" "${lang}.tar.gz" 38 | 39 | log "Preparing data for commonvoice" 40 | ### Task dependent. You have to do the following data preparation yourself, 41 | ### but you can utilize Kaldi recipes in most cases. 42 | for part in "validated"; do 43 | # use underscore-separated names in data directories. 44 | local/data_prep.pl "${COMMONVOICE}" ${part} data/"$(echo "${part}_${lang}" | tr - _)" 45 | done 46 | 47 | # Kaldi Version Split 48 | # utils/subset_data_dir_tr_cv.sh data/validated data/valid_train data/valid_test_dev 49 | # utils/subset_data_dir_tr_cv.sh --cv-spk-percent 50 data/valid_test_dev data/valid_test data/valid_dev 50 | 51 | # ESPNet Version (same as voxforge) 52 | # handles duplicated sentences (does not take speaker splits into account): 53 | # filter sentences whose text also appears in the test & dev sets out of the validated set 54 | local/split_tr_dt_et.sh data/validated_${lang} data/${train_set} data/${train_dev} data/${test_set} 55 | 56 | log "Successfully finished.
[elapsed=${SECONDS}s]" 57 | -------------------------------------------------------------------------------- /speech_datasets/utils/misc.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | from os.path import abspath, dirname 4 | import torch 5 | 6 | 7 | def get_root(): 8 | """This file is ROOT/speech_datasets/utils/misc.py, so return ROOT.""" 9 | return dirname(dirname(dirname(abspath(__file__)))) 10 | 11 | 12 | def check_kwargs(func, kwargs, name=None): 13 | """Check that kwargs are valid for func. 14 | 15 | If kwargs are invalid, raise a TypeError, just as Python does by default. 16 | :param function func: function to be validated 17 | :param dict kwargs: keyword arguments for func 18 | :param str name: name used in TypeError (default is func name) 19 | """ 20 | try: 21 | params = inspect.signature(func).parameters 22 | except ValueError: 23 | return 24 | if name is None: 25 | name = func.__name__ 26 | for k in kwargs.keys(): 27 | if k not in params: 28 | raise TypeError(f"{name}() got an unexpected keyword argument '{k}'") 29 | 30 | 31 | def dynamic_import(import_path, alias=None): 32 | """dynamic import module and class 33 | 34 | :param str import_path: syntax 'module_name:class_name' 35 | e.g., 'speech_datasets.transform.add_deltas:AddDeltas' 36 | :param dict alias: shortcut for registered class 37 | :return: imported class 38 | """ 39 | alias = dict() if alias is None else alias 40 | if import_path not in alias and ":" not in import_path: 41 | raise ValueError( 42 | "import_path should be one of {} or " 43 | 'include ":", e.g. "speech_datasets.transform.add_deltas:AddDeltas" : ' 44 | "{}".format(set(alias), import_path) 45 | ) 46 | if ":" not in import_path: 47 | import_path = alias[import_path] 48 | 49 | module_name, objname = import_path.split(":") 50 | m = importlib.import_module(module_name) 51 | return getattr(m, objname) 52 | 53 | 54 | def set_deterministic_pytorch(seed, cudnn_deterministic=True): 55 | """Ensures pytorch produces deterministic results based on the seed.""" 56 | # See https://github.com/pytorch/pytorch/issues/6351 about cudnn.benchmark 57 | torch.manual_seed(seed) 58 | torch.backends.cudnn.deterministic = cudnn_deterministic 59 | torch.backends.cudnn.benchmark = (not cudnn_deterministic) 60 | -------------------------------------------------------------------------------- /wsj/asr1/local/find_transcripts.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | 19 | # This program takes on its standard input a list of utterance 20 | # id's, one for each line. (e.g. 4k0c030a is an utterance id). 21 | # It takes as its command-line argument a file listing the dot files, and 22 | # extracts from the dot files the transcripts for a given 23 | # dataset (represented by a file list).
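# Illustrative example: "echo 4k0c030a | find_transcripts.pl dot_files.flist" looks up utterance 4k0c030a in the dot file for speaker 4k0c03 and prints "4k0c030a <its transcript>".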
24 | # 25 | 26 | @ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts"; 27 | $dot_flist = shift @ARGV; 28 | 29 | open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n"; 30 | while(<L>){ 31 | chop; 32 | m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_"; 33 | $spk = $1; 34 | $spk2dot{$spk} = $_; 35 | } 36 | 37 | 38 | 39 | while(<STDIN>){ 40 | chop; 41 | $uttid = $_; 42 | $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_"; 43 | $spk = $1; 44 | if($spk ne $curspk) { 45 | %utt2trans = (); # Don't keep all the transcripts in memory... 46 | $curspk = $spk; 47 | $dotfile = $spk2dot{$spk}; 48 | defined $dotfile || die "No dot file for speaker $spk\n"; 49 | open(F, "<$dotfile") || die "Error opening dot file $dotfile\n"; 50 | while(<F>) { 51 | $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n"; 52 | $trans = $1; 53 | $utt = $2; 54 | $utt2trans{$utt} = $trans; 55 | } 56 | } 57 | if(!defined $utt2trans{$uttid}) { 58 | print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n"; 59 | } else { 60 | print "$uttid $utt2trans{$uttid}\n"; 61 | } 62 | } 63 | 64 | 65 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Example Code 2 | This directory provides an example which trains a transformer encoder-decoder model on the 3 | `train-clean-100` and `train-clean-360` splits of LibriSpeech, and evaluates the model's 4 | performance on the `dev-clean` split. 5 | 6 | In order to run this example, you must first prepare an environment and install the `speech_datasets` package, 7 | as detailed [here](../README.md#environment-setup). Next, navigate to [librispeech/asr1](../librispeech/asr1) and 8 | invoke 9 | ```shell script 10 | ./run.sh --stage 1 --stop_stage 4 --feats_type <feats_type> 11 | ``` 12 | This will download, prepare, and extract the relevant features for LibriSpeech, and make the dataset usable with 13 | the `speech_datasets` package. Note that this step will take a long time! 14 | 15 | Next, you should navigate to this directory and activate the conda environment by invoking 16 | ``` 17 | source ../tools/venv/bin/activate && conda deactivate && conda activate <venv_name> 18 | ``` 19 | (where `<venv_name>` is the name of the conda virtual environment, `datasets` by default if you did not specify it 20 | when setting up your environment as described [here](../README.md#environment-setup)). Now, you can run 21 | [`main.py`](main.py). If you dumped `--feats_type raw`, then you can run 22 | ``` 23 | python main.py --feats_type <feats_type> 24 | ``` 25 | If you instead dumped `--feats_type fbank` or `--feats_type fbank_pitch`, you can instead run 26 | ``` 27 | python main.py --feats_type <feats_type> --precomputed_feats 28 | ``` 29 | 30 | The `feats_type` argument to `main.py` will specify whether to use the feature computation configuration 31 | [`fbank.yaml`](resources/fbank.yaml) or [`fbank_pitch.yaml`](resources/fbank_pitch.yaml). 32 | Both compute 80-dimensional filterbank features (optionally pitch as well), apply the appropriate cepstral 33 | mean/variance normalization (using the statistics pre-computed in 34 | [`global_cmvn_fbank.ark`](resources/global_cmvn_fbank.ark) or 35 | [`global_cmvn_fbank_pitch.ark`](resources/global_cmvn_fbank_pitch.ark)), and apply spectral augmentation.
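As a rough sketch of what this preprocessing amounts to (this is an illustration, not the actual `main.py`; it assumes a `Transformation` built from one of these configs can be called directly on a feature matrix, the same way `feat_to_shape.py` passes one as a `transform`):

```python
import numpy as np
from speech_datasets.transform import Transformation

# Build the preprocessing pipeline (CMVN, spectral augmentation, etc.) from the
# same YAML config that main.py selects via its feats_type argument.
preprocess = Transformation("resources/fbank.yaml")

feats = np.random.randn(300, 80).astype(np.float32)  # stand-in for a real utterance
processed = preprocess(feats)
print(processed.shape)
```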
36 | 37 | In this example, the data loader will tokenize the text using the provided sentencepiece model 38 | [`librispeech_bpe2000.model`](resources/librispeech_bpe2000.model). See the `main()` function of 39 | [`main.py`](main.py) for a full example. 40 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands' 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 14 | # 2017 Luminar Technologies, Inc. (author: Daniel Galvez) 15 | # 2017 Ewald Enzinger 16 | # Apache 2.0 17 | 18 | # Adapted from egs/mini_librispeech/s5/local/download_and_untar.sh (commit 1cd6d2ac3a935009fdc4184cb8a72ddad98fe7d9) 19 | 20 | remove_archive=false 21 | 22 | if [ "${1:-}" == --remove-archive ]; then 23 | remove_archive=true 24 | shift 25 | fi 26 | 27 | if [ $# -ne 3 ]; then 28 | log "Usage: $0 [--remove-archive] <data-dir> <url> <filename>" 29 | log "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz cv_corpus_v1.tar.gz" 30 | log "With --remove-archive it will remove the archive after successfully un-tarring it." 31 | exit 1 32 | fi 33 | 34 | data=$1 35 | url=$2 36 | filename=$3 37 | filepath="$data/$filename" 38 | workspace=$PWD 39 | 40 | if [ ! -d "$data" ]; then 41 | log "$0: no such directory $data" 42 | exit 1; 43 | fi 44 | 45 | if [ -z "$url" ]; then 46 | log "$0: empty URL." 47 | exit 1; 48 | fi 49 | 50 | if [ -f $data/$filename.complete ]; then 51 | log "$0: data was already successfully extracted, nothing to do." 52 | exit 0; 53 | fi 54 | 55 | if [ ! -f $filepath ]; then 56 | if ! which wget >/dev/null; then 57 | log "$0: wget is not installed." 58 | exit 1; 59 | fi 60 | log "$0: downloading data from $url. This may take some time, please be patient." 61 | 62 | if ! wget --no-check-certificate $url -O $filepath; then 63 | log "$0: error executing wget $url" 64 | rm -f $filepath 65 | exit 1; 66 | fi 67 | fi 68 | 69 | cd $data 70 | if ! tar -xzvf $filename; then 71 | log "$0: error un-tarring archive $filepath" 72 | exit 1; 73 | fi 74 | cd $workspace 75 | 76 | touch $data/$filename.complete 77 | 78 | log "$0: Successfully downloaded and un-tarred $filepath" 79 | 80 | if $remove_archive; then 81 | log "$0: removing $filepath file since --remove-archive option was supplied." 82 | rm $filepath 83 | fi 84 | -------------------------------------------------------------------------------- /utils/subset_data_dir_tr_cv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2017 Brno University of Technology (Author: Karel Vesely); 4 | # Apache 2.0 5 | 6 | # This script splits a 'data' directory into two parts: 7 | # - training set with 90% of speakers 8 | # - held-out set with 10% of speakers (cv) 9 | # (to be used in frame cross-entropy training of 'nnet1' models), 10 | 11 | # The script also accepts a list of held-out set speakers by '--cv-spk-list' 12 | # (with perturbed data, we pass the list of speakers externally). 13 | # The remaining set of speakers is the training set.
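# Illustrative usage: utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 data/train data/train_trn data/train_cv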
14 | 15 | cv_spk_percent=10 16 | cv_spk_list= # To be used with perturbed data, 17 | seed=777 18 | cv_utt_percent= # ignored (compatibility), 19 | . utils/parse_options.sh 20 | 21 | if [ $# != 3 ]; then 22 | echo "Usage: $0 [opts] <src-data-dir> <trn-data-dir> <cv-data-dir>" 23 | echo " --cv-spk-percent N (default 10)" 24 | echo " --cv-spk-list <file> (a pre-defined list with cv speakers)" 25 | exit 1; 26 | fi 27 | 28 | set -euo pipefail 29 | 30 | src_data=$1 31 | trn_data=$2 32 | cv_data=$3 33 | 34 | [ ! -r $src_data/spk2utt ] && echo "Missing '$src_data/spk2utt'. Error!" && exit 1 35 | 36 | tmp=$(mktemp -d /tmp/${USER}_XXXXX) 37 | 38 | if [ -z "$cv_spk_list" ]; then 39 | # Select 'cv_spk_percent' speakers randomly, 40 | cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers 41 | n_spk=$(wc -l <$tmp/speakers) 42 | n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ") 43 | # 44 | head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv 45 | tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn 46 | else 47 | # Use pre-defined list of speakers, 48 | cp $cv_spk_list $tmp/speakers_cv 49 | join -v2 <(sort $cv_spk_list) <(awk '{ print $1; }' <$src_data/spk2utt | sort) >$tmp/speakers_trn 50 | fi 51 | 52 | # Sanity checks, 53 | n_spk=$(wc -l <$src_data/spk2utt) 54 | echo "Speakers, src=$n_spk, trn=$(wc -l <$tmp/speakers_trn), cv=$(wc -l <$tmp/speakers_cv)" 55 | overlap=$(join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | wc -l) 56 | [ $overlap != 0 ] && \ 57 | echo "WARNING, speaker overlap detected!" && \ 58 | join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | head && \ 59 | echo '...' 60 | 61 | # Create new data dirs, 62 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_trn $src_data $trn_data 63 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_cv $src_data $cv_data 64 | 65 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Use shell /bin/bash instead of /bin/sh so the source command can be used 2 | SHELL := /bin/bash 3 | # Use the default conda unless a specific install is specified. If there is 4 | # no conda, we will download a fresh one and use it to set up the virtual env. 5 | CONDA := 6 | VENV_NAME := datasets 7 | # The python version installed in the conda setup 8 | PYTHON_VERSION := 3.7.9 9 | # PyTorch version: 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1 (>= 1.2.0 required) 10 | # 1.5.0 and later do not work with PyKaldi... 11 | TORCH_VERSION := 1.4.0 12 | 13 | ifeq ($(CONDA),) 14 | CONDA := $(shell which conda) 15 | endif 16 | ifeq ($(TORCH_VERSION),) 17 | pytorch := pytorch 18 | else 19 | pytorch := pytorch=$(TORCH_VERSION) 20 | endif 21 | 22 | ifneq ($(shell which nvidia-smi),) # 'nvidia-smi' found 23 | CUDA_VERSION := $(shell nvcc --version | grep "release" | sed -E "s/.*release ([0-9.]*).*/\1/") 24 | CONDA_PYTORCH := $(pytorch) cudatoolkit=$(CUDA_VERSION) -c pytorch 25 | else 26 | CUDA_VERSION := 27 | CONDA_PYTORCH := $(pytorch) cpuonly -c pytorch 28 | endif 29 | # Install CPU version of PyKaldi, so we can run feature extraction on CPU while training on GPU 30 | CONDA_PYKALDI := -c pykaldi pykaldi-cpu 31 | 32 | .PHONY: all clean 33 | 34 | all: conda sph2pipe check_install example 35 | 36 | tools/conda.done: 37 | # Only install PyTorch if the PyTorch version is non-empty 38 | tools/install_anaconda.sh $(PYTHON_VERSION) "$(CONDA)" tools/venv $(VENV_NAME) .
"$(CONDA_PYTORCH)" "$(CONDA_PYKALDI)" 39 | @echo $(VENV_NAME) > tools/conda.done 40 | 41 | conda: tools/conda.done 42 | 43 | tools/sph2pipe.done: 44 | tools/install_sph2pipe.sh tools 45 | touch tools/sph2pipe.done 46 | 47 | sph2pipe: tools/sph2pipe.done 48 | 49 | check_install: conda 50 | ifneq ($(strip $(CUDA_VERSION)),) 51 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && python tools/check_install.py 52 | else 53 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && python tools/check_install.py --no-cuda 54 | endif 55 | 56 | example: conda 57 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && pip install -r example/requirements.txt 58 | 59 | clean: clean_conda 60 | rm -rf tools/*.done 61 | 62 | clean_conda: 63 | rm -rf *.egg-info 64 | rm -rf tools/venv 65 | rm -f tools/miniconda.sh 66 | find . -iname "*.pyc" -delete 67 | -------------------------------------------------------------------------------- /speech_datasets/bin/feat_to_shape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import logging 4 | import sys 5 | 6 | from speech_datasets.transform import Transformation 7 | from speech_datasets.utils.readers import file_reader_helper 8 | from speech_datasets.utils.io_utils import get_commandline_args, strtobool 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def get_parser(): 14 | parser = argparse.ArgumentParser( 15 | description="convert feature to its shape", 16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 17 | ) 18 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") 19 | parser.add_argument("--filetype", type=str, default="hdf5", choices=["mat", "hdf5", "sound"], 20 | help="Specify the file format for the rspecifier.") 21 | parser.add_argument("--preprocess-conf", type=str, default=None, 22 | help="The configuration file for the pre-processing") 23 | parser.add_argument("--mem-mapped", type=strtobool, default=False, 24 | help="Whether to use memory-mapped data loaders (where available)") 25 | parser.add_argument("rspecifier", type=str, 26 | help="Read specifier for feats. e.g. ark:some.ark") 27 | parser.add_argument("out", nargs="?", type=argparse.FileType("w"), default=sys.stdout, 28 | help="The output filename. 
" "If omitted, then output to sys.stdout") 29 | return parser 30 | 31 | 32 | def main(): 33 | parser = get_parser() 34 | args = parser.parse_args() 35 | 36 | # logging info 37 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 38 | if args.verbose > 0: 39 | logging.basicConfig(level=logging.INFO, format=logfmt) 40 | else: 41 | logging.basicConfig(level=logging.WARN, format=logfmt) 42 | logger.info(get_commandline_args()) 43 | 44 | if args.preprocess_conf is not None: 45 | preprocessing = Transformation(args.preprocess_conf) 46 | logger.info("Apply preprocessing: {}".format(preprocessing)) 47 | else: 48 | preprocessing = None 49 | 50 | for utt, shape in file_reader_helper( 51 | args.rspecifier, args.filetype, return_shape=True, transform=preprocessing): 52 | shape_str = ",".join(map(str, shape)) # shape is a tuple of ints 53 | args.out.write("{} {}\n".format(utt, shape_str)) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/reduce_data_dir.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # koried, 10/29/2012 4 | 5 | # Reduce a data set based on a list of turn-ids 6 | 7 | help_message="usage: $0 srcdir turnlist destdir" 8 | 9 | if [ $1 == "--help" ]; then 10 | echo "${help_message}" 11 | exit 0; 12 | fi 13 | 14 | if [ $# != 3 ]; then 15 | echo "${help_message}" 16 | exit 1; 17 | fi 18 | 19 | srcdir=$1 20 | reclist=$2 21 | destdir=$3 22 | 23 | if [ ! -f ${srcdir}/utt2spk ]; then 24 | echo "$0: no such file $srcdir/utt2spk" 25 | exit 1; 26 | fi 27 | 28 | function do_filtering { 29 | # assumes the utt2spk and spk2utt files already exist. 30 | [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp 31 | [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp 32 | [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text 33 | [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames 34 | [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender 35 | [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp 36 | if [ -f ${srcdir}/segments ]; then 37 | utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments 38 | awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings. 39 | # The next line would override the command above for wav.scp, which would be incorrect. 
40 | [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp 41 | [ -f ${srcdir}/reco2file_and_channel ] && \ 42 | utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel 43 | 44 | # Filter the STM file for proper sclite scoring (this will also remove the comment lines) 45 | [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm 46 | rm ${destdir}/reco 47 | fi 48 | srcutts=$(wc -l < ${srcdir}/utt2spk) 49 | destutts=$(wc -l < ${destdir}/utt2spk) 50 | echo "Reduced #utt from $srcutts to $destutts" 51 | } 52 | 53 | mkdir -p ${destdir} 54 | 55 | # filter the utt2spk based on the set of recordings 56 | utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk 57 | 58 | utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt 59 | do_filtering; 60 | -------------------------------------------------------------------------------- /speech_datasets/transform/interface.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | import inspect 3 | 4 | from speech_datasets.utils import check_kwargs 5 | 6 | 7 | class TransformInterface(object): 8 | """Transform Interface""" 9 | 10 | @abstractmethod 11 | def __call__(self, x): 12 | raise NotImplementedError("__call__ method is not implemented") 13 | 14 | @classmethod 15 | def add_arguments(cls, parser): 16 | return parser 17 | 18 | def __repr__(self): 19 | return self.__class__.__name__ + "()" 20 | 21 | 22 | class FuncTrans(TransformInterface): 23 | """Functional Transformation 24 | 25 | WARNING: 26 | Builtin or C/C++ functions may not work properly 27 | because this class heavily depends on the `inspect` module. 28 | 29 | Usage: 30 | 31 | >>> def foo_bar(x, a=1, b=2): 32 | ... '''Foo bar 33 | ... :param x: input 34 | ... :param int a: default 1 35 | ... :param int b: default 2 36 | ... ''' 37 | ... return x + a - b 38 | 39 | 40 | >>> class FooBar(FuncTrans): 41 | ... _func = foo_bar 42 | ... __doc__ = foo_bar.__doc__ 43 | """ 44 | 45 | _func = None 46 | 47 | def __init__(self, **kwargs): 48 | self.kwargs = kwargs 49 | check_kwargs(self.func, kwargs) 50 | 51 | def __call__(self, x): 52 | return self.func(x, **self.kwargs) 53 | 54 | @classmethod 55 | def add_arguments(cls, parser): 56 | fname = cls._func.__name__.replace("_", "-") 57 | group = parser.add_argument_group(fname + " transformation setting") 58 | for k, v in cls.default_params().items(): 59 | # TODO(karita): get help and choices from docstring?
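            # e.g. for AddDeltas (_func add_deltas), the "window" parameter is registered as the command-line flag --add-deltas-window.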
60 | attr = k.replace("_", "-") 61 | group.add_argument(f"--{fname}-{attr}", default=v, type=type(v)) 62 | return parser 63 | 64 | @property 65 | def func(self): 66 | return type(self)._func 67 | 68 | @classmethod 69 | def default_params(cls): 70 | try: 71 | d = dict(inspect.signature(cls._func).parameters) 72 | except ValueError: 73 | d = dict() 74 | return { 75 | k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty 76 | } 77 | 78 | def __repr__(self): 79 | params = self.default_params() 80 | params.update(**self.kwargs) 81 | ret = self.__class__.__name__ + "(" 82 | if len(params) == 0: 83 | return ret + ")" 84 | for k, v in params.items(): 85 | ret += "{}={}, ".format(k, v) 86 | return ret[:-2] + ")" 87 | -------------------------------------------------------------------------------- /utils/apply_cmvn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | log() { 4 | local fname=${BASH_SOURCE[1]##*/} 5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 6 | } 7 | 8 | # Copyright 2017 Nagoya University (Tomoki Hayashi) 9 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 10 | 11 | echo "$0 $*" # Print the command line for logging 12 | 13 | cmd=utils/run.pl 14 | nj=$(nproc) 15 | filetype='hdf5' # mat or hdf5 16 | cmvn_type='global' # global or utterance or speaker 17 | help_message="Usage: $0 <feats-scp> <logdir> <dumpdir>" 18 | 19 | . ./path.sh || exit 1 20 | . utils/parse_options.sh || exit 1 21 | 22 | if [ $# != 3 ]; then 23 | log "${help_message}" 24 | exit 2 25 | fi 26 | 27 | scp=$1 28 | logdir=$2 29 | dumpdir=$(utils/make_absolute.sh $3) 30 | 31 | if [ ${filetype} = mat ]; then 32 | ext=ark 33 | elif [ ${filetype} = hdf5 ]; then 34 | ext=h5 35 | else 36 | log "Received --filetype '${filetype}', but only 'mat' and 'hdf5' are valid" 37 | exit 2 38 | fi 39 | 40 | if [ ${cmvn_type} != global ] && [ ${cmvn_type} != utterance ] && [ ${cmvn_type} != speaker ]; then 41 | log "Received --cmvn_type '${cmvn_type}', but only 'global', 'utterance', and 'speaker' are valid"; exit 2 42 | fi 43 | 44 | srcdir=$(dirname "$scp") 45 | cmvnark=$srcdir/cmvn.ark 46 | maybe_utt2spk= 47 | if [ -f $srcdir/utt2spk ]; then 48 | maybe_utt2spk+="--utt2spk $srcdir/utt2spk " 49 | fi 50 | maybe_spk2utt= 51 | if [ -f $srcdir/spk2utt ]; then 52 | maybe_spk2utt+="--spk2utt $srcdir/spk2utt " 53 | fi 54 | 55 | mkdir -p ${logdir} 56 | mkdir -p ${dumpdir} 57 | 58 | # compute CMVN stats 59 | python -m speech_datasets.bin.compute_cmvn_stats \ 60 | --in-filetype ${filetype} ${maybe_spk2utt} \ 61 | --cmvn-type ${cmvn_type} scp:${scp} ${cmvnark} 62 | 63 | echo $cmvn_type > $srcdir/cmvn_type 64 | 65 | # split scp file 66 | split_scps="" 67 | for n in $(seq ${nj}); do 68 | split_scps="$split_scps $logdir/feats.$n.scp" 69 | done 70 | 71 | utils/split_scp.pl ${scp} ${split_scps} || exit 1; 72 | 73 | # apply CMVN to features & dump them 74 | ${cmd} JOB=1:${nj} ${logdir}/apply_cmvn.JOB.log \ 75 | apply_cmvn.py --norm-vars true --in-filetype ${filetype} --out-filetype ${filetype} \ 76 | --cmvn-type ${cmvn_type} ${maybe_utt2spk} ${cmvnark} scp:${logdir}/feats.JOB.scp \ 77 | ark,scp:${dumpdir}/feats.JOB.${ext},${dumpdir}/feats.JOB.scp \ 78 | || exit 1 79 | 80 | # concatenate scp files 81 | for n in $(seq ${nj}); do 82 | cat ${dumpdir}/feats.${n}.scp || exit 1; 83 | done > ${dumpdir}/feats.scp || exit 1 84 | 85 | # remove temp scps 86 | rm ${dumpdir}/feats.*.scp 2>/dev/null 87 | rm ${logdir}/feats.*.scp 2>/dev/null 88 | log "Succeeded in
89 | -------------------------------------------------------------------------------- /swbd/asr1/local/swbd1_fix_speakerid.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | # Author: Peng Qi (pengqi@cs.stanford.edu) 5 | # This script maps Switchboard speaker IDs to the true physical speakers 6 | # and fixes the utterance IDs accordingly. Expected to be run from one 7 | # directory level above. 8 | 9 | sub trim { 10 | (my $s = $_[0]) =~ s/^\s+|\s+$//g; 11 | return $s; 12 | } 13 | 14 | if ($#ARGV != 1) { 15 | print "Usage: swbd1_fix_speakerid.pl <conv.tab> <data-dir>\n"; 16 | print "E.g.: swbd1_fix_speakerid.pl /datasets/SWBD1Transcripts/tables/conv.tab data/train\n"; exit(1); 17 | } 18 | 19 | $tab_file = $ARGV[0]; 20 | $dir = $ARGV[1]; 21 | 22 | %conv_to_spk = (); 23 | 24 | open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n"; 25 | 26 | while (my $line = <$conv_tab>) { 27 | chomp $line; 28 | 29 | my @fields = split "," , $line; 30 | #$fields[0] = trim($fields[0]); 31 | $fields[2] = trim($fields[2]); 32 | $fields[3] = trim($fields[3]); 33 | $conv_to_spk{'{swbd}0' . $fields[0] . '-A'} = $fields[2]; 34 | $conv_to_spk{'{swbd}0' . $fields[0] . '-B'} = $fields[3]; 35 | } 36 | 37 | close($conv_tab); 38 | 39 | # fix utt2spk 40 | 41 | %missingconv = (); 42 | 43 | open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n"; 44 | open(my $utt2spk_new, '>', $dir . '/utt2spk.new'); 45 | 46 | while (my $line = <$utt2spk>) { 47 | chomp $line; 48 | 49 | my @fields = split " " , $line; 50 | my $convid = substr $fields[0], 0, 9; 51 | 52 | if (exists $conv_to_spk{ $convid }) { 53 | my $spkid = $conv_to_spk{ $convid }; 54 | $spkid = "{swbd}" . $spkid; 55 | my $newuttid = $spkid . '-' . (substr $fields[0], 2); 56 | 57 | print $utt2spk_new "$newuttid $spkid\n"; 58 | } else { 59 | my $convid = substr $convid, 3, 4; 60 | $missingconv{$convid} = 1; 61 | 62 | print $utt2spk_new $fields[0]." ".$fields[1]."\n"; 63 | } 64 | } 65 | 66 | close($utt2spk); 67 | close($utt2spk_new); 68 | 69 | foreach my $conv (keys %missingconv) { 70 | print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n"; 71 | } 72 | 73 | # fix segments and text 74 | 75 | foreach my $file ('segments','text') { 76 | open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n"; 77 | open(my $newfile, '>', "$dir/$file.new"); 78 | 79 | while (my $line = <$oldfile>) { 80 | chomp $line; 81 | 82 | my $convid = substr $line, 0, 9; 83 | if (exists $conv_to_spk{$convid}) { 84 | my $spkid = $conv_to_spk{$convid}; 85 | print $newfile "{swbd}$spkid-" . (substr $line, 2) . "\n"; 86 | } else { 87 | print $newfile "$line\n"; 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /wsj/asr1/local/ndx2flist.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # This program takes as its standard input an .ndx file from the WSJ corpus that looks 19 | # like this: 20 | #;; File: tr_s_wv1.ndx, updated 04/26/94 21 | #;; 22 | #;; Index for WSJ0 SI-short Sennheiser training data 23 | #;; Data is read WSJ sentences, Sennheiser mic. 24 | #;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts 25 | #;; per speaker TI) = 7236 utts 26 | #;; 27 | #11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 28 | #11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 29 | #11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 30 | 31 | #and as command-line arguments it takes the names of the WSJ disk locations, e.g.: 32 | #/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc. 33 | # It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with 34 | # /mnt/matylda2/data/WSJ0/11-1.1). 35 | # It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with 36 | # uppercase rather than lower case filenames. 37 | 38 | foreach $fn (@ARGV) { 39 | $fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n"; 40 | $disk_id=$1; 41 | $disk_id =~ tr/-\./__/; # replace - and . with _ so 11-10.1 becomes 11_10_1 42 | $fn =~ s:/$::; # Remove final slash, just in case it is present. 43 | $disk2fn{$disk_id} = $fn; 44 | } 45 | 46 | while(<STDIN>){ 47 | if(m/^;/){ next; } # Comment. Ignore it. 48 | else { 49 | m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; 50 | $disk=$1; 51 | if(!defined $disk2fn{$disk}) { 52 | die "Disk id $disk not found"; 53 | } 54 | $filename = $2; # as a subdirectory of the distributed disk. 55 | if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) { 56 | # The disk 13-16.1 has been uppercased for some reason, on the 57 | # BUT system. This is a fix specifically for that case. 58 | $filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why? 59 | } 60 | print "$disk2fn{$disk}/$filename\n"; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/split_tr_dt_et.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | log() { 4 | local fname=${BASH_SOURCE[1]##*/} 5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 6 | } 7 | 8 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe) 9 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 10 | 11 | . ./path.sh 12 | 13 | perdt=10 # percent for dev set 14 | peret=10 # percent for eval set 15 | 16 | . 
utils/parse_options.sh 17 | 18 | if [ $# != 4 ]; then 19 | log "Usage: $0 <src-data-dir> <train-data-dir> <dev-data-dir> <eval-data-dir>"; 20 | exit 1; 21 | fi 22 | 23 | sdata=$1 24 | trdata=$2 25 | dtdata=$3 26 | etdata=$4 27 | 28 | tmpdata=$trdata/tmp 29 | mkdir -p $tmpdata 30 | mkdir -p $dtdata 31 | mkdir -p $etdata 32 | 33 | # make a unique prompts file 34 | # some transcripts have multiple spaces and need tr -s " " to remove them 35 | cut -f 2- -d" " $sdata/text | tr -s " " | sort | uniq > $tmpdata/prompts 36 | num_prompt=$(wc -l $tmpdata/prompts | awk '{print $1}') 37 | 38 | num_dt=$(echo "$num_prompt * $perdt / 100" | bc) 39 | num_et=$(echo "$num_prompt * $peret / 100" | bc) 40 | log "number of dev set prompts: $num_dt" 41 | log "number of eval set prompts: $num_et" 42 | 43 | # dt 44 | utils/shuffle_list.pl $tmpdata/prompts | head -n $num_dt > $tmpdata/dt_prompts 45 | # et 46 | utils/shuffle_list.pl $tmpdata/prompts | head -n $(echo "$num_dt + $num_et" | bc) \ 47 | | tail -n $num_et > $tmpdata/et_prompts 48 | # tr 49 | nrest=$(echo "$num_dt + $num_et + 1" | bc) 50 | utils/shuffle_list.pl $tmpdata/prompts | tail -n +$nrest > $tmpdata/tr_prompts 51 | log "number of train set prompts: $(wc -l $tmpdata/tr_prompts | awk '{print $1}')" 52 | 53 | # it takes a very long time when the number of prompts is large 54 | cat $sdata/text | local/filter_text.py -f $tmpdata/dt_prompts | awk '{print $1}' | sort > $tmpdata/dt.ids 55 | log "finished text extraction for dev set #utt = $(wc -l $tmpdata/dt.ids | awk '{print $1}')" 56 | cat $sdata/text | local/filter_text.py -f $tmpdata/et_prompts | awk '{print $1}' | sort > $tmpdata/et.ids 57 | log "finished text extraction for eval set #utt = $(wc -l $tmpdata/et.ids | awk '{print $1}')" 58 | cat $tmpdata/dt.ids $tmpdata/et.ids | sort > $tmpdata/dtet.ids 59 | cat $sdata/text | awk '{print $1}' | sort > $tmpdata/all.ids 60 | diff $tmpdata/all.ids $tmpdata/dtet.ids | awk '/^</{print $2}' > $tmpdata/tr.ids 61 | log "finished text extraction for train set #utt = $(wc -l $tmpdata/tr.ids | awk '{print $1}')" 62 | 63 | log "dev data: $(local/reduce_data_dir.sh $sdata $tmpdata/dt.ids $dtdata)" 64 | utils/fix_data_dir.sh $dtdata 65 | 66 | log "eval data: $(local/reduce_data_dir.sh $sdata $tmpdata/et.ids $etdata)" 67 | utils/fix_data_dir.sh $etdata 68 | 69 | log "train data: $(local/reduce_data_dir.sh $sdata $tmpdata/tr.ids $trdata)" 70 | utils/fix_data_dir.sh $trdata 71 | -------------------------------------------------------------------------------- /tools/install_anaconda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [ -z "${PS1:-}" ]; then 5 | PS1=__dummy__ 6 | fi 7 | CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 8 | 9 | n_required_args=5 10 | if [ $# -lt $n_required_args ] ; then 11 | echo "Usage: $0 <python-version> <conda-bin-or-empty> <venv-dir> <venv-name> <package-root> [conda install args]*" 12 | exit 1; 13 | fi 14 | PYTHON_VERSION="$1" 15 | CONDA="$2" 16 | VENV_DIR="$3" 17 | VENV_NAME="$4" 18 | PACKAGE_ROOT="$5" 19 | shift $n_required_args 20 | 21 | # Download conda if an installation isn't specified 22 | if [ -z "${CONDA}" ]; then 23 | CONDA="${VENV_DIR}/bin/conda" 24 | if [ ! -f "${CONDA}" ]; then 25 | if [ ! -f "${PACKAGE_ROOT}/tools/miniconda.sh" ]; then 26 | wget --tries=3 "${CONDA_URL}" -O "${PACKAGE_ROOT}/tools/miniconda.sh" 27 | fi 28 | if [ ! -d "${VENV_DIR}" ]; then 29 | bash "${PACKAGE_ROOT}/tools/miniconda.sh" -b -p "${VENV_DIR}" 30 | fi 31 | fi 32 | else 33 | ln -sf "$(${CONDA} info --base)" "${VENV_DIR}" 34 | fi 35 | 36 | # Check if environment already exists 37 | if ${CONDA} env list | (! 
grep -q -E "${VENV_NAME}\s"); then 38 | ${CONDA} create -y -n "${VENV_NAME}" "python=${PYTHON_VERSION}" 39 | else 40 | read -r -p "Environment ${VENV_NAME} already exists. Continue setup anyway? (y/n) " choice 41 | case $choice in 42 | y|Y|yes|Yes ) echo "Continuing to set up environment ${VENV_NAME}." ;; 43 | * ) echo "Either pick a different value for VENV_NAME, or remove the ${CONDA} environment ${VENV_NAME} before re-running this script." && exit 1 ;; 44 | esac 45 | fi 46 | 47 | # Activate conda environment & check Python version 48 | source "${VENV_DIR}/etc/profile.d/conda.sh" && conda deactivate && conda activate "${VENV_NAME}" 49 | INSTALLED_PYTHON_VERSION=$(python -V | grep -Eo "[[:digit:].]*") 50 | if [ "${INSTALLED_PYTHON_VERSION}" != "${PYTHON_VERSION}" ]; then 51 | echo "Environment ${VENV_NAME} is Python ${INSTALLED_PYTHON_VERSION}, but Python ${PYTHON_VERSION} was requested." 52 | read -r -p "Continue setup with Python ${INSTALLED_PYTHON_VERSION} anyway? (y/n) " choice 53 | case $choice in 54 | y|Y|yes|Yes ) echo "Continuing to set up environment ${VENV_NAME}." ;; 55 | * ) echo "Either pick a different value for VENV_NAME, or change PYTHON_VERSION to ${INSTALLED_PYTHON_VERSION} before re-running this script." && exit 1 ;; 56 | esac 57 | fi 58 | 59 | conda update -y -n "${VENV_NAME}" -c defaults conda 60 | 61 | # Install any conda dependencies (specified via command line) 62 | while (( "$#" )); do 63 | echo "" 64 | echo "conda install -y -n ${VENV_NAME} $1" 65 | conda install -y -n "${VENV_NAME}" $1 66 | shift 67 | done 68 | 69 | # Install the speech_datasets package in editable mode 70 | pip install -e "${PACKAGE_ROOT}" 71 | -------------------------------------------------------------------------------- /COMBINE/asr1/combine_cmvn_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set bash to 'debug' mode; it will exit on: 3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', 4 | set -e 5 | set -u 6 | set -o pipefail 7 | 8 | log() { 9 | local fname=${BASH_SOURCE[1]##*/} 10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 11 | } 12 | 13 | feats_type=fbank # fbank or fbank_pitch are valid 14 | cmvn_type=global # global or speaker or utterance are valid 15 | 16 | help_message=$(cat << EOF 17 | Usage: $0 <dataset1>/<split1> <dataset2>/<split2> <dataset3>/<split3> ... 18 | 19 | Combines CMVN stats for the specified dataset splits (pre-computed by stage 5 of run.sh for each dataset split specified) 20 | into a single file. 21 | 22 | Options: 23 | --feats_type # Feature type (fbank or fbank_pitch) (default=${feats_type}). 24 | --cmvn_type # Type of CMVN stats to compute (global or speaker or utterance) (default=${cmvn_type}). 25 | EOF 26 | ) 27 | 28 | 29 | . ./path.sh || exit 1 30 | . ./cmd.sh || exit 1 31 | 32 | log "$0 $*" 33 | . utils/parse_options.sh || exit 1 34 | if [ $# -eq 0 ]; then 35 | log "${help_message}" 36 | log "Error: Please specify dataset splits as positional arguments." 37 | exit 2 38 | fi 39 | 40 | workspace=$PWD 41 | task=$(basename "$(utils/make_absolute.sh "$workspace")") 42 | 43 | # Get CMVN's from all the relevant dataset splits 44 | cmvns= 45 | for dset in "$@"; do 46 | base=$(echo ${dset} | sed -E "s/\/.*//g") 47 | split=$(echo ${dset} | sed -E "s/.*\///g") 48 | base_dir="${MAIN_ROOT}/${base}/${task}" 49 | dset_dir="${base_dir}/dump/${feats_type}"/${split} 50 | cmvn="${dset_dir}/${cmvn_type}_cmvn.ark" 51 | 52 | if [ ! 
-d ${base_dir} ]; then 53 | log "${base} is not a valid dataset for task ${task//1/}" 54 | exit 1 55 | elif [ "${base}" = "${dset}" ]; then 56 | log "Expected dataset to be specified as <dataset>/<split>, but got ${dset}" 57 | exit 1 58 | elif [ ! -d ${dset_dir} ]; then 59 | log "Either ${split} is not a valid split for dataset ${base}, or" 60 | log "${base_dir}/run.sh has not yet been run with feats_type=${feats_type}" 61 | exit 1 62 | elif [ ! -f ${cmvn} ]; then 63 | log "${cmvn_type} CMVN statistics have not been computed for feats_type=${feats_type} for data split ${dset}." 64 | log "Please run stage 5 of ${base_dir}/run.sh." 65 | exit 1 66 | fi 67 | cmvns+="${cmvn} " 68 | done 69 | 70 | # Combine CMVN's 71 | combo_idx=$(python3 local/combine_datasets.py --task "${task//1/}" --write_dir false "$@") 72 | dumpdir="dump/${feats_type}/no_short/${combo_idx}" 73 | mkdir -p "${dumpdir}" 74 | python3 -m speech_datasets.bin.combine_cmvn_stats --cmvn_type ${cmvn_type} \ 75 | --output_file "${dumpdir}/${cmvn_type}_cmvn.ark" ${cmvns} 76 | -------------------------------------------------------------------------------- /commonvoice/asr1/local/data_prep.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | # Copyright 2017 Ewald Enzinger 4 | # Apache 2.0 5 | # 6 | # Usage: data_prep.pl /export/data/cv_corpus_v1 cv-valid-train valid-train 7 | 8 | if (@ARGV != 3) { 9 | print STDERR "Usage: $0 <corpus-dir> <dataset-name> <out-dir>\n"; 10 | print STDERR "e.g. $0 /export/data/cv_corpus_v1 cv-valid-train valid-train\n"; 11 | exit(1); 12 | } 13 | 14 | # use ffmpeg for mp3 to wav 15 | if (length(`which ffmpeg`) == 0) { 16 | print "Please install 'ffmpeg' on all worker nodes!\n"; 17 | exit 1; 18 | } 19 | 20 | 21 | ($db_base, $dataset, $out_dir) = @ARGV; 22 | mkdir data unless -d data; 23 | mkdir $out_dir unless -d $out_dir; 24 | 25 | open(CSV, "<", "$db_base/$dataset.tsv") or die "cannot open dataset CSV file"; 26 | open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; 27 | open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; 28 | open(TEXT,">", "$out_dir/text") or die "Could not open the output file $out_dir/text"; 29 | open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; 30 | my $header = <CSV>; 31 | while(<CSV>) { 32 | chomp; 33 | ($spkr, $filepath, $text, $upvotes, $downvotes, $age, $gender, $accent) = split("\t", $_); 34 | # speaker comes from commonvoice --> uttId comes from commonvoice 35 | $spkr = "{commonvoice}$spkr"; 36 | if ("$gender" eq "female") { 37 | $gender = "f"; 38 | } else { 39 | # Use male as default if not provided (no reason, just adopting the same default as in voxforge) 40 | $gender = "m"; 41 | } 42 | $uttId = $filepath; 43 | if (-z "$db_base/clips/$filepath") { 44 | print "null file $filepath\n"; 45 | next; 46 | } 47 | $uttId =~ s/\.mp3//g; 48 | $uttId =~ tr/\//-/; 49 | # speaker information should be a prefix of the utterance ID 50 | $uttId = "$spkr-$uttId"; 51 | 52 | # make sure all text is lowercase 53 | $text =~ tr/A-Z/a-z/; 54 | 55 | # get rid of all punctuation besides apostrophes 56 | $text =~ s/[^\w\s']//g; 57 | $text =~ s/(\s)'/$1/g; 58 | $text =~ s/'(\s)/$1/g; 59 | 60 | if (index($text, "{") != -1 and index($text, "}") != -1) { 61 | next; 62 | } 63 | print TEXT "$uttId"," ","$text","\n"; 64 | print GNDR "$spkr"," ","$gender","\n"; 65 | print WAV "$uttId"," ffmpeg -i $db_base/clips/$filepath -f wav -ar 16000 -ab 16 - |\n"; 66 | print SPKR "$uttId"," 
$spkr","\n"; 67 | } 68 | close(SPKR) || die; 69 | close(TEXT) || die; 70 | close(WAV) || die; 71 | close(GNDR) || die; 72 | close(CSV); 73 | 74 | # Use utt2spk to generate spk2utt 75 | if (system( 76 | "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { 77 | die "Error creating spk2utt file in directory $out_dir"; 78 | } 79 | 80 | # Validate the data directory 81 | system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); 82 | if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-feats $out_dir") != 0) { 83 | die "Error validating directory $out_dir"; 84 | } 85 | -------------------------------------------------------------------------------- /speech_datasets/text/tokenizers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import Iterable, List, Union 4 | 5 | import sentencepiece as spm 6 | from typeguard import check_argument_types 7 | 8 | 9 | class AbsTokenizer(ABC): 10 | @abstractmethod 11 | def text2tokens(self, line: str) -> List[str]: 12 | raise NotImplementedError 13 | 14 | @abstractmethod 15 | def tokens2text(self, tokens: Iterable[str]) -> str: 16 | raise NotImplementedError 17 | 18 | @abstractmethod 19 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]: 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def ids2tokens(self, ids: Iterable[int]) -> List[str]: 24 | raise NotImplementedError 25 | 26 | def text2ids(self, line: str) -> List[int]: 27 | return self.tokens2ids(self.text2tokens(line)) 28 | 29 | def ids2text(self, ids: Iterable[int]) -> str: 30 | return self.tokens2text(self.ids2tokens(ids)) 31 | 32 | @abstractmethod 33 | def __len__(self): 34 | raise NotImplementedError 35 | 36 | 37 | class SentencepieceTokenizer(AbsTokenizer): 38 | def __init__(self, model: Union[Path, str], 39 | token_list: Union[Path, str, Iterable[str]] = None): 40 | assert check_argument_types() 41 | self.model = str(model) 42 | self.sp = spm.SentencePieceProcessor() 43 | self.sp.load(self.model) 44 | 45 | if isinstance(token_list, (Path, str)): 46 | char_list = Path(token_list) 47 | with char_list.open("r", encoding="utf-8") as f: 48 | token_list = [line.rstrip() for line in f] 49 | elif token_list is None: 50 | token_list = [self.sp.IdToPiece(i) 51 | for i in range(self.sp.get_piece_size())] 52 | 53 | self.idx2tok = {i: tok for i, tok in enumerate(token_list)} 54 | self.tok2idx = {tok: i for i, tok in enumerate(token_list)} 55 | 56 | def __repr__(self): 57 | return f'{self.__class__.__name__}(model="{self.model}")' 58 | 59 | def __getstate__(self): 60 | state = self.__dict__.copy() 61 | state["sp"] = None 62 | return state 63 | 64 | def __setstate__(self, state): 65 | self.__dict__ = state 66 | self.sp = spm.SentencePieceProcessor() 67 | self.sp.load(self.model) 68 | 69 | def text2tokens(self, line: str) -> List[str]: 70 | return self.sp.EncodeAsPieces(line) 71 | 72 | def tokens2text(self, tokens: Iterable[str]) -> str: 73 | return self.sp.DecodePieces(list(tokens)) 74 | 75 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]: 76 | return [self.tok2idx.get(tok, self.tok2idx[""]) for tok in tokens] 77 | 78 | def ids2tokens(self, ids: Iterable[int]) -> List[str]: 79 | return [self.idx2tok[idx] for idx in ids] 80 | 81 | def __len__(self): 82 | if self.idx2tok is None: 83 | return self.sp.get_piece_size() 84 | else: 85 | return len(self.idx2tok) 86 | -------------------------------------------------------------------------------- 
/utils/filter_scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2012 Microsoft Corporation 3 | # Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # This script takes a list of utterance-ids or any file whose first field 20 | # of each line is an utterance-id, and filters an scp 21 | # file (or any file whose "n-th" field is an utterance id), printing 22 | # out only those lines whose "n-th" field is in id_list. The index of 23 | # the "n-th" field is 1, by default, but can be changed by using 24 | # the -f switch 25 | 26 | $exclude = 0; 27 | $field = 1; 28 | $shifted = 0; 29 | 30 | do { 31 | $shifted=0; 32 | if ($ARGV[0] eq "--exclude") { 33 | $exclude = 1; 34 | shift @ARGV; 35 | $shifted=1; 36 | } 37 | if ($ARGV[0] eq "-f") { 38 | $field = $ARGV[1]; 39 | shift @ARGV; shift @ARGV; 40 | $shifted=1 41 | } 42 | } while ($shifted); 43 | 44 | if(@ARGV < 1 || @ARGV > 2) { 45 | die "Usage: filter_scp.pl [--exclude] [-f <field>] id_list [in.scp] > out.scp \n" . 46 | "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . 47 | "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . 48 | "only the lines that were *not* in id_list.\n" . 49 | "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . 50 | "If your older scripts (written before Oct 2014) stopped working and you used the\n" . 51 | "-f option, add 1 to the argument.\n" . 52 | "See also: utils/subset_scp.pl.\n"; 53 | } 54 | 55 | 56 | $idlist = shift @ARGV; 57 | open(F, "<$idlist") || die "Could not open id-list file $idlist"; 58 | while(<F>) { 59 | @A = split; 60 | @A>=1 || die "Invalid id-list file line $_"; 61 | $seen{$A[0]} = 1; 62 | } 63 | 64 | if ($field == 1) { # Treat this as special case, since it is common. 65 | while(<>) { 66 | $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; 67 | # $1 is what we filter on. 68 | if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { 69 | print $_; 70 | } 71 | } 72 | } else { 73 | while(<>) { 74 | @A = split; 75 | @A > 0 || die "Invalid scp file line $_"; 76 | @A >= $field || die "Invalid scp file line $_"; 77 | if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { 78 | print $_; 79 | } 80 | } 81 | } 82 | 83 | # tests: 84 | # the following should print "foo 1" 85 | # ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) 86 | # the following should print "bar 2".
87 | # ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) 88 | -------------------------------------------------------------------------------- /utils/subset_scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | # Copyright 2010-2011 Microsoft Corporation 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This program selects a subset of N elements in the scp. 19 | 20 | # By default, it selects them evenly from throughout the scp, in order to avoid 21 | # selecting too many from the same speaker. It prints them on the standard 22 | # output. 23 | # With the option --first, it just selects the N first utterances. 24 | # With the option --last, it just selects the N last utterances. 25 | 26 | # Last modified by JHU & HKUST @2013 27 | 28 | 29 | $quiet = 0; 30 | $first = 0; 31 | $last = 0; 32 | 33 | if (@ARGV > 0 && $ARGV[0] eq "--quiet") { 34 | shift; 35 | $quiet = 1; 36 | } 37 | if (@ARGV > 0 && $ARGV[0] eq "--first") { 38 | shift; 39 | $first = 1; 40 | } 41 | if (@ARGV > 0 && $ARGV[0] eq "--last") { 42 | shift; 43 | $last = 1; 44 | } 45 | 46 | if(@ARGV < 2 ) { 47 | die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . 48 | " --quiet causes it to not die if N > num lines in scp.\n" . 49 | " --first and --last make it equivalent to head or tail.\n" . 50 | "See also: filter_scp.pl\n"; 51 | } 52 | 53 | $N = shift @ARGV; 54 | if($N == 0) { 55 | die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; 56 | } 57 | $inscp = shift @ARGV; 58 | open(I, "<$inscp") || die "Opening input scp file $inscp"; 59 | 60 | @F = (); 61 | while(<I>) { 62 | push @F, $_; 63 | } 64 | $numlines = @F; 65 | if($N > $numlines) { 66 | if ($quiet) { 67 | $N = $numlines; 68 | } else { 69 | die "You requested from subset_scp.pl more elements than available: $N > $numlines"; 70 | } 71 | } 72 | 73 | sub select_n { 74 | my ($start,$end,$num_needed) = @_; 75 | my $diff = $end - $start; 76 | if ($num_needed > $diff) { 77 | die "select_n: code error"; 78 | } 79 | if ($diff == 1 ) { 80 | if ($num_needed > 0) { 81 | print $F[$start]; 82 | } 83 | } else { 84 | my $halfdiff = int($diff/2); 85 | my $halfneeded = int($num_needed/2); 86 | select_n($start, $start+$halfdiff, $halfneeded); 87 | select_n($start+$halfdiff, $end, $num_needed - $halfneeded); 88 | } 89 | } 90 | 91 | if ( ! $first && ! $last) { 92 | if ($N > 0) { 93 | select_n(0, $numlines, $N); 94 | } 95 | } else { 96 | if ($first) { # --first option: same as head. 97 | for ($n = 0; $n < $N; $n++) { 98 | print $F[$n]; 99 | } 100 | } else { # --last option: same as tail.
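# e.g. (illustrative, not in the original source): with --last and N=2 on a
# 5-line scp, the loop below starts at $n = 3 and prints $F[3] and $F[4],
# i.e. the last two lines.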
101 | for ($n = @F - $N; $n < @F; $n++) { 102 | print $F[$n]; 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /utils/apply_map.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) 4 | # Apache 2.0. 5 | 6 | # This program is a bit like ./sym2int.pl in that it applies a map 7 | # to things in a file, but it's a bit more general in that it doesn't 8 | # assume the things being mapped to are single tokens, they could 9 | # be sequences of tokens. See the usage message. 10 | 11 | 12 | $permissive = 0; 13 | 14 | for ($x = 0; $x <= 2; $x++) { 15 | 16 | if (@ARGV > 0 && $ARGV[0] eq "-f") { 17 | shift @ARGV; 18 | $field_spec = shift @ARGV; 19 | if ($field_spec =~ m/^\d+$/) { 20 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 21 | } 22 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 23 | if ($1 ne "") { 24 | $field_begin = $1 - 1; # Change to zero-based indexing. 25 | } 26 | if ($2 ne "") { 27 | $field_end = $2 - 1; # Change to zero-based indexing. 28 | } 29 | } 30 | if (!defined $field_begin && !defined $field_end) { 31 | die "Bad argument to -f option: $field_spec"; 32 | } 33 | } 34 | 35 | if (@ARGV > 0 && $ARGV[0] eq '--permissive') { 36 | shift @ARGV; 37 | # Mapping is optional (missing key is printed to output) 38 | $permissive = 1; 39 | } 40 | } 41 | 42 | if(@ARGV != 1) { 43 | print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n"; 44 | print STDERR <<'EOF'; 45 | Usage: apply_map.pl [options] map <input >output 46 | options: [-f <field-range>] [--permissive] 47 | This applies a map to some specified fields of some input text: 48 | For each line in the map file: the first field is the thing we 49 | map from, and the remaining fields are the sequence we map it to. 50 | The -f (field-range) option says which fields of the input file the 51 | map should apply to. 52 | If the --permissive option is supplied, fields which are not present 53 | in the map will be left as they were. 54 | Applies the map 'map' to all input text, where each line of the map 55 | is interpreted as a map from the first field to the list of the other fields 56 | Note: <field-range> can look like 4-5, or 4-, or 5-, or 1; it means the field 57 | range in the input to apply the map to. 58 | e.g.: echo A B | apply_map.pl a.txt 59 | where a.txt is: 60 | A a1 a2 61 | B b 62 | will produce: 63 | a1 a2 b 64 | EOF 65 | exit(1); 66 | } 67 | 68 | ($map_file) = @ARGV; 69 | open(M, "<$map_file") || die "Error opening map file $map_file: $!"; 70 | 71 | while (<M>) { 72 | @A = split(" ", $_); 73 | @A >= 1 || die "apply_map.pl: empty line."; 74 | $i = shift @A; 75 | $o = join(" ", @A); 76 | $map{$i} = $o; 77 | } 78 | 79 | while(<STDIN>) { 80 | @A = split(" ", $_); 81 | for ($x = 0; $x < @A; $x++) { 82 | if ( (!defined $field_begin || $x >= $field_begin) 83 | && (!defined $field_end || $x <= $field_end)) { 84 | $a = $A[$x]; 85 | if (!defined $map{$a}) { 86 | if (!$permissive) { 87 | die "apply_map.pl: undefined key $a in $map_file\n"; 88 | } else { 89 | print STDERR "apply_map.pl: warning! missing key $a in $map_file\n"; 90 | } 91 | } else { 92 | $A[$x] = $map{$a}; 93 | } 94 | } 95 | } 96 | print join(" ", @A) . 
"\n"; 97 | } 98 | -------------------------------------------------------------------------------- /swbd/asr1/local/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | . ./path.sh || exit 1 12 | . ./db.sh || exit 1 13 | 14 | # Extract switchboard-1 15 | if [ -z "${SWBD1}" ]; then 16 | log "Fill the value of 'SWBD1' in db.sh" 17 | exit 1 18 | elif [ ! -e "${SWBD1}" ]; then 19 | mkdir -p "${SWBD1}" 20 | { 21 | tar xzvf ${SWBD1_TGZ} -C "${SWBD1}" 22 | } || { 23 | log "Failed to extract SWBD1" 24 | exit 1 25 | } 26 | fi 27 | 28 | # Download switchboard-1 transcripts if needed 29 | if [ ! -d "${SWBD1}/swb_ms98_transcriptions" ]; then 30 | echo " *** Downloading trascriptions and dictionary ***" 31 | wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || 32 | wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz 33 | tar xzvf switchboard_word_alignments.tar.gz -C "${SWBD1}" 34 | rm switchboard_word_alignments.tar.gz 35 | else 36 | log "Directory with transcriptions exists, skipping downloading." 37 | fi 38 | 39 | # Prepare the dictionary & the rest of the Switchboard-1 data 40 | log "local/swbd1_prepare_dict.sh ${SWBD1}" 41 | local/swbd1_prepare_dict.sh "${SWBD1}" 42 | log "local/swbd1_data_prep.sh ${SWBD1}" 43 | local/swbd1_data_prep.sh "${SWBD1}" 44 | 45 | # Extract & prepare EVAL-2000 46 | if [ "$(echo "${EVAL2000}" | wc -w)" != 2 ]; then 47 | log "Fill the value of 'EVAL2000' in db.sh (2 items required, hub5e_00 and hub5)" 48 | fi 49 | for (( i=1; i<=2; i++ )); do 50 | src=$(echo "${EVAL2000_TGZ}" | cut -d " " -f $i) 51 | dst=$(echo "${EVAL2000}" | cut -d " " -f $i) 52 | # hub5e is in a sub-directory 53 | if [ $i = 1 ]; then 54 | dst=$(dirname "${dst}") 55 | fi 56 | 57 | if [ ! -e "${dst}" ]; then 58 | mkdir -p "${dst}" 59 | { 60 | tar xzvf "${src}" -C "${dst}" 61 | } || { 62 | log "Failed to extract EVAL2000 (part $i)" 63 | exit 1 64 | } 65 | fi 66 | done 67 | 68 | # Note: do not quote ${EVAL2000} -- it should contains 2 directories, and eval2000_data_prep.sh requires 2 arguments 69 | log "local/eval2000_data_prep.sh ${EVAL2000}" 70 | local/eval2000_data_prep.sh ${EVAL2000} 71 | 72 | # Extract & prepare RT-03 73 | if [ -z "${RT03}" ]; then 74 | log "Fill the value of 'RT03' in db.sh" 75 | exit 1 76 | elif [ ! 
-e "${RT03}" ]; then 77 | RT03_BASE="$(dirname "${RT03}")" 78 | mkdir -p "${RT03_BASE}" 79 | { 80 | tar xzvf "${RT03_TGZ}" -C "${RT03_BASE}" 81 | } || { 82 | log "Failed to extract SWBD1" 83 | exit 1 84 | } 85 | fi 86 | 87 | log "local/rt03_data_prep.sh ${RT03}" 88 | local/rt03_data_prep.sh ${RT03} 89 | 90 | # normalize eval2000 and rt03 texts by 91 | # 1) convert upper to lower 92 | # 2) remove tags (%AH) (%HESITATION) (%UH) 93 | # 3) remove 94 | # 4) remove "(" or ")" 95 | for x in eval2000 rt03; do 96 | cp data/${x}/text data/${x}/text.org 97 | paste -d "" \ 98 | <(cut -f 1 -d" " data/${x}/text.org) \ 99 | <(awk '{$1=""; print tolower($0)}' data/${x}/text.org | perl -pe 's| \(\%.*\)||g' \ 100 | | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") | sed -e 's/\s\+/ /g' > data/${x}/text 101 | rm data/${x}/text.org 102 | done 103 | -------------------------------------------------------------------------------- /librispeech/asr1/local/download_and_untar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) 12 | # Apache 2.0 13 | 14 | remove_archive=false 15 | 16 | if [ "$1" == --remove-archive ]; then 17 | remove_archive=true 18 | shift 19 | fi 20 | 21 | if [ $# -ne 3 ]; then 22 | log "Usage: $0 [--remove-archive] " 23 | log "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean" 24 | log "With --remove-archive it will remove the archive after successfully un-tarring it." 25 | log " can be one of: dev-clean, test-clean, dev-other, test-other," 26 | log " train-clean-100, train-clean-360, train-other-500." 27 | exit 1 28 | fi 29 | 30 | data=$1 31 | url=$2 32 | part=$3 33 | 34 | if [ ! -d "$data" ]; then 35 | log "$0: no such directory $data" 36 | exit 1 37 | fi 38 | 39 | part_ok=false 40 | list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500" 41 | for x in $list; do 42 | if [ "$part" == $x ]; then part_ok=true; fi 43 | done 44 | if ! $part_ok; then 45 | log "$0: expected to be one of $list, but got '$part'" 46 | exit 1 47 | fi 48 | 49 | if [ -z "$url" ]; then 50 | log "$0: empty URL base." 51 | exit 1 52 | fi 53 | 54 | if [ -f $data/LibriSpeech/$part/.complete ]; then 55 | log "$0: data part $part was already successfully extracted, nothing to do." 56 | exit 0 57 | fi 58 | 59 | 60 | # sizes of the archive files in bytes. This is some older versions. 61 | sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128" 62 | # sizes_new is the archive file sizes of the final release. Some of these sizes are of 63 | # things we probably won't download. 64 | sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606" 65 | 66 | if [ -f $data/$part.tar.gz ]; then 67 | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') 68 | size_ok=false 69 | for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done 70 | if ! $size_ok; then 71 | log "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" 72 | log "does not equal the size of one of the archives." 73 | rm $data/$part.tar.gz 74 | else 75 | log "$data/$part.tar.gz exists and appears to be complete." 76 | fi 77 | fi 78 | 79 | if [ ! 
-f $data/$part.tar.gz ]; then 80 | if ! which wget >/dev/null; then 81 | log "$0: wget is not installed." 82 | exit 1 83 | fi 84 | full_url=$url/$part.tar.gz 85 | log "$0: downloading data from $full_url. This may take some time, please be patient." 86 | 87 | if ! wget -P $data --no-check-certificate $full_url; then 88 | log "$0: error executing wget $full_url" 89 | exit 1 90 | fi 91 | fi 92 | 93 | if ! tar -C $data -xvzf $data/$part.tar.gz; then 94 | log "$0: error un-tarring archive $data/$part.tar.gz" 95 | exit 1 96 | fi 97 | 98 | touch $data/LibriSpeech/$part/.complete 99 | 100 | log "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" 101 | 102 | if $remove_archive; then 103 | log "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." 104 | rm $data/$part.tar.gz 105 | fi 106 | -------------------------------------------------------------------------------- /wsj/asr1/local/normalize_transcript.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2011 Microsoft Corporation 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # This takes data from the standard input that's unnormalized transcripts in the format 19 | # 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise] 20 | # 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam] 21 | # and outputs normalized transcripts. 22 | # c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc 23 | 24 | @ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2"; 25 | $noise_word = shift @ARGV; 26 | 27 | while(<STDIN>) { 28 | $_ =~ m:^(\S+) (.+): || die "bad line $_"; 29 | $utt = $1; 30 | $trans = $2; 31 | print "{wsj}$utt"; 32 | foreach $w (split (" ",$trans)) { 33 | $w =~ tr:A-Z:a-z:; # Lowercase everything to match the processing of other datasets. 34 | $w =~ s:\\::g; # Remove backslashes. We don't need the quoting. 35 | $w =~ s:^\%percent:percent:; # Normalization for Nov'93 test transcripts. 36 | $w =~ s:^\.point:point:; # Normalization for Nov'93 test transcripts. 37 | $w =~ s:\*(.*)\*:$1:g; # Mispronounced words are enclosed in asterisks; we don't care 38 | if ($w ne "!exclamation-point") { # ! indicates unusual emphasis; we don't care 39 | $w =~ s:!::g; 40 | } 41 | if ($w ne ":colon") { # : indicates a lengthened sound; we don't care 42 | $w =~ s:\:::g; 43 | } 44 | 45 | # Words we don't want to print 46 | if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the preceding word. Delete. 47 | $w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete. 48 | $w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon. 49 | $w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
50 | $w eq "~" || # This is used to indicate truncation of an utterance. Not a word. 51 | $w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much 52 | # point including this in the transcript. 53 | next; 54 | } elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath]. 55 | print " $noise_word"; 56 | } elsif($w =~ m:^\<([\w\'.]+)\>$:) { 57 | # e.g. replace with and. (the <> means verbal deletion of a word).. but it's pronounced. 58 | print " $1"; 59 | } elsif($w eq "--dash") { 60 | print " -dash"; # This is a common issue; the CMU dictionary has it as -DASH. 61 | } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word 62 | print " $1 -DASH"; 63 | } else { 64 | print " $w"; 65 | } 66 | } 67 | print "\n"; 68 | } 69 | -------------------------------------------------------------------------------- /swbd/asr1/local/extend_segments.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; #sed replacement for -w perl parameter 3 | 4 | if (@ARGV != 1 || !($ARGV[0] =~ m/^-?\d+\.?\d*$/ && $ARGV[0] >= 0)) { 5 | print STDERR "Usage: extend_segments.pl time-in-seconds segments.extended \n" . 6 | "e.g. extend_segments.pl 0.25 segments.2\n" . 7 | "This command modifies a segments file, with lines like\n" . 8 | " \n" . 9 | "by extending the beginning and end of each segment by a certain\n" . 10 | "length of time. This script makes sure the output segments do not\n" . 11 | "overlap as a result of this time-extension, and that there are no\n" . 12 | "negative times in the output.\n"; 13 | exit 1; 14 | } 15 | 16 | $extend = $ARGV[0]; 17 | 18 | @all_lines = (); 19 | 20 | while () { 21 | chop; 22 | @A = split(" ", $_); 23 | if (@A != 4) { 24 | die "invalid line in segments file: $_"; 25 | } 26 | $line = @all_lines; # current number of lines. 27 | ($utt_id, $reco_id, $start_time, $end_time) = @A; 28 | 29 | push @all_lines, [ $utt_id, $reco_id, $start_time, $end_time ]; # anonymous array. 30 | if (! defined $lines_for_reco{$reco_id}) { 31 | $lines_for_reco{$reco_id} = [ ]; # push new anonymous array. 32 | } 33 | push @{$lines_for_reco{$reco_id}}, $line; 34 | } 35 | 36 | foreach $reco_id (keys %lines_for_reco) { 37 | $ref = $lines_for_reco{$reco_id}; 38 | @line_numbers = sort { ${$all_lines[$a]}[2] <=> ${$all_lines[$b]}[2] } @$ref; 39 | 40 | 41 | { 42 | # handle start of earliest segment as a special case. 43 | $l0 = $line_numbers[0]; 44 | $tstart = ${$all_lines[$l0]}[2] - $extend; 45 | if ($tstart < 0.0) { $tstart = 0.0; } 46 | ${$all_lines[$l0]}[2] = $tstart; 47 | } 48 | { 49 | # handle end of latest segment as a special case. 50 | $lN = $line_numbers[$#line_numbers]; 51 | $tend = ${$all_lines[$lN]}[3] + $extend; 52 | ${$all_lines[$lN]}[3] = $tend; 53 | } 54 | for ($i = 0; $i < $#line_numbers; $i++) { 55 | $ln = $line_numbers[$i]; 56 | $ln1 = $line_numbers[$i+1]; 57 | $tend = ${$all_lines[$ln]}[3]; # end of earlier segment. 58 | $tstart = ${$all_lines[$ln1]}[2]; # start of later segment. 59 | if ($tend > $tstart) { 60 | $utt1 = ${$all_lines[$ln]}[0]; 61 | $utt2 = ${$all_lines[$ln1]}[0]; 62 | print STDERR "Warning: for utterances $utt1 and $utt2, segments " . 
63 | "already overlap; leaving these times unchanged.\n"; 64 | } else { 65 | $my_extend = $extend; 66 | $max_extend = 0.5 * ($tstart - $tend); 67 | if ($my_extend > $max_extend) { $my_extend = $max_extend; } 68 | $tend += $my_extend; 69 | $tstart -= $my_extend; 70 | ${$all_lines[$ln]}[3] = $tend; 71 | ${$all_lines[$ln1]}[2] = $tstart; 72 | } 73 | } 74 | } 75 | 76 | # leave the numbering of the lines unchanged. 77 | for ($l = 0; $l < @all_lines; $l++) { 78 | $ref = $all_lines[$l]; 79 | ($utt_id, $reco_id, $start_time, $end_time) = @$ref; 80 | printf("%s %s %.2f %.2f\n", $utt_id, $reco_id, $start_time, $end_time); 81 | } 82 | 83 | __END__ 84 | 85 | # testing below. 86 | 87 | # ( echo a1 A 0 1; echo a2 A 3 4; echo b1 B 0 1; echo b2 B 2 3 ) | local/extend_segments.pl 1.0 88 | a1 A 0.00 2.00 89 | a2 A 2.00 5.00 90 | b1 B 0.00 1.50 91 | b2 B 1.50 4.00 92 | # ( echo a1 A 0 2; echo a2 A 1 3 ) | local/extend_segments.pl 1.0 93 | Warning: for utterances a1 and a2, segments already overlap; leaving these times unchanged. 94 | a1 A 0.00 2.00 95 | a2 A 1.00 4.00 96 | # ( echo a1 A 0 2; echo a2 A 5 6; echo a3 A 3 4 ) | local/extend_segments.pl 1.0 97 | a1 A 0.00 2.50 98 | a2 A 4.50 7.00 99 | a3 A 2.50 4.50 100 | -------------------------------------------------------------------------------- /speech_datasets/bin/dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 Salesforce Research (Aadyot Bhatnagar) 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | import argparse 7 | from distutils.util import strtobool 8 | import logging 9 | 10 | import kaldiio 11 | import tqdm 12 | 13 | from speech_datasets.transform import Transformation 14 | from speech_datasets.utils.io_utils import get_commandline_args, consolidate_utt_info 15 | from speech_datasets.utils.types import str_or_none, humanfriendly_or_none 16 | from speech_datasets.utils.writers import file_writer_helper 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser( 23 | description="read .wav files & ") 24 | parser.add_argument("--feature-config", default=None, type=str_or_none, 25 | help="YAML file for feature extraction (if extracting any features)") 26 | parser.add_argument("--text-file", default=None, 27 | help="file mapping utterance ID to transcript") 28 | parser.add_argument("--utt2spk-file", default=None, 29 | help="file mapping utterance ID to speaker ID") 30 | 31 | parser.add_argument("--archive-format", type=str, default="hdf5", choices=["mat", "hdf5"], 32 | help="Specify the file format for output. \"mat\" is the matrix format in kaldi") 33 | parser.add_argument("--sample-frequency", type=humanfriendly_or_none, default=None, 34 | help="If the sampling rate is specified, resample the input.") 35 | parser.add_argument("--compress", type=strtobool, default=False, help="Save in compressed format") 36 | parser.add_argument("--compression-method", type=int, default=2, 37 | help="Specify the method(if mat) or " "gzip-level(if hdf5)") 38 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") 39 | parser.add_argument("--segments", type=str, 40 | help="segments-file format: each line is either" 41 | " " 42 | "e.g. 
call-861225-A-0050-0065 call-861225-A 5.0 6.5") 43 | parser.add_argument("rspecifier", type=str, help="WAV scp file") 44 | parser.add_argument("wspecifier", type=str, help="Write specifier") 45 | 46 | return parser.parse_args() 47 | 48 | 49 | def main(): 50 | args = parse_args() 51 | 52 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 53 | if args.verbose > 0: 54 | logging.basicConfig(level=logging.INFO, format=logfmt) 55 | else: 56 | logging.basicConfig(level=logging.WARN, format=logfmt) 57 | logger.info(get_commandline_args()) 58 | 59 | utt_text_speaker = consolidate_utt_info( 60 | scp=None, text=args.text_file, utt2spk=args.utt2spk_file) 61 | 62 | with kaldiio.ReadHelper( 63 | args.rspecifier, segments=args.segments 64 | ) as reader, file_writer_helper( 65 | args.wspecifier, 66 | filetype=args.archive_format, 67 | compress=args.compress, 68 | compression_method=args.compression_method, 69 | sample_frequency=args.sample_frequency, 70 | transform=Transformation(args.feature_config) 71 | ) as writer: 72 | for utt_id, (rate, wave) in tqdm.tqdm(reader, miniters=100, maxinterval=30): 73 | utt_dict = {"x": wave, "rate": rate} 74 | utt_dict.update(utt_text_speaker.get(utt_id, {})) 75 | try: 76 | writer[utt_id] = utt_dict 77 | except Exception as e: 78 | logger.warning( 79 | f"Failed to process utterance {utt_id} with exception:\n{str(e)}") 80 | continue 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /utils/sym2int.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 13 | # MERCHANTABLITY OR NON-INFRINGEMENT. 14 | # See the Apache 2 License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | $ignore_oov = 0; 19 | 20 | for($x = 0; $x < 2; $x++) { 21 | if ($ARGV[0] eq "--map-oov") { 22 | shift @ARGV; 23 | $map_oov = shift @ARGV; 24 | if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { 25 | # disallow '-f', the empty string and anything ending in words.txt as the 26 | # OOV symbol because these are likely command-line errors. 27 | die "the --map-oov option requires an argument"; 28 | } 29 | } 30 | if ($ARGV[0] eq "-f") { 31 | shift @ARGV; 32 | $field_spec = shift @ARGV; 33 | if ($field_spec =~ m/^\d+$/) { 34 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1; 35 | } 36 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10) 37 | if ($1 ne "") { 38 | $field_begin = $1 - 1; # Change to zero-based indexing. 39 | } 40 | if ($2 ne "") { 41 | $field_end = $2 - 1; # Change to zero-based indexing. 
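# Illustrative note, not in the original source: "-f 2-4" yields
# $field_begin = 1 and $field_end = 3, i.e. zero-based indices for
# fields 2 through 4.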
42 | } 43 | } 44 | if (!defined $field_begin && !defined $field_end) { 45 | die "Bad argument to -f option: $field_spec"; 46 | } 47 | } 48 | } 49 | 50 | $symtab = shift @ARGV; 51 | if (!defined $symtab) { 52 | print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . 53 | "options: [--map-oov <oov-symbol>] [-f <field-range>]\n" . 54 | "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n"; exit(1); 55 | } 56 | open(F, "<$symtab") || die "Error opening symbol table file $symtab"; 57 | while(<F>) { 58 | @A = split(" ", $_); 59 | @A == 2 || die "bad line in symbol table file: $_"; 60 | $sym2int{$A[0]} = $A[1] + 0; 61 | } 62 | 63 | if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up 64 | if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } 65 | $map_oov = $sym2int{$map_oov}; 66 | } 67 | 68 | $num_warning = 0; 69 | $max_warning = 20; 70 | 71 | while (<>) { 72 | @A = split(" ", $_); 73 | @B = (); 74 | for ($n = 0; $n < @A; $n++) { 75 | $a = $A[$n]; 76 | if ( (!defined $field_begin || $n >= $field_begin) 77 | && (!defined $field_end || $n <= $field_end)) { 78 | $i = $sym2int{$a}; 79 | if (!defined ($i)) { 80 | if (defined $map_oov) { 81 | if ($num_warning++ < $max_warning) { 82 | print STDERR "sym2int.pl: replacing $a with $map_oov\n"; 83 | if ($num_warning == $max_warning) { 84 | print STDERR "sym2int.pl: not warning for OOVs any more times\n"; 85 | } 86 | } 87 | $i = $map_oov; 88 | } else { 89 | $pos = $n+1; 90 | die "sym2int.pl: undefined symbol $a (in position $pos)\n"; 91 | } 92 | } 93 | $a = $i; 94 | } 95 | push @B, $a; 96 | } 97 | print join(" ", @B); 98 | print "\n"; 99 | } 100 | if ($num_warning > 0) { 101 | print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; 102 | } 103 | 104 | exit(0); 105 | -------------------------------------------------------------------------------- /COMBINE/asr1/local/combine_datasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | 5 | from speech_datasets.utils import get_root 6 | from speech_datasets.utils.io_utils import get_combo_idx 7 | from speech_datasets.utils.types import str2bool 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--task", type=str, choices=["asr", "tts"]) 13 | parser.add_argument("--write_dir", type=str2bool, default=True) 14 | parser.add_argument("datasets", nargs="+", type=str) 15 | args = parser.parse_args() 16 | 17 | # Ensure that all datasets are specified as <dataset>/<split> 18 | datasets = sorted(set(args.datasets)) 19 | dataset_splits = [d.split("/", maxsplit=1) for d in datasets] 20 | assert all(len(d) == 2 for d in dataset_splits), \ 21 | f"All datasets must be specified as <dataset>/<split>, but got " \ 22 | f"{datasets} instead" 23 | 24 | # Verify that all datasets have been prepared 25 | dataset_dirs = [os.path.join(get_root(), ds[0], f"{args.task}1", "data", ds[1]) 26 | for ds in dataset_splits] 27 | assert all(os.path.isdir(d) for d in dataset_dirs), \ 28 | f"Please make sure that all dataset splits are valid, and that all " \ 29 | f"datasets you wish to combine have already been prepared by stage 1 " \ 30 | f"of {args.task}.sh" 31 | 32 | # Get the index of this dataset combination (add to the registry if needed) 33 | idx = get_combo_idx(datasets, args.task) 34 | data_dir = os.path.join(get_root(), "COMBINE", f"{args.task}1", "data") 35 | if idx < 0: 36 | os.makedirs(data_dir, exist_ok=True) 37 | with open(os.path.join(data_dir, "registry.txt"), "a") as 
f: 38 | f.write(" ".join(datasets) + "\n") 39 | idx = get_combo_idx(datasets, args.task) 40 | 41 | if not args.write_dir: 42 | return idx 43 | 44 | # Create a directory for this dataset combo & prepare it 45 | dirname = os.path.join(data_dir, str(idx)) 46 | os.makedirs(dirname, exist_ok=True) 47 | write_segments = any(os.path.isfile(os.path.join(d, "segments")) 48 | for d in dataset_dirs) 49 | with open(os.path.join(dirname, "wav.scp"), "wb") as wav, \ 50 | open(os.path.join(dirname, "text"), "wb") as text, \ 51 | open(os.path.join(dirname, "utt2spk"), "wb") as utt2spk, \ 52 | open(os.path.join(dirname, "segments"), "w") as segments: 53 | for d in dataset_dirs: 54 | 55 | # wav.scp, text, and utt2spk can just be concatenated on 56 | with open(os.path.join(d, "wav.scp"), "rb") as src_wav: 57 | shutil.copyfileobj(src_wav, wav) 58 | with open(os.path.join(d, "text"), "rb") as src_text: 59 | shutil.copyfileobj(src_text, text) 60 | with open(os.path.join(d, "utt2spk"), "rb") as src_utt2spk: 61 | shutil.copyfileobj(src_utt2spk, utt2spk) 62 | 63 | if write_segments: 64 | # If a segments file exists, we can just concatenate it on 65 | if os.path.isfile(os.path.join(d, "segments")): 66 | with open(os.path.join(d, "segments"), "r") as src_segments: 67 | shutil.copyfileobj(src_segments, segments) 68 | 69 | # Otherwise, we need to use wav.scp to create a dummy segments file; 70 | # line format is <utt-id> <recording-id> <start-time> <end-time> 71 | # (<start-time> = 0, <end-time> = -1 means use the whole recording) 72 | else: 73 | with open(os.path.join(d, "wav.scp"), "r") as src_wav: 74 | for line in src_wav: 75 | utt_id, _ = line.rstrip().split(None, maxsplit=1) 76 | segments.write(f"{utt_id} {utt_id} 0.0 -1.0\n") 77 | 78 | return idx 79 | 80 | 81 | if __name__ == "__main__": 82 | combo_idx = main() 83 | print(combo_idx) 84 | -------------------------------------------------------------------------------- /COMBINE/tts1/local/combine_datasets.py: -------------------------------------------------------------------------------- 1 | ../asr1/local/combine_datasets.py
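A hedged invocation sketch for the script above, mirroring how COMBINE/asr1/combine_cmvn_stats.sh calls it (the dataset/split names are illustrative):
python3 local/combine_datasets.py --task asr --write_dir false swbd/train fisher/train
This prints the registry index assigned to the given combination of dataset splits.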
".join(datasets) + "\n") 39 | idx = get_combo_idx(datasets, args.task) 40 | 41 | if not args.write_dir: 42 | return idx 43 | 44 | # Create a directory for this dataset combo & prepare it 45 | dirname = os.path.join(data_dir, str(idx)) 46 | os.makedirs(dirname, exist_ok=True) 47 | write_segments = any(os.path.isfile(os.path.join(d, "segments")) 48 | for d in dataset_dirs) 49 | with open(os.path.join(dirname, "wav.scp"), "wb") as wav, \ 50 | open(os.path.join(dirname, "text"), "wb") as text, \ 51 | open(os.path.join(dirname, "utt2spk"), "wb") as utt2spk, \ 52 | open(os.path.join(dirname, "segments"), "w") as segments: 53 | for d in dataset_dirs: 54 | 55 | # wav.scp, text, and utt2spk can just be concatenated on 56 | with open(os.path.join(d, "wav.scp"), "rb") as src_wav: 57 | shutil.copyfileobj(src_wav, wav) 58 | with open(os.path.join(d, "text"), "rb") as src_text: 59 | shutil.copyfileobj(src_text, text) 60 | with open(os.path.join(d, "utt2spk"), "rb") as src_utt2spk: 61 | shutil.copyfileobj(src_utt2spk, utt2spk) 62 | 63 | if write_segments: 64 | # If a segments file exists, we can just concatenate it on 65 | if os.path.isfile(os.path.join(d, "segments")): 66 | with open(os.path.join(d, "segments"), "r") as src_segments: 67 | shutil.copyfileobj(src_segments, segments) 68 | 69 | # Otherwise, we need to use wav.scp to create a dummy segments 70 | # line format is 71 | # = 0, = -1 means use the whole recording 72 | else: 73 | with open(os.path.join(d, "wav.scp"), "r") as src_wav: 74 | for line in src_wav: 75 | utt_id, _ = line.rstrip().split(None, maxsplit=1) 76 | segments.write(f"{utt_id} {utt_id} 0.0 -1.0\n") 77 | 78 | return idx 79 | 80 | 81 | if __name__ == "__main__": 82 | combo_idx = main() 83 | print(combo_idx) 84 | -------------------------------------------------------------------------------- /speech_datasets/bin/apply_cmvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from distutils.util import strtobool 4 | import logging 5 | 6 | from speech_datasets.transform import Transformation 7 | from speech_datasets.utils.readers import file_reader_helper 8 | from speech_datasets.utils.io_utils import get_commandline_args 9 | from speech_datasets.utils.writers import file_writer_helper 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser( 16 | description="apply mean-variance normalization to files", 17 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 18 | ) 19 | 20 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") 21 | parser.add_argument("--in-filetype", type=str, default="hdf5", choices=["mat", "hdf5"], 22 | help="Specify the file format for the rspecifier. " 23 | '"mat" is the matrix format in kaldi') 24 | parser.add_argument("--out-filetype", type=str, default="hdf5", choices=["mat", "hdf5"], 25 | help="Specify the file format for the wspecifier. 
" 26 | '"mat" is the matrix format in kaldi') 27 | 28 | parser.add_argument("--norm-means", type=strtobool, default=True, 29 | help="Do mean normalization or not.") 30 | parser.add_argument("--norm-vars", type=strtobool, default=False, 31 | help="Do variance normalization or not.") 32 | parser.add_argument("--reverse", type=strtobool, default=False, 33 | help="Do reverse mode or not") 34 | parser.add_argument("--utt2spk", type=str, default=None, 35 | help="A text file of utterance to speaker map.") 36 | parser.add_argument("--compress", type=strtobool, default=False, 37 | help="Save in compressed format") 38 | parser.add_argument("--compression-method", type=int, default=2, 39 | help="Specify the method (if mat) or gzip-level (if hdf5)") 40 | parser.add_argument("--cmvn-type", type=str, choices=["global", "speaker", "utterance"], 41 | help="Type of CMVN to apply (global, per-speaker, or per-utterance)") 42 | parser.add_argument("stats_file", help="File containing CMVN stats.") 43 | parser.add_argument("rspecifier", type=str, help="Read specifier id, e.g. ark:some.ark") 44 | parser.add_argument("wspecifier", type=str, help="Write specifier id, e.g. ark:some.ark") 45 | 46 | args = parser.parse_args() 47 | if args.cmvn_type == "speaker" and args.utt2spk is None: 48 | raise argparse.ArgumentError( 49 | args.cmvn_type, "If cmvn-type is 'speaker', utt2spk must be provided.") 50 | 51 | return args 52 | 53 | 54 | def main(): 55 | args = parse_args() 56 | 57 | # logging info 58 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" 59 | if args.verbose > 0: 60 | logging.basicConfig(level=logging.INFO, format=logfmt) 61 | else: 62 | logging.basicConfig(level=logging.WARN, format=logfmt) 63 | logger.info(get_commandline_args()) 64 | 65 | cmvn = Transformation([{"type": "cmvn", 66 | "stats": args.stats_file, 67 | "cmvn_type": args.cmvn_type, 68 | "norm_means": args.norm_means, 69 | "norm_vars": args.norm_vars, 70 | "utt2spk": args.utt2spk, 71 | "reverse": args.reverse}]) 72 | 73 | with file_writer_helper( 74 | args.wspecifier, 75 | filetype=args.out_filetype, 76 | compress=args.compress, 77 | compression_method=args.compression_method, 78 | ) as writer: 79 | for utt, data in file_reader_helper(args.rspecifier, args.in_filetype, 80 | transform=cmvn, return_dict=True): 81 | writer[utt] = data 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /swbd/asr1/local/swbd1_prepare_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -u 4 | set -o pipefail 5 | 6 | log() { 7 | local fname=${BASH_SOURCE[1]##*/} 8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 | } 10 | 11 | # Formatting the Mississippi State dictionary for use in Edinburgh. Differs 12 | # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013) 13 | 14 | # To be run from one directory above this script. 15 | 16 | . ./path.sh 17 | 18 | #check existing directories 19 | 20 | if [ $# != 1 ]; then 21 | log "Error: invalid command line arguments" 22 | log "Usage: $0 /path/to/SWBD" 23 | exit 1; 24 | fi 25 | SWBD_DIR=$1 26 | 27 | # Get the original transcriptions & their corresponding dictionary 28 | srcdir=data/local/swbd1 29 | mkdir -p $srcdir 30 | if [ ! 
--------------------------------------------------------------------------------
/swbd/asr1/local/swbd1_prepare_dict.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -u
4 | set -o pipefail
5 | 
6 | log() {
7 |     local fname=${BASH_SOURCE[1]##*/}
8 |     echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 | }
10 | 
11 | # Formatting the Mississippi State dictionary for use in Edinburgh. Differs
12 | # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
13 | 
14 | # To be run from one directory above this script.
15 | 
16 | . ./path.sh
17 | 
18 | # check existing directories
19 | 
20 | if [ $# != 1 ]; then
21 |     log "Error: invalid command line arguments"
22 |     log "Usage: $0 /path/to/SWBD"
23 |     exit 1;
24 | fi
25 | SWBD_DIR=$1
26 | 
27 | # Get the original transcriptions & their corresponding dictionary
28 | srcdir=data/local/swbd1
29 | mkdir -p $srcdir
30 | if [ ! -d $srcdir/swb_ms98_transcriptions ]; then
31 |     ln -sf "${SWBD_DIR}/swb_ms98_transcriptions" $srcdir/
32 | fi
33 | srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
34 | 
35 | # assume some basic data prep was already done on the downloaded data.
36 | [ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1;
37 | 
38 | # copy over the initial dictionary as the base lexicon
39 | dir=data/local/dict_nosp
40 | mkdir -p $dir
41 | install -m +rw $srcdict $dir/lexicon0.txt || exit 1;
42 | log "$(patch <local/dict.patch $dir/lexicon0.txt)"
43 | 
44 | # Dictionary preparation:
45 | # Pre-processing (remove comments and empty lines, then sort)
46 | grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort > $dir/lexicon1.txt || exit 1;
47 | 
48 | cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
49 |     grep -v sil > $dir/nonsilence_phones.txt || exit 1;
50 | 
51 | ( echo sil; echo nsn; ) > $dir/silence_phones.txt
52 | 
53 | echo sil > $dir/optional_silence.txt
54 | 
55 | # No "extra questions" in the input to this setup, as we don't
56 | # have stress or tone.
57 | echo -n > $dir/extra_questions.txt
58 | 
59 | cp local/MSU_single_letter.txt $dir/
60 | # Add to the lexicon the silences, noises etc.
61 | # Add single letter lexicon
62 | # The original swbd lexicon does not have precise single-letter entries,
63 | # e.g. it has no entry for W
64 | ( echo '!sil sil'; echo '<noise> nsn'; echo '<spoken_noise> spn' ) \
65 |     | cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1;
66 | 
67 | # Map the words in the lexicon. That is-- for each word in the lexicon, we map it
68 | # to a new written form. The transformations we do are:
69 | # remove laughter markings, e.g.
70 | # [LAUGHTER-STORY] -> STORY
71 | # Remove partial-words, e.g.
72 | # -[40]1K W AH N K EY
73 | # becomes -1K
74 | # and
75 | # -[AN]Y IY
76 | # becomes
77 | # -Y
78 | # -[A]B[OUT]- B
79 | # becomes
80 | # -B-
81 | # Also, curly braces, which appear to be used for "nonstandard"
82 | # words or non-words, are removed, e.g.
83 | # {WOLMANIZED} W OW L M AX N AY Z D
84 | # -> WOLMANIZED
85 | # Also, mispronounced words, e.g.
86 | # [YEAM/YEAH] Y AE M
87 | # are changed to just e.g. YEAM, i.e. the orthography
88 | # of the mispronounced version.
89 | # Note-- this is only really to be used in training. The main practical
90 | # reason is to avoid having tons of disambiguation symbols, which
91 | # we otherwise would get because there are many partial words with
92 | # the same phone sequences (most problematic: S).
93 | # Also, map
94 | # THEM_1 EH M -> THEM
95 | # so that multiple pronunciations just have alternate entries
96 | # in the lexicon.
97 | local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
98 |     > $dir/lexicon3.txt || exit 1;
99 | 
100 | python local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \
101 |     -L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map
102 | cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map
103 | 
104 | ( echo 'i ay' ) | cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt
105 | 
106 | pushd $dir >&/dev/null
107 | ln -sf lexicon5.txt lexicon.txt # This is the final lexicon.
108 | popd >&/dev/null
109 | log "Prepared input dictionary and phone-sets for Switchboard phase 1."
110 | 
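111 | # For reference, each entry of the final lexicon.txt is "word phone1 phone2 ...",
112 | # one pronunciation per line, all lower-case (e.g. "about ax b aw t"; the
113 | # example entry is illustrative).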
--------------------------------------------------------------------------------
/utils/parse_options.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
4 | #                 Arnab Ghoshal, Karel Vesely
5 | 
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | #  http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15 | # MERCHANTABILITY OR NON-INFRINGEMENT.
16 | # See the Apache 2 License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | 
20 | # Parse command-line options.
21 | # To be sourced by another script (as in ". parse_options.sh").
22 | # Option format is: --option-name arg
23 | # and shell variable "option_name" gets set to value "arg."
24 | # The exception is --help, which takes no arguments, but prints the
25 | # $help_message variable (if defined).
26 | 
27 | 
28 | ###
29 | ### The --config file options have lower priority than command-line
30 | ### options, so we need to import them first...
31 | ###
32 | 
33 | # Now import all the configs specified by command-line, in left-to-right order
34 | for ((argpos=1; argpos<$#; argpos++)); do
35 |   if [ "${!argpos}" == "--config" ]; then
36 |     argpos_plus1=$((argpos+1))
37 |     config=${!argpos_plus1}
38 |     [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
39 |     . $config  # source the config file.
40 |   fi
41 | done
42 | 
43 | 
44 | ###
45 | ### Now we process the command line options
46 | ###
47 | while true; do
48 |   [ -z "${1:-}" ] && break;  # break if there are no arguments
49 |   case "$1" in
50 |     # If the enclosing script is called with --help option, print the help
51 |     # message and exit.  Scripts should put help messages in $help_message
52 |     --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
53 |       else printf "$help_message\n" 1>&2 ; fi;
54 |       exit 0 ;;
55 |     --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
56 |       exit 1 ;;
57 |     # If the first command-line argument begins with "--" (e.g. --foo-bar),
58 |     # then work out the variable name as $name, which will equal "foo_bar".
59 |     --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
60 |       # Next we test whether the variable in question is undefined -- if so it's
61 |       # an invalid option and we die.  Note: $0 evaluates to the name of the
62 |       # enclosing script.
63 |       # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
64 |       # is undefined.  We then have to wrap this test inside "eval" because
65 |       # foo_bar is itself inside a variable ($name).
66 |       eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
67 | 
68 |       oldval="`eval echo \\$$name`";
69 |       # Work out whether we seem to be expecting a Boolean argument.
70 |       if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
71 |         was_bool=true;
72 |       else
73 |         was_bool=false;
74 |       fi
75 | 
76 |       # Set the variable to the right value-- the escaped quotes make it work if
77 |       # the option had spaces, like --cmd "queue.pl -sync y"
78 |       if [ $# -lt 2 ]; then
79 |         echo "$0: no argument provided for option $1" 1>&2
80 |         exit 1;
81 |       else
82 |         eval $name=\"$2\";
83 |       fi
84 | 
85 |       # Check that Boolean-valued arguments are really Boolean.
86 |       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
87 |         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
88 |         exit 1;
89 |       fi
90 |       shift 2;
91 |       ;;
92 |     *) break;
93 |   esac
94 | done
95 | 
96 | 
97 | # Check for an empty argument to the --cmd option, which can easily occur as a
98 | # result of scripting errors.
99 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
100 | 
101 | 
102 | true; # so this script returns exit code 0.
103 | 
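104 | # Example (hypothetical caller): a script that defines defaults
105 | #   stage=0
106 | #   cmd=run.pl
107 | # and then sources this file can be invoked as
108 | #   ./myscript.sh --stage 2 --cmd "queue.pl -sync y"
109 | # which sets $stage to 2 and $cmd to "queue.pl -sync y".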
70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | if [ $# -lt 2 ]; then 79 | echo "$0: no argument provided for option $1" 1>&2 80 | exit 1; 81 | else 82 | eval $name=\"$2\"; 83 | fi 84 | 85 | # Check that Boolean-valued arguments are really Boolean. 86 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 87 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 88 | exit 1; 89 | fi 90 | shift 2; 91 | ;; 92 | *) break; 93 | esac 94 | done 95 | 96 | 97 | # Check for an empty argument to the --cmd option, which can easily occur as a 98 | # result of scripting errors. 99 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 100 | 101 | 102 | true; # so this script returns exit code 0. 103 | -------------------------------------------------------------------------------- /speech_datasets/utils/types.py: -------------------------------------------------------------------------------- 1 | from distutils.util import strtobool 2 | from typing import Optional, Tuple, Union 3 | 4 | import humanfriendly 5 | import numpy as np 6 | from typeguard import check_argument_types 7 | 8 | 9 | class CMVNStats(object): 10 | def __init__(self, count, sum, sum_squares): 11 | self.count = count 12 | self.sum = sum 13 | self.sum_squares = sum_squares 14 | 15 | def __iadd__(self, other): 16 | self.count += other.count 17 | self.sum += other.sum 18 | self.sum_squares += other.sum_squares 19 | return self 20 | 21 | @classmethod 22 | def from_numpy(cls, stats): 23 | stats = np.copy(stats) 24 | assert len(stats) == 2, stats.shape 25 | # If feat has >2 dims, only use the first one for count 26 | count = stats[0, -1].flatten()[0] 27 | return cls(count=count, sum=stats[0, :-1], sum_squares=stats[1, :-1]) 28 | 29 | def to_numpy(self): 30 | shape = (2, self.sum.shape[0] + 1, *self.sum.shape[1:]) 31 | arr = np.empty(shape, dtype=np.float64) 32 | arr[0, :-1] = self.sum 33 | arr[1, :-1] = self.sum_squares 34 | arr[0, -1] = self.count 35 | arr[1, -1] = 0.0 36 | return arr 37 | 38 | 39 | def str2bool(value: str) -> bool: 40 | return bool(strtobool(value)) 41 | 42 | 43 | def int_or_none(value: str) -> Optional[int]: 44 | """int_or_none. 45 | 46 | Examples: 47 | >>> import argparse 48 | >>> parser = argparse.ArgumentParser() 49 | >>> _ = parser.add_argument('--foo', type=int_or_none) 50 | >>> parser.parse_args(['--foo', '456']) 51 | Namespace(foo=456) 52 | >>> parser.parse_args(['--foo', 'none']) 53 | Namespace(foo=None) 54 | >>> parser.parse_args(['--foo', 'null']) 55 | Namespace(foo=None) 56 | >>> parser.parse_args(['--foo', 'nil']) 57 | Namespace(foo=None) 58 | 59 | """ 60 | if value.strip().lower() in ("none", "null", "nil"): 61 | return None 62 | return int(value) 63 | 64 | 65 | def float_or_none(value: str) -> Optional[float]: 66 | """float_or_none. 
67 | 
68 |     Examples:
69 |         >>> import argparse
70 |         >>> parser = argparse.ArgumentParser()
71 |         >>> _ = parser.add_argument('--foo', type=float_or_none)
72 |         >>> parser.parse_args(['--foo', '4.5'])
73 |         Namespace(foo=4.5)
74 |         >>> parser.parse_args(['--foo', 'none'])
75 |         Namespace(foo=None)
76 |         >>> parser.parse_args(['--foo', 'null'])
77 |         Namespace(foo=None)
78 |         >>> parser.parse_args(['--foo', 'nil'])
79 |         Namespace(foo=None)
80 | 
81 |     """
82 |     if value.strip().lower() in ("none", "null", "nil"):
83 |         return None
84 |     return float(value)
85 | 
86 | 
87 | def humanfriendly_or_none(value) -> Optional[float]:
88 |     if value.strip().lower() in ("none", "null", "nil"):
89 |         return None
90 |     return humanfriendly.parse_size(value)
91 | 
92 | 
93 | def str2int_tuple(integers: str) -> Optional[Tuple[int, ...]]:
94 |     """
95 | 
96 |     >>> str2int_tuple('3,4,5')
97 |     (3, 4, 5)
98 | 
99 |     """
100 |     assert check_argument_types()
101 |     if integers.strip() in ("none", "None", "NONE", "null", "Null", "NULL"):
102 |         return None
103 |     return tuple(map(int, integers.strip().split(",")))
104 | 
105 | 
106 | def str_or_int(value: str) -> Union[str, int]:
107 |     try:
108 |         return int(value)
109 |     except ValueError:
110 |         return value
111 | 
112 | 
113 | def str_or_none(value: str) -> Optional[str]:
114 |     """str_or_none.
115 | 
116 |     Examples:
117 |         >>> import argparse
118 |         >>> parser = argparse.ArgumentParser()
119 |         >>> _ = parser.add_argument('--foo', type=str_or_none)
120 |         >>> parser.parse_args(['--foo', 'aaa'])
121 |         Namespace(foo='aaa')
122 |         >>> parser.parse_args(['--foo', 'none'])
123 |         Namespace(foo=None)
124 |         >>> parser.parse_args(['--foo', 'null'])
125 |         Namespace(foo=None)
126 |         >>> parser.parse_args(['--foo', 'nil'])
127 |         Namespace(foo=None)
128 | 
129 |     """
130 |     if value.strip().lower() in ("none", "null", "nil"):
131 |         return None
132 |     return value
133 | 
--------------------------------------------------------------------------------
/speech_datasets/transform/cmvn.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 | 
5 | import numpy as np
6 | 
7 | from speech_datasets.transform.interface import TransformInterface
8 | from speech_datasets.utils import get_root
9 | from speech_datasets.utils.readers import read_cmvn_stats
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | class CMVN(TransformInterface):
15 |     def __init__(self, cmvn_type: str, stats: str = None, norm_means=True,
16 |                  norm_vars=False, utt2spk: str = None, reverse=False,
17 |                  std_floor=1.0e-20):
18 |         self.norm_means = norm_means
19 |         self.norm_vars = norm_vars
20 |         self.reverse = reverse
21 |         self.std_floor = std_floor
22 | 
23 |         assert cmvn_type in ["global", "speaker", "utterance"], cmvn_type
24 |         self.accept_uttid = (cmvn_type != "global")
25 |         self.cmvn_type = cmvn_type
26 |         if cmvn_type != "utterance":
27 |             assert stats is not None, "stats required if cmvn_type != 'utterance'"
28 |             try:
29 |                 self.stats_file = stats
30 |                 stats_dict = read_cmvn_stats(self.stats_file, cmvn_type)
31 |             except FileNotFoundError:
32 |                 self.stats_file = os.path.join(get_root(), stats)
33 |                 stats_dict = read_cmvn_stats(self.stats_file, cmvn_type)
34 |         else:
35 |             if stats is not None:
36 |                 logger.warning("stats file is not used if cmvn_type is 'utterance'")
37 |             self.stats_file = None
38 |             stats_dict = {}
39 | 
40 |         if cmvn_type == "speaker":
41 |             assert utt2spk is not None, "utt2spk required if cmvn_type is 'speaker'"
42 |             self.utt2spk = {}
43 |             with io.open(utt2spk, "r", encoding="utf-8") as f:
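107 | # Worked example of the statistics above (numbers hypothetical): with count=2,
108 | # sum=[4.0] and sum_squares=[10.0], mean = 4/2 = 2.0 and var = 10/2 - 2.0**2 = 1.0,
109 | # so with norm_means and norm_vars enabled a feature value x = 3.0 is mapped to
110 | # (3.0 - 2.0) * (1 / sqrt(1.0)) = 1.0.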
encoding="utf-8") as f: 44 | for line in f: 45 | utt, spk = line.rstrip().split(None, maxsplit=1) 46 | self.utt2spk[utt] = spk 47 | else: 48 | if utt2spk is not None: 49 | logger.warning("utt2spk is only used if cmvn_type is 'speaker'") 50 | self.utt2spk = None 51 | 52 | # Kaldi makes a matrix for CMVN which has a shape of (2, feat_dim + 1), 53 | # and the first vector contains the sum of feats and the second is 54 | # the sum of squares. The last value of the first, i.e. stats[0,-1], 55 | # is the number of samples for this statistics. 56 | self.bias = {} 57 | self.scale = {} 58 | for spk, stats in stats_dict.items(): 59 | # Var[x] = E[x^2] - E[x]^2 60 | mean = stats.sum / stats.count 61 | var = stats.sum_squares / stats.count - mean * mean 62 | std = np.maximum(np.sqrt(var), std_floor) 63 | self.bias[spk] = -mean 64 | self.scale[spk] = 1 / std 65 | 66 | def __repr__(self): 67 | return ( 68 | "{name}(stats_file={stats_file}, " 69 | "norm_means={norm_means}, norm_vars={norm_vars}, " 70 | "reverse={reverse})".format( 71 | name=self.__class__.__name__, 72 | stats_file=self.stats_file, 73 | norm_means=self.norm_means, 74 | norm_vars=self.norm_vars, 75 | reverse=self.reverse, 76 | ) 77 | ) 78 | 79 | def __call__(self, x, uttid=None): 80 | if self.cmvn_type == "global": 81 | bias = self.bias[None] 82 | scale = self.scale[None] 83 | elif self.cmvn_type == "speaker": 84 | spk = self.utt2spk[uttid] 85 | bias = self.bias[spk] 86 | scale = self.scale[spk] 87 | else: # self.cmvn_type == "utterance" 88 | mean = x.mean(axis=0) 89 | mse = (x ** 2).sum(axis=0) / x.shape[0] 90 | bias = -mean 91 | scale = 1 / np.maximum(np.sqrt(mse - mean ** 2), self.std_floor) 92 | 93 | if not self.reverse: 94 | if self.norm_means: 95 | x = np.add(x, bias) 96 | if self.norm_vars: 97 | x = np.multiply(x, scale) 98 | 99 | else: 100 | if self.norm_vars: 101 | x = np.divide(x, scale) 102 | if self.norm_means: 103 | x = np.subtract(x, bias) 104 | 105 | return x 106 | -------------------------------------------------------------------------------- /TEMPLATE/asr1/cmd.sh: -------------------------------------------------------------------------------- 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== 2 | # Usage: .pl [options] JOB=1: 3 | # e.g. 4 | # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB 5 | # 6 | # Options: 7 | # --time