├── COMBINE
│   ├── tts1
│   │   ├── cmd.sh
│   │   ├── db.sh
│   │   ├── run.sh
│   │   ├── utils
│   │   ├── path.sh
│   │   ├── multi_tokenize.sh
│   │   ├── combine_cmvn_stats.sh
│   │   ├── combine_train_data.sh
│   │   ├── conf
│   │   │   ├── fbank.yaml
│   │   │   ├── fbank_pitch.yaml
│   │   │   ├── pbs.conf
│   │   │   ├── gpu.conf
│   │   │   ├── queue.conf
│   │   │   └── slurm.conf
│   │   └── local
│   │       └── combine_datasets.py
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── cmd.sh
│       ├── path.sh
│       ├── run.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── combine_train_data.sh
│       ├── multi_tokenize.sh
│       ├── combine_cmvn_stats.sh
│       └── local
│           └── combine_datasets.py
├── TEMPLATE
│   ├── asr1
│   │   ├── utils
│   │   ├── conf
│   │   │   ├── fbank.yaml
│   │   │   ├── fbank_pitch.yaml
│   │   │   ├── pbs.conf
│   │   │   ├── queue.conf
│   │   │   └── slurm.conf
│   │   ├── setup.sh
│   │   ├── path.sh
│   │   ├── db.sh
│   │   └── cmd.sh
│   └── tts1
│       ├── db.sh
│       ├── utils
│       ├── cmd.sh
│       ├── path.sh
│       ├── tts.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       └── setup.sh
├── speech_datasets
│   ├── bin
│   │   ├── __init__.py
│   │   ├── spm_train.py
│   │   ├── combine_cmvn_stats.py
│   │   ├── feat_to_shape.py
│   │   ├── dump.py
│   │   ├── apply_cmvn.py
│   │   └── compute_cmvn_stats.py
│   ├── text
│   │   ├── __init__.py
│   │   └── tokenizers.py
│   ├── transform
│   │   ├── __init__.py
│   │   ├── add_deltas.py
│   │   ├── interface.py
│   │   └── cmvn.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── misc.py
│   │   └── types.py
│   └── __init__.py
├── fisher
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           └── data.sh
├── swbd
│   └── asr1
│       ├── asr.sh
│       ├── cmd.sh
│       ├── db.sh
│       ├── utils
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── local
│       │   ├── MSU_single_letter.txt
│       │   ├── map_acronyms_transcripts.py
│       │   ├── swbd1_map_words.pl
│       │   ├── swbd1_fix_speakerid.pl
│       │   ├── data.sh
│       │   ├── extend_segments.pl
│       │   ├── swbd1_prepare_dict.sh
│       │   ├── rt03_data_prep.sh
│       │   └── format_acronyms_dict.py
│       └── run.sh
├── wsj
│   └── asr1
│       ├── asr.sh
│       ├── cmd.sh
│       ├── db.sh
│       ├── path.sh
│       ├── utils
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── flist2scp.pl
│           ├── wsj_format_data.sh
│           ├── data.sh
│           ├── find_transcripts.pl
│           ├── ndx2flist.pl
│           └── normalize_transcript.pl
├── commonvoice
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── filter_text.py
│           ├── data.sh
│           ├── download_and_untar.sh
│           ├── reduce_data_dir.sh
│           ├── split_tr_dt_et.sh
│           └── data_prep.pl
├── example
│   ├── requirements.txt
│   ├── resources
│   │   ├── global_cmvn_fbank.ark
│   │   ├── librispeech_bpe2000.model
│   │   ├── global_cmvn_fbank_pitch.ark
│   │   ├── fbank.yaml
│   │   └── fbank_pitch.yaml
│   ├── utils.py
│   └── README.md
├── librispeech
│   └── asr1
│       ├── db.sh
│       ├── utils
│       ├── asr.sh
│       ├── cmd.sh
│       ├── path.sh
│       ├── conf
│       │   ├── fbank.yaml
│       │   ├── fbank_pitch.yaml
│       │   ├── pbs.conf
│       │   ├── gpu.conf
│       │   ├── queue.conf
│       │   └── slurm.conf
│       ├── run.sh
│       └── local
│           ├── download_and_untar.sh
│           └── data.sh
├── .gitignore
├── CODEOWNERS
├── SECURITY.md
├── tools
│   ├── install_sph2pipe.sh
│   ├── install_pkgs.sh
│   └── install_anaconda.sh
├── setup.py
├── utils
│   ├── make_absolute.sh
│   ├── spk2utt_to_utt2spk.pl
│   ├── utt2spk_to_spk2utt.pl
│   ├── shuffle_list.pl
│   ├── compute_cmvn_stats.sh
│   ├── remove_dup_utts.sh
│   ├── feat_to_shape.sh
│   ├── subset_data_dir_tr_cv.sh
│   ├── apply_cmvn.sh
│   ├── filter_scp.pl
│   ├── subset_scp.pl
│   ├── apply_map.pl
│   ├── sym2int.pl
│   ├── parse_options.sh
│   ├── combine_data.sh
│   └── dump.sh
├── docker
│   └── Dockerfile
└── Makefile
/COMBINE/tts1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../asr1/cmd.sh
--------------------------------------------------------------------------------
/COMBINE/tts1/db.sh:
--------------------------------------------------------------------------------
1 | ../asr1/db.sh
--------------------------------------------------------------------------------
/COMBINE/tts1/run.sh:
--------------------------------------------------------------------------------
1 | ../asr1/run.sh
--------------------------------------------------------------------------------
/COMBINE/tts1/utils:
--------------------------------------------------------------------------------
1 | ../asr1/utils
--------------------------------------------------------------------------------
/TEMPLATE/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../utils
--------------------------------------------------------------------------------
/TEMPLATE/tts1/db.sh:
--------------------------------------------------------------------------------
1 | ../asr1/db.sh
--------------------------------------------------------------------------------
/TEMPLATE/tts1/utils:
--------------------------------------------------------------------------------
1 | ../asr1/utils
--------------------------------------------------------------------------------
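The one-line file bodies in this dump (here and in the similar sections below) are relative symlink targets: each recipe keeps one real copy of its scripts, in TEMPLATE or in the sibling asr1 directory, and links everything else to it. A minimal sketch of recreating such links, with a hypothetical `links` mapping standing in for the path/target pairs listed in this dump:

    import os

    # Hypothetical mapping: link path -> relative target, as listed in this dump.
    links = {
        "COMBINE/tts1/cmd.sh": "../asr1/cmd.sh",
        "TEMPLATE/asr1/utils": "../../utils",
    }

    for path, target in links.items():
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if not os.path.lexists(path):
            # The target is resolved relative to the link's own directory.
            os.symlink(target, path)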
/speech_datasets/bin/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/COMBINE/tts1/path.sh:
--------------------------------------------------------------------------------
1 | ../asr1/path.sh
--------------------------------------------------------------------------------
/TEMPLATE/tts1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../asr1/cmd.sh
--------------------------------------------------------------------------------
/TEMPLATE/tts1/path.sh:
--------------------------------------------------------------------------------
1 | ../asr1/path.sh
--------------------------------------------------------------------------------
/TEMPLATE/tts1/tts.sh:
--------------------------------------------------------------------------------
1 | ../asr1/asr.sh
--------------------------------------------------------------------------------
/COMBINE/asr1/db.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/db.sh
--------------------------------------------------------------------------------
/COMBINE/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/utils
--------------------------------------------------------------------------------
/fisher/asr1/db.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/db.sh
--------------------------------------------------------------------------------
/fisher/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/utils
--------------------------------------------------------------------------------
/swbd/asr1/asr.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/asr.sh
--------------------------------------------------------------------------------
/swbd/asr1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/cmd.sh
--------------------------------------------------------------------------------
/swbd/asr1/db.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/db.sh
--------------------------------------------------------------------------------
/swbd/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/utils
--------------------------------------------------------------------------------
/wsj/asr1/asr.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/asr.sh
--------------------------------------------------------------------------------
/wsj/asr1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/cmd.sh
--------------------------------------------------------------------------------
/wsj/asr1/db.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/db.sh
--------------------------------------------------------------------------------
/wsj/asr1/path.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/path.sh
--------------------------------------------------------------------------------
/wsj/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/utils
--------------------------------------------------------------------------------
/COMBINE/asr1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/cmd.sh
--------------------------------------------------------------------------------
/COMBINE/asr1/path.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/path.sh
--------------------------------------------------------------------------------
/COMBINE/asr1/run.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/asr.sh
--------------------------------------------------------------------------------
/commonvoice/asr1/db.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/db.sh
--------------------------------------------------------------------------------
/commonvoice/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/utils
--------------------------------------------------------------------------------
/example/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==3.4.0
2 |
--------------------------------------------------------------------------------
/fisher/asr1/asr.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/asr.sh
--------------------------------------------------------------------------------
/fisher/asr1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/cmd.sh
--------------------------------------------------------------------------------
/fisher/asr1/path.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/path.sh
--------------------------------------------------------------------------------
/librispeech/asr1/db.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/db.sh
--------------------------------------------------------------------------------
/librispeech/asr1/utils:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/utils
--------------------------------------------------------------------------------
/swbd/asr1/path.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/path.sh
--------------------------------------------------------------------------------
/commonvoice/asr1/asr.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/asr.sh
--------------------------------------------------------------------------------
/commonvoice/asr1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/cmd.sh
--------------------------------------------------------------------------------
/commonvoice/asr1/path.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/path.sh
--------------------------------------------------------------------------------
/librispeech/asr1/asr.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/asr.sh
--------------------------------------------------------------------------------
/librispeech/asr1/cmd.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/cmd.sh
--------------------------------------------------------------------------------
/librispeech/asr1/path.sh:
--------------------------------------------------------------------------------
1 | ../../TEMPLATE/asr1/path.sh
--------------------------------------------------------------------------------
/COMBINE/tts1/multi_tokenize.sh:
--------------------------------------------------------------------------------
1 | ../asr1/multi_tokenize.sh
--------------------------------------------------------------------------------
/COMBINE/tts1/combine_cmvn_stats.sh:
--------------------------------------------------------------------------------
1 | ../asr1/combine_cmvn_stats.sh
--------------------------------------------------------------------------------
/COMBINE/tts1/combine_train_data.sh:
--------------------------------------------------------------------------------
1 | ../asr1/combine_train_data.sh
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.pyc
3 | .DS_Store
4 | .idea/
5 | cmake-build-debug/
6 |
--------------------------------------------------------------------------------
/fisher/asr1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 8000
4 |
--------------------------------------------------------------------------------
/swbd/asr1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 8000
4 |
--------------------------------------------------------------------------------
/wsj/asr1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/COMBINE/asr1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/COMBINE/tts1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/TEMPLATE/asr1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/TEMPLATE/tts1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/commonvoice/asr1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/librispeech/asr1/conf/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/swbd/asr1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 8000
4 |
--------------------------------------------------------------------------------
/wsj/asr1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/COMBINE/asr1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/COMBINE/tts1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/TEMPLATE/asr1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/TEMPLATE/tts1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/fisher/asr1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 8000
4 |
--------------------------------------------------------------------------------
/commonvoice/asr1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/librispeech/asr1/conf/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 |
--------------------------------------------------------------------------------
/speech_datasets/text/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub-package"""
2 | from speech_datasets.text.tokenizers import SentencepieceTokenizer
3 |
--------------------------------------------------------------------------------
/example/resources/global_cmvn_fbank.ark:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/global_cmvn_fbank.ark
--------------------------------------------------------------------------------
/example/resources/librispeech_bpe2000.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/librispeech_bpe2000.model
--------------------------------------------------------------------------------
/speech_datasets/transform/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize main package."""
2 | from speech_datasets.transform.transformation import Transformation
3 |
--------------------------------------------------------------------------------
/example/resources/global_cmvn_fbank_pitch.ark:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/salesforce/speech-datasets/HEAD/example/resources/global_cmvn_fbank_pitch.ark
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing.
2 | #ECCN:Open Source
3 |
--------------------------------------------------------------------------------
/speech_datasets/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize sub package & bring general util into this namespace."""
2 | from speech_datasets.utils.misc import get_root, check_kwargs, dynamic_import, set_deterministic_pytorch
3 |
--------------------------------------------------------------------------------
/speech_datasets/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize main package."""
2 | import pkg_resources
3 | from speech_datasets.dataloader import SpeechDataLoader
4 |
5 | try:
6 | __version__ = pkg_resources.get_distribution("speech_datasets").version
7 | except Exception:
8 | __version__ = "(Not installed from setup.py)"
9 | del pkg_resources
10 |
--------------------------------------------------------------------------------
/example/resources/fbank.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 | - type: cmvn
5 | cmvn_type: global
6 | stats: resources/global_cmvn_fbank.ark
7 | norm_vars: True
8 | - type: spec_augment
9 | n_freq_mask: 2
10 | max_freq_width: 27
11 | n_time_mask: 2
12 | max_time_width: 100
13 | max_time_warp: 80
14 |
--------------------------------------------------------------------------------
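This config chains three transforms: 80-dim fbank extraction at 16 kHz, global CMVN from precomputed stats, and SpecAugment masking/warping. The Transformation class imported in speech_datasets/transform/__init__.py is presumably the consumer; since its API is not shown in this dump, the sketch below only demonstrates the file's structure with plain PyYAML:

    import yaml

    # Each list entry is a dict whose "type" names a transform; the remaining
    # keys are that transform's keyword arguments.
    with open("example/resources/fbank.yaml") as f:
        pipeline = yaml.safe_load(f)

    for step in pipeline:
        kwargs = {k: v for k, v in step.items() if k != "type"}
        print(step["type"], kwargs)
    # fbank {'num_mel_bins': 80, 'sample_frequency': 16000}
    # cmvn {'cmvn_type': 'global', 'stats': 'resources/global_cmvn_fbank.ark', 'norm_vars': True}
    # spec_augment {'n_freq_mask': 2, 'max_freq_width': 27, ...}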
/swbd/asr1/local/MSU_single_letter.txt:
--------------------------------------------------------------------------------
1 | A ey
2 | B b iy
3 | C s iy
4 | D d iy
5 | E iy
6 | F eh f
7 | G jh iy
8 | H ey ch
9 | I ay
10 | J jh ey
11 | K k ey
12 | L eh l
13 | M eh m
14 | N eh n
15 | O ow
16 | P p iy
17 | Q k y uw
18 | R aa r
19 | S eh s
20 | T t iy
21 | U y uw
22 | V v iy
23 | W d ah b ax l y uw
24 | X eh k s
25 | Y w ay
26 | Z z iy
27 |
--------------------------------------------------------------------------------
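MSU_single_letter.txt maps each spelled letter to its phone sequence, presumably for use by the acronym-handling scripts alongside it in swbd/asr1/local. A small parsing sketch, assuming the two-column `LETTER phone ...` layout shown above:

    # Parse lines like "A ey" and "W d ah b ax l y uw" into {letter: [phones]}.
    letter2phones = {}
    with open("swbd/asr1/local/MSU_single_letter.txt") as f:
        for line in f:
            fields = line.split()
            if fields:
                letter2phones[fields[0]] = fields[1:]

    assert letter2phones["W"] == ["d", "ah", "b", "ax", "l", "y", "uw"]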
/example/resources/fbank_pitch.yaml:
--------------------------------------------------------------------------------
1 | - type: fbank_pitch
2 | num_mel_bins: 80
3 | sample_frequency: 16000
4 | - type: cmvn
5 | cmvn_type: global
6 | stats: resources/global_cmvn_fbank_pitch.ark
7 | norm_vars: True
8 | - type: spec_augment
9 | n_freq_mask: 2
10 | max_freq_width: 27
11 | n_time_mask: 2
12 | max_time_width: 100
13 | max_time_warp: 80
14 |
--------------------------------------------------------------------------------
/wsj/asr1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/COMBINE/asr1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/COMBINE/tts1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/TEMPLATE/asr1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/TEMPLATE/tts1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/fisher/asr1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/swbd/asr1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/commonvoice/asr1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
/librispeech/asr1/conf/pbs.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -V -v PATH -S /bin/bash
3 | option name=* -N $0
4 | option mem=* -l mem=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -l ncpus=$0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option num_nodes=* -l nodes=$0:ppn=1
9 | default gpu=0
10 | option gpu=0
11 | option gpu=* -l ngpus=$0
12 |
--------------------------------------------------------------------------------
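These pbs.conf files (and the queue.conf/slurm.conf files below) use Kaldi's cmd-wrapper config format: `command` gives the base submission command, and each `option key=value flags` line maps a `--key value` argument onto scheduler flags, with `$0` replaced by the value; an exact match like `option gpu=0` takes precedence over the wildcard `option gpu=*`. So under pbs.conf, `--gpu 2` becomes `-l ngpus=2` while `--gpu 0` adds nothing. A rough Python sketch of that substitution rule (illustrative only; the real parsing lives in Kaldi's queue.pl/slurm.pl family):

    # Illustrative expansion of pbs.conf-style rules such as "option gpu=* -l ngpus=$0".
    def expand(option_rules, key, value):
        exact = (key, str(value))
        if exact in option_rules:            # e.g. "option gpu=0" -> no extra flags
            return option_rules[exact]
        wildcard = (key, "*")
        if wildcard in option_rules:         # e.g. "option gpu=*" -> "-l ngpus=$0"
            return option_rules[wildcard].replace("$0", str(value))
        raise KeyError(key)

    rules = {("gpu", "0"): "", ("gpu", "*"): "-l ngpus=$0", ("mem", "*"): "-l mem=$0"}
    assert expand(rules, "gpu", 2) == "-l ngpus=2"
    assert expand(rules, "gpu", 0) == ""
    assert expand(rules, "mem", "4G") == "-l mem=4G"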
/SECURITY.md:
--------------------------------------------------------------------------------
1 | ## Security
2 |
3 | Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com)
4 | as soon as it is discovered. This library limits its runtime dependencies in
5 | order to reduce the total cost of ownership as much as possible, but all consumers
6 | should remain vigilant and have their security stakeholders review all third-party
7 | products (3PP) like this one and their dependencies.
8 |
--------------------------------------------------------------------------------
/speech_datasets/bin/spm_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the license found in the
6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE
7 | import sys
8 |
9 | import sentencepiece as spm
10 |
11 |
12 | if __name__ == "__main__":
13 | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
14 |
--------------------------------------------------------------------------------
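spm_train.py simply forwards its command-line arguments to SentencePiece's trainer, so the same call can be made directly from Python. A minimal example with standard SentencePiece flags (the input and output paths here are placeholders, not paths from this repo):

    import sentencepiece as spm

    # Equivalent to: spm_train.py --input=... --model_prefix=... --vocab_size=... --model_type=bpe
    spm.SentencePieceTrainer.Train(
        "--input=data/train/text "     # placeholder corpus path
        "--model_prefix=bpe2000 "      # writes bpe2000.model / bpe2000.vocab
        "--vocab_size=2000 "
        "--model_type=bpe"
    )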
/wsj/asr1/conf/gpu.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option mem=* -l mem_free=$0,ram_free=$0
4 | option mem=0 # Do not add anything to qsub_opts
5 | option num_threads=* -pe smp $0
6 | option num_threads=1 # Do not add anything to qsub_opts
7 | option max_jobs_run=* -tc $0
8 | default gpu=0
9 | option gpu=0
10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
--------------------------------------------------------------------------------
/COMBINE/asr1/conf/gpu.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option mem=* -l mem_free=$0,ram_free=$0
4 | option mem=0 # Do not add anything to qsub_opts
5 | option num_threads=* -pe smp $0
6 | option num_threads=1 # Do not add anything to qsub_opts
7 | option max_jobs_run=* -tc $0
8 | default gpu=0
9 | option gpu=0
10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
--------------------------------------------------------------------------------
/COMBINE/tts1/conf/gpu.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option mem=* -l mem_free=$0,ram_free=$0
4 | option mem=0 # Do not add anything to qsub_opts
5 | option num_threads=* -pe smp $0
6 | option num_threads=1 # Do not add anything to qsub_opts
7 | option max_jobs_run=* -tc $0
8 | default gpu=0
9 | option gpu=0
10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
--------------------------------------------------------------------------------
/swbd/asr1/conf/gpu.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option mem=* -l mem_free=$0,ram_free=$0
4 | option mem=0 # Do not add anything to qsub_opts
5 | option num_threads=* -pe smp $0
6 | option num_threads=1 # Do not add anything to qsub_opts
7 | option max_jobs_run=* -tc $0
8 | default gpu=0
9 | option gpu=0
10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
--------------------------------------------------------------------------------
/librispeech/asr1/conf/gpu.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option mem=* -l mem_free=$0,ram_free=$0
4 | option mem=0 # Do not add anything to qsub_opts
5 | option num_threads=* -pe smp $0
6 | option num_threads=1 # Do not add anything to qsub_opts
7 | option max_jobs_run=* -tc $0
8 | default gpu=0
9 | option gpu=0
10 | option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
--------------------------------------------------------------------------------
/tools/install_sph2pipe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | if [ $# != 1 ]; then
5 | echo "Usage: $0
"
6 | exit 1;
7 | fi
8 | pwd=$PWD
9 | dir=$1
10 |
11 | if [ ! -e sph2pipe_v2.5.tar.gz ]; then
12 | wget --no-check-certificate https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
13 | fi
14 |
15 | tar xzvf sph2pipe_v2.5.tar.gz -C $dir
16 | rm sph2pipe_v2.5.tar.gz
17 |
18 | cd $dir/sph2pipe_v2.5
19 | gcc -o sph2pipe *.c -lm
20 | cd $pwd
21 |
--------------------------------------------------------------------------------
/wsj/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/COMBINE/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/COMBINE/tts1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/TEMPLATE/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/TEMPLATE/tts1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/fisher/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/swbd/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/commonvoice/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/librispeech/asr1/conf/queue.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
3 | option name=* -N $0
4 | option mem=* -l mem_free=$0,ram_free=$0
5 | option mem=0 # Do not add anything to qsub_opts
6 | option num_threads=* -pe smp $0
7 | option num_threads=1 # Do not add anything to qsub_opts
8 | option max_jobs_run=* -tc $0
9 | option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
10 | default gpu=0
11 | option gpu=0
12 | option gpu=* -l gpu=$0 -q g.q
13 |
--------------------------------------------------------------------------------
/fisher/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/swbd/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/wsj/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/COMBINE/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/COMBINE/tts1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/TEMPLATE/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/TEMPLATE/tts1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/commonvoice/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/librispeech/asr1/conf/slurm.conf:
--------------------------------------------------------------------------------
1 | # Default configuration
2 | command sbatch --export=PATH
3 | option name=* --job-name $0
4 | option time=* --time $0
5 | option mem=* --mem-per-cpu $0
6 | option mem=0 # Do not add anything to qsub_opts
7 | option num_threads=* --cpus-per-task $0
8 | option num_threads=1 --cpus-per-task 1
9 | option num_nodes=* --nodes $0
10 | default gpu=0
11 | option gpu=0 -p cpu
12 | option gpu=* -p gpu --gres=gpu:$0
13 | # note: the --max-jobs-run option is supported as a special case
14 | # by slurm.pl and you don't have to handle it in the config file.
15 |
--------------------------------------------------------------------------------
/fisher/asr1/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode; it will exit on:
3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 |
13 | srctexts="data/train_fisher/text "
14 | train_sets="train_fisher "
15 |
16 | ./asr.sh \
17 | --fs 8000 \
18 | --n_tokens 2000 \
19 | --token_type bpe \
20 | --train_sets "${train_sets}" \
21 | --dev_eval_sets "" \
22 | --srctexts "${srctexts}" "$@"
23 |
--------------------------------------------------------------------------------
/commonvoice/asr1/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode; it will exit on:
3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru
9 |
10 | train_set=valid_train_${lang}
11 | train_dev=valid_dev_${lang}
12 | train_test=valid_test_${lang}
13 |
14 | ./asr.sh \
15 | --local_data_opts "--lang ${lang}" \
16 | --fs 16000 \
17 | --n_tokens 2000 \
18 | --token_type bpe \
19 | --feats_type fbank_pitch \
20 | --train_sets "${train_set}" \
21 | --dev_eval_sets "${train_dev} ${train_test}" \
22 | --srctexts "data/${train_set}/text" "$@"
23 |
--------------------------------------------------------------------------------
/swbd/asr1/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode; it will exit on:
3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 |
13 | train_sets="swbd1_train "
14 | dev_set="swbd1_dev"
15 | eval_sets="eval2000 rt03 "
16 | srctexts="data/swbd1_train/text "
17 |
18 | ./asr.sh \
19 | --fs 8000 \
20 | --n_tokens 2000 \
21 | --token_type bpe \
22 | --train_sets "${train_sets}" \
23 | --dev_eval_sets "${dev_set} ${eval_sets}" \
24 | --srctexts "${srctexts}" "$@"
25 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 | name="speech_datasets",
5 | version="0.1.0",
6 | author="Aadyot Bhatnagar",
7 | author_email="abhatnagar@salesforce.com",
8 | license="Apache-2.0",
9 | packages=find_packages(include=["speech_datasets*"]),
10 | install_requires=[
11 | "h5py>=2.9.0",
12 | "humanfriendly",
13 | "Kaldiio",
14 | "numpy",
15 | "pillow>=6.1.0",
16 | "PyYAML>=5.1.2",
17 | "ray[tune]",
18 | "resampy",
19 | "scipy",
20 | "sentencepiece<0.1.90,>=0.1.82",
21 | "soundfile>=0.10.2",
22 | "torch>=1.2.0",
23 | "tqdm",
24 | "typeguard>=2.7.0",
25 | ]
26 | )
27 |
--------------------------------------------------------------------------------
/wsj/asr1/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode; it will exit on:
3 | # -e 'error', -u 'unbound variable', -o pipefail 'error in pipeline'
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | train_set="train_si284 "
9 | dev_set="test_dev93 "
10 | eval_sets="test_eval92 "
11 |
12 | # Even though data/nlsyms.txt is generated, we don't provide it to asr.sh
13 | # because the only non-linguistic symbol it contains is "<NOISE>", which
14 | # is the default value for nlsyms.
15 | ./asr.sh \
16 | --fs 16000 \
17 | --n_tokens 75 \
18 | --token_type bpe \
19 | --train_sets "${train_set}" \
20 | --dev_eval_sets "${dev_set} ${eval_sets}" \
21 | --srctexts "data/train_si284/text data/local/other_text/text" "$@"
22 |
--------------------------------------------------------------------------------
/utils/make_absolute.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # This script replaces the command readlink -f (which is not portable).
4 | # It turns a pathname into an absolute pathname, including following soft links.
5 | target_file=$1
6 |
7 | cd $(dirname $target_file)
8 | target_file=$(basename $target_file)
9 |
10 | # Iterate down a (possible) chain of symlinks
11 | while [ -L "$target_file" ]; do
12 | target_file=$(readlink $target_file)
13 | cd $(dirname $target_file)
14 | target_file=$(basename $target_file)
15 | done
16 |
17 | # Compute the canonicalized name by finding the physical path
18 | # for the directory we're in and appending the target file.
19 | phys_dir=$(pwd -P)
20 | result=$phys_dir/$target_file
21 | echo $result
22 |
--------------------------------------------------------------------------------
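make_absolute.sh is a portable stand-in for `readlink -f`. In Python the same canonicalization is a one-liner; a quick sketch for comparison:

    import os

    # os.path.realpath follows symlink chains and returns the physical path,
    # matching what make_absolute.sh computes with `pwd -P`.
    def make_absolute(path: str) -> str:
        return os.path.realpath(path)

    print(make_absolute("COMBINE/tts1/cmd.sh"))  # e.g. /.../COMBINE/asr1/cmd.sh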
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:latest
2 | RUN apt-get update
3 | RUN apt-get install -y --no-install-recommends \
4 | apt-utils wget bc gawk vim emacs build-essential locales libfontconfig1 automake \
5 | sox flac ffmpeg libasound2-dev libsndfile1-dev \
6 | libfftw3-dev libopenblas-dev libgflags-dev libgoogle-glog-dev gfortran \
7 | python3 python3-dev python3-pip python3-numpy python3-setuptools
8 | RUN apt update
9 | RUN apt install -y openssh-server openssh-client
10 |
11 | # Default to utf-8 encodings in python
12 | # Can verify in container with:
13 | # python -c 'import locale; print(locale.getpreferredencoding(False))'
14 | RUN locale-gen en_US.UTF-8
15 | ENV LANG en_US.UTF-8
16 | ENV LANGUAGE en_US:en
17 | ENV LC_ALL en_US.UTF-8
18 |
--------------------------------------------------------------------------------
/tools/install_pkgs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode; it will exit on:
3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
4 | set -euo pipefail
5 |
6 | # This is needed for certain pods (ffmpeg-3 doesn't exist anymore & messes up apt gets)
7 | rm -f /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-3*
8 | apt-get remove libflac8 -y
9 | apt-get update -y
10 | apt-get upgrade -y
11 | apt-get autoremove -y
12 |
13 | # The actual apt installs we need
14 | apt-get install -y apt-utils
15 | apt-get install -y gawk
16 | apt-get install -y build-essential libfontconfig1 automake
17 | apt-get install -y sox flac ffmpeg libasound2-dev libsndfile1-dev
18 | apt-get install -y libfftw3-dev libopenblas-dev libgflags-dev libgoogle-glog-dev
19 | apt-get install -y gfortran python3
20 | apt-get install -y bc
21 | apt-get install -y wget
22 |
--------------------------------------------------------------------------------
/commonvoice/asr1/local/filter_text.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
5 |
6 | import argparse
7 | import codecs
8 | from io import open
9 | import sys
10 |
11 |
12 | sys.stdin = codecs.getreader("utf-8")(sys.stdin.buffer)
13 | sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer)
14 |
15 |
16 | if __name__ == "__main__":
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument("--filter-list", "-f", type=str, help="filter list")
19 | args = parser.parse_args()
20 |
21 | with open(args.filter_list, encoding="utf-8") as f:
22 | fil = [x.rstrip() for x in f]
23 |
24 | for x in sys.stdin:
25 | # extract text parts
26 | text = " ".join(x.rstrip().split()[1:])
27 | if text in fil:
28 | print(x.split()[0], text)
29 |
--------------------------------------------------------------------------------
/fisher/asr1/local/data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -u
4 | set -o pipefail
5 |
6 | log() {
7 | local fname=${BASH_SOURCE[1]##*/}
8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 | }
10 |
11 | . ./path.sh || exit 1
12 | . ./db.sh || exit 1
13 |
14 |
15 | # Extract & prepare Fisher
16 | for (( i=1; i<=$(echo "${FISHER_TGZ}" | wc -w); i++ )); do
17 | src=$(echo "${FISHER_TGZ}" | cut -d " " -f $i)
18 | dst=$(echo "${FISHER}" | cut -d " " -f $i)
19 | if [ ! -e "${dst}" ]; then
20 | mkdir -p "${dst}"
21 | {
22 | tar xzvf "${src}" -C "${dst}"
23 | } || {
24 | log "Failed to extract FISHER (part $i)"
25 | exit 1
26 | }
27 | fi
28 | done
29 |
30 | # Note: do not quote ${FISHER} -- it should contain 4 directories, and fisher_data_prep.sh needs all 4
31 | log "local/fisher_data_prep.sh ${FISHER}"
32 | local/fisher_data_prep.sh ${FISHER}
--------------------------------------------------------------------------------
/utils/spk2utt_to_utt2spk.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2011 Microsoft Corporation
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABILITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | while(<>){
19 | @A = split(" ", $_);
20 | @A > 1 || die "Invalid line in spk2utt file: $_";
21 | $s = shift @A;
22 | foreach $u ( @A ) {
23 | print "$u $s\n";
24 | }
25 | }
26 |
27 |
28 |
--------------------------------------------------------------------------------
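The script inverts Kaldi's spk2utt format (`speaker utt1 utt2 ...`) into utt2spk (`utterance speaker`), one line per utterance. The same transformation in Python, as a format illustration:

    import sys

    # spk2utt line "spk1 utt1 utt2" -> utt2spk lines "utt1 spk1" and "utt2 spk1"
    for line in sys.stdin:
        spk, *utts = line.split()
        if not utts:
            sys.exit(f"Invalid line in spk2utt file: {line}")
        for utt in utts:
            print(utt, spk)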
/librispeech/asr1/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode; it will exit on:
3 | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 |
13 | # Get the datasets we want to use based on the command-line args
14 | train_sets="train-clean-100 train-clean-360 train-other-500 "
15 | dev_sets="dev-clean dev-other "
16 | eval_sets="test-clean test-other "
17 | srctexts=
18 | for dset in ${train_sets}; do
19 | srctexts+="data/${dset}/text "
20 | done
21 |
22 | ./asr.sh \
23 | --fs 16000 \
24 | --n_tokens 2000 \
25 | --token_type bpe \
26 | --train_sets "${train_sets}" \
27 | --dev_eval_sets "${dev_sets} ${eval_sets}" \
28 | --srctexts "${srctexts}" \
29 | --local_data_opts "${eval_sets} ${dev_sets} ${train_sets}" "$@"
--------------------------------------------------------------------------------
/example/utils.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import numpy as np
3 |
4 |
5 | def edit_dist(pred: List[int], label: List[int]) -> int:
6 | """Computes the edit distance between a predicted and label sequence."""
7 | # dists[i, j] = edit_dist(pred[:i], label[:j])
8 | pred_len, label_len = len(pred), len(label)
9 | dists = np.zeros((pred_len + 1, label_len + 1), dtype=int)
10 |
11 | dists[:, 0] = np.arange(pred_len + 1)
12 | dists[0, :] = np.arange(label_len + 1)
13 |
14 | for i, x in enumerate(pred):
15 | for j, y in enumerate(label):
16 | sub_delta = int(x != y)
17 | ins_delta = 1
18 | del_delta = 1
19 |
20 | substitution = dists[i, j] + sub_delta
21 | insertion = dists[i, j+1] + ins_delta # pred[:i] --> pred[:i+1]
22 | deletion = dists[i+1, j] + del_delta # label[:j] --> label[:j+1]
23 | dists[i+1, j+1] = min(substitution, insertion, deletion)
24 |
25 | return dists[-1, -1].item()
26 |
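27 | # Minimal sanity checks (added; the token IDs are arbitrary): one substitution
28 | # plus one insertion gives distance 2, and an empty prediction needs
29 | # len(label) insertions.
30 | if __name__ == "__main__":
31 |     assert edit_dist([1, 2, 3], [1, 2, 3]) == 0
32 |     assert edit_dist([1, 2, 3], [1, 9, 3, 4]) == 2
33 |     assert edit_dist([], [1, 2]) == 2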
--------------------------------------------------------------------------------
/speech_datasets/transform/add_deltas.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from speech_datasets.transform.interface import FuncTrans
4 |
5 |
6 | def delta(feat, window):
7 | assert window > 0
8 | delta_feat = np.zeros_like(feat)
9 | for i in range(1, window + 1):
10 | delta_feat[:-i] += i * feat[i:]
11 | delta_feat[i:] += -i * feat[:-i]
12 | delta_feat[-i:] += i * feat[-1]
13 | delta_feat[:i] += -i * feat[0]
14 | delta_feat /= 2 * sum(i ** 2 for i in range(1, window + 1))
15 | return delta_feat
16 |
17 |
18 | def add_deltas(x, window=2, order=2):
19 | """
20 | :param x: Features
21 | :param window: size of the window to use to approximate time derivative computation
22 | :param order: highest order time derivative to compute
23 | :return: Features, concatenated with all the relevant derivatives
24 | """
25 | feats = [x]
26 | for _ in range(order):
27 | feats.append(delta(feats[-1], window))
28 | return np.concatenate(feats, axis=1)
29 |
30 |
31 | class AddDeltas(FuncTrans):
32 | _func = add_deltas
33 | __doc__ = add_deltas.__doc__
34 |
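35 | # Minimal usage sketch (added): with order=2, the output concatenates the
36 | # input features with their first- and second-order deltas, so the feature
37 | # dimension triples.
38 | if __name__ == "__main__":
39 |     x = np.random.randn(100, 80)
40 |     y = AddDeltas(window=2, order=2)(x)
41 |     assert y.shape == (100, 240)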
--------------------------------------------------------------------------------
/speech_datasets/bin/combine_cmvn_stats.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from speech_datasets.utils.readers import read_cmvn_stats
4 | from speech_datasets.utils.writers import write_cmvn_stats
5 |
6 |
7 | def parse_args():
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument("--cmvn_type", choices=["global", "speaker", "utterance"])
10 | parser.add_argument("--output_file", type=str)
11 | parser.add_argument("cmvn_stats_files", nargs="+")
12 | return parser.parse_args()
13 |
14 |
15 | def combine_cmvn_dicts(stats_dicts):
16 | out_dict = {}
17 | for d in stats_dicts:
18 | for spk, val in d.items():
19 | if spk not in out_dict:
20 | out_dict[spk] = val
21 | else:
22 | out_dict[spk] += val
23 | return out_dict
24 |
25 |
26 | def main():
27 | args = parse_args()
28 | out_dict = combine_cmvn_dicts(read_cmvn_stats(path, args.cmvn_type)
29 | for path in args.cmvn_stats_files)
30 | write_cmvn_stats(args.output_file, args.cmvn_type, out_dict)
31 |
32 |
33 | if __name__ == "__main__":
34 | main()
35 |
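36 | # Note (added): combine_cmvn_dicts assumes the stats objects returned by
37 | # read_cmvn_stats support "+=", so that sufficient statistics (frame counts,
38 | # sums, squared sums) from different datasets accumulate per key.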
--------------------------------------------------------------------------------
/TEMPLATE/asr1/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode, it will exit on :
3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 | help_message=$(cat << EOF
13 | Usage: $0 <target-dir>
14 | EOF
15 | )
16 |
17 |
18 | if [ $# -ne 1 ]; then
19 | log "${help_message}"
20 | log "Error: 1 positional argument is required."
21 | exit 2
22 | fi
23 |
24 |
25 | dir=$1
26 | mkdir -p "${dir}"
27 |
28 | if [ ! -d "${dir}"/../../TEMPLATE ]; then
29 | log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory."
30 | exit 1
31 | fi
32 |
33 | targets=""
34 |
35 | # Copy
36 | for f in conf; do
37 | target="${dir}"/../../TEMPLATE/asr1/"${f}"
38 | cp -r "${target}" "${dir}"
39 | targets+="${dir}/${f} "
40 | done
41 |
42 |
43 | # Symlinks to TEMPLATE & Kaldi
44 | for f in asr.sh cmd.sh path.sh db.sh utils; do
45 | target=../../TEMPLATE/asr1/"${f}"
46 | ln -sf "${target}" "${dir}"
47 | targets+="${dir}/${f} "
48 | done
49 |
50 |
51 | log "Created: ${targets}"
52 |
--------------------------------------------------------------------------------
/TEMPLATE/tts1/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode, it will exit on :
3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 | help_message=$(cat << EOF
13 | Usage: $0 <target-dir>
14 | EOF
15 | )
16 |
17 |
18 | if [ $# -ne 1 ]; then
19 | log "${help_message}"
20 | log "Error: 1 positional argument is required."
21 | exit 2
22 | fi
23 |
24 |
25 | dir=$1
26 | mkdir -p "${dir}"
27 |
28 | if [ ! -d "${dir}"/../../TEMPLATE ]; then
29 | log "Error: ${dir}/../../TEMPLATE should exist. You may specify wrong directory."
30 | exit 1
31 | fi
32 |
33 | targets=""
34 |
35 | # Copy
36 | for f in conf; do
37 | target="${dir}"/../../TEMPLATE/tts1/"${f}"
38 | cp -r "${target}" "${dir}"
39 | targets+="${dir}/${f} "
40 | done
41 |
42 |
43 | # Symlinks to TEMPLATE & Kaldi
44 | for f in tts.sh cmd.sh path.sh db.sh utils; do
45 | target=../../TEMPLATE/tts1/"${f}"
46 | ln -sf "${target}" "${dir}"
47 | targets+="${dir}/${f} "
48 | done
49 |
50 |
51 | log "Created: ${targets}"
52 |
--------------------------------------------------------------------------------
/TEMPLATE/asr1/path.sh:
--------------------------------------------------------------------------------
1 | MAIN_ROOT=$(dirname "$(dirname "${PWD}")")
2 | export LC_ALL=C
3 |
4 | if [ -z "${PS1:-}" ]; then
5 | PS1=__dummy__
6 | fi
7 |
8 | # Activate local virtual environment for development
9 | error_msg="Virtual environment not set up properly! Navigate to $MAIN_ROOT and run 'make clean all'"
10 | if [ -e $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh ] && [ -e $MAIN_ROOT/tools/conda.done ]; then
11 | VENV_NAME=$(cat "${MAIN_ROOT}/tools/conda.done")
12 | source $MAIN_ROOT/tools/venv/etc/profile.d/conda.sh && conda deactivate
13 | if conda env list | (grep -q -E "${VENV_NAME}\s"); then
14 | conda activate "${VENV_NAME}"
15 | else
16 | echo "${error_msg}" && exit 1
17 | fi
18 | else
19 | echo "${error_msg}" && exit 1
20 | fi
21 |
22 | # Add binary scripts to the path, to allow them to be run easily
23 | export PATH=$MAIN_ROOT/speech_datasets/bin:$PATH
24 | export OMP_NUM_THREADS=1
25 |
26 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
27 | export PYTHONIOENCODING=UTF-8
28 |
29 | # You need to change or unset NCCL_SOCKET_IFNAME according to your network environment
30 | # https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html#nccl-socket-ifname
31 | export NCCL_SOCKET_IFNAME="^lo,docker,virbr,vmnet,vboxnet"
32 |
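33 | # For example (illustrative only), on a cluster whose private interconnect is
34 | # eth2 you might instead set:
35 | #   export NCCL_SOCKET_IFNAME=eth2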
--------------------------------------------------------------------------------
/wsj/asr1/local/flist2scp.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2011 Microsoft Corporation
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABILITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | # takes in a file list with lines like
19 | # /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
20 | # and outputs an scp in kaldi format with lines like
21 | # 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
22 | # (the first thing is the utterance-id, which is the same as the basename of the file.)
23 |
24 |
25 | while(<>){
26 | m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
27 | $id = $1;
28 | $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
29 | print "$id $_";
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/utils/utt2spk_to_spk2utt.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2011 Microsoft Corporation
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABILITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # converts an utt2spk file to a spk2utt file.
18 | # Takes input from the stdin or from a file argument;
19 | # output goes to the standard out.
20 |
21 | if ( @ARGV > 1 ) {
22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
23 | }
24 |
25 | while(<>){
26 | @A = split(" ", $_);
27 | @A == 2 || die "Invalid line in utt2spk file: $_";
28 | ($u,$s) = @A;
29 | if(!$seen_spk{$s}) {
30 | $seen_spk{$s} = 1;
31 | push @spklist, $s;
32 | }
33 | push (@{$spk_hash{$s}}, "$u");
34 | }
35 | foreach $s (@spklist) {
36 | $l = join(' ',@{$spk_hash{$s}});
37 | print "$s $l\n";
38 | }
39 |
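40 | # Example (added for clarity): the utt2spk lines "utt1 spk1" and "utt2 spk1"
41 | # collapse into the single spk2utt line "spk1 utt1 utt2", with speakers
42 | # printed in order of first appearance.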
--------------------------------------------------------------------------------
/TEMPLATE/asr1/db.sh:
--------------------------------------------------------------------------------
1 | # We extract WSJ0_TGZ to WSJ0 and WSJ1_TGZ to WSJ1. Note that the actual data
2 | # is in WSJ0/csr_1_senn and WSJ1/csr_senn
3 | WSJ0_TGZ=/export/data/LDC/csr_1_senn_LDC93S6B.tgz
4 | WSJ1_TGZ=/export/data/LDC/csr_senn_LDC94S13B.tgz
5 | WSJ0=/workspace/LDC93S6B
6 | WSJ1=/workspace/LDC94S13B
7 |
8 | # Extract SWBD1_TGZ to SWBD1
9 | SWBD1_TGZ=/export/data/LDC/swb1_LDC97S62.tgz
10 | SWBD1=/workspace/LDC97S62
11 |
12 | # Filepath i of EVAL2000_TGZ extracts into directory i of EVAL2000.
13 | # First directory must contain the speech data, second directory must contain the transcripts.
14 | EVAL2000_TGZ="/export/data/LDC/hub5e_00_LDC2002S09.tgz /export/data/LDC/LDC2002T43.tgz"
15 | EVAL2000="/workspace/LDC2002S09/hub5e_00 /workspace/LDC2002T43"
16 |
17 | # Extract RT03_TGZ to RT03
18 | RT03_TGZ=/export/data/LDC/rt_03_LDC2007S10.tgz
19 | RT03=/workspace/LDC2007S10/rt_03
20 |
21 | # filepath i of FISHER_TGZ extracts into directory i of FISHER
22 | # In this case, we extract LDC2004T19 and LDC2005T19 every time, but LDC2004S13 and LDC2005S13 are pre-extracted
23 | FISHER="/workspace/LDC2004T19 /workspace/LDC2005T19 /export/data/LDC/LDC2004S13 /export/data/LDC/LDC2005S13"
24 | FISHER_TGZ="/export/data/LDC/LDC2004T19/fe_03_p1_tran_LDC2004T19.tgz /export/data/LDC/LDC2005T19/LDC2005T19.tgz"
25 |
26 | LIBRISPEECH=/export/data/librispeech
27 |
28 | COMMONVOICE=/export/data/commonvoice
29 |
--------------------------------------------------------------------------------
/wsj/asr1/local/wsj_format_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
4 | # 2015 Guoguo Chen
5 | # Apache 2.0
6 |
7 | # This script takes data prepared in a corpus-dependent way
8 | # in data/local/, and converts it into the "canonical" form,
9 | # in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
10 | # data/train_si284, data/train_si84, etc.
11 |
12 | # Don't bother doing train_si84 separately (although we have the file lists
13 | # in data/local/) because it's just the first 7138 utterances in train_si284.
14 | # We'll create train_si84 after doing the feature extraction.
15 |
16 | lang_suffix=
17 |
18 | echo "$0 $@" # Print the command line for logging
19 | . ./path.sh || exit 1;
20 | . utils/parse_options.sh || exit 1;
21 |
22 | echo "Preparing train and test data"
23 | srcdir=data/local/data
24 |
25 | for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
26 | mkdir -p data/$x
27 | cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
28 | cp $srcdir/$x.txt data/$x/text || exit 1;
29 | cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
30 | cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
31 | utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
32 | done
33 |
34 | echo "Succeeded in formatting data."
35 |
--------------------------------------------------------------------------------
/utils/shuffle_list.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | # Copyright 2013 Johns Hopkins University (author: Daniel Povey)
4 |
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
14 | # MERCHANTABILITY OR NON-INFRINGEMENT.
15 | # See the Apache 2 License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | if ($ARGV[0] eq "--srand") {
20 | $n = $ARGV[1];
21 | $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\"";
22 | srand($ARGV[1]);
23 | shift;
24 | shift;
25 | } else {
26 | srand(0); # Gives inconsistent behavior if we don't seed.
27 | }
28 |
29 | if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we
30 | # don't understand.
31 | print "Usage: shuffle_list.pl [--srand N] [input file] > output\n";
32 | print "randomizes the order of lines of input.\n";
33 | exit(1);
34 | }
35 |
36 | @lines = ();
37 | while (<>) {
38 | push @lines, [ (rand(), $_)] ;
39 | }
40 |
41 | @lines = sort { $a->[0] cmp $b->[0] } @lines;
42 | foreach $l (@lines) {
43 | print $l->[1];
44 | }
45 |
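46 | # Implementation note (added): this is a decorate-and-sort shuffle -- each
47 | # line is paired with a random key, the pairs are sorted on that key, and the
48 | # lines are printed in the resulting order; seeding srand makes it repeatable.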
--------------------------------------------------------------------------------
/utils/compute_cmvn_stats.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # Begin configuration section.
5 | nj=4
6 | cmd=utils/run.pl
7 | archive_format=hdf5
8 | cmvn_type=global
9 | spk2utt=
10 | # End configuration section.
11 |
12 | help_message=$(cat << EOF
13 | Usage: $0 [options] [logdir]
14 | e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/logs
15 | Options:
16 | --nj # number of parallel jobs
17 | --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs.
18 | --archive_format # Specify the format of feats file
19 | --cmvn-type # cmvn_type (global or speaker or utterance)
20 | --spk2utt # speaker -> utterance file
21 | EOF
22 | )
23 |
24 | echo "$0 $*" 1>&2 # Print the command line for logging
25 | . path.sh || exit 1
26 | . utils/parse_options.sh || exit 1;
27 |
28 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then
29 | echo "${help_message}" 1>&2
30 | exit 1;
31 | fi
32 |
33 | scp=$1
34 | cmvnark=$2
35 | data=$(dirname ${scp})
36 | if [ $# -eq 3 ]; then
37 | logdir=$3
38 | else
39 | logdir=${data}/logs
40 | fi
41 | mkdir -p ${logdir}
42 |
43 | split_scps=
44 | split_cmvn=
45 | for n in $(seq ${nj}); do
46 | split_cmvn+="${logdir}/cmvn.${n}.ark "
47 | split_scps+="${logdir}/feats.${n}.scp "
48 | done
49 | utils/split_scp.pl ${scp} ${split_scps} || exit 1
50 |
51 |
52 | maybe_spk2utt=
53 | if [ -n "${spk2utt}" ] && [ "${cmvn_type}" = speaker ]; then
54 | maybe_spk2utt="--spk2utt ${spk2utt}"
55 | fi
56 |
57 | ${cmd} JOB=1:${nj} ${logdir}/compute_cmvn_stats.JOB.log \
58 | compute_cmvn_stats.py --filetype ${archive_format} ${maybe_spk2utt} \
59 | --cmvn-type ${cmvn_type} "scp:${logdir}/feats.JOB.scp" "${logdir}/cmvn.JOB.ark"
60 |
61 | python3 -m speech_datasets.bin.combine_cmvn_stats --cmvn_type ${cmvn_type} \
62 | --output_file ${cmvnark} ${split_cmvn} || exit 1
63 |
64 | rm -f ${split_scps} ${split_cmvn}
65 |
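66 | # Illustrative invocation (paths are examples only): compute per-speaker CMVN
67 | # stats over 8 parallel jobs, then merge them into a single archive:
68 | #   utils/compute_cmvn_stats.sh --nj 8 --cmvn-type speaker \
69 | #       --spk2utt data/train/spk2utt data/train/feats.scp data/train/cmvn.ark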
--------------------------------------------------------------------------------
/utils/remove_dup_utts.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Remove excess utterances once they appear more than a specified
4 | # number of times with the same transcription, in a data set.
5 | # E.g. useful for removing excess "uh-huh" from training.
6 |
7 | if [ $# != 3 ]; then
8 | echo "Usage: remove_dup_utts.sh max-count "
9 | echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup"
10 | echo "This script is used to filter out utterances that have from over-represented"
11 | echo "transcriptions (such as 'uh-huh'), by limiting the number of repetitions of"
12 | echo "any given word-sequence to a specified value. It's often used to get"
13 | echo "subsets for early stages of training."
14 | exit 1;
15 | fi
16 |
17 | maxcount=$1
18 | srcdir=$2
19 | destdir=$3
20 | mkdir -p $destdir
21 |
22 | [ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1;
23 |
24 | ! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1;
25 |
26 | ! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1;
27 |
28 | cp $srcdir/* $destdir
29 | cat $srcdir/text | \
30 | perl -e '
31 | $maxcount = shift @ARGV;
32 | @all = ();
33 | $p1 = 103349; $p2 = 71147; $k = 0;
34 | sub random { # our own random number generator: predictable.
35 | $k = ($k + $p1) % $p2;
36 | return ($k / $p2);
37 | }
38 | while(<>) {
39 | push @all, $_;
40 | @A = split(" ", $_);
41 | shift @A;
42 | $text = join(" ", @A);
43 | $count{$text} ++;
44 | }
45 | foreach $line (@all) {
46 | @A = split(" ", $line);
47 | shift @A;
48 | $text = join(" ", @A);
49 | $n = $count{$text};
50 | if ($n < $maxcount || random() < ($maxcount / $n)) {
51 | print $line;
52 | }
53 | }' $maxcount >$destdir/text
54 |
55 | echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`"
56 |
57 | echo "Using fix_data_dir.sh to reconcile the other files."
58 | utils/fix_data_dir.sh $destdir
59 | rm -r $destdir/.backup
60 |
61 | exit 0
62 |
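63 | # Note (added): for a transcription seen n > max-count times, each occurrence
64 | # is kept independently with probability max-count/n, so about max-count
65 | # copies survive in expectation; the hand-rolled RNG makes this reproducible.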
--------------------------------------------------------------------------------
/swbd/asr1/local/map_acronyms_transcripts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright 2015 Minhua Wu
4 | # Apache 2.0
5 |
6 | # convert acronyms in swbd transcript to fisher convention
7 | # accoring to first two columns in the input acronyms mapping
8 |
9 | import argparse, re
10 | __author__ = 'Minhua Wu'
11 |
12 | parser = argparse.ArgumentParser(description='format acronyms to a._b._c.')
13 | parser.add_argument('-i', '--input', help='Input transcripts', required=True)
14 | parser.add_argument('-o', '--output',help='Output transcripts', required=True)
15 | parser.add_argument('-M', '--Map', help='Input acronyms mapping', required=True)
16 | args = parser.parse_args()
17 |
18 | fin_map = open(args.Map, "r")
19 | dict_acronym = {}
20 | dict_acronym_noi = {} # Mapping of acronyms without I, i
21 | for pair in fin_map:
22 | items = pair.split('\t')
23 | dict_acronym[items[0]] = items[1]
24 | dict_acronym_noi[items[0]] = items[1]
25 | fin_map.close()
26 | del dict_acronym_noi['I']
27 | del dict_acronym_noi['i']
28 |
29 |
30 | fin_trans = open(args.input, "r")
31 | fout_trans = open(args.output, "w")
32 | for line in fin_trans:
33 | items = line.split()
34 | L = len(items)
35 | # First pass mapping to map I as part of acronym
36 | for i in range(L):
37 | if items[i] == 'I':
38 | x = 0
39 | while i-1-x >= 0 and re.match(r'^[A-Z]$', items[i-1-x]):
40 | x += 1
41 |
42 | y = 0
43 | while i+1+y < L and re.match(r'^[A-Z]$', items[i+1+y]):
44 | y += 1
45 |
46 | if x+y > 0:
47 | for bias in range(-x, y+1):
48 | items[i+bias] = dict_acronym[items[i+bias]]
49 |
50 | # Second pass mapping (not mapping 'i' and 'I')
51 | for i in range(len(items)):
52 | if items[i] in dict_acronym_noi.keys():
53 | items[i] = dict_acronym_noi[items[i]]
54 | sentence = ' '.join(items[1:])
55 | fout_trans.write(items[0] + ' ' + sentence.lower() + '\n')
56 |
57 | fin_trans.close()
58 | fout_trans.close()
59 |
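60 | # Illustrative example (hypothetical mapping entries): given map lines such as
61 | # "B<TAB>b._" for each capital letter, the sequence "I B M" is rewritten in the
62 | # first pass (the run of single capitals around "I"), while a lone pronoun "I"
63 | # is left unchanged by both passes.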
--------------------------------------------------------------------------------
/utils/feat_to_shape.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # Begin configuration section.
5 | nj=4
6 | cmd=utils/run.pl
7 | verbose=0
8 | archive_format=
9 | preprocess_conf=
10 | # End configuration section.
11 |
12 | help_message=$(cat << EOF
13 | Usage: $0 [options] <input-scp> <output-scp> [<logdir>]
14 | e.g.: $0 data/train/feats.scp data/train/shape.scp data/train/logs
15 | Options:
16 | --nj <nj> # number of parallel jobs
17 | --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
18 | --archive_format <format> # Specify the format of the feats file
19 | --preprocess-conf <config> # Apply preprocessing to feats when creating shape.scp
20 | --verbose <num> # Default: 0
21 | EOF
22 | )
23 |
24 | echo "$0 $*" 1>&2 # Print the command line for logging
25 | . path.sh || exit 1
26 | . utils/parse_options.sh || exit 1;
27 |
28 | if [ $# -lt 2 ] || [ $# -gt 3 ]; then
29 | echo "${help_message}" 1>&2
30 | exit 1;
31 | fi
32 |
33 | scp=$1
34 | outscp=$2
35 | data=$(dirname ${scp})
36 | if [ $# -eq 3 ]; then
37 | logdir=$3
38 | else
39 | logdir=${data}/logs
40 | fi
41 | mkdir -p ${logdir}
42 |
43 | nj=$((nj<$(<"${scp}" wc -l)?nj:$(<"${scp}" wc -l)))
44 | split_scps=""
45 | for n in $(seq ${nj}); do
46 | split_scps="${split_scps} ${logdir}/feats.${n}.scp"
47 | done
48 |
49 | utils/split_scp.pl ${scp} ${split_scps}
50 |
51 | if [ -n "${preprocess_conf}" ]; then
52 | preprocess_opt="--preprocess-conf ${preprocess_conf}"
53 | else
54 | preprocess_opt=""
55 | fi
56 | if [ -n "${archive_format}" ]; then
57 | filetype_opt="--filetype ${archive_format}"
58 | else
59 | filetype_opt=""
60 | fi
61 |
62 | ${cmd} JOB=1:${nj} ${logdir}/feat_to_shape.JOB.log \
63 | feat_to_shape.py --verbose ${verbose} ${preprocess_opt} ${filetype_opt} \
64 | scp:${logdir}/feats.JOB.scp ${logdir}/shape.JOB.scp
65 |
66 | # concatenate the .scp files together.
67 | for n in $(seq ${nj}); do
68 | cat ${logdir}/shape.${n}.scp
69 | done > ${outscp}
70 |
71 | rm -f ${logdir}/feats.*.scp 2>/dev/null
72 |
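73 | # Illustrative invocation (paths are examples only): write one "utt dim1,dim2"
74 | # line per feature matrix in an HDF5-backed scp:
75 | #   utils/feat_to_shape.sh --nj 8 --archive_format hdf5 \
76 | #       data/train/feats.scp data/train/shape.scp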
--------------------------------------------------------------------------------
/wsj/asr1/local/data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -u
4 | set -o pipefail
5 |
6 | log() {
7 | local fname=${BASH_SOURCE[1]##*/}
8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 | }
10 |
11 | help_message=$(cat << EOF
12 | Usage: $0
13 | (No options)
14 | EOF
15 | )
16 |
17 | if [ $# -ne 0 ]; then
18 | log "Error: invalid command line arguments"
19 | log "${help_message}"
20 | exit 1
21 | fi
22 |
23 | . ./path.sh || exit 1
24 | . ./db.sh || exit 1
25 |
26 | other_text=data/local/other_text/text
27 | nlsyms=data/nlsyms.txt
28 |
29 | # Extract WSJ0/WSJ1 raw data if needed
30 | WSJ=("${WSJ0}" "${WSJ1}")
31 | WSJ_TGZ=("${WSJ0_TGZ}" "${WSJ1_TGZ}")
32 | for (( i=0; i<2; i++ )); do
33 | echo ${WSJ[i]}
34 | if [ -z "${WSJ[i]}" ]; then
35 | log "Fill the value of 'WSJ${i}' in db.sh"
36 | exit 1
37 | elif [ ! -d "${WSJ[i]}" ]; then
38 | mkdir -p "${WSJ[i]}"
39 | {
40 | tar xzvf "${WSJ_TGZ[i]}" -C "${WSJ[i]}"
41 | } || {
42 | rm -rf "${WSJ[i]}"
43 | log "Failed to extract WSJ${i}"
44 | exit 1
45 | }
46 | fi
47 | done
48 |
49 | log "local/wsj_data_prep.sh ${WSJ0}/csr_1_senn/??-{?,??}.? ${WSJ1}/csr_senn/??-{?,??}.?"
50 | local/wsj_data_prep.sh "${WSJ0}"/csr_1_senn/??-{?,??}.? "${WSJ1}"/csr_senn/??-{?,??}.?
51 | log "local/wsj_format_data.sh"
52 | local/wsj_format_data.sh
53 |
54 | log "Create the list of non-linguistic symbols: ${nlsyms}"
55 | cut -f 2- -d" " data/train_si284/text | tr " " "\n" | sort | uniq | grep "<" > ${nlsyms}
56 | cat ${nlsyms}
57 |
58 | log "Prepare text from lng_modl dir: ${WSJ1}/csr_senn/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z -> ${other_text}"
59 | mkdir -p "$(dirname ${other_text})"
60 |
61 | # NOTE(kamo): Give an utterance id to each text & make everything lowercase
62 | # Also remove utterances with non-linguistic symbols, i.e. lines including "<"
63 | zcat ${WSJ1}/csr_senn/13-32.1/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z | \
64 | grep -v "<" | tr "[:upper:]" "[:lower:]" | \
65 | awk '{ printf("{wsj}lng_%07d %s\n",NR,$0) } ' > ${other_text}
66 |
--------------------------------------------------------------------------------
/COMBINE/asr1/combine_train_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 | log() {
4 | local fname=${BASH_SOURCE[1]##*/}
5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
6 | }
7 |
8 | help_message="Usage: $0 [asr.sh options] / / / ..."
9 |
10 | log "$0 $*"
11 | if [ $# -eq 0 ]; then
12 | log "$help_message"
13 | log "Error: at least 1 argument required"
14 | exit 2
15 | fi
16 |
17 | kwargs=()
18 | stage=2
19 | stop_stage=5
20 | while [ $# -gt 0 ]; do
21 | case "$1" in
22 | --stage)
23 | if [ "$2" -lt 2 ]; then
24 | log "Specify --stage 2 or higher (got --stage $2)."
25 | log "We expect stage 1 to be complete for all datasets given."
26 | exit 2
27 | else
28 | stage=$2
29 | fi
30 | shift 2
31 | ;;
32 | --stop-stage|--stop_stage)
33 | if [ "$2" -gt 5 ]; then
34 | log "Specify --stop-stage 5 or lower (got --stop-stage $2)."
35 | log "Use combine_cmvn_stats.sh to combine CMVN statistics from multiple datasets (stage 5)."
36 | log "Use multi_tokenize.sh to obtain token inventories from multiple datasets (stages 6-7)."
37 | exit 2
38 | else
39 | stop_stage=$2
40 | fi
41 | shift 2
42 | ;;
43 | --*) kwargs+=( "$1" "$2" ); shift 2; ;;
44 | *) break;
45 | esac
46 | done
47 | kwargs+=( --stage "$stage" --stop_stage "$stop_stage" )
48 |
49 | if [ $# -eq 0 ]; then
50 | log "${help_message}"
51 | log "Error: Please specify dataset splits as positional arguments."
52 | exit 2
53 | fi
54 |
55 | task=$(basename "$(utils/make_absolute.sh "$PWD")")
56 | idx=$(python local/combine_datasets.py --task "${task//1/}" --write_dir true "$@")
57 | datadir="data/${idx}"
58 | for f in wav.scp segments utt2spk text; do
59 | sort "${datadir}/${f}" > "${datadir}/${f}.tmp"
60 | mv "${datadir}/${f}.tmp" "${datadir}/${f}"
61 | done
62 | ./run.sh "${kwargs[@]}" --train_sets "${idx}"
63 |
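64 | # Illustrative invocation (dataset/split names are examples only): merge two
65 | # prepared training splits and run stages 2-5 on the combined data:
66 | #   ./combine_train_data.sh --stage 2 --stop-stage 5 \
67 | #       librispeech/train-clean-100 swbd/train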
--------------------------------------------------------------------------------
/swbd/asr1/local/swbd1_map_words.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | # Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern
4 | # matches case-insensitive --Arnab (Jan 2013)
5 |
6 | if ($ARGV[0] eq "-f") {
7 | shift @ARGV;
8 | $field_spec = shift @ARGV;
9 | if ($field_spec =~ m/^\d+$/) {
10 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
11 | }
12 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
13 | if ($1 ne "") {
14 | $field_begin = $1 - 1; # Change to zero-based indexing.
15 | }
16 | if ($2 ne "") {
17 | $field_end = $2 - 1; # Change to zero-based indexing.
18 | }
19 | }
20 | if (!defined $field_begin && !defined $field_end) {
21 | die "Bad argument to -f option: $field_spec";
22 | }
23 | }
24 |
25 |
26 | while (<>) {
27 | @A = split(" ", $_);
28 | for ($n = 0; $n < @A; $n++) {
29 | $a = $A[$n];
30 | if ( (!defined $field_begin || $n >= $field_begin)
31 | && (!defined $field_end || $n <= $field_end)) {
32 | # e.g. [LAUGHTER-STORY] -> STORY;
33 | $a =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i;
34 | # $1 and $3 relate to preserving trailing "-"
35 | $a =~ s:^\[(.+)/.+\](|\-)$:$1$2:; # e.g. [IT'N/ISN'T] -> IT'N ... note,
36 | # 1st part may include partial-word stuff, which we process further below,
37 | # e.g. [LEM[GUINI]-/LINGUINI]
38 | # the (|\_) at the end is to accept and preserve trailing -'s.
39 | $a =~ s:^(|\-)\[[^][]+\](.+)$:-$2:; # e.g. -[AN]Y , note \047 is quote;
40 | # let the leading - be optional on input, as sometimes omitted.
41 | $a =~ s:^(.+)\[[^][]+\](|\-)$:$1-:; # e.g. AB[SOLUTE]- -> AB-;
42 | # let the trailing - be optional on input, as sometimes omitted.
43 | $a =~ s:([^][]+)\[.+\]$:$1:; # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX-
44 | # which is a mistake in the input.
45 | $a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM
46 | $a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT-
47 | $a =~ s:_\d$::; # e.g. THEM_1 -> THEM
48 | }
49 | $A[$n] = $a;
50 | }
51 | print join(" ", @A) . "\n";
52 | }
53 |
--------------------------------------------------------------------------------
/COMBINE/asr1/multi_tokenize.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode, it will exit on :
3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 |
13 | # Tokenization related options from asr.sh
14 | token_type=bpe # Tokenization type (char or bpe).
15 | n_tokens=2000 # The size of the BPE vocabulary.
16 | nlsyms="" # non-linguistic symbols list, separated by a comma
17 |
18 | help_message=$(cat << EOF
19 | Usage: $0 [options] <dataset1> <dataset2> ...
20 |
21 | Produces a token inventory of the given type for all the datasets provided.
22 |
23 | Options:
24 | --token_type # Tokenization type (char or bpe, default="${token_type}").
25 | --n_tokens # The maximum number of tokens allowed (default="${n_tokens}").
26 | --nlsyms # Non-linguistic symbol list for BPE/char, separated by a comma. (default="${nlsyms}").
27 | EOF
28 | )
29 |
30 | . ./path.sh || exit 1
31 | . ./cmd.sh || exit 1
32 |
33 | log "$0 $*"
34 | . utils/parse_options.sh || exit 1
35 | if [ $# -eq 0 ]; then
36 | log "${help_message}"
37 | log "Error: Please specify datasets as positional arguments."
38 | exit 2
39 | fi
40 |
41 | workspace=$PWD
42 | task=$(basename "$(utils/make_absolute.sh "$workspace")")
43 | run_args="--token-type ${token_type} --n_tokens ${n_tokens} --nlsyms ${nlsyms} "
44 |
45 | # Compile srctexts from all the relevant datasets
46 | srctexts=
47 | for dset in "$@"; do
48 | log "Concatenating all source texts from dataset $dset..."
49 | dset_dir="${MAIN_ROOT}/${dset}/${task}"
50 | cd ${dset_dir}
51 | ./run.sh --stage 6 --stop-stage 6 ${run_args}
52 | cd ${workspace}
53 | srctexts+="${dset_dir}/dump/srctexts "
54 | echo ""
55 | done
56 |
57 | # Concatenate all the relevant text data & prepare a token inventory
58 | log "Concatenating all source texts from all datasets..."
59 | mkdir -p dump data
60 | cat $srctexts > dump/srctexts
61 | ./run.sh --stage 7 --stop-stage 7 ${run_args}
62 |
63 |
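64 | # Illustrative invocation (dataset names are examples only): build one shared
65 | # 2000-token BPE inventory from the LibriSpeech, Switchboard, and WSJ texts:
66 | #   ./multi_tokenize.sh --token_type bpe --n_tokens 2000 librispeech swbd wsj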
--------------------------------------------------------------------------------
/commonvoice/asr1/local/data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode, it will exit on :
3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 |
13 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
14 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
15 |
16 | . ./path.sh || exit 1;
17 | . ./cmd.sh || exit 1;
18 | . ./db.sh || exit 1;
19 |
20 | # general configuration
21 | SECONDS=0
22 | lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru
23 | # base url for downloads.
24 | data_url=https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/$lang.tar.gz
25 |
26 | train_set=valid_train_${lang}
27 | train_dev=valid_dev_${lang}
28 | test_set=valid_test_${lang}
29 |
30 | # Ensure that COMMONVOICE data has already been extracted
31 | if [ -z "${COMMONVOICE}" ]; then
32 | log "Fill the value of 'COMMONVOICE' in db.sh"
33 | exit 1
34 | fi
35 | log "Downloading commonvoice dataset"
36 | mkdir -p "${COMMONVOICE}"
37 | local/download_and_untar.sh "${COMMONVOICE}" "${data_url}" "${lang}.tar.gz"
38 |
39 | log "Preparing data for commonvoice"
40 | ### Task dependent. You have to do the following data preparation part by yourself.
41 | ### But you can utilize Kaldi recipes in most cases
42 | for part in "validated"; do
43 | # use underscore-separated names in data directories.
44 | local/data_prep.pl "${COMMONVOICE}" ${part} data/"$(echo "${part}_${lang}" | tr - _)"
45 | done
46 |
47 | # Kaldi Version Split
48 | # utils/subset_data_dir_tr_cv.sh data/validated data/valid_train data/valid_test_dev
49 | # utils/subset_data_dir_tr_cv.sh --cv-spk-percent 50 data/valid_test_dev data/valid_test data/valid_dev
50 |
51 | # ESPNet Version (same as voxforge)
52 | # consider duplicated sentences (does not consider speaker split)
53 | # filter out the same sentences (also same text) of test&dev set from validated set
54 | local/split_tr_dt_et.sh data/validated_${lang} data/${train_set} data/${train_dev} data/${test_set}
55 |
56 | log "Successfully finished. [elapsed=${SECONDS}s]"
57 |
--------------------------------------------------------------------------------
/speech_datasets/utils/misc.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import inspect
3 | from os.path import abspath, dirname
4 | import torch
5 |
6 |
7 | def get_root():
8 | """This file is ROOT/speech_datasets/utils/misc.py, so return ROOT."""
9 | return dirname(dirname(dirname(abspath(__file__))))
10 |
11 |
12 | def check_kwargs(func, kwargs, name=None):
13 | """check kwargs are valid for func
14 |
15 | If kwargs are invalid, raise TypeError, the same as Python's default behavior.
16 | :param function func: function to be validated
17 | :param dict kwargs: keyword arguments for func
18 | :param str name: name used in TypeError (default is func name)
19 | """
20 | try:
21 | params = inspect.signature(func).parameters
22 | except ValueError:
23 | return
24 | if name is None:
25 | name = func.__name__
26 | for k in kwargs.keys():
27 | if k not in params:
28 | raise TypeError(f"{name}() got an unexpected keyword argument '{k}'")
29 |
30 |
31 | def dynamic_import(import_path, alias=None):
32 | """dynamic import module and class
33 |
34 | :param str import_path: syntax 'module_name:class_name'
35 | e.g., 'speech_datasets.transform.add_deltas:AddDeltas'
36 | :param dict alias: shortcut for registered class
37 | :return: imported class
38 | """
39 | alias = dict() if alias is None else alias
40 | if import_path not in alias and ":" not in import_path:
41 | raise ValueError(
42 | "import_path should be one of {} or "
43 | 'include ":", e.g. "speech_datasets.transform.add_deltas:AddDeltas" : '
44 | "{}".format(set(alias), import_path)
45 | )
46 | if ":" not in import_path:
47 | import_path = alias[import_path]
48 |
49 | module_name, objname = import_path.split(":")
50 | m = importlib.import_module(module_name)
51 | return getattr(m, objname)
52 |
53 |
54 | def set_deterministic_pytorch(seed, cudnn_deterministic=True):
55 | """Ensures pytorch produces deterministic results based on the seed."""
56 | # See https://github.com/pytorch/pytorch/issues/6351 about cudnn.benchmark
57 | torch.manual_seed(seed)
58 | torch.backends.cudnn.deterministic = cudnn_deterministic
59 | torch.backends.cudnn.benchmark = (not cudnn_deterministic)
60 |
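61 | # Minimal usage sketch (added) for dynamic_import, using a class that exists
62 | # in this package:
63 | #   cls = dynamic_import("speech_datasets.transform.add_deltas:AddDeltas")
64 | #   transform = cls(window=2, order=2)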
--------------------------------------------------------------------------------
/wsj/asr1/local/find_transcripts.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2011 Microsoft Corporation
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABILITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 |
19 | # This program takes on its standard input a list of utterance
20 | # ids, one per line (e.g. 4k0c030a is an utterance id).
21 | # It takes as its command-line argument a file listing the "dot" files, and
22 | # extracts from the dot files the transcripts for a given
23 | # dataset (represented by the file list).
24 | #
25 |
26 | @ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
27 | $dot_flist = shift @ARGV;
28 |
29 | open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
30 | while(<L>){
31 | chop;
32 | m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
33 | $spk = $1;
34 | $spk2dot{$spk} = $_;
35 | }
36 |
37 |
38 |
39 | while(<STDIN>){
40 | chop;
41 | $uttid = $_;
42 | $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
43 | $spk = $1;
44 | if($spk ne $curspk) {
45 | %utt2trans = (); # Don't keep all the transcripts in memory...
46 | $curspk = $spk;
47 | $dotfile = $spk2dot{$spk};
48 | defined $dotfile || die "No dot file for speaker $spk\n";
49 | open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
50 | while(<F>) {
51 | $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
52 | $trans = $1;
53 | $utt = $2;
54 | $utt2trans{$utt} = $trans;
55 | }
56 | }
57 | if(!defined $utt2trans{$uttid}) {
58 | print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
59 | } else {
60 | print "$uttid $utt2trans{$uttid}\n";
61 | }
62 | }
63 |
64 |
65 |
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Example Code
2 | This directory provides an example which trains a transformer encoder-decoder model on the
3 | `train-clean-100` and `train-clean-360` splits of LibriSpeech, and it evaluates the model's
4 | performance on the `dev-clean` split.
5 |
6 | In order to run this example, you must first prepare an environment and install the `speech_datasets` package,
7 | as detailed [here](../README.md#environment-setup). Next, navigate to [librispeech/asr1](../librispeech/asr1) and
8 | invoke
9 | ```shell script
10 | ./run.sh --stage 1 --stop_stage 4 --feats_type <feats_type>
11 | ```
12 | This will download, prepare, and extract the relevant features for LibriSpeech, and make the dataset usable with
13 | the `speech_datasets` package. Note that this step will take a long time!
14 |
15 | Next, you should navigate to this directory and activate the conda environment by invoking
16 | ```
17 | source ../tools/venv/bin/activate && conda deactivate && conda activate <venv_name>
18 | ```
19 | (where `<venv_name>` is the name of the conda virtual environment, `datasets` by default if you did not specify it
20 | when setting up your environment as described [here](../README.md#environment-setup)). Now, you can run
21 | [`main.py`](main.py). If you dumped `--feats_type raw`, then you can run
22 | ```
23 | python main.py --feats_type raw
24 | ```
25 | If you instead dumped `--feats_type fbank` or `--feats_type fbank_pitch`, you can instead run
26 | ```
27 | python main.py --feats_type <feats_type> --precomputed_feats
28 | ```
29 |
30 | The `feats_type` argument to `main.py` will specify whether to use the feature computation configuration
31 | [`fbank.yaml`](resources/fbank.yaml) or [`fbank_pitch.yaml`](resources/fbank_pitch.yaml).
32 | Both compute 80-dimensional filterbank features (optionally pitch as well), apply the appropriate cepstral
33 | mean/variance normalization (using the statistics pre-computed in
34 | [`global_cmvn_fbank.ark`](resources/global_cmvn_fbank.ark) or
35 | [`global_cmvn_fbank_pitch.ark`](resources/global_cmvn_fbank_pitch.ark)), and apply spectral augmentation.
36 |
37 | In this example, the data loader will tokenize the text using the provided sentencepiece model
38 | [`librispeech_bpe2000.model`](resources/librispeech_bpe2000.model). See the `main()` function of
39 | [`main.py`](main.py) for a full example.
40 |
--------------------------------------------------------------------------------
/commonvoice/asr1/local/download_and_untar.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode, it will exit on :
3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 |
13 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey)
14 | # 2017 Luminar Technologies, Inc. (author: Daniel Galvez)
15 | # 2017 Ewald Enzinger
16 | # Apache 2.0
17 |
18 | # Adapted from egs/mini_librispeech/s5/local/download_and_untar.sh (commit 1cd6d2ac3a935009fdc4184cb8a72ddad98fe7d9)
19 |
20 | remove_archive=false
21 |
22 | if [ "$1" == --remove-archive ]; then
23 | remove_archive=true
24 | shift
25 | fi
26 |
27 | if [ $# -ne 3 ]; then
28 | log "Usage: $0 [--remove-archive] "
29 | log "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz cv_corpus_v1.tar.gz"
30 | log "With --remove-archive it will remove the archive after successfully un-tarring it."
31 | exit 1
32 | fi
33 |
34 | data=$1
35 | url=$2
36 | filename=$3
37 | filepath="$data/$filename"
38 | workspace=$PWD
39 |
40 | if [ ! -d "$data" ]; then
41 | log "$0: no such directory $data"
42 | exit 1;
43 | fi
44 |
45 | if [ -z "$url" ]; then
46 | log "$0: empty URL."
47 | exit 1;
48 | fi
49 |
50 | if [ -f $data/$filename.complete ]; then
51 | log "$0: data was already successfully extracted, nothing to do."
52 | exit 0;
53 | fi
54 |
55 | if [ ! -f $filepath ]; then
56 | if ! which wget >/dev/null; then
57 | log "$0: wget is not installed."
58 | exit 1;
59 | fi
60 | log "$0: downloading data from $url. This may take some time, please be patient."
61 |
62 | if ! wget --no-check-certificate $url -O $filepath; then
63 | log "$0: error executing wget $url"
64 | rm -f $filepath
65 | exit 1;
66 | fi
67 | fi
68 |
69 | cd $data
70 | if ! tar -xzvf $filename; then
71 | log "$0: error un-tarring archive $filepath"
72 | exit 1;
73 | fi
74 | cd $workspace
75 |
76 | touch $data/$filename.complete
77 |
78 | log "$0: Successfully downloaded and un-tarred $filepath"
79 |
80 | if $remove_archive; then
81 | log "$0: removing $filepath file since --remove-archive option was supplied."
82 | rm $filepath
83 | fi
84 |
--------------------------------------------------------------------------------
/utils/subset_data_dir_tr_cv.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Copyright 2017 Brno University of Technology (Author: Karel Vesely);
4 | # Apache 2.0
5 |
6 | # This scripts splits 'data' directory into two parts:
7 | # - training set with 90% of speakers
8 | # - held-out set with 10% of speakers (cv)
9 | # (to be used in frame cross-entropy training of 'nnet1' models),
10 |
11 | # The script also accepts a list of held-out set speakers by '--cv-spk-list'
12 | # (with perturbed data, we pass the list of speakers externally).
13 | # The remaining set of speakers is the training set.
14 |
15 | cv_spk_percent=10
16 | cv_spk_list= # To be used with perturbed data,
17 | seed=777
18 | cv_utt_percent= # ignored (compatibility),
19 | . utils/parse_options.sh
20 |
21 | if [ $# != 3 ]; then
22 | echo "Usage: $0 [opts] "
23 | echo " --cv-spk-percent N (default 10)"
24 | echo " --cv-spk-list (a pre-defined list with cv speakers)"
25 | exit 1;
26 | fi
27 |
28 | set -euo pipefail
29 |
30 | src_data=$1
31 | trn_data=$2
32 | cv_data=$3
33 |
34 | [ ! -r $src_data/spk2utt ] && echo "Missing '$src_data/spk2utt'. Error!" && exit 1
35 |
36 | tmp=$(mktemp -d /tmp/${USER}_XXXXX)
37 |
38 | if [ -z "$cv_spk_list" ]; then
39 | # Select 'cv_spk_percent' speakers randomly,
40 | cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers
41 | n_spk=$(wc -l <$tmp/speakers)
42 | n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ")
43 | #
44 | head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv
45 | tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn
46 | else
47 | # Use pre-defined list of speakers,
48 | cp $cv_spk_list $tmp/speakers_cv
49 | join -v2 <(sort $cv_spk_list) <(awk '{ print $1; }' <$src_data/spk2utt | sort) >$tmp/speakers_trn
50 | fi
51 |
52 | # Sanity checks,
53 | n_spk=$(wc -l <$src_data/spk2utt)
54 | echo "Speakers, src=$n_spk, trn=$(wc -l <$tmp/speakers_trn), cv=$(wc -l $tmp/speakers_cv)"
55 | overlap=$(join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | wc -l)
56 | [ $overlap != 0 ] && \
57 | echo "WARNING, speaker overlap detected!" && \
58 | join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | head && \
59 | echo '...'
60 |
61 | # Create new data dirs,
62 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_trn $src_data $trn_data
63 | utils/data/subset_data_dir.sh --spk-list $tmp/speakers_cv $src_data $cv_data
64 |
65 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Use shell /bin/bash instead of /bin/sh so the source command can be used
2 | SHELL := /bin/bash
3 | # Use the default conda unless a specific install is specified. If there is
4 | # no conda, we will download a fresh one and use it to set up the virtual env.
5 | CONDA :=
6 | VENV_NAME := datasets
7 | # The python version installed in the conda setup
8 | PYTHON_VERSION := 3.7.9
9 | # PyTorch version: 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1 (>= 1.2.0 required)
10 | # 1.5.0 and later do not work with PyKaldi...
11 | TORCH_VERSION := 1.4.0
12 |
13 | ifeq ($(CONDA),)
14 | CONDA := $(shell which conda)
15 | endif
16 | ifeq ($(TORCH_VERSION),)
17 | pytorch := pytorch
18 | else
19 | pytorch := pytorch=$(TORCH_VERSION)
20 | endif
21 |
22 | ifneq ($(shell which nvidia-smi),) # 'nvidia-smi' found
23 | CUDA_VERSION := $(shell nvcc --version | grep "release" | sed -E "s/.*release ([0-9.]*).*/\1/")
24 | CONDA_PYTORCH := $(pytorch) cudatoolkit=$(CUDA_VERSION) -c pytorch
25 | else
26 | CUDA_VERSION :=
27 | CONDA_PYTORCH := $(pytorch) cpuonly -c pytorch
28 | endif
29 | # Install CPU version of PyKaldi, so we can run feature extraction on CPU while training on GPU
30 | CONDA_PYKALDI := -c pykaldi pykaldi-cpu
31 |
32 | .PHONY: all clean
33 |
34 | all: conda sph2pipe check_install example
35 |
36 | tools/conda.done:
37 | # Only install PyTorch if the PyTorch version is non-empty
38 | tools/install_anaconda.sh $(PYTHON_VERSION) "$(CONDA)" tools/venv $(VENV_NAME) . "$(CONDA_PYTORCH)" "$(CONDA_PYKALDI)"
39 | @echo $(VENV_NAME) > tools/conda.done
40 |
41 | conda: tools/conda.done
42 |
43 | tools/sph2pipe.done:
44 | tools/install_sph2pipe.sh tools
45 | touch tools/sph2pipe.done
46 |
47 | sph2pipe: tools/sph2pipe.done
48 |
49 | check_install: conda
50 | ifneq ($(strip $(CUDA_VERSION)),)
51 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && python tools/check_install.py
52 | else
53 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && python tools/check_install.py --no-cuda
54 | endif
55 |
56 | example: conda
57 | source tools/venv/etc/profile.d/conda.sh && conda deactivate && conda activate $(shell cat tools/conda.done) && pip install -r example/requirements.txt
58 |
59 | clean: clean_conda
60 | rm -rf tools/*.done
61 |
62 | clean_conda:
63 | rm -rf *.egg-info
64 | rm -rf tools/venv
65 | rm -f tools/miniconda.sh
66 | find . -iname "*.pyc" -delete
67 |
--------------------------------------------------------------------------------
/speech_datasets/bin/feat_to_shape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import logging
4 | import sys
5 |
6 | from speech_datasets.transform import Transformation
7 | from speech_datasets.utils.readers import file_reader_helper
8 | from speech_datasets.utils.io_utils import get_commandline_args, strtobool
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def get_parser():
14 | parser = argparse.ArgumentParser(
15 | description="convert feature to its shape",
16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
17 | )
18 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
19 | parser.add_argument("--filetype", type=str, default="hdf5", choices=["mat", "hdf5", "sound"],
20 | help="Specify the file format for the rspecifier.")
21 | parser.add_argument("--preprocess-conf", type=str, default=None,
22 | help="The configuration file for the pre-processing")
23 | parser.add_argument("--mem-mapped", type=strtobool, default=False,
24 | help="Whether to use memory-mapped data loaders (where available)")
25 | parser.add_argument("rspecifier", type=str,
26 | help="Read specifier for feats. e.g. ark:some.ark")
27 | parser.add_argument("out", nargs="?", type=argparse.FileType("w"), default=sys.stdout,
28 | help="The output filename. " "If omitted, then output to sys.stdout")
29 | return parser
30 |
31 |
32 | def main():
33 | parser = get_parser()
34 | args = parser.parse_args()
35 |
36 | # logging info
37 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
38 | if args.verbose > 0:
39 | logging.basicConfig(level=logging.INFO, format=logfmt)
40 | else:
41 | logging.basicConfig(level=logging.WARN, format=logfmt)
42 | logger.info(get_commandline_args())
43 |
44 | if args.preprocess_conf is not None:
45 | preprocessing = Transformation(args.preprocess_conf)
46 | logger.info("Apply preprocessing: {}".format(preprocessing))
47 | else:
48 | preprocessing = None
49 |
50 | for utt, shape in file_reader_helper(
51 | args.rspecifier, args.filetype, return_shape=True, transform=preprocessing):
52 | shape_str = ",".join(map(str, shape)) # shape is a tuple of ints
53 | args.out.write("{} {}\n".format(utt, shape_str))
54 |
55 |
56 | if __name__ == "__main__":
57 | main()
58 |
--------------------------------------------------------------------------------
/commonvoice/asr1/local/reduce_data_dir.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # koried, 10/29/2012
4 |
5 | # Reduce a data set based on a list of turn-ids
6 |
7 | help_message="usage: $0 srcdir turnlist destdir"
8 |
9 | if [ $1 == "--help" ]; then
10 | echo "${help_message}"
11 | exit 0;
12 | fi
13 |
14 | if [ $# != 3 ]; then
15 | echo "${help_message}"
16 | exit 1;
17 | fi
18 |
19 | srcdir=$1
20 | reclist=$2
21 | destdir=$3
22 |
23 | if [ ! -f ${srcdir}/utt2spk ]; then
24 | echo "$0: no such file $srcdir/utt2spk"
25 | exit 1;
26 | fi
27 |
28 | function do_filtering {
29 | # assumes the utt2spk and spk2utt files already exist.
30 | [ -f ${srcdir}/feats.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/feats.scp >${destdir}/feats.scp
31 | [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/wav.scp >${destdir}/wav.scp
32 | [ -f ${srcdir}/text ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/text >${destdir}/text
33 | [ -f ${srcdir}/utt2num_frames ] && utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/utt2num_frames >${destdir}/utt2num_frames
34 | [ -f ${srcdir}/spk2gender ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/spk2gender >${destdir}/spk2gender
35 | [ -f ${srcdir}/cmvn.scp ] && utils/filter_scp.pl ${destdir}/spk2utt <${srcdir}/cmvn.scp >${destdir}/cmvn.scp
36 | if [ -f ${srcdir}/segments ]; then
37 | utils/filter_scp.pl ${destdir}/utt2spk <${srcdir}/segments >${destdir}/segments
38 | awk '{print $2;}' ${destdir}/segments | sort | uniq > ${destdir}/reco # recordings.
39 | # This overrides the wav.scp created above; filtering wav.scp by utt2spk alone would be incorrect when a segments file exists.
40 | [ -f ${srcdir}/wav.scp ] && utils/filter_scp.pl ${destdir}/reco <${srcdir}/wav.scp >${destdir}/wav.scp
41 | [ -f ${srcdir}/reco2file_and_channel ] && \
42 | utils/filter_scp.pl ${destdir}/reco <${srcdir}/reco2file_and_channel >${destdir}/reco2file_and_channel
43 |
44 | # Filter the STM file for proper sclite scoring (this will also remove the comments lines)
45 | [ -f ${srcdir}/stm ] && utils/filter_scp.pl ${destdir}/reco < ${srcdir}/stm > ${destdir}/stm
46 | rm ${destdir}/reco
47 | fi
48 | srcutts=$(wc -l < ${srcdir}/utt2spk)
49 | destutts=$(wc -l < ${destdir}/utt2spk)
50 | echo "Reduced #utt from $srcutts to $destutts"
51 | }
52 |
53 | mkdir -p ${destdir}
54 |
55 | # filter the utt2spk based on the set of recordings
56 | utils/filter_scp.pl ${reclist} < ${srcdir}/utt2spk > ${destdir}/utt2spk
57 |
58 | utils/utt2spk_to_spk2utt.pl < ${destdir}/utt2spk > ${destdir}/spk2utt
59 | do_filtering;
60 |
--------------------------------------------------------------------------------
/speech_datasets/transform/interface.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | import inspect
3 |
4 | from speech_datasets.utils import check_kwargs
5 |
6 |
7 | class TransformInterface(object):
8 | """Transform Interface"""
9 |
10 | @abstractmethod
11 | def __call__(self, x):
12 | raise NotImplementedError("__call__ method is not implemented")
13 |
14 | @classmethod
15 | def add_arguments(cls, parser):
16 | return parser
17 |
18 | def __repr__(self):
19 | return self.__class__.__name__ + "()"
20 |
21 |
22 | class FuncTrans(TransformInterface):
23 | """Functional Transformation
24 |
25 | WARNING:
26 | Builtin or C/C++ functions may not work properly
27 | because this class heavily depends on the `inspect` module.
28 |
29 | Usage:
30 |
31 | >>> def foo_bar(x, a=1, b=2):
32 | ... '''Foo bar
33 | ... :param x: input
34 | ... :param int a: default 1
35 | ... :param int b: default 2
36 | ... '''
37 | ... return x + a - b
38 |
39 |
40 | >>> class FooBar(FuncTrans):
41 | ... _func = foo_bar
42 | ... __doc__ = foo_bar.__doc__
43 | """
44 |
45 | _func = None
46 |
47 | def __init__(self, **kwargs):
48 | self.kwargs = kwargs
49 | check_kwargs(self.func, kwargs)
50 |
51 | def __call__(self, x):
52 | return self.func(x, **self.kwargs)
53 |
54 | @classmethod
55 | def add_arguments(cls, parser):
56 | fname = cls._func.__name__.replace("_", "-")
57 | group = parser.add_argument_group(fname + " transformation setting")
58 | for k, v in cls.default_params().items():
59 | # TODO(karita): get help and choices from docstring?
60 | attr = k.replace("_", "-")
61 | group.add_argument(f"--{fname}-{attr}", default=v, type=type(v))
62 | return parser
63 |
64 | @property
65 | def func(self):
66 | return type(self)._func
67 |
68 | @classmethod
69 | def default_params(cls):
70 | try:
71 | d = dict(inspect.signature(cls._func).parameters)
72 | except ValueError:
73 | d = dict()
74 | return {
75 | k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty
76 | }
77 |
78 | def __repr__(self):
79 | params = self.default_params()
80 | params.update(**self.kwargs)
81 | ret = self.__class__.__name__ + "("
82 | if len(params) == 0:
83 | return ret + ")"
84 | for k, v in params.items():
85 | ret += "{}={}, ".format(k, v)
86 | return ret[:-2] + ")"
87 |
--------------------------------------------------------------------------------
/utils/apply_cmvn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 | log() {
4 | local fname=${BASH_SOURCE[1]##*/}
5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
6 | }
7 |
8 | # Copyright 2017 Nagoya University (Tomoki Hayashi)
9 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
10 |
11 | echo "$0 $*" # Print the command line for logging
12 |
13 | cmd=utils/run.pl
14 | nj=$(nproc)
15 | filetype='hdf5' # mat or hdf5
16 | cmvn_type='global' # global or utterance or speaker
17 | help_message="Usage: $0 <scp> <logdir> <dumpdir>"
18 |
19 | . ./path.sh || exit 1
20 | . utils/parse_options.sh || exit 1
21 |
22 | if [ $# != 3 ]; then
23 | log "${help_message}"
24 | exit 2
25 | fi
26 |
27 | scp=$1
28 | logdir=$2
29 | dumpdir=$(utils/make_absolute.sh $3)
30 |
31 | if [ ${filetype} = mat ]; then
32 | ext=ark
33 | elif [ ${filetype} = hdf5 ]; then
34 | ext=h5
35 | else
36 | log "Received --filetype '${filetype}', but only 'mat' and 'hdf5' are valid"
37 | exit 2
38 | fi
39 |
40 | if [ ${cmvn_type} != global ] && [ ${cmvn_type} != utterance ] && [ ${cmvn_type} != speaker ]; then
41 | log "Received --cmvn_type '${cmvn_type}', but only 'global', 'utterance', 'speaker'' are valid"
42 | fi
43 |
44 | srcdir=$(dirname "$scp")
45 | cmvnark=$srcdir/cmvn.ark
46 | maybe_utt2spk=
47 | if [ -f $srcdir/utt2spk ]; then
48 | maybe_utt2spk+="--utt2spk $srcdir/utt2spk "
49 | fi
50 | maybe_spk2utt=
51 | if [ -f $srcdir/spk2utt ]; then
52 | maybe_spk2utt+="--spk2utt $srcdir/spk2utt "
53 | fi
54 |
55 | mkdir -p ${logdir}
56 | mkdir -p ${dumpdir}
57 |
58 | # compute CMVN stats
59 | python -m speech_datasets.bin.compute_cmvn_stats \
60 | --in-filetype ${filetype} ${maybe_spk2utt} \
61 | --cmvn-type ${cmvn_type} scp:${scp} ${cmvnark}
62 |
63 | echo $cmvn_type > $srcdir/cmvn_type
64 |
65 | # split scp file
66 | split_scps=""
67 | for n in $(seq ${nj}); do
68 | split_scps="$split_scps $logdir/feats.$n.scp"
69 | done
70 |
71 | utils/split_scp.pl ${scp} ${split_scps} || exit 1;
72 |
73 | # apply CMVN to features & dump them
74 | ${cmd} JOB=1:${nj} ${logdir}/apply_cmvn.JOB.log \
75 | apply_cmvn.py --norm-vars true --in-filetype ${filetype} --out-filetype ${filetype} \
76 | --cmvn-type ${cmvn_type} ${maybe_utt2spk} ${cmvnark} scp:${logdir}/feats.JOB.scp \
77 | ark,scp:${dumpdir}/feats.JOB.${ext},${dumpdir}/feats.JOB.scp \
78 | || exit 1
79 |
80 | # concatenate scp files
81 | for n in $(seq ${nj}); do
82 | cat ${dumpdir}/feats.${n}.scp || exit 1;
83 | done > ${dumpdir}/feats.scp || exit 1
84 |
85 | # remove temp scps
86 | rm ${dumpdir}/feats.*.scp 2>/dev/null
87 | rm ${logdir}/feats.*.scp 2>/dev/null
88 | log "Succeeded applying CMVN to features for training."
89 |
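Example usage (a sketch; the directories are hypothetical, and --cmvn_type / --filetype must match how the features were dumped):

    utils/apply_cmvn.sh --cmvn_type speaker --filetype hdf5 \
        dump/fbank/train/feats.scp exp/apply_cmvn/train/log dump/fbank/train/norm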
--------------------------------------------------------------------------------
/swbd/asr1/local/swbd1_fix_speakerid.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; #sed replacement for -w perl parameter
3 |
4 | # Author: Peng Qi (pengqi@cs.stanford.edu)
5 | # This script maps Switchboard speaker IDs to the true physical speakers
6 | # and fixes the utterance IDs accordingly. Expected to be run from one
7 | # directory level above.
8 |
9 | sub trim {
10 | (my $s = $_[0]) =~ s/^\s+|\s+$//g;
11 | return $s;
12 | }
13 |
14 | if ($#ARGV != 1) {
15 | print "Usage: swbd1_fix_speakerid.pl \n";
16 | print "E.g.: swbd1_fix_speakerid.pl /datasets/SWBD1Transcripts/tables/conv.tab data/train\n";
17 | }
18 |
19 | $tab_file = $ARGV[0];
20 | $dir = $ARGV[1];
21 |
22 | %conv_to_spk = ();
23 |
24 | open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";
25 |
26 | while (my $line = <$conv_tab>) {
27 | chomp $line;
28 |
29 | my @fields = split "," , $line;
30 | #$fields[0] = trim($fields[0]);
31 | $fields[2] = trim($fields[2]);
32 | $fields[3] = trim($fields[3]);
33 | $conv_to_spk{'{swbd}0' . $fields[0] . '-A'} = $fields[2];
34 | $conv_to_spk{'{swbd}0' . $fields[0] . '-B'} = $fields[3];
35 | }
36 |
37 | close($conv_tab);
38 |
39 | # fix utt2spk
40 |
41 | %missingconv = ();
42 |
43 | open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
44 | open(my $utt2spk_new, '>', $dir . '/utt2spk.new');
45 |
46 | while (my $line = <$utt2spk>) {
47 | chomp $line;
48 |
49 | my @fields = split " " , $line;
50 | my $convid = substr $fields[0], 0, 9;
51 |
52 | if (exists $conv_to_spk{ $convid }) {
53 | my $spkid = $conv_to_spk{ $convid };
54 | $spkid = "{swbd}" . $spkid;
55 | my $newuttid = $spkid . '-' . (substr $fields[0], 2);
56 |
57 | print $utt2spk_new "$newuttid $spkid\n";
58 | } else {
59 | my $convid = substr $convid, 3, 4;
60 | $missingconv{$convid} = 1;
61 |
62 | print $utt2spk_new $fields[0]." ".$fields[1]."\n";
63 | }
64 | }
65 |
66 | close($utt2spk);
67 | close($utt2spk_new);
68 |
69 | foreach my $conv (keys %missingconv) {
70 | print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n"
71 | }
72 |
73 | # fix segments and text
74 |
75 | foreach my $file ('segments','text') {
76 | open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
77 | open(my $newfile, '>', "$dir/$file.new");
78 |
79 | while (my $line = <$oldfile>) {
80 | chomp $line;
81 |
82 | my $convid = substr $line, 0, 9;
83 | if (exists $conv_to_spk{$convid}) {
84 | my $spkid = $conv_to_spk{$convid};
85 | print $newfile "{swbd}$spkid-" . (substr $line, 2) . "\n";
86 | } else {
87 | print $newfile "$line\n";
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/wsj/asr1/local/ndx2flist.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2011 Microsoft Corporation
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABLITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | # This program takes as its standard input an .ndx file from the WSJ corpus that looks
19 | # like this:
20 | #;; File: tr_s_wv1.ndx, updated 04/26/94
21 | #;;
22 | #;; Index for WSJ0 SI-short Sennheiser training data
23 | #;; Data is read WSJ sentences, Sennheiser mic.
24 | #;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
25 | #;; per speaker TI) = 7236 utts
26 | #;;
27 | #11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
28 | #11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
29 | #11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
30 |
31 | #and as command-line arguments it takes the names of the WSJ disk locations, e.g.:
32 | #/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
33 | # It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with
34 | # /mnt/matylda2/data/WSJ0/11-1.1).
35 | # It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with
36 | # uppercase rather than lower case filenames.
37 |
38 | foreach $fn (@ARGV) {
39 | $fn =~ m:.+/([0-9\.\-]+)/?$: || die "Bad command-line argument $fn\n";
40 | $disk_id=$1;
41 | $disk_id =~ tr/-\./__/; # replace - and . with _ so 11-10.1 becomes 11_10_1
42 | $fn =~ s:/$::; # Remove final slash, just in case it is present.
43 | $disk2fn{$disk_id} = $fn;
44 | }
45 |
46 | while(<STDIN>){
47 | if(m/^;/){ next; } # Comment. Ignore it.
48 | else {
49 | m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
50 | $disk=$1;
51 | if(!defined $disk2fn{$disk}) {
52 | die "Disk id $disk not found";
53 | }
54 | $filename = $2; # as a subdirectory of the distributed disk.
55 | if($disk eq "13_16_1" && `hostname` =~ m/fit.vutbr.cz/) {
56 | # The disk 13-16.1 has been uppercased for some reason, on the
57 | # BUT system. This is a fix specifically for that case.
58 | $filename =~ tr/a-z/A-Z/; # This disk contains all uppercase filenames. Why?
59 | }
60 | print "$disk2fn{$disk}/$filename\n";
61 | }
62 | }
63 |
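Example usage (a sketch; the disk paths are hypothetical, and the .ndx file is read from standard input):

    local/ndx2flist.pl /mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 \
        < tr_s_wv1.ndx > train.flist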
--------------------------------------------------------------------------------
/commonvoice/asr1/local/split_tr_dt_et.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | log() {
4 | local fname=${BASH_SOURCE[1]##*/}
5 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
6 | }
7 |
8 | # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
9 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
10 |
11 | . ./path.sh
12 |
13 | perdt=10 # percent for dev set
14 | peret=10 # percent for eval set
15 |
16 | . utils/parse_options.sh
17 |
18 | if [ $# != 4 ]; then
19 | log "Usage: $0 ";
20 | exit 1;
21 | fi
22 |
23 | sdata=$1
24 | trdata=$2
25 | dtdata=$3
26 | etdata=$4
27 |
28 | tmpdata=$trdata/tmp
29 | mkdir -p $tmpdata
30 | mkdir -p $dtdata
31 | mkdir -p $etdata
32 |
33 | # make a unique prompts file
34 | # some transcripts have multiple spaces; tr -s " " squeezes them
35 | cut -f 2- -d" " $sdata/text | tr -s " " | sort | uniq > $tmpdata/prompts
36 | num_prompt=$(wc -l $tmpdata/prompts | awk '{print $1}')
37 |
38 | num_dt=$(echo "$num_prompt * $perdt / 100" | bc)
39 | num_et=$(echo "$num_prompt * $peret / 100" | bc)
40 | log "number of dev set prompts: $num_dt"
41 | log "number of eval set prompts: $num_et"
42 |
43 | # dt (note: shuffle_list.pl uses a fixed default seed, so the repeated calls below see the same shuffled order)
44 | utils/shuffle_list.pl $tmpdata/prompts | head -n $num_dt > $tmpdata/dt_prompts
45 | # et
46 | utils/shuffle_list.pl $tmpdata/prompts | head -n $(echo "$num_dt + $num_et" | bc) \
47 | | tail -n $num_et > $tmpdata/et_prompts
48 | # tr
49 | nrest=$(echo "$num_dt + $num_et + 1" | bc)
50 | utils/shuffle_list.pl $tmpdata/prompts | tail -n +$nrest > $tmpdata/tr_prompts
51 | log "number of train set prompts: $(wc -l $tmpdata/tr_prompts | awk '{print $1}')"
52 |
53 | # this can take a very long time when the number of prompts is large
54 | cat $sdata/text | local/filter_text.py -f $tmpdata/dt_prompts | awk '{print $1}' | sort > $tmpdata/dt.ids
55 | log "finished text extraction for dev set #utt = $(wc -l $tmpdata/dt.ids | awk '{print $1}')"
56 | cat $sdata/text | local/filter_text.py -f $tmpdata/et_prompts | awk '{print $1}' | sort > $tmpdata/et.ids
57 | log "finished text extraction for eval set #utt = $(wc -l $tmpdata/et.ids | awk '{print $1}')"
58 | cat $tmpdata/dt.ids $tmpdata/et.ids | sort > $tmpdata/dtet.ids
59 | cat $sdata/text | awk '{print $1}' | sort > $tmpdata/all.ids
60 | diff $tmpdata/all.ids $tmpdata/dtet.ids | awk '/^</{print $2}' | sort > $tmpdata/tr.ids
61 | log "finished text extraction for train set #utt = $(wc -l $tmpdata/tr.ids | awk '{print $1}')"
62 |
63 | log "dev data: $(reduce_data_dir.sh $sdata $tmpdata/dt.ids $dtdata)"
64 | utils/fix_data_dir.sh $dtdata
65 |
66 | log "eval data: $(reduce_data_dir.sh $sdata $tmpdata/et.ids $etdata)"
67 | utils/fix_data_dir.sh $etdata
68 |
69 | log "train data: $(reduce_data_dir.sh $sdata $tmpdata/tr.ids $trdata)"
70 | utils/fix_data_dir.sh $trdata
71 |
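Example usage (a sketch with hypothetical directories; 10/10 are the default dev/eval percentages anyway, shown here for clarity):

    local/split_tr_dt_et.sh --perdt 10 --peret 10 \
        data/validated data/train data/dev data/eval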
--------------------------------------------------------------------------------
/tools/install_anaconda.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | if [ -z "${PS1:-}" ]; then
5 | PS1=__dummy__
6 | fi
7 | CONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
8 |
9 | n_required_args=5
10 | if [ $# -lt $n_required_args ] ; then
11 | echo "Usage: $0 [conda install args]*"
12 | exit 1;
13 | fi
14 | PYTHON_VERSION="$1"
15 | CONDA="$2"
16 | VENV_DIR="$3"
17 | VENV_NAME="$4"
18 | PACKAGE_ROOT="$5"
19 | shift $n_required_args
20 |
21 | # Download conda if an installation isn't specified
22 | if [ -z "${CONDA}" ]; then
23 | CONDA="${VENV_DIR}/bin/conda"
24 | if [ ! -f "${CONDA}" ]; then # download miniconda if no conda is installed there yet
25 | if [ ! -f "${PACKAGE_ROOT}/tools/miniconda.sh" ]; then
26 | wget --tries=3 "${CONDA_URL}" -O "${PACKAGE_ROOT}/tools/miniconda.sh"
27 | fi
28 | if [ ! -d "${VENV_DIR}" ]; then
29 | bash "${PACKAGE_ROOT}/tools/miniconda.sh" -b -p "${VENV_DIR}"
30 | fi
31 | fi
32 | else
33 | ln -sf "$(${CONDA} info --base)" "${VENV_DIR}"
34 | fi
35 |
36 | # Check if the environment already exists
37 | if ${CONDA} env list | (! grep -q -E "${VENV_NAME}\s"); then
38 | ${CONDA} create -y -n "${VENV_NAME}" "python=${PYTHON_VERSION}"
39 | else
40 | read -r -p "Environment ${VENV_NAME} already exists. Continue setup anyways? (y/n) " choice
41 | case $choice in
42 | y|Y|yes|Yes ) echo "Continuing to set up environment ${VENV_NAME}." ;;
43 | * ) echo "Either pick a different value for VENV_NAME, or remove the ${CONDA} environment ${VENV_NAME} before re-running this script." && exit 1 ;;
44 | esac
45 | fi
46 |
47 | # Activate conda environment & check Python version
48 | source "${VENV_DIR}/etc/profile.d/conda.sh" && conda deactivate && conda activate "${VENV_NAME}"
49 | INSTALLED_PYTHON_VERSION=$(python -V | grep -Eo "[[:digit:].]*")
50 | if [ ${INSTALLED_PYTHON_VERSION} != ${PYTHON_VERSION} ]; then
51 | echo "Enviroment ${VENV_NAME} is Python ${INSTALLED_PYTHON_VERSION}, but Python ${PYTHON_VERSION} requested."
52 | read -r -p "Continue setup with Python ${INSTALLED_PYTHON_VERSION} anyways? (y/n) " choice
53 | case $choice in
54 | y|Y|yes|Yes ) echo "Continuing to set up environment ${VENV_NAME}." ;;
55 | * ) echo "Either pick a different value for VENV_NAME, or change PYTHON_VERSION to ${INSTALLED_PYTHON_VERSION} before re-running this script." && exit 1 ;;
56 | esac
57 | fi
58 |
59 | conda update -y -n "${VENV_NAME}" -c defaults conda
60 |
61 | # Install any conda dependencies (specified via command line)
62 | while (( "$#" )); do
63 | echo ""
64 | echo "conda install -y -n ${VENV_NAME} $1"
65 | conda install -y -n "${VENV_NAME}" $1
66 | shift
67 | done
68 |
69 | # Install the speech_datasets package in editable mode
70 | pip install -e "${PACKAGE_ROOT}"
71 |
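Example usage (a sketch; an empty string for <conda-bin> triggers the miniconda download path, and "pytorch" stands in for any extra conda dependency):

    tools/install_anaconda.sh 3.8 "" ./venv speech_datasets . pytorch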
--------------------------------------------------------------------------------
/COMBINE/asr1/combine_cmvn_stats.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set bash to 'debug' mode, it will exit on :
3 | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
4 | set -e
5 | set -u
6 | set -o pipefail
7 |
8 | log() {
9 | local fname=${BASH_SOURCE[1]##*/}
10 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
11 | }
12 |
13 | feats_type=fbank # fbank or fbank_pitch are valid
14 | cmvn_type=global # global or speaker or utterance are valid
15 |
16 | help_message=$(cat << EOF
17 | Usage: $0 <dataset1>/<split1> <dataset2>/<split2> ...
18 |
19 | Combines CMVN stats for the specified dataset splits (pre-computed by stage 5 of run.sh for each dataset split)
20 | into a single file.
21 |
22 | Options:
23 | --feats_type # Feature type (fbank or fbank_pitch) (default=${feats_type}).
24 | --cmvn_type # Type of CMVN stats to compute (global or speaker or utterance) (default=${cmvn_type}).
25 | EOF
26 | )
27 |
28 |
29 | . ./path.sh || exit 1
30 | . ./cmd.sh || exit 1
31 |
32 | log "$0 $*"
33 | . utils/parse_options.sh || exit 1
34 | if [ $# -eq 0 ]; then
35 | log "${help_message}"
36 | log "Error: Please specify dataset splits as positional arguments."
37 | exit 2
38 | fi
39 |
40 | workspace=$PWD
41 | task=$(basename "$(utils/make_absolute.sh "$workspace")")
42 |
43 | # Get CMVN's from all the relevant dataset splits
44 | cmvns=
45 | for dset in "$@"; do
46 | base=$(echo ${dset} | sed -E "s/\/.*//g")
47 | split=$(echo ${dset} | sed -E "s/.*\///g")
48 | base_dir="${MAIN_ROOT}/${base}/${task}"
49 | dset_dir="${base_dir}/dump/${feats_type}"/${split}
50 | cmvn="${dset_dir}/${cmvn_type}_cmvn.ark"
51 |
52 | if [ ! -d ${base_dir} ]; then
53 | log "${base} is not a valid dataset for task ${task//1/}"
54 | exit 1
55 | elif [ "${base}" = "${dset}" ]; then
56 | log "Expected dataset to specified as /, but got ${dset}"
57 | exit 1
58 | elif [ ! -d ${dset_dir} ]; then
59 | log "Either ${split} is not a valid split for dataset ${base}, or"
60 | log "${base_dir}/run.sh has not yet been run with feats_type=${feats_type}"
61 | exit 1
62 | elif [ ! -f ${cmvn} ]; then
63 | log "${cmvn_type} CMVN statistics have not been computed for feats_type=${feats_type} for data split ${dset}."
64 | log "Please run stage 5 of ${base_dir}/${task}/run.sh."
65 | exit 1
66 | fi
67 | cmvns+="${cmvn} "
68 | done
69 |
70 | # Combine CMVN's
71 | combo_idx=$(python3 local/combine_datasets.py --task "${task//1/}" --write_dir false "$@")
72 | dumpdir="dump/${feats_type}/no_short/${combo_idx}"
73 | mkdir -p "${dumpdir}"
74 | python3 -m speech_datasets.bin.combine_cmvn_stats --cmvn_type ${cmvn_type} \
75 | --output_file "${dumpdir}/${cmvn_type}_cmvn.ark" ${cmvns}
76 |
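Example usage (a sketch; the dataset/split names are hypothetical and must already have per-split CMVN stats from stage 5 of their respective run.sh):

    ./combine_cmvn_stats.sh --feats_type fbank --cmvn_type global \
        librispeech/train-clean-100 wsj/train_si284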
--------------------------------------------------------------------------------
/commonvoice/asr1/local/data_prep.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #
3 | # Copyright 2017 Ewald Enzinger
4 | # Apache 2.0
5 | #
6 | # Usage: data_prep.pl /export/data/cv_corpus_v1 cv-valid-train valid-train
7 |
8 | if (@ARGV != 3) {
9 | print STDERR "Usage: $0 <path-to-corpus> <dataset> <out-dir>\n";
10 | print STDERR "e.g. $0 /export/data/cv_corpus_v1 cv-valid-train valid-train\n";
11 | exit(1);
12 | }
13 |
14 | # use ffmpeg for mp3 to wav
15 | if (length(`which ffmpeg`) == 0) {
16 | print "Please install 'ffmpeg' on All worker nodes!\n";
17 | exit 1;
18 | }
19 |
20 |
21 | ($db_base, $dataset, $out_dir) = @ARGV;
22 | mkdir data unless -d data;
23 | mkdir $out_dir unless -d $out_dir;
24 |
25 | open(CSV, "<", "$db_base/$dataset.tsv") or die "cannot open dataset CSV file";
26 | open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
27 | open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
28 | open(TEXT,">", "$out_dir/text") or die "Could not open the output file $out_dir/text";
29 | open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
30 | my $header = <CSV>;
31 | while(<CSV>) {
32 | chomp;
33 | ($spkr, $filepath, $text, $upvotes, $downvotes, $age, $gender, $accent) = split("\t", $_);
34 | # speaker comes from commonvoice --> uttId comes from commonvoice
35 | $spkr = "{commonvoice}$spkr";
36 | if ("$gender" eq "female") {
37 | $gender = "f";
38 | } else {
39 | # Use male as default if not provided (no reason, just adopting the same default as in voxforge)
40 | $gender = "m";
41 | }
42 | $uttId = $filepath;
43 | if (-z "$db_base/clips/$filepath") {
44 | print "null file $filepath\n";
45 | next;
46 | }
47 | $uttId =~ s/\.mp3//g;
48 | $uttId =~ tr/\//-/;
49 | # speaker information should be a prefix of the utterance ID
50 | $uttId = "$spkr-$uttId";
51 |
52 | # make sure all text is lowercase
53 | $text =~ tr/A-Z/a-z/;
54 |
55 | # get rid of all puncts besides apostrophes
56 | $text =~ s/[^\w\s']//g;
57 | $text =~ s/(\s)'/$1/g;
58 | $text =~ s/'(\s)/$1/g;
59 |
60 | if (index($text, "{") != -1 and index($text, "}") != -1) {
61 | next;
62 | }
63 | print TEXT "$uttId"," ","$text","\n";
64 | print GNDR "$spkr"," ","$gender","\n";
65 | print WAV "$uttId"," ffmpeg -i $db_base/clips/$filepath -f wav -ar 16000 -ab 16 - |\n";
66 | print SPKR "$uttId"," $spkr","\n";
67 | }
68 | close(SPKR) || die;
69 | close(TEXT) || die;
70 | close(WAV) || die;
71 | close(GNDR) || die;
72 | close(CSV);
73 |
74 | # Use utt2spk to generate spk2utt
75 | if (system(
76 | "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
77 | die "Error creating spk2utt file in directory $out_dir";
78 | }
79 |
80 | # Validate the data directory
81 | system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
82 | if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-feats $out_dir") != 0) {
83 | die "Error validating directory $out_dir";
84 | }
85 |
--------------------------------------------------------------------------------
/speech_datasets/text/tokenizers.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from pathlib import Path
3 | from typing import Iterable, List, Union
4 |
5 | import sentencepiece as spm
6 | from typeguard import check_argument_types
7 |
8 |
9 | class AbsTokenizer(ABC):
10 | @abstractmethod
11 | def text2tokens(self, line: str) -> List[str]:
12 | raise NotImplementedError
13 |
14 | @abstractmethod
15 | def tokens2text(self, tokens: Iterable[str]) -> str:
16 | raise NotImplementedError
17 |
18 | @abstractmethod
19 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
20 | raise NotImplementedError
21 |
22 | @abstractmethod
23 | def ids2tokens(self, ids: Iterable[int]) -> List[str]:
24 | raise NotImplementedError
25 |
26 | def text2ids(self, line: str) -> List[int]:
27 | return self.tokens2ids(self.text2tokens(line))
28 |
29 | def ids2text(self, ids: Iterable[int]) -> str:
30 | return self.tokens2text(self.ids2tokens(ids))
31 |
32 | @abstractmethod
33 | def __len__(self):
34 | raise NotImplementedError
35 |
36 |
37 | class SentencepieceTokenizer(AbsTokenizer):
38 | def __init__(self, model: Union[Path, str],
39 | token_list: Union[Path, str, Iterable[str]] = None):
40 | assert check_argument_types()
41 | self.model = str(model)
42 | self.sp = spm.SentencePieceProcessor()
43 | self.sp.load(self.model)
44 |
45 | if isinstance(token_list, (Path, str)):
46 | char_list = Path(token_list)
47 | with char_list.open("r", encoding="utf-8") as f:
48 | token_list = [line.rstrip() for line in f]
49 | elif token_list is None:
50 | token_list = [self.sp.IdToPiece(i)
51 | for i in range(self.sp.get_piece_size())]
52 |
53 | self.idx2tok = {i: tok for i, tok in enumerate(token_list)}
54 | self.tok2idx = {tok: i for i, tok in enumerate(token_list)}
55 |
56 | def __repr__(self):
57 | return f'{self.__class__.__name__}(model="{self.model}")'
58 |
59 | def __getstate__(self):
60 | state = self.__dict__.copy()
61 | state["sp"] = None
62 | return state
63 |
64 | def __setstate__(self, state):
65 | self.__dict__ = state
66 | self.sp = spm.SentencePieceProcessor()
67 | self.sp.load(self.model)
68 |
69 | def text2tokens(self, line: str) -> List[str]:
70 | return self.sp.EncodeAsPieces(line)
71 |
72 | def tokens2text(self, tokens: Iterable[str]) -> str:
73 | return self.sp.DecodePieces(list(tokens))
74 |
75 | def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
76 | return [self.tok2idx.get(tok, self.tok2idx["<unk>"]) for tok in tokens]
77 |
78 | def ids2tokens(self, ids: Iterable[int]) -> List[str]:
79 | return [self.idx2tok[idx] for idx in ids]
80 |
81 | def __len__(self):
82 | if self.idx2tok is None:
83 | return self.sp.get_piece_size()
84 | else:
85 | return len(self.idx2tok)
86 |
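A minimal round-trip sketch (assumes a trained SentencePiece model at the hypothetical path bpe.model):

python3 - <<'EOF'
from speech_datasets.text.tokenizers import SentencepieceTokenizer
tok = SentencepieceTokenizer("bpe.model")  # hypothetical model path
ids = tok.text2ids("hello world")          # text -> token IDs
print(tok.ids2text(ids))                   # decodes back to "hello world"
EOF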
--------------------------------------------------------------------------------
/utils/filter_scp.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2012 Microsoft Corporation
3 | # Johns Hopkins University (author: Daniel Povey)
4 |
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
14 | # MERCHANTABLITY OR NON-INFRINGEMENT.
15 | # See the Apache 2 License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | # This script takes a list of utterance-ids or any file whose first field
20 | # of each line is an utterance-id, and filters an scp
21 | # file (or any file whose "n-th" field is an utterance id), printing
22 | # out only those lines whose "n-th" field is in id_list. The index of
23 | # the "n-th" field is 1, by default, but can be changed by using
24 | # the -f switch
25 |
26 | $exclude = 0;
27 | $field = 1;
28 | $shifted = 0;
29 |
30 | do {
31 | $shifted=0;
32 | if ($ARGV[0] eq "--exclude") {
33 | $exclude = 1;
34 | shift @ARGV;
35 | $shifted=1;
36 | }
37 | if ($ARGV[0] eq "-f") {
38 | $field = $ARGV[1];
39 | shift @ARGV; shift @ARGV;
40 | $shifted=1
41 | }
42 | } while ($shifted);
43 |
44 | if(@ARGV < 1 || @ARGV > 2) {
45 | die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" .
46 | "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
47 | "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
48 | "only the lines that were *not* in id_list.\n" .
49 | "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
50 | "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
51 | "-f option, add 1 to the argument.\n" .
52 | "See also: utils/filter_scp.pl .\n";
53 | }
54 |
55 |
56 | $idlist = shift @ARGV;
57 | open(F, "<$idlist") || die "Could not open id-list file $idlist";
58 | while(<F>) {
59 | @A = split;
60 | @A>=1 || die "Invalid id-list file line $_";
61 | $seen{$A[0]} = 1;
62 | }
63 |
64 | if ($field == 1) { # Treat this as special case, since it is common.
65 | while(<>) {
66 | $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
67 | # $1 is what we filter on.
68 | if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
69 | print $_;
70 | }
71 | }
72 | } else {
73 | while(<>) {
74 | @A = split;
75 | @A > 0 || die "Invalid scp file line $_";
76 | @A >= $field || die "Invalid scp file line $_";
77 | if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
78 | print $_;
79 | }
80 | }
81 | }
82 |
83 | # tests:
84 | # the following should print "foo 1"
85 | # ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
86 | # the following should print "bar 2".
87 | # ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
88 |
--------------------------------------------------------------------------------
/utils/subset_scp.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; #sed replacement for -w perl parameter
3 | # Copyright 2010-2011 Microsoft Corporation
4 |
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
14 | # MERCHANTABLITY OR NON-INFRINGEMENT.
15 | # See the Apache 2 License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # This program selects a subset of N elements in the scp.
19 |
20 | # By default, it selects them evenly from throughout the scp, in order to avoid
21 | # selecting too many from the same speaker. It prints them on the standard
22 | # output.
23 | # With the option --first, it just selects the N first utterances.
24 | # With the option --last, it just selects the N last utterances.
25 |
26 | # Last modified by JHU & HKUST @2013
27 |
28 |
29 | $quiet = 0;
30 | $first = 0;
31 | $last = 0;
32 |
33 | if (@ARGV > 0 && $ARGV[0] eq "--quiet") {
34 | shift;
35 | $quiet = 1;
36 | }
37 | if (@ARGV > 0 && $ARGV[0] eq "--first") {
38 | shift;
39 | $first = 1;
40 | }
41 | if (@ARGV > 0 && $ARGV[0] eq "--last") {
42 | shift;
43 | $last = 1;
44 | }
45 |
46 | if(@ARGV < 2 ) {
47 | die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" .
48 | " --quiet causes it to not die if N < num lines in scp.\n" .
49 | " --first and --last make it equivalent to head or tail.\n" .
50 | "See also: filter_scp.pl\n";
51 | }
52 |
53 | $N = shift @ARGV;
54 | if($N == 0) {
55 | die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\"";
56 | }
57 | $inscp = shift @ARGV;
58 | open(I, "<$inscp") || die "Opening input scp file $inscp";
59 |
60 | @F = ();
61 | while(<I>) {
62 | push @F, $_;
63 | }
64 | $numlines = @F;
65 | if($N > $numlines) {
66 | if ($quiet) {
67 | $N = $numlines;
68 | } else {
69 | die "You requested from subset_scp.pl more elements than available: $N > $numlines";
70 | }
71 | }
72 |
73 | sub select_n {
74 | my ($start,$end,$num_needed) = @_;
75 | my $diff = $end - $start;
76 | if ($num_needed > $diff) {
77 | die "select_n: code error";
78 | }
79 | if ($diff == 1 ) {
80 | if ($num_needed > 0) {
81 | print $F[$start];
82 | }
83 | } else {
84 | my $halfdiff = int($diff/2);
85 | my $halfneeded = int($num_needed/2);
86 | select_n($start, $start+$halfdiff, $halfneeded);
87 | select_n($start+$halfdiff, $end, $num_needed - $halfneeded);
88 | }
89 | }
90 |
91 | if ( ! $first && ! $last) {
92 | if ($N > 0) {
93 | select_n(0, $numlines, $N);
94 | }
95 | } else {
96 | if ($first) { # --first option: same as head.
97 | for ($n = 0; $n < $N; $n++) {
98 | print $F[$n];
99 | }
100 | } else { # --last option: same as tail.
101 | for ($n = @F - $N; $n < @F; $n++) {
102 | print $F[$n];
103 | }
104 | }
105 | }
106 |
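Example usage (a sketch; paths are hypothetical):

    # evenly spaced 1000-line subset (the default selection strategy)
    utils/subset_scp.pl 1000 data/train/feats.scp > data/train_1k/feats.scp
    # --first/--last behave like head or tail
    utils/subset_scp.pl --first 1000 data/train/feats.scp > head_1k.scp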
--------------------------------------------------------------------------------
/utils/apply_map.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; #sed replacement for -w perl parameter
3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
4 | # Apache 2.0.
5 |
6 | # This program is a bit like ./sym2int.pl in that it applies a map
7 | # to things in a file, but it's a bit more general in that it doesn't
8 | # assume the things being mapped to are single tokens, they could
9 | # be sequences of tokens. See the usage message.
10 |
11 |
12 | $permissive = 0;
13 |
14 | for ($x = 0; $x <= 2; $x++) {
15 |
16 | if (@ARGV > 0 && $ARGV[0] eq "-f") {
17 | shift @ARGV;
18 | $field_spec = shift @ARGV;
19 | if ($field_spec =~ m/^\d+$/) {
20 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
21 | }
22 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
23 | if ($1 ne "") {
24 | $field_begin = $1 - 1; # Change to zero-based indexing.
25 | }
26 | if ($2 ne "") {
27 | $field_end = $2 - 1; # Change to zero-based indexing.
28 | }
29 | }
30 | if (!defined $field_begin && !defined $field_end) {
31 | die "Bad argument to -f option: $field_spec";
32 | }
33 | }
34 |
35 | if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
36 | shift @ARGV;
37 | # Mapping is optional (missing key is printed to output)
38 | $permissive = 1;
39 | }
40 | }
41 |
42 | if(@ARGV != 1) {
43 | print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
44 | print STDERR <<'EOF';
45 | Usage: apply_map.pl [options] map <input >output
46 | options: [-f <field-range>] [--permissive]
47 | This applies a map to some specified fields of some input text:
48 | For each line in the map file: the first field is the thing we
49 | map from, and the remaining fields are the sequence we map it to.
50 | The -f (field-range) option says which fields of the input file the map
51 | should apply to.
52 | If the --permissive option is supplied, fields which are not present
53 | in the map will be left as they were.
54 | Applies the map 'map' to all input text, where each line of the map
55 | is interpreted as a map from the first field to the list of the other fields
56 | Note: <field-range> can look like 4-5, or 4-, or 5-, or 1; it means the field
57 | range in the input to apply the map to.
58 | e.g.: echo A B | apply_map.pl a.txt
59 | where a.txt is:
60 | A a1 a2
61 | B b
62 | will produce:
63 | a1 a2 b
64 | EOF
65 | exit(1);
66 | }
67 |
68 | ($map_file) = @ARGV;
69 | open(M, "<$map_file") || die "Error opening map file $map_file: $!";
70 |
71 | while (<M>) {
72 | @A = split(" ", $_);
73 | @A >= 1 || die "apply_map.pl: empty line.";
74 | $i = shift @A;
75 | $o = join(" ", @A);
76 | $map{$i} = $o;
77 | }
78 |
79 | while(<STDIN>) {
80 | @A = split(" ", $_);
81 | for ($x = 0; $x < @A; $x++) {
82 | if ( (!defined $field_begin || $x >= $field_begin)
83 | && (!defined $field_end || $x <= $field_end)) {
84 | $a = $A[$x];
85 | if (!defined $map{$a}) {
86 | if (!$permissive) {
87 | die "apply_map.pl: undefined key $a in $map_file\n";
88 | } else {
89 | print STDERR "apply_map.pl: warning! missing key $a in $map_file\n";
90 | }
91 | } else {
92 | $A[$x] = $map{$a};
93 | }
94 | }
95 | }
96 | print join(" ", @A) . "\n";
97 | }
98 |
--------------------------------------------------------------------------------
/swbd/asr1/local/data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -u
4 | set -o pipefail
5 |
6 | log() {
7 | local fname=${BASH_SOURCE[1]##*/}
8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 | }
10 |
11 | . ./path.sh || exit 1
12 | . ./db.sh || exit 1
13 |
14 | # Extract switchboard-1
15 | if [ -z "${SWBD1}" ]; then
16 | log "Fill the value of 'SWBD1' in db.sh"
17 | exit 1
18 | elif [ ! -e "${SWBD1}" ]; then
19 | mkdir -p "${SWBD1}"
20 | {
21 | tar xzvf ${SWBD1_TGZ} -C "${SWBD1}"
22 | } || {
23 | log "Failed to extract SWBD1"
24 | exit 1
25 | }
26 | fi
27 |
28 | # Download switchboard-1 transcripts if needed
29 | if [ ! -d "${SWBD1}/swb_ms98_transcriptions" ]; then
30 | echo " *** Downloading trascriptions and dictionary ***"
31 | wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz ||
32 | wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz
33 | tar xzvf switchboard_word_alignments.tar.gz -C "${SWBD1}"
34 | rm switchboard_word_alignments.tar.gz
35 | else
36 | log "Directory with transcriptions exists, skipping downloading."
37 | fi
38 |
39 | # Prepare the dictionary & the rest of the Switchboard-1 data
40 | log "local/swbd1_prepare_dict.sh ${SWBD1}"
41 | local/swbd1_prepare_dict.sh "${SWBD1}"
42 | log "local/swbd1_data_prep.sh ${SWBD1}"
43 | local/swbd1_data_prep.sh "${SWBD1}"
44 |
45 | # Extract & prepare EVAL-2000
46 | if [ "$(echo "${EVAL2000}" | wc -w)" != 2 ]; then
47 | log "Fill the value of 'EVAL2000' in db.sh (2 items required, hub5e_00 and hub5)"
48 | fi
49 | for (( i=1; i<=2; i++ )); do
50 | src=$(echo "${EVAL2000_TGZ}" | cut -d " " -f $i)
51 | dst=$(echo "${EVAL2000}" | cut -d " " -f $i)
52 | # hub5e is in a sub-directory
53 | if [ $i = 1 ]; then
54 | dst=$(dirname "${dst}")
55 | fi
56 |
57 | if [ ! -e "${dst}" ]; then
58 | mkdir -p "${dst}"
59 | {
60 | tar xzvf "${src}" -C "${dst}"
61 | } || {
62 | log "Failed to extract EVAL2000 (part $i)"
63 | exit 1
64 | }
65 | fi
66 | done
67 |
68 | # Note: do not quote ${EVAL2000} -- it should contain 2 directories, and eval2000_data_prep.sh requires 2 arguments
69 | log "local/eval2000_data_prep.sh ${EVAL2000}"
70 | local/eval2000_data_prep.sh ${EVAL2000}
71 |
72 | # Extract & prepare RT-03
73 | if [ -z "${RT03}" ]; then
74 | log "Fill the value of 'RT03' in db.sh"
75 | exit 1
76 | elif [ ! -e "${RT03}" ]; then
77 | RT03_BASE="$(dirname "${RT03}")"
78 | mkdir -p "${RT03_BASE}"
79 | {
80 | tar xzvf "${RT03_TGZ}" -C "${RT03_BASE}"
81 | } || {
82 | log "Failed to extract SWBD1"
83 | exit 1
84 | }
85 | fi
86 |
87 | log "local/rt03_data_prep.sh ${RT03}"
88 | local/rt03_data_prep.sh ${RT03}
89 |
90 | # normalize eval2000 and rt03 texts by
91 | # 1) convert upper to lower
92 | # 2) remove tags (%AH) (%HESITATION) (%UH)
93 | # 3) remove <B_ASIDE> <E_ASIDE>
94 | # 4) remove "(" or ")"
95 | for x in eval2000 rt03; do
96 | cp data/${x}/text data/${x}/text.org
97 | paste -d "" \
98 | <(cut -f 1 -d" " data/${x}/text.org) \
99 | <(awk '{$1=""; print tolower($0)}' data/${x}/text.org | perl -pe 's| \(\%.*\)||g' \
100 | | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g") | sed -e 's/\s\+/ /g' > data/${x}/text
101 | rm data/${x}/text.org
102 | done
103 |
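The effect of the normalization pipeline above can be checked in isolation on a single (hypothetical) transcript, here with the utterance ID column omitted:

    echo "YEAH (%UH) <B_ASIDE> RIGHT" \
        | awk '{print tolower($0)}' | perl -pe 's| \(\%.*\)||g' \
        | perl -pe 's| \<.*\>||g' | sed -e "s/(//g" -e "s/)//g" | sed -e 's/\s\+/ /g'
    # -> "yeah right"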
--------------------------------------------------------------------------------
/librispeech/asr1/local/download_and_untar.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -u
4 | set -o pipefail
5 |
6 | log() {
7 | local fname=${BASH_SOURCE[1]##*/}
8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 | }
10 |
11 | # Copyright 2014 Johns Hopkins University (author: Daniel Povey)
12 | # Apache 2.0
13 |
14 | remove_archive=false
15 |
16 | if [ "$1" == --remove-archive ]; then
17 | remove_archive=true
18 | shift
19 | fi
20 |
21 | if [ $# -ne 3 ]; then
22 | log "Usage: $0 [--remove-archive] "
23 | log "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
24 | log "With --remove-archive it will remove the archive after successfully un-tarring it."
25 | log " can be one of: dev-clean, test-clean, dev-other, test-other,"
26 | log " train-clean-100, train-clean-360, train-other-500."
27 | exit 1
28 | fi
29 |
30 | data=$1
31 | url=$2
32 | part=$3
33 |
34 | if [ ! -d "$data" ]; then
35 | log "$0: no such directory $data"
36 | exit 1
37 | fi
38 |
39 | part_ok=false
40 | list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
41 | for x in $list; do
42 | if [ "$part" == $x ]; then part_ok=true; fi
43 | done
44 | if ! $part_ok; then
45 | log "$0: expected to be one of $list, but got '$part'"
46 | exit 1
47 | fi
48 |
49 | if [ -z "$url" ]; then
50 | log "$0: empty URL base."
51 | exit 1
52 | fi
53 |
54 | if [ -f $data/LibriSpeech/$part/.complete ]; then
55 | log "$0: data part $part was already successfully extracted, nothing to do."
56 | exit 0
57 | fi
58 |
59 |
60 | # sizes of the archive files in bytes. These are from some older versions.
61 | sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
62 | # sizes_new is the archive file sizes of the final release. Some of these sizes are of
63 | # things we probably won't download.
64 | sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
65 |
66 | if [ -f $data/$part.tar.gz ]; then
67 | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
68 | size_ok=false
69 | for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
70 | if ! $size_ok; then
71 | log "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
72 | log "does not equal the size of one of the archives."
73 | rm $data/$part.tar.gz
74 | else
75 | log "$data/$part.tar.gz exists and appears to be complete."
76 | fi
77 | fi
78 |
79 | if [ ! -f $data/$part.tar.gz ]; then
80 | if ! which wget >/dev/null; then
81 | log "$0: wget is not installed."
82 | exit 1
83 | fi
84 | full_url=$url/$part.tar.gz
85 | log "$0: downloading data from $full_url. This may take some time, please be patient."
86 |
87 | if ! wget -P $data --no-check-certificate $full_url; then
88 | log "$0: error executing wget $full_url"
89 | exit 1
90 | fi
91 | fi
92 |
93 | if ! tar -C $data -xvzf $data/$part.tar.gz; then
94 | log "$0: error un-tarring archive $data/$part.tar.gz"
95 | exit 1
96 | fi
97 |
98 | touch $data/LibriSpeech/$part/.complete
99 |
100 | log "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
101 |
102 | if $remove_archive; then
103 | log "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
104 | rm $data/$part.tar.gz
105 | fi
106 |
--------------------------------------------------------------------------------
/wsj/asr1/local/normalize_transcript.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2011 Microsoft Corporation
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABLITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | # This takes data from the standard input that's unnormalized transcripts in the format
19 | # 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
20 | # 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
21 | # and outputs normalized transcripts.
22 | # c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
23 |
24 | @ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
25 | $noise_word = shift @ARGV;
26 |
27 | while(<STDIN>) {
28 | $_ =~ m:^(\S+) (.+): || die "bad line $_";
29 | $utt = $1;
30 | $trans = $2;
31 | print "{wsj}$utt";
32 | foreach $w (split (" ",$trans)) {
33 | $w =~ tr:A-Z:a-z:; # Lowercase everything to match the processing of other datasets.
34 | $w =~ s:\\::g; # Remove backslashes. We don't need the quoting.
35 | $w =~ s:^\%percent:percent:; # Normalization for Nov'93 test transcripts.
36 | $w =~ s:^\.point:point:; # Normalization for Nov'93 test transcripts.
37 | $w =~ s:\*(.*)\*:$1:g; # Mispronounced words are enclosed in asterisks; we don't care
38 | if ($w ne "!exclamation-point") { # ! indicates unusual emphasis; we don't care
39 | $w =~ s:!::g;
40 | }
41 | if ($w ne ":colon") { # : indicates a lengthened sound; we don't care
42 | $w =~ s:\:::g;
43 | }
44 |
45 | # Words we don't want to print
46 | if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the previous word. Delete.
47 | $w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete.
48 | $w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon.
49 | $w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
50 | $w eq "~" || # This is used to indicate truncation of an utterance. Not a word.
51 | $w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much
52 | # point including this in the transcript.
53 | next;
54 | } elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath].
55 | print " $noise_word";
56 | } elsif($w =~ m:^\<([\w\'.]+)\>$:) {
57 | # e.g. replace <and> with and. (the <> means verbal deletion of a word).. but it's pronounced.
58 | print " $1";
59 | } elsif($w eq "--dash") {
60 | print " -dash"; # This is a common issue; the CMU dictionary has it as -DASH.
61 | } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
62 | print " $1 -DASH";
63 | } else {
64 | print " $w";
65 | }
66 | }
67 | print "\n";
68 | }
69 |
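Example usage (a sketch; the noise word and file names are hypothetical):

    local/normalize_transcript.pl "<noise>" < transcripts.raw > transcripts.norm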
--------------------------------------------------------------------------------
/swbd/asr1/local/extend_segments.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; #sed replacement for -w perl parameter
3 |
4 | if (@ARGV != 1 || !($ARGV[0] =~ m/^-?\d+\.?\d*$/ && $ARGV[0] >= 0)) {
5 | print STDERR "Usage: extend_segments.pl time-in-seconds segments.extended \n" .
6 | "e.g. extend_segments.pl 0.25 segments.2\n" .
7 | "This command modifies a segments file, with lines like\n" .
8 | " \n" .
9 | "by extending the beginning and end of each segment by a certain\n" .
10 | "length of time. This script makes sure the output segments do not\n" .
11 | "overlap as a result of this time-extension, and that there are no\n" .
12 | "negative times in the output.\n";
13 | exit 1;
14 | }
15 |
16 | $extend = $ARGV[0];
17 |
18 | @all_lines = ();
19 |
20 | while (<STDIN>) {
21 | chop;
22 | @A = split(" ", $_);
23 | if (@A != 4) {
24 | die "invalid line in segments file: $_";
25 | }
26 | $line = @all_lines; # current number of lines.
27 | ($utt_id, $reco_id, $start_time, $end_time) = @A;
28 |
29 | push @all_lines, [ $utt_id, $reco_id, $start_time, $end_time ]; # anonymous array.
30 | if (! defined $lines_for_reco{$reco_id}) {
31 | $lines_for_reco{$reco_id} = [ ]; # push new anonymous array.
32 | }
33 | push @{$lines_for_reco{$reco_id}}, $line;
34 | }
35 |
36 | foreach $reco_id (keys %lines_for_reco) {
37 | $ref = $lines_for_reco{$reco_id};
38 | @line_numbers = sort { ${$all_lines[$a]}[2] <=> ${$all_lines[$b]}[2] } @$ref;
39 |
40 |
41 | {
42 | # handle start of earliest segment as a special case.
43 | $l0 = $line_numbers[0];
44 | $tstart = ${$all_lines[$l0]}[2] - $extend;
45 | if ($tstart < 0.0) { $tstart = 0.0; }
46 | ${$all_lines[$l0]}[2] = $tstart;
47 | }
48 | {
49 | # handle end of latest segment as a special case.
50 | $lN = $line_numbers[$#line_numbers];
51 | $tend = ${$all_lines[$lN]}[3] + $extend;
52 | ${$all_lines[$lN]}[3] = $tend;
53 | }
54 | for ($i = 0; $i < $#line_numbers; $i++) {
55 | $ln = $line_numbers[$i];
56 | $ln1 = $line_numbers[$i+1];
57 | $tend = ${$all_lines[$ln]}[3]; # end of earlier segment.
58 | $tstart = ${$all_lines[$ln1]}[2]; # start of later segment.
59 | if ($tend > $tstart) {
60 | $utt1 = ${$all_lines[$ln]}[0];
61 | $utt2 = ${$all_lines[$ln1]}[0];
62 | print STDERR "Warning: for utterances $utt1 and $utt2, segments " .
63 | "already overlap; leaving these times unchanged.\n";
64 | } else {
65 | $my_extend = $extend;
66 | $max_extend = 0.5 * ($tstart - $tend);
67 | if ($my_extend > $max_extend) { $my_extend = $max_extend; }
68 | $tend += $my_extend;
69 | $tstart -= $my_extend;
70 | ${$all_lines[$ln]}[3] = $tend;
71 | ${$all_lines[$ln1]}[2] = $tstart;
72 | }
73 | }
74 | }
75 |
76 | # leave the numbering of the lines unchanged.
77 | for ($l = 0; $l < @all_lines; $l++) {
78 | $ref = $all_lines[$l];
79 | ($utt_id, $reco_id, $start_time, $end_time) = @$ref;
80 | printf("%s %s %.2f %.2f\n", $utt_id, $reco_id, $start_time, $end_time);
81 | }
82 |
83 | __END__
84 |
85 | # testing below.
86 |
87 | # ( echo a1 A 0 1; echo a2 A 3 4; echo b1 B 0 1; echo b2 B 2 3 ) | local/extend_segments.pl 1.0
88 | a1 A 0.00 2.00
89 | a2 A 2.00 5.00
90 | b1 B 0.00 1.50
91 | b2 B 1.50 4.00
92 | # ( echo a1 A 0 2; echo a2 A 1 3 ) | local/extend_segments.pl 1.0
93 | Warning: for utterances a1 and a2, segments already overlap; leaving these times unchanged.
94 | a1 A 0.00 2.00
95 | a2 A 1.00 4.00
96 | # ( echo a1 A 0 2; echo a2 A 5 6; echo a3 A 3 4 ) | local/extend_segments.pl 1.0
97 | a1 A 0.00 2.50
98 | a2 A 4.50 7.00
99 | a3 A 2.50 4.50
100 |
--------------------------------------------------------------------------------
/speech_datasets/bin/dump.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright 2020 Salesforce Research (Aadyot Bhatnagar)
4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
5 |
6 | import argparse
7 | from distutils.util import strtobool
8 | import logging
9 |
10 | import kaldiio
11 | import tqdm
12 |
13 | from speech_datasets.transform import Transformation
14 | from speech_datasets.utils.io_utils import get_commandline_args, consolidate_utt_info
15 | from speech_datasets.utils.types import str_or_none, humanfriendly_or_none
16 | from speech_datasets.utils.writers import file_writer_helper
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
21 | def parse_args():
22 | parser = argparse.ArgumentParser(
23 | description="read .wav files & dump them to archive files, optionally extracting features")
24 | parser.add_argument("--feature-config", default=None, type=str_or_none,
25 | help="YAML file for feature extraction (if extracting any features)")
26 | parser.add_argument("--text-file", default=None,
27 | help="file mapping utterance ID to transcript")
28 | parser.add_argument("--utt2spk-file", default=None,
29 | help="file mapping utterance ID to speaker ID")
30 |
31 | parser.add_argument("--archive-format", type=str, default="hdf5", choices=["mat", "hdf5"],
32 | help="Specify the file format for output. \"mat\" is the matrix format in kaldi")
33 | parser.add_argument("--sample-frequency", type=humanfriendly_or_none, default=None,
34 | help="If the sampling rate is specified, resample the input.")
35 | parser.add_argument("--compress", type=strtobool, default=False, help="Save in compressed format")
36 | parser.add_argument("--compression-method", type=int, default=2,
37 | help="Specify the method(if mat) or " "gzip-level(if hdf5)")
38 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
39 | parser.add_argument("--segments", type=str,
40 | help="segments-file format: each line is either"
41 | " "
42 | "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5")
43 | parser.add_argument("rspecifier", type=str, help="WAV scp file")
44 | parser.add_argument("wspecifier", type=str, help="Write specifier")
45 |
46 | return parser.parse_args()
47 |
48 |
49 | def main():
50 | args = parse_args()
51 |
52 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
53 | if args.verbose > 0:
54 | logging.basicConfig(level=logging.INFO, format=logfmt)
55 | else:
56 | logging.basicConfig(level=logging.WARN, format=logfmt)
57 | logger.info(get_commandline_args())
58 |
59 | utt_text_speaker = consolidate_utt_info(
60 | scp=None, text=args.text_file, utt2spk=args.utt2spk_file)
61 |
62 | with kaldiio.ReadHelper(
63 | args.rspecifier, segments=args.segments
64 | ) as reader, file_writer_helper(
65 | args.wspecifier,
66 | filetype=args.archive_format,
67 | compress=args.compress,
68 | compression_method=args.compression_method,
69 | sample_frequency=args.sample_frequency,
70 | transform=Transformation(args.feature_config)
71 | ) as writer:
72 | for utt_id, (rate, wave) in tqdm.tqdm(reader, miniters=100, maxinterval=30):
73 | utt_dict = {"x": wave, "rate": rate}
74 | utt_dict.update(utt_text_speaker.get(utt_id, {}))
75 | try:
76 | writer[utt_id] = utt_dict
77 | except Exception as e:
78 | logger.warning(
79 | f"Failed to process utterance {utt_id} with exception:\n{str(e)}")
80 | continue
81 |
82 |
83 | if __name__ == "__main__":
84 | main()
85 |
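Example invocation (a sketch with hypothetical paths, mirroring the rspecifier/wspecifier convention used elsewhere in this repo):

    python3 -m speech_datasets.bin.dump \
        --feature-config conf/fbank.yaml --archive-format hdf5 \
        --text-file data/train/text --utt2spk-file data/train/utt2spk \
        --segments data/train/segments \
        scp:data/train/wav.scp \
        ark,scp:dump/fbank/train/feats.h5,dump/fbank/train/feats.scp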
--------------------------------------------------------------------------------
/utils/sym2int.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABLITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | $ignore_oov = 0;
19 |
20 | for($x = 0; $x < 2; $x++) {
21 | if ($ARGV[0] eq "--map-oov") {
22 | shift @ARGV;
23 | $map_oov = shift @ARGV;
24 | if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") {
25 | # disallow '-f', the empty string and anything ending in words.txt as the
26 | # OOV symbol because these are likely command-line errors.
27 | die "the --map-oov option requires an argument";
28 | }
29 | }
30 | if ($ARGV[0] eq "-f") {
31 | shift @ARGV;
32 | $field_spec = shift @ARGV;
33 | if ($field_spec =~ m/^\d+$/) {
34 | $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
35 | }
36 | if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
37 | if ($1 ne "") {
38 | $field_begin = $1 - 1; # Change to zero-based indexing.
39 | }
40 | if ($2 ne "") {
41 | $field_end = $2 - 1; # Change to zero-based indexing.
42 | }
43 | }
44 | if (!defined $field_begin && !defined $field_end) {
45 | die "Bad argument to -f option: $field_spec";
46 | }
47 | }
48 | }
49 |
50 | $symtab = shift @ARGV;
51 | if (!defined $symtab) {
52 | print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
53 | "options: [--map-oov ] [-f ]\n" .
54 | "note: can look like 4-5, or 4-, or 5-, or 1.\n";
55 | }
56 | open(F, "<$symtab") || die "Error opening symbol table file $symtab";
57 | while(<F>) {
58 | @A = split(" ", $_);
59 | @A == 2 || die "bad line in symbol table file: $_";
60 | $sym2int{$A[0]} = $A[1] + 0;
61 | }
62 |
63 | if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up
64 | if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; }
65 | $map_oov = $sym2int{$map_oov};
66 | }
67 |
68 | $num_warning = 0;
69 | $max_warning = 20;
70 |
71 | while (<>) {
72 | @A = split(" ", $_);
73 | @B = ();
74 | for ($n = 0; $n < @A; $n++) {
75 | $a = $A[$n];
76 | if ( (!defined $field_begin || $n >= $field_begin)
77 | && (!defined $field_end || $n <= $field_end)) {
78 | $i = $sym2int{$a};
79 | if (!defined ($i)) {
80 | if (defined $map_oov) {
81 | if ($num_warning++ < $max_warning) {
82 | print STDERR "sym2int.pl: replacing $a with $map_oov\n";
83 | if ($num_warning == $max_warning) {
84 | print STDERR "sym2int.pl: not warning for OOVs any more times\n";
85 | }
86 | }
87 | $i = $map_oov;
88 | } else {
89 | $pos = $n+1;
90 | die "sym2int.pl: undefined symbol $a (in position $pos)\n";
91 | }
92 | }
93 | $a = $i;
94 | }
95 | push @B, $a;
96 | }
97 | print join(" ", @B);
98 | print "\n";
99 | }
100 | if ($num_warning > 0) {
101 | print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n";
102 | }
103 |
104 | exit(0);
105 |
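Example usage (a sketch; words.txt is a hypothetical symbol table that contains <unk>, and -f 2- skips the utterance-id field):

    utils/sym2int.pl --map-oov "<unk>" -f 2- words.txt < data/train/text > text.int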
--------------------------------------------------------------------------------
/COMBINE/asr1/local/combine_datasets.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import shutil
4 |
5 | from speech_datasets.utils import get_root
6 | from speech_datasets.utils.io_utils import get_combo_idx
7 | from speech_datasets.utils.types import str2bool
8 |
9 |
10 | def main():
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument("--task", type=str, choices=["asr", "tts"])
13 | parser.add_argument("--write_dir", type=str2bool, default=True)
14 | parser.add_argument("datasets", nargs="+", type=str)
15 | args = parser.parse_args()
16 |
17 | # Ensure that all datasets are specified as <dataset>/<split>
18 | datasets = sorted(set(args.datasets))
19 | dataset_splits = [d.split("/", maxsplit=1) for d in datasets]
20 | assert all(len(d) == 2 for d in dataset_splits), \
21 | f"All datasets must be specified as /, but got " \
22 | f"{datasets} instead"
23 |
24 | # Verify that all datasets have been prepared
25 | dataset_dirs = [os.path.join(get_root(), ds[0], f"{args.task}1", "data", ds[1])
26 | for ds in dataset_splits]
27 | assert all(os.path.isdir(d) for d in dataset_dirs), \
28 | f"Please make sure that all dataset splits are valid, and that all " \
29 | f"datasets you wish to combine have already been prepared by stage 1 " \
30 | f"of {args.task}.sh"
31 |
32 | # Get the index of this dataset combination (add to the registry if needed)
33 | idx = get_combo_idx(datasets, args.task)
34 | data_dir = os.path.join(get_root(), "COMBINE", f"{args.task}1", "data")
35 | if idx < 0:
36 | os.makedirs(data_dir, exist_ok=True)
37 | with open(os.path.join(data_dir, "registry.txt"), "a") as f:
38 | f.write(" ".join(datasets) + "\n")
39 | idx = get_combo_idx(datasets, args.task)
40 |
41 | if not args.write_dir:
42 | return idx
43 |
44 | # Create a directory for this dataset combo & prepare it
45 | dirname = os.path.join(data_dir, str(idx))
46 | os.makedirs(dirname, exist_ok=True)
47 | write_segments = any(os.path.isfile(os.path.join(d, "segments"))
48 | for d in dataset_dirs)
49 | with open(os.path.join(dirname, "wav.scp"), "wb") as wav, \
50 | open(os.path.join(dirname, "text"), "wb") as text, \
51 | open(os.path.join(dirname, "utt2spk"), "wb") as utt2spk, \
52 | open(os.path.join(dirname, "segments"), "w") as segments:
53 | for d in dataset_dirs:
54 |
55 | # wav.scp, text, and utt2spk can just be concatenated on
56 | with open(os.path.join(d, "wav.scp"), "rb") as src_wav:
57 | shutil.copyfileobj(src_wav, wav)
58 | with open(os.path.join(d, "text"), "rb") as src_text:
59 | shutil.copyfileobj(src_text, text)
60 | with open(os.path.join(d, "utt2spk"), "rb") as src_utt2spk:
61 | shutil.copyfileobj(src_utt2spk, utt2spk)
62 |
63 | if write_segments:
64 | # If a segments file exists, we can just concatenate it on
65 | if os.path.isfile(os.path.join(d, "segments")):
66 | with open(os.path.join(d, "segments"), "r") as src_segments:
67 | shutil.copyfileobj(src_segments, segments)
68 |
69 | # Otherwise, we need to use wav.scp to create a dummy segments
70 | # line format is <utt_id> <recording_id> <start_time> <end_time>
71 | # <start_time> = 0, <end_time> = -1 means use the whole recording
72 | else:
73 | with open(os.path.join(d, "wav.scp"), "r") as src_wav:
74 | for line in src_wav:
75 | utt_id, _ = line.rstrip().split(None, maxsplit=1)
76 | segments.write(f"{utt_id} {utt_id} 0.0 -1.0\n")
77 |
78 | return idx
79 |
80 |
81 | if __name__ == "__main__":
82 | combo_idx = main()
83 | print(combo_idx)
84 |
--------------------------------------------------------------------------------
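combine_datasets.py relies on speech_datasets.utils.io_utils.get_combo_idx, which is not part of this listing. Based on how the script uses it (a return value of -1 means "unregistered", and the index names the combo's data directory), a minimal sketch of the lookup might look like the following; the function name, signature, and 1-based indexing are assumptions:

    import os

    def get_combo_idx_sketch(datasets, task, root):
        """Return the registry index of a sorted dataset combination,
        or -1 if it has not been registered yet (hypothetical)."""
        registry = os.path.join(root, "COMBINE", f"{task}1", "data", "registry.txt")
        if not os.path.isfile(registry):
            return -1
        with open(registry) as f:
            for idx, line in enumerate(f, start=1):
                if line.split() == sorted(datasets):
                    return idx
        return -1
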
/COMBINE/tts1/local/combine_datasets.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import shutil
4 |
5 | from speech_datasets.utils import get_root
6 | from speech_datasets.utils.io_utils import get_combo_idx
7 | from speech_datasets.utils.types import str2bool
8 |
9 |
10 | def main():
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument("--task", type=str, choices=["asr", "tts"])
13 | parser.add_argument("--write_dir", type=str2bool, default=True)
14 | parser.add_argument("datasets", nargs="+", type=str)
15 | args = parser.parse_args()
16 |
17 | # Ensure that all datasets are specified as <dataset>/<split>
18 | datasets = sorted(set(args.datasets))
19 | dataset_splits = [d.split("/", maxsplit=1) for d in datasets]
20 | assert all(len(d) == 2 for d in dataset_splits), \
21 | f"All datasets must be specified as /, but got " \
22 | f"{datasets} instead"
23 |
24 | # Verify that all datasets have been prepared
25 | dataset_dirs = [os.path.join(get_root(), ds[0], f"{args.task}1", "data", ds[1])
26 | for ds in dataset_splits]
27 | assert all(os.path.isdir(d) for d in dataset_dirs), \
28 | f"Please make sure that all dataset splits are valid, and that all " \
29 | f"datasets you wish to combine have already been prepared by stage 1 " \
30 | f"of {args.task}.sh"
31 |
32 | # Get the index of this dataset combination (add to the registry if needed)
33 | idx = get_combo_idx(datasets, args.task)
34 | data_dir = os.path.join(get_root(), "COMBINE", f"{args.task}1", "data")
35 | if idx < 0:
36 | os.makedirs(data_dir, exist_ok=True)
37 | with open(os.path.join(data_dir, "registry.txt"), "a") as f:
38 | f.write(" ".join(datasets) + "\n")
39 | idx = get_combo_idx(datasets, args.task)
40 |
41 | if not args.write_dir:
42 | return idx
43 |
44 | # Create a directory for this dataset combo & prepare it
45 | dirname = os.path.join(data_dir, str(idx))
46 | os.makedirs(dirname, exist_ok=True)
47 | write_segments = any(os.path.isfile(os.path.join(d, "segments"))
48 | for d in dataset_dirs)
49 | with open(os.path.join(dirname, "wav.scp"), "wb") as wav, \
50 | open(os.path.join(dirname, "text"), "wb") as text, \
51 | open(os.path.join(dirname, "utt2spk"), "wb") as utt2spk, \
52 | open(os.path.join(dirname, "segments"), "w") as segments:
53 | for d in dataset_dirs:
54 |
55 | # wav.scp, text, and utt2spk can just be concatenated on
56 | with open(os.path.join(d, "wav.scp"), "rb") as src_wav:
57 | shutil.copyfileobj(src_wav, wav)
58 | with open(os.path.join(d, "text"), "rb") as src_text:
59 | shutil.copyfileobj(src_text, text)
60 | with open(os.path.join(d, "utt2spk"), "rb") as src_utt2spk:
61 | shutil.copyfileobj(src_utt2spk, utt2spk)
62 |
63 | if write_segments:
64 | # If a segments file exists, we can just concatenate it on
65 | if os.path.isfile(os.path.join(d, "segments")):
66 | with open(os.path.join(d, "segments"), "r") as src_segments:
67 | shutil.copyfileobj(src_segments, segments)
68 |
69 | # Otherwise, we need to use wav.scp to create a dummy segments
70 | # line format is <utt_id> <recording_id> <start_time> <end_time>
71 | # <start_time> = 0, <end_time> = -1 means use the whole recording
72 | else:
73 | with open(os.path.join(d, "wav.scp"), "r") as src_wav:
74 | for line in src_wav:
75 | utt_id, _ = line.rstrip().split(None, maxsplit=1)
76 | segments.write(f"{utt_id} {utt_id} 0.0 -1.0\n")
77 |
78 | return idx
79 |
80 |
81 | if __name__ == "__main__":
82 | combo_idx = main()
83 | print(combo_idx)
84 |
--------------------------------------------------------------------------------
/speech_datasets/bin/apply_cmvn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | from distutils.util import strtobool
4 | import logging
5 |
6 | from speech_datasets.transform import Transformation
7 | from speech_datasets.utils.readers import file_reader_helper
8 | from speech_datasets.utils.io_utils import get_commandline_args
9 | from speech_datasets.utils.writers import file_writer_helper
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | def parse_args():
15 | parser = argparse.ArgumentParser(
16 | description="apply mean-variance normalization to files",
17 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
18 | )
19 |
20 | parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
21 | parser.add_argument("--in-filetype", type=str, default="hdf5", choices=["mat", "hdf5"],
22 | help="Specify the file format for the rspecifier. "
23 | '"mat" is the matrix format in kaldi')
24 | parser.add_argument("--out-filetype", type=str, default="hdf5", choices=["mat", "hdf5"],
25 | help="Specify the file format for the wspecifier. "
26 | '"mat" is the matrix format in kaldi')
27 |
28 | parser.add_argument("--norm-means", type=strtobool, default=True,
29 | help="Do mean normalization or not.")
30 | parser.add_argument("--norm-vars", type=strtobool, default=False,
31 | help="Do variance normalization or not.")
32 | parser.add_argument("--reverse", type=strtobool, default=False,
33 | help="Do reverse mode or not")
34 | parser.add_argument("--utt2spk", type=str, default=None,
35 | help="A text file of utterance to speaker map.")
36 | parser.add_argument("--compress", type=strtobool, default=False,
37 | help="Save in compressed format")
38 | parser.add_argument("--compression-method", type=int, default=2,
39 | help="Specify the method (if mat) or gzip-level (if hdf5)")
40 | parser.add_argument("--cmvn-type", type=str, choices=["global", "speaker", "utterance"],
41 | help="Type of CMVN to apply (global, per-speaker, or per-utterance)")
42 | parser.add_argument("stats_file", help="File containing CMVN stats.")
43 | parser.add_argument("rspecifier", type=str, help="Read specifier id, e.g. ark:some.ark")
44 | parser.add_argument("wspecifier", type=str, help="Write specifier id, e.g. ark:some.ark")
45 |
46 | args = parser.parse_args()
47 | if args.cmvn_type == "speaker" and args.utt2spk is None:
48 | # argparse.ArgumentError expects an argparse Action, so report via parser.error()
49 | parser.error("If --cmvn-type is 'speaker', --utt2spk must be provided.")
50 |
51 | return args
52 |
53 |
54 | def main():
55 | args = parse_args()
56 |
57 | # logging info
58 | logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
59 | if args.verbose > 0:
60 | logging.basicConfig(level=logging.INFO, format=logfmt)
61 | else:
62 | logging.basicConfig(level=logging.WARN, format=logfmt)
63 | logger.info(get_commandline_args())
64 |
65 | cmvn = Transformation([{"type": "cmvn",
66 | "stats": args.stats_file,
67 | "cmvn_type": args.cmvn_type,
68 | "norm_means": args.norm_means,
69 | "norm_vars": args.norm_vars,
70 | "utt2spk": args.utt2spk,
71 | "reverse": args.reverse}])
72 |
73 | with file_writer_helper(
74 | args.wspecifier,
75 | filetype=args.out_filetype,
76 | compress=args.compress,
77 | compression_method=args.compression_method,
78 | ) as writer:
79 | for utt, data in file_reader_helper(args.rspecifier, args.in_filetype,
80 | transform=cmvn, return_dict=True):
81 | writer[utt] = data
82 |
83 |
84 | if __name__ == "__main__":
85 | main()
86 |
--------------------------------------------------------------------------------
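A hypothetical invocation of apply_cmvn.py, assuming Kaldi-style read/write specifiers and per-speaker statistics (all paths are placeholders):

    python -m speech_datasets.bin.apply_cmvn \
        --cmvn-type speaker --utt2spk data/train/utt2spk \
        --norm-means true --norm-vars true \
        --in-filetype mat --out-filetype mat \
        data/train/cmvn.ark ark:data/train/feats.ark ark:data/train/feats_cmvn.ark
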
/swbd/asr1/local/swbd1_prepare_dict.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -u
4 | set -o pipefail
5 |
6 | log() {
7 | local fname=${BASH_SOURCE[1]##*/}
8 | echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 | }
10 |
11 | # Formatting the Mississippi State dictionary for use in Edinburgh. Differs
12 | # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
13 |
14 | # To be run from one directory above this script.
15 |
16 | . ./path.sh
17 |
18 | #check existing directories
19 |
20 | if [ $# != 1 ]; then
21 | log "Error: invalid command line arguments"
22 | log "Usage: $0 /path/to/SWBD"
23 | exit 1;
24 | fi
25 | SWBD_DIR=$1
26 |
27 | # Get the original transcriptions & their corresponding dictionary
28 | srcdir=data/local/swbd1
29 | mkdir -p $srcdir
30 | if [ ! -d $srcdir/swb_ms98_transcriptions ]; then
31 | ln -sf "${SWBD_DIR}/swb_ms98_transcriptions" $srcdir/
32 | fi
33 | srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
34 |
35 | # assume some basic data prep was already done on the downloaded data.
36 | [ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1;
37 |
38 | # copy over the initial dictionary as the base lexicon
39 | dir=data/local/dict_nosp
40 | mkdir -p $dir
41 | install -m +rw $srcdict $dir/lexicon0.txt || exit 1;
42 | log "$(patch 0' | sort > $dir/lexicon1.txt || exit 1;
47 |
48 | cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
49 | grep -v sil > $dir/nonsilence_phones.txt || exit 1;
50 |
51 | ( echo sil; echo nsn; ) > $dir/silence_phones.txt
52 |
53 | echo sil > $dir/optional_silence.txt
54 |
55 | # No "extra questions" in the input to this setup, as we don't
56 | # have stress or tone.
57 | echo -n > $dir/extra_questions.txt
58 |
59 | cp local/MSU_single_letter.txt $dir/
60 | # Add to the lexicon the silences, noises etc.
61 | # Add single letter lexicon
62 | # The original swbd lexicon does not have a precise single-letter lexicon,
63 | # e.g. it does not have an entry for W
64 | ( echo '!sil sil'; echo '<noise> nsn'; echo '<unk> spn' ) \
65 | | cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1;
66 |
67 | # Map the words in the lexicon. That is-- for each word in the lexicon, we map it
68 | # to a new written form. The transformations we do are:
69 | # remove laughter markings, e.g.
70 | # [LAUGHTER-STORY] -> STORY
71 | # Remove partial-words, e.g.
72 | # -[40]1K W AH N K EY
73 | # becomes -1K
74 | # and
75 | # -[AN]Y IY
76 | # becomes
77 | # -Y
78 | # -[A]B[OUT]- B
79 | # becomes
80 | # -B-
81 | # Also, curly braces, which appear to be used for "nonstandard"
82 | # words or non-words, are removed, e.g.
83 | # {WOLMANIZED} W OW L M AX N AY Z D
84 | # -> WOLMANIZED
85 | # Also, mispronounced words, e.g.
86 | # [YEAM/YEAH] Y AE M
87 | # are changed to just e.g. YEAM, i.e. the orthography
88 | # of the mispronounced version.
89 | # Note-- this is only really to be used in training. The main practical
90 | # reason is to avoid having tons of disambiguation symbols, which
91 | # we otherwise would get because there are many partial words with
92 | # the same phone sequences (most problematic: S).
93 | # Also, map
94 | # THEM_1 EH M -> THEM
95 | # so that multiple pronunciations just have alternate entries
96 | # in the lexicon.
97 | local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
98 | > $dir/lexicon3.txt || exit 1;
99 |
100 | python local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \
101 | -L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map
102 | cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map
103 |
104 | ( echo 'i ay' )| cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt
105 |
106 | pushd $dir >&/dev/null
107 | ln -sf lexicon5.txt lexicon.txt # This is the final lexicon.
108 | popd >&/dev/null
109 | log "Prepared input dictionary and phone-sets for Switchboard phase 1."
110 |
--------------------------------------------------------------------------------
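To make the mapping rules documented in swbd1_prepare_dict.sh concrete, here is how the entries called out in its comments change between lexicon2.txt and lexicon3.txt (the STORY pronunciation is illustrative; the rest are taken from the comments above):

    [LAUGHTER-STORY] S T AO R IY       ->  STORY S T AO R IY
    -[40]1K W AH N K EY                ->  -1K W AH N K EY
    -[AN]Y IY                          ->  -Y IY
    {WOLMANIZED} W OW L M AX N AY Z D  ->  WOLMANIZED W OW L M AX N AY Z D
    [YEAM/YEAH] Y AE M                 ->  YEAM Y AE M
    THEM_1 EH M                        ->  THEM EH M
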
/utils/parse_options.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
4 | # Arnab Ghoshal, Karel Vesely
5 |
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15 | # MERCHANTABLITY OR NON-INFRINGEMENT.
16 | # See the Apache 2 License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 |
20 | # Parse command-line options.
21 | # To be sourced by another script (as in ". parse_options.sh").
22 | # Option format is: --option-name arg
23 | # and shell variable "option_name" gets set to value "arg."
24 | # The exception is --help, which takes no arguments, but prints the
25 | # $help_message variable (if defined).
26 |
27 |
28 | ###
29 | ### The --config file options have lower priority to command line
30 | ### options, so we need to import them first...
31 | ###
32 |
33 | # Now import all the configs specified by command-line, in left-to-right order
34 | for ((argpos=1; argpos<$#; argpos++)); do
35 | if [ "${!argpos}" == "--config" ]; then
36 | argpos_plus1=$((argpos+1))
37 | config=${!argpos_plus1}
38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
39 | . $config # source the config file.
40 | fi
41 | done
42 |
43 |
44 | ###
45 | ### Now we process the command line options
46 | ###
47 | while true; do
48 | [ -z "${1:-}" ] && break; # break if there are no arguments
49 | case "$1" in
50 | # If the enclosing script is called with --help option, print the help
51 | # message and exit. Scripts should put help messages in $help_message
52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
53 | else printf "$help_message\n" 1>&2 ; fi;
54 | exit 0 ;;
55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
56 | exit 1 ;;
57 | # If the first command-line argument begins with "--" (e.g. --foo-bar),
58 | # then work out the variable name as $name, which will equal "foo_bar".
59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
61 | # Next we test whether the variable in question is undefined-- if so it's
61 | # an invalid option and we die. Note: $0 evaluates to the name of the
62 | # enclosing script.
63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
64 | # is undefined. We then have to wrap this test inside "eval" because
65 | # foo_bar is itself inside a variable ($name).
66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
67 |
68 | oldval="`eval echo \\$$name`";
69 | # Work out whether we seem to be expecting a Boolean argument.
70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
71 | was_bool=true;
72 | else
73 | was_bool=false;
74 | fi
75 |
76 | # Set the variable to the right value-- the escaped quotes make it work if
77 | # the option had spaces, like --cmd "queue.pl -sync y"
78 | if [ $# -lt 2 ]; then
79 | echo "$0: no argument provided for option $1" 1>&2
80 | exit 1;
81 | else
82 | eval $name=\"$2\";
83 | fi
84 |
85 | # Check that Boolean-valued arguments are really Boolean.
86 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
87 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
88 | exit 1;
89 | fi
90 | shift 2;
91 | ;;
92 | *) break;
93 | esac
94 | done
95 |
96 |
97 | # Check for an empty argument to the --cmd option, which can easily occur as a
98 | # result of scripting errors.
99 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
100 |
101 |
102 | true; # so this script returns exit code 0.
103 |
--------------------------------------------------------------------------------
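A minimal sketch of a caller of parse_options.sh, illustrating the contract in its header comments: every --option must map to a shell variable that already has a default before the script is sourced (the script name, options, and defaults here are hypothetical):

    #!/usr/bin/env bash
    # Defaults; parse_options.sh rejects any --option whose variable is unset.
    stage=1
    cmd="run.pl"
    help_message="Usage: $0 [--stage <n>] [--cmd <cmd>] <data-dir>"

    . utils/parse_options.sh

    echo "stage=${stage} cmd=${cmd} remaining args: $*"

Invoked as ./example.sh --stage 2 --cmd "queue.pl -sync y" data/train, this prints stage=2 and cmd=queue.pl -sync y, with data/train left in the positional arguments.
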
/speech_datasets/utils/types.py:
--------------------------------------------------------------------------------
1 | from distutils.util import strtobool
2 | from typing import Optional, Tuple, Union
3 |
4 | import humanfriendly
5 | import numpy as np
6 | from typeguard import check_argument_types
7 |
8 |
9 | class CMVNStats(object):
10 | def __init__(self, count, sum, sum_squares):
11 | self.count = count
12 | self.sum = sum
13 | self.sum_squares = sum_squares
14 |
15 | def __iadd__(self, other):
16 | self.count += other.count
17 | self.sum += other.sum
18 | self.sum_squares += other.sum_squares
19 | return self
20 |
21 | @classmethod
22 | def from_numpy(cls, stats):
23 | stats = np.copy(stats)
24 | assert len(stats) == 2, stats.shape
25 | # If the stats have >2 dims, stats[0, -1] is an array; its first entry is the count
26 | count = stats[0, -1].flatten()[0]
27 | return cls(count=count, sum=stats[0, :-1], sum_squares=stats[1, :-1])
28 |
29 | def to_numpy(self):
30 | shape = (2, self.sum.shape[0] + 1, *self.sum.shape[1:])
31 | arr = np.empty(shape, dtype=np.float64)
32 | arr[0, :-1] = self.sum
33 | arr[1, :-1] = self.sum_squares
34 | arr[0, -1] = self.count
35 | arr[1, -1] = 0.0
36 | return arr
37 |
38 |
39 | def str2bool(value: str) -> bool:
40 | return bool(strtobool(value))
41 |
42 |
43 | def int_or_none(value: str) -> Optional[int]:
44 | """int_or_none.
45 |
46 | Examples:
47 | >>> import argparse
48 | >>> parser = argparse.ArgumentParser()
49 | >>> _ = parser.add_argument('--foo', type=int_or_none)
50 | >>> parser.parse_args(['--foo', '456'])
51 | Namespace(foo=456)
52 | >>> parser.parse_args(['--foo', 'none'])
53 | Namespace(foo=None)
54 | >>> parser.parse_args(['--foo', 'null'])
55 | Namespace(foo=None)
56 | >>> parser.parse_args(['--foo', 'nil'])
57 | Namespace(foo=None)
58 |
59 | """
60 | if value.strip().lower() in ("none", "null", "nil"):
61 | return None
62 | return int(value)
63 |
64 |
65 | def float_or_none(value: str) -> Optional[float]:
66 | """float_or_none.
67 |
68 | Examples:
69 | >>> import argparse
70 | >>> parser = argparse.ArgumentParser()
71 | >>> _ = parser.add_argument('--foo', type=float_or_none)
72 | >>> parser.parse_args(['--foo', '4.5'])
73 | Namespace(foo=4.5)
74 | >>> parser.parse_args(['--foo', 'none'])
75 | Namespace(foo=None)
76 | >>> parser.parse_args(['--foo', 'null'])
77 | Namespace(foo=None)
78 | >>> parser.parse_args(['--foo', 'nil'])
79 | Namespace(foo=None)
80 |
81 | """
82 | if value.strip().lower() in ("none", "null", "nil"):
83 | return None
84 | return float(value)
85 |
86 |
87 | def humanfriendly_or_none(value: str) -> Optional[float]:
88 | if value.strip().lower() in ("none", "null", "nil"):
89 | return None
90 | return humanfriendly.parse_size(value)
91 |
92 |
93 | def str2int_tuple(integers: str) -> Optional[Tuple[int, ...]]:
94 | """
95 |
96 | >>> str2int_tuple('3,4,5')
97 | (3, 4, 5)
98 |
99 | """
100 | assert check_argument_types()
101 | if integers.strip() in ("none", "None", "NONE", "null", "Null", "NULL"):
102 | return None
103 | return tuple(map(int, integers.strip().split(",")))
104 |
105 |
106 | def str_or_int(value: str) -> Union[str, int]:
107 | try:
108 | return int(value)
109 | except ValueError:
110 | return value
111 |
112 |
113 | def str_or_none(value: str) -> Optional[str]:
114 | """str_or_none.
115 |
116 | Examples:
117 | >>> import argparse
118 | >>> parser = argparse.ArgumentParser()
119 | >>> _ = parser.add_argument('--foo', type=str_or_none)
120 | >>> parser.parse_args(['--foo', 'aaa'])
121 | Namespace(foo='aaa')
122 | >>> parser.parse_args(['--foo', 'none'])
123 | Namespace(foo=None)
124 | >>> parser.parse_args(['--foo', 'null'])
125 | Namespace(foo=None)
126 | >>> parser.parse_args(['--foo', 'nil'])
127 | Namespace(foo=None)
128 |
129 | """
130 | if value.strip().lower() in ("none", "null", "nil"):
131 | return None
132 | return value
133 |
--------------------------------------------------------------------------------
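A small round-trip sketch of the Kaldi-style layout that CMVNStats.to_numpy and CMVNStats.from_numpy implement (row 0 holds the per-dimension feature sums with the frame count in the last slot; row 1 holds the sums of squares):

    import numpy as np
    from speech_datasets.utils.types import CMVNStats

    feats = np.random.randn(100, 80)            # (frames, feat_dim)
    stats = CMVNStats(count=feats.shape[0],
                      sum=feats.sum(axis=0),
                      sum_squares=(feats ** 2).sum(axis=0))

    arr = stats.to_numpy()                      # shape (2, feat_dim + 1) = (2, 81)
    assert arr.shape == (2, 81) and arr[0, -1] == 100

    back = CMVNStats.from_numpy(arr)            # recovers the same statistics
    assert np.allclose(back.sum, stats.sum)
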
/speech_datasets/transform/cmvn.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 |
5 | import numpy as np
6 |
7 | from speech_datasets.transform.interface import TransformInterface
8 | from speech_datasets.utils import get_root
9 | from speech_datasets.utils.readers import read_cmvn_stats
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | class CMVN(TransformInterface):
15 | def __init__(self, cmvn_type: str, stats: str = None, norm_means=True,
16 | norm_vars=False, utt2spk: str = None, reverse=False,
17 | std_floor=1.0e-20):
18 | self.norm_means = norm_means
19 | self.norm_vars = norm_vars
20 | self.reverse = reverse
21 | self.std_floor = std_floor
22 |
23 | assert cmvn_type in ["global", "speaker", "utterance"], cmvn_type
24 | self.accept_uttid = (cmvn_type != "global")
25 | self.cmvn_type = cmvn_type
26 | if cmvn_type != "utterance":
27 | assert stats is not None, "stats required if cmvn_type != 'utterance'"
28 | try:
29 | self.stats_file = stats
30 | stats_dict = read_cmvn_stats(self.stats_file, cmvn_type)
31 | except FileNotFoundError:
32 | self.stats_file = os.path.join(get_root(), stats)
33 | stats_dict = read_cmvn_stats(self.stats_file, cmvn_type)
34 | else:
35 | if stats is not None:
36 | logger.warning("stats file is not used if cmvn_type is 'utterance'")
37 | self.stats_file = None
38 | stats_dict = {}
39 |
40 | if cmvn_type == "speaker":
41 | assert utt2spk is not None, "utt2spk required if cmvn_type is 'speaker'"
42 | self.utt2spk = {}
43 | with io.open(utt2spk, "r", encoding="utf-8") as f:
44 | for line in f:
45 | utt, spk = line.rstrip().split(None, maxsplit=1)
46 | self.utt2spk[utt] = spk
47 | else:
48 | if utt2spk is not None:
49 | logger.warning("utt2spk is only used if cmvn_type is 'speaker'")
50 | self.utt2spk = None
51 |
52 | # Kaldi makes a matrix for CMVN which has a shape of (2, feat_dim + 1),
53 | # and the first vector contains the sum of feats and the second is
54 | # the sum of squares. The last value of the first vector, i.e. stats[0, -1],
55 | # is the number of samples accumulated for these statistics.
56 | self.bias = {}
57 | self.scale = {}
58 | for spk, stats in stats_dict.items():
59 | # Var[x] = E[x^2] - E[x]^2
60 | mean = stats.sum / stats.count
61 | var = stats.sum_squares / stats.count - mean * mean
62 | std = np.maximum(np.sqrt(var), std_floor)
63 | self.bias[spk] = -mean
64 | self.scale[spk] = 1 / std
65 |
66 | def __repr__(self):
67 | return (
68 | "{name}(stats_file={stats_file}, "
69 | "norm_means={norm_means}, norm_vars={norm_vars}, "
70 | "reverse={reverse})".format(
71 | name=self.__class__.__name__,
72 | stats_file=self.stats_file,
73 | norm_means=self.norm_means,
74 | norm_vars=self.norm_vars,
75 | reverse=self.reverse,
76 | )
77 | )
78 |
79 | def __call__(self, x, uttid=None):
80 | if self.cmvn_type == "global":
81 | bias = self.bias[None]
82 | scale = self.scale[None]
83 | elif self.cmvn_type == "speaker":
84 | spk = self.utt2spk[uttid]
85 | bias = self.bias[spk]
86 | scale = self.scale[spk]
87 | else: # self.cmvn_type == "utterance"
88 | mean = x.mean(axis=0)
89 | mse = (x ** 2).sum(axis=0) / x.shape[0]
90 | bias = -mean
91 | scale = 1 / np.maximum(np.sqrt(mse - mean ** 2), self.std_floor)
92 |
93 | if not self.reverse:
94 | if self.norm_means:
95 | x = np.add(x, bias)
96 | if self.norm_vars:
97 | x = np.multiply(x, scale)
98 |
99 | else:
100 | if self.norm_vars:
101 | x = np.divide(x, scale)
102 | if self.norm_means:
103 | x = np.subtract(x, bias)
104 |
105 | return x
106 |
--------------------------------------------------------------------------------
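A self-contained sketch of the arithmetic in CMVN.__init__ and __call__, confirming that reverse mode inverts the forward transform:

    import numpy as np

    x = 5.0 + 3.0 * np.random.randn(50, 4)          # fake (frames, feat_dim) features
    count = x.shape[0]
    feat_sum, feat_sq = x.sum(axis=0), (x ** 2).sum(axis=0)

    mean = feat_sum / count
    var = feat_sq / count - mean ** 2               # Var[x] = E[x^2] - E[x]^2
    bias = -mean
    scale = 1.0 / np.maximum(np.sqrt(var), 1.0e-20) # std_floor guards against /0

    y = (x + bias) * scale                          # forward: norm_means + norm_vars
    x_back = y / scale - bias                       # reverse mode undoes it
    assert np.allclose(x, x_back)
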
/TEMPLATE/asr1/cmd.sh:
--------------------------------------------------------------------------------
1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3 | # e.g.
4 | # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5 | #
6 | # Options:
7 | # --time