├── .gitignore
├── LICENSE
├── README.md
├── SAFEBOX
│   └── README.md
├── env_vars.sh
├── misc
│   ├── debug
│   │   └── debug.list
│   └── metadata_versions.txt
├── toolkits
│   ├── athena
│   │   ├── extract_meta.py
│   │   └── prepare_data.py
│   ├── kaldi
│   │   ├── extract_meta.py
│   │   ├── gigaspeech_data_prep.sh
│   │   └── utt2spk_to_spk2utt.pl
│   └── wenet
│       ├── extract_meta.py
│       └── gigaspeech_data_prep.sh
└── utils
    ├── check_audio_md5.sh
    ├── check_metadata_md5.sh
    ├── download_gigaspeech.sh
    ├── extract_metadata_version.sh
    ├── extract_subset_segments.py
    ├── gigaspeech_scoring.py
    ├── install_jq.sh
    ├── internal
    │   ├── download_gigaspeech_from_oss.sh
    │   └── download_gigaspeech_with_pyspeechcolab.sh
    ├── ls_audios.sh
    ├── ls_md5.sh
    ├── opus_to_wav.py
    ├── parse_options.sh
    └── show_segment_info.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Editors
132 | *.swp
133 |
134 | # GigaSpeech ignore list
135 | SAFEBOX/aliyun_ossutil.cfg
136 | SAFEBOX/password
137 | */dict/*
138 | */g2p/*
139 | *.json
140 | cmudict*
141 | ossutil64
142 | ossutilmac64
143 | ossutil_output/
144 | tmp/
145 | .DS_Store
146 | *.aes
147 | *.tgz
148 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GigaSpeech
2 | This is the official repository of the GigaSpeech dataset. For details of how we created the dataset, please refer to our Interspeech paper: *"GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio"*. [Preprint available on arxiv](https://arxiv.org/abs/2106.06909).
3 |
4 | GigaSpeech version: 1.0.0 (07/05/2021)
5 |
6 | ## Download
7 | 1. Step 1: Please fill out the Google Form [here](https://forms.gle/UuGQAPyscGRrUMLq6)
8 | 2. Step 2:
9 | - Option A: Follow the instructions in the reply email from SpeechColab to get the raw release of GigaSpeech
10 | - Option B: Refer to [GigaSpeech On HuggingFace](https://github.com/SpeechColab/GigaSpeech/issues/117) to get a pre-processed version of GigaSpeech via HuggingFace.
11 |
12 | ## Leaderboard
13 |
14 | | **Contributor**| **Toolkit** | **Train Recipe** | **Train Data** | **Inference** |**Dev/Test WER** |
15 | |:---------------|:------------------|:------------------|:------------------|:------------------|:------------------:|
16 | |||||
17 | | Baseline | [Athena](https://github.com/athena-team/athena) | [Transformer-AED + RNNLM](https://github.com/athena-team/athena/tree/master/examples/asr/gigaspeech) | GigaSpeech v1.0.0 XL | [model](https://drive.google.com/drive/folders/1HUUKzfnqqVfQR3epUVnnOWw9EEFpulVM) [example](https://github.com/athena-team/athena/blob/e704884ec6a3a947769d892aa267578038e49ecb/examples/asr/gigaspeech/run.sh#L85) | 13.60 / 12.70 |
18 | | Baseline | [Espnet](https://github.com/espnet/espnet) | [Conformer/Transformer-AED](https://github.com/espnet/espnet/tree/master/egs2/gigaspeech/asr1) | GigaSpeech v1.0.0 XL | [model](https://zenodo.org/record/4630406) [example](https://github.com/espnet/espnet_model_zoo#asr) | 10.90 / 10.80 |
19 | | Baseline | [Kaldi](https://github.com/kaldi-asr/kaldi) | [Chain + RNNLM](https://github.com/kaldi-asr/kaldi/tree/master/egs/gigaspeech/s5/) | GigaSpeech v1.0.0 XL | model example | 14.78 / 14.84 |
20 | | Baseline | [Pika](https://github.com/tencent-ailab/pika) | [RNN-T](https://github.com/tencent-ailab/pika/tree/) | GigaSpeech v1.0.0 XL | model example | 12.30 / 12.30 |
21 | |||||
22 | | Johns Hopkins University | [Icefall](https://github.com/k2-fsa/icefall) | [Transducer: Zipformer encoder + Embedding decoder](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/zipformer) | GigaSpeech v1.0.0 XL | [model](https://huggingface.co/yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17) [example](https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md#zipformer-zipformer--pruned-stateless-transducer) | 10.25 / 10.38 |
23 | | Johns Hopkins University | [Icefall](https://github.com/k2-fsa/icefall) | [Pruned Stateless RNN-T](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR) | GigaSpeech v1.0.0 XL | [model](https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2) [example](https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md#gigaspeech-bpe-training-results-pruned-transducer-2) | 10.40 / 10.51 |
24 | | Johns Hopkins University | [Icefall](https://github.com/k2-fsa/icefall) | [Conformer CTC + ngram & attention rescoring](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR) | GigaSpeech v1.0.0 XL | [model](https://huggingface.co/wgb14/icefall-asr-gigaspeech-conformer-ctc) [example](https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md#gigaspeech-bpe-training-results-conformer-ctc) | 10.47 / 10.58 |
25 | | Mobvoi | [Wenet](https://github.com/wenet-e2e/wenet) | [Joint CTC/AED(U2++)](https://github.com/wenet-e2e/wenet/tree/main/examples/gigaspeech/s0) | GigaSpeech v1.0.0 XL | [model](http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/gigaspeech/20210811_conformer_bidecoder_exp.tar.gz) [example](https://github.com/wenet-e2e/wenet/blob/main/runtime/server/x86/README.md) | 10.70 / 10.60 |
26 | | ByteDance AI Lab | [NeurST](https://github.com/bytedance/neurst) | [Transformer-AED](https://github.com/bytedance/neurst/tree/master/examples/speech_transformer/gigaspeech) | GigaSpeech v1.0.0 XL | [model](https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/neurst/speech_to_text/gigaspeech/ckpt.tgz) [example](https://github.com/bytedance/neurst/tree/master/examples/speech_transformer/gigaspeech#models) | 11.89 / 11.60 |
27 |
28 |
29 | ## Dataset
30 |
31 | ### Audio Source
32 | * Language: English
33 | * 33,000+ hours for unsupervised/semi-supervised learning
34 | * 10,000 hours with high-quality human transcriptions for supervised learning
35 |
36 | | Audio Source | Transcribed Hours | Total Hours | Acoustic Condition |
37 | |:---------------|:-----------------:|:--------------:|:-------------------|
38 | | Audiobook | 2,655 | 11,982 | Reading<br>Various ages and accents |
39 | | Podcast | 3,498 | 9,254 | Clean or background music<br>Indoor<br>Near-field<br>Spontaneous<br>Various ages and accents |
40 | | YouTube | 3,845 | 11,768 | Clean and noisy<br>Indoor and outdoor<br>Near- and far-field<br>Reading and spontaneous<br>Various ages and accents |
41 | | ***total*** | ***10,000*** | ***33,005*** ||
42 |
43 |
44 | ### Transcribed Training Subsets
45 | | Subset | Hours | Remarks |
46 | |:---------------:|:-------------:|:-------------|
47 | | XS | 10 | System building and debugging |
48 | | S | 250 | Quick research experiments |
49 | | M | 1,000 | Large-scale research experiments |
50 | | L | 2,500 | Medium-scale industrial experiments |
51 | | XL | 10,000 | Large-scale industrial experiments |
52 |
53 | Larger subsets are supersets of smaller subsets, e.g., subset `L` contains all the data from subset `M`.
54 |
55 |
56 | ### Transcribed Evaluation Subsets
57 | | Subset | Hours | Remarks |
58 | |:------:|:-----:|:--------|
59 | | Dev | 12 | Randomly selected from the crawled Podcast and YouTube Data |
60 | | Test | 40 | Part of the subset was randomly selected from the crawled Podcast and YouTube data; part of it was manually collected through other channels to have better coverage. |
61 |
62 | Evaluation subsets are annotated by ***professional human annotators***.
63 |
64 |
65 | ## Data Preparation Guidelines
66 | We maintain data preparation scripts for different speech recognition toolkits
67 | in this repository so that when we update the dataset (note, this is an evolving
68 | dataset), we don't have to update the scripts in the downstream toolkits. The
69 | data preparation scripts are maintained in the `toolkits/` folder, with one
70 | subdirectory per toolkit, e.g., `toolkits/kaldi` for the Kaldi speech
71 | recognition toolkit.
72 |
73 | ### Preparation Scripts
74 | To use the data preparation scripts, do the following in your toolkit (here we
75 | use Kaldi as an example)
76 | ```bash
77 | git clone https://github.com/SpeechColab/GigaSpeech.git
78 |
79 | cd GigaSpeech
80 | utils/download_gigaspeech.sh /disk1/audio_data/gigaspeech
81 | toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data
82 | cd ..
83 | ```
84 |
85 | ### Metadata walkthrough
86 |
87 | We save all the metadata information to a single JSON file named
88 | GigaSpeech.json. Below is a snippet of this file:
89 |
90 | ```json
91 | {
92 | "dataset": "GigaSpeech",
93 | "language": "EN",
94 | "version": "v1.0.0",
95 | ... ...
96 | "audios": [
97 | {
98 | "title": "The Architect of Hollywood",
99 | "url": "https://99percentinvisible.org/episode/the-architect-of-hollywood/download",
100 | "path": "audio/podcast/P0001/POD0000000025.opus",
101 | ... ...
102 | "segments": [
103 | {
104 | "sid": "POD0000000025_S0000103",
105 | "speaker": "N/A",
106 | "begin_time": 780.31,
107 | "end_time": 783.13,
108 | "text_tn": "FOUR O'CLOCK TOMORROW AFTERNOON SAID WILLIAMS ",
109 | "subsets": [
110 | "{XL}",
111 | "{L}"
112 | ]
113 | },
114 | ... ...
115 | ],
116 | ... ...
117 | },
118 | ... ...
119 | ]
120 | }
121 | ```
122 | To use the corpus, users are expected to extract the relevant information from GigaSpeech.json. For example, for the speech recognition task, one should first follow the "audios" entry and work out a list of audio files. One can then follow the "url" entry to download the original audio file, or the "path" entry if the preprocessed audio files have already been downloaded to disk. After that, for each audio file, one can follow the "segments" entry and work out the trainable audio segments, as well as their corresponding transcripts. There are also various supplementary entries, such as "subsets" and "md5", which may be helpful for your task.
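A minimal sketch of that workflow (the download path and the choice of the `XS` subset are illustrative assumptions; only fields shown in the snippet above are used):

```python
import json
import os

# Assumed download location; adjust to wherever GigaSpeech.json and audio/ live.
gigaspeech_dir = "/disk1/audio_data/gigaspeech"

with open(os.path.join(gigaspeech_dir, "GigaSpeech.json")) as f:
    meta = json.load(f)

# Follow "audios" -> "segments" and collect trainable segments with transcripts.
train_segments = []
for audio in meta["audios"]:
    audio_path = os.path.join(gigaspeech_dir, audio["path"])
    for seg in audio["segments"]:
        if "{XS}" not in seg["subsets"]:  # keep only the XS training subset here
            continue
        train_segments.append(
            (seg["sid"], audio_path, seg["begin_time"], seg["end_time"], seg["text_tn"])
        )

print(f"Collected {len(train_segments)} XS segments")
```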
123 |
124 | The metadata file GigaSpeech.json is version controlled, and is expected to be updated over time. In future releases, we plan to add speaker information to the metadata file, so that it will be suitable for speaker identification/verification tasks. We also plan to add more data from different sources to increase diversity.
125 |
126 | We also provide some convenient command-line tools based on [jq](https://stedolan.github.io/jq/), e.g., [utils/ls_audios.sh](utils/ls_audios.sh), [utils/show_segment_info.sh](utils/show_segment_info.sh), [utils/ls_md5.sh](utils/ls_md5.sh).
127 |
128 |
129 | ### Audio Processing
130 | * `Resampling`: GigaSpeech audio files are resampled to a 16 kHz sampling rate and compressed with the Opus format. The Opus compression, however, does not depend on the input sample rate; it uses the bandwidth instead. Timestamps are measured in 48 kHz units even if the full bandwidth is not used. Likewise, the output sample rate may be freely chosen. For example, audio can be input at 16 kHz yet be set to encode only narrowband audio. For this reason, we recommend that users explicitly resample the decoded audio to a 16 kHz sampling rate before training & testing. For opus-to-wav conversion, refer to our example tool [utils/opus_to_wav.py](utils/opus_to_wav.py), or the sketch below.
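A minimal conversion sketch, assuming `ffmpeg` is on your PATH (the same command line is used by `toolkits/athena/prepare_data.py`); the file paths in the usage comment are hypothetical:

```python
import subprocess

def opus_to_wav_16k(opus_path: str, wav_path: str) -> None:
    """Decode an Opus file and explicitly resample it to 16 kHz mono WAV."""
    subprocess.run(
        ["ffmpeg", "-y", "-i", opus_path, "-ac", "1", "-ar", "16000", wav_path],
        check=True,
    )

# opus_to_wav_16k("audio/podcast/P0001/POD0000000025.opus", "POD0000000025.wav")
```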
131 |
132 | ### Text Pre-Processing
133 | * `Punctuations`: We keep 4 punctuation tags in the normalized text (see the `text_tn` entry in GigaSpeech.json):
134 | ```
135 | <COMMA>
136 | <PERIOD>
137 | <QUESTIONMARK>
138 | <EXCLAMATIONPOINT>
139 | ```
140 | This allows researchers to explore directions such as end-to-end endpointing and punctuation restoration. If you don't need these, you can remove them for your own training.
141 |
142 | * `Garbage Utterance Tags`: The Dev/Test evaluation sets are annotated by human annotators. They are instructed to label the entire audio file without "gaps", so *garbage utterance tags* are used for non-speech segments. We recommend discarding these utterances in your training; a minimal filtering sketch is given after the list below. The *complete list* of these tags is:
143 | ```
144 | <SIL>
145 | <NOISE>
146 | <MUSIC>
147 | <OTHER>
148 | ```
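A minimal pre-processing sketch that applies both recommendations above; the tag regexes mirror the regex-based filtering in `toolkits/athena/prepare_data.py`:

```python
import re
from typing import Optional

GARBAGE_TAGS = r"<SIL>|<NOISE>|<MUSIC>|<OTHER>"
PUNCTUATION_TAGS = r"<COMMA>|<PERIOD>|<QUESTIONMARK>|<EXCLAMATIONPOINT>"

def clean_transcript(text_tn: str) -> Optional[str]:
    """Return a training-ready transcript, or None if the segment should be discarded."""
    if re.search(GARBAGE_TAGS, text_tn):          # non-speech segment
        return None
    text = re.sub(PUNCTUATION_TAGS, "", text_tn)  # drop punctuation tags if unneeded
    return re.sub(r"\s+", " ", text).strip()      # squeeze the leftover spaces

# clean_transcript("FOUR O'CLOCK TOMORROW AFTERNOON SAID WILLIAMS <PERIOD>")
# -> "FOUR O'CLOCK TOMORROW AFTERNOON SAID WILLIAMS"
```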
149 |
150 | ### Text Post-Processing (before scoring)
151 | * `Conversational Fillers`: Spontaneous/Conversational speech contains conversational fillers such as:
152 | ```
153 | 'UH', 'UHH', 'UM', 'EH', 'MM', 'HM', 'AH', 'HUH', 'HA', 'ER'
154 | ```
155 | We recommend removing these fillers from both the hypothesis and the reference text before WER scoring, so that we have apples-to-apples performance comparisons across different toolkits. See the discussion on post-processing [here](https://github.com/SpeechColab/GigaSpeech/issues/24). We also provide a scoring tool, [utils/gigaspeech_scoring.py](utils/gigaspeech_scoring.py), which is used by all the toolkits reported in the leaderboard section above.
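For illustration only (the reference tool is [utils/gigaspeech_scoring.py](utils/gigaspeech_scoring.py)), a minimal filler-removal sketch using the list above:

```python
CONVERSATIONAL_FILLERS = {
    "UH", "UHH", "UM", "EH", "MM", "HM", "AH", "HUH", "HA", "ER",
}

def remove_fillers(text: str) -> str:
    """Drop conversational fillers from a hypothesis or reference line before WER scoring."""
    return " ".join(w for w in text.split() if w.upper() not in CONVERSATIONAL_FILLERS)

# remove_fillers("UM I THINK UH THAT IS FINE")  ->  "I THINK THAT IS FINE"
```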
156 |
157 | ### Add Support for a New Toolkit
158 | To add data preparation support for a new toolkit, please follow
159 | `toolkits/kaldi/gigaspeech_data_prep.sh` and add similar scripts for your own
160 | toolkit. For example, for ESPnet2, you would add
161 | `toolkits/espnet2/gigaspeech_data_prep.sh` to prepare the dataset, and all
162 | other related scripts should be maintained under `toolkits/espnet2`.
163 |
164 | ## Collaboration
165 | We are a group of volunteers trying to make speech technologies easier to use. We welcome any kind of contributions. Currently we are exploring the following directions. If you are interested in one of the directions, and you think you will be able to help, please contact gigaspeech@speechcolab.org.
166 |
167 | * Inference architecture for different pre-trained models
168 | * Adding diverse audio sources
169 | * Benchmarking speech algorithms/services
170 | * Building and releasing pre-trained models
171 | * Supporting more languages
172 | * Supporting more tasks through GigaSpeech.json (e.g., speaker ID)
173 | * Making new datasets with permissive licenses
174 |
175 | ## Institutional Contributors
176 | | Institution | Contribution |
177 | |:------|:-----|
178 | | [IEIT, Tsinghua University](http://www.tsinghua-ieit.com/) | Computing power; Data host; Researchers |
179 | | [Magic Data](https://www.magicdatatech.com/) | Data host mirror|
180 | | [speechocean](http://en.speechocean.com/) | Data host mirror; Evaluation data annotation |
181 | | [Xiaomi Corporation](https://www.mi.com/global/) | Computing power; Researchers |
182 |
183 | ## Citation
184 | Please cite our paper if you find this work useful:
185 |
186 | ```bibtex
187 | @inproceedings{GigaSpeech2021,
188 | title={GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio},
189 | booktitle={Proc. Interspeech 2021},
190 | year=2021,
191 | author={Guoguo Chen and Shuzhou Chai and Guanbo Wang and Jiayu Du and Wei-Qiang Zhang and Chao Weng and Dan Su and Daniel Povey and Jan Trmal and Junbo Zhang and Mingjie Jin and Sanjeev Khudanpur and Shinji Watanabe and Shuaijiang Zhao and Wei Zou and Xiangang Li and Xuchen Yao and Yongqing Wang and Yujun Wang and Zhao You and Zhiyong Yan}
192 | }
193 | ```
194 |
195 | ## Contact
196 | If you have any concerns, please contact gigaspeech@speechcolab.org.
197 |
198 | ## Metadata Changelog
199 | * **07/23/2021 v1.0.0**: We found a bug in the metadata and fixed it. We made an exception and kept the version number unchanged because this **corrected** version was the one used in the original experiments in the paper.
200 | * **07/05/2021 v1.0.0**: Initial release.
201 |
--------------------------------------------------------------------------------
/SAFEBOX/README.md:
--------------------------------------------------------------------------------
1 | This folder is used to hold private credentials, e.g.:
2 | * aliyun_ossutil.cfg
3 | * password for decompression
--------------------------------------------------------------------------------
/env_vars.sh:
--------------------------------------------------------------------------------
1 | # Download URL.
2 | # Distribution channel 1: Aliyun Object Storage Service, for invited paper
3 | # collaborators. Script utils/download_gigaspeech.sh host option "oss".
4 | GIGASPEECH_RELEASE_URL_OSS='oss://speechcolab/GigaSpeech/release/GigaSpeech'
5 |
6 | # Distribution Channel 2: Tsinghua Host. Script utils/download_gigaspeech.sh
7 | # host option "tsinghua".
8 | GIGASPEECH_RELEASE_URL_TSINGHUA='http://aidata.tsinghua-ieit.com/GigaSpeech'
9 |
10 | # Distribution Channel 3: Haitian Host. Script utils/download_gigaspeech.sh
11 | # host option "speechocean".
12 | GIGASPEECH_RELEASE_URL_SPEECHOCEAN='ftp://124.207.81.184/GigaSpeech'
13 |
14 | # Distribution Channel 4: MagicData Host
15 | GIGASPEECH_RELEASE_URL_MAGICDATA='https://freedata.oss-cn-beijing.aliyuncs.com/magichub/GigaSpeech'
16 |
17 | # Distribution Channel 5: From IPFS
18 |
19 | export PATH=$PWD:$PATH
20 |
--------------------------------------------------------------------------------
/misc/debug/debug.list:
--------------------------------------------------------------------------------
1 | # last update: 2021.07.24, Jiayu
2 | #
3 | # A dummy host for download debugging
4 | # independent from official distribution host
5 | # total size: ~= 55M
6 | #
7 | # wget base url:
8 | # https://swaphub.oss-cn-hangzhou.aliyuncs.com/GigaSpeechDownloadDebug
9 | #
10 | # `tree GigaSpeechDownloadDebug` gives:
11 | # GigaSpeechDownloadDebug
12 | # ├── TERMS_OF_ACCESS
13 | # ├── audio
14 | # │ └── youtube
15 | # │ └── P0111.tgz.aes
16 | # └── dict.tgz.aes
17 | 683e162ebebabc6c3cd5fb42e72a9868 audio/youtube/P0111.tgz.aes
18 | e88de4ea902cd94e2551a85355e031ca dict.tgz.aes
19 |
--------------------------------------------------------------------------------
/misc/metadata_versions.txt:
--------------------------------------------------------------------------------
1 | v0.2.5 6f89242dfa728cdbe3fdc935eb947460
2 | v1.0.0 19c777dc296ff3eb714bc677a80620a3
3 |
--------------------------------------------------------------------------------
/toolkits/athena/extract_meta.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding=utf-8
3 | # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
4 | # Athena Authors (Shuaijiang Zhao)
5 |
6 | import sys
7 | import os
8 | import argparse
9 | import json
10 |
11 |
12 | def get_args():
13 | parser = argparse.ArgumentParser(description="""
14 | This script is used to process raw json dataset of GigaSpeech,
15 | where the long wav is split into segments and
16 | data in Athena format is generated.
17 | """)
18 | parser.add_argument('--pipe-format', action='store_true', default=False,
19 | help="""If true, wav.scp is generated with pipeline format""")
20 | parser.add_argument('input_json', help="""Input json file of Gigaspeech""")
21 | parser.add_argument('output_dir', help="""Output dir for prepared data""")
22 |
23 | args = parser.parse_args()
24 | return args
25 |
26 |
27 | def meta_analysis(input_json, output_dir, pipe):
28 | input_dir = os.path.dirname(input_json)
29 |
30 | if not os.path.exists(output_dir):
31 | os.makedirs(output_dir)
32 |
33 | try:
34 | with open(input_json, 'r') as injson:
35 | json_data = json.load(injson)
36 | except:
37 | sys.exit(f'Failed to load input json file: {input_json}')
38 | else:
39 | if json_data['audios'] is not None:
40 | with open(f'{output_dir}/utt2spk', 'w') as utt2spk, \
41 | open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
42 | open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
43 | open(f'{output_dir}/text', 'w') as utt2text, \
44 | open(f'{output_dir}/segments', 'w') as segments, \
45 | open(f'{output_dir}/wav.scp', 'w') as wavscp, \
46 | open(f'{output_dir}/reco2dur', 'w') as reco2dur:
47 | for long_audio in json_data['audios']:
48 | try:
49 | long_audio_path = os.path.realpath(os.path.join(input_dir, long_audio['path']))
50 | aid = long_audio['aid']
51 | segments_lists = long_audio['segments']
52 | duration = long_audio['duration']
53 | assert(os.path.exists(long_audio_path))
54 | assert('opus' == long_audio['format'])
55 | assert(16000 == long_audio['sample_rate'])
56 | except AssertionError:
57 | print(f'Warning: {aid} something is wrong, maybe AssertionError, skipped')
58 | continue
59 | except:
60 | print(f'Warning: {aid} something is wrong, maybe the error path: {long_audio_path}, skipped')
61 | continue
62 | else:
63 | if pipe is True:
64 | wavscp.write(f'{aid}\tffmpeg -i {long_audio_path} -ar 16000 -f wav pipe:1 |\n')
65 | else:
66 | wavscp.write(f'{aid}\t{long_audio_path}\n')
67 | reco2dur.write(f'{aid}\t{duration}\n')
68 | for segment_file in segments_lists:
69 | try:
70 | sid = segment_file['sid']
71 | start_time = segment_file['begin_time']
72 | end_time = segment_file['end_time']
73 | dur = end_time - start_time
74 | text = segment_file['text_tn']
75 | segment_subsets = segment_file["subsets"]
76 | except:
77 | print(f'Warning: {segment_file} something is wrong, skipped')
78 | continue
79 | else:
80 | utt2spk.write(f'{sid}\t{sid}\n')
81 | utt2dur.write(f'{sid}\t{dur}\n')
82 | utt2text.write(f'{sid}\t{text}\n')
83 | segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
84 | segment_sub_names = " " .join(segment_subsets)
85 | utt2subsets.write(f'{sid}\t{segment_sub_names}\n')
86 |
87 |
88 | def main():
89 | args = get_args()
90 |
91 | meta_analysis(args.input_json, args.output_dir, args.pipe_format)
92 |
93 |
94 | if __name__ == '__main__':
95 | main()
96 |
--------------------------------------------------------------------------------
/toolkits/athena/prepare_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding=utf-8
3 | # Copyright (C) 2021 ATHENA AUTHORS; Shuaijiang Zhao; Xiaoning Lei
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # ==============================================================================
17 | # reference https://github.com/SpeechColab/GigaSpeech/tree/main/utils
18 |
19 | import os
20 | import re
21 | import sys
22 | import json
23 | from absl import logging
24 |
25 | SUBSETS = ["XL", "DEV", "TEST"]
26 | garbage_utterance_tags = "<SIL>|<NOISE>|<MUSIC>|<OTHER>"
27 | punctuation_tags = "<COMMA>|<PERIOD>|<QUESTIONMARK>|<EXCLAMATIONPOINT>"
28 |
29 |
30 | def extract_json(json_file='', output_dir=''):
31 | input_dir = os.path.dirname(json_file)
32 | try:
33 | with open(json_file, 'r') as JSONFILE:
34 | json_data = json.load(JSONFILE)
35 | except:
36 | sys.exit(f'Failed to load input json file: {json_file}')
37 | else:
38 | if json_data['audios'] is not None:
39 | with open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
40 | open(f'{output_dir}/text', 'w') as utt2text, \
41 | open(f'{output_dir}/segments', 'w') as segments, \
42 | open(f'{output_dir}/opus.scp', 'w') as wavscp:
43 | for long_audio in json_data['audios']:
44 | try:
45 | long_audio_path = os.path.realpath(
46 | os.path.join(input_dir, long_audio['path']))
47 | aid = long_audio['aid']
48 | segments_lists = long_audio['segments']
49 | assert (os.path.exists(long_audio_path))
50 | assert ('opus' == long_audio['format'])
51 | assert (16000 == long_audio['sample_rate'])
52 | except AssertionError:
53 | print(f'Warning: {aid} something is wrong, maybe AssertionError, skipped')
54 | continue
55 | except:
56 | print(f'Warning: {aid} something is wrong, maybe the error path: '
57 | f'{long_audio_path}, skipped')
58 | continue
59 | else:
60 | wavscp.write(f'{aid}\t{long_audio_path}\n')
61 | for segment_file in segments_lists:
62 | try:
63 | sid = segment_file['sid']
64 | start_time = segment_file['begin_time']
65 | end_time = segment_file['end_time']
66 | text = segment_file['text_tn']
67 | segment_subsets = segment_file["subsets"]
68 | except:
69 | print(f'Warning: {segment_file} something is wrong, skipped')
70 | continue
71 | else:
72 | utt2text.write(f'{sid}\t{text}\n')
73 | segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
74 | segment_sub_names = " ".join(segment_subsets)
75 | utt2subsets.write(f'{sid}\t{segment_sub_names}\n')
76 |
77 |
78 | def convert_opus2wav(opus_scp='', wav_scp='', rm_opus=False):
79 | with open(opus_scp, 'r') as oscp, open(wav_scp, 'w') as wscp:
80 | for line in oscp:
81 | line = line.strip()
82 | utt, opus_path = re.split(r'\s+', line)
83 | wav_path = opus_path.replace('.opus', '.wav')
84 | cmd = f'ffmpeg -y -i {opus_path} -ac 1 -ar 16000 {wav_path}'
85 | try:
86 | os.system(cmd)
87 | wscp.write(f'{utt}\t{wav_path}\n')
88 | except:
89 | sys.exit(f'Failed to run the cmd: {cmd}')
90 |
91 | if rm_opus is True:
92 | os.remove(opus_path)
93 |
94 | def prepare_data(data_dir='', subset='XL'):
95 | subset_file = os.path.join(data_dir, 'utt2subsets')
96 | text_file = os.path.join(data_dir, 'text')
97 | segment_file = os.path.join(data_dir, 'segments')
98 | wav_scp = os.path.join(data_dir, 'wav.scp')
99 | out_f = os.path.join(data_dir, subset + '.csv')
100 |
101 | subset_dict = {}
102 | with open(subset_file) as SUBSET:
103 | subset_lines = SUBSET.readlines()
104 | for line in subset_lines:
105 | line_list = line.strip().split()
106 | utt_key = line_list[0]
107 | subset_dict[utt_key] = line_list[1:]
108 |
109 | with open(text_file) as TEXT:
110 | text_lines = TEXT.readlines()
111 |
112 | time_d = {}
113 | with open(segment_file) as SEGMENT:
114 | seg_lines = SEGMENT.readlines()
115 | for i in seg_lines:
116 | item = i.strip().split('\t')
117 | utt_key = item[0]
118 | start_time = item[2]
119 | end_time = item[3]
120 | time_d[utt_key] = str(int((float(end_time) - float(start_time)) * 1000))
121 |
122 | text_d = {}
123 | for i in text_lines:
124 | utt_key = i.split('\t')[0]
125 | speaker, k1 = utt_key.split('_')
126 | if speaker not in text_d:
127 | text_d[speaker] = []
128 | transcriptions = i.split(utt_key)[1].strip()
129 | if utt_key in time_d:
130 | if re.search(garbage_utterance_tags, transcriptions):
131 | continue
132 | if '{' + subset + '}' not in subset_dict[utt_key]:
133 | continue
134 | # remove the punctuation tags
135 | transcriptions = re.sub(punctuation_tags, "", transcriptions)
136 | # convert two spaces to one space
137 | transcriptions = re.sub("  ", " ", transcriptions)
138 | text_d[speaker].append(utt_key + '\t' + time_d[utt_key] +
139 | '\t' + transcriptions + '\t' + speaker)
140 |
141 | with open(wav_scp) as f:
142 | lines = f.readlines()
143 | utt_key_wav = {}
144 | for i in lines:
145 | utt_key = i.split('\t')[0]
146 | if utt_key not in utt_key_wav:
147 | utt_key_wav[utt_key] = 0
148 |
149 | with open(out_f, 'w') as f:
150 | f.write('wav_filename\twav_len\ttranscript\tspeaker\n')
151 | for speaker in text_d:
152 | if speaker in utt_key_wav:
153 | for utt_sample in text_d[speaker]:
154 | f.write(utt_sample + '\n')
155 |
156 |
157 | if __name__ == '__main__':
158 | logging.set_verbosity(logging.INFO)
159 | if len(sys.argv) < 3:
160 | print('Usage: python {} dataset_dir output_dir\n'
161 | ' dataset_dir : directory contains GigaSpeech dataset\n'
162 | ' output_dir : GigaSpeech data working directory'.format(sys.argv[0]))
163 | exit(1)
164 | DATASET_DIR = sys.argv[1]
165 | OUTPUT_DIR = sys.argv[2]
166 | json_file = os.path.join(DATASET_DIR, "GigaSpeech.json")
167 | extract_json(json_file=json_file, output_dir=OUTPUT_DIR)
168 |
169 | print(f'Converting opus to wave, please be patient')
170 | opus_scp = os.path.join(OUTPUT_DIR, 'opus.scp')
171 | wav_scp = os.path.join(OUTPUT_DIR, 'wav.scp')
172 | convert_opus2wav(opus_scp, wav_scp, False)
173 |
174 | for subset in SUBSETS:
175 | prepare_data(data_dir=OUTPUT_DIR, subset=subset)
176 |
--------------------------------------------------------------------------------
/toolkits/kaldi/extract_meta.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
2 |
3 | import sys
4 | import os
5 | import argparse
6 | import json
7 |
8 |
9 | def get_args():
10 | parser = argparse.ArgumentParser(description="""
11 | This script is used to process raw json dataset of GigaSpeech,
12 | where the long wav is split into segments and
13 | data in Kaldi format is generated.
14 | """)
15 | parser.add_argument('--pipe-format', action='store_true', default=False,
16 | help="""If true, wav.scp is generated with pipeline format""")
17 | parser.add_argument('input_json', help="""Input json file of Gigaspeech""")
18 | parser.add_argument('output_dir', help="""Output dir for prepared data""")
19 |
20 | args = parser.parse_args()
21 | return args
22 |
23 |
24 | def meta_analysis(input_json, output_dir, pipe):
25 | input_dir = os.path.dirname(input_json)
26 |
27 | if not os.path.exists(output_dir):
28 | os.makedirs(output_dir)
29 |
30 | try:
31 | with open(input_json, 'r') as injson:
32 | json_data = json.load(injson)
33 | except:
34 | sys.exit(f'Failed to load input json file: {input_json}')
35 | else:
36 | if json_data['audios'] is not None:
37 | with open(f'{output_dir}/utt2spk', 'w') as utt2spk, \
38 | open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
39 | open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
40 | open(f'{output_dir}/text', 'w') as utt2text, \
41 | open(f'{output_dir}/segments', 'w') as segments, \
42 | open(f'{output_dir}/wav.scp', 'w') as wavscp, \
43 | open(f'{output_dir}/reco2dur', 'w') as reco2dur:
44 | for long_audio in json_data['audios']:
45 | try:
46 | long_audio_path = os.path.realpath(os.path.join(input_dir, long_audio['path']))
47 | aid = long_audio['aid']
48 | segments_lists = long_audio['segments']
49 | duration = long_audio['duration']
50 | assert(os.path.exists(long_audio_path))
51 | assert('opus' == long_audio['format'])
52 | assert(16000 == long_audio['sample_rate'])
53 | except AssertionError:
54 | print(f'Warning: {aid} something is wrong, maybe AssertionError, skipped')
55 | continue
56 | except:
57 | print(f'Warning: {aid} something is wrong, maybe the error path: {long_audio_path}, skipped')
58 | continue
59 | else:
60 | if pipe is True:
61 | wavscp.write(f'{aid}\tffmpeg -i {long_audio_path} -ar 16000 -f wav pipe:1 |\n')
62 | else:
63 | wavscp.write(f'{aid}\t{long_audio_path}\n')
64 | reco2dur.write(f'{aid}\t{duration}\n')
65 | for segment_file in segments_lists:
66 | try:
67 | sid = segment_file['sid']
68 | start_time = segment_file['begin_time']
69 | end_time = segment_file['end_time']
70 | dur = end_time - start_time
71 | text = segment_file['text_tn']
72 | segment_subsets = segment_file["subsets"]
73 | except:
74 | print(f'Warning: {segment_file} something is wrong, skipped')
75 | continue
76 | else:
77 | utt2spk.write(f'{sid}\t{sid}\n')
78 | utt2dur.write(f'{sid}\t{dur}\n')
79 | utt2text.write(f'{sid}\t{text}\n')
80 | segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
81 | segment_sub_names = " " .join(segment_subsets)
82 | utt2subsets.write(f'{sid}\t{segment_sub_names}\n')
83 |
84 |
85 | def main():
86 | args = get_args()
87 |
88 | meta_analysis(args.input_json, args.output_dir, args.pipe_format)
89 |
90 |
91 | if __name__ == '__main__':
92 | main()
93 |
--------------------------------------------------------------------------------
/toolkits/kaldi/gigaspeech_data_prep.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 |
5 |
6 | set -e
7 | set -o pipefail
8 |
9 | stage=1
10 | prefix=gigaspeech
11 | garbage_utterance_tags="<SIL> <NOISE> <MUSIC> <OTHER>"
12 | punctuation_tags="<COMMA> <PERIOD> <QUESTIONMARK> <EXCLAMATIONPOINT>"
13 | train_subset=XL
14 |
15 | . ./utils/parse_options.sh || exit 1;
16 |
17 | filter_by_id () {
18 | idlist=$1
19 | input=$2
20 | output=$3
21 | field=1
22 | if [ $# -eq 4 ]; then
23 | field=$4
24 | fi
25 | cat $input | perl -se '
26 | open(F, "<$idlist") || die "Could not open id-list file $idlist";
27 | while(<F>) {
28 | @A = split;
29 | @A>=1 || die "Invalid id-list file line $_";
30 | $seen{$A[0]} = 1;
31 | }
32 | while(<>) {
33 | @A = split;
34 | @A > 0 || die "Invalid file line $_";
35 | @A >= $field || die "Invalid file line $_";
36 | if ($seen{$A[$field-1]}) {
37 | print $_;
38 | }
39 | }' -- -idlist="$idlist" -field="$field" > $output ||\
40 | (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
41 | }
42 |
43 | subset_data_dir () {
44 | utt_list=$1
45 | src_dir=$2
46 | dest_dir=$3
47 | mkdir -p $dest_dir || exit 1;
48 | # wav.scp utt2spk text segments utt2dur reco2dur spk2utt
49 | filter_by_id $utt_list $src_dir/utt2spk $dest_dir/utt2spk ||\
50 | (echo "$0: subset_data_dir() error: $src_dir/utt2spk" && exit 1) || exit 1;
51 | filter_by_id $utt_list $src_dir/spk2utt $dest_dir/spk2utt 2 ||\
52 | (echo "$0: subset_data_dir() error: $src_dir/spk2utt" && exit 1) || exit 1;
53 | filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
54 | (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
55 | filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
56 | (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
57 | filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
58 | (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
59 | awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
60 | filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
61 | (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
62 | filter_by_id $dest_dir/reco $src_dir/reco2dur $dest_dir/reco2dur ||\
63 | (echo "$0: subset_data_dir() error: $src_dir/reco2dur" && exit 1) || exit 1;
64 | rm -f $dest_dir/reco
65 | }
66 |
67 | if [ $# -ne 2 ]; then
68 | echo "Usage: $0 [options] "
69 | echo " e.g.: $0 --train-subset XL /disk1/audio_data/gigaspeech/ data/"
70 | echo ""
71 | echo "This script takes the GigaSpeech source directory, and prepares the"
72 | echo "Kaldi format data directory."
73 | echo " --garbage-utterance-tags # Tags for non-speech."
74 | echo " --prefix # Prefix for output data directory."
75 | echo " --punctuation-tags # Tags for punctuations."
76 | echo " --stage # Processing stage."
77 | echo " --train-subset # Train subset to be created."
78 | exit 1
79 | fi
80 |
81 | gigaspeech_dir=$1
82 | data_dir=$2
83 |
84 | declare -A subsets
85 | subsets=(
86 | [XL]="train_xl"
87 | [L]="train_l"
88 | [M]="train_m"
89 | [S]="train_s"
90 | [XS]="train_xs"
91 | [DEV]="dev"
92 | [TEST]="test")
93 | prefix=${prefix:+${prefix}_}
94 |
95 | corpus_dir=$data_dir/${prefix}corpus/
96 | if [ $stage -le 1 ]; then
97 | echo "$0: Extract meta into $corpus_dir"
98 | # Sanity check.
99 | [ ! -f $gigaspeech_dir/GigaSpeech.json ] &&\
100 | echo "$0: Please download $gigaspeech_dir/GigaSpeech.json!" && exit 1;
101 | [ ! -d $gigaspeech_dir/audio ] &&\
102 | echo "$0: Please download $gigaspeech_dir/audio!" && exit 1;
103 |
104 | [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
105 |
106 | # Files to be created:
107 | # wav.scp utt2spk text and segments utt2dur reco2dur spk2utt
108 | python3 toolkits/kaldi/extract_meta.py \
109 | --pipe-format $gigaspeech_dir/GigaSpeech.json $corpus_dir || exit 1;
110 | utt2spk=$corpus_dir/utt2spk
111 | spk2utt=$corpus_dir/spk2utt
112 | toolkits/kaldi/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt ||\
113 | (echo "$0: utt2spk to spk2utt" && exit 1) || exit 1;
114 | fi
115 |
116 | if [ $stage -le 2 ]; then
117 | echo "$0: Filter $corpus_dir/text"
118 | # Delete utterances with garbage meta tags
119 | for tag in $garbage_utterance_tags; do
120 | sed -i "/${tag}/d" $corpus_dir/text
121 | done
122 |
123 | # Delete punctuations in utterances
124 | for tag in $punctuation_tags; do
125 | sed -i "s/${tag}//g" $corpus_dir/text
126 | done
127 |
128 | # Ensure spaces appear only once and the utt id is separated from the rest by '\t'
129 | sed -i 's/\t/ /g' $corpus_dir/text
130 | sed -i 's/[ ][ ]*/ /g' $corpus_dir/text
131 | sed -i 's/ /\t/' $corpus_dir/text
132 | fi
133 |
134 | if [ $stage -le 3 ]; then
135 | echo "$0: Split data to train, dev and test"
136 | # Split data to train, dev and test.
137 | [ ! -f $corpus_dir/utt2subsets ] &&\
138 | echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
139 | for label in $train_subset DEV TEST; do
140 | if [ ! ${subsets[$label]+set} ]; then
141 | echo "$0: Subset $label is not defined in GigaSpeech.json." && exit 1;
142 | fi
143 | subset=${subsets[$label]}
144 | [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
145 | grep "{$label}" $corpus_dir/utt2subsets \
146 | > $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
147 | subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
148 | $corpus_dir $data_dir/${prefix}$subset || exit 1;
149 | done
150 | fi
151 |
152 | echo "$0: Done"
153 |
--------------------------------------------------------------------------------
/toolkits/kaldi/utt2spk_to_spk2utt.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # Copyright 2010-2011 Microsoft Corporation
3 |
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
11 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
12 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
13 | # MERCHANTABLITY OR NON-INFRINGEMENT.
14 | # See the Apache 2 License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # converts an utt2spk file to a spk2utt file.
18 | # Takes input from the stdin or from a file argument;
19 | # output goes to the standard out.
20 |
21 | if ( @ARGV > 1 ) {
22 | die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
23 | }
24 |
25 | while(<>){
26 | @A = split(" ", $_);
27 | @A == 2 || die "Invalid line in utt2spk file: $_";
28 | ($u,$s) = @A;
29 | if(!$seen_spk{$s}) {
30 | $seen_spk{$s} = 1;
31 | push @spklist, $s;
32 | }
33 | push (@{$spk_hash{$s}}, "$u");
34 | }
35 | foreach $s (@spklist) {
36 | $l = join(' ',@{$spk_hash{$s}});
37 | print "$s $l\n";
38 | }
39 |
40 |
--------------------------------------------------------------------------------
/toolkits/wenet/extract_meta.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
2 | # Mobvoi Corporation (Author: Di Wu)
3 |
4 | import sys
5 | import os
6 | import argparse
7 | import json
8 |
9 |
10 | def get_args():
11 | parser = argparse.ArgumentParser(description="""
12 | This script is used to process raw json dataset of GigaSpeech,
13 | where the long wav is split into segments and
14 | data in WeNet format is generated.
15 | """)
16 | parser.add_argument('input_json', help="""Input json file of Gigaspeech""")
17 | parser.add_argument('output_dir', help="""Output dir for prepared data""")
18 |
19 | args = parser.parse_args()
20 | return args
21 |
22 |
23 | def meta_analysis(input_json, output_dir):
24 | input_dir = os.path.dirname(input_json)
25 |
26 | if not os.path.exists(output_dir):
27 | os.makedirs(output_dir)
28 |
29 | try:
30 | with open(input_json, 'r') as injson:
31 | json_data = json.load(injson)
32 | except:
33 | sys.exit(f'Failed to load input json file: {input_json}')
34 | else:
35 | if json_data['audios'] is not None:
36 | with open(f'{output_dir}/text', 'w') as utt2text, \
37 | open(f'{output_dir}/segments', 'w') as segments, \
38 | open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
39 | open(f'{output_dir}/wav.scp', 'w') as wavscp, \
40 | open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
41 | open(f'{output_dir}/reco2dur', 'w') as reco2dur:
42 | for long_audio in json_data['audios']:
43 | try:
44 | long_audio_path = os.path.realpath(os.path.join(input_dir, long_audio['path']))
45 | aid = long_audio['aid']
46 | segments_lists = long_audio['segments']
47 | duration = long_audio['duration']
48 | assert(os.path.exists(long_audio_path))
49 | assert('opus' == long_audio['format'])
50 | assert(16000 == long_audio['sample_rate'])
51 | except AssertionError:
52 | print(f'Warning: {aid} something is wrong, maybe AssertionError, skipped')
53 | continue
54 | except:
55 | print(f'Warning: {aid} something is wrong, maybe the error path: {long_audio_path}, skipped')
56 | continue
57 | else:
58 | wavscp.write(f'{aid}\t{long_audio_path}\n')
59 | reco2dur.write(f'{aid}\t{duration}\n')
60 | for segment_file in segments_lists:
61 | try:
62 | sid = segment_file['sid']
63 | start_time = segment_file['begin_time']
64 | end_time = segment_file['end_time']
65 | dur = end_time - start_time
66 | text = segment_file['text_tn']
67 | segment_subsets = segment_file["subsets"]
68 | except:
69 | print(f'Warning: {segment_file} something is wrong, skipped')
70 | continue
71 | else:
72 | utt2text.write(f'{sid}\t{text}\n')
73 | segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
74 | utt2dur.write(f'{sid}\t{dur}\n')
75 | segment_sub_names = " " .join(segment_subsets)
76 | utt2subsets.write(f'{sid}\t{segment_sub_names}\n')
77 |
78 | def main():
79 | args = get_args()
80 |
81 | meta_analysis(args.input_json, args.output_dir)
82 |
83 |
84 | if __name__ == '__main__':
85 | main()
86 |
--------------------------------------------------------------------------------
/toolkits/wenet/gigaspeech_data_prep.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 | # Mobvoi Corporation (Author: Di Wu)
5 |
6 | set -e
7 | set -o pipefail
8 |
9 | stage=1
10 | prefix=gigaspeech
11 | garbage_utterance_tags="<SIL> <NOISE> <MUSIC> <OTHER>"
12 | punctuation_tags="<COMMA> <PERIOD> <QUESTIONMARK> <EXCLAMATIONPOINT>"
13 | train_subset=XL
14 |
15 | . ./utils/parse_options.sh || exit 1;
16 |
17 | filter_by_id () {
18 | idlist=$1
19 | input=$2
20 | output=$3
21 | field=1
22 | if [ $# -eq 4 ]; then
23 | field=$4
24 | fi
25 | cat $input | perl -se '
26 | open(F, "<$idlist") || die "Could not open id-list file $idlist";
27 | while(<F>) {
28 | @A = split;
29 | @A>=1 || die "Invalid id-list file line $_";
30 | $seen{$A[0]} = 1;
31 | }
32 | while(<>) {
33 | @A = split;
34 | @A > 0 || die "Invalid file line $_";
35 | @A >= $field || die "Invalid file line $_";
36 | if ($seen{$A[$field-1]}) {
37 | print $_;
38 | }
39 | }' -- -idlist="$idlist" -field="$field" > $output ||\
40 | (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
41 | }
42 |
43 | subset_data_dir () {
44 | utt_list=$1
45 | src_dir=$2
46 | dest_dir=$3
47 | mkdir -p $dest_dir || exit 1;
48 | # wav.scp text segments utt2dur
49 | filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
50 | (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
51 | filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
52 | (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
53 | filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
54 | (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
55 | awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
56 | filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
57 | (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
58 | rm -f $dest_dir/reco
59 | }
60 |
61 | if [ $# -ne 2 ]; then
62 | echo "Usage: $0 [options] "
63 | echo " e.g.: $0 --train-subset XL /disk1/audio_data/gigaspeech/ data/"
64 | echo ""
65 | echo "This script takes the GigaSpeech source directory, and prepares the"
66 | echo "WeNet format data directory."
67 | echo " --garbage-utterance-tags # Tags for non-speech."
68 | echo " --prefix # Prefix for output data directory."
69 | echo " --punctuation-tags # Tags for punctuations."
70 | echo " --stage # Processing stage."
71 | echo " --train-subset # Train subset to be created."
72 | exit 1
73 | fi
74 |
75 | gigaspeech_dir=$1
76 | data_dir=$2
77 |
78 | declare -A subsets
79 | subsets=(
80 | [XL]="train_xl"
81 | [L]="train_l"
82 | [M]="train_m"
83 | [S]="train_s"
84 | [XS]="train_xs"
85 | [DEV]="dev"
86 | [TEST]="test")
87 | prefix=${prefix:+${prefix}_}
88 |
89 | corpus_dir=$data_dir/${prefix}corpus/
90 | if [ $stage -le 1 ]; then
91 | echo "$0: Extract meta into $corpus_dir"
92 | # Sanity check.
93 | [ ! -f $gigaspeech_dir/GigaSpeech.json ] &&\
94 | echo "$0: Please download $gigaspeech_dir/GigaSpeech.json!" && exit 1;
95 | [ ! -d $gigaspeech_dir/audio ] &&\
96 | echo "$0: Please download $gigaspeech_dir/audio!" && exit 1;
97 |
98 | [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
99 |
100 | # Files to be created:
101 | # wav.scp text segments utt2dur
102 | python3 toolkits/wenet/extract_meta.py \
103 | $gigaspeech_dir/GigaSpeech.json $corpus_dir || exit 1;
104 | fi
105 |
106 | if [ $stage -le 2 ]; then
107 | echo "$0: Filter $corpus_dir/text"
108 | # Delete utterances with garbage meta tags
109 | for tag in $garbage_utterance_tags; do
110 | sed -i "/${tag}/d" $corpus_dir/text
111 | done
112 |
113 | # Delete punctuations in utterances
114 | for tag in $punctuation_tags; do
115 | sed -i "s/${tag}//g" $corpus_dir/text
116 | done
117 |
118 | # Ensure spaces appear only once and the utt id is separated from the text by '\t'
119 | sed -i 's/\t/ /g' $corpus_dir/text
120 | sed -i 's/[ ][ ]*/ /g' $corpus_dir/text
121 | sed -i 's/ /\t/' $corpus_dir/text
122 | fi
123 |
124 | if [ $stage -le 3 ]; then
125 | echo "$0: Split data to train, dev and test"
126 | # Split data to train, dev and test.
127 | [ ! -f $corpus_dir/utt2subsets ] &&\
128 | echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
129 | for label in $train_subset DEV TEST; do
130 | if [ ! ${subsets[$label]+set} ]; then
131 | echo "$0: Subset $label is not defined in GigaSpeech.json." && exit 1;
132 | fi
133 | subset=${subsets[$label]}
134 | [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
135 | grep "{$label}" $corpus_dir/utt2subsets \
136 | > $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
137 | subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
138 | $corpus_dir $data_dir/${prefix}$subset || exit 1;
139 | done
140 | fi
141 |
142 | echo "$0: Done"
143 |
--------------------------------------------------------------------------------
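(Note: an end-to-end sketch of the script above, run from the repo root so ./utils/parse_options.sh resolves; the paths are the placeholders from its usage message.)

  toolkits/wenet/gigaspeech_data_prep.sh --train-subset S \
    /disk1/audio_data/gigaspeech data
  # With the default prefix "gigaspeech", the prepared directories are:
  ls data/gigaspeech_train_s data/gigaspeech_dev data/gigaspeech_test
  # each containing: segments  text  utt2dur  wav.scp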
/utils/check_audio_md5.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Jiayu DU
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 |
5 |
6 | set -e
7 | set -o pipefail
8 |
9 | if [ $# -ne 1 ]; then
10 | echo "Usage: $0 "
11 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
12 | echo ""
13 | echo "This script tries to detect errors in the downloaded audio files "
14 | echo "by comparing your local audio files' md5 with those in GigaSpeech.json"
15 | exit 1
16 | fi
17 |
18 | gigaspeech_dataset_dir=$1
19 |
20 | failed=false
21 | if [[ `uname -s` == "Linux" ]]; then
22 | if ! which md5sum >/dev/null; then
23 | echo "$0: Please install md5sum"
24 | exit 1
25 | fi
26 | utils/ls_md5.sh $gigaspeech_dataset_dir | (while read line; do
27 | echo $line | md5sum -c --strict --quiet --status 2>/dev/null
28 | if [ $? -ne 0 ]; then
29 | echo "$0: md5 verification failed for: \"$line\""
30 | failed=true
31 | fi
32 | done
33 |
34 | if [ "$failed" = true ]; then
35 | echo "$0: md5 verification failed, check the above logs."
36 | exit 1
37 | fi) || exit 1
38 | elif [[ `uname -s` == "Darwin" ]]; then
39 | if ! which md5 >/dev/null; then
40 | echo "$0: Please install md5"
41 | exit 1
42 | fi
43 | utils/ls_md5.sh $gigaspeech_dataset_dir | (while read line; do
44 | checksum=`echo $line | awk '{print $1}'`
45 | file=`echo $line | awk '{print $2}'`
46 | checksum_from_file=`md5 -q $file`
47 | if [[ "$checksum_from_file" != "$checksum" ]]; then
48 | echo "$0: md5 verification failed for: \"$line\""
49 | failed=true
50 | fi
51 | done
52 |
53 | if [ "$failed" = true ]; then
54 | echo "$0: md5 verification failed, check the above logs."
55 | exit 1
56 | fi) || exit 1
57 | else
58 | echo "$0: $0 only supports Linux and Mac OS"
59 | exit 1
60 | fi
61 |
62 | echo "$0: Successfully verified audio files."
63 |
--------------------------------------------------------------------------------
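(Note: typical run of check_audio_md5.sh, with the placeholder path from its usage message. On Linux it pipes utils/ls_md5.sh output, one "<md5> <path>" pair per line, into md5sum -c; on macOS it recomputes each file's md5 with md5 -q.)

  utils/check_audio_md5.sh /disk1/audio_data/gigaspeech \
    && echo "audio collection verified"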
/utils/check_metadata_md5.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Jiayu DU
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 |
5 |
6 | set -e
7 | set -o pipefail
8 |
9 | if [ $# -ne 1 ]; then
10 | echo "Usage: $0 "
11 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
12 | echo ""
13 | echo "This script tries to detect errors in the downloaded metadata "
14 | echo "by checking the md5 value. You can find the expected md5 value in"
15 | echo "misc/metadata_versions.txt."
16 | exit 1
17 | fi
18 |
19 | gigaspeech_dataset_dir=$1
20 |
21 | if [ ! -f $gigaspeech_dataset_dir/GigaSpeech.json ]; then
22 | echo "$0: Metadata $gigaspeech_dataset_dir/GigaSpeech.json does not exist."
23 | exit 1
24 | fi
25 |
26 | verified="false"
27 | local_version=$(utils/extract_metadata_version.sh $gigaspeech_dataset_dir)
28 | if [[ `uname -s` == "Linux" ]]; then
29 | if ! which md5sum >/dev/null; then
30 | echo "$0: Please install md5sum"
31 | exit 1
32 | fi
33 | local_md5=$(md5sum $gigaspeech_dataset_dir/GigaSpeech.json | awk '{print $1}')
34 | elif [[ `uname -s` == "Darwin" ]]; then
35 | if ! which md5 >/dev/null; then
36 | echo "$0: Please install md5"
37 | exit 1
38 | fi
39 | local_md5=$(md5 -r $gigaspeech_dataset_dir/GigaSpeech.json | awk '{print $1}')
40 | else
41 | echo "$0: only supports Linux and Mac OS"
42 | exit 1
43 | fi
44 |
45 | grep -v '^#' misc/metadata_versions.txt | (while read line; do
46 | version=$(echo $line | awk '{print $1}')
47 | md5=$(echo $line | awk '{print $2}')
48 | if [[ "$local_version" == "$version" ]]; then
49 | if [[ "$local_md5" == "$md5" ]]; then
50 | echo "$0: Successfully verified metadata version:$version, md5:$md5"
51 | verified="true"
52 | else
53 | echo "$0: ERROR, $local_version expects md5=$md5, got $local_md5"
54 | exit 1;
55 | fi
56 | fi
57 | done
58 |
59 | if [[ "$verified" == "false" ]]; then
60 | echo "$0: md5 verification failed for unknown version $local_version"
61 | exit 1
62 | fi) || exit 1;
63 |
64 | echo "$0: Done md5 verification."
65 |
--------------------------------------------------------------------------------
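(Note: a usage sketch for check_metadata_md5.sh with the placeholder path from its usage message. As the awk fields above imply, each non-comment line of misc/metadata_versions.txt is expected to hold a version string followed by the md5 of that release's GigaSpeech.json.)

  utils/check_metadata_md5.sh /disk1/audio_data/gigaspeech
  # misc/metadata_versions.txt lines:  <version>  <md5-of-GigaSpeech.json>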
/utils/download_gigaspeech.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 | # Jiayu DU
5 | # Tsinghua University (Author: Shuzhou Chai)
6 |
7 | set -e
8 | set -o pipefail
9 |
10 | stage=0
11 | with_dict=false
12 |
13 | # Support hosts:
14 | # 1. oss
15 | # 2. tsinghua
16 | # 3. speechocean
17 | # 4. magicdata
18 | host=
19 | subset={XL} # unavailable for oss
20 | download_eval=true
21 |
22 | . ./env_vars.sh || exit 1
23 | . ./utils/parse_options.sh || exit 1
24 |
25 |
26 | if [ $# -ne 1 ]; then
27 | echo "Usage: $0 "
28 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
29 | echo ""
30 | echo "This script downloads the entire GigaSpeech dataset"
31 | echo "to your local dir . "
32 | echo "options:"
33 | echo " --with-dict true|false(default) download cmudict & g2p model"
34 | echo " --stage stage(default 0) specifies from which stage to start with"
35 | echo " --host tsinghua|speechocean|magicdata|oss specifies the host"
36 | echo " --subset subset(default {XL}) specifies the subset to download"
37 | echo " --download-eval true(default)|false download {DEV} and {TEST} subsets"
38 | exit 1
39 | fi
40 |
41 | gigaspeech_dataset_dir=$1
42 | mkdir -p $gigaspeech_dataset_dir || exit 1;
43 |
44 | # Check credentials.
45 | if [ ! -f SAFEBOX/password ]; then
46 | echo -n "$0: Please apply for the download credentials (see the \"Download\""
47 | echo " section in README) and it to SAFEBOX/password."
48 | exit 1;
49 | fi
50 | PASSWORD=`cat SAFEBOX/password 2>/dev/null`
51 | if [ -z "$PASSWORD" ]; then
52 | echo "$0: Error, SAFEBOX/password is empty."
53 | exit 1;
54 | fi
55 | PASSWORD_MD5=`echo $PASSWORD | md5sum | cut -d ' ' -f 1`
56 | if [[ $PASSWORD_MD5 != "dfbf0cde1a3ce23749d8d81e492741b8" ]]; then
57 | echo "$0: Error, invalid SAFEBOX/password."
58 | exit 1;
59 | fi
60 |
61 | # Check downloading tools
62 | if ! which wget >/dev/null; then
63 | echo "$0: Error, please make sure you have wget installed."
64 | exit 1
65 | fi
66 |
67 | # Set up speed test.
68 | test_local_file="GigaSpeech.json.gz.aes"
69 | check_download_speed() {
70 | # Downloading for 30 seconds.
71 | rm -f "/tmp/$test_local_file"
72 | local duration=30
73 | eval "$1 &" || exit 1;
74 | local jobid=$!
75 | trap "kill $jobid; rm -f /tmp/$test_local_file; exit 1" INT
76 |
77 | # Wait for $duration seconds, but exit if the job finishes earlier.
78 | for t in `seq 1 $duration`; do
79 | if ! ps -p $jobid > /dev/null; then
80 | if [[ -f "/tmp/$test_local_file" ]]; then
81 | local file_size="$(du -sk /tmp/$test_local_file | cut -f1)"
82 | rm -f "/tmp/$test_local_file"
83 | local speed="$(echo "scale=3; $file_size/1024/$duration" | bc)"
84 | echo "$speed"
85 | return 0
86 | else
87 | echo "0"
88 | return 0
89 | fi
90 | fi
91 | sleep 1
92 | done
93 |
94 | # Check if the job is still alive; if so, kill it.
95 | if ps -p $jobid > /dev/null; then
96 | kill $jobid || exit 1;
97 | # Check file size.
98 | if [[ -f "/tmp/$test_local_file" ]]; then
99 | local file_size="$(du -sk /tmp/$test_local_file | cut -f1)"
100 | rm -f "/tmp/$test_local_file"
101 | local speed="$(echo "scale=3; $file_size/1024/$duration" | bc)"
102 | echo "$speed"
103 | else
104 | echo "0"
105 | fi
106 | else
107 | echo "0"
108 | fi
109 | }
110 |
111 | if [ -z "$host" ];then
112 | # Default download host.
113 | host=tsinghua
114 | speed=0
115 |
116 | # Check all available hosts and choose the fastest one.
117 | echo "$0: Testing Tsinghua host speed..."
118 | wget_cmd="wget -c -t 20 -T 90 -P /tmp"
119 | wget_cmd="$wget_cmd $GIGASPEECH_RELEASE_URL_TSINGHUA/GigaSpeech.json.gz.aes"
120 | speed=$(check_download_speed "$wget_cmd")
121 | echo; echo "$0: The Tsinghua host speed: $speed MB/s."; echo;
122 |
123 | echo "$0: Testing speechocean host speed..."
124 | wget_cmd="wget -c -t 20 -T 90 -P /tmp"
125 | wget_cmd="$wget_cmd --ftp-user=GigaSpeech --ftp-password=$PASSWORD"
126 | wget_cmd="$wget_cmd $GIGASPEECH_RELEASE_URL_SPEECHOCEAN/"
127 | wget_cmd="${wget_cmd}GigaSpeech.json.gz.aes"
128 | speechocean_speed=$(check_download_speed "$wget_cmd")
129 | if [ $(echo "$speed < $speechocean_speed" | bc) = 1 ]; then
130 | host=speechocean
131 | speed=$speechocean_speed
132 | fi
133 | echo; echo "$0: The speechocean host speed: $speechocean_speed MB/s."; echo;
134 |
135 | echo "$0: Testing Magic Data host speed..."
136 | wget_cmd="wget -c -t 20 -T 90 -P /tmp"
137 | wget_cmd="$wget_cmd $GIGASPEECH_RELEASE_URL_MAGICDATA/GigaSpeech.json.gz.aes"
138 | magicdata_speed=$(check_download_speed "$wget_cmd")
139 | if [ $(echo "$speed < $magicdata_speed" | bc) = 1 ]; then
140 | host=magicdata
141 | speed=$magicdata_speed
142 | fi
143 | echo; echo "$0: The Magic Data host speed: $magicdata_speed MB/s."; echo;
144 |
145 | # Check if there is available host.
146 | if [ $(echo "$speed == 0" | bc) = 1 ]; then
147 | echo "$0: All hosts are down..."
148 | exit 1;
149 | fi
150 | echo; echo "$0: Using $host host, speed is $speed MB/s."; echo;
151 | fi
152 |
153 | if [[ "$host" == "oss" ]]; then
154 | # This is for SpeechColab collaborators, need 500G free space
155 | echo "$0: Downloading from the oss host..."
156 | utils/internal/download_gigaspeech_from_oss.sh \
157 | --stage $stage --with-dict $with_dict \
158 | $gigaspeech_dataset_dir || exit 1;
159 | elif [[ "$host" == "tsinghua" || "$host" == "speechocean" || "$host" == "magicdata" ]]; then
160 | # This is for public release, need 1.0T free space
161 | echo "$0: Downloading with PySpeechColab..."
162 | utils/internal/download_gigaspeech_with_pyspeechcolab.sh \
163 | --host $host --subset $subset --with-dict $with_dict \
164 | --download-eval $download_eval \
165 | $gigaspeech_dataset_dir || exit 1;
166 | else
167 | echo "$0: Unsupported host: $host"
168 | exit 1
169 | fi
170 |
171 | echo "$0: Done"
172 |
--------------------------------------------------------------------------------
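(Note: typical invocations of download_gigaspeech.sh, assuming the credentials are already in SAFEBOX/password; the subset value follows the braced {XS}/{S}/... convention used by the --subset default above and is only honored by the non-oss hosts.)

  # Let the script pick the fastest public host automatically:
  utils/download_gigaspeech.sh /disk1/audio_data/gigaspeech
  # Or pin a host and download a smaller training subset plus {DEV}/{TEST}:
  utils/download_gigaspeech.sh --host tsinghua --subset "{S}" \
    --download-eval true /disk1/audio_data/gigaspeech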
/utils/extract_metadata_version.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 SpeechColab Authors
3 |
4 | set -e
5 | set -o pipefail
6 |
7 | if [ $# -ne 1 ]; then
8 | echo "Usage: $0 "
9 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
10 | echo ""
11 | echo "This script extract version field from metadata file"
12 | exit 1
13 | fi
14 |
15 | gigaspeech_dataset_dir=$1
16 |
17 | if ! which jq >/dev/null; then
18 | >&2 echo "$0: You have to get jq installed in order to use this. See"
19 | >&2 echo "$0: utils/install_jq.sh"
20 | exit 1
21 | fi
22 |
23 | if [ -f $gigaspeech_dataset_dir/GigaSpeech.json ]; then
24 | cat $gigaspeech_dataset_dir/GigaSpeech.json | jq -r '.version'
25 | else
26 | >&2 echo "$0: ERROR, couldn't find $gigaspeech_dataset_dir/GigaSpeech.json"
27 | exit 1
28 | fi
29 |
--------------------------------------------------------------------------------
/utils/extract_subset_segments.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf8
3 | # Copyright 2022 Jiayu DU
4 |
5 | '''
6 | This tool extracts supervised segments from GigaSpeech.
7 | Segments are saved as .wav files, and supervisions are saved in a simple .tsv file:
8 |
9 | --- example tsv begin ---
10 | ID AUDIO DURATION TEXT
11 | POD1000000004_S0000017 audio/POD1000000004/POD1000000004_S0000017.wav 3.163 YOU KNOW TO PUT THIS STUFF TOGETHER
12 | ...
13 | ...
14 |
15 | --- example tsv end ---
16 |
17 | It can be, but should not be, used to extract large subsets such as L or XL (it would be extremely slow).
18 | '''
19 |
20 | import os, sys
21 | import argparse
22 | import csv
23 | from speechcolab.datasets.gigaspeech import GigaSpeech
24 | import torchaudio
25 |
26 | gigaspeech_punctuations = ['<COMMA>', '<PERIOD>', '<QUESTIONMARK>', '<EXCLAMATIONPOINT>']
27 | gigaspeech_garbage_utterance_tags = ['<SIL>', '<NOISE>', '<MUSIC>', '<OTHER>']
28 |
29 | if __name__ == '__main__':
30 | parser = argparse.ArgumentParser(description='Save the audio segments into wav, and meta into tsv.')
31 | parser.add_argument('--subset', choices = ['XS', 'S', 'M', 'L', 'XL', 'DEV', 'TEST'], default='XS', help='The subset name')
32 | parser.add_argument('gigaspeech_dataset_dir', help='The GigaSpeech corpus directory')
33 | parser.add_argument('dst_dir', help='Output subset directory')
34 | args = parser.parse_args()
35 |
36 | os.makedirs(args.dst_dir, exist_ok = True)
37 |
38 | gigaspeech = GigaSpeech(args.gigaspeech_dataset_dir)
39 | subset = '{' + args.subset + '}'
40 | with open(os.path.join(args.dst_dir, 'metadata.tsv'), 'w+', encoding='utf8') as fo:
41 | csv_header_fields = ['ID', 'AUDIO', 'DURATION', 'TEXT']
42 | csv_writer = csv.DictWriter(fo, delimiter='\t', fieldnames=csv_header_fields, lineterminator='\n')
43 | csv_writer.writeheader()
44 | for audio in gigaspeech.audios(subset):
45 | aid = audio['aid']
46 | audio_path = os.path.join(args.gigaspeech_dataset_dir, audio["path"])
47 |
48 | audio_info = torchaudio.info(audio_path)
49 | opus_sample_rate = audio_info.sample_rate
50 | assert opus_sample_rate == 48000
51 | nc = audio_info.num_channels
52 | assert nc == 1
53 |
54 | sample_rate = 16000
55 | long_waveform, _ = torchaudio.load(audio_path)
56 | long_waveform = torchaudio.transforms.Resample(opus_sample_rate, sample_rate)(long_waveform)
57 |
58 | for segment in audio['segments']:
59 | sid = segment['sid']
60 |
61 | if subset not in segment['subsets']:
62 | continue
63 |
64 | text = segment['text_tn']
65 | for punctuation in gigaspeech_punctuations:
66 | text = text.replace(punctuation, '').strip()
67 | text = ' '.join(text.split())
68 |
69 | if text in gigaspeech_garbage_utterance_tags:
70 | continue
71 |
72 | begin = segment['begin_time']
73 | duration = segment['end_time'] - segment['begin_time']
74 | frame_offset = int(begin * sample_rate)
75 | num_frames = int(duration * sample_rate)
76 |
77 | waveform = long_waveform[0][frame_offset : frame_offset + num_frames] # mono
78 |
79 | segment_path = os.path.join('audio', aid, f'{sid}.wav')
80 | os.makedirs(os.path.join(args.dst_dir, os.path.dirname(segment_path)), exist_ok = True)
81 | torchaudio.save(
82 | os.path.join(args.dst_dir, segment_path),
83 | waveform.unsqueeze(0),
84 | sample_rate = sample_rate,
85 | format = 'wav',
86 | encoding = 'PCM_S',
87 | bits_per_sample = 16,
88 | )
89 |
90 | utt = {'ID': segment['sid'], 'AUDIO': segment_path, 'DURATION': f'{duration:.4f}', 'TEXT': text }
91 | csv_writer.writerow(utt)
92 |
93 |
--------------------------------------------------------------------------------
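(Note: a usage sketch for extract_subset_segments.py; the output directory name is an arbitrary choice, and torchaudio plus speechcolab must be installed. It writes 16 kHz per-segment wavs under audio/<aid>/ and the metadata.tsv described in the docstring.)

  python3 utils/extract_subset_segments.py --subset XS \
    /disk1/audio_data/gigaspeech exp/gigaspeech_xs
  head -n 3 exp/gigaspeech_xs/metadata.tsv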
/utils/gigaspeech_scoring.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import argparse
4 |
5 | conversational_filler = ['UH', 'UHH', 'UM', 'EH', 'MM', 'HM', 'AH', 'HUH', 'HA', 'ER', 'OOF', 'HEE' , 'ACH', 'EEE', 'EW']
6 | unk_tags = ['<UNK>', '<unk>']
7 | gigaspeech_punctuations = ['<COMMA>', '<PERIOD>', '<QUESTIONMARK>', '<EXCLAMATIONPOINT>']
8 | gigaspeech_garbage_utterance_tags = ['<SIL>', '<NOISE>', '<MUSIC>', '<OTHER>']
9 | non_scoring_words = conversational_filler + unk_tags + gigaspeech_punctuations + gigaspeech_garbage_utterance_tags
10 |
11 | def asr_text_post_processing(text):
12 | # 1. convert to uppercase
13 | text = text.upper()
14 |
15 | # 2. remove hyphen
16 | # "E-COMMERCE" -> "E COMMERCE", "STATE-OF-THE-ART" -> "STATE OF THE ART"
17 | text = text.replace('-', ' ')
18 |
19 | # 3. remove non-scoring words from evaluation
20 | remaining_words = []
21 | for word in text.split():
22 | if word in non_scoring_words:
23 | continue
24 | remaining_words.append(word)
25 |
26 | return ' '.join(remaining_words)
27 |
28 | if __name__ == '__main__':
29 | parser = argparse.ArgumentParser(description="This script evaluates GigaSpeech ASR result via SCTK's tool sclite")
30 | parser.add_argument('ref', type=str, help="sclite's standard transcription(trn) reference file")
31 | parser.add_argument('hyp', type=str, help="sclite's standard transcription(trn) hypothesis file")
32 | parser.add_argument('work_dir', type=str, help='working dir')
33 | args = parser.parse_args()
34 |
35 | if not os.path.isdir(args.work_dir):
36 | os.mkdir(args.work_dir)
37 |
38 | REF = os.path.join(args.work_dir, 'REF')
39 | HYP = os.path.join(args.work_dir, 'HYP')
40 | RESULT = os.path.join(args.work_dir, 'RESULT')
41 |
42 | for io in [(args.ref, REF), (args.hyp, HYP)]:
43 | with open(io[0], 'r', encoding='utf8') as fi, open(io[1], 'w+', encoding='utf8') as fo:
44 | for line in fi:
45 | line = line.strip()
46 | if line:
47 | cols = line.split()
48 | text = asr_text_post_processing(' '.join(cols[0:-1]))
49 | uttid_field = cols[-1]
50 | print(F'{text} {uttid_field}', file=fo)
51 |
52 | os.system(F'sclite -r {REF} trn -h {HYP} trn -i swb | tee {RESULT}') # GigaSpeech's uttid conforms to swb
53 |
54 |
--------------------------------------------------------------------------------
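(Note: a usage sketch for gigaspeech_scoring.py with placeholder file names. sclite from SCTK must be on the PATH, and both inputs must be in sclite's trn format, i.e. the text followed by the utterance id as the last column, which the script takes via cols[-1].)

  # ref.trn / hyp.trn lines look like:  SOME RECOGNIZED WORDS (POD1000000004_S0000017)
  python3 utils/gigaspeech_scoring.py ref.trn hyp.trn exp/scoring
  cat exp/scoring/RESULT   # the sclite report captured via tee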
/utils/install_jq.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 | set -o pipefail
5 |
6 | if [ `uname -s` == 'Linux' ]; then
7 | if [ "`grep NAME /etc/os-release | grep Ubuntu`" != "" ] ||\
8 | [ "`grep NAME /etc/os-release | grep Debian`" != "" ]; then
9 | apt-get install jq || exit 1
10 | elif [ "`grep NAME /etc/os-release | grep CentOS`" != "" ]; then
11 | yum install jq || exit 1
12 | else
13 | echo "$0: Unknown platform."
14 | exit 1
15 | fi
16 | elif [ `uname -s` == 'Darwin' ]; then
17 | brew install jq || exit 1
18 | fi
19 |
20 | echo "$0: Done"
21 |
--------------------------------------------------------------------------------
/utils/internal/download_gigaspeech_from_oss.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Jiayu Du
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | stage=0
9 | with_dict=false
10 |
11 | . ./utils/parse_options.sh || exit 1
12 |
13 | if [ $# -ne 1 ]; then
14 | echo "Usage: $0 "
15 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
16 | echo ""
17 | echo "This script downloads the entire GigaSpeech Dataset from Aliyun."
18 | echo "This tool is used for our collaborator, not for public users."
19 | echo "We suggest having at least 500G of free space in local dir."
20 | echo "If dataset resources are updated, you can just re-run this script for "
21 | echo "incremental downloading, downloader will only download updates"
22 | exit 1
23 | fi
24 |
25 | gigaspeech_dataset_dir=$1
26 |
27 |
28 | . ./env_vars.sh || exit 1
29 | GIGASPEECH_RELEASE_URL=$GIGASPEECH_RELEASE_URL_OSS
30 |
31 | if [ -z "${GIGASPEECH_RELEASE_URL}" ]; then
32 | echo "$0: Error, variable GIGASPEECH_RELEASE_URL_OSS(in env_vars.sh) is empty."
33 | exit 1
34 | fi
35 |
36 | if [ ! -f SAFEBOX/aliyun_ossutil.cfg ]; then
37 | echo "$0: Error, make sure you have: SAFEBOX/aliyun_ossutil.cfg"
38 | exit 1
39 | fi
40 |
41 | # install downloader (Official client for ALIYUN Objects-Storage-Service)
42 | ossbin=tools/downloader/oss
43 | if [ $stage -le 0 ]; then
44 | [ ! -d tools/downloader ] && mkdir -p tools/downloader
45 | if [ `uname -s` == 'Linux' ]; then
46 | wget -O $ossbin \
47 | http://gosspublic.alicdn.com/ossutil/1.7.1/ossutil64 || exit 1
48 | elif [ `uname -s` == 'Darwin' ]; then
49 | curl -o $ossbin \
50 | http://gosspublic.alicdn.com/ossutil/1.7.1/ossutilmac64 || exit 1
51 | fi
52 | chmod 755 $ossbin
53 | fi
54 |
55 | if [ $stage -le 1 ]; then
56 | echo "$0: Skip downloading TERM_OF_ACCESS, our co-authors don't need this"
57 | fi
58 |
59 | # Download metadata
60 | if [ $stage -le 2 ]; then
61 | echo "$0: Start to download GigaSpeech Metadata"
62 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \
63 | cp -u ${GIGASPEECH_RELEASE_URL}/GigaSpeech.json $gigaspeech_dataset_dir/ || exit 1
64 | fi
65 |
66 | # Download audio
67 | if [ $stage -le 3 ]; then
68 | echo "$0: Start to download GigaSpeech cached audio collection"
69 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \
70 | cp -ur ${GIGASPEECH_RELEASE_URL}/audio/ $gigaspeech_dataset_dir/audio || exit 1
71 | fi
72 |
73 | # Download optional dictionary and pretrained g2p model
74 | if [ $stage -le 4 ]; then
75 | if [ $with_dict == true ]; then
76 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \
77 | cp -u ${GIGASPEECH_RELEASE_URL}/dict/cmudict.0.7a \
78 | $gigaspeech_dataset_dir/dict/cmudict.0.7a || exit 1
79 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \
80 | cp -ur ${GIGASPEECH_RELEASE_URL}/dict/g2p $gigaspeech_dataset_dir/dict/ || exit 1
81 | fi
82 | fi
83 |
84 | # Check audio md5
85 | if [ $stage -le 5 ]; then
86 | echo "$0: Checking md5 of downloaded audio files"
87 | utils/check_audio_md5.sh $gigaspeech_dataset_dir || exit 1
88 | fi
89 |
90 | echo "$0: Done"
91 |
--------------------------------------------------------------------------------
/utils/internal/download_gigaspeech_with_pyspeechcolab.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Jiayu Du
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 | # Tsinghua University (Author: Shuzhou Chai)
5 | # Xiaomi Corporation (Author: Junbo Zhang)
6 |
7 | set -e
8 | set -o pipefail
9 |
10 | with_dict=false
11 | host=tsinghua
12 | subset={XL}
13 | download_eval=true
14 |
15 | . ./utils/parse_options.sh || exit 1
16 |
17 | if [ $# -ne 1 ]; then
18 | echo "Usage: $0 "
19 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
20 | echo ""
21 | echo "This script downloads the entire GigaSpeech Dataset from Tsinghua host."
22 | echo "We suggest having at least 1.0T of free space in the target directory."
23 | echo "If dataset resources are updated, you can re-run this script for "
24 | echo "incremental download."
25 | exit 1
26 | fi
27 |
28 | gigaspeech_dataset_dir=$1
29 | mkdir -p $gigaspeech_dataset_dir || exit 1;
30 |
31 | # Check dependency
32 | python3 -c "import speechcolab" 2> /dev/null || \
33 | (echo "$0: This recipe needs the package speechcolab installed.";
34 | echo "To install:"
35 | echo " pip install speechcolab"; exit 1)
36 |
37 | # Check credential
38 | if [ ! -f SAFEBOX/password ]; then
39 | echo "$0: Please apply for the download credentials (see the \"Download\""
40 | echo "$0: section in README) and it to SAFEBOX/password."
41 | exit 1
42 | fi
43 | PASSWORD=`cat SAFEBOX/password 2>/dev/null`
44 | if [ -z "$PASSWORD" ]; then
45 | echo "$0: Error, SAFEBOX/password is empty."
46 | exit 1
47 | fi
48 |
49 | # false -> False, true -> True
50 | with_dict=$(echo $with_dict | sed 's/^false$/False/;s/^true$/True/')
51 |
52 | # Download with PySpeechColab
53 | python3 << END
54 | from speechcolab.datasets.gigaspeech import GigaSpeech
55 | gigaspeech = GigaSpeech('$gigaspeech_dataset_dir')
56 | if '$download_eval' == 'true' and '$subset' != '{DEV}' and '$subset' != '{TEST}':
57 | gigaspeech.download('$PASSWORD', subset='{DEV}', host='$host', with_dict=$with_dict)
58 | gigaspeech.download('$PASSWORD', subset='{TEST}', host='$host', with_dict=$with_dict)
59 | gigaspeech.download('$PASSWORD', subset='$subset', host='$host', with_dict=$with_dict)
60 | END
61 |
62 |
63 | echo "$0: Done"
64 |
--------------------------------------------------------------------------------
/utils/ls_audios.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Jiayu Du
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 |
5 |
6 | set -e
7 | set -o pipefail
8 |
9 | if [ $# -ne 1 ]; then
10 | echo "Usage: $0 "
11 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
12 | echo ""
13 | echo "This script lists all audio files in dataset release."
14 | exit 1
15 | fi
16 |
17 | gigaspeech_dataset_dir=$1
18 |
19 | if ! which jq >/dev/null; then
20 | >&2 echo "$0: You have to get jq installed in order to use this. See"
21 | >&2 echo "$0: utils/install_jq.sh"
22 | exit 1
23 | fi
24 |
25 | cat $gigaspeech_dataset_dir/GigaSpeech.json \
26 | | jq -r '.audios[].path' |\
27 | awk -v prefix="$gigaspeech_dataset_dir" '{print prefix"/"$1}' || exit 1
28 |
--------------------------------------------------------------------------------
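(Note: since ls_audios.sh just prints the absolute audio paths recorded in GigaSpeech.json, its output composes with ordinary shell tools; the path below is the usual placeholder.)

  utils/ls_audios.sh /disk1/audio_data/gigaspeech | wc -l   # number of long audio files
  utils/ls_audios.sh /disk1/audio_data/gigaspeech \
    | while read f; do [ -f "$f" ] || echo "missing: $f"; done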
/utils/ls_md5.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 SpeechColab Authors
3 |
4 |
5 | set -e
6 | set -o pipefail
7 |
8 | if [ $# -ne 1 ]; then
9 | echo "Usage: $0 "
10 | echo " e.g.: $0 /disk1/audio_data/gigaspeech"
11 | echo ""
12 | echo "This script lists md5 for all audio files in dataset"
13 | echo "can be used in data consistency check"
14 | exit 1
15 | fi
16 |
17 | gigaspeech_dataset_dir=$1
18 |
19 | if ! which jq >/dev/null; then
20 | >&2 echo "$0: You have to get jq installed in order to use this. See"
21 | >&2 echo "$0: utils/install_jq.sh"
22 | exit 1
23 | fi
24 |
25 | if [ -f $gigaspeech_dataset_dir/GigaSpeech.json ]; then
26 | cat $gigaspeech_dataset_dir/GigaSpeech.json |\
27 | jq -r '.audios[] | "\(.md5) \(.path)"' |\
28 | awk -v prefix="$gigaspeech_dataset_dir" '{print $1" "prefix"/"$2}' || exit 1
29 | else
30 | >&2 echo "$0: ERROR, couldn't find $gigaspeech_dataset_dir/GigaSpeech.json"
31 | exit 1
32 | fi
33 |
--------------------------------------------------------------------------------
/utils/opus_to_wav.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Xiaomi (Author:Yongqing Wang)
2 |
3 | import os, sys
4 | import argparse
5 | import re
6 |
7 |
8 | def get_args():
9 | parser = argparse.ArgumentParser(description="""
10 | This script is used to convert opus files into wav files.""")
11 | parser.add_argument('--remove-opus', action='store_true', default=False,
12 | help="""If true, remove opus files""")
13 | parser.add_argument('opus_scp', help="""Input opus scp file""")
14 |
15 | args = parser.parse_args()
16 | return args
17 |
18 |
19 | def convert_opus2wav(opus_scp, rm_opus):
20 | with open(opus_scp, 'r') as oscp:
21 | for line in oscp:
22 | line = line.strip()
23 | utt, opus_path = re.split(r'\s+', line)
24 | wav_path = opus_path.replace('.opus', '.wav')
25 | cmd = f'ffmpeg -y -i {opus_path} -ac 1 -ar 16000 {wav_path}'
26 | # os.system does not raise on failure; check the return code instead.
27 | ret = os.system(cmd)
28 | if ret != 0:
29 | sys.exit(f'Failed to run the cmd: {cmd}')
30 | if rm_opus is True:
31 | os.remove(opus_path)
32 |
33 |
34 | def main():
35 | args = get_args()
36 | convert_opus2wav(args.opus_scp, args.remove_opus)
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
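(Note: a minimal sketch for opus_to_wav.py with placeholder paths. The scp input is Kaldi-style, one "<utt-id> <path-to-opus>" pair per line, and ffmpeg must be installed since the script shells out to it.)

  echo "POD1000000004 /path/to/POD1000000004.opus" > opus.scp
  python3 utils/opus_to_wav.py opus.scp                 # keeps the .opus files
  python3 utils/opus_to_wav.py --remove-opus opus.scp   # deletes them after conversion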
/utils/parse_options.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Seasalt AI, Inc (Author: Guoguo Chen)
3 |
4 |
5 | while true; do
6 | [ -z "${1:-}" ] && break; # break if there are no arguments
7 | case "$1" in
8 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
9 | eval '[ -z "${'$name'+xxx}" ]' &&\
10 | echo "$0: invalid option $1" 1>&2 && exit 1;
11 |
12 | oldval="`eval echo \\$$name`";
13 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
14 | was_bool=true;
15 | else
16 | was_bool=false;
17 | fi
18 | eval $name=\"$2\";
19 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
20 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
21 | exit 1;
22 | fi
23 | shift 2;
24 | ;;
25 | *) break;
26 | esac
27 | done
28 |
29 | true;
30 |
--------------------------------------------------------------------------------
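(Note: parse_options.sh implements the Kaldi-style option convention used by the scripts in this repo: declare option variables with defaults, source this file, and any "--option-name value" on the command line overwrites the matching option_name variable, with true/false defaults kept boolean. A sketch of a hypothetical script using it:)

  #!/usr/bin/env bash
  # hypothetical_example.sh -- illustrates the option-parsing convention only.
  stage=1            # defaults must be declared before sourcing parse_options.sh
  with_dict=false
  . ./utils/parse_options.sh || exit 1
  echo "stage=$stage with_dict=$with_dict"
  # Invocation:  ./hypothetical_example.sh --stage 2 --with-dict true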
/utils/show_segment_info.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2021 Jiayu Du
3 | # Seasalt AI, Inc (Author: Guoguo Chen)
4 |
5 |
6 | set -e
7 | set -o pipefail
8 |
9 | if [ $# -ne 2 ]; then
10 | echo "Usage: $0 "
11 | echo " e.g.: $0 /disk1/audio_data/gigaspeech POD1000000004_S0000000"
12 | echo ""
13 | echo "This script extracts information from GigaSpeech.json for the given"
14 | echo "segment."
15 | exit 1
16 | fi
17 |
18 | gigaspeech_dataset_dir=$1
19 | segment_id=$2
20 |
21 | if ! which jq >/dev/null; then
22 | >&2 echo "$0: You have to get jq installed in order to use this. See"
23 | >&2 echo "$0: utils/install_jq.sh"
24 | exit 1
25 | fi
26 |
27 | cat $gigaspeech_dataset_dir/GigaSpeech.json |\
28 | jq --arg query "$segment_id" \
29 | '.audios[].segments[] | select(.sid == $query)' || exit 1
30 |
31 | echo "$0: Done"
32 |
--------------------------------------------------------------------------------
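(Note: a usage sketch for show_segment_info.sh, reusing the segment id from its own example. The jq query returns the raw segment object; judging from the fields consumed by the extraction scripts above, it carries at least sid, begin_time, end_time, text_tn and subsets. The values below are illustrative placeholders, not taken from the release.)

  utils/show_segment_info.sh /disk1/audio_data/gigaspeech POD1000000004_S0000000
  # {
  #   "sid": "POD1000000004_S0000000",
  #   "begin_time": ...,
  #   "end_time": ...,
  #   "text_tn": "...",
  #   "subsets": [ "{XL}", ... ]
  # }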