├── .gitignore ├── LICENSE ├── README.md ├── SAFEBOX └── README.md ├── env_vars.sh ├── misc ├── debug │ └── debug.list └── metadata_versions.txt ├── toolkits ├── athena │ ├── extract_meta.py │ └── prepare_data.py ├── kaldi │ ├── extract_meta.py │ ├── gigaspeech_data_prep.sh │ └── utt2spk_to_spk2utt.pl └── wenet │ ├── extract_meta.py │ └── gigaspeech_data_prep.sh └── utils ├── check_audio_md5.sh ├── check_metadata_md5.sh ├── download_gigaspeech.sh ├── extract_metadata_version.sh ├── extract_subset_segments.py ├── gigaspeech_scoring.py ├── install_jq.sh ├── internal ├── download_gigaspeech_from_oss.sh └── download_gigaspeech_with_pyspeechcolab.sh ├── ls_audios.sh ├── ls_md5.sh ├── opus_to_wav.py ├── parse_options.sh └── show_segment_info.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Editors 132 | *.swp 133 | 134 | # GigaSpeech ignore list 135 | SAFEBOX/aliyun_ossutil.cfg 136 | SAFEBOX/password 137 | */dict/* 138 | */g2p/* 139 | *.json 140 | cmudict* 141 | ossutil64 142 | ossutilmac64 143 | ossutil_output/ 144 | tmp/ 145 | .DS_Store 146 | *.aes 147 | *.tgz 148 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GigaSpeech 2 | This is the official repository of the GigaSpeech dataset. For details of how we created the dataset, please refer to our Interspeech paper: *"GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio"*. [Preprint available on arxiv](https://arxiv.org/abs/2106.06909). 3 | 4 | GigaSpeech version: 1.0.0 (07/05/2021) 5 | 6 | ## Download 7 | 1. Step 1: Please fill out the Google Form [here](https://forms.gle/UuGQAPyscGRrUMLq6) 8 | 2. Step 2: 9 | - Option A: Follow the instructions in replied email from SpeechColab to get the raw release of GigaSpeech 10 | - Option B: Refer to [GigaSpeech On HuggingFace](https://github.com/SpeechColab/GigaSpeech/issues/117) to get a pre-processed version of GigaSpeech via HuggingFace. 
11 | 12 | ## Leaderboard 13 | 14 | | **Contributor**| **Toolkit** | **Train Recipe** | **Train Data** | **Inference** |**Dev/Test WER** | 15 | |:---------------|:------------------|:------------------|:------------------|:------------------|:------------------:| 16 | ||||| 17 | | Baseline | [Athena](https://github.com/athena-team/athena) | [Transformer-AED + RNNLM](https://github.com/athena-team/athena/tree/master/examples/asr/gigaspeech) | GigaSpeech v1.0.0 XL | [model](https://drive.google.com/drive/folders/1HUUKzfnqqVfQR3epUVnnOWw9EEFpulVM) [example](https://github.com/athena-team/athena/blob/e704884ec6a3a947769d892aa267578038e49ecb/examples/asr/gigaspeech/run.sh#L85) | 13.60 / 12.70 | 18 | | Baseline | [Espnet](https://github.com/espnet/espnet) | [Conformer/Transformer-AED](https://github.com/espnet/espnet/tree/master/egs2/gigaspeech/asr1) | GigaSpeech v1.0.0 XL | [model](https://zenodo.org/record/4630406) [example](https://github.com/espnet/espnet_model_zoo#asr) | 10.90 / 10.80 | 19 | | Baseline | [Kaldi](https://github.com/kaldi-asr/kaldi) | [Chain + RNNLM](https://github.com/kaldi-asr/kaldi/tree/master/egs/gigaspeech/s5/) | GigaSpeech v1.0.0 XL | model example | 14.78 / 14.84 | 20 | | Baseline | [Pika](https://github.com/tencent-ailab/pika) | [RNN-T](https://github.com/tencent-ailab/pika/tree/) | GigaSpeech v1.0.0 XL | model example | 12.30 / 12.30 | 21 | ||||| 22 | | Johns Hopkins University | [Icefall](https://github.com/k2-fsa/icefall) | [Transducer: Zipformer encoder + Embedding decoder](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/zipformer) | GigaSpeech v1.0.0 XL | [model](https://huggingface.co/yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17) [example](https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md#zipformer-zipformer--pruned-stateless-transducer) | 10.25 / 10.38 | 23 | | Johns Hopkins University | [Icefall](https://github.com/k2-fsa/icefall) | [Pruned Stateless 
RNN-T](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR) | GigaSpeech v1.0.0 XL | [model](https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2) [example](https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md#gigaspeech-bpe-training-results-pruned-transducer-2) | 10.40 / 10.51 | 24 | | Johns Hopkins University | [Icefall](https://github.com/k2-fsa/icefall) | [Conformer CTC +
ngram & attention rescoring](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR) | GigaSpeech v1.0.0 XL | [model](https://huggingface.co/wgb14/icefall-asr-gigaspeech-conformer-ctc) [example](https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md#gigaspeech-bpe-training-results-conformer-ctc) | 10.47 / 10.58 | 25 | | Mobvoi | [Wenet](https://github.com/wenet-e2e/wenet) | [Joint CTC/AED(U2++)](https://github.com/wenet-e2e/wenet/tree/main/examples/gigaspeech/s0) | GigaSpeech v1.0.0 XL | [model](http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/gigaspeech/20210811_conformer_bidecoder_exp.tar.gz) [example](https://github.com/wenet-e2e/wenet/blob/main/runtime/server/x86/README.md) | 10.70 / 10.60 | 26 | | ByteDance AI Lab | [NeurST](https://github.com/bytedance/neurst) | [Transformer-AED](https://github.com/bytedance/neurst/tree/master/examples/speech_transformer/gigaspeech) | GigaSpeech v1.0.0 XL | [model](https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/neurst/speech_to_text/gigaspeech/ckpt.tgz) [example](https://github.com/bytedance/neurst/tree/master/examples/speech_transformer/gigaspeech#models) | 11.89 / 11.60 | 27 | 28 | 29 | ## Dataset 30 | 31 | ### Audio Source 32 | * Language: English 33 | * 33,000+ hours for unsupervised/semi-supervised learning 34 | * 10,000 hours with high-quality human transcriptions for supervised learning 35 | 36 | | Audio Source | Transcribed Hours | Total Hours | Acoustic Condition | 37 | |:---------------|:-----------------:|:--------------:|:-------------------| 38 | | Audiobook | 2,655 | 11,982 |
  • Reading
  • Various ages and accents
| 39 | | Podcast | 3,498 | 9,254 |<br>
  • Clean or background music
  • Indoor
  • Near-field
  • Spontaneous
  • Various ages and accents
| 40 | | YouTube | 3,845 | 11,768 |<br>
  • Clean and noisy
  • Indoor and outdoor
  • Near- and far-field
  • Reading and spontaneous
  • Various ages and accents
  • | 41 | | ***total*** | ***10,000*** | ***33,005*** || 42 | 43 | 44 | ### Transcribed Training Subsets 45 | | Subset | Hours | Remarks | 46 | |:---------------:|:-------------:|:-------------| 47 | | XS | 10 | System building and debugging | 48 | | S | 250 | Quick research experiments | 49 | | M | 1,000 | Large-scale research experiments | 50 | | L | 2,500 | Medium-scale industrial experiments | 51 | | XL | 10,000 | Large-scale industrial experiments | 52 | 53 | Larger subsets are supersets of smaller subsets, e.g., subset `L` contains all the data from subset `M`. 54 | 55 | 56 | ### Transcribed Evaluation Subsets 57 | | Subset | Hours | Remarks | 58 | |:------:|:-----:|:--------| 59 | | Dev | 12 | Randomly selected from the crawled Podcast and YouTube Data | 60 | | Test | 40 | Part of the subset was randomly selected from the crawled Podcast and YouTube data; part of it was manually collected through other channels to have better coverage. | 61 | 62 | Evaluation subsets are annotated by ***professional human annotators*** 63 | 64 | 65 | ## Data Preparation Guidelines 66 | We maintain data preparation scripts for different speech recognition toolkits 67 | in this repository so that when we update the dataset (note, this is an evolving 68 | dataset), we don't have to update the scripts in the downstream toolkits. Data 69 | preparation scripts for different speech recognition toolkits are maintained in 70 | the `toolkits/` folder, e.g., `toolkits/kaldi` for the Kaldi speech recognition 71 | toolkit. 72 | 73 | ### Preparation Scripts 74 | To use the data preparation scripts, do the following in your toolkit (here we 75 | use Kaldi as an example) 76 | ```bash 77 | git clone https://github.com/SpeechColab/GigaSpeech.git 78 | 79 | cd GigaSpeech 80 | utils/download_gigaspeech.sh /disk1/audio_data/gigaspeech 81 | toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data 82 | cd .. 
83 | ``` 84 | 85 | ### Metadata walkthrough 86 | 87 | We save all the metadata information to a single JSON file named 88 | GigaSpeech.json. Below is a snip of this file: 89 | 90 | ```json 91 | { 92 | "dataset": "GigaSpeech", 93 | "language": "EN", 94 | "version": "v1.0.0", 95 | ... ... 96 | "audios": [ 97 | { 98 | "title": "The Architect of Hollywood", 99 | "url": "https://99percentinvisible.org/episode/the-architect-of-hollywood/download", 100 | "path": "audio/podcast/P0001/POD0000000025.opus", 101 | ... ... 102 | "segments": [ 103 | { 104 | "sid": "POD0000000025_S0000103", 105 | "speaker": "N/A", 106 | "begin_time": 780.31, 107 | "end_time": 783.13, 108 | "text_tn": "FOUR O'CLOCK TOMORROW AFTERNOON SAID WILLIAMS ", 109 | "subsets": [ 110 | "{XL}", 111 | "{L}" 112 | ] 113 | }, 114 | ... ... 115 | ], 116 | ... ... 117 | }, 118 | ... ... 119 | ] 120 | } 121 | ``` 122 | To use the corpus, users are expected to extract the relevant information from GigaSpeech.json. For example, for the speech recognition task, one should first follow the "audios" entry, and work out a list of audio files. One can then follow the "url" entry to download the original audio file, or "path" if preprocessed audio files have been downloaded to the disk. After that, for each audio file, one can follow the "segments" entry, and work out the trainable audio segments, as well as their corresponding transcripts. Of course, we also have various supplementary entries, such as "subsets", "md5", which will also be helpful for your task. 123 | 124 | The metadata file GigaSpeech.json is version controlled, and is supposed to get updated over the time. In future releases, we plan to add speaker information to the metadata file, so that it will be suitable for speaker identification/verification tasks. We also plan to add more data from different sources to increase the diversity. 
125 | 126 | We also provide some convenient command-line tools based on [jq](https://stedolan.github.io/jq/), e.g., [utils/ls_audio.sh](utils/ls_audios.sh), [utils/show_segment_info.sh](utils/show_segment_info.sh), [utils/ls_md5.sh](utils/ls_md5.sh). 127 | 128 | 129 | ### Audio Processing 130 | * `Resampling`: GigaSpeech audio files are resampled at 16 kHz sampling rate, and are compressed with the Opus format. The Opus compression, however, does not depend on the input sample rate; it uses the bandwidth instead. Timestamps are measured in 48 kHz units even if the full bandwidth is not used. Likewise, the output sample rate may be freely chosen. For example, audio can be input at 16 kHz yet be set to encode only narrowband audio. For this reason, we recommend our users to explicitly resample the decoded audio to 16 kHz sampling rate before training & testing. For opus-to-wav conversion, refer to our exampler tool [utils/opus_to_wav.py](utils/opus_to_wav.py) 131 | 132 | ### Text Pre-Processing 133 | * `Punctuations`: We keep 4 punctuations in the normalized text (see the `text_tn` entry in GigaSpeech.json) 134 | ``` 135 | 136 | 137 | 138 | 139 | ``` 140 | This allows researchers to explore directions such as end-to-end endpointing and punctuation restoration. If you don't need these, you can remove them for your own training. 141 | 142 | * `Garbage Utterance Tags`: The Dev/Test evaluation sets are annotated by human annotators. They are instructed to label the entire audio file without "gaps". So for non-speech segments, *garbage utterance tags* are used instead. We recommend our users to discard these utterances in your training. 
A *complete list* of these tags are: 143 | ``` 144 | 145 | 146 | 147 | 148 | ``` 149 | 150 | ### Text Post-Processing (before scoring) 151 | * `Conversational Fillers`: Spontaneous/Conversational speech contains conversational fillers such as: 152 | ``` 153 | 'UH', 'UHH', 'UM', 'EH', 'MM', 'HM', 'AH', 'HUH', 'HA', 'ER' 154 | ``` 155 | We recommend our users to remove these fillers from both hypothese and reference text before WER scoring, so that we will have apple-to-apple performance comparisons across different toolkits. See discussion on post-processing [here](https://github.com/SpeechColab/GigaSpeech/issues/24). We also provide a scoring tool [utils/gigaspeech_scoring.py](utils/gigaspeech_scoring.py) and this tool is used by all the toolkits reported in above leaderboard section. 156 | 157 | ### Add Support for a New Toolkit 158 | To add data preparation support for a new toolkit, please follow 159 | `toolkits/kaldi/gigaspeech_data_prep.sh` and add similar scripts for your own 160 | toolkit. For example, for ESPnet2, you would add 161 | `toolkits/espnet2/gigaspeech_data_prep.sh` to prepare the dataset, and all 162 | other related scripts should be maintained under `toolkits/espnet2`. 163 | 164 | ## Collaboration 165 | We are a group of volunteers trying to make speech technologies easier to use. We welcome any kind of contributions. Currently we are exploring the following directions. If you are interested in one of the directions, and you think you will be able to help, please contact gigaspeech@speechcolab.org. 
166 | 167 | * Inference architecture for different pre-trained models 168 | * Adding diverse audio source 169 | * Benchmarking speech algorithms/services 170 | * Building and releasing pre-trained models 171 | * Supporting more languages 172 | * Supporting more tasks through GigaSpeech.json (e.g., speaker ID) 173 | * Making new datasets with permissive licenses 174 | 175 | ## Institutional Contributors 176 | | Institution | Contribution | 177 | |:------|:-----| 178 | | [IEIT, Tsinghua University](http://www.tsinghua-ieit.com/) | Computing power; Data host; Researchers | 179 | | [Magic Data](https://www.magicdatatech.com/) | Data host mirror| 180 | | [speechocean](http://en.speechocean.com/) | Data host mirror; Evaluation data annotation | 181 | | [Xiaomi Corporation](https://www.mi.com/global/) | Computing power; Researchers | 182 | 183 | ## Citation 184 | Please cite our paper if you find this work useful: 185 | 186 | ```bibtext 187 | @inproceedings{GigaSpeech2021, 188 | title={GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio}, 189 | booktitle={Proc. Interspeech 2021}, 190 | year=2021, 191 | author={Guoguo Chen, Shuzhou Chai, Guanbo Wang, Jiayu Du, Wei-Qiang Zhang, Chao Weng, Dan Su, Daniel Povey, Jan Trmal, Junbo Zhang, Mingjie Jin, Sanjeev Khudanpur, Shinji Watanabe, Shuaijiang Zhao, Wei Zou, Xiangang Li, Xuchen Yao, Yongqing Wang, Yujun Wang, Zhao You, Zhiyong Yan} 192 | } 193 | ``` 194 | 195 | ## Contact 196 | If you have any concerns, please contact gigaspeech@speechcolab.org. 197 | 198 | ## Metadata Changelog 199 | * **07/23/2021 v1.0.0**: We found a bug in the metadata and fixed that. We made an exception and kept the version number the same because this **correct** version was used in the original experiments in the paper. 200 | * **07/05/2021 v1.0.0**: Initial release. 
201 | -------------------------------------------------------------------------------- /SAFEBOX/README.md: -------------------------------------------------------------------------------- 1 | This folder is used to hold private credential, e.g.: 2 | * aliyun_ossutil.cfg 3 | * password for decompression -------------------------------------------------------------------------------- /env_vars.sh: -------------------------------------------------------------------------------- 1 | # Download URL. 2 | # Distribution channel 1: Aliyun Object Storage Service, for invited paper 3 | # collaborators. Script utils/download_gigaspeech.sh host option "oss". 4 | GIGASPEECH_RELEASE_URL_OSS='oss://speechcolab/GigaSpeech/release/GigaSpeech' 5 | 6 | # Distribution Channel 2: Tsinghua Host. Script utils/download_gigaspeech.sh 7 | # host option "tsinghua". 8 | GIGASPEECH_RELEASE_URL_TSINGHUA='http://aidata.tsinghua-ieit.com/GigaSpeech' 9 | 10 | # Distribution Channel 3: Haitian Host. Script utils/download_gigaspeech.sh 11 | # host option "speechocean". 
12 | GIGASPEECH_RELEASE_URL_SPEECHOCEAN='ftp://124.207.81.184/GigaSpeech' 13 | 14 | # Distribution Channel 4: MagicData Host 15 | GIGASPEECH_RELEASE_URL_MAGICDATA='https://freedata.oss-cn-beijing.aliyuncs.com/magichub/GigaSpeech' 16 | 17 | # Distribution Channel 5: From IPFS 18 | 19 | export PATH=$PWD:$PATH 20 | -------------------------------------------------------------------------------- /misc/debug/debug.list: -------------------------------------------------------------------------------- 1 | # last update: 2021.07.24, Jiayu 2 | # 3 | # A dummy host for download debugging 4 | # independent from official distribution host 5 | # total size: ~= 55M 6 | # 7 | # wget base url: 8 | # https://swaphub.oss-cn-hangzhou.aliyuncs.com/GigaSpeechDownloadDebug 9 | # 10 | # `tree GigaSpeechDownloadDebug` gives: 11 | # GigaSpeechDownloadDebug 12 | # ├── TERMS_OF_ACCESS 13 | # ├── audio 14 | # │   └── youtube 15 | # │   └── P0111.tgz.aes 16 | # └── dict.tgz.aes 17 | 683e162ebebabc6c3cd5fb42e72a9868 audio/youtube/P0111.tgz.aes 18 | e88de4ea902cd94e2551a85355e031ca dict.tgz.aes 19 | -------------------------------------------------------------------------------- /misc/metadata_versions.txt: -------------------------------------------------------------------------------- 1 | v0.2.5 6f89242dfa728cdbe3fdc935eb947460 2 | v1.0.0 19c777dc296ff3eb714bc677a80620a3 3 | -------------------------------------------------------------------------------- /toolkits/athena/extract_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf-8 3 | # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) 4 | # Athena Authors (Shuaijiang Zhao) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | import json 10 | 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser(description=""" 14 | This script is used to process raw json dataset of GigaSpeech, 15 | where the long wav is splitinto segments and 16 | data of 
def get_args():
    """Parse and return command-line arguments."""
    parser = argparse.ArgumentParser(description="""
      This script is used to process the raw json dataset of GigaSpeech,
      where the long wav is split into segments and
      data of Athena format is generated.
      """)
    # BUG FIX: default was the *string* 'False' (which is truthy); a
    # store_true flag must default to the boolean False.
    parser.add_argument('--pipe-format', action='store_true', default=False,
                        help="""If true, wav.scp is generated with pipeline format""")
    parser.add_argument('input_json', help="""Input json file of Gigaspeech""")
    parser.add_argument('output_dir', help="""Output dir for prepared data""")

    return parser.parse_args()


def meta_analysis(input_json, output_dir, pipe):
    """Convert GigaSpeech.json metadata into Athena-style data files.

    Writes utt2spk, utt2dur, utt2subsets, text, segments, wav.scp and
    reco2dur under output_dir. Malformed audio/segment entries are warned
    about and skipped rather than aborting the whole run.

    Args:
        input_json: path to GigaSpeech.json; audio paths inside it are
            resolved relative to its directory.
        output_dir: destination directory (created if missing).
        pipe: if truthy, wav.scp entries are ffmpeg decoding pipelines
            instead of plain opus paths.
    """
    input_dir = os.path.dirname(input_json)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        with open(input_json, 'r') as injson:
            json_data = json.load(injson)
    except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
        sys.exit(f'Failed to load input json file: {input_json}')

    if json_data['audios'] is None:
        return

    with open(f'{output_dir}/utt2spk', 'w') as utt2spk, \
         open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
         open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
         open(f'{output_dir}/text', 'w') as utt2text, \
         open(f'{output_dir}/segments', 'w') as segments, \
         open(f'{output_dir}/wav.scp', 'w') as wavscp, \
         open(f'{output_dir}/reco2dur', 'w') as reco2dur:
        for long_audio in json_data['audios']:
            # BUG FIX: fetch aid first so the warning messages below can
            # never hit an unbound name (the original read it after keys
            # that could raise).
            aid = long_audio.get('aid', '<unknown-aid>')
            try:
                long_audio_path = os.path.realpath(
                    os.path.join(input_dir, long_audio['path']))
                segments_lists = long_audio['segments']
                duration = long_audio['duration']
                audio_format = long_audio['format']
                sample_rate = long_audio['sample_rate']
            except KeyError:
                print(f'Warning: {aid} something is wrong, missing metadata key, skipped')
                continue
            # Explicit validation instead of assert-as-control-flow.
            if audio_format != 'opus' or sample_rate != 16000 or \
                    not os.path.exists(long_audio_path):
                print(f'Warning: {aid} something is wrong, maybe the error path: '
                      f'{long_audio_path}, skipped')
                continue
            if pipe:
                wavscp.write(f'{aid}\tffmpeg -i {long_audio_path} -ar 16000 -f wav pipe:1 |\n')
            else:
                wavscp.write(f'{aid}\t{long_audio_path}\n')
            reco2dur.write(f'{aid}\t{duration}\n')
            for segment_file in segments_lists:
                try:
                    sid = segment_file['sid']
                    start_time = segment_file['begin_time']
                    end_time = segment_file['end_time']
                    text = segment_file['text_tn']
                    segment_subsets = segment_file['subsets']
                except KeyError:
                    print(f'Warning: {segment_file} something is wrong, skipped')
                    continue
                dur = end_time - start_time
                utt2spk.write(f'{sid}\t{sid}\n')
                utt2dur.write(f'{sid}\t{dur}\n')
                utt2text.write(f'{sid}\t{text}\n')
                segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
                utt2subsets.write(f'{sid}\t{" ".join(segment_subsets)}\n')


def main():
    args = get_args()
    meta_analysis(args.input_json, args.output_dir, args.pipe_format)


if __name__ == '__main__':
    main()
# ==============================================================================
# reference https://github.com/SpeechColab/GigaSpeech/tree/main/utils

import os
import re
import sys
import json

SUBSETS = ["XL", "DEV", "TEST"]
# NOTE(review): these two regex patterns were garbled to "|||" in this copy of
# the file; "|||" matches the empty string, which silently discarded every
# utterance and corrupted every substitution.  The values below are the
# standard GigaSpeech garbage/punctuation tags -- confirm against upstream.
garbage_utterance_tags = "<SIL>|<MUSIC>|<NOISE>|<OTHER>"
punctuation_tags = "<COMMA>|<PERIOD>|<QUESTIONMARK>|<EXCLAMATIONPOINT>"


def extract_json(json_file='', output_dir=''):
    """Extract text/segments/utt2subsets/opus.scp from GigaSpeech.json.

    Audio paths are resolved relative to the json file's directory; invalid
    audio or segment entries are warned about and skipped.
    """
    input_dir = os.path.dirname(json_file)
    try:
        with open(json_file, 'r') as injson:
            json_data = json.load(injson)
    except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
        sys.exit(f'Failed to load input json file: {json_file}')

    if json_data['audios'] is None:
        return

    with open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
         open(f'{output_dir}/text', 'w') as utt2text, \
         open(f'{output_dir}/segments', 'w') as segments, \
         open(f'{output_dir}/opus.scp', 'w') as wavscp:
        for long_audio in json_data['audios']:
            # Fetch aid first so the warnings below never see an unbound name.
            aid = long_audio.get('aid', '<unknown-aid>')
            try:
                long_audio_path = os.path.realpath(
                    os.path.join(input_dir, long_audio['path']))
                segments_lists = long_audio['segments']
                audio_format = long_audio['format']
                sample_rate = long_audio['sample_rate']
            except KeyError:
                print(f'Warning: {aid} something is wrong, missing metadata key, skipped')
                continue
            if audio_format != 'opus' or sample_rate != 16000 or \
                    not os.path.exists(long_audio_path):
                print(f'Warning: {aid} something is wrong, maybe the error path: '
                      f'{long_audio_path}, skipped')
                continue
            wavscp.write(f'{aid}\t{long_audio_path}\n')
            for segment_file in segments_lists:
                try:
                    sid = segment_file['sid']
                    start_time = segment_file['begin_time']
                    end_time = segment_file['end_time']
                    text = segment_file['text_tn']
                    segment_subsets = segment_file['subsets']
                except KeyError:
                    print(f'Warning: {segment_file} something is wrong, skipped')
                    continue
                utt2text.write(f'{sid}\t{text}\n')
                segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
                utt2subsets.write(f'{sid}\t{" ".join(segment_subsets)}\n')


def convert_opus2wav(opus_scp='', wav_scp='', rm_opus=False):
    """Decode every opus file listed in opus_scp to 16 kHz mono wav.

    Requires the ffmpeg binary on PATH.  Writes a matching wav.scp; aborts
    on the first ffmpeg failure.  If rm_opus is true, each opus file is
    removed after successful conversion.
    """
    with open(opus_scp, 'r') as oscp, open(wav_scp, 'w') as wscp:
        for line in oscp:
            line = line.strip()
            if not line:  # tolerate blank lines
                continue
            utt, opus_path = re.split(r'\s+', line, maxsplit=1)
            wav_path = opus_path.replace('.opus', '.wav')
            cmd = f'ffmpeg -y -i {opus_path} -ac 1 -ar 16000 {wav_path}'
            # BUG FIX: os.system() does not raise on command failure, so the
            # original try/except never triggered; check the exit status.
            if os.system(cmd) != 0:
                sys.exit(f'Failed to run the cmd: {cmd}')
            wscp.write(f'{utt}\t{wav_path}\n')
            if rm_opus:
                os.remove(opus_path)


def prepare_data(data_dir='', subset='XL'):
    """Build <subset>.csv (wav_filename, wav_len, transcript, speaker).

    Reads utt2subsets/text/segments/wav.scp under data_dir, drops garbage
    utterances, strips punctuation tags and keeps only utterances belonging
    to the requested subset.
    """
    subset_file = os.path.join(data_dir, 'utt2subsets')
    text_file = os.path.join(data_dir, 'text')
    segment_file = os.path.join(data_dir, 'segments')
    wav_scp = os.path.join(data_dir, 'wav.scp')
    out_f = os.path.join(data_dir, subset + '.csv')

    # utterance id -> list of subset tags like '{XL}'
    subset_dict = {}
    with open(subset_file) as subset_in:
        for line in subset_in:
            line_list = line.strip().split()
            subset_dict[line_list[0]] = line_list[1:]

    with open(text_file) as text_in:
        text_lines = text_in.readlines()

    # utterance id -> duration in milliseconds (as a string)
    time_d = {}
    with open(segment_file) as segment_in:
        for seg_line in segment_in:
            item = seg_line.strip().split('\t')
            time_d[item[0]] = str(int((float(item[3]) - float(item[2])) * 1000))

    text_d = {}
    for line in text_lines:
        utt_key = line.split('\t')[0]
        # Robustness: sids may contain more than one underscore; only the
        # leading field is the speaker/recording id.
        speaker = utt_key.split('_')[0]
        if speaker not in text_d:
            text_d[speaker] = []
        transcriptions = line.split(utt_key)[1].strip()
        if utt_key in time_d:
            if re.search(garbage_utterance_tags, transcriptions):
                continue
            if '{' + subset + '}' not in subset_dict[utt_key]:
                continue
            # Remove the punctuation tags, then collapse runs of spaces left
            # behind by the removal.
            transcriptions = re.sub(punctuation_tags, "", transcriptions)
            transcriptions = re.sub(" +", " ", transcriptions).strip()
            text_d[speaker].append(utt_key + '\t' + time_d[utt_key] +
                                   '\t' + transcriptions + '\t' + speaker)

    with open(wav_scp) as wav_in:
        recording_ids = {line.split('\t')[0] for line in wav_in}

    with open(out_f, 'w') as out:
        out.write('wav_filename\twav_len\ttranscript\tspeaker\n')
        for speaker in text_d:
            if speaker in recording_ids:
                for utt_sample in text_d[speaker]:
                    out.write(utt_sample + '\n')


if __name__ == '__main__':
    # absl is a script-only dependency: importing it lazily keeps this module
    # importable (e.g. for unit tests) without absl installed.
    from absl import logging
    logging.set_verbosity(logging.INFO)
    if len(sys.argv) < 3:
        print('Usage: python {} dataset_dir output_dir\n'
              '       dataset_dir : directory contains GigaSpeech dataset\n'
              '       output_dir  : GigaSpeech data working directory'.format(sys.argv[0]))
        sys.exit(1)
    DATASET_DIR = sys.argv[1]
    OUTPUT_DIR = sys.argv[2]
    json_file = os.path.join(DATASET_DIR, "GigaSpeech.json")
    extract_json(json_file=json_file, output_dir=OUTPUT_DIR)

    print(f'Converting opus to wave, please be patient')
    opus_scp = os.path.join(OUTPUT_DIR, 'opus.scp')
    wav_scp = os.path.join(OUTPUT_DIR, 'wav.scp')
    convert_opus2wav(opus_scp, wav_scp, False)

    for subset in SUBSETS:
        prepare_data(data_dir=OUTPUT_DIR, subset=subset)
def get_args():
    """Parse and return command-line arguments."""
    parser = argparse.ArgumentParser(description="""
      This script is used to process the raw json dataset of GigaSpeech,
      where the long wav is split into segments and
      data of kaldi format is generated.
      """)
    # BUG FIX: default was the *string* 'False' (which is truthy); a
    # store_true flag must default to the boolean False.
    parser.add_argument('--pipe-format', action='store_true', default=False,
                        help="""If true, wav.scp is generated with pipeline format""")
    parser.add_argument('input_json', help="""Input json file of Gigaspeech""")
    parser.add_argument('output_dir', help="""Output dir for prepared data""")

    return parser.parse_args()


def meta_analysis(input_json, output_dir, pipe):
    """Convert GigaSpeech.json metadata into Kaldi-style data files.

    Writes utt2spk, utt2dur, utt2subsets, text, segments, wav.scp and
    reco2dur under output_dir. Malformed audio/segment entries are warned
    about and skipped rather than aborting the whole run.

    Args:
        input_json: path to GigaSpeech.json; audio paths inside it are
            resolved relative to its directory.
        output_dir: destination directory (created if missing).
        pipe: if truthy, wav.scp entries are ffmpeg decoding pipelines
            instead of plain opus paths.
    """
    input_dir = os.path.dirname(input_json)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        with open(input_json, 'r') as injson:
            json_data = json.load(injson)
    except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
        sys.exit(f'Failed to load input json file: {input_json}')

    if json_data['audios'] is None:
        return

    with open(f'{output_dir}/utt2spk', 'w') as utt2spk, \
         open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
         open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
         open(f'{output_dir}/text', 'w') as utt2text, \
         open(f'{output_dir}/segments', 'w') as segments, \
         open(f'{output_dir}/wav.scp', 'w') as wavscp, \
         open(f'{output_dir}/reco2dur', 'w') as reco2dur:
        for long_audio in json_data['audios']:
            # BUG FIX: fetch aid first so the warning messages below can
            # never hit an unbound name (the original read it after keys
            # that could raise).
            aid = long_audio.get('aid', '<unknown-aid>')
            try:
                long_audio_path = os.path.realpath(
                    os.path.join(input_dir, long_audio['path']))
                segments_lists = long_audio['segments']
                duration = long_audio['duration']
                audio_format = long_audio['format']
                sample_rate = long_audio['sample_rate']
            except KeyError:
                print(f'Warning: {aid} something is wrong, missing metadata key, skipped')
                continue
            # Explicit validation instead of assert-as-control-flow.
            if audio_format != 'opus' or sample_rate != 16000 or \
                    not os.path.exists(long_audio_path):
                print(f'Warning: {aid} something is wrong, maybe the error path: '
                      f'{long_audio_path}, skipped')
                continue
            if pipe:
                wavscp.write(f'{aid}\tffmpeg -i {long_audio_path} -ar 16000 -f wav pipe:1 |\n')
            else:
                wavscp.write(f'{aid}\t{long_audio_path}\n')
            reco2dur.write(f'{aid}\t{duration}\n')
            for segment_file in segments_lists:
                try:
                    sid = segment_file['sid']
                    start_time = segment_file['begin_time']
                    end_time = segment_file['end_time']
                    text = segment_file['text_tn']
                    segment_subsets = segment_file['subsets']
                except KeyError:
                    print(f'Warning: {segment_file} something is wrong, skipped')
                    continue
                dur = end_time - start_time
                utt2spk.write(f'{sid}\t{sid}\n')
                utt2dur.write(f'{sid}\t{dur}\n')
                utt2text.write(f'{sid}\t{text}\n')
                segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
                utt2subsets.write(f'{sid}\t{" ".join(segment_subsets)}\n')


def main():
    args = get_args()
    meta_analysis(args.input_json, args.output_dir, args.pipe_format)


if __name__ == '__main__':
    main()
#!/usr/bin/env bash
# Copyright 2021  Xiaomi Corporation (Author: Yongqing Wang)
#                 Seasalt AI, Inc (Author: Guoguo Chen)
#
# Takes the GigaSpeech source directory and prepares Kaldi-format data dirs.

set -e
set -o pipefail

stage=1
prefix=gigaspeech
# NOTE(review): the two tag lists below were garbled (emptied) in this copy of
# the file; the values are the standard GigaSpeech non-speech/punctuation
# tags -- confirm against the upstream repository.
garbage_utterance_tags="<SIL> <MUSIC> <NOISE> <OTHER>"
punctuation_tags="<COMMA> <PERIOD> <QUESTIONMARK> <EXCLAMATIONPOINT>"
train_subset=XL

. ./utils/parse_options.sh || exit 1;

# filter_by_id <id-list> <input> <output> [field]
# Keeps the lines of <input> whose <field>-th column (default: first) appears
# in the first column of <id-list>.
filter_by_id () {
  idlist=$1
  input=$2
  output=$3
  field=1
  if [ $# -eq 4 ]; then
    field=$4
  fi
  cat $input | perl -se '
    open(F, "<$idlist") || die "Could not open id-list file $idlist";
    while(<F>) {  # NOTE(review): "<F>" restored; the filehandle was stripped in this copy
      @A = split;
      @A>=1 || die "Invalid id-list file line $_";
      $seen{$A[0]} = 1;
    }
    while(<>) {
      @A = split;
      @A > 0 || die "Invalid file line $_";
      @A >= $field || die "Invalid file line $_";
      if ($seen{$A[$field-1]}) {
        print $_;
      }
    }' -- -idlist="$idlist" -field="$field" > $output ||\
    (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}

# subset_data_dir <utt-list> <src-dir> <dest-dir>
# Restricts a Kaldi data dir to the utterances in <utt-list>.
subset_data_dir () {
  utt_list=$1
  src_dir=$2
  dest_dir=$3
  mkdir -p $dest_dir || exit 1;
  # wav.scp utt2spk text segments utt2dur reco2dur spk2utt
  filter_by_id $utt_list $src_dir/utt2spk $dest_dir/utt2spk ||\
    (echo "$0: subset_data_dir() error: $src_dir/utt2spk" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/spk2utt $dest_dir/spk2utt 2 ||\
    (echo "$0: subset_data_dir() error: $src_dir/spk2utt" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
    (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
    (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
    (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
  # Recordings referenced by the kept segments drive wav.scp/reco2dur.
  awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
  filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
    (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
  filter_by_id $dest_dir/reco $src_dir/reco2dur $dest_dir/reco2dur ||\
    (echo "$0: subset_data_dir() error: $src_dir/reco2dur" && exit 1) || exit 1;
  rm -f $dest_dir/reco
}

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <gigaspeech-dir> <data-dir>"
  echo " e.g.: $0 --train-subset XL /disk1/audio_data/gigaspeech/ data/"
  echo ""
  echo "This script takes the GigaSpeech source directory, and prepares the"
  echo "Kaldi format data directory."
  echo "  --garbage-utterance-tags <tags>   # Tags for non-speech."
  echo "  --prefix <prefix>                 # Prefix for output data directory."
  echo "  --punctuation-tags <tags>         # Tags for punctuations."
  echo "  --stage <stage>                   # Processing stage."
  echo "  --train-subset <subset>           # Train subset to be created."
  exit 1
fi

gigaspeech_dir=$1
data_dir=$2

declare -A subsets
subsets=(
  [XL]="train_xl"
  [L]="train_l"
  [M]="train_m"
  [S]="train_s"
  [XS]="train_xs"
  [DEV]="dev"
  [TEST]="test")
prefix=${prefix:+${prefix}_}

corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
  echo "$0: Extract meta into $corpus_dir"
  # Sanity check.
  [ ! -f $gigaspeech_dir/GigaSpeech.json ] &&\
    echo "$0: Please download $gigaspeech_dir/GigaSpeech.json!" && exit 1;
  [ ! -d $gigaspeech_dir/audio ] &&\
    echo "$0: Please download $gigaspeech_dir/audio!" && exit 1;

  [ ! -d $corpus_dir ] && mkdir -p $corpus_dir

  # Files to be created:
  # wav.scp utt2spk text and segments utt2dur reco2dur spk2utt
  python3 toolkits/kaldi/extract_meta.py \
    --pipe-format $gigaspeech_dir/GigaSpeech.json $corpus_dir || exit 1;
  utt2spk=$corpus_dir/utt2spk
  spk2utt=$corpus_dir/spk2utt
  toolkits/kaldi/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt ||\
    (echo "$0: utt2spk to spk2utt" && exit 1) || exit 1;
fi

if [ $stage -le 2 ]; then
  echo "$0: Filter $corpus_dir/text"
  # Delete utterances with garbage meta tags
  for tag in $garbage_utterance_tags; do
    sed -i "/${tag}/d" $corpus_dir/text
  done

  # Delete punctuations in utterances
  for tag in $punctuation_tags; do
    sed -i "s/${tag}//g" $corpus_dir/text
  done

  # Ensure space only appears once and utt is seprated with others by '\t'
  sed -i 's/\t/ /g' $corpus_dir/text
  sed -i 's/[ ][ ]*/ /g' $corpus_dir/text
  sed -i 's/ /\t/' $corpus_dir/text
fi

if [ $stage -le 3 ]; then
  echo "$0: Split data to train, dev and test"
  # Split data to train, dev and test.
  [ ! -f $corpus_dir/utt2subsets ] &&\
    echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
  for label in $train_subset DEV TEST; do
    if [ ! ${subsets[$label]+set} ]; then
      echo "$0: Subset $label is not defined in GigaSpeech.json." && exit 1;
    fi
    subset=${subsets[$label]}
    [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
    grep "{$label}" $corpus_dir/utt2subsets \
      > $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
    subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
      $corpus_dir $data_dir/${prefix}$subset || exit 1;
  done
fi

echo "$0: Done"
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Apache 2.0.
#
# Converts an utt2spk file ("<utt> <spk>" per line) to a spk2utt file
# ("<spk> <utt1> <utt2> ..." per line).  Reads stdin or an optional file
# argument; writes to stdout.  Speaker order follows first appearance.

if ( @ARGV > 1 ) {
  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}

while (<>) {
  my @fields = split(" ", $_);
  @fields == 2 || die "Invalid line in utt2spk file: $_";
  my ($utt, $spk) = @fields;
  unless ($seen_spk{$spk}) {
    # Remember each speaker the first time it appears, preserving order.
    $seen_spk{$spk} = 1;
    push @spklist, $spk;
  }
  push @{$spk_hash{$spk}}, $utt;
}

foreach my $spk (@spklist) {
  print "$spk " . join(' ', @{$spk_hash{$spk}}) . "\n";
}
def get_args():
    """Parse and return command-line arguments."""
    parser = argparse.ArgumentParser(description="""
      This script is used to process the raw json dataset of GigaSpeech,
      where the long wav is split into segments and
      data of wenet format is generated.
      """)
    parser.add_argument('input_json', help="""Input json file of Gigaspeech""")
    parser.add_argument('output_dir', help="""Output dir for prepared data""")

    return parser.parse_args()


def meta_analysis(input_json, output_dir):
    """Convert GigaSpeech.json metadata into WeNet-style data files.

    Writes text, segments, utt2dur, wav.scp, utt2subsets and reco2dur under
    output_dir. Malformed audio/segment entries are warned about and skipped
    rather than aborting the whole run.

    Args:
        input_json: path to GigaSpeech.json; audio paths inside it are
            resolved relative to its directory.
        output_dir: destination directory (created if missing).
    """
    input_dir = os.path.dirname(input_json)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        with open(input_json, 'r') as injson:
            json_data = json.load(injson)
    except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
        sys.exit(f'Failed to load input json file: {input_json}')

    if json_data['audios'] is None:
        return

    with open(f'{output_dir}/text', 'w') as utt2text, \
         open(f'{output_dir}/segments', 'w') as segments, \
         open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
         open(f'{output_dir}/wav.scp', 'w') as wavscp, \
         open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
         open(f'{output_dir}/reco2dur', 'w') as reco2dur:
        for long_audio in json_data['audios']:
            # BUG FIX: fetch aid first so the warning messages below can
            # never hit an unbound name (the original read it after keys
            # that could raise).
            aid = long_audio.get('aid', '<unknown-aid>')
            try:
                long_audio_path = os.path.realpath(
                    os.path.join(input_dir, long_audio['path']))
                segments_lists = long_audio['segments']
                duration = long_audio['duration']
                audio_format = long_audio['format']
                sample_rate = long_audio['sample_rate']
            except KeyError:
                print(f'Warning: {aid} something is wrong, missing metadata key, skipped')
                continue
            # Explicit validation instead of assert-as-control-flow.
            if audio_format != 'opus' or sample_rate != 16000 or \
                    not os.path.exists(long_audio_path):
                print(f'Warning: {aid} something is wrong, maybe the error path: '
                      f'{long_audio_path}, skipped')
                continue
            wavscp.write(f'{aid}\t{long_audio_path}\n')
            reco2dur.write(f'{aid}\t{duration}\n')
            for segment_file in segments_lists:
                try:
                    sid = segment_file['sid']
                    start_time = segment_file['begin_time']
                    end_time = segment_file['end_time']
                    text = segment_file['text_tn']
                    segment_subsets = segment_file['subsets']
                except KeyError:
                    print(f'Warning: {segment_file} something is wrong, skipped')
                    continue
                dur = end_time - start_time
                utt2text.write(f'{sid}\t{text}\n')
                segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
                utt2dur.write(f'{sid}\t{dur}\n')
                utt2subsets.write(f'{sid}\t{" ".join(segment_subsets)}\n')


def main():
    args = get_args()
    meta_analysis(args.input_json, args.output_dir)


if __name__ == '__main__':
    main()
#!/usr/bin/env bash
# Copyright 2021  Xiaomi Corporation (Author: Yongqing Wang)
#                 Seasalt AI, Inc (Author: Guoguo Chen)
#                 Mobvoi Corporation (Author: Di Wu)
#
# Takes the GigaSpeech source directory and prepares WeNet-format data dirs.

set -e
set -o pipefail

stage=1
prefix=gigaspeech
# NOTE(review): the two tag lists below were garbled (emptied) in this copy of
# the file; the values are the standard GigaSpeech non-speech/punctuation
# tags -- confirm against the upstream repository.
garbage_utterance_tags="<SIL> <MUSIC> <NOISE> <OTHER>"
punctuation_tags="<COMMA> <PERIOD> <QUESTIONMARK> <EXCLAMATIONPOINT>"
train_subset=XL

. ./utils/parse_options.sh || exit 1;

# filter_by_id <id-list> <input> <output> [field]
# Keeps the lines of <input> whose <field>-th column (default: first) appears
# in the first column of <id-list>.
filter_by_id () {
  idlist=$1
  input=$2
  output=$3
  field=1
  if [ $# -eq 4 ]; then
    field=$4
  fi
  cat $input | perl -se '
    open(F, "<$idlist") || die "Could not open id-list file $idlist";
    while(<F>) {  # NOTE(review): "<F>" restored; the filehandle was stripped in this copy
      @A = split;
      @A>=1 || die "Invalid id-list file line $_";
      $seen{$A[0]} = 1;
    }
    while(<>) {
      @A = split;
      @A > 0 || die "Invalid file line $_";
      @A >= $field || die "Invalid file line $_";
      if ($seen{$A[$field-1]}) {
        print $_;
      }
    }' -- -idlist="$idlist" -field="$field" > $output ||\
    (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}

# subset_data_dir <utt-list> <src-dir> <dest-dir>
# Restricts a WeNet data dir to the utterances in <utt-list>.
subset_data_dir () {
  utt_list=$1
  src_dir=$2
  dest_dir=$3
  mkdir -p $dest_dir || exit 1;
  # wav.scp text segments utt2dur
  filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
    (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
    (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
  filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
    (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
  # Recordings referenced by the kept segments drive wav.scp.
  awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
  filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
    (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
  rm -f $dest_dir/reco
}

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <gigaspeech-dir> <data-dir>"
  echo " e.g.: $0 --train-subset XL /disk1/audio_data/gigaspeech/ data/"
  echo ""
  echo "This script takes the GigaSpeech source directory, and prepares the"
  echo "WeNet format data directory."
  echo "  --garbage-utterance-tags <tags>   # Tags for non-speech."
  echo "  --prefix <prefix>                 # Prefix for output data directory."
  echo "  --punctuation-tags <tags>         # Tags for punctuations."
  echo "  --stage <stage>                   # Processing stage."
  echo "  --train-subset <subset>           # Train subset to be created."
  exit 1
fi

gigaspeech_dir=$1
data_dir=$2

declare -A subsets
subsets=(
  [XL]="train_xl"
  [L]="train_l"
  [M]="train_m"
  [S]="train_s"
  [XS]="train_xs"
  [DEV]="dev"
  [TEST]="test")
prefix=${prefix:+${prefix}_}

corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
  echo "$0: Extract meta into $corpus_dir"
  # Sanity check.
  [ ! -f $gigaspeech_dir/GigaSpeech.json ] &&\
    echo "$0: Please download $gigaspeech_dir/GigaSpeech.json!" && exit 1;
  [ ! -d $gigaspeech_dir/audio ] &&\
    echo "$0: Please download $gigaspeech_dir/audio!" && exit 1;

  [ ! -d $corpus_dir ] && mkdir -p $corpus_dir

  # Files to be created:
  # wav.scp text segments utt2dur
  python3 toolkits/wenet/extract_meta.py \
    $gigaspeech_dir/GigaSpeech.json $corpus_dir || exit 1;
fi

if [ $stage -le 2 ]; then
  echo "$0: Filter $corpus_dir/text"
  # Delete utterances with garbage meta tags
  for tag in $garbage_utterance_tags; do
    sed -i "/${tag}/d" $corpus_dir/text
  done

  # Delete punctuations in utterances
  for tag in $punctuation_tags; do
    sed -i "s/${tag}//g" $corpus_dir/text
  done

  # Ensure space only appears once and utt is seprated with others by '\t'
  sed -i 's/\t/ /g' $corpus_dir/text
  sed -i 's/[ ][ ]*/ /g' $corpus_dir/text
  sed -i 's/ /\t/' $corpus_dir/text
fi

if [ $stage -le 3 ]; then
  echo "$0: Split data to train, dev and test"
  # Split data to train, dev and test.
  [ ! -f $corpus_dir/utt2subsets ] &&\
    echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
  for label in $train_subset DEV TEST; do
    if [ ! ${subsets[$label]+set} ]; then
      echo "$0: Subset $label is not defined in GigaSpeech.json." && exit 1;
    fi
    subset=${subsets[$label]}
    [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
    grep "{$label}" $corpus_dir/utt2subsets \
      > $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
    subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
      $corpus_dir $data_dir/${prefix}$subset || exit 1;
  done
fi

echo "$0: Done"
#!/usr/bin/env bash
# Copyright 2021  Jiayu DU
#                 Seasalt AI, Inc (Author: Guoguo Chen)
#
# Verifies local GigaSpeech audio files against the md5s listed by
# utils/ls_md5.sh (derived from GigaSpeech.json).

set -e
set -o pipefail

if [ $# -ne 1 ]; then
  echo "Usage: $0 <gigaspeech-dataset-dir>"
  echo " e.g.: $0 /disk1/audio_data/gigaspeech"
  echo ""
  echo "This script tries to detect errors in the downloaded audio files "
  echo "by comparing your local audio files' md5 with those in GigaSpeech.json"
  exit 1
fi

gigaspeech_dataset_dir=$1

failed=false
if [[ `uname -s` == "Linux" ]]; then
  if ! which md5sum >/dev/null; then
    echo "$0: Please install md5sum"
    exit 1
  fi
  utils/ls_md5.sh $gigaspeech_dataset_dir | (while read line; do
    # BUG FIX: with "set -e -o pipefail", a failing md5sum pipeline aborted
    # the subshell before "$?" could be inspected; guard it with "if !" so a
    # mismatch is reported instead of killing the script.
    if ! echo $line | md5sum -c --strict --quiet --status 2>/dev/null; then
      echo "$0: md5 verification failed for: \"$line\""
      failed=true
    fi
  done

  if [ "$failed" = true ]; then
    echo "$0: md5 verification failed, check the above logs."
    exit 1
  fi) || exit 1
elif [[ `uname -s` == "Darwin" ]]; then
  if ! which md5 >/dev/null; then
    echo "$0: Please install md5"
    exit 1
  fi
  utils/ls_md5.sh $gigaspeech_dataset_dir | (while read line; do
    checksum=`echo $line | awk '{print $1}'`
    file=`echo $line | awk '{print $2}'`
    checksum_from_file=`md5 -q $file`
    if [[ "$checksum_from_file" != "$checksum" ]]; then
      echo "$0: md5 verification failed for: \"$line\""
      failed=true
    fi
  done

  if [ "$failed" = true ]; then
    echo "$0: md5 verification failed, check the above logs."
    exit 1
  fi) || exit 1
else
  echo "$0: $0 only supports Linux and Mac OS"
  exit 1
fi

echo "$0: Successfully verified audio files."
#!/usr/bin/env bash
# Copyright 2021  Jiayu DU
#                 Seasalt AI, Inc (Author: Guoguo Chen)
#
# Verifies the downloaded GigaSpeech.json metadata by comparing its md5
# against the known values in misc/metadata_versions.txt.

set -e
set -o pipefail

if [ $# -ne 1 ]; then
  echo "Usage: $0 <gigaspeech-dataset-dir>"
  echo " e.g.: $0 /disk1/audio_data/gigaspeech"
  echo ""
  echo "This script tries to detect errors in the downloaded metadata "
  echo "by checking the md5 value. You can find the expected md5 value in"
  echo "misc/metadata_versions.txt."
  exit 1
fi

gigaspeech_dataset_dir=$1

if [ ! -f $gigaspeech_dataset_dir/GigaSpeech.json ]; then
  echo "$0: Metadata $gigaspeech_dataset_dir/GigaSpeech.json does not exist."
  # BUG FIX: a bare "exit" returned status 0 here, so a missing metadata file
  # was reported as success to callers.
  exit 1
fi

verified="false"
local_version=$(utils/extract_metadata_version.sh $gigaspeech_dataset_dir)
if [[ `uname -s` == "Linux" ]]; then
  if ! which md5sum >/dev/null; then
    echo "$0: Please install md5sum"
    exit 1
  fi
  local_md5=$(md5sum $gigaspeech_dataset_dir/GigaSpeech.json | awk '{print $1}')
elif [[ `uname -s` == "Darwin" ]]; then
  if ! which md5 >/dev/null; then
    echo "$0: Please install md5"
    exit 1
  fi
  local_md5=$(md5 -r $gigaspeech_dataset_dir/GigaSpeech.json | awk '{print $1}')
else
  echo "$0: only supports Linux and Mac OS"
  exit 1
fi

grep -v '^#' misc/metadata_versions.txt | (while read line; do
  version=$(echo $line | awk '{print $1}')
  md5=$(echo $line | awk '{print $2}')
  if [[ "$local_version" == "$version" ]]; then
    if [[ "$local_md5" == "$md5" ]]; then
      echo "$0: Successfully verified metadata version:$version, md5:$md5"
      verified="true"
    else
      echo "$0: ERROR, $local_version expects md5=$md5, got $local_md5"
      exit 1;
    fi
  fi
done

if [[ "$verified" == "false" ]]; then
  echo "$0: md5 verification failed for unknown version $local_version"
  exit 1
fi) || exit 1;

echo "$0: Done md5 verification."
# Set up the download speed test.  The test file is downloaded into /tmp and
# removed afterwards.
test_local_file="GigaSpeech.json.gz.aes"

# check_download_speed <wget-command>
#
# Runs the given download command in the background for at most 30 seconds
# and prints the measured speed in MB/s (3 decimal places, via bc).  Prints
# "0" when nothing was downloaded (host down / unreachable).
#
# Fix over the original: when the job finishes early (after t < 30 seconds)
# the speed is now divided by the actual elapsed time instead of the full
# 30-second budget, which previously underestimated fast hosts.  The
# duplicated size/speed computation is also folded into a single path.
check_download_speed() {
  rm -f "/tmp/$test_local_file"
  local duration=30
  eval "$1 &" || exit 1;
  local jobid=$!
  # Clean up the background job and the partial file on Ctrl-C.
  trap "kill $jobid 2>/dev/null; rm -f /tmp/$test_local_file; exit 1" INT

  # Wait up to $duration seconds, but stop as soon as the job finishes.
  local elapsed=$duration
  for t in `seq 1 $duration`; do
    if ! ps -p $jobid > /dev/null; then
      elapsed=$t   # Job finished early; use the actual elapsed time.
      break
    fi
    sleep 1
  done

  # If the job is still alive after the time budget, kill it.
  if ps -p $jobid > /dev/null; then
    kill $jobid || exit 1;
  fi

  # Compute speed from whatever was downloaded within $elapsed seconds.
  if [[ -f "/tmp/$test_local_file" ]]; then
    local file_size="$(du -sk /tmp/$test_local_file | cut -f1)"
    rm -f "/tmp/$test_local_file"
    echo "$(echo "scale=3; $file_size/1024/$elapsed" | bc)"
  else
    echo "0"
  fi
}
136 | wget_cmd="wget -c -t 20 -T 90 -P /tmp" 137 | wget_cmd="$wget_cmd $GIGASPEECH_RELEASE_URL_MAGICDATA/GigaSpeech.json.gz.aes" 138 | magicdata_speed=$(check_download_speed "$wget_cmd") 139 | if [ $(echo "$speed < $magicdata_speed" | bc) = 1 ]; then 140 | host=magicdata 141 | speed=$magicdata_speed 142 | fi 143 | echo; echo "$0: The Magic Data host speed: $magicdata_speed MB/s."; echo; 144 | 145 | # Check if there is available host. 146 | if [ $(echo "$speed == 0" | bc) = 1 ]; then 147 | echo "$0: All hosts are down..." 148 | exit 1; 149 | fi 150 | echo; echo "$0: Using $host host, speed is $speed MB/s."; echo; 151 | fi 152 | 153 | if [[ "$host" == "oss" ]]; then 154 | # This is for SpeechColab collaborators, need 500G free space 155 | echo "$0: Downloading from the oss host..." 156 | utils/internal/download_gigaspeech_from_oss.sh \ 157 | --stage $stage --with-dict $with_dict \ 158 | $gigaspeech_dataset_dir || exit 1; 159 | elif [[ "$host" == "tsinghua" || "$host" == "speechocean" || "$host" == "magicdata" ]]; then 160 | # This is for public release, need 1.0T free space 161 | echo "$0: Downloading with PySpeechColab..." 
162 | utils/internal/download_gigaspeech_with_pyspeechcolab.sh \ 163 | --host $host --subset $subset --with-dict $with_dict \ 164 | --download-eval $download_eval \ 165 | $gigaspeech_dataset_dir || exit 1; 166 | else 167 | echo "$0: Unsupported host: $host" 168 | exit 1 169 | fi 170 | 171 | echo "$0: Done" 172 | -------------------------------------------------------------------------------- /utils/extract_metadata_version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2021 SpeechColab Authors 3 | 4 | set -e 5 | set -o pipefail 6 | 7 | if [ $# -ne 1 ]; then 8 | echo "Usage: $0 " 9 | echo " e.g.: $0 /disk1/audio_data/gigaspeech" 10 | echo "" 11 | echo "This script extract version field from metadata file" 12 | exit 1 13 | fi 14 | 15 | gigaspeech_dataset_dir=$1 16 | 17 | if ! which jq >/dev/null; then 18 | >&2 echo "$0: You have to get jq installed in order to use this. See" 19 | >&2 echo "$0: utils/install_jq.sh" 20 | exit 1 21 | fi 22 | 23 | if [ -f $gigaspeech_dataset_dir/GigaSpeech.json ]; then 24 | cat $gigaspeech_dataset_dir/GigaSpeech.json | jq -r '.version' 25 | else 26 | >&2 echo "$0: ERROR, couldn't find $gigaspeech_dataset_dir/GigaSpeech.json" 27 | exit 1 28 | fi 29 | -------------------------------------------------------------------------------- /utils/extract_subset_segments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf8 3 | # Copyright 2022 Jiayu DU 4 | 5 | ''' 6 | This tool is used to extract supervised segments from GigaSpeech, 7 | segments are saved in .wav format, supervisions are saved in a simple .tsv file: 8 | 9 | --- exampler tsv begin --- 10 | ID AUDIO BEGIN DURATION TEXT 11 | POD1000000004_S0000017 audio/POD1000000004_S0000017.wav 0 3.163 YOU KNOW TO PUT THIS STUFF TOGETHER 12 | ... 13 | ... 
14 | 15 | --- exampler tsv end--- 16 | 17 | It can be, but not should be used to extract large subsets such as L, XL (because it would be extremely slow). 18 | ''' 19 | 20 | import os, sys 21 | import argparse 22 | import csv 23 | from speechcolab.datasets.gigaspeech import GigaSpeech 24 | import torchaudio 25 | 26 | gigaspeech_punctuations = ['', '', '', ''] 27 | gigaspeech_garbage_utterance_tags = ['', '', '', ''] 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser(description='Save the audio segments into wav, and meta into tsv.') 31 | parser.add_argument('--subset', choices = ['XS', 'S', 'M', 'L', 'XL', 'DEV', 'TEST'], default='XS', help='The subset name') 32 | parser.add_argument('gigaspeech_dataset_dir', help='The GigaSpeech corpus directory') 33 | parser.add_argument('dst_dir', help='Ouput subset directory') 34 | args = parser.parse_args() 35 | 36 | os.makedirs(args.dst_dir, exist_ok = True) 37 | 38 | gigaspeech = GigaSpeech(args.gigaspeech_dataset_dir) 39 | subset = '{' + args.subset + '}' 40 | with open(os.path.join(args.dst_dir, 'metadata.tsv'), 'w+', encoding='utf8') as fo: 41 | csv_header_fields = ['ID', 'AUDIO', 'DURATION', 'TEXT'] 42 | csv_writer = csv.DictWriter(fo, delimiter='\t', fieldnames=csv_header_fields, lineterminator='\n') 43 | csv_writer.writeheader() 44 | for audio in gigaspeech.audios(subset): 45 | aid = audio['aid'] 46 | audio_path = os.path.join(args.gigaspeech_dataset_dir, audio["path"]) 47 | 48 | audio_info = torchaudio.info(audio_path) 49 | opus_sample_rate = audio_info.sample_rate 50 | assert opus_sample_rate == 48000 51 | nc = audio_info.num_channels 52 | assert nc == 1 53 | 54 | sample_rate = 16000 55 | long_waveform, _ = torchaudio.load(audio_path) 56 | long_waveform = torchaudio.transforms.Resample(opus_sample_rate, sample_rate)(long_waveform) 57 | 58 | for segment in audio['segments']: 59 | sid = segment['sid'] 60 | 61 | if subset not in segment['subsets']: 62 | continue 63 | 64 | text = 
def asr_text_post_processing(text):
    """Normalize an ASR transcript for sclite scoring.

    The transcript is uppercased, hyphenated compounds are split into
    separate words ("E-COMMERCE" -> "E COMMERCE"), and every token listed
    in the module-level ``non_scoring_words`` (conversational fillers, unk
    tags, punctuation tags and garbage-utterance tags) is dropped.
    """
    # Uppercase and break hyphenated compounds apart before tokenizing.
    tokens = text.upper().replace('-', ' ').split()
    # Keep only the tokens that actually count towards the WER.
    return ' '.join(tok for tok in tokens if tok not in non_scoring_words)
14 | exit 1 15 | fi 16 | elif [ `uname -s` == 'Darwin' ]; then 17 | brew install jq || exit 1 18 | fi 19 | 20 | echo "$0: Done" 21 | -------------------------------------------------------------------------------- /utils/internal/download_gigaspeech_from_oss.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2021 Jiayu Du 3 | # Seasalt AI, Inc (Author: Guoguo Chen) 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | stage=0 9 | with_dict=false 10 | 11 | . ./utils/parse_options.sh || exit 1 12 | 13 | if [ $# -ne 1 ]; then 14 | echo "Usage: $0 " 15 | echo " e.g.: $0 /disk1/audio_data/gigaspeech" 16 | echo "" 17 | echo "This script downloads the entire GigaSpeech Dataset from Aliyun." 18 | echo "This tool is used for our collaborator, not for public users." 19 | echo "We suggest having at least 500G of free space in local dir." 20 | echo "If dataset resources are updated, you can just re-run this script for " 21 | echo "incremental downloading, downloader will only download updates" 22 | exit 1 23 | fi 24 | 25 | gigaspeech_dataset_dir=$1 26 | 27 | 28 | . ./env_vars.sh || exit 1 29 | GIGASPEECH_RELEASE_URL=$GIGASPEECH_RELEASE_URL_OSS 30 | 31 | if [ -z "${GIGASPEECH_RELEASE_URL}" ]; then 32 | echo "$0: Error, variable GIGASPEECH_RELEASE_URL_OSS(in env_vars.sh) is empty." 33 | exit 1 34 | fi 35 | 36 | if [ ! -f SAFEBOX/aliyun_ossutil.cfg ]; then 37 | echo "$0: Error, make sure you have: SAFEBOX/aliyun_ossutil.cfg" 38 | exit 1 39 | fi 40 | 41 | # install downloader (Official client for ALIYUN Objects-Storage-Service) 42 | ossbin=tools/downloader/oss 43 | if [ $stage -le 0 ]; then 44 | [ ! 
-d tools/downloader ] && mkdir -p tools/downloader 45 | if [ `uname -s` == 'Linux' ]; then 46 | wget -O $ossbin \ 47 | http://gosspublic.alicdn.com/ossutil/1.7.1/ossutil64 || exit 1 48 | elif [ `uname -s` == 'Darwin' ]; then 49 | curl -o $ossbin \ 50 | http://gosspublic.alicdn.com/ossutil/1.7.1/ossutilmac64 || exit 1 51 | fi 52 | chmod 755 $ossbin 53 | fi 54 | 55 | if [ $stage -le 1 ]; then 56 | echo "$0: Skip downloading TERM_OF_ACCESS, our co-authors don't need this" 57 | fi 58 | 59 | # Download metadata 60 | if [ $stage -le 2 ]; then 61 | echo "$0: Start to download GigaSpeech Metadata" 62 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \ 63 | cp -u ${GIGASPEECH_RELEASE_URL}/GigaSpeech.json $gigaspeech_dataset_dir/ || exit 1 64 | fi 65 | 66 | # Download audio 67 | if [ $stage -le 3 ]; then 68 | echo "$0: Start to download GigaSpeech cached audio collection" 69 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \ 70 | cp -ur ${GIGASPEECH_RELEASE_URL}/audio/ $gigaspeech_dataset_dir/audio || exit 1 71 | fi 72 | 73 | # Download optional dictionary and pretrained g2p model 74 | if [ $stage -le 4 ]; then 75 | if [ $with_dict == true ]; then 76 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \ 77 | cp -u ${GIGASPEECH_RELEASE_URL}/dict/cmudict.0.7a \ 78 | $gigaspeech_dataset_dir/dict/cmudict.0.7a || exit 1 79 | $ossbin -c SAFEBOX/aliyun_ossutil.cfg \ 80 | cp -ur ${GIGASPEECH_RELEASE_URL}/dict/g2p $gigaspeech_dataset_dir/dict/ || exit 1 81 | fi 82 | fi 83 | 84 | # Check audio md5 85 | if [ $stage -le 5 ]; then 86 | echo "$0: Checking md5 of downloaded audio files" 87 | utils/check_audio_md5.sh $gigaspeech_dataset_dir || exit 1 88 | fi 89 | 90 | echo "$0: Done" 91 | -------------------------------------------------------------------------------- /utils/internal/download_gigaspeech_with_pyspeechcolab.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2021 Jiayu Du 3 | # Seasalt AI, Inc (Author: Guoguo Chen) 4 | # Tsinghua 
set -e
set -o pipefail

with_dict=false
host=tsinghua
subset={XL}
download_eval=true

. ./utils/parse_options.sh || exit 1

if [ $# -ne 1 ]; then
  echo "Usage: $0 <gigaspeech-dataset-dir>"
  echo " e.g.: $0 /disk1/audio_data/gigaspeech"
  echo ""
  echo "This script downloads the entire GigaSpeech Dataset from Tsinghua host."
  echo "We suggest having at least 1.0T of free space in the target directory."
  echo "If dataset resources are updated, you can re-run this script for "
  echo "incremental download."
  exit 1
fi

gigaspeech_dataset_dir=$1
mkdir -p $gigaspeech_dataset_dir || exit 1;

# Check dependency
python3 -c "import speechcolab" 2> /dev/null || \
  (echo "$0: This recipe needs the package speechcolab installed.";
   echo "To install:"
   echo "  pip install speechcolab"; exit 1)

# Check credential
if [ ! -f SAFEBOX/password ]; then
  echo "$0: Please apply for the download credentials (see the \"Download\""
  echo "$0: section in README) and it to SAFEBOX/password."
  exit 1
fi
PASSWORD=`cat SAFEBOX/password 2>/dev/null`
if [ -z "$PASSWORD" ]; then
  echo "$0: Error, SAFEBOX/password is empty."
  exit 1
fi

# false -> False, true -> True, so the value can be pasted into Python below.
# NOTE(review): the \u case-conversion in the replacement is a GNU sed
# extension — confirm this script is not expected to run with BSD sed.
with_dict=$(echo $with_dict | sed "s/\b\(.\)/\u\1/g")

# Download with PySpeechColab.
#
# BUGFIX: the original condition was `if '$download_eval' and ...`, but
# $download_eval expands to the *string* 'true' or 'false', and any
# non-empty string is truthy in Python — so `--download-eval false` still
# downloaded the {DEV}/{TEST} subsets.  Compare against 'true' explicitly.
python3 << END
from speechcolab.datasets.gigaspeech import GigaSpeech
gigaspeech = GigaSpeech('$gigaspeech_dataset_dir')
if '$download_eval' == 'true' and '$subset' != '{DEV}' and '$subset' != '{TEST}':
    gigaspeech.download('$PASSWORD', subset='{DEV}', host='$host', with_dict=$with_dict)
    gigaspeech.download('$PASSWORD', subset='{TEST}', host='$host', with_dict=$with_dict)
gigaspeech.download('$PASSWORD', subset='$subset', host='$host', with_dict=$with_dict)
END


echo "$0: Done"
See" 21 | >&2 echo "$0: utils/install_jq.sh" 22 | exit 1 23 | fi 24 | 25 | cat $gigaspeech_dataset_dir/GigaSpeech.json \ 26 | | jq -r '.audios[].path' |\ 27 | awk -v prefix="$gigaspeech_dataset_dir" '{print prefix"/"$1}' || exit 1 28 | -------------------------------------------------------------------------------- /utils/ls_md5.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2021 SpeechColab Authors 3 | 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | if [ $# -ne 1 ]; then 9 | echo "Usage: $0 " 10 | echo " e.g.: $0 /disk1/audio_data/gigaspeech" 11 | echo "" 12 | echo "This script lists md5 for all audio files in dataset" 13 | echo "can be used in data consistency check" 14 | exit 1 15 | fi 16 | 17 | gigaspeech_dataset_dir=$1 18 | 19 | if ! which jq >/dev/null; then 20 | >&2 echo "$0: You have to get jq installed in order to use this. See" 21 | >&2 echo "$0: utils/install_jq.sh" 22 | exit 1 23 | fi 24 | 25 | if [ -f $gigaspeech_dataset_dir/GigaSpeech.json ]; then 26 | cat $gigaspeech_dataset_dir/GigaSpeech.json |\ 27 | jq -r '.audios[] | "\(.md5) \(.path)"' |\ 28 | awk -v prefix="$gigaspeech_dataset_dir" '{print $1" "prefix"/"$2}' || exit 1 29 | else 30 | >&2 echo "$0: ERROR, couldn't find $gigaspeech_dataset_dir/GigaSpeech.json" 31 | exit 1 32 | fi 33 | -------------------------------------------------------------------------------- /utils/opus_to_wav.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Xiaomi (Author:Yongqing Wang) 2 | 3 | import os 4 | import argparse 5 | import re 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser(description=""" 10 | This script is used to convert opus file into wav file.""") 11 | parser.add_argument('--remove-opus', action='store_true', default='False', 12 | help="""If true, remove opus files""") 13 | parser.add_argument('opus_scp', help="""Input opus scp file""") 14 | 15 | args = 
parser.parse_args() 16 | return args 17 | 18 | 19 | def convert_opus2wav(opus_scp, rm_opus): 20 | with open(opus_scp, 'r') as oscp: 21 | for line in oscp: 22 | line = line.strip() 23 | utt, opus_path = re.split('\s+', line) 24 | wav_path = opus_path.replace('.opus', '.wav') 25 | cmd = f'ffmpeg -y -i {opus_path} -ac 1 -ar 16000 {wav_path}' 26 | try: 27 | os.system(cmd) 28 | except: 29 | sys.exit(f'Failed to run the cmd: {cmd}') 30 | if rm_opus is True: 31 | os.remove(opus_path) 32 | 33 | 34 | def main(): 35 | args = get_args() 36 | convert_opus2wav(args.opus_scp, args.remove_opus) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /utils/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2021 Seasalt AI, Inc (Author: Guoguo Chen) 3 | 4 | 5 | while true; do 6 | [ -z "${1:-}" ] && break; # break if there are no arguments 7 | case "$1" in 8 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 9 | eval '[ -z "${'$name'+xxx}" ]' &&\ 10 | echo "$0: invalid option $1" 1>&2 && exit 1; 11 | 12 | oldval="`eval echo \\$$name`"; 13 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 14 | was_bool=true; 15 | else 16 | was_bool=false; 17 | fi 18 | eval $name=\"$2\"; 19 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 20 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 21 | exit 1; 22 | fi 23 | shift 2; 24 | ;; 25 | *) break; 26 | esac 27 | done 28 | 29 | true; 30 | -------------------------------------------------------------------------------- /utils/show_segment_info.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2021 Jiayu Du 3 | # Seasalt AI, Inc (Author: Guoguo Chen) 4 | 5 | 6 | set -e 7 | set -o pipefail 8 | 9 | if [ $# -ne 2 ]; then 10 | echo "Usage: $0 " 11 | echo " e.g.: $0 
/disk1/audio_data/gigaspeech POD1000000004_S0000000" 12 | echo "" 13 | echo "This script extracts information from GigaSpeech.json for the given" 14 | echo "segment." 15 | exit 1 16 | fi 17 | 18 | gigaspeech_dataset_dir=$1 19 | segment_id=$2 20 | 21 | if ! which jq >/dev/null; then 22 | >&2 echo "$0: You have to get jq installed in order to use this. See" 23 | >&2 echo "$0: utils/install_jq.sh" 24 | exit 1 25 | fi 26 | 27 | cat $gigaspeech_dataset_dir/GigaSpeech.json |\ 28 | jq --arg query "$segment_id" \ 29 | '.audios[].segments[] | select(.sid == $query)' || exit 1 30 | 31 | echo "$0: Done" 32 | --------------------------------------------------------------------------------