├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── common ├── cleanup.sh ├── path.sh ├── prep.sh └── train.sh ├── jv └── run.sh ├── si └── run.sh ├── su └── run.sh └── tools ├── corpus_util.py ├── download_data.sh ├── kaldi_converter.py ├── simpleg2g.py └── traintest-split.sh /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recipes for using open-source ASR corpora 2 | 3 | Recipes for using open-source ASR corpora with [Kaldi](http://kaldi-asr.org/). 4 | 5 | This is not an official Google product. 
6 | 7 | ## Languages 8 | 9 | | Language | Directory | Corpus | 10 | |-----------|-----------|--------| 11 | | Javanese | jv | [Open SLR 35](http://openslr.org/35/) | 12 | | Sundanese | su | [Open SLR 36](http://openslr.org/36/) | 13 | | Sinhala | si | [Open SLR 52](http://openslr.org/52/) | 14 | 15 | ## How to use 16 | 17 | The above corpora are ready for use with Kaldi, after some simple data munging. 18 | We provide a small Kaldi recipe for training a triphone recognizer, inspired by 19 | the start of Kaldi's Resource Management recipe. The recipe is only intended for 20 | illustration and for validating the corpus and data preparation. 21 | 22 | ### Prerequisites 23 | 24 | 1. [Kaldi](http://kaldi-asr.org/). First [download Kaldi from GitHub](https://github.com/kaldi-asr/kaldi), compile, and install. 25 | 2. [Flac](https://xiph.org/flac/). The scripts below use the `flac` command line tool (assumed to be on the shell `PATH`) for on-the-fly decompression of the corpus. 26 | 3. Python and Bash. 27 | 28 | ### General steps 29 | 30 | 1. **IMPORTANT:** You must define and export an environment variable `KALDI_ROOT` pointing at your Kaldi directory. 31 | 2. Download and unpack the corpora you need. 32 | 3. Change to a recipe directory and execute `run.sh`. 33 | 34 | ### Example 35 | 36 | Here is how to use the Javanese corpus: 37 | ``` 38 | sudo apt-get install flac wget 39 | git clone https://github.com/kaldi-asr/kaldi 40 | cd kaldi 41 | export KALDI_ROOT="$(realpath .)" 42 | cat INSTALL 43 | # and follow the instructions there to build Kaldi 44 | cd .. 45 | git clone https://github.com/googlei18n/asr-recipes 46 | cd asr-recipes 47 | tools/download_data.sh jv 48 | # this unpacks the Javanese corpus into asr_javanese 49 | cd jv 50 | ./run.sh 51 | ``` 52 | 53 | ## License 54 | 55 | Unless otherwise noted, all original files are licensed under an 56 | [Apache License, Version 2.0](LICENSE). 57 | -------------------------------------------------------------------------------- /common/cleanup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Removes all the directories created when the recipe is run. 18 | # WARNING: Removes all generated data. 19 | if [ -d "exp" ]; then 20 | rm -r exp 21 | fi 22 | if [ -d "mfcc" ]; then 23 | rm -r mfcc 24 | fi 25 | if [ -d "data" ]; then 26 | rm -r data 27 | fi 28 | if [ -d "data-fbank" ]; then 29 | rm -r data-fbank 30 | fi 31 | echo "All generated files and directories removed." 32 | -------------------------------------------------------------------------------- /common/path.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Set up Kaldi-related paths. 16 | # This file should only be sourced by other scripts. 17 | # It is probably not a good idea to run it directly from the command line. 18 | 19 | if [ ! -d "$KALDI_ROOT" ] ; then 20 | echo >&2 'KALDI_ROOT must be set and point to the Kaldi directory' 21 | sleep 5 22 | exit 1 23 | fi 24 | export PATH="$PWD/utils:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH" 25 | if [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] ; then 26 | echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" 27 | sleep 5 28 | exit 1 29 | fi 30 | . "$KALDI_ROOT/tools/config/common_path.sh" 31 | export LC_ALL=C 32 | -------------------------------------------------------------------------------- /common/prep.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | if [ ! -d "$1" ] ; then 18 | echo >&2 "Usage: prep.sh CORPUSDIR" 19 | exit 1 20 | fi 21 | 22 | if [ ! 
-d "$KALDI_ROOT" ] ; then 23 | echo >&2 'KALDI_ROOT must be set and point to the Kaldi directory' 24 | exit 1 25 | fi 26 | 27 | set -o errexit 28 | set -o nounset 29 | export LC_ALL=C 30 | 31 | readonly CORPUSDIR="$1" 32 | 33 | # 34 | # Kaldi recipe directory layout 35 | # 36 | 37 | # Create the directories needed 38 | mkdir -p data/local/dict data/local/tmp data/train data/test 39 | 40 | # Symlink path setup file expected to be present in the recipe directory 41 | ln -sf ../common/path.sh 42 | 43 | # Symlink auxiliary Kaldi recipe subdirectories 44 | kaldi_egs_dir="$KALDI_ROOT/egs" 45 | ln -sf "$kaldi_egs_dir/wsj/s5/steps" 46 | ln -sf "$kaldi_egs_dir/wsj/s5/utils" 47 | ln -sf "$kaldi_egs_dir/rm/s5/local" 48 | ln -sf "$kaldi_egs_dir/rm/s5/conf" 49 | 50 | # 51 | # Training and test data 52 | # 53 | 54 | full_file=utt_spk_text.tsv 55 | train_file=utt_spk_text-train.tsv 56 | test_file=utt_spk_text-test.tsv 57 | 58 | # Symlink the corpus info file and perform a train/test split 59 | ln -sf "$CORPUSDIR/utt_spk_text.tsv" "$full_file" 60 | ../tools/traintest-split.sh "$full_file" 61 | 62 | echo "Preparing training data, this may take a while" 63 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $train_file --alsent > data/train/al_sent.txt 64 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $train_file --spk2utt | sort -k1,1 > data/train/spk2utt 65 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $train_file --spk2gender | sort -k1,1 > data/train/spk2gender 66 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $train_file --text | sort -k1,1 > data/train/text 67 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $train_file --utt2spk | sort -k1,1 > data/train/utt2spk 68 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $train_file --wavscp | sort -k1,1 > data/train/wav.scp 69 | echo "Training data prepared" 70 | 71 | echo "Preparing test data, this may take a while" 72 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $test_file --alsent > data/test/al_sent.txt 73 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $test_file --spk2utt | sort -k1,1 > data/test/spk2utt 74 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $test_file --spk2gender | sort -k1,1 > data/test/spk2gender 75 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $test_file --text | sort -k1,1 > data/test/text 76 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $test_file --utt2spk | sort -k1,1 > data/test/utt2spk 77 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $test_file --wavscp | sort -k1,1 > data/test/wav.scp 78 | echo "Test data prepared" 79 | 80 | # Fix sorting issues etc. 
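# utils/fix_data_dir.sh re-sorts each data directory and drops entries that are
# missing from any of its parallel files (text, wav.scp, utt2spk, ...).
# An optional sanity check at this point could be the following (a sketch,
# assuming Kaldi's standard utils/validate_data_dir.sh is available; --no-feats
# is passed because features have not been extracted yet):
#   utils/validate_data_dir.sh --no-feats data/train
#   utils/validate_data_dir.sh --no-feats data/test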
81 | utils/fix_data_dir.sh data/train 82 | utils/fix_data_dir.sh data/test 83 | 84 | # 85 | # Lexicon and phone set 86 | # 87 | 88 | lexicon=data/local/dict/lexicon.txt 89 | nonsilence_phones=data/local/dict/nonsilence_phones.txt 90 | 91 | awk '{for (i = 2; i <= NF; ++i) print $i}' data/train/text data/test/text | 92 | sort -u > vocabulary.txt 93 | 94 | ../tools/simpleg2g.py -i vocabulary.txt -n 2 > "$lexicon" 95 | 96 | awk '{for (i = 2; i <= NF; ++i) print $i}' "$lexicon" | 97 | sort -u > "$nonsilence_phones" 98 | 99 | # Add silence word and phone to lexicon 100 | echo "!SIL sil" >> "$lexicon" 101 | 102 | # Create the silence phone lists and an empty extra_questions.txt 103 | echo "sil" > data/local/dict/silence_phones.txt 104 | echo "sil" > data/local/dict/optional_silence.txt 105 | touch data/local/dict/extra_questions.txt 106 | 107 | # 108 | # Language model 109 | # 110 | 111 | # Corpus-dependent text for the language model 112 | # Files for data/local/tmp 113 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $full_file --transcriptions | 114 | sort -u > tmp_transcripts.txt 115 | 116 | # Generate bigram word pairs for the language model 117 | echo "Processing all utterances to generate the language model" 118 | ../tools/kaldi_converter.py -d $CORPUSDIR -f $full_file --bigram tmp_transcripts.txt > tmp_bigrams.txt 119 | local/make_rm_lm.pl tmp_bigrams.txt > data/local/tmp/G.txt 120 | echo "All utterances processed" 121 | 122 | rm -f tmp_transcripts.txt tmp_bigrams.txt 123 | 124 | # 125 | # Hand off to Kaldi 126 | # 127 | 128 | utils/prepare_lang.sh data/local/dict '!SIL' data/local/lang data/lang 129 | 130 | local/rm_prepare_grammar.sh # Traditional RM grammar (bigram word-pair) 131 | #local/rm_prepare_grammar_ug.sh # Unigram grammar (gives worse results, but changes in WER will be more significant.) 132 | -------------------------------------------------------------------------------- /common/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Simple recipe similar to kaldi/egs/rm/s5, but stops after the first triphone pass. 18 | # This is just for validating that our corpora are usable with Kaldi. 19 | 20 | if [ !
-d "$KALDI_ROOT" ] ; then 21 | echo >&2 'KALDI_ROOT must be set and point to the Kaldi directory' 22 | exit 1 23 | fi 24 | 25 | set -o errexit 26 | set -o nounset 27 | export LC_ALL=C 28 | 29 | nj=4 30 | train_cmd=run.pl 31 | decode_cmd=run.pl 32 | 33 | # Compute MFCC features and CMVN stats 34 | featdir=mfcc 35 | steps/make_mfcc.sh --nj "$nj" --cmd "$train_cmd" data/train exp/make_feat/train "$featdir" 36 | steps/compute_cmvn_stats.sh data/train exp/make_feat/train "$featdir" 37 | steps/make_mfcc.sh --nj "$nj" --cmd "$train_cmd" data/test exp/make_feat/test "$featdir" 38 | steps/compute_cmvn_stats.sh data/test exp/make_feat/test "$featdir" 39 | 40 | # Train monophone model 41 | steps/train_mono.sh --nj "$nj" --cmd "$train_cmd" data/train data/lang exp/mono 42 | utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph 43 | 44 | # Decode monophone model 45 | steps/decode.sh --config conf/decode.config --nj "$nj" --cmd "$decode_cmd" \ 46 | exp/mono/graph data/test exp/mono/decode 47 | 48 | # Get monophone alignments 49 | steps/align_si.sh --nj "$nj" --cmd "$train_cmd" \ 50 | data/train data/lang exp/mono exp/mono_ali 51 | 52 | # Train first triphone model 53 | steps/train_deltas.sh --cmd "$train_cmd" \ 54 | 1800 9000 data/train data/lang exp/mono_ali exp/tri1 55 | 56 | # Decode first triphone model 57 | utils/mkgraph.sh data/lang exp/tri1 exp/tri1/graph 58 | steps/decode.sh --config conf/decode.config --nj "$nj" --cmd "$decode_cmd" \ 59 | exp/tri1/graph data/test exp/tri1/decode 60 | -------------------------------------------------------------------------------- /jv/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | ../common/prep.sh '../asr_javanese' 19 | ../common/train.sh 20 | -------------------------------------------------------------------------------- /si/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | ../common/prep.sh '../asr_sinhala' 19 | ../common/train.sh 20 | -------------------------------------------------------------------------------- /su/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | ../common/prep.sh '../asr_sundanese' 19 | ../common/train.sh 20 | -------------------------------------------------------------------------------- /tools/corpus_util.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Utilities for working with open-source ASR speech corpora.""" 18 | 19 | import collections 20 | import io 21 | import re 22 | 23 | FILENAME_RE = re.compile(r'^[a-f0-9]{10}\.(flac)$') 24 | UTTERANCE_ID_RE = re.compile(r'^[a-f0-9]{10}$') 25 | SESSION_ID_RE = re.compile(r'^[a-f0-9]{5}$') 26 | 27 | stdin = io.open(0, mode='rt', encoding='utf-8', closefd=False) 28 | stdout = io.open(1, mode='wt', encoding='utf-8', closefd=False) 29 | stderr = io.open(2, mode='wt', encoding='utf-8', closefd=False) 30 | 31 | 32 | def SessionIdFromFilename(filename): 33 | match = FILENAME_RE.match(filename) 34 | assert match is not None 35 | return match.group(1) 36 | 37 | 38 | Recording = collections.namedtuple( 39 | 'Recording', [ 40 | 'utterance_id', 41 | 'session_id', 42 | 'text', 43 | 'gender', 44 | ]) 45 | 46 | 47 | def ReadInfo(reader): 48 | """Parses a corpus info file (3-column TSV: utterance id, session id, text).""" 49 | for line in reader: 50 | line = line.rstrip('\n') 51 | fields = line.split('\t') 52 | assert len(fields) == 3 53 | assert UTTERANCE_ID_RE.match(fields[0]) is not None 54 | assert SESSION_ID_RE.match(fields[1]) is not None 55 | fields.append('female')  # The info file has no gender column; use a fixed placeholder. 56 | fields[0] = '%s-%s' % (fields[1], fields[0])  # Prefix with the session id so utterances group by speaker. 57 | yield Recording._make(fields) 58 | 59 | 60 | def ParseInfoFile(path): 61 | with io.open(path, mode='rt', encoding='utf-8') as reader: 62 | return dict((rec.utterance_id, rec) for rec in ReadInfo(reader)) 63 | 64 | 65 | class Corpus(object): 66 | """Information about an ASR corpus.""" 67 | 68 | def __init__(self, corpus_dir, corpus_file): 69 | self.corpus_dir = corpus_dir 70 | self.corpus_file = corpus_file 71 | self.corpus_info = {} 72 | 73 | def LoadItems(self): 74 | self.corpus_info = ParseInfoFile(self.corpus_file) 75 | 76 | def AddItem(self, utterance_id, record): 77 | if utterance_id not in self.corpus_info: 78 | self.corpus_info[utterance_id] = record 79 | 80 | def RemoveItem(self, utterance_id):
81 | if utterance_id in self.corpus_info: 82 | del self.corpus_info[utterance_id] 83 | 84 | def CleanUp(self): 85 | self.corpus_info = {} 86 | -------------------------------------------------------------------------------- /tools/download_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | case "$1" in 18 | jv) fragment="35/asr_javanese" ;; 19 | su) fragment="36/asr_sundanese" ;; 20 | si) fragment="52/asr_sinhala" ;; 21 | bn) fragment="53/asr_bengali" ;; 22 | ne) fragment="54/asr_nepali" ;; 23 | *) echo "Unrecognized language: '$1'" >&2 ; exit 1 ;; 24 | esac 25 | 26 | for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do 27 | resource="${fragment}_${d}.zip" 28 | wget "http://www.openslr.org/resources/$resource" 29 | zipfile="$(basename "$resource")" 30 | unzip -nqq "$zipfile" 31 | rm -f "$zipfile" 32 | done 33 | -------------------------------------------------------------------------------- /tools/kaldi_converter.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
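# Illustrative usage (a sketch with hypothetical ids: a 10-hex-digit utterance
# "abcde12345" recorded in the 5-hex-digit speaker/session "abcde"):
#
#   ./kaldi_converter.py -d ../asr_javanese -f utt_spk_text-train.tsv --utt2spk
#   # -> abcde-abcde12345 abcde
#
#   ./kaldi_converter.py -d ../asr_javanese -f utt_spk_text-train.tsv --wavscp
#   # -> abcde-abcde12345 flac -cds ../asr_javanese/data/ab/abcde12345.flac |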
16 | 17 | """Utility to convert open-source speech corpora to Kaldi RM recipe format.""" 18 | 19 | from __future__ import unicode_literals 20 | 21 | import io 22 | from operator import itemgetter 23 | import optparse 24 | import os.path 25 | import corpus_util as kaldi_util 26 | 27 | stdin = io.open(0, mode='rt', encoding='utf-8', closefd=False) 28 | stdout = io.open(1, mode='wt', encoding='utf-8', closefd=False) 29 | stderr = io.open(2, mode='wt', encoding='utf-8', closefd=False) 30 | 31 | 32 | class CorpusConverter(object): 33 | """Converter from an open-source corpus to the Kaldi RM recipe format.""" 34 | 35 | def __init__(self, corpus): 36 | self.corpus = corpus 37 | self.corpus_info = corpus.corpus_info.items() 38 | 39 | def AlSent(self): 40 | """Prints out the text used in al_sent.txt file in the RM recipe.""" 41 | for _, rec in self.corpus_info: 42 | stdout.write('%s (%s)\n' % (rec.text, rec.utterance_id)) 43 | 44 | def Spk2gender(self): 45 | """Prints out the text used in spk2gender file in the RM recipe.""" 46 | spk_gender = {} 47 | for _, rec in self.corpus_info: 48 | if rec.session_id not in spk_gender: 49 | spk_gender[rec.session_id] = rec.gender 50 | for spk_id in spk_gender: 51 | stdout.write('%s %s\n' % (spk_id, spk_gender[spk_id][0])) 52 | 53 | def Text(self): 54 | """Prints out the text used in text file in the RM recipe.""" 55 | for _, rec in self.corpus_info: 56 | stdout.write('%s %s\n' % (rec.utterance_id, rec.text.lower())) 57 | 58 | def Spk2utt(self): 59 | """Prints out the text used in spk2utt file in RM recipe.""" 60 | spk_utt = {} 61 | for _, rec in self.corpus_info: 62 | if rec.session_id not in spk_utt: 63 | spk_utt[rec.session_id] = [] 64 | spk_utt[rec.session_id].append(rec.utterance_id) 65 | 66 | for session_id in spk_utt: 67 | stdout.write('%s' % session_id) 68 | for utt_id in spk_utt[session_id]: 69 | stdout.write(' %s' % utt_id) 70 | stdout.write('\n') 71 | 72 | def Utt2spk(self): 73 | """Prints out the text used in utt2spk file in RM recipe.""" 74 | spk_utt = {} 75 | for _, rec in self.corpus_info: 76 | if rec.session_id not in spk_utt: 77 | spk_utt[rec.session_id] = [] 78 | spk_utt[rec.session_id].append(rec.utterance_id) 79 | 80 | for session_id in spk_utt: 81 | for utt_id in spk_utt[session_id]: 82 | stdout.write('%s %s\n' % (utt_id, session_id)) 83 | 84 | def Wavscp(self): 85 | """Prints out the text used in wav.scp file in RM recipe.""" 86 | spk_utt = {} 87 | for _, rec in self.corpus_info: 88 | if rec.session_id not in spk_utt: 89 | spk_utt[rec.session_id] = [] 90 | spk_utt[rec.session_id].append(rec.utterance_id) 91 | 92 | # The utterance id has the form '<session>-<basename>'; strip the session prefix 93 | # to locate the flac file under data/<first two characters of the basename>. 94 | for session_id in spk_utt: 95 | for utt_id in spk_utt[session_id]: 96 | _, basename = utt_id.split('-') 97 | path = os.path.join(self.corpus.corpus_dir, 'data', basename[:2], 98 | '%s.flac' % basename) 99 | stdout.write('%s flac -cds %s |\n' % (utt_id, path)) 100 | 101 | def Transcriptions(self): 102 | """Prints out the transcriptions, used to generate grammar file.""" 103 | for _, rec in self.corpus_info: 104 | if rec.text: 105 | stdout.write('%s SENTENCE-END\n' % rec.text.lower()) 106 | 107 | 108 | def Bigrams(inputfile): 109 | with io.open(inputfile, mode='rt', encoding='utf-8') as text: 110 | data = ' '.join(line.strip() for line in text) 111 | data = data.split(' ') 112 | 113 | grams = [] 114 | for i in range(len(data) - 1): 115 | grams.append((data[i], data[i+1])) 116 | 117 | grams = list(set(grams)) 118 | bigram_dict = {} 119 | 120 | for gram in sorted(grams, key=itemgetter(0)): 121 | if gram[0] not in bigram_dict: 122 | bigram_dict[gram[0]] = [] 123 | bigram_dict[gram[0]].append(gram[1]) 124 | 125 | for start_gram in sorted(bigram_dict): 126 | stdout.write('>%s\n' % start_gram) 127 | for end_gram in sorted(bigram_dict[start_gram]): 128 | stdout.write(' %s\n' % end_gram) 129 | 130 | 131 | def main(): 132 | parser = optparse.OptionParser() 133 | parser.add_option('-d', 134 | '--dir', 135 | dest='corpusdir', 136 | help='Input corpus directory') 137 | parser.add_option('--alsent', 138 | dest='alsent', 139 | action='store_false', 140 | help='Output for al_sent.txt file') 141 | parser.add_option('--spk2utt', 142 | dest='spk2utt', 143 | action='store_false', 144 | help='Output for spk2utt file') 145 | parser.add_option('--spk2gender', 146 | dest='spk2gender', 147 | action='store_false', 148 | help='Output for spk2gender file') 149 | parser.add_option('--text', 150 | dest='text', 151 | action='store_false', 152 | help='Output for text file') 153 | parser.add_option('--utt2spk', 154 | dest='utt2spk', 155 | action='store_false', 156 | help='Output for utt2spk file') 157 | parser.add_option('--wavscp', 158 | dest='wavscp', 159 | action='store_false', 160 | help='Output for wav.scp file') 161 | parser.add_option('--transcriptions', 162 | dest='transcriptions', 163 | action='store_false', 164 | help='Output only transcriptions') 165 | parser.add_option('--bigram', 166 | dest='bigram', 167 | help='Outputs bigrams based on the sentence file') 168 | parser.add_option('-f', 169 | '--file', 170 | dest='corpusfile', 171 | help='Input corpus info file (TSV)') 172 | parser.add_option('--testfile', 173 | dest='corpus_test_file', 174 | help='Input corpus info file for the test set (unused)') 175 | 176 | options, _ = parser.parse_args() 177 | 178 | corpus = kaldi_util.Corpus(options.corpusdir, options.corpusfile) 179 | corpus.LoadItems() 180 | kaldi_converter = CorpusConverter(corpus) 181 | 182 | if options.alsent is not None: 183 | kaldi_converter.AlSent() 184 | 185 | if options.spk2gender is not None: 186 | kaldi_converter.Spk2gender() 187 | 188 | if options.spk2utt is not None: 189 | kaldi_converter.Spk2utt() 190 | 191 | if options.text is not None: 192 | kaldi_converter.Text() 193 | 194 | if options.utt2spk is not None: 195 | kaldi_converter.Utt2spk() 196 | 197 | if options.wavscp is not None: 198 | kaldi_converter.Wavscp() 199 | 200 | if options.transcriptions is not None: 201 | kaldi_converter.Transcriptions() 202 | 203 | if options.bigram is not None: 204 | Bigrams(options.bigram) 205 | 206 | 207 | if __name__ == '__main__': 208 | main() 209 | -------------------------------------------------------------------------------- /tools/simpleg2g.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
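# Worked example (a sketch using the hypothetical vocabulary word "kata" and the
# bigram setting -n 2 used by common/prep.sh):
#
#   ./simpleg2g.py -i vocabulary.txt -n 2
#
# pads each word with "_" as a boundary marker and emits its letter bigrams as
# pseudo-phonemes, producing a lexicon line such as:
#
#   kata    _k ka at ta a_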
16 | 17 | """Script that converts graphemes to pseudo-phonemes (letter n-graphs).""" 18 | 19 | from __future__ import unicode_literals 20 | 21 | import io 22 | import optparse 23 | 24 | stdin = io.open(0, mode='rt', encoding='utf-8', closefd=False) 25 | stdout = io.open(1, mode='wt', encoding='utf-8', closefd=False) 26 | stderr = io.open(2, mode='wt', encoding='utf-8', closefd=False) 27 | 28 | 29 | def SimpleG2P(word, length=1): 30 | """Simple G2P (G2NG) conversion function.""" 31 | simple_phoneme = '' 32 | if len(word) > length: 33 | for i in range(len(word) - length + 1): 34 | simple_phoneme = simple_phoneme + ' ' + word[i:i+length] 35 | return simple_phoneme.strip().lower() 36 | else: 37 | return word.strip().lower() 38 | 39 | 40 | def GenerateDictionary(input_file, ngraph_size): 41 | """Generates the G2P (G2NG) dictionary.""" 42 | words = set() 43 | with io.open(input_file, mode='rt', encoding='utf-8') as text: 44 | for line in text: 45 | for word in line.split(): 46 | words.add(word) 47 | words = list(words) 48 | words.sort() 49 | if 'SENTENCE-END' in words: 50 | words.remove('SENTENCE-END') 51 | 52 | for word in words: 53 | word = word.replace('_', '') 54 | phoneme = SimpleG2P('_%s_' % word, ngraph_size) 55 | stdout.write('%s\t%s\n' % (word.lower(), phoneme.lower())) 56 | 57 | 58 | def main(): 59 | parser = optparse.OptionParser() 60 | parser.add_option('-i', 61 | '--input', 62 | dest='inputFile', 63 | help='Input transcription file') 64 | parser.add_option('-n', 65 | '--ngram', 66 | dest='ngramSize', 67 | help='Ngram size, default 2', 68 | default='2') 69 | 70 | options, _ = parser.parse_args() 71 | 72 | if options.inputFile is None: 73 | parser.print_help() 74 | parser.error('Input file is required') 75 | 76 | GenerateDictionary(options.inputFile, int(options.ngramSize)) 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /tools/traintest-split.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright 2018 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Script which takes an utt_spk_text.tsv file and generates two files, one for 18 | # training and the other for testing. 19 | # 20 | # Usage: 21 | # ./traintest-split.sh file_to_split (default utt_spk_text.tsv) 22 | 23 | set -o errexit 24 | set -o nounset 25 | export LC_ALL=C 26 | 27 | if [ $# -eq 0 ]; then 28 | INPUT_FILE=utt_spk_text.tsv 29 | else 30 | INPUT_FILE=$1 31 | fi 32 | 33 | ALLSIZE="$(cat "$INPUT_FILE" | wc -l)" 34 | TESTSIZE=2000 35 | let TRAINSIZE=$ALLSIZE-$TESTSIZE 36 | 37 | sort -k2 "$INPUT_FILE" | head -n "$TESTSIZE" | sort > utt_spk_text-test.tsv 38 | sort -k2 "$INPUT_FILE" | tail -n "$TRAINSIZE" | sort > utt_spk_text-train.tsv 39 | --------------------------------------------------------------------------------