├── .gitignore ├── resources ├── 1 │ ├── waves_yesno.tar.gz │ ├── info.txt │ └── about.html ├── 2 │ ├── openfst-1.3.2.tar.gz │ ├── openfst-1.3.3.tar.gz │ ├── openfst-1.3.4.tar.gz │ ├── openfst-1.4.1.tar.gz │ ├── openfst-1.5.4.tar.gz │ ├── openfst-1.6.2.tar.gz │ ├── openfst-1.6.5.tar.gz │ ├── openfst-1.6.7.tar.gz │ ├── about.html │ └── info.txt ├── 3 │ ├── sph2pipe_v2.5.tar.gz │ ├── info.txt │ └── about.html ├── 4 │ ├── sctk-2.4.0-20091110-0958.tar.bz2 │ ├── sctk-2.4.0-20091110-0958.tar.gz │ ├── sctk-2.4.10-20151007-1312Z.tar.bz2 │ ├── sctk-2.4.8-20130429-2145.tar.bz2 │ ├── sctk-2.4.9-20141015-1634Z.tar.bz2 │ ├── about.html │ └── info.txt ├── 5 │ ├── sw-ms98-dict.text │ ├── switchboard_word_alignments.tar.gz │ ├── info.txt │ └── about.html ├── 6 │ ├── data_voip_cs.tgz │ ├── data_voip_en.tgz │ ├── info.txt │ └── about.html ├── 7 │ ├── TEDLIUM_release1.tar.gz │ ├── info.txt │ └── about.html ├── 8 │ ├── info.txt │ ├── about.html │ ├── lexicon-da.tgz │ └── lexicon-da-nonorm.tgz ├── 9 │ ├── wordlist.50k.gz │ ├── about.html │ └── info.txt ├── 10 │ ├── sre04_key.tgz │ ├── sre04_key-v2.txt.gz │ ├── sre05-key-v7b.txt.gz │ ├── about.html │ ├── sre2000-key.tar.gz │ └── info.txt ├── 11 │ ├── g2p-model-5 │ ├── 3-gram.arpa.gz │ ├── 4-gram.arpa.gz │ ├── librispeech-vocab.txt │ ├── librispeech-lm-corpus.tgz │ ├── 3-gram.pruned.1e-7.arpa.gz │ ├── 3-gram.pruned.3e-7.arpa.gz │ ├── librispeech-lexicon.txt │ ├── librispeech-lm-norm.txt.gz │ ├── about.html │ └── info.txt ├── 12 │ ├── md5sum.txt │ ├── dev-clean.tar.gz │ ├── dev-other.tar.gz │ ├── test-clean.tar.gz │ ├── test-other.tar.gz │ ├── original-mp3.tar.gz │ ├── raw-metadata.tar.gz │ ├── original-books.tar.gz │ ├── train-clean-100.tar.gz │ ├── train-clean-360.tar.gz │ ├── train-other-500.tar.gz │ ├── intro-disclaimers.tar.gz │ ├── about.html │ └── info.txt ├── 13 │ ├── RWCP.tar.gz │ ├── info.txt │ └── about.html ├── 14 │ ├── beep.tar.gz │ └── info.txt ├── 15 │ ├── speaker_list.tgz │ ├── info.txt │ └── about.html ├── 16 │ ├── 
headset.tar.gz │ ├── Array1-01.tar.gz │ ├── Array1-02.tar.gz │ ├── Array1-03.tar.gz │ ├── Array1-04.tar.gz │ ├── Array1-05.tar.gz │ ├── Array1-06.tar.gz │ ├── Array1-07.tar.gz │ ├── Array1-08.tar.gz │ ├── ami_manual_1.6.1.tar.gz │ ├── info.txt │ └── about.html ├── 17 │ ├── musan.tar.gz │ ├── info.txt │ └── about.html ├── 18 │ ├── resource.tgz │ ├── test-noise.tgz │ ├── data_thchs30.tgz │ └── info.txt ├── 19 │ ├── TEDLIUM_release2.tar.gz │ ├── info.txt │ └── about.html ├── 20 │ ├── air_database_release_1_4.zip │ ├── about.html │ └── info.txt ├── 21 │ ├── es_wordlist.json.tgz │ ├── info.txt │ └── about.html ├── 22 │ ├── resource.tar.gz │ ├── test_noise.tar.gz │ ├── data_thuyg20.tar.gz │ ├── data_thuyg20_sre.tar.gz │ ├── test_noise_sre.tar.gz │ ├── info.txt │ └── about.html ├── 23 │ ├── lre07_key.txt │ ├── about.html │ └── info.txt ├── 24 │ ├── iban.tar.gz │ └── info.txt ├── 25 │ ├── data_readspeech_am.tar.bz2 │ ├── data_readspeech_wo.tar.bz2 │ ├── data_broadcastnews_sw.tar.bz2 │ └── info.txt ├── 26 │ ├── sim_rir_16k.zip │ ├── sim_rir_8k.zip │ ├── info.txt │ └── about.html ├── 27 │ ├── cantab-TEDLIUM.tar.bz2 │ ├── cantab-TEDLIUM-partial.tar.bz2 │ └── info.txt ├── 28 │ ├── rirs_noises.zip │ ├── info.txt │ └── about.html ├── 29 │ ├── lexicon-sv.tgz │ ├── info.txt │ └── about.html ├── 30 │ ├── README.txt │ ├── si_lk.tar │ ├── LICENSE.txt │ ├── si_lk.tar.gz │ ├── si_lk.lines.txt │ ├── info.txt │ └── about.html ├── 31 │ ├── dev-clean-2.tar.gz │ ├── train-clean-5.tar.gz │ ├── about.html │ ├── md5sum.txt │ └── info.txt ├── 32 │ ├── af_za.tar.gz │ ├── st_za.tar.gz │ ├── tn_za.tar.gz │ ├── xh_za.tar.gz │ ├── info.txt │ └── about.html ├── 33 │ ├── data_aishell.tgz │ ├── resource_aishell.tgz │ ├── info.txt │ └── about.html ├── 34 │ ├── santiago.tar.gz │ ├── info.txt │ └── about.html ├── 35 │ ├── LICENSE │ ├── asr_javanese_0.zip │ ├── asr_javanese_1.zip │ ├── asr_javanese_2.zip │ ├── asr_javanese_3.zip │ ├── asr_javanese_4.zip │ ├── asr_javanese_5.zip │ ├── asr_javanese_6.zip │ 
├── asr_javanese_7.zip │ ├── asr_javanese_8.zip │ ├── asr_javanese_9.zip │ ├── asr_javanese_a.zip │ ├── asr_javanese_b.zip │ ├── asr_javanese_c.zip │ ├── asr_javanese_d.zip │ ├── asr_javanese_e.zip │ ├── asr_javanese_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ ├── about.html │ └── asr_javanese.sha256 ├── 36 │ ├── asr_sundanese_0.zip │ ├── asr_sundanese_1.zip │ ├── asr_sundanese_2.zip │ ├── asr_sundanese_3.zip │ ├── asr_sundanese_4.zip │ ├── asr_sundanese_5.zip │ ├── asr_sundanese_6.zip │ ├── asr_sundanese_7.zip │ ├── asr_sundanese_8.zip │ ├── asr_sundanese_9.zip │ ├── asr_sundanese_a.zip │ ├── asr_sundanese_b.zip │ ├── asr_sundanese_c.zip │ ├── asr_sundanese_d.zip │ ├── asr_sundanese_e.zip │ ├── asr_sundanese_f.zip │ ├── about.html │ ├── info.txt │ └── asr_sundanese.sha256 ├── 37 │ ├── bn_bd.zip │ ├── bn_in.zip │ ├── info.txt │ ├── README.txt │ └── about.html ├── 38 │ ├── ST-CMDS-20170001_1-OS.tar.gz │ ├── info.txt │ └── about.html ├── 39 │ ├── LDC2006S37.tar.gz │ ├── info.txt │ └── about.html ├── 40 │ ├── zeroth_korean.tar.gz │ ├── info.txt │ └── about.html ├── 41 │ ├── LICENSE │ ├── jv_id_male.zip │ ├── jv_id_female.zip │ ├── info.txt │ └── about.html ├── 42 │ ├── LICENSE │ ├── km_kh_male.zip │ ├── info.txt │ └── about.html ├── 43 │ ├── LICENSE │ ├── ne_np_female.zip │ ├── info.txt │ └── about.html ├── 44 │ ├── LICENSE │ ├── su_id_male.zip │ ├── su_id_female.zip │ ├── info.txt │ └── about.html ├── 45 │ ├── ST-AEDS-20180100_1-OS.tgz │ ├── info.txt │ └── about.html ├── 46 │ ├── Tunisian_MSA.tar.gz │ ├── info.txt │ └── about.html ├── 47 │ ├── primewords_md_2018_set1.tar.gz │ ├── info.txt │ └── about.html ├── 48 │ ├── madcat.dev.raw.lineid │ ├── madcat.test.raw.lineid │ ├── madcat.train.raw.lineid │ ├── info.txt │ └── about.html ├── 49 │ ├── vox1_meta.csv │ ├── vox2_meta.csv │ ├── voxceleb1_test.txt │ ├── voxceleb1_test_v2.txt │ ├── voxceleb1_sitw_overlap.txt │ ├── about.html │ └── info.txt ├── 50 │ ├── madcat.dev.raw.lineid │ ├── madcat.test.raw.lineid │ ├── 
madcat.train.raw.lineid │ ├── info.txt │ └── about.html ├── 51 │ ├── TEDLIUM_release-3.tgz │ └── info.txt ├── 52 │ ├── asr_sinhala_0.zip │ ├── asr_sinhala_1.zip │ ├── asr_sinhala_2.zip │ ├── asr_sinhala_3.zip │ ├── asr_sinhala_4.zip │ ├── asr_sinhala_5.zip │ ├── asr_sinhala_6.zip │ ├── asr_sinhala_7.zip │ ├── asr_sinhala_8.zip │ ├── asr_sinhala_9.zip │ ├── asr_sinhala_a.zip │ ├── asr_sinhala_b.zip │ ├── asr_sinhala_c.zip │ ├── asr_sinhala_d.zip │ ├── asr_sinhala_e.zip │ ├── asr_sinhala_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ └── about.html ├── 53 │ ├── asr_bengali_0.zip │ ├── asr_bengali_1.zip │ ├── asr_bengali_2.zip │ ├── asr_bengali_3.zip │ ├── asr_bengali_4.zip │ ├── asr_bengali_5.zip │ ├── asr_bengali_6.zip │ ├── asr_bengali_7.zip │ ├── asr_bengali_8.zip │ ├── asr_bengali_9.zip │ ├── asr_bengali_a.zip │ ├── asr_bengali_b.zip │ ├── asr_bengali_c.zip │ ├── asr_bengali_d.zip │ ├── asr_bengali_e.zip │ ├── asr_bengali_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ └── about.html ├── 54 │ ├── asr_nepali_0.zip │ ├── asr_nepali_1.zip │ ├── asr_nepali_2.zip │ ├── asr_nepali_3.zip │ ├── asr_nepali_4.zip │ ├── asr_nepali_5.zip │ ├── asr_nepali_6.zip │ ├── asr_nepali_7.zip │ ├── asr_nepali_8.zip │ ├── asr_nepali_9.zip │ ├── asr_nepali_a.zip │ ├── asr_nepali_b.zip │ ├── asr_nepali_c.zip │ ├── asr_nepali_d.zip │ ├── asr_nepali_e.zip │ ├── asr_nepali_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ └── about.html ├── 55 │ ├── test.tgz │ ├── train.tgz │ ├── info.txt │ └── about.html ├── 56 │ ├── splits.zip │ ├── info.txt │ └── about.html ├── 57 │ ├── African_Accented_French.tar.gz │ ├── info.txt │ └── about.html ├── 58 │ ├── pansori-tedxkr-corpus-1.0.tar.gz │ └── info.txt ├── 59 │ ├── parlament_v1.0_clean.tar.gz │ ├── parlament_v1.0_other.tar.gz │ ├── info.txt │ └── about.html ├── 60 │ ├── dev-clean.tar.gz │ ├── dev-other.tar.gz │ ├── test-clean.tar.gz │ ├── test-other.tar.gz │ ├── train-clean-100.tar.gz │ ├── train-clean-360.tar.gz │ ├── train-other-500.tar.gz │ ├── 
info.txt │ └── about.html ├── 61 │ ├── LICENSE │ ├── es_ar_male.zip │ ├── es_ar_female.zip │ ├── line_index_male.tsv │ ├── es_weather_messages.zip │ ├── line_index_female.tsv │ ├── es_ar_line_index_weather.tsv │ ├── es_es_line_index_weather.tsv │ └── info.txt ├── 62 │ ├── aidatatang_200zh.tgz │ └── info.txt ├── 63 │ ├── LICENSE │ ├── ml_in_male.zip │ ├── ml_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 64 │ ├── LICENSE │ ├── line_index.tsv │ ├── mr_in_female.zip │ ├── info.txt │ └── about.html ├── 65 │ ├── LICENSE │ ├── ta_in_male.zip │ ├── ta_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 66 │ ├── LICENSE │ ├── te_in_male.zip │ ├── te_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 67 │ ├── tedx_spanish_corpus.tgz │ ├── info.txt │ └── about.html ├── 68 │ ├── dev_set.tar.gz │ ├── metadata.tar.gz │ ├── test_set.tar.gz │ ├── train_set.tar.gz │ └── info.txt ├── 69 │ ├── LICENSE │ ├── ca_es_male.zip │ ├── ca_es_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 70 │ ├── LICENSE │ ├── en_ng_male.zip │ ├── en_ng_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 71 │ ├── LICENSE │ ├── es_cl_male.zip │ ├── es_cl_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 72 │ ├── LICENSE │ ├── es_co_male.zip │ ├── es_co_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 73 │ ├── LICENSE │ ├── es_pe_male.zip │ ├── es_pe_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 74 │ ├── LICENSE │ ├── es_pr_female.zip │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 75 │ ├── LICENSE │ ├── es_ve_male.zip │ ├── es_ve_female.zip │ ├── line_index_male.tsv │ ├── 
line_index_female.tsv │ ├── info.txt │ └── about.html ├── 76 │ ├── LICENSE │ ├── eu_es_male.zip │ ├── eu_es_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 77 │ ├── LICENSE │ ├── gl_es_male.zip │ ├── gl_es_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 78 │ ├── LICENSE │ ├── gu_in_male.zip │ ├── gu_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 79 │ ├── LICENSE │ ├── kn_in_male.zip │ ├── kn_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 80 │ ├── LICENSE │ ├── my_mm_female.zip │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 81 │ ├── samples.tar.gz │ ├── info.txt │ └── about.html ├── 82 │ ├── cn-celeb.tgz │ └── info.txt ├── 83 │ ├── dialect_info.txt │ ├── line_index_all.csv │ ├── irish_english_male.zip │ ├── welsh_english_male.zip │ ├── midlands_english_male.zip │ ├── northern_english_male.zip │ ├── scottish_english_male.zip │ ├── southern_english_male.zip │ ├── welsh_english_female.zip │ ├── midlands_english_female.zip │ ├── northern_english_female.zip │ ├── scottish_english_female.zip │ ├── southern_english_female.zip │ └── info.txt ├── 84 │ ├── scribblelens.corpus.v1.2.zip │ └── info.txt ├── 85 │ ├── dev.tar.gz │ ├── test.tar.gz │ ├── train.tar.gz │ ├── test_v2.tar.gz │ ├── filename_mapping.tar.gz │ ├── info.txt │ └── about.html ├── 86 │ ├── LICENSE │ ├── yo_ng_male.zip │ ├── yo_ng_female.zip │ ├── annotation_info.txt │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 87 │ ├── mobvoi_hotword_dataset.tgz │ ├── mobvoi_hotword_dataset_resources.tgz │ ├── info.txt │ └── about.html ├── 89 │ ├── Yoloxochitl-Mixtec-Data.tgz │ └── info.txt ├── 90 │ ├── speechocean.zip │ ├── info.txt │ └── about.html └── 91 │ ├── speechoceanfreedata2.zip │ └── info.txt ├── favicon.ico ├── openslr.png ├── 
openslr_ico.png ├── config ├── OpenslrLogo.key └── README ├── robots.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#*\# 3 | 4 | -------------------------------------------------------------------------------- /resources/35/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/LICENSE -------------------------------------------------------------------------------- /resources/41/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/41/LICENSE -------------------------------------------------------------------------------- /resources/42/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/42/LICENSE -------------------------------------------------------------------------------- /resources/43/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/43/LICENSE -------------------------------------------------------------------------------- /resources/44/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/44/LICENSE -------------------------------------------------------------------------------- /resources/55/test.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/55/test.tgz -------------------------------------------------------------------------------- /resources/61/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/LICENSE -------------------------------------------------------------------------------- /resources/63/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/LICENSE 
-------------------------------------------------------------------------------- /resources/64/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/64/LICENSE -------------------------------------------------------------------------------- /resources/65/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/LICENSE -------------------------------------------------------------------------------- /resources/66/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/LICENSE -------------------------------------------------------------------------------- /resources/69/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/LICENSE -------------------------------------------------------------------------------- /resources/70/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/LICENSE -------------------------------------------------------------------------------- /resources/71/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/LICENSE -------------------------------------------------------------------------------- /resources/72/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/LICENSE -------------------------------------------------------------------------------- /resources/73/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/LICENSE -------------------------------------------------------------------------------- /resources/74/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/74/LICENSE 
-------------------------------------------------------------------------------- /resources/75/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/LICENSE -------------------------------------------------------------------------------- /resources/76/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/LICENSE -------------------------------------------------------------------------------- /resources/77/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/LICENSE -------------------------------------------------------------------------------- /resources/78/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/LICENSE -------------------------------------------------------------------------------- /resources/79/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/LICENSE -------------------------------------------------------------------------------- /resources/8/info.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/info.txt -------------------------------------------------------------------------------- /resources/80/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/80/LICENSE -------------------------------------------------------------------------------- /resources/86/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/LICENSE -------------------------------------------------------------------------------- /resources/12/md5sum.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/md5sum.txt 
-------------------------------------------------------------------------------- /resources/30/README.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/README.txt -------------------------------------------------------------------------------- /resources/30/si_lk.tar: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/si_lk.tar -------------------------------------------------------------------------------- /resources/37/bn_bd.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/37/bn_bd.zip -------------------------------------------------------------------------------- /resources/37/bn_in.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/37/bn_in.zip -------------------------------------------------------------------------------- /resources/55/train.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/55/train.tgz -------------------------------------------------------------------------------- /resources/56/splits.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/56/splits.zip -------------------------------------------------------------------------------- /resources/8/about.html: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/about.html -------------------------------------------------------------------------------- /resources/85/dev.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/dev.tar.gz -------------------------------------------------------------------------------- /resources/10/sre04_key.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/10/sre04_key.tgz 
-------------------------------------------------------------------------------- /resources/11/g2p-model-5: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/g2p-model-5 -------------------------------------------------------------------------------- /resources/13/RWCP.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/13/RWCP.tar.gz -------------------------------------------------------------------------------- /resources/14/beep.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/14/beep.tar.gz -------------------------------------------------------------------------------- /resources/17/musan.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/17/musan.tar.gz -------------------------------------------------------------------------------- /resources/18/resource.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/18/resource.tgz -------------------------------------------------------------------------------- /resources/23/lre07_key.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/23/lre07_key.txt -------------------------------------------------------------------------------- /resources/24/iban.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/24/iban.tar.gz -------------------------------------------------------------------------------- /resources/30/LICENSE.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/LICENSE.txt -------------------------------------------------------------------------------- /resources/30/si_lk.tar.gz: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/30/si_lk.tar.gz -------------------------------------------------------------------------------- /resources/32/af_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/af_za.tar.gz -------------------------------------------------------------------------------- /resources/32/st_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/st_za.tar.gz -------------------------------------------------------------------------------- /resources/32/tn_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/tn_za.tar.gz -------------------------------------------------------------------------------- /resources/32/xh_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/xh_za.tar.gz -------------------------------------------------------------------------------- /resources/49/vox1_meta.csv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/vox1_meta.csv -------------------------------------------------------------------------------- /resources/49/vox2_meta.csv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/vox2_meta.csv -------------------------------------------------------------------------------- /resources/8/lexicon-da.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/lexicon-da.tgz -------------------------------------------------------------------------------- /resources/82/cn-celeb.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/82/cn-celeb.tgz -------------------------------------------------------------------------------- /resources/85/test.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/85/test.tar.gz -------------------------------------------------------------------------------- /resources/85/train.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/train.tar.gz -------------------------------------------------------------------------------- /resources/11/3-gram.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/3-gram.arpa.gz -------------------------------------------------------------------------------- /resources/11/4-gram.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/4-gram.arpa.gz -------------------------------------------------------------------------------- /resources/11/librispeech-vocab.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/vocab.txt -------------------------------------------------------------------------------- /resources/16/headset.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/headset.tar.gz -------------------------------------------------------------------------------- /resources/18/test-noise.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/18/test-noise.tgz -------------------------------------------------------------------------------- /resources/22/resource.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/resource.tar.gz -------------------------------------------------------------------------------- /resources/26/sim_rir_16k.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/26/sim_rir_16k.zip 
-------------------------------------------------------------------------------- /resources/26/sim_rir_8k.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/26/sim_rir_8k.zip -------------------------------------------------------------------------------- /resources/28/rirs_noises.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/28/rirs_noises.zip -------------------------------------------------------------------------------- /resources/29/lexicon-sv.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/29/lexicon-sv.tgz -------------------------------------------------------------------------------- /resources/30/si_lk.lines.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/si_lk.lines.txt -------------------------------------------------------------------------------- /resources/34/santiago.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/34/santiago.tar.gz -------------------------------------------------------------------------------- /resources/41/jv_id_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/41/jv_id_male.zip -------------------------------------------------------------------------------- /resources/42/km_kh_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/42/km_kh_male.zip -------------------------------------------------------------------------------- /resources/44/su_id_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/44/su_id_male.zip -------------------------------------------------------------------------------- /resources/6/data_voip_cs.tgz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/6/data_voip_cs.tgz -------------------------------------------------------------------------------- /resources/6/data_voip_en.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/6/data_voip_en.tgz -------------------------------------------------------------------------------- /resources/61/es_ar_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_ar_male.zip -------------------------------------------------------------------------------- /resources/63/ml_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/ml_in_male.zip -------------------------------------------------------------------------------- /resources/64/line_index.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/64/line_index.tsv -------------------------------------------------------------------------------- /resources/65/ta_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/ta_in_male.zip -------------------------------------------------------------------------------- /resources/66/te_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/te_in_male.zip -------------------------------------------------------------------------------- /resources/68/dev_set.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/68/dev_set.tar.gz -------------------------------------------------------------------------------- /resources/68/metadata.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/68/metadata.tar.gz 
-------------------------------------------------------------------------------- /resources/68/test_set.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/68/test_set.tar.gz -------------------------------------------------------------------------------- /resources/69/ca_es_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/ca_es_male.zip -------------------------------------------------------------------------------- /resources/70/en_ng_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/en_ng_male.zip -------------------------------------------------------------------------------- /resources/71/es_cl_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/es_cl_male.zip -------------------------------------------------------------------------------- /resources/72/es_co_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/es_co_male.zip -------------------------------------------------------------------------------- /resources/73/es_pe_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/es_pe_male.zip -------------------------------------------------------------------------------- /resources/75/es_ve_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/es_ve_male.zip -------------------------------------------------------------------------------- /resources/76/eu_es_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/eu_es_male.zip -------------------------------------------------------------------------------- /resources/77/gl_es_male.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/77/gl_es_male.zip -------------------------------------------------------------------------------- /resources/78/gu_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/gu_in_male.zip -------------------------------------------------------------------------------- /resources/79/kn_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/kn_in_male.zip -------------------------------------------------------------------------------- /resources/81/samples.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/81/samples.tar.gz -------------------------------------------------------------------------------- /resources/85/test_v2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/test_v2.tar.gz -------------------------------------------------------------------------------- /resources/86/yo_ng_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/yo_ng_male.zip -------------------------------------------------------------------------------- /resources/9/wordlist.50k.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/9/wordlist.50k.gz -------------------------------------------------------------------------------- /resources/90/speechocean.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/90/speechocean.zip -------------------------------------------------------------------------------- /resources/1/waves_yesno.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./1/waves_yesno.tar.gz 
-------------------------------------------------------------------------------- /resources/12/dev-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/dev-clean.tar.gz -------------------------------------------------------------------------------- /resources/12/dev-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/dev-other.tar.gz -------------------------------------------------------------------------------- /resources/12/test-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/test-clean.tar.gz -------------------------------------------------------------------------------- /resources/12/test-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/test-other.tar.gz -------------------------------------------------------------------------------- /resources/15/speaker_list.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/15/speaker_list.tgz -------------------------------------------------------------------------------- /resources/16/Array1-01.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-01.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-02.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-02.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-03.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-03.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-04.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-04.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-05.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-05.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-06.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-06.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-07.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-07.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-08.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-08.tar.gz -------------------------------------------------------------------------------- /resources/18/data_thchs30.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/18/data_thchs30.tgz -------------------------------------------------------------------------------- /resources/22/test_noise.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/test_noise.tar.gz -------------------------------------------------------------------------------- /resources/31/dev-clean-2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/31/dev-clean-2.tar.gz -------------------------------------------------------------------------------- /resources/33/data_aishell.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/33/data_aishell.tgz 
-------------------------------------------------------------------------------- /resources/35/asr_javanese_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_0.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_1.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_1.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_2.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_3.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_4.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_5.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_6.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_7.zip -------------------------------------------------------------------------------- 
/resources/35/asr_javanese_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_8.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_9.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_9.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_a.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_b.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_c.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_d.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_e.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_f.zip -------------------------------------------------------------------------------- /resources/35/utt_spk_text.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/35/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/39/LDC2006S37.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/39/LDC2006S37.tar.gz -------------------------------------------------------------------------------- /resources/41/jv_id_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/41/jv_id_female.zip -------------------------------------------------------------------------------- /resources/43/ne_np_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/43/ne_np_female.zip -------------------------------------------------------------------------------- /resources/44/su_id_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/44/su_id_female.zip -------------------------------------------------------------------------------- /resources/49/voxceleb1_test.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/voxceleb1_test.txt -------------------------------------------------------------------------------- /resources/5/sw-ms98-dict.text: -------------------------------------------------------------------------------- 1 | /mnt/resources1/5/sw-ms98-dict.text -------------------------------------------------------------------------------- /resources/52/asr_sinhala_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_0.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_1.zip: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/52/asr_sinhala_1.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_2.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_3.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_4.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_5.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_6.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_7.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_8.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_9.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_9.zip 
-------------------------------------------------------------------------------- /resources/52/asr_sinhala_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_a.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_b.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_c.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_d.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_e.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_f.zip -------------------------------------------------------------------------------- /resources/52/utt_spk_text.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/53/asr_bengali_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_0.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_1.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_1.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_2.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_3.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_4.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_5.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_6.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_7.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_8.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_9.zip: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/53/asr_bengali_9.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_a.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_b.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_c.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_d.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_e.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_f.zip -------------------------------------------------------------------------------- /resources/53/utt_spk_text.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/54/asr_nepali_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_0.zip 
-------------------------------------------------------------------------------- /resources/54/asr_nepali_1.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_1.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_2.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_3.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_4.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_5.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_6.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_7.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_8.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_9.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_9.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_a.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_b.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_c.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_d.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_e.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_f.zip -------------------------------------------------------------------------------- /resources/54/utt_spk_text.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/60/dev-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/dev-clean.tar.gz 
-------------------------------------------------------------------------------- /resources/60/dev-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/dev-other.tar.gz -------------------------------------------------------------------------------- /resources/60/test-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/test-clean.tar.gz -------------------------------------------------------------------------------- /resources/60/test-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/test-other.tar.gz -------------------------------------------------------------------------------- /resources/61/es_ar_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_ar_female.zip -------------------------------------------------------------------------------- /resources/63/ml_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/ml_in_female.zip -------------------------------------------------------------------------------- /resources/64/mr_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/64/mr_in_female.zip -------------------------------------------------------------------------------- /resources/65/ta_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/ta_in_female.zip -------------------------------------------------------------------------------- /resources/66/te_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/te_in_female.zip -------------------------------------------------------------------------------- /resources/68/train_set.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/68/train_set.tar.gz -------------------------------------------------------------------------------- /resources/69/ca_es_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/ca_es_female.zip -------------------------------------------------------------------------------- /resources/70/en_ng_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/en_ng_female.zip -------------------------------------------------------------------------------- /resources/71/es_cl_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/es_cl_female.zip -------------------------------------------------------------------------------- /resources/72/es_co_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/es_co_female.zip -------------------------------------------------------------------------------- /resources/73/es_pe_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/es_pe_female.zip -------------------------------------------------------------------------------- /resources/74/es_pr_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/74/es_pr_female.zip -------------------------------------------------------------------------------- /resources/75/es_ve_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/es_ve_female.zip -------------------------------------------------------------------------------- /resources/76/eu_es_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/eu_es_female.zip 
-------------------------------------------------------------------------------- /resources/77/gl_es_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/gl_es_female.zip -------------------------------------------------------------------------------- /resources/78/gu_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/gu_in_female.zip -------------------------------------------------------------------------------- /resources/79/kn_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/kn_in_female.zip -------------------------------------------------------------------------------- /resources/80/my_mm_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/80/my_mm_female.zip -------------------------------------------------------------------------------- /resources/83/dialect_info.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/dialect_info.txt -------------------------------------------------------------------------------- /resources/83/line_index_all.csv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/line_index_all.csv -------------------------------------------------------------------------------- /resources/86/yo_ng_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/yo_ng_female.zip -------------------------------------------------------------------------------- /resources/10/sre04_key-v2.txt.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/10/sre04_key-v2.txt.gz -------------------------------------------------------------------------------- /resources/10/sre05-key-v7b.txt.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/10/sre05-key-v7b.txt.gz -------------------------------------------------------------------------------- /resources/12/original-mp3.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/original-mp3.tar.gz -------------------------------------------------------------------------------- /resources/12/raw-metadata.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/raw-metadata.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.3.2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./2/openfst-1.3.2.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.3.3.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./2/openfst-1.3.3.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.3.4.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./2/openfst-1.3.4.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.4.1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1//2/openfst-1.4.1.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.5.4.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1//2/openfst-1.5.4.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.6.2.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1//2/openfst-1.6.2.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.6.5.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/2/openfst-1.6.5.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.6.7.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/2/openfst-1.6.7.tar.gz -------------------------------------------------------------------------------- /resources/21/es_wordlist.json.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/21/es_wordlist.json.tgz -------------------------------------------------------------------------------- /resources/22/data_thuyg20.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/data_thuyg20.tar.gz -------------------------------------------------------------------------------- /resources/3/sph2pipe_v2.5.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./3/sph2pipe_v2.5.tar.gz -------------------------------------------------------------------------------- /resources/31/train-clean-5.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/31/train-clean-5.tar.gz -------------------------------------------------------------------------------- /resources/33/resource_aishell.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/33/resource_aishell.tgz -------------------------------------------------------------------------------- /resources/36/asr_sundanese_0.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_0.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_1.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_1.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_2.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_3.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_4.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_5.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_6.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_7.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_8.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_8.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_9.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_9.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_a.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_b.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_c.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_d.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_e.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_f.zip -------------------------------------------------------------------------------- /resources/40/zeroth_korean.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/40/zeroth_korean.tar.gz -------------------------------------------------------------------------------- /resources/46/Tunisian_MSA.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/46/Tunisian_MSA.tar.gz -------------------------------------------------------------------------------- /resources/61/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/line_index_male.tsv -------------------------------------------------------------------------------- /resources/62/aidatatang_200zh.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/62/aidatatang_200zh.tgz -------------------------------------------------------------------------------- /resources/63/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/line_index_male.tsv -------------------------------------------------------------------------------- /resources/65/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/line_index_male.tsv -------------------------------------------------------------------------------- /resources/66/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/line_index_male.tsv -------------------------------------------------------------------------------- /resources/69/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/line_index_male.tsv -------------------------------------------------------------------------------- /resources/70/line_index_male.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/70/line_index_male.tsv -------------------------------------------------------------------------------- /resources/71/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/line_index_male.tsv -------------------------------------------------------------------------------- /resources/72/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/line_index_male.tsv -------------------------------------------------------------------------------- /resources/73/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/line_index_male.tsv -------------------------------------------------------------------------------- /resources/75/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/line_index_male.tsv -------------------------------------------------------------------------------- /resources/76/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/line_index_male.tsv -------------------------------------------------------------------------------- /resources/77/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/line_index_male.tsv -------------------------------------------------------------------------------- /resources/78/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/line_index_male.tsv -------------------------------------------------------------------------------- /resources/79/line_index_male.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/79/line_index_male.tsv -------------------------------------------------------------------------------- /resources/8/lexicon-da-nonorm.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/lexicon-da-nonorm.tgz -------------------------------------------------------------------------------- /resources/86/annotation_info.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/annotation_info.txt -------------------------------------------------------------------------------- /resources/86/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/line_index_male.tsv -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/favicon.ico -------------------------------------------------------------------------------- /openslr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/openslr.png -------------------------------------------------------------------------------- /resources/12/original-books.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/original-books.tar.gz -------------------------------------------------------------------------------- /resources/12/train-clean-100.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/train-clean-100.tar.gz -------------------------------------------------------------------------------- /resources/12/train-clean-360.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/12/train-clean-360.tar.gz -------------------------------------------------------------------------------- /resources/12/train-other-500.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/train-other-500.tar.gz -------------------------------------------------------------------------------- /resources/16/ami_manual_1.6.1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/ami_manual_1.6.1.tar.gz -------------------------------------------------------------------------------- /resources/19/TEDLIUM_release2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/19/TEDLIUM_release2.tar.gz -------------------------------------------------------------------------------- /resources/22/data_thuyg20_sre.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/data_thuyg20_sre.tar.gz -------------------------------------------------------------------------------- /resources/22/test_noise_sre.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/test_noise_sre.tar.gz -------------------------------------------------------------------------------- /resources/27/cantab-TEDLIUM.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/27/cantab-TEDLIUM.tar.bz2 -------------------------------------------------------------------------------- /resources/48/madcat.dev.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/48/madcat.dev.raw.lineid -------------------------------------------------------------------------------- /resources/48/madcat.test.raw.lineid: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/48/madcat.test.raw.lineid -------------------------------------------------------------------------------- /resources/48/madcat.train.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/48/madcat.train.raw.lineid -------------------------------------------------------------------------------- /resources/49/voxceleb1_test_v2.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/voxceleb1_test_v2.txt -------------------------------------------------------------------------------- /resources/50/madcat.dev.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/50/madcat.dev.raw.lineid -------------------------------------------------------------------------------- /resources/50/madcat.test.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/50/madcat.test.raw.lineid -------------------------------------------------------------------------------- /resources/50/madcat.train.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/50/madcat.train.raw.lineid -------------------------------------------------------------------------------- /resources/51/TEDLIUM_release-3.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/51/TEDLIUM_release-3.tgz -------------------------------------------------------------------------------- /resources/60/train-clean-100.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/train-clean-100.tar.gz -------------------------------------------------------------------------------- /resources/60/train-clean-360.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/60/train-clean-360.tar.gz -------------------------------------------------------------------------------- /resources/60/train-other-500.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/train-other-500.tar.gz -------------------------------------------------------------------------------- /resources/61/es_weather_messages.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_weather_messages.zip -------------------------------------------------------------------------------- /resources/61/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/line_index_female.tsv -------------------------------------------------------------------------------- /resources/63/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/line_index_female.tsv -------------------------------------------------------------------------------- /resources/65/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/line_index_female.tsv -------------------------------------------------------------------------------- /resources/66/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/line_index_female.tsv -------------------------------------------------------------------------------- /resources/67/tedx_spanish_corpus.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/67/tedx_spanish_corpus.tgz -------------------------------------------------------------------------------- /resources/69/line_index_female.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/69/line_index_female.tsv -------------------------------------------------------------------------------- /resources/7/TEDLIUM_release1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/7/TEDLIUM_release1.tar.gz -------------------------------------------------------------------------------- /resources/70/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/line_index_female.tsv -------------------------------------------------------------------------------- /resources/71/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/line_index_female.tsv -------------------------------------------------------------------------------- /resources/72/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/line_index_female.tsv -------------------------------------------------------------------------------- /resources/73/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/line_index_female.tsv -------------------------------------------------------------------------------- /resources/74/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/74/line_index_female.tsv -------------------------------------------------------------------------------- /resources/75/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/line_index_female.tsv -------------------------------------------------------------------------------- /resources/76/line_index_female.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/76/line_index_female.tsv -------------------------------------------------------------------------------- /resources/77/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/line_index_female.tsv -------------------------------------------------------------------------------- /resources/78/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/line_index_female.tsv -------------------------------------------------------------------------------- /resources/79/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/line_index_female.tsv -------------------------------------------------------------------------------- /resources/80/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/80/line_index_female.tsv -------------------------------------------------------------------------------- /resources/83/irish_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/irish_english_male.zip -------------------------------------------------------------------------------- /resources/83/welsh_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/welsh_english_male.zip -------------------------------------------------------------------------------- /resources/85/filename_mapping.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/filename_mapping.tar.gz -------------------------------------------------------------------------------- /resources/86/line_index_female.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/86/line_index_female.tsv -------------------------------------------------------------------------------- /openslr_ico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/openslr_ico.png -------------------------------------------------------------------------------- /resources/11/librispeech-lm-corpus.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/librispeech-lm-corpus.tgz -------------------------------------------------------------------------------- /resources/12/intro-disclaimers.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/intro-disclaimers.tar.gz -------------------------------------------------------------------------------- /resources/45/ST-AEDS-20180100_1-OS.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/45/ST-AEDS-20180100_1-OS.tgz -------------------------------------------------------------------------------- /resources/83/midlands_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/midlands_english_male.zip -------------------------------------------------------------------------------- /resources/83/northern_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/northern_english_male.zip -------------------------------------------------------------------------------- /resources/83/scottish_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/scottish_english_male.zip -------------------------------------------------------------------------------- 
/resources/83/southern_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/southern_english_male.zip -------------------------------------------------------------------------------- /resources/83/welsh_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/welsh_english_female.zip -------------------------------------------------------------------------------- /resources/91/speechoceanfreedata2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/91/speechoceanfreedata2.zip -------------------------------------------------------------------------------- /resources/11/3-gram.pruned.1e-7.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/3-gram.pruned.1e-7.arpa.gz -------------------------------------------------------------------------------- /resources/11/3-gram.pruned.3e-7.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/3-gram.pruned.3e-7.arpa.gz -------------------------------------------------------------------------------- /resources/11/librispeech-lexicon.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/librispeech-lexicon-complete.txt -------------------------------------------------------------------------------- /resources/11/librispeech-lm-norm.txt.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/librispeech-lm-normtext.txt.gz -------------------------------------------------------------------------------- /resources/20/air_database_release_1_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/20/air_database_release_1_4.zip 
-------------------------------------------------------------------------------- /resources/25/data_readspeech_am.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/25/data_readspeech_am.tar.bz2 -------------------------------------------------------------------------------- /resources/25/data_readspeech_wo.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/25/data_readspeech_wo.tar.bz2 -------------------------------------------------------------------------------- /resources/38/ST-CMDS-20170001_1-OS.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/38/ST-CMDS-20170001_1-OS.tar.gz -------------------------------------------------------------------------------- /resources/49/voxceleb1_sitw_overlap.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/voxceleb1_sitw_overlap.txt -------------------------------------------------------------------------------- /resources/59/parlament_v1.0_clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/59/parlament_v1.0_clean.tar.gz -------------------------------------------------------------------------------- /resources/59/parlament_v1.0_other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/59/parlament_v1.0_other.tar.gz -------------------------------------------------------------------------------- /resources/61/es_ar_line_index_weather.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_ar_line_index_weather.tsv -------------------------------------------------------------------------------- /resources/61/es_es_line_index_weather.tsv: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/61/es_es_line_index_weather.tsv -------------------------------------------------------------------------------- /resources/83/midlands_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/midlands_english_female.zip -------------------------------------------------------------------------------- /resources/83/northern_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/northern_english_female.zip -------------------------------------------------------------------------------- /resources/83/scottish_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/scottish_english_female.zip -------------------------------------------------------------------------------- /resources/83/southern_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/southern_english_female.zip -------------------------------------------------------------------------------- /resources/84/scribblelens.corpus.v1.2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/84/scribblelens.corpus.v1.2.zip -------------------------------------------------------------------------------- /resources/87/mobvoi_hotword_dataset.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/87/mobvoi_hotword_dataset.tgz -------------------------------------------------------------------------------- /resources/89/Yoloxochitl-Mixtec-Data.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/89/Yoloxochitl-Mixtec-Data.tgz -------------------------------------------------------------------------------- /resources/9/about.html: 
-------------------------------------------------------------------------------- 1 | This data is downloaded and used by the Kaldi AMI recipe. 2 | -------------------------------------------------------------------------------- /resources/25/data_broadcastnews_sw.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/25/data_broadcastnews_sw.tar.bz2 -------------------------------------------------------------------------------- /resources/27/cantab-TEDLIUM-partial.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/27/cantab-TEDLIUM-partial.tar.bz2 -------------------------------------------------------------------------------- /resources/47/primewords_md_2018_set1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/47/primewords_md_2018_set1.tar.gz -------------------------------------------------------------------------------- /resources/57/African_Accented_French.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/57/African_Accented_French.tar.gz -------------------------------------------------------------------------------- /config/OpenslrLogo.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/config/OpenslrLogo.key -------------------------------------------------------------------------------- /resources/10/about.html: -------------------------------------------------------------------------------- 1 | These files define the tests for some of NIST's SRE evaluations. 2 | 3 | -------------------------------------------------------------------------------- /resources/31/about.html: -------------------------------------------------------------------------------- 1 | A subset of LibriSpeech created for the purpose of regression testing. 
2 | -------------------------------------------------------------------------------- /resources/4/sctk-2.4.0-20091110-0958.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./4/sctk-2.4.0-20091110-0958.tar.bz2 -------------------------------------------------------------------------------- /resources/4/sctk-2.4.0-20091110-0958.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./4/sctk-2.4.0-20091110-0958.tar.gz -------------------------------------------------------------------------------- /resources/4/sctk-2.4.10-20151007-1312Z.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/4/sctk-2.4.10-20151007-1312Z.tar.bz2 -------------------------------------------------------------------------------- /resources/4/sctk-2.4.8-20130429-2145.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/4/sctk-2.4.8-20130429-2145.tar.bz2 -------------------------------------------------------------------------------- /resources/4/sctk-2.4.9-20141015-1634Z.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/4/sctk-2.4.9-20141015-1634Z.tar.bz2 -------------------------------------------------------------------------------- /resources/5/switchboard_word_alignments.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/5/switchboard_word_alignments.tar.gz -------------------------------------------------------------------------------- /resources/58/pansori-tedxkr-corpus-1.0.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/58/pansori-tedxkr-corpus-1.0.tar.gz -------------------------------------------------------------------------------- /resources/87/mobvoi_hotword_dataset_resources.tgz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/87/mobvoi_hotword_dataset_resources.tgz -------------------------------------------------------------------------------- /resources/10/sre2000-key.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/resources/10/sre2000-key.tar.gz -------------------------------------------------------------------------------- /resources/49/about.html: -------------------------------------------------------------------------------- 1 | This resource contains files for the VoxCeleb corpora that are helpful in speaker recognition recipes. 2 | -------------------------------------------------------------------------------- /resources/31/md5sum.txt: -------------------------------------------------------------------------------- 1 | 6d7ab67ac6a1d2c993d050e16d61080d dev-clean-2.tar.gz 2 | 5df7d4e78065366204ca6845bb08f490 train-clean-5.tar.gz 3 | -------------------------------------------------------------------------------- /robots.txt: -------------------------------------------------------------------------------- 1 | # robots.txt generated at http://www.mcanerin.com 2 | User-agent: * 3 | Disallow: 4 | Crawl-delay: 5 5 | Disallow: /cgi-bin/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | openslr 2 | ======= 3 | 4 | Repository for the web pages and scripts associated with OpenSLR: the open speech and language repository 5 | -------------------------------------------------------------------------------- /resources/46/info.txt: -------------------------------------------------------------------------------- 1 | name: Tunisian_MSA 2 | summary: Tunisian Modern Standard Arabic 3 | category: speech 4 | license: Apache 2.0 5 | file: Tunisian_MSA.tar.gz Data 6 | 
-------------------------------------------------------------------------------- /resources/23/about.html: -------------------------------------------------------------------------------- 1 | This file contains metadata for the NIST LRE 2007 dataset. It was originally 2 | available at http://www.itl.nist.gov/iad/mig/tests/lang/2007/lid07key_v5.txt. 3 | -------------------------------------------------------------------------------- /resources/17/info.txt: -------------------------------------------------------------------------------- 1 | name: MUSAN 2 | summary: A corpus of music, speech, and noise 3 | category: audio 4 | license: Attribution 4.0 International (CC BY 4.0) 5 | file: musan.tar.gz The corpus 6 | -------------------------------------------------------------------------------- /resources/29/info.txt: -------------------------------------------------------------------------------- 1 | name: Sprakbanken_Swe 2 | summary: Swedish pronunciation dictionary 3 | category: text 4 | license: Creative Commons ZERO (CC-ZERO) 5 | file: lexicon-sv.tgz Lexicon 6 | 7 | -------------------------------------------------------------------------------- /resources/34/info.txt: -------------------------------------------------------------------------------- 1 | name: Santiago Spanish Lexicon 2 | summary: A pronouncing dictionary for the Spanish language. 3 | category: text 4 | license: apache 2.0 5 | file: santiago.tar.gz lexicon 6 | 7 | -------------------------------------------------------------------------------- /resources/56/info.txt: -------------------------------------------------------------------------------- 1 | name: IAM Aachen splits 2 | summary: Aachen data splits (train/test/val) for the IAM dataset. 
3 | category: Other 4 | license: n/a 5 | file: splits.zip train/test/val splits 6 | 7 | -------------------------------------------------------------------------------- /resources/57/info.txt: -------------------------------------------------------------------------------- 1 | name: African Accented French 2 | summary: Recordings of African Accented French speech. 3 | category: speech 4 | license: Apache 2.0 5 | file: African_Accented_French.tar.gz The whole corpus 6 | -------------------------------------------------------------------------------- /resources/9/info.txt: -------------------------------------------------------------------------------- 1 | name: The AMI pack 2 | summary: Some auxiliary non-speech data used to build AMI systems with Kaldi 3 | category: text 4 | license: TBD 5 | file: wordlist.50k.gz predefined 50k words vocabulary 6 | -------------------------------------------------------------------------------- /resources/24/info.txt: -------------------------------------------------------------------------------- 1 | name: Iban 2 | summary: Iban language text and speech corpora for ASR 3 | category: speech 4 | license: Attribution-ShareAlike 2.0 Generic (CC BY-SA 2.0) 5 | file: iban.tar.gz Iban language corpora 6 | 7 | -------------------------------------------------------------------------------- /resources/39/info.txt: -------------------------------------------------------------------------------- 1 | name: Heroico 2 | summary: Spanish data, mirrored from the LDC 3 | category: speech 4 | license: apache 2.0 5 | file: LDC2006S37.tar.gz Speech and transcripts 6 | alternate_url: https://catalog.ldc.upenn.edu/LDC2006S37 7 | -------------------------------------------------------------------------------- /resources/81/info.txt: -------------------------------------------------------------------------------- 1 | name: Small Audio Clips 2 | summary: Contains 20 one-second audio clips from various sources, for testing compression algorithms 3 | 
category: speech 4 | license: CC BY 4.0 5 | file: samples.tar.gz Archive containing audio samples 6 | -------------------------------------------------------------------------------- /resources/20/about.html: -------------------------------------------------------------------------------- 1 | 2 | Please see the original project page http://www.iks.rwth-aachen.de/en/research/tools-downloads/aachen-impulse-response-database/ for more information. 3 | We are mirroring it here as a backup just in case the original site goes down. 4 | -------------------------------------------------------------------------------- /resources/51/info.txt: -------------------------------------------------------------------------------- 1 | name: TED-LIUM Release 3 2 | summary: TED-LIUM corpus release 3 3 | category: speech 4 | license: Creative Commons BY-NC-ND 3.0 5 | file: TEDLIUM_release-3.tgz (data) 6 | alternate_url: https://lium.univ-lemans.fr/download/ted-lium_release3 7 | -------------------------------------------------------------------------------- /resources/67/info.txt: -------------------------------------------------------------------------------- 1 | name: TEDx Spanish Corpus 2 | summary: Spanish data taken from the TEDx Talks 3 | category: speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: tedx_spanish_corpus.tgz Spanish speech and transcripts 6 | -------------------------------------------------------------------------------- /resources/89/info.txt: -------------------------------------------------------------------------------- 1 | name: Yoloxóchitl-Mixtec 2 | summary: Yolóxochitl Mixtec Speech with Transcription 3 | category: speech 4 | license: Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0 US) 5 | file: Yoloxochitl-Mixtec-Data.tgz Yolóxochitl Mixtec Speech and Transcription 6 | -------------------------------------------------------------------------------- /resources/55/info.txt:
-------------------------------------------------------------------------------- 1 | name: CLMAD 2 | summary: A Chinese Language Model Adaptation Dataset (CLMAD). 3 | category: text 4 | license: Creative Commons BY-NC-ND 3.0 (attribution/non-commercial/no-derivatives). 5 | file: train.tgz Training set 6 | file: test.tgz Testing set 7 | 8 | -------------------------------------------------------------------------------- /resources/82/info.txt: -------------------------------------------------------------------------------- 1 | name: CN-Celeb 2 | summary: A Free Chinese Speaker Recognition Corpus Released by CSLT@Tsinghua University 3 | category: Speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: cn-celeb.tgz Audios with speaker ids for training and evaluation -------------------------------------------------------------------------------- /resources/15/info.txt: -------------------------------------------------------------------------------- 1 | name: SRE Speaker List 2 | summary: A list linking speakers across NIST SRE corpora 3 | category: Misc 4 | license: Not copyrighted (derived from a work prepared by a US government employee in the course of their official duties) 5 | file: speaker_list.tgz 6 | 7 | -------------------------------------------------------------------------------- /resources/2/about.html: -------------------------------------------------------------------------------- 1 | 2 | This resource is a mirror of the OpenFST toolkit, whose 3 | primary home is at www.openfst.org . 4 | We mirror it here in order to provide a failover location when 5 | OpenFST is unavailable from its primary server. 6 | -------------------------------------------------------------------------------- /resources/42/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Khmer.
2 | summary: Multi-speaker TTS data for Khmer (km-KH) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: km_kh_male.zip Khmer data from male speakers 6 | file: LICENSE License information 7 | -------------------------------------------------------------------------------- /resources/3/info.txt: -------------------------------------------------------------------------------- 1 | name: sph2pipe 2 | summary: A mirror of the sph2pipe software 3 | category: software 4 | license: One-off 5 | file: sph2pipe_v2.5.tar.gz Version 2.5, probably the last one. 6 | alternate_url: ftp://ftp.ldc.upenn.edu/pub/ldc/misc_sw/sph2pipe_v2.5.tar.gz Used to work 7 | -------------------------------------------------------------------------------- /resources/26/info.txt: -------------------------------------------------------------------------------- 1 | name: Simulated Room Impulse Response Database 2 | summary: A database of simulated room impulse responses 3 | category: Audio 4 | license: Apache 2.0 5 | file: sim_rir_8k.zip The database in 8k sampling rate 6 | file: sim_rir_16k.zip The database in 16k sampling rate 7 | -------------------------------------------------------------------------------- /resources/43/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Nepali. 
2 | summary: Multi-speaker TTS data for Nepali (ne-NP) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: ne_np_female.zip Nepali data from female speakers 6 | file: LICENSE License information 7 | -------------------------------------------------------------------------------- /resources/23/info.txt: -------------------------------------------------------------------------------- 1 | name: NIST LRE 2007 Key 2 | summary: A file containing metadata for the utterances in the LRE 2007 evaluation 3 | category: Misc 4 | license: Not copyrighted (derived from a work prepared by a US government employee in the course of their official duties) 5 | file: lre07_key.txt 6 | -------------------------------------------------------------------------------- /resources/21/info.txt: -------------------------------------------------------------------------------- 1 | name: Spanish Word list 2 | summary: A list of words in Spanish with frequency derived from a large corpus (Spanish Gigaword). 3 | category: text 4 | license: Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0 US) 5 | file: es_wordlist.json.tgz JSON file containing words with frequency. 6 | -------------------------------------------------------------------------------- /resources/84/info.txt: -------------------------------------------------------------------------------- 1 | name: ScribbleLens 2 | summary: Dutch cursive, 16-18th century handwritings, pages and lines, for (un)supervised AI and other research. 3 | category: Handwriting 4 | license: CC-BY-NC-ND (details in LICENSE.txt) 5 | file: scribblelens.corpus.v1.2.zip Dutch historical handwritings 6 | 7 | -------------------------------------------------------------------------------- /resources/15/about.html: -------------------------------------------------------------------------------- 1 | This data contains a list which links speakers across various NIST SRE datasets. 
2 | It is derived from information distributed with the 2013 CLSP Speaker and Language 3 | Recognition workshop. 4 | 5 | -------------------------------------------------------------------------------- /resources/13/info.txt: -------------------------------------------------------------------------------- 1 | name: RWCP Sound Scene Database 2 | summary: A database of recordings of real-world sounds and measured room impulse responses 3 | category: Speech + Software 4 | license: Research and development use only 5 | file: RWCP.tar.gz 6 | alternate_url: http://research.nii.ac.jp/src/en/RWCP-SSD.html 7 | -------------------------------------------------------------------------------- /resources/27/info.txt: -------------------------------------------------------------------------------- 1 | name: Cantab-TEDLIUM Release 1.1 (February 2015) 2 | summary: Cantab Research Language models for the TEDLIUM database 3 | category: text 4 | license: unspecified 5 | file: cantab-TEDLIUM.tar.bz2 Original archive 6 | file: cantab-TEDLIUM-partial.tar.bz2 Partial archive for Kaldi TEDLIUM recipe 7 | -------------------------------------------------------------------------------- /resources/48/info.txt: -------------------------------------------------------------------------------- 1 | name: MADCAT Arabic data splits 2 | summary: Unofficial data splits (dev/train/test) for the MADCAT Arabic LDC corpus 3 | category: other 4 | license: Apache 2.0 5 | file: madcat.dev.raw.lineid dev set 6 | file: madcat.test.raw.lineid test set 7 | file: madcat.train.raw.lineid train set 8 | 9 | -------------------------------------------------------------------------------- /resources/50/info.txt: -------------------------------------------------------------------------------- 1 | name: MADCAT Chinese data splits 2 | summary: Unofficial data splits (dev/train/test) for the MADCAT Chinese LDC corpus 3 | category: other 4 | license: Apache 2.0 5 | file: madcat.dev.raw.lineid dev set 6 | file: 
madcat.test.raw.lineid test set 7 | file: madcat.train.raw.lineid train set 8 | 9 | -------------------------------------------------------------------------------- /resources/14/info.txt: -------------------------------------------------------------------------------- 1 | name: BEEP Dictionary 2 | summary: Phonemic transcriptions of over 250,000 English words. (British English pronunciations) 3 | category: Text 4 | license: Research and development use only 5 | file: beep.tar.gz 6 | alternate_url: http://svr-www.eng.cam.ac.uk/comp.speech/Section1/Lexical/beep.html 7 | -------------------------------------------------------------------------------- /resources/4/about.html: -------------------------------------------------------------------------------- 1 | This resource is a mirror of NIST's sctk speech-recognition scoring software, which 2 | is originally available from here . 3 | This is public-domain and not subject to copyright, since it was written by 4 | US government employees. 5 | -------------------------------------------------------------------------------- /resources/90/info.txt: -------------------------------------------------------------------------------- 1 | name: Speechocean 10 Hours Chinese Mandarin Speech Recognition Corpus 2 | summary: Free 10.33 Hours Chinese Mandarin Speech Recognition Corpus Provided by Speechocean 3 | category: Speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: speechocean.zip Corpus 6 | -------------------------------------------------------------------------------- /resources/28/info.txt: -------------------------------------------------------------------------------- 1 | name: Room Impulse Response and Noise Database 2 | summary: A database of simulated and real room impulse responses, isotropic and point-source noises. The audio files in this data are all in 16k sampling rate and 16-bit precision. 
3 | category: Audio 4 | license: Apache 2.0 5 | file: rirs_noises.zip The database 6 | -------------------------------------------------------------------------------- /resources/31/info.txt: -------------------------------------------------------------------------------- 1 | name: Mini LibriSpeech ASR corpus 2 | summary: Subset of LibriSpeech corpus for purpose of regression testing 3 | category: speech 4 | license: CC BY 4.0 5 | file: dev-clean-2.tar.gz development set, "clean" speech 6 | file: train-clean-5.tar.gz training set, "clean" speech 7 | file: md5sum.txt md5 checksums of files 8 | -------------------------------------------------------------------------------- /resources/30/info.txt: -------------------------------------------------------------------------------- 1 | name: Sinhala TTS 2 | summary: Sinhalese multi-speaker TTS corpora 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) 5 | file: si_lk.tar.gz Audio files 6 | file: si_lk.lines.txt Transcription of the audio 7 | file: README.txt Additional readme 8 | file: LICENSE.txt Licensing information 9 | -------------------------------------------------------------------------------- /resources/41/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Javanese. 2 | summary: Multi-speaker TTS data for Javanese (jv-ID) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: jv_id_female.zip Javanese data from female speakers 6 | file: jv_id_male.zip Javanese data from male speakers 7 | file: LICENSE License information 8 | -------------------------------------------------------------------------------- /resources/44/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Sundanese.
2 | summary: Multi-speaker TTS data for Sundanese (su-ID) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: su_id_female.zip Sundanese data from female speakers 6 | file: su_id_male.zip Sundanese data from male speakers 7 | file: LICENSE License information 8 | -------------------------------------------------------------------------------- /resources/87/info.txt: -------------------------------------------------------------------------------- 1 | name: MobvoiHotwords 2 | summary: Chinese hotwords detection dataset, provided by Mobvoi CO.,LTD 3 | category: Speech 4 | license: Apache License v.2.0 5 | file: mobvoi_hotword_dataset.tgz Wave files of keyword and non-keyword data 6 | file: mobvoi_hotword_dataset_resources.tgz Label, speaker and channel information of above wave files 7 | -------------------------------------------------------------------------------- /resources/1/info.txt: -------------------------------------------------------------------------------- 1 | name: Yesno 2 | summary: Sixty recordings of one individual saying yes or no in Hebrew; each recording is eight words long. 3 | category: speech 4 | license: No formal license but free to use for any purpose. 5 | file: waves_yesno.tar.gz This is the entire dataset. 6 | alternate_url: http://sourceforge.net/projects/kaldi/files/waves_yesno.tar.gz 7 | -------------------------------------------------------------------------------- /resources/3/about.html: -------------------------------------------------------------------------------- 1 | This resource is a mirror of LDC's sph2pipe software, which 2 | used to be available from here . 3 | The license (available in 0readme.1st in the archive) only permits 4 | using it to read sphere files, but since that is the purpose of the 5 | program it should not be a problem.
6 | -------------------------------------------------------------------------------- /resources/11/about.html: -------------------------------------------------------------------------------- 1 | Language modeling resources to be used in conjunction with the (soon-to-be-released) LibriSpeech ASR corpus. 2 |

3 | This corpus and these resources were prepared by Vassil Panayotov with 4 | the assistance of Daniel Povey and Sanjeev Khudanpur. We hope to finalize 5 | this and release the corpus here by the ICASSP deadline (early October 2014). 6 | 7 | -------------------------------------------------------------------------------- /resources/47/info.txt: -------------------------------------------------------------------------------- 1 | name: Primewords Chinese Corpus Set 1 2 | summary: Chinese Mandarin corpus released by Shanghai Primewords Co. Ltd. (www.primewords.cn), containing 100 hours of speech data. 3 | category: speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: primewords_md_2018_set1.tar.gz speech data and transcripts 6 | -------------------------------------------------------------------------------- /resources/19/info.txt: -------------------------------------------------------------------------------- 1 | name: TED-LIUMv2 2 | summary: TED-LIUM corpus release 2, English speech recognition training corpus from TED talks, created by Laboratoire d’Informatique de l’Université du Maine (LIUM) (mirrored here) 3 | category: audio 4 | license: Creative Commons BY-NC-ND 3.0 (http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en) 5 | file: TEDLIUM_release2.tar.gz The corpus 6 | -------------------------------------------------------------------------------- /resources/58/info.txt: -------------------------------------------------------------------------------- 1 | name: Pansori-TEDxKR 2 | summary: Korean speech corpus generated from Korean language TEDx talks 3 | category: speech 4 | license: Creative Commons BY-NC-ND 4.0 (attribution/non-commercial/no-derivatives) 5 | file: pansori-tedxkr-corpus-1.0.tar.gz Korean speech and transcripts 6 | alternate_url: https://storage.googleapis.com/pansori/corpus/pansori-tedxkr-corpus-1.0.tar.gz 7 |
-------------------------------------------------------------------------------- /resources/33/info.txt: -------------------------------------------------------------------------------- 1 | name: Aishell 2 | summary: Mandarin data, provided by Beijing Shell Shell Technology Co.,Ltd 3 | category: speech 4 | license: Apache License v.2.0 5 | file: data_aishell.tgz speech data and transcripts 6 | file: resource_aishell.tgz supplementary resources, incl. lexicon, speaker info 7 | alternate_url: http://www.aishelltech.com/kysjcp Full description from the company website 8 | -------------------------------------------------------------------------------- /resources/26/about.html: -------------------------------------------------------------------------------- 1 | This data includes simulated room impulse responses with various room configs. 2 | It is intended for use when comparing the performance 3 | of acoustic models trained with data reverberated with real and simulated 4 | impulse responses. 5 | They were used in our paper 6 | "A Study on Data Augmentation of Reverberant Speech for Robust Speech Recognition" 7 | submitted to ICASSP 2017 8 |

9 | -------------------------------------------------------------------------------- /resources/37/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Bengali languages 2 | summary: Multi-speaker TTS data for Bangladesh Bengali (bn-BD) and Indian Bengali (bn-IN). 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: bn_bd.zip Bangladesh Bengali data 6 | file: bn_in.zip Indian Bengali data 7 | file: README.txt Information about the data 8 | file: LICENSE.txt License information 9 | -------------------------------------------------------------------------------- /resources/91/info.txt: -------------------------------------------------------------------------------- 1 | name: Free English Corpus and Language Challenge -- Speechocean 2 | summary: A free 8.2 hours English speech recognition corpus provided by speechocean and an oriental language recognition challenge co-organized by speechocean and Tsinghua University.
3 | category: Speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: speechoceanfreedata2.zip Corpus 6 | -------------------------------------------------------------------------------- /resources/45/info.txt: -------------------------------------------------------------------------------- 1 | name: Free ST American English Corpus 2 | summary: A free American English corpus by Surfingtech (www.surfing.ai), containing utterances from 10 speakers; each speaker has about 350 utterances. 3 | category: speech 4 | license: Creative Commons BY-NC-ND 4.0 (Attribution-NonCommercial-NoDerivatives 4.0 International) 5 | file: ST-AEDS-20180100_1-OS.tgz speech audios and transcripts 6 | alternate_url: https://www.surfing.ai 7 | -------------------------------------------------------------------------------- /resources/7/info.txt: -------------------------------------------------------------------------------- 1 | name: TED-LIUM 2 | summary: English speech recognition training corpus from TED talks, created by Laboratoire d’Informatique de l’Université du Maine (LIUM) (mirrored here) 3 | category: speech 4 | license: Creative Commons BY-NC-ND 3.0 (attribution/non-commercial/no-derivatives). 5 | file: TEDLIUM_release1.tar.gz The first release 6 | alternate_url: http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus Original source 7 | -------------------------------------------------------------------------------- /resources/62/info.txt: -------------------------------------------------------------------------------- 1 | name: aidatatang_200zh 2 | summary: A Chinese Mandarin speech corpus by Beijing DataTang Technology Co., Ltd, containing 200 hours of speech data from 600 speakers. The transcription accuracy for each sentence is larger than 98%.
3 | category: speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: aidatatang_200zh.tgz Corpus 6 | alternate_url: https://www.datatang.com/webfront/opensource.html 7 | 8 | -------------------------------------------------------------------------------- /config/README: -------------------------------------------------------------------------------- 1 | This directory is used to store the Apache config file "openslr"; we made a soft link like this: 2 | cd /etc/apache2/sites-available 3 | ln -s /var/www/openslr/config/openslr . 4 | This is done so that we can put it in the same git repository as the html and php of the site's 5 | contents. 6 | (note: for reasons of size, we can't do the same for the directory "/var/www/openslr/resources/" 7 | where the actual resources we export are kept.) 8 | 9 | I'm also adding openslr.key 10 | -------------------------------------------------------------------------------- /resources/38/info.txt: -------------------------------------------------------------------------------- 1 | root@www:/var/www/openslr# cat /var/www/openslr/resources/6/info.txt 2 | name: Free ST Chinese Mandarin Corpus 3 | summary: A free Chinese Mandarin corpus by Surfingtech (www.surfing.ai), containing utterances from 855 speakers, 102600 utterances; 4 | category: speech 5 | license: Creative Commons BY-NC-ND 4.0 (Attribution-NonCommercial-NoDerivatives 4.0 International) 6 | file: ST-CMDS-20170001_1-OS.tar.gz speech audios and transcripts 7 | alternate_url: https://www.surfing.ai 8 | -------------------------------------------------------------------------------- /resources/40/info.txt: -------------------------------------------------------------------------------- 1 | name: Zeroth-Korean 2 | summary: Korean Open-source Speech Corpus for Speech Recognition by Zeroth Project (https://github.com/goodatlas/zeroth) 3 | category: Speech Corpus for Automatic Speech Recognition 4 | license: Attribution 4.0
International (CC BY 4.0) 5 | file: zeroth_korean.tar.gz Korean Speech data, transcription, lexicon and language model 6 | alternate_url: https://storage.googleapis.com/zeroth_project/zeroth_korean.tar.gz Korean Speech data, transcription and language model 7 | -------------------------------------------------------------------------------- /resources/20/info.txt: -------------------------------------------------------------------------------- 1 | name: Aachen Impulse Response Database 2 | summary: Aachen Impulse Response database (AIR): a database of room impulse responses (mirrored here) 3 | category: audio 4 | license: Not stated in the download. 5 | alternate_url: https://www2.iks.rwth-aachen.de/air/air_database_release_1_4.zip Original download link 6 | alternate_url: http://www.iks.rwth-aachen.de/en/research/tools-downloads/aachen-impulse-response-database/ Project page 7 | file: air_database_release_1_4.zip Version 1.4 of the database 8 | -------------------------------------------------------------------------------- /resources/59/info.txt: -------------------------------------------------------------------------------- 1 | name: ParlamentParla 2 | summary: Catalan speech corpus generated from Catalan Parliamentary sessions 3 | category: speech 4 | license: CC Attribution 4.0 (CC BY 4.0) 5 | file: parlament_v1.0_clean.tar.gz 90 hours of "clean" speech and transcripts 6 | file: parlament_v1.0_other.tar.gz 230 hours of "other" speech and transcripts 7 | alternate_url: http://laklak.eu/share/parlament_v1.0_clean.tar.gz clean data 8 | alternate_url: http://laklak.eu/share/parlament_v1.0_other.tar.gz other data 9 | 10 | -------------------------------------------------------------------------------- /resources/80/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Burmese speech data set. 2 | summary: Data set which contains recordings of Burmese. 
3 | There are 2530 recordings from female speakers. 4 | category: speech 5 | license: Attribution-ShareAlike 4.0 International 6 | file: about.html Information about the data set 7 | file: LICENSE License information for the data set 8 | file: line_index_female.tsv All utterances for the female speakers. 9 | file: my_mm_female.zip Archive file with all audio for the female speakers. 10 | -------------------------------------------------------------------------------- /resources/6/info.txt: -------------------------------------------------------------------------------- 1 | name: Vystadial 2 | summary: English and Czech data, mirrored from the Vystadial project 3 | category: speech 4 | license: Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0 US) 5 | file: data_voip_cs.tgz Czech speech and transcripts 6 | file: data_voip_en.tgz English speech and transcripts 7 | alternate_url: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4670-6 Czech data 8 | alternate_url: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4671-4 English data 9 | -------------------------------------------------------------------------------- /resources/28/about.html: -------------------------------------------------------------------------------- 1 | This data includes all the room impulse responses (RIRs) and noises 2 | we used in our paper "A Study on Data Augmentation of Reverberant Speech for Robust Speech Recognition" 3 | submitted to ICASSP 2017. 4 | It includes the real RIRs and isotropic noises from 5 | the RWCP sound scene database, the 2014 REVERB challenge 6 | database and the Aachen impulse response database (AIR); 7 | the simulated RIRs generated by ourselves 8 | and also the point-source noises that were extracted from the MUSAN corpus. 9 |

10 | -------------------------------------------------------------------------------- /resources/18/info.txt: -------------------------------------------------------------------------------- 1 | name: THCHS-30 2 | summary: A Free Chinese Speech Corpus Released by CSLT@Tsinghua University 3 | category: speech 4 | license: Apache License v.2.0 5 | file: data_thchs30.tgz speech data and transcripts 6 | file: test-noise.tgz standard 0db noisy test data 7 | file: resource.tgz supplementary resources, incl. lexicon for training data, noise samples 8 | alternate_url: http://data.cslt.org/thchs30/README.html Original URL from CSLT 9 | alternate_url: http://pan.baidu.com/s/1hqKwE00 Baidu disk 10 | 11 | -------------------------------------------------------------------------------- /resources/30/about.html: -------------------------------------------------------------------------------- 1 | This data set contains multi-speaker high quality transcribed audio data for Sinhalese. The data set consists of wave files, and a TSV file. The file si_lk.lines.txt contains a FileID, which in turn contains the UserID and the Transcription of audio in the file. 2 |

3 | The data set has been manually quality checked, but there might still be errors. 4 |

5 | This dataset was collected by Google in Sri Lanka. 6 |

7 | See LICENSE.txt file for license information. 8 |

9 | Copyright 2015, 2016 Google, Inc. 10 | -------------------------------------------------------------------------------- /resources/74/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Puerto Rico Spanish speech data set. 2 | summary: Data set which contains recordings of Puerto Rico Spanish. 3 | There are 617 recordings from female speakers. 4 | category: speech 5 | license: Attribution-ShareAlike 4.0 International 6 | file: about.html Information about the data set 7 | file: LICENSE License information for the data set 8 | file: line_index_female.tsv All utterances for the female speakers. 9 | file: es_pr_female.zip Archive file with all audio for the female speakers. 10 | -------------------------------------------------------------------------------- /resources/56/about.html: -------------------------------------------------------------------------------- 1 |

2 | Most research papers reporting results on the IAM database use different splits of the data into train/test/val than those provided with the database. These are the data splits shared by Théodore Bluche from the Human Language Technology and Pattern Recognition lab, RWTH Aachen University. It will create 6482 train line images, 2915 test line images and 976 validation line images. 3 |

4 | 5 |

6 | It contains the page XML name. For example: 7 |

8 | 9 |
10 | a01-000u
11 | a01-000x
12 | a01-003
13 | 
14 | -------------------------------------------------------------------------------- /resources/7/about.html: -------------------------------------------------------------------------------- 1 | 2 | The TED-LIUM corpus (mirrored here) 3 | is English-language TED talks, with transcriptions, sampled at 16kHz. It 4 | contains about 118 hours of speech. 5 |

6 | The original page requests that you cite the following paper if you make use of this 7 | corpus: 8 |

9 | A. Rousseau, P. Deléglise, and Y. Estève, "TED-LIUM: an automatic speech recognition dedicated corpus", 10 |
11 | in Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12), May 2012. 12 |

13 | -------------------------------------------------------------------------------- /resources/32/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for four South African languages (af, st, tn, xh) 2 | summary: Multi-speaker TTS data for four South African languages, Afrikaans, Sesotho, Setswana and isiXhosa. 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) 5 | file: af_za.tar.gz Audio files and transcriptions for Afrikaans 6 | file: st_za.tar.gz Audio files and transcriptions for Sesotho 7 | file: tn_za.tar.gz Audio files and transcriptions for Setswana 8 | file: xh_za.tar.gz Audio files and transcriptions for isiXhosa 9 | -------------------------------------------------------------------------------- /resources/17/about.html: -------------------------------------------------------------------------------- 1 | MUSAN is a corpus of music, speech, and noise recordings. 2 |

3 | This work was supported by the National Science Foundation Graduate Research 4 | Fellowship under Grant No. 1232825 and by Spoken Communications. 5 |

6 | 7 | You can cite the data using the following BibTeX entry: 8 |

 9 | @misc{musan2015,
10 |   author = {David Snyder and Guoguo Chen and Daniel Povey},
11 |   title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus},
12 |   year = {2015},
13 |   eprint = {1510.08484},
14 |   note = {arXiv:1510.08484v1}
15 | }
16 | 


--------------------------------------------------------------------------------
/resources/37/README.txt:
--------------------------------------------------------------------------------
 1 | This data set contains multi-speaker high quality transcribed audio data for
 2 | Bengali. The data set consists of wave files, and a TSV file. There are two
 3 | zip files, one for each locale, each containing a file line_index.tsv and the wave
 4 | files. Line index has a fileID and the transcription.
 5 | 
 6 | The data set has been manually quality checked, but there might still be errors.
 7 | 
 8 | This data set was collected by Google.
 9 | 
10 | See LICENSE.txt file for license information.
11 | 
12 | Copyright 2015, 2016, 2017, 2018 Google, Inc.
13 | 


--------------------------------------------------------------------------------
/resources/5/info.txt:
--------------------------------------------------------------------------------
1 | name: MSU Switchboard transcripts
2 | summary: A mirror of the Mississippi State transcripts and lexicon for Switchboard.
3 | category: text
4 | license: Unrestricted
5 | file: switchboard_word_alignments.tar.gz The transcripts
6 | file: sw-ms98-dict.text The lexicon
7 | alternate_url: http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz The transcripts in their original location
8 | alternate_url: http://www.isip.piconepress.com/projects/switchboard/releases/sw-ms98-dict.text The lexicon in its original location
9 | 


--------------------------------------------------------------------------------
/resources/64/info.txt:
--------------------------------------------------------------------------------
 1 | name: Crowdsourced high-quality Marathi multi-speaker speech data set.
 2 | summary: Data set which contains recordings of native speakers of Marathi
 3 | There are 1596 recordings from female speakers.
 4 | The data set has recordings from multiple female speakers.
 5 | category: speech
 6 | license: Attribution-ShareAlike 4.0 International
 7 | file: about.html Information about the data set
 8 | file: LICENSE License information for the data set
 9 | file: line_index.tsv All utterances in the data set
10 | file: mr_in_female.zip Archive file containing all the data.
11 | 


--------------------------------------------------------------------------------
/resources/81/about.html:
--------------------------------------------------------------------------------
1 | 

Contains 25 wav files containing audio sampled at 44.1kHz.  The files are stored as 16-bit signed-integer PCM.  Each file is of duration 1 second.

2 |

There are 5 files from each of 5 different sources. ‘piano’ contains fragments of piano-only music, ‘lmiserables’ contains parts of the movie Les-miserables, ‘joliver’ is taken from John Oliver’s show “Last Week Tonight”, ‘simons’ contains split seconds from Simons Institute videos, and ‘bmaher’ contains parts of Bill Maher’s show “In Our Time”.

3 | -------------------------------------------------------------------------------- /resources/85/info.txt: -------------------------------------------------------------------------------- 1 | name: HI-MIA 2 | summary: A far-field text-dependent speaker verification database for AISHELL Speaker Verification Challenge 2019 3 | category: Speech 4 | license: Apache License v.2.0 5 | file: train.tar.gz Training set with speaker dependent sub folders 6 | file: dev.tar.gz Dev set with speaker dependent sub folders 7 | file: test.tar.gz Test set with target/non-target answer 8 | file: test_v2.tar.gz Updated test set fixing corrupted audio files 9 | file: filename_mapping.tar.gz Filename mapping rules for multi-channel information 10 | alternate_url: http://aishelltech.com/wakeup_data 11 | -------------------------------------------------------------------------------- /resources/34/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

9 | This is a pronouncing dictionary for the Spanish language. 10 | It is intended to be used as the lexicon for an automatic speech recognition system. 11 |
12 |
John Morgan
13 | 14 | 15 | Last modified: Thu Sep 7 14:59:16 EDT 2017 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /resources/21/about.html: -------------------------------------------------------------------------------- 1 | This is a list of words in Spanish with frequency counts. 2 |

3 | This data was derived from the LDC Spanish Gigaword Corpus (LDC2011T12). 4 | The list is used as a part of the Kaldi Spanish Fisher recipe and is used to augment 5 | the pronunciation lexicon with additional words. The actual pronunciation is generated 6 | using the Spanish rule based lexicon (LDC96L16). 7 |

8 | 9 |

10 | NOTE : No components of the LDC datasets LDC2011T12 and LDC96L16 are included with 11 | this dataset. 12 |

13 | 14 | Details of how this word list is used can be found in this paper : http://cs.jhu.edu/~gkumar/papers/kumar2014some.pdf 15 | -------------------------------------------------------------------------------- /resources/29/about.html: -------------------------------------------------------------------------------- 1 | This data is a pronunciation dictionary consisting of approximately 822 000 Swedish words with their corresponding phonetic transcription. 2 |

3 | The license is a CC0 which is unrestricted. This particular version of the lexicon is an improved and updated version produced by Emelie Kullmann and the original unmodified version of the lexicon can be found on the website of the National Library of Norway. 4 |

5 | The updated version of the lexicon that can be directly downloaded from here is recommended to use with the Swedish recipe and yields a better result for ASR. 6 | -------------------------------------------------------------------------------- /resources/76/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Basque speech data set. 2 | summary: Data set which contains recordings of Basque. 3 | There are 3858 recordings from female speakers, and 3278 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: eu_es_female.zip Archive file with all audio for the female speakers. 12 | file: eu_es_male.zip Archive file with all audio for the male speakers. 
13 | -------------------------------------------------------------------------------- /resources/25/info.txt: -------------------------------------------------------------------------------- 1 | name: ALFFA (African Languages in the Field: speech Fundamentals and Automation) 2 | summary: Amharic, Swahili and Wolof data, mirrored from the ALFFA git repository 3 | category: speech 4 | license: MIT 5 | file: data_readspeech_am.tar.bz2 Amharic speech and transcripts 6 | file: data_broadcastnews_sw.tar.bz2 Swahili speech and transcripts 7 | file: data_readspeech_wo.tar.bz2 Wolof speech and transcripts 8 | alternate_url: https://github.com/besacier/ALFFA_PUBLIC/tree/master/ASR/AMHARIC Amharic data 9 | alternate_url: https://github.com/besacier/ALFFA_PUBLIC/tree/master/ASR/SWAHILI Swahili data 10 | alternate_url: https://github.com/besacier/ALFFA_PUBLIC/tree/master/ASR/WOLOF Wolof data -------------------------------------------------------------------------------- /resources/69/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Catalan speech data set. 2 | summary: Data set which contains recordings of Catalan. 3 | There are 2321 recordings from female speakers, and 1919 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: ca_es_female.zip Archive file with all audio for the female speakers. 12 | file: ca_es_male.zip Archive file with all audio for the male speakers. 
13 | -------------------------------------------------------------------------------- /resources/77/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Galician speech data set. 2 | summary: Data set which contains recordings of Galician. 3 | There are 4264 recordings from female speakers, and 1323 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: gl_es_female.zip Archive file with all audio for the female speakers. 12 | file: gl_es_male.zip Archive file with all audio for the male speakers. 13 | -------------------------------------------------------------------------------- /resources/10/info.txt: -------------------------------------------------------------------------------- 1 | name: SRE Data 2 | summary: Various files from SRE data that NIST used to host online 3 | category: misc 4 | license: Public domain, I believe 5 | file: sre2000-key.tar.gz An archive containing a key file for SRE 2000, plus some other things. 6 | file: sre04_key.tgz An archive containing the key file for SRE04, plus some other things. 
7 | file: sre04_key-v2.txt.gz The key file for SRE04 8 | file: sre05-key-v7b.txt.gz The key file for SRE05 9 | alternate_url: http://www.itl.nist.gov/iad/894.01/tests/sre/2006/sre04_key.tgz Previously working location for SRE04 data 10 | alternate_url: http://www.itl.nist.gov/iad/894.01/tests/sre/2006/sre05_key.tgz Previously working location for SRE05 data 11 | -------------------------------------------------------------------------------- /resources/65/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Tamil multi-speaker speech data set. 2 | summary: Data set which contains recordings of native speakers of Tamil. 3 | There are 2335 recordings from female speakers, and 1956 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv Lines recorded by the female speakers 10 | file: line_index_male.tsv Lines recorded by the male speakers 11 | file: ta_in_female.zip Archive containing recordings from female speakers 12 | file: ta_in_male.zip Archive file recordings from male speakers 13 | -------------------------------------------------------------------------------- /resources/66/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Telugu multi-speaker speech data set. 2 | summary: Data set which contains recordings of native speakers of Telugu. 3 | There are 2294 recordings from female speakers, and 2154 recordings from male 4 | speakers. 
5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv Lines recorded by the female speakers 10 | file: line_index_male.tsv Lines recorded by the male speakers 11 | file: te_in_female.zip Archive containing recordings from female speakers 12 | file: te_in_male.zip Archive file recordings from male speakers 13 | -------------------------------------------------------------------------------- /resources/70/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Nigerian English speech data set. 2 | summary: Data set which contains recordings of Nigerian English. 3 | There are 2045 recordings from female speakers, and 1314 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: en_ng_female.zip Archive file with all audio for the female speakers. 12 | file: en_ng_male.zip Archive file with all audio for the male speakers. 13 | -------------------------------------------------------------------------------- /resources/71/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Chilean Spanish speech data set. 2 | summary: Data set which contains recordings of Chilean Spanish. 3 | There are 1738 recordings from female speakers, and 2636 recordings from male 4 | speakers. 
5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: es_cl_female.zip Archive file with all audio for the female speakers. 12 | file: es_cl_male.zip Archive file with all audio for the male speakers. 13 | -------------------------------------------------------------------------------- /resources/73/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Peruvian Spanish speech data set. 2 | summary: Data set which contains recordings of Peruvian Spanish. 3 | There are 2529 recordings from female speakers, and 2918 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: es_pe_female.zip Archive file with all audio for the female speakers. 12 | file: es_pe_male.zip Archive file with all audio for the male speakers. 13 | -------------------------------------------------------------------------------- /resources/79/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Kannada multi-speaker speech data set. 2 | summary: Data set which contains recordings of native speakers of Kannada. 3 | There are 2186 recordings from female speakers, and 2214 recordings from male 4 | speakers. 
5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv Lines recorded by the female speakers 10 | file: line_index_male.tsv Lines recorded by the male speakers 11 | file: kn_in_female.zip Archive containing recordings from female speakers 12 | file: kn_in_male.zip Archive file recordings from male speakers 13 | -------------------------------------------------------------------------------- /resources/63/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Malayalam multi-speaker speech data set. 2 | summary: Data set which contains recordings of native speakers of Malayalam. 3 | There are 2103 recordings from female speakers, and 2023 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv Lines recorded by the female speakers 10 | file: line_index_male.tsv Lines recorded by the male speakers 11 | file: ml_in_female.zip Archive containing recordings from female speakers 12 | file: ml_in_male.zip Archive file recordings from male speakers 13 | -------------------------------------------------------------------------------- /resources/72/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Colombian Spanish speech data set. 2 | summary: Data set which contains recordings of Colombian Spanish. 3 | There are 2369 recordings from female speakers, and 2534 recordings from male 4 | speakers. 
5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: es_co_female.zip Archive file with all audio for the female speakers. 12 | file: es_co_male.zip Archive file with all audio for the male speakers. 13 | -------------------------------------------------------------------------------- /resources/75/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Venezuelan Spanish speech data set. 2 | summary: Data set which contains recordings of Venezuelan Spanish. 3 | There are 1603 recordings from female speakers, and 1754 recordings from male 4 | speakers. 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: es_ve_female.zip Archive file with all audio for the female speakers. 12 | file: es_ve_male.zip Archive file with all audio for the male speakers. 13 | -------------------------------------------------------------------------------- /resources/78/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Gujarati multi-speaker speech data set. 2 | summary: Data set which contains recordings of native speakers of Gujarati. 3 | There are 2219 recordings from female speakers, and 2053 recordings from male 4 | speakers. 
5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set 8 | file: LICENSE License information for the data set 9 | file: line_index_female.tsv Lines recorded by the female speakers 10 | file: line_index_male.tsv Lines recorded by the male speakers 11 | file: gu_in_female.zip Archive containing recordings from female speakers 12 | file: gu_in_male.zip Archive file recordings from male speakers 13 | -------------------------------------------------------------------------------- /resources/70/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Nigerian English 2 | sentences recorded by volunteers, in Lagos, Nigeria and in London. The data set 3 | consists of wave files, and a TSV file (line_index.tsv). The file line_index.tsv 4 | contains an anonymized FileID and the transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 | -------------------------------------------------------------------------------- /resources/45/about.html: -------------------------------------------------------------------------------- 1 |

2 | This corpus was recorded in a silent indoor environment using a cellphone. It has 10 speakers. Each speaker has about 350 utterances. All utterances were carefully transcribed and checked by humans. Transcription accuracy is guaranteed. If there are any problems, we agree to correct them for you. 	
3 | 
4 | Please cite the data as “ST-AEDS-20180100_1, Free ST American English Corpus”.
5 | 
6 | The data set is a subset of a much bigger data set (about 1000 hours) which was recorded in the same environment as this open source data. Please visit our website www.surfing.ai or contact us at contact@surfingtech.cn for details.
7 | 
8 | -------------------------------------------------------------------------------- /resources/49/info.txt: -------------------------------------------------------------------------------- 1 | name: VoxCeleb Data 2 | summary: Various files for the VoxCeleb datasets 3 | category: Misc 4 | license: Not copyrighted 5 | file: voxceleb1_test.txt A file containing a list of trial pairs for the verification task of the old version of VoxCeleb1 6 | file: voxceleb1_test_v2.txt A file containing a list of trial pairs for the verification task of the new version of VoxCeleb1 7 | file: voxceleb1_sitw_overlap.txt A list of 60 speakers in VoxCeleb1 that overlap with the Speakers in the Wild (SITW) dataset 8 | file: vox1_meta.csv A list which provides identity, gender and nationality labels for VoxCeleb1 9 | file: vox2_meta.csv A list which provides identity, gender and nationality labels for VoxCeleb2 10 | alternate_url: http://www.robots.ox.ac.uk/~vgg/data/voxceleb 11 | -------------------------------------------------------------------------------- /resources/86/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Yoruba speech data set. 2 | summary: Data set which contains recordings of Yoruba. 3 | There are 1892 recordings from female speakers, and 1691 recordings from male 4 | speakers 5 | category: speech 6 | license: Attribution-ShareAlike 4.0 International 7 | file: about.html Information about the data set. 8 | file: LICENSE License information for the data set. 9 | file: line_index_female.tsv All utterances for the female speakers. 10 | file: line_index_male.tsv All utterances for the male speakers. 11 | file: yo_ng_female.zip Archive file with all audio for the female speakers. 12 | file: yo_ng_male.zip Archive file with all audio for the male speakers. 13 | file: annotation_info.txt A file listing additional annotations in the text. 
14 | -------------------------------------------------------------------------------- /resources/86/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Yoruba sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. The file annotation_info contains 5 | information about annotations included in the data set. 6 |

7 | The data set has been manually quality checked, but there might still be errors. 8 |

9 | Please report any issues in the following issue tracker on GitHub. 10 | 11 | https://github.com/google/language-resources/issues 12 | 13 |

14 | See LICENSE file for license information. 15 |

16 | Copyright 2018, 2019, 2020 Google, Inc. 17 | -------------------------------------------------------------------------------- /resources/60/info.txt: -------------------------------------------------------------------------------- 1 | name: LibriTTS corpus 2 | summary: Large-scale corpus of English speech derived from the original materials of the LibriSpeech corpus 3 | category: Speech 4 | license: CC BY 4.0 5 | file: dev-clean.tar.gz Development set, clean speech 6 | file: dev-other.tar.gz Development set, more challenging speech 7 | file: test-clean.tar.gz Test set, "clean" speech 8 | file: test-other.tar.gz Test set, "other" speech 9 | file: train-clean-100.tar.gz Training set derived from the original materials of the train-clean-100 subset of LibriSpeech 10 | file: train-clean-360.tar.gz Training set derived from the original materials of the train-clean-360 subset of LibriSpeech 11 | file: train-other-500.tar.gz Training set derived from the original materials of the train-other-500 subset of LibriSpeech 12 | -------------------------------------------------------------------------------- /resources/46/about.html: -------------------------------------------------------------------------------- 1 | The Tunisian_MSA corpus was originally collected to train acoustic models for pronunciation modeling in Arabic language learning applications. 2 |

3 | The data collection took place near Tunis the capital of the Republic of Tunisia in 2003. 4 |

5 | The Tunisian_MSA corpus is divided into recited and prompted speech subcorpora. 6 | The recited speech is stored under the recordings directory. 7 | The prompted speech is stored under the answers directory. 8 | Each of the 118 informants contributed to both subcorpora by reciting sentences and providing answers to prompted questions. 9 | The Tunisian_MSA corpus has 11.2 hours of speech. 10 |

11 | A small corpus was collected in 2017 for testing. 12 | It consists of speech from 4 speakers, 3 male Libyans and 1 female from Tunisia. 13 | 14 | 15 | -------------------------------------------------------------------------------- /resources/22/info.txt: -------------------------------------------------------------------------------- 1 | name: THUYG-20 2 | summary: A free Uyghur speech database Released by CSLT@Tsinghua University & Xinjiang University 3 | category: speech 4 | license: Apache License v.2.0 5 | file: data_thuyg20.tar.gz speech data and transcripts for speech recognition 6 | file: data_thuyg20_sre.tar.gz speech data for speaker recognition 7 | file: test_noise.tar.gz standard 0db noisy test data for speech recognition 8 | file: test_noise_sre.tar.gz standard 0db noisy test data for speaker recognition 9 | file: resource.tar.gz supplementary resources, incl. lexicon for training data, noise samples 10 | alternate_url: http://data.cslt.org/thuyg20-openslr/README.html CSLT local storage 11 | alternate_url: http://pan.baidu.com/s/1hqKwE00 Baidu disk 12 | alternate_url: https://mega.nz/#F!idRSjL4A!cnCY0R2NjU77Jr0soe9OgQ Mega disk 13 | -------------------------------------------------------------------------------- /resources/1/about.html: -------------------------------------------------------------------------------- 1 |

2 | This dataset was created for the Kaldi project (see kaldi.sf.net), 3 | by a contributor who prefers to remain anonymous. The main point of the dataset is 4 | to provide an easy and fast way to test out the Kaldi scripts for free.

5 |

6 | The archive "waves_yesno.tar.gz" contains 60 .wav files, sampled at 8 kHz. All were recorded 7 | by the same male speaker, in Hebrew. 8 | In each file, the individual says 8 words; each word is either the Hebrew for "yes" or "no", so each 9 | file is a random sequence of 8 yes-es or noes. There is no separate transcription provided; the 10 | sequence is encoded in the filename, with 1 for yes and 0 for no, for instance: 11 |

12 | # tar -xvzf waves_yesno.tar.gz
13 | waves_yesno/1_0_1_1_1_0_1_0.wav
14 | waves_yesno/0_1_1_0_0_1_1_0.wav
15 | ...
16 | 
17 |

18 | -------------------------------------------------------------------------------- /resources/68/info.txt: -------------------------------------------------------------------------------- 1 | name: MAGICDATA Mandarin Chinese Read Speech Corpus 2 | summary: The corpus by Magic Data Technology Co., Ltd. , containing 755 hours of scripted read speech data from 1080 native speakers of the Mandarin Chinese spoken in mainland China. The sentence transcription accuracy is higher than 98%. 3 | category: Speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International Public License (CC BY-NC-ND 4.0) 5 | file: train_set.tar.gz Training set speech and transcripts 6 | file: dev_set.tar.gz Development set speech and transcripts 7 | file: test_set.tar.gz Test set speech and transcripts 8 | file: metadata.tar.gz supplementary resources, incl. data introduction (in English and Chinese) and speaker information 9 | alternate_url: http://www.imagicdatatech.com/index.php/home/dataopensource/data_info/id/101 Full description from the company website 10 | -------------------------------------------------------------------------------- /resources/40/about.html: -------------------------------------------------------------------------------- 1 | This is Zeroth-Korean corpus,
2 | licensed under Attribution 4.0 International (CC BY 4.0) 3 | 4 |

5 | The data set contains transcribed audio data for Korean. There are 51.6 hours of transcribed Korean audio for training data (22,263 utterances, 105 people, 3000 sentences) and 1.2 hours of transcribed Korean audio for testing data (457 utterances, 10 people). This corpus also contains a pre-trained/designed language model, lexicon and morpheme-based segmenter (morfessor).
6 | 7 | Zeroth project introduces free Korean speech corpus and aims to make Korean speech recognition more broadly accessible to everyone.
8 | 9 | This project was developed in collaboration between Lucas Jo(@Atlas Guide Inc.) and Wonkyum Lee(@Gridspace Inc.).

10 | 11 | Contact: Lucas Jo(lucasjo@goodatlas.com), Wonkyum Lee(wonkyum@gridspace.com) 12 |

13 | 14 | -------------------------------------------------------------------------------- /resources/11/info.txt: -------------------------------------------------------------------------------- 1 | name: LibriSpeech language models, vocabulary and G2P models 2 | summary: Language modelling resources, for use with the LibriSpeech ASR corpus 3 | category: text 4 | license: Public domain 5 | file: librispeech-lm-corpus.tgz 14500 public domain books, used as training material for the LibriSpeech's LM 6 | file: librispeech-lm-norm.txt.gz Normalized LM training text 7 | file: librispeech-vocab.txt 200K word vocabulary for the LM 8 | file: librispeech-lexicon.txt Pronunciations, some of which G2P auto-generated, for all words in the vocabulary 9 | file: 3-gram.arpa.gz 3-gram ARPA LM, not pruned 10 | file: 3-gram.pruned.1e-7.arpa.gz 3-gram ARPA LM, pruned with threshold 1e-7 11 | file: 3-gram.pruned.3e-7.arpa.gz 3-gram ARPA LM, pruned with threshold 3e-7 12 | file: 4-gram.arpa.gz 4-gram ARPA LM, usually used for rescoring 13 | file: g2p-model-5 Fifth order Sequitur G2P model 14 | -------------------------------------------------------------------------------- /resources/13/about.html: -------------------------------------------------------------------------------- 1 | The data includes non-speech sounds recorded in an anechoic room, 2 | reconstructed signals in various rooms, impulse responses for a 3 | microphone array, speech data recorded with the same array, and 4 | recordings of background noises. It is intended for use when 5 | simulating sound scenes. It was developed by the Real Acoustic 6 | Environments Working Group of the Real World Computing Partnership 7 | (RWCP). The data was recorded from 1998 to 2000. 8 |

9 | 10 | You can cite the data using the following BibTeX entry: 11 |

12 | 
13 | @inproceedings{nakamura2000acoustical,
14 |   title={Acoustical Sound Database in Real Environments for Sound Scene Understanding and Hands-Free Speech Recognition.},
15 |   author={Nakamura, Satoshi and Hiyane, Kazuo and Asano, Futoshi and Nishiura, Takanobu and Yamada, Takeshi},
16 |   booktitle={LREC},
17 |   year={2000}
18 | }
19 | 


--------------------------------------------------------------------------------
/resources/12/about.html:
--------------------------------------------------------------------------------
 1 | LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech, prepared by
 2 | Vassil Panayotov with the assistance of Daniel Povey.
 3 | The data is derived from read audiobooks from the LibriVox project, and has been carefully segmented and aligned.
 4 | 

5 | Acoustic models, trained on this data set, are available at 6 | kaldi-asr.org and language models, suitable for evaluation can be found at 7 | http://www.openslr.org/11/. 8 |

9 | For more information, see the paper 10 | "LibriSpeech: an ASR corpus based on public domain audio books", 11 | Vassil Panayotov, Guoguo Chen, Daniel Povey and Sanjeev Khudanpur, ICASSP 2015 (submitted) 12 | (pdf)

13 | 14 | -------------------------------------------------------------------------------- /resources/37/about.html: -------------------------------------------------------------------------------- 1 | This data is transcribed high-quality speech data for Bengali. 2 |

3 | The data collection was performed by Google. 4 |

5 | If you use this data in publications, please cite it as follows: 6 |

 7 |   @inproceedings{kjartansson-etal-tts-sltu2018,
 8 |     title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
 9 |     author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
10 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
11 |     year  = {2018},
12 |     address = {Gurugram, India},
13 |     month = aug,
14 |     pages = {66--70},
15 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-14}
16 |   }
17 | 
18 | -------------------------------------------------------------------------------- /resources/6/about.html: -------------------------------------------------------------------------------- 1 | 2 | This data is transcribed telephone conversation data, in English and Czech. 3 |

4 | The data collection process and development of these training scripts was partly 5 | funded by the Ministry of Education, Youth and Sports of the Czech Republic 6 | under the grant agreement LK11221 and core research funding of Charles 7 | University in Prague. 8 |

9 | 10 | You can cite the data using the following BibTeX entry: 11 |

12 | 
13 | @inproceedings{korvas_2014,
14 |   title={{Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license}},
15 |   author={Korvas, Mat\v{e}j and Pl\'{a}tek, Ond\v{r}ej and Du\v{s}ek, Ond\v{r}ej and \v{Z}ilka, Luk\'{a}\v{s} and Jur\v{c}\'{i}\v{c}ek, Filip},
16 |   booktitle={Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC 2014)},
17 |   pages={To Appear},
18 |   year={2014},
19 | }
20 | 
21 | -------------------------------------------------------------------------------- /resources/4/info.txt: -------------------------------------------------------------------------------- 1 | name: sctk 2 | summary: A mirror of the sctk scoring software 3 | category: software 4 | license: Public domain 5 | file: sctk-2.4.0-20091110-0958.tar.bz2 The original, bzipped version 6 | file: sctk-2.4.0-20091110-0958.tar.gz A gzipped version of the archive 7 | file: sctk-2.4.8-20130429-2145.tar.bz2 Version 2.4.8 of the software, as bz2 8 | file: sctk-2.4.9-20141015-1634Z.tar.bz2 Version 2.4.9 of the software, as bz2 9 | file: sctk-2.4.10-20151007-1312Z.tar.bz2 Version 2.4.10 of the software, as bz2 10 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.0-20091110-0958.tar.bz2 (for version 2.4.0) 11 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.8-20130429-2145.tar.bz2 (for version 2.4.8) 12 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.9-20141015-1634Z.tar.bz2 (for version 2.4.9) 13 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.10-20151007-1312Z.tar.bz2 (for version 2.4.10) 14 | 15 | -------------------------------------------------------------------------------- /resources/54/info.txt: -------------------------------------------------------------------------------- 1 | name: Large Nepali ASR training data set 2 | summary: Nepali ASR training data set containing ~157K utterances. 
3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: about.html Information about the data set 6 | file: LICENSE License information for the data set 7 | file: utt_spk_text.tsv All utterances in the data set 8 | file: asr_nepali_0.zip Data set 9 | file: asr_nepali_1.zip Data set 10 | file: asr_nepali_2.zip Data set 11 | file: asr_nepali_3.zip Data set 12 | file: asr_nepali_4.zip Data set 13 | file: asr_nepali_5.zip Data set 14 | file: asr_nepali_6.zip Data set 15 | file: asr_nepali_7.zip Data set 16 | file: asr_nepali_8.zip Data set 17 | file: asr_nepali_9.zip Data set 18 | file: asr_nepali_a.zip Data set 19 | file: asr_nepali_b.zip Data set 20 | file: asr_nepali_c.zip Data set 21 | file: asr_nepali_d.zip Data set 22 | file: asr_nepali_e.zip Data set 23 | file: asr_nepali_f.zip Data set 24 | -------------------------------------------------------------------------------- /resources/38/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
 7 | 
 8 | This corpus was recorded in a silent indoor environment using cellphones. It has 855 speakers. Each speaker has 120 utterances. All utterances were carefully transcribed and checked by humans. Transcription accuracy is guaranteed. If there is any problem, we agree to correct it for you. The corpus contains:
 9 | 	audio files;
10 | 	transcriptions;
11 | 	metadata;	
12 | 
13 | Please cite the data as “ST-CMDS-20170001_1, Free ST Chinese Mandarin Corpus”.
14 | 
15 | The data set is a subset of a much bigger data set which was recorded in the same environment as this open source data. Please visit our website www.surfing.ai for details.
16 | 
17 | 
18 | 
19 | 
20 | 21 | -------------------------------------------------------------------------------- /resources/16/info.txt: -------------------------------------------------------------------------------- 1 | name: The AMI Corpus 2 | summary: Acoustic speech data and meta-data from The AMI corpus. 3 | category: speech 4 | license: THE CREATIVE COMMONS ATTRIBUTION-NONCOMMERCIAL-SHAREALIKE v2.0 LICENCE (modified, look for more details in the licence file and/or the AMI webpage) 5 | file: ami_manual_1.6.1.tar.gz AMI annotation files (ver. 1.6.1) 6 | file: headset.tar.gz Close-talking acoustic data 7 | file: Array1-01.tar.gz Array1 distant acoustic data 8 | file: Array1-02.tar.gz Array1 distant acoustic data 9 | file: Array1-03.tar.gz Array1 distant acoustic data 10 | file: Array1-04.tar.gz Array1 distant acoustic data 11 | file: Array1-05.tar.gz Array1 distant acoustic data 12 | file: Array1-06.tar.gz Array1 distant acoustic data 13 | file: Array1-07.tar.gz Array1 distant acoustic data 14 | file: Array1-08.tar.gz Array1 distant acoustic data 15 | alternate_url: http://groups.inf.ed.ac.uk/ami/corpus The official AMI corpus webpage 16 | 17 | 18 | -------------------------------------------------------------------------------- /resources/50/about.html: -------------------------------------------------------------------------------- 1 | These are unofficial data splits for the corpus MADCAT Chinese Pilot Training Set (LDC2014T13). 2 | LDC is providing only training data for this corpus and not the original dev/eval sets, so the original 3 | training data have been split into three different disjoint parts (i.e. there shouldn't be sentences/lines 4 | from the same document in different sets -- as each document is handwritten/transcribed 5 | by a different author in the MADCAT data) to allow for evaluation of the performance in the usual way. 6 |

7 | Also, please note that the license relates only to the splits. You still need to obtain the original databases 8 | and respect the databases' license! 9 |

10 | It contains the madcat xml name and segment id (s{1,2,3,4}). For example: 11 |

12 | 	GMW_CMN_20070118.0014_001_LDC0632.madcat.xml s1
13 | 	GMW_CMN_20070118.0014_001_LDC0632.madcat.xml s2
14 | 	GMW_CMN_20070118.0014_001_LDC0632.madcat.xml s3
15 | 
16 | 17 | -------------------------------------------------------------------------------- /resources/52/info.txt: -------------------------------------------------------------------------------- 1 | name: Large Sinhala ASR training data set 2 | summary: Sinhala ASR training data set containing ~185K utterances. 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: about.html Information about the data set 6 | file: LICENSE License information for the data set 7 | file: utt_spk_text.tsv All utterances in the data set 8 | file: asr_sinhala_0.zip Data set 9 | file: asr_sinhala_1.zip Data set 10 | file: asr_sinhala_2.zip Data set 11 | file: asr_sinhala_3.zip Data set 12 | file: asr_sinhala_4.zip Data set 13 | file: asr_sinhala_5.zip Data set 14 | file: asr_sinhala_6.zip Data set 15 | file: asr_sinhala_7.zip Data set 16 | file: asr_sinhala_8.zip Data set 17 | file: asr_sinhala_9.zip Data set 18 | file: asr_sinhala_a.zip Data set 19 | file: asr_sinhala_b.zip Data set 20 | file: asr_sinhala_c.zip Data set 21 | file: asr_sinhala_d.zip Data set 22 | file: asr_sinhala_e.zip Data set 23 | file: asr_sinhala_f.zip Data set 24 | -------------------------------------------------------------------------------- /resources/53/info.txt: -------------------------------------------------------------------------------- 1 | name: Large Bengali ASR training data set 2 | summary: Bengali ASR training data set containing ~196K utterances. 
3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: about.html Information about the data set 6 | file: LICENSE License information for the data set 7 | file: utt_spk_text.tsv All utterances in the data set 8 | file: asr_bengali_0.zip Data set 9 | file: asr_bengali_1.zip Data set 10 | file: asr_bengali_2.zip Data set 11 | file: asr_bengali_3.zip Data set 12 | file: asr_bengali_4.zip Data set 13 | file: asr_bengali_5.zip Data set 14 | file: asr_bengali_6.zip Data set 15 | file: asr_bengali_7.zip Data set 16 | file: asr_bengali_8.zip Data set 17 | file: asr_bengali_9.zip Data set 18 | file: asr_bengali_a.zip Data set 19 | file: asr_bengali_b.zip Data set 20 | file: asr_bengali_c.zip Data set 21 | file: asr_bengali_d.zip Data set 22 | file: asr_bengali_e.zip Data set 23 | file: asr_bengali_f.zip Data set 24 | -------------------------------------------------------------------------------- /resources/33/about.html: -------------------------------------------------------------------------------- 1 | Aishell is an open-source Chinese Mandarin speech corpus published by 2 | Beijing Shell Shell Technology Co.,Ltd. 3 |

4 | 400 people from different accent areas in China are invited to 5 | participate in the recording, which is conducted in a quiet indoor 6 | environment using high fidelity microphone and downsampled to 16kHz. 7 | The manual transcription accuracy is above 95%, through professional 8 | speech annotation and strict quality inspection. The data is free 9 | for academic use. We hope to provide moderate amount of data for new 10 | researchers in the field of speech recognition. 11 |

12 | 13 | You can cite the data 14 | using the following BibTeX entry: 15 |

16 | 
17 | @inproceedings{aishell_2017,
18 |   title={AIShell-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline},
19 |   author={Hui Bu and Jiayu Du and Xingyu Na and Bengu Wu and Hao Zheng},
20 |   booktitle={Oriental COCOSDA 2017},
21 |   pages={Submitted},
22 |   year={2017}
23 | }
24 | 
25 | -------------------------------------------------------------------------------- /resources/39/about.html: -------------------------------------------------------------------------------- 1 | Heroico 2 |

3 | The Heroico corpus (LDC2006S37) was originally collected to train acoustic models for pronunciation modeling in Spanish language learning applications. 4 | The corpus consists of two main subcorpora: 5 |

6 | 1. A subcorpus collected at Mexico's Military Academy called Heroico. 7 |

8 | 2. A subcorpus collected at the United States Military Academy (USMA) in West Point New York. 9 |

10 | The Heroico corpus is further divided into recited and prompted speech subcorpora. 11 | The recited speech appears under the recordings directory and the prompted speech under the answers directory. 12 |

13 | The USMA subcorpus includes 1.2 hours of speech from nonnative informants and 1 hour of speech from native speakers. 14 | All the speech in the USMA corpus was recited. 15 |

16 | The Heroico subcorpus has 11.8 hours of speech. 17 | One hour segment of speech in the Heroico corpus was recited from the same set of prompts that was used in the USMA collection. 18 | -------------------------------------------------------------------------------- /resources/12/info.txt: -------------------------------------------------------------------------------- 1 | name: LibriSpeech ASR corpus 2 | summary: Large-scale (1000 hours) corpus of read English speech 3 | category: speech 4 | license: CC BY 4.0 5 | file: dev-clean.tar.gz development set, "clean" speech 6 | file: dev-other.tar.gz development set, "other", more challenging, speech 7 | file: test-clean.tar.gz test set, "clean" speech 8 | file: test-other.tar.gz test set, "other" speech 9 | file: train-clean-100.tar.gz training set of 100 hours "clean" speech 10 | file: train-clean-360.tar.gz training set of 360 hours "clean" speech 11 | file: train-other-500.tar.gz training set of 500 hours "other" speech 12 | file: intro-disclaimers.tar.gz extracted LibriVox announcements for some of the speakers 13 | file: original-mp3.tar.gz LibriVox mp3 files, from which corpus' audio was extracted 14 | file: original-books.tar.gz Project Gutenberg texts, against which the audio in the corpus was aligned 15 | file: raw-metadata.tar.gz Some extra meta-data produced during the creation of the corpus 16 | file: md5sum.txt MD5 checksums for the archive files 17 | -------------------------------------------------------------------------------- /resources/47/about.html: -------------------------------------------------------------------------------- 1 | This free Chinese Mandarin speech corpus set is released by Shanghai Primewords Information Technology Co., Ltd. 2 | 3 |

4 | The corpus is recorded by smart mobile phones from 296 native Chinese speakers. The transcription accuracy is larger than 98%, at the confidence level of 95%. It is free for academic use. 5 |

6 | 7 |

8 | The mapping between the transcript and utterance is given in JSON format. 9 |

10 | 11 |

12 | You can cite the data using the following BibTeX entry: 13 |

14 |     @misc{primewords_201801,
15 |     title={Primewords Chinese Corpus Set 1},
16 |     author={Primewords Information Technology Co., Ltd.},
17 |     year={2018},
18 |     note={\url{https://www.primewords.cn}}
19 |     }
20 |   
21 |

22 | 23 |

24 | CONTACT 25 | 26 | Yinghui Liu, yinghui_liu@primewords.cn 27 |

28 | 29 |

30 | External URLs: https://www.primewords.cn 31 |

32 | -------------------------------------------------------------------------------- /resources/48/about.html: -------------------------------------------------------------------------------- 1 | These are unofficial data splits for the corpus MADCAT Arabic (LDC2013T15, LDC2013T09, LDC2012T15). 2 | LDC is providing only training data for these corpora and not the original dev/eval sets, so the original 3 | training data have been split into three different disjoint parts (i.e. there shouldn't be sentences/lines 4 | from the same document in different sets -- as each document is handwritten/transcribed 5 | by a different author in the MADCAT data) to allow for evaluation of the performance in the usual way. 6 |

7 | Also, please note that the license relates only to the splits. You still need to obtain the original databases 8 | and respect the databases' license! 9 |

10 | It contains the madcat xml name and segment id (s{1,2,3,4}). For example: 11 |

12 | 	groups.google.com_women1000_508c404bd84f8ba3_ARB_20060426_124900_3_LDC0188.madcat.xml s1
13 | 	groups.google.com_women1000_508c404bd84f8ba3_ARB_20060426_124900_3_LDC0188.madcat.xml s2
14 | 	groups.google.com_women1000_508c404bd84f8ba3_ARB_20060426_124900_3_LDC0188.madcat.xml s3
15 | 
16 | 17 | -------------------------------------------------------------------------------- /resources/54/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed audio data for Nepali. The data set consists of wave files, and a TSV file. 2 | The file utt_spk_text.tsv contains a FileID, anonymized UserID and the transcription of audio in the file. 3 |

4 | The data set has been manually quality checked, but there might still be errors. 5 |

6 | See LICENSE.txt file for license information. 7 |

8 | Copyright 2016, 2017, 2018 Google, Inc. 9 |

10 | If you use this data in publications, please cite it as follows: 11 |

12 |   @inproceedings{kjartansson-etal-sltu2018,
13 |     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},
14 |     author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
15 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
16 |     year  = {2018},
17 |     address = {Gurugram, India},
18 |     month = aug,
19 |     pages = {52--55},
20 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-11}
21 |   }
22 | 
23 | -------------------------------------------------------------------------------- /resources/52/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed audio data for Sinhala. The data set consists 2 | of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID, 3 | anonymized UserID and the transcription of audio in the file. 4 |

5 | The data set has been manually quality checked, but there might still be errors. 6 |

7 | See LICENSE.txt file for license information. 8 |

9 | Copyright 2016, 2017, 2018 Google, Inc. 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-sltu2018,
14 |     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},
15 |     author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {52--55},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-11}
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/53/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed audio data for Bengali. The data set consists 2 | of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID, 3 | anonymized UserID and the transcription of audio in the file. 4 |

5 | The data set has been manually quality checked, but there might still be errors. 6 |

7 | See LICENSE.txt file for license information. 8 |

9 | Copyright 2016, 2017, 2018 Google, Inc. 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-sltu2018,
14 |     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},
15 |     author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {52--55},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-11}
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/61/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Argentinian Spanish speech data set. 2 | summary: Data set which contains 5739 recordings of native speakers of Spanish 3 | recorded in Buenos Aires, Argentina. The data set has both male and female 4 | recordings. Part of the data set is a small section of weather messages, 5 | recorded both in Peninsular Spanish (90 messages) as well as Argentinian Spanish 6 | (90 messages). 7 | category: speech 8 | license: Attribution-ShareAlike 4.0 International 9 | file: about.html Information about the data set 10 | file: LICENSE License information for the data set 11 | file: line_index_female.tsv All utterances for the female speakers. 12 | file: line_index_male.tsv All utterances for the male speakers. 13 | file: es_es_line_index_weather.tsv Weather messages in Peninsular Spanish. 14 | file: es_ar_line_index_weather.tsv Weather messages in Argentinian Spanish. 15 | file: es_ar_female.zip Archive file with all audio for the female speakers. 16 | file: es_ar_male.zip Archive file with all audio for the male speakers. 17 | file: es_weather_messages.zip Archive file with the weather messages. 18 | -------------------------------------------------------------------------------- /resources/36/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed audio data for Sundanese. The data set consists of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file. 2 |

3 | The data set has been manually quality checked, but there might still be errors. 4 |

5 | This dataset was collected by Google in Indonesia. 6 |

7 | See LICENSE.txt file for license information. 8 |

9 | Copyright 2016, 2017 Google, Inc. 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-sltu2018,
14 |     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},
15 |     author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {52--55},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-11},
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/35/info.txt: -------------------------------------------------------------------------------- 1 | name: Large Javanese ASR training data set 2 | summary: Javanese ASR training data set containing ~185K utterances. 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: asr_javanese.sha256 Checksum for the files 6 | file: LICENSE License information for the data set 7 | file: utt_spk_text.tsv All utterances in the data set 8 | file: asr_javanese_0.zip Data set, file 0/15 9 | file: asr_javanese_1.zip Data set, file 1/15 10 | file: asr_javanese_2.zip Data set, file 2/15 11 | file: asr_javanese_3.zip Data set, file 3/15 12 | file: asr_javanese_4.zip Data set, file 4/15 13 | file: asr_javanese_5.zip Data set, file 5/15 14 | file: asr_javanese_6.zip Data set, file 6/15 15 | file: asr_javanese_7.zip Data set, file 7/15 16 | file: asr_javanese_8.zip Data set, file 8/15 17 | file: asr_javanese_9.zip Data set, file 9/15 18 | file: asr_javanese_a.zip Data set, file 10/15 19 | file: asr_javanese_b.zip Data set, file 11/15 20 | file: asr_javanese_c.zip Data set, file 12/15 21 | file: asr_javanese_d.zip Data set, file 13/15 22 | file: asr_javanese_e.zip Data set, file 14/15 23 | file: asr_javanese_f.zip Data set, file 15/15 24 | 25 | -------------------------------------------------------------------------------- /resources/36/info.txt: -------------------------------------------------------------------------------- 1 | name: Large Sundanese ASR training data set 2 | summary: Sundanese ASR training data set containing ~220K utterances. 
3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: asr_sundanese.sha256 Checksum for the files 6 | file: LICENSE License information for the data set 7 | file: utt_spk_text.tsv All utterances in the data set 8 | file: asr_sundanese_0.zip Data set, file 0/15 9 | file: asr_sundanese_1.zip Data set, file 1/15 10 | file: asr_sundanese_2.zip Data set, file 2/15 11 | file: asr_sundanese_3.zip Data set, file 3/15 12 | file: asr_sundanese_4.zip Data set, file 4/15 13 | file: asr_sundanese_5.zip Data set, file 5/15 14 | file: asr_sundanese_6.zip Data set, file 6/15 15 | file: asr_sundanese_7.zip Data set, file 7/15 16 | file: asr_sundanese_8.zip Data set, file 8/15 17 | file: asr_sundanese_9.zip Data set, file 9/15 18 | file: asr_sundanese_a.zip Data set, file 10/15 19 | file: asr_sundanese_b.zip Data set, file 11/15 20 | file: asr_sundanese_c.zip Data set, file 12/15 21 | file: asr_sundanese_d.zip Data set, file 13/15 22 | file: asr_sundanese_e.zip Data set, file 14/15 23 | file: asr_sundanese_f.zip Data set, file 15/15 24 | 25 | -------------------------------------------------------------------------------- /resources/85/about.html: -------------------------------------------------------------------------------- 1 | The data is used in AISHELL Speaker Verification Challenge 2019. It is extracted from a larger database called AISHELL-WakeUp-1. 2 |

3 | 4 | The contents are wake-up words "Hi, Mia" in both Chinese and English. 5 | The data is collected in real home environment using microphone arrays and Hi-Fi microphone. 6 | The collection process and development of a baseline system was described in the paper below. 7 | The data used in the challenge is extracted from 1 Hi-Fi microphone and 16-channel circular microphone arrays for 1/3/5 meters. 8 | And the contents are the Chinese wake-up words. The whole set is divided into train (254 people), dev (42 people) and test (44 people) subsets. 9 | Test subset is provided with paired target/non-target answer to evaluate verification results. 10 |

11 | 12 | You can cite the data using the following BibTeX entry: 13 |

14 | 
15 | @misc{himia,
16 |     title={HI-MIA : A Far-field Text-Dependent Speaker Verification Database and the Baselines},
17 |     author={Xiaoyi Qin and Hui Bu and Ming Li},
18 |     year={2019},
19 |     eprint={1912.01231},
20 |     archivePrefix={arXiv},
21 |     primaryClass={cs.SD}
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/resources/59/about.html:
--------------------------------------------------------------------------------
 1 |   

ParlamentParla is a speech corpus for Catalan, published by the workers' cooperative Col·lectivaT. The audio segments were extracted from recordings of the Catalan Parliament (Parlament de Catalunya) plenary sessions. The recordings were aligned with their transcripts, and the 320 hours of cleanest segments were extracted. The content belongs to the Catalan Parliament and the data is released in accordance with their terms of use.

2 | 3 |

Preparation of this corpus was supported by the Department of Culture of the Catalan autonomous government.

4 | 5 |

The audio files are PCM 16bit mono, little endian with the sample rate 16 kHz. As of release version 1.0, the corpus is separated into 90 hours of clean and 230 hours of other quality segments.

6 | 7 |

Contact: info@collectivat.cat

8 | 9 | https://collectivat.cat/asr   The official ParlamentParla corpus webpage, with other resources and updates

10 | 11 | -------------------------------------------------------------------------------- /resources/67/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | TEDx Spanish Corpus 4 | 5 | 6 | 7 | 8 | The TEDx Spanish Corpus is a gender unbalanced corpus of 24 hours of duration. 9 |

10 | It contains spontaneous speech of several speakers in TEDx events; most of them are men. 11 |

12 | Transcriptions are presented in lowercase with no punctuation marks. 13 |

14 | The data collection process was partly developed by the social service program "Desarrollo de Tecnologías del Habla" that depends on the National Autonomous University of Mexico and partly by the CIEMPIESS-UNAM project (http://www.ciempiess.org/) 15 |

16 | Special thanks to the TED-Talks team for allowing us to share this dataset. 17 |

18 | You can cite the data using the following BibTeX entry: 19 |

20 | 
21 | @misc{mena_2019,
22 | 	title = "{TEDx Spanish Corpus. Audio and transcripts in Spanish taken from the TEDx Talks; shared under the CC BY-NC-ND 4.0 license}",
23 | 	author = "Hernandez-Mena, Carlos D.",
24 | 	howpublished = "Web Download",
25 | 	institution = "Universidad Nacional Autonoma de Mexico",
26 | 	location = "Mexico City",
27 | 	year = "2019"
28 | }
29 | 
30 | 
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/resources/35/about.html:
--------------------------------------------------------------------------------
 1 | This data set contains transcribed audio data for Javanese. The data set consists of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file.
 2 | 

3 | The data set has been manually quality checked, but there might still be errors. 4 |

5 | This dataset was collected by Google in collaboration with Reykjavik University and Universitas Gadjah Mada in Indonesia. 6 |

7 | See LICENSE.txt file for license information. 8 |

9 | Copyright 2016, 2017 Google, Inc. 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-sltu2018,
14 |     title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},
15 |     author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {52--55},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-11},
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/32/about.html: -------------------------------------------------------------------------------- 1 | This data set contains multi-speaker high quality transcribed audio data for four languages of South Africa. The data set consists of wave files, and a TSV file transcribing the audio. In each folder, the file line_index.tsv contains a FileID, which in turn contains the UserID and the Transcription of audio in the file. 2 |

3 | The data set has had some quality checks, but there might still be errors. 4 |

5 | This data set was collected as a collaboration between North West University and Google. 6 |

7 | See LICENSE.txt file for license information. 8 |

9 | Copyright 2017 Google, Inc. 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{van-niekerk-etal-2017,
14 |     title = {{Rapid development of TTS corpora for four South African languages}},
15 |     author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson and Martin Jansche and Linne Ha},
16 |     booktitle = {Proc. Interspeech 2017},
17 |     pages = {2178--2182},
18 |     address = {Stockholm, Sweden},
19 |     month = aug,
20 |     year  = {2017},
21 |     URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/42/about.html: -------------------------------------------------------------------------------- 1 | This data set contains high-quality transcribed audio data for Khmer. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number. 2 |

3 | The data set has been manually quality checked, but there might still be errors. 4 |

5 | This dataset was collected by Google. 6 |

7 | See LICENSE file for license information. 8 |

9 | Copyright 2016, 2017, 2018 Google LLC 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-tts-sltu2018,
14 |     title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 |     author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {66--70},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/43/about.html: -------------------------------------------------------------------------------- 1 | This data set contains high-quality transcribed audio data for Nepali. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number. 2 |

3 | The data set has been manually quality checked, but there might still be errors. 4 |

5 | This dataset was collected by Google in Nepal. 6 |

7 | See LICENSE.txt file for license information. 8 |

9 | Copyright 2016, 2017, 2018 Google LLC 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-tts-sltu2018,
14 |     title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 |     author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {66--70},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/2/info.txt: -------------------------------------------------------------------------------- 1 | name: OpenFST 2 | summary: A mirror of the OpenFst toolkit 3 | category: software 4 | license: Apache 2.0 5 | file: openfst-1.3.2.tar.gz Version 1.3.2 6 | file: openfst-1.3.3.tar.gz Version 1.3.3 7 | file: openfst-1.3.4.tar.gz Version 1.3.4 8 | file: openfst-1.4.1.tar.gz Version 1.4.1 9 | file: openfst-1.5.4.tar.gz Version 1.5.4 10 | file: openfst-1.6.2.tar.gz Version 1.6.2 11 | file: openfst-1.6.5.tar.gz Version 1.6.5 12 | file: openfst-1.6.7.tar.gz Version 1.6.7 13 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.3.2.tar.gz Version 1.3.2 14 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.3.3.tar.gz Version 1.3.3 15 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.3.4.tar.gz Version 1.3.4 16 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.4.1.tar.gz Version 1.4.1 17 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.5.4.tar.gz Version 1.5.4 18 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.6.2.tar.gz Version 1.6.2 19 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.6.5.tar.gz Version 1.6.5 20 | alternate_url: http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.7.tar.gz Version 1.6.7 21 | -------------------------------------------------------------------------------- /resources/44/about.html: -------------------------------------------------------------------------------- 1 | This data set contains high-quality transcribed audio data for Sundanese. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number. 2 |

3 | The data set has been manually quality checked, but there might still be errors. 4 |

5 | This dataset was collected by Google in collaboration with Universitas Pendidikan Indonesia. 6 |

7 | See LICENSE file for license information. 8 |

9 | Copyright 2016, 2017, 2018 Google LLC 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-tts-sltu2018,
14 |     title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 |     author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {66--70},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/41/about.html: -------------------------------------------------------------------------------- 1 | This data set contains high-quality transcribed audio data for Javanese. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number. 2 |

3 | The data set has been manually quality checked, but there might still be errors. 4 |

5 | This dataset was collected by Google in collaboration with Gadjah Mada University in Indonesia. 6 |

7 | See LICENSE file for license information. 8 |

9 | Copyright 2016, 2017, 2018 Google LLC 10 |

11 | If you use this data in publications, please cite it as follows: 12 |

13 |   @inproceedings{kjartansson-etal-tts-sltu2018,
14 |     title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 |     author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 |     booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 |     year  = {2018},
18 |     address = {Gurugram, India},
19 |     month = aug,
20 |     pages = {66--70},
21 |     URL   = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 |   }
23 | 
24 | -------------------------------------------------------------------------------- /resources/57/about.html: -------------------------------------------------------------------------------- 1 | African Accented French Corpus 2 |

3 | This corpus consists of approximately 22 hours of speech recordings. 4 | Transcripts are provided for all the recordings. 5 | The corpus can be divided into 3 parts: 6 |

7 | 1. Yaounde 8 |

9 | Collected by a team from the U.S. Military Academy's Center for Technology Enhanced Language Learning (CTELL) in 2003 in Yaoundé, Cameroon. It has recordings from 84 speakers, 48 male and 36 female. 10 |

11 | 2. CA16 12 |

13 | This part was collected by a RDECOM Science Team who participated in the United Nations exercise Central Accord 16 (CA16) in Libreville, Gabon in June 2016. The Science Team included DARPA's Dr. Boyan Onyshkevich and Dr. Aaron Lawson (SRI International), as well as RDECOM scientists. 14 | It has recordings from 125 speakers from Cameroon, Chad, Congo and Gabon. 15 |

16 | 3. Niger 17 |

18 | This part was collected from 23 speakers in Niamey, Niger, Oct. 26-30 2015. These speakers were students in a course for officers and sergeants presented by Army trainers assigned to U.S. Army Africa. The data was collected by RDECOM Science & Technology Advisors Major Eddie Strimel and Mr. Bill Bergen. 19 |

20 | Visit this page for further info 21 | -------------------------------------------------------------------------------- /resources/87/about.html: -------------------------------------------------------------------------------- 1 | The MobvoiHotwords is a corpus of wake-up words collected from a commercial smart speaker of Mobvoi. It consists of keyword and non-keyword utterances. 2 |

3 | For keyword data, utterances containing either 'Hi xiaowen' or 'Nihao Wenwen' were collected. For each keyword, there are about 36k utterances. All keyword data was collected from 788 subjects, ages 3-65, at different distances from the smart speaker (1, 3 and 5 meters). Different noises (typical home environment noises like music and TV) with varying sound pressure levels were played in the background during the collection. The keyword data is identical to the keyword data used in the paper below: 4 |

 5 | @article{DBLP:journals/spl/HouSOHX19,
 6 |   author    = {Jingyong Hou and
 7 |                Yangyang Shi and
 8 |                Mari Ostendorf and
 9 |                Mei{-}Yuh Hwang and
10 |                Lei Xie},
11 |   title     = {Region Proposal Network Based Small-Footprint Keyword Spotting},
12 |   journal   = {{IEEE} Signal Process. Lett.},
13 |   volume    = {26},
14 |   number    = {10},
15 |   pages     = {1471--1475},
16 |   year      = {2019},
17 |   url       = {https://doi.org/10.1109/LSP.2019.2936282},
18 |   doi       = {10.1109/LSP.2019.2936282}
19 | }
20 | 
21 | There are also ~220 hours of non-keyword data, collected from the same smart speaker, that can be used as negative training samples.
22 | 


--------------------------------------------------------------------------------
/resources/16/about.html:
--------------------------------------------------------------------------------
 1 | This is a mirror of The AMI Corpus acoustic data originally hosted on http://groups.inf.ed.ac.uk/ami/corpus/
 2 | 
 3 | 

4 | The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written. The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers. 5 |

6 | 7 | The associated paper(s) describing the data:
8 | 9 |
    10 |
  • Jean Carletta (2007). Unleashing the killer corpus: experiences in creating the multi-everything AMI Meeting Corpus. Language Resources and Evaluation Journal 41(2): 181-190. pdf 11 |
  • 12 |
  • Steve Renals, Thomas Hain, and Hervé Bourlard (2007). Recognition and interpretation of meetings: The AMI and AMIDA projects. In Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU '07). pdf
  • 13 |
14 | 15 | -------------------------------------------------------------------------------- /resources/35/asr_javanese.sha256: -------------------------------------------------------------------------------- 1 | a871c8b71ff8fa9d95955447ca0c388e8c6f925aecfce92e1880bda2da113bcb asr_javanese_0.zip 2 | 8024b18acc265bd502f2c36930ab41bd9a8a9cbc67d3db340698df1f6799eeef asr_javanese_1.zip 3 | c1605da9f74b0951533bcd9bb66a868dc4552929a6e3597d1f6b66c8436cd87e asr_javanese_2.zip 4 | f813cfa6ea5db1a2c7af65d62dd4d2edc932e67990570f0e5418675c0c9443d3 asr_javanese_3.zip 5 | 506af733d9c1f02372e83e997c924fac5a8141a7920d1ab345bd607e26438f0c asr_javanese_4.zip 6 | 5300df2d2fd95033632fe7d3d77042804c92bf4f9983f11e707c20e358e45a91 asr_javanese_5.zip 7 | a487e12f9d3fd1d3e6d8a8c2b58363813d6121e6a84937ec0d27601fea2654db asr_javanese_6.zip 8 | 944ce7e3463f2e0d6024f8a1768e161a64dd4ab7cf8a96b7924fb8666ae2142e asr_javanese_7.zip 9 | cb598b81bd681dc51965c912bf4aabc4af6eb9b57d5a7cb0998ed121cec63dcd asr_javanese_8.zip 10 | 7ee9de72360a59dc2a3cd3570627565a638d7a47f0f95ce4c14545bc9b6690b2 asr_javanese_9.zip 11 | 1fd1e4b06ed5d18614ef7ce414e7e0b6c105d6f5d87b3a6210fcedc4cc6f35cd asr_javanese_a.zip 12 | 036bb70c60e8ba4b9be090dcd717e1da8744dd1cfdfab1eb4a4cd29d7755b938 asr_javanese_b.zip 13 | a46d7b1ad184a4c2ac9099c8399f18fb8b14dd9ab4172a61f8abe3e464f7b2b9 asr_javanese_c.zip 14 | 9f3058916fe721f92a4d1a6c2794d82920b7c88ed780ef06fe69f8e448d0ddb6 asr_javanese_d.zip 15 | d9234d3331fb11c082bc17f3b54c13dfa183c4cb13e35c030f7a1dbbe4c819cd asr_javanese_e.zip 16 | 1bedbc295e4d1592e5730da8f0774fe360fe146d193b9c9815a8025072dd0b70 asr_javanese_f.zip 17 | -------------------------------------------------------------------------------- /resources/36/asr_sundanese.sha256: -------------------------------------------------------------------------------- 1 | 947a0ac86008b88130f7c8f1b27d4a0f93886f653cf65b5948c0532cd0097c0d asr_sundanese_0.zip 2 | 365f052dd9d977343002289ea1f29dea466f1243e5edf22dfb933e3fa93a6d87 
asr_sundanese_1.zip 3 | f9b9ee2a925d4fd934be3ebe09545ffb3f294f1e6d1380e837054fdf4ce8cff2 asr_sundanese_2.zip 4 | ba3cc0e8e351a5456269c72edf7a3b50cf820941f93d7eed0e8f02a3b1b0a89f asr_sundanese_3.zip 5 | a6ca66e2537bd55dfaea4e716d847c70aead58c217184ab37afbd4065cca9262 asr_sundanese_4.zip 6 | 31bb8a9981b45855ab0b7c634c89040fe99b122455750a6ab956393dc9dec0d8 asr_sundanese_5.zip 7 | 3f23d6c4c67dc6f39a8ebb2af43e2efedb57028abb85eb519394f2d9ef8b3a21 asr_sundanese_6.zip 8 | bce8f33b6ed62978915dfc601957162e9eece8bc3190cd2d548d7679409a3d77 asr_sundanese_7.zip 9 | 755e0af77d0bd6d4aa7895b2ab9fbf792c57efc49c8cec21d3d728fe3374b621 asr_sundanese_8.zip 10 | 5d426d2c99eb91ffd3db193d510e288133c426556430fe2e70e08f58815f5a31 asr_sundanese_9.zip 11 | e032537b62aa8a8abe660bca418ac2e26a93bdc7a357b948a301bde286952fa5 asr_sundanese_a.zip 12 | e999e83fde37ec973b1a1822aaa8769488c2a95058a3448661ac94c319881549 asr_sundanese_b.zip 13 | 275ac684fe7b8bf012dc251ddb91496e2d95c2c257ec87ab0847efa379e96787 asr_sundanese_c.zip 14 | 34ae64f8a29ddef2e05ca5ce8122b461a737d58d796dbe577a4e8a4a05c6b2ce asr_sundanese_d.zip 15 | 25e36087063e0cc5e54cf04e5a4e065b19e0c1bc9cbc07a9f98635941b53bfea asr_sundanese_e.zip 16 | 3d1410c31cc70994f82b9555967fa4c8d682aee288cc85b05b9c4e6352a49f14 asr_sundanese_f.zip 17 | -------------------------------------------------------------------------------- /resources/19/about.html: -------------------------------------------------------------------------------- 1 |
 2 | This is the TED-LIUM corpus release 2, 
 3 | licensed under Creative Commons BY-NC-ND 3.0 (http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en). 
 4 | 
 5 | All talks and text are property of TED Conferences LLC. 
 6 | 
 7 | --- 
 8 | 
 9 | The TED-LIUM corpus was made from audio talks and their transcriptions available on the TED website. We have prepared and filtered these data in order to train acoustic models to participate in the International Workshop on Spoken Language Translation 2011 (the LIUM English/French SLT system reached the first rank in the SLT task). 
10 | 
11 | More details are given in this paper: 
12 | 
13 | A. Rousseau, P. Deléglise, and Y. Estève, "Enhancing the TED-LIUM Corpus with Selected Data for Language Modeling and More TED Talks",
14 | in Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14), May 2014.
15 | 
16 | 
17 | Please cite this reference if you use these data in your research work. 
18 | 
19 | --- 
20 | 
21 | Contents: 
22 | 
23 | - 1495 audio talks in NIST sphere format (SPH) 
24 | - 1495 transcripts in STM format 
25 | - Dictionary with pronunciation (159848 entries) 
26 | - Selected monolingual data for language modeling from WMT12 publicly available corpora
27 | 
28 | 
29 | SPH format info: 
30 | 
31 | Channels			: 1
32 | Sample Rate		: 16000
33 | Precision			: 16-bit
34 | Bit Rate			: 256k
35 | Sample Encoding	: 16-bit Signed Integer PCM
36 | 
37 | 
38 | -------------------------------------------------------------------------------- /resources/55/about.html: -------------------------------------------------------------------------------- 1 | CLMAD is an open Chinese Language Model Adaptation Dataset. The dataset contains 740,000 news documents in 14 classes. Several necessary preprocessing steps are applied to the dataset for language model training. Documents are split into sentences at punctuation marks, and then all punctuation is removed. The ICTCLAS word segmentation tool is used to segment continuous character sequences into word sequences. Each class of text is split into a training set and a testing set. The testing set consists of 7000 randomly selected sentences. The training set and the testing set do not overlap. Detailed comparative experiments on four selected domains (fashion, finance, sport, and stock) are shown in our paper "CLMAD: A Chinese Language Model Adaptation Dataset", Ye Bai, Jianhua Tao, Jiangyan Yi, Zhengqi Wen, Cunhang Fan, ISCSLP 2018 (submitted). 2 | 3 |

4 | The dataset is extended from the THUCNews text classification dataset. We thank the NLP lab of Tsinghua University for providing the THUCNews corpus, and Dr. Zhiyuan Liu for permitting us to extend this corpus. 5 |

6 | 7 | You can cite the data using the following BibTeX entry: 8 |
 9 | 
10 | @inproceedings{yebai2018clmad,
11 |   title={CLMAD: A Chinese Language Model Adaptation Dataset},
12 |   author={Ye Bai and Jianhua Tao and Jiangyan Yi and Zhengqi Wen and Cunhang Fan},
13 |   booktitle={The Eleventh International Symposium on Chinese Spoken Language Processing (ISCSLP 2018)},
14 |   pages={To Appear},
15 |   year={2018},
16 | }
17 | 
18 | -------------------------------------------------------------------------------- /resources/90/about.html: -------------------------------------------------------------------------------- 1 |
    2 |
  • The Chinese Mandarin speech recognition corpus is provided by speechocean.
  • 3 |
  • This is a 10.33-hour corpus, collected over 4 different microphones 4 | simultaneously.
  • 5 |
  • The corpus was recorded by 20 speakers (10 males and 10 females) in a quiet office. Each 6 | speaker recorded around 120 utterances in one channel.
  • 7 |
  • Transcription files are included.
  • 8 |
  • The sentence transcription accuracy is higher than 98%.
  • 9 |
  • It is totally free to use for academic purposes.
  • 10 |
  • This corpus is a subset of a bigger corpus (159 hours). Please contact us if you are 11 | interested.
  • 12 |
13 | 14 |
External URL
15 | http://en.speechocean.com/member/details/52.html 16 | 17 |
Contact Information
18 | Email: contact@speechocean.com 19 |

20 | Web: http://en.speechocean.com 21 | 22 |

About Speechocean
23 | Speechocean has always devoted itself to providing specialized engineering data products and 24 | services to enterprises and scientific research institutions in the whole industry chain of AI. Our 25 | business involves various domains such as speech recognition, speech synthesis, computer 26 | vision, lexicon, and natural language processing and provides relevant services for the design, 27 | collection, transcription, annotation, etc. of data. 28 | 29 | -------------------------------------------------------------------------------- /resources/5/about.html: -------------------------------------------------------------------------------- 1 | This resource mirrors the transcriptions of Switchboard data that 2 | were generated at Mississippi State, and the associated lexicon. 3 | These were released without any license restrictions. 4 |

5 | The Switchboard (SWB) corpus 6 | is one of the most important historical benchmarks 7 | for recognition tasks involving large vocabulary conversational speech (LVCSR). 8 | It contains 2430 conversations averaging 6 minutes in length; in other words, 9 | over 240 hours of recorded speech, and about 3 million words of text, spoken by 10 | over 500 speakers of both sexes from every major dialect of American English. 11 |

12 | The initial transcriptions for SWB have error rates above 10%, resulting in poor 13 | recognition performance, particularly on hard-to-recognize words such as 14 | monosyllabic words. This release of the SWB transcriptions, which was developed 15 | by the Institute for Signal and Information Processing at Mississippi State 16 | University in the late 1990's, includes transcriptions that were manually 17 | corrected to have error rates below 1%. The release also includes 18 | manually-adjusted segmentations and word alignments. 19 | 20 | 21 | 28 | -------------------------------------------------------------------------------- /resources/76/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Basque sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{kjartansson-etal-2020-open,
20 |     title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
21 |     author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
22 |     booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
23 |     year = {2020},
24 |     pages = {21--27},
25 |     month = may,
26 |     address = {Marseille, France},
27 |     publisher = {European Language Resources Association (ELRA)},
28 |     url = {https://www.aclweb.org/anthology/2020.sltu-1.3},
29 |     ISBN = {979-10-95546-35-1},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/69/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Catalan sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{kjartansson-etal-2020-open,
20 |     title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
21 |     author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
22 |     booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
23 |     year = {2020},
24 |     pages = {21--27},
25 |     month = may,
26 |     address = {Marseille, France},
27 |     publisher = {European Language Resources Association (ELRA)},
28 |     url = {https://www.aclweb.org/anthology/2020.sltu-1.3},
29 |     ISBN = {979-10-95546-35-1},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/77/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Galician sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{kjartansson-etal-2020-open,
20 |     title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
21 |     author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
22 |     booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
23 |     year = {2020},
24 |     pages = {21--27},
25 |     month = may,
26 |     address = {Marseille, France},
27 |     publisher = {European Language Resources Association (ELRA)},
28 |     url = {https://www.aclweb.org/anthology/2020.sltu-1.3},
29 |     ISBN = {979-10-95546-35-1},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/71/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Chilean Spanish 2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV 3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and 4 | the transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 |     title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 |     author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     year = {2020},
24 |     month = may,
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 |     pages = {6504--6513},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/72/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Colombian Spanish 2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV 3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and 4 | the transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 |     title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 |     author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     year = {2020},
24 |     month = may,
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 |     pages = {6504--6513},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/73/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Peruvian Spanish 2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV 3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and 4 | the transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 |     title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 |     author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     year = {2020},
24 |     month = may,
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 |     pages = {6504--6513},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/74/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Puerto Rico Spanish 2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV 3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and 4 | the transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 |     title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 |     author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     year = {2020},
24 |     month = may,
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 |     pages = {6504--6513},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/75/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Venezuelan Spanish 2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV 3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and 4 | the transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 |     title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 |     author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     year = {2020},
24 |     month = may,
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 |     pages = {6504--6513},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/80/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Burmese sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{oo-etal-2020-burmese,
20 |     title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application to Text-to-Speech}},
21 |     author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     month = may,
24 |     year = {2020},
25 |     pages = "6328--6339",
26 |     address = {Marseille, France},
27 |     publisher = {European Language Resources Association (ELRA)},
28 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.777},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/22/about.html: -------------------------------------------------------------------------------- 1 |
 2 | 
 3 | INTRODUCTION
 4 | -----------
 5 | 
 6 | THUYG20 is an open Uyghur speech database published by Center for Speech and Language Technology (CSLT) 
 7 | at Tsinghua University, Signal and Information Processing Lab at Xinjiang University, and the AI cloud 
 8 | research center (AICRC). It involves the full set of speech and language resources required to establish 
 9 | an Uyghur speech recognition system and an Uyghur speaker recognition system.
10 | 
11 | You can cite the data using the following BibTeX entry:
12 | 
13 | 
14 | @inproceedings{THUGY20_2015,
15 |   title={THUGY20: A free Uyghur speech database},
16 |   author={Askar Roze and Shi Yin and Zhiyong Zhang and Dong Wang and Askar Hamdulla},
17 |   booktitle={NCMMSC'15},
18 |   year={2015}
19 | }
20 | 
21 | @inproceedings{THUGY20_sre_2015,
22 |   title={AN OPEN/FREE DATABASE AND BENCHMARK FOR UYGHUR SPEAKER RECOGNITION},
23 |   author={Askar Rozi and Dong Wang and Zhiyong Zhang},
24 |   Booktitle={O-COCOSDA'15},
25 |   year={2015}
26 | }
27 | 
28 | 
29 | PEOPLE
30 | -----------
31 | Dong Wang, Zhiyong Zhang, Shi Yin, Askar Roze @CSLT, Tsinghua Univ.
32 | Askar Hamdulla @Xinjiang Univ.
33 | 
34 | CONTACT
35 | -----------
36 | Dong Wang
37 | Xuewei Zhang
38 | Zhiyong Zhang
39 | 
40 | CSLT, Tsinghua University
41 | wangdong99@mails.tsinghua.edu.cn
42 | {zxw,zhangzy}@cslt.riit.tsinghua.edu.cn
43 | 
44 | 
45 | ROOM1-303, BLDG FIT
46 | Tsinghua University
47 | 
48 | http://cslt.org
49 | http://cslt.riit.tsinghua.edu.cn
50 | 
51 | Askar Hamdulla
52 | Xinjiang University
53 | askarhamdulla@gmail.com
54 | http://erj1.xju.edu.cn/znxx/index.htm
55 | 
56 | 
57 | 
58 | 59 | -------------------------------------------------------------------------------- /resources/65/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Tamil sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{he-etal-2020-open,
20 |     title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 |     author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     month = may,
24 |     year = {2020},
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     pages = {6494--6503},
28 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/66/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Telugu sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{he-etal-2020-open,
20 |     title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 |     author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     month = may,
24 |     year = {2020},
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     pages = {6494--6503},
28 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/63/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Malayalam sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{he-etal-2020-open,
20 |     title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 |     author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     month = may,
24 |     year = {2020},
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     pages = {6494--6503},
28 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/64/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Marathi sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{he-etal-2020-open,
20 |     title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 |     author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     month = may,
24 |     year = {2020},
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     pages = {6494--6503},
28 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/78/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Gujarati sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{he-etal-2020-open,
20 |     title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 |     author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     month = may,
24 |     year = {2020},
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     pages = {6494--6503},
28 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/79/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Kannada sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |

6 | The data set has been manually quality checked, but there might still be errors. 7 |

8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |

13 | See LICENSE file for license information. 14 |

15 | Copyright 2018, 2019 Google, Inc. 16 |

17 | If you use this data in publications, please cite it as follows: 18 |

19 |   @inproceedings{he-etal-2020-open,
20 |     title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 |     author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 |     booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 |     month = may,
24 |     year = {2020},
25 |     address = {Marseille, France},
26 |     publisher = {European Language Resources Association (ELRA)},
27 |     pages = {6494--6503},
28 |     url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 |     ISBN = {979-10-95546-34-4},
30 |   }
31 | 
32 | -------------------------------------------------------------------------------- /resources/60/about.html: -------------------------------------------------------------------------------- 1 | LibriTTS is a multi-speaker English corpus of approximately 585 hours of read English speech at 24kHz sampling rate, prepared by Heiga Zen with the assistance of Google Speech and Google Brain team members. 2 | 3 | The LibriTTS corpus is designed for TTS research. It is derived from the original materials (mp3 audio files from LibriVox and text files from Project Gutenberg) of the LibriSpeech corpus. 4 | The main differences from the LibriSpeech corpus are listed below: 5 |
    6 |
  1. The audio files are at 24kHz sampling rate.
  2. 7 |
  3. The speech is split at sentence breaks.
  4. 8 |
  5. Both original and normalized texts are included.
  6. 9 |
  7. Contextual information (e.g., neighbouring sentences) can be extracted.
  8. 10 |
  9. Utterances with significant background noise are excluded.
  10. 11 |
12 | 13 | For more information, refer to the paper "LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech", Heiga Zen, Viet Dang, Rob Clark, Yu Zhang, Ron J. Weiss, Ye Jia, Zhifeng Chen, and Yonghui Wu, arXiv, 2019. If you use the LibriTTS corpus in your work, please cite this paper where it was introduced. 14 | 15 |

16 | The MD5 checksums of the downloads are as follows (note: not everyone will want to know this). 17 |


18 | 0c3076c1e5245bb3f0af7d82087ee207  dev-clean.tar.gz
19 | 815555d8d75995782ac3ccd7f047213d  dev-other.tar.gz
20 | 7bed3bdb047c4c197f1ad3bc412db59f  test-clean.tar.gz
21 | ae3258249472a13b5abef2a816f733e4  test-other.tar.gz
22 | 4a8c202b78fe1bc0c47916a98f3a2ea8  train-clean-100.tar.gz
23 | a84ef10ddade5fd25df69596a2767b2d  train-clean-360.tar.gz
24 | 7b181dd5ace343a5f38427999684aa6f  train-other-500.tar.gz
25 | 
26 | 27 | -------------------------------------------------------------------------------- /resources/83/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality UK and Ireland English Dialect speech data set. 2 | summary: Data set which contains male and female recordings of English from various dialects of the UK and Ireland. 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: about.html Information about the data set 6 | file: LICENSE License information for the data set 7 | file: line_index_all.csv All utterances in the data set. 8 | file: dialect_info.txt Information about the dialects represented in the data 9 | set. 10 | file: irish_english_male.zip Archive file with recordings from the speakers of 11 | Irish English. 12 | file: midlands_english_female.zip Archive file with recordings from the female 13 | midlands English speakers. 14 | file: midlands_english_male.zip Archive file with recordings from the male 15 | midlands English speakers. 16 | file: northern_english_female.zip Archive file with recordings from the female 17 | northern English speakers. 18 | file: northern_english_male.zip Archive file with recordings from the male 19 | northern English speakers. 20 | file: scottish_english_female.zip Archive file with recordings from the female 21 | Scottish English speakers. 22 | file: scottish_english_male.zip Archive file with recordings from the male 23 | Scottish English speakers. 24 | file: southern_english_female.zip Archive file with recordings from the female 25 | southern English speakers. 26 | file: southern_english_male.zip Archive file with recordings from the male 27 | southern English speakers. 28 | file: welsh_english_female.zip Archive file with recordings from the female 29 | Welsh English speakers. 30 | file: welsh_english_male.zip Archive file with recordings from the male Welsh 31 | English speakers. 
32 | --------------------------------------------------------------------------------