├── .gitignore ├── resources ├── 1 │ ├── waves_yesno.tar.gz │ ├── info.txt │ └── about.html ├── 2 │ ├── openfst-1.3.2.tar.gz │ ├── openfst-1.3.3.tar.gz │ ├── openfst-1.3.4.tar.gz │ ├── openfst-1.4.1.tar.gz │ ├── openfst-1.5.4.tar.gz │ ├── openfst-1.6.2.tar.gz │ ├── openfst-1.6.5.tar.gz │ ├── openfst-1.6.7.tar.gz │ ├── about.html │ └── info.txt ├── 3 │ ├── sph2pipe_v2.5.tar.gz │ ├── info.txt │ └── about.html ├── 4 │ ├── sctk-2.4.0-20091110-0958.tar.bz2 │ ├── sctk-2.4.0-20091110-0958.tar.gz │ ├── sctk-2.4.10-20151007-1312Z.tar.bz2 │ ├── sctk-2.4.8-20130429-2145.tar.bz2 │ ├── sctk-2.4.9-20141015-1634Z.tar.bz2 │ ├── about.html │ └── info.txt ├── 5 │ ├── sw-ms98-dict.text │ ├── switchboard_word_alignments.tar.gz │ ├── info.txt │ └── about.html ├── 6 │ ├── data_voip_cs.tgz │ ├── data_voip_en.tgz │ ├── info.txt │ └── about.html ├── 7 │ ├── TEDLIUM_release1.tar.gz │ ├── info.txt │ └── about.html ├── 8 │ ├── info.txt │ ├── about.html │ ├── lexicon-da.tgz │ └── lexicon-da-nonorm.tgz ├── 9 │ ├── wordlist.50k.gz │ ├── about.html │ └── info.txt ├── 10 │ ├── sre04_key.tgz │ ├── sre04_key-v2.txt.gz │ ├── sre05-key-v7b.txt.gz │ ├── about.html │ ├── sre2000-key.tar.gz │ └── info.txt ├── 11 │ ├── g2p-model-5 │ ├── 3-gram.arpa.gz │ ├── 4-gram.arpa.gz │ ├── librispeech-vocab.txt │ ├── librispeech-lm-corpus.tgz │ ├── 3-gram.pruned.1e-7.arpa.gz │ ├── 3-gram.pruned.3e-7.arpa.gz │ ├── librispeech-lexicon.txt │ ├── librispeech-lm-norm.txt.gz │ ├── about.html │ └── info.txt ├── 12 │ ├── md5sum.txt │ ├── dev-clean.tar.gz │ ├── dev-other.tar.gz │ ├── test-clean.tar.gz │ ├── test-other.tar.gz │ ├── original-mp3.tar.gz │ ├── raw-metadata.tar.gz │ ├── original-books.tar.gz │ ├── train-clean-100.tar.gz │ ├── train-clean-360.tar.gz │ ├── train-other-500.tar.gz │ ├── intro-disclaimers.tar.gz │ ├── about.html │ └── info.txt ├── 13 │ ├── RWCP.tar.gz │ ├── info.txt │ └── about.html ├── 14 │ ├── beep.tar.gz │ └── info.txt ├── 15 │ ├── speaker_list.tgz │ ├── info.txt │ └── about.html ├── 16 │ ├── 
headset.tar.gz │ ├── Array1-01.tar.gz │ ├── Array1-02.tar.gz │ ├── Array1-03.tar.gz │ ├── Array1-04.tar.gz │ ├── Array1-05.tar.gz │ ├── Array1-06.tar.gz │ ├── Array1-07.tar.gz │ ├── Array1-08.tar.gz │ ├── ami_manual_1.6.1.tar.gz │ ├── info.txt │ └── about.html ├── 17 │ ├── musan.tar.gz │ ├── info.txt │ └── about.html ├── 18 │ ├── resource.tgz │ ├── test-noise.tgz │ ├── data_thchs30.tgz │ └── info.txt ├── 19 │ ├── TEDLIUM_release2.tar.gz │ ├── info.txt │ └── about.html ├── 20 │ ├── air_database_release_1_4.zip │ ├── about.html │ └── info.txt ├── 21 │ ├── es_wordlist.json.tgz │ ├── info.txt │ └── about.html ├── 22 │ ├── resource.tar.gz │ ├── test_noise.tar.gz │ ├── data_thuyg20.tar.gz │ ├── data_thuyg20_sre.tar.gz │ ├── test_noise_sre.tar.gz │ ├── info.txt │ └── about.html ├── 23 │ ├── lre07_key.txt │ ├── about.html │ └── info.txt ├── 24 │ ├── iban.tar.gz │ └── info.txt ├── 25 │ ├── data_readspeech_am.tar.bz2 │ ├── data_readspeech_wo.tar.bz2 │ ├── data_broadcastnews_sw.tar.bz2 │ └── info.txt ├── 26 │ ├── sim_rir_16k.zip │ ├── sim_rir_8k.zip │ ├── info.txt │ └── about.html ├── 27 │ ├── cantab-TEDLIUM.tar.bz2 │ ├── cantab-TEDLIUM-partial.tar.bz2 │ └── info.txt ├── 28 │ ├── rirs_noises.zip │ ├── info.txt │ └── about.html ├── 29 │ ├── lexicon-sv.tgz │ ├── info.txt │ └── about.html ├── 30 │ ├── README.txt │ ├── si_lk.tar │ ├── LICENSE.txt │ ├── si_lk.tar.gz │ ├── si_lk.lines.txt │ ├── info.txt │ └── about.html ├── 31 │ ├── dev-clean-2.tar.gz │ ├── train-clean-5.tar.gz │ ├── about.html │ ├── md5sum.txt │ └── info.txt ├── 32 │ ├── af_za.tar.gz │ ├── st_za.tar.gz │ ├── tn_za.tar.gz │ ├── xh_za.tar.gz │ ├── info.txt │ └── about.html ├── 33 │ ├── data_aishell.tgz │ ├── resource_aishell.tgz │ ├── info.txt │ └── about.html ├── 34 │ ├── santiago.tar.gz │ ├── info.txt │ └── about.html ├── 35 │ ├── LICENSE │ ├── asr_javanese_0.zip │ ├── asr_javanese_1.zip │ ├── asr_javanese_2.zip │ ├── asr_javanese_3.zip │ ├── asr_javanese_4.zip │ ├── asr_javanese_5.zip │ ├── asr_javanese_6.zip │ 
├── asr_javanese_7.zip │ ├── asr_javanese_8.zip │ ├── asr_javanese_9.zip │ ├── asr_javanese_a.zip │ ├── asr_javanese_b.zip │ ├── asr_javanese_c.zip │ ├── asr_javanese_d.zip │ ├── asr_javanese_e.zip │ ├── asr_javanese_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ ├── about.html │ └── asr_javanese.sha256 ├── 36 │ ├── asr_sundanese_0.zip │ ├── asr_sundanese_1.zip │ ├── asr_sundanese_2.zip │ ├── asr_sundanese_3.zip │ ├── asr_sundanese_4.zip │ ├── asr_sundanese_5.zip │ ├── asr_sundanese_6.zip │ ├── asr_sundanese_7.zip │ ├── asr_sundanese_8.zip │ ├── asr_sundanese_9.zip │ ├── asr_sundanese_a.zip │ ├── asr_sundanese_b.zip │ ├── asr_sundanese_c.zip │ ├── asr_sundanese_d.zip │ ├── asr_sundanese_e.zip │ ├── asr_sundanese_f.zip │ ├── about.html │ ├── info.txt │ └── asr_sundanese.sha256 ├── 37 │ ├── bn_bd.zip │ ├── bn_in.zip │ ├── info.txt │ ├── README.txt │ └── about.html ├── 38 │ ├── ST-CMDS-20170001_1-OS.tar.gz │ ├── info.txt │ └── about.html ├── 39 │ ├── LDC2006S37.tar.gz │ ├── info.txt │ └── about.html ├── 40 │ ├── zeroth_korean.tar.gz │ ├── info.txt │ └── about.html ├── 41 │ ├── LICENSE │ ├── jv_id_male.zip │ ├── jv_id_female.zip │ ├── info.txt │ └── about.html ├── 42 │ ├── LICENSE │ ├── km_kh_male.zip │ ├── info.txt │ └── about.html ├── 43 │ ├── LICENSE │ ├── ne_np_female.zip │ ├── info.txt │ └── about.html ├── 44 │ ├── LICENSE │ ├── su_id_male.zip │ ├── su_id_female.zip │ ├── info.txt │ └── about.html ├── 45 │ ├── ST-AEDS-20180100_1-OS.tgz │ ├── info.txt │ └── about.html ├── 46 │ ├── Tunisian_MSA.tar.gz │ ├── info.txt │ └── about.html ├── 47 │ ├── primewords_md_2018_set1.tar.gz │ ├── info.txt │ └── about.html ├── 48 │ ├── madcat.dev.raw.lineid │ ├── madcat.test.raw.lineid │ ├── madcat.train.raw.lineid │ ├── info.txt │ └── about.html ├── 49 │ ├── vox1_meta.csv │ ├── vox2_meta.csv │ ├── voxceleb1_test.txt │ ├── voxceleb1_test_v2.txt │ ├── voxceleb1_sitw_overlap.txt │ ├── about.html │ └── info.txt ├── 50 │ ├── madcat.dev.raw.lineid │ ├── madcat.test.raw.lineid │ ├── 
madcat.train.raw.lineid │ ├── info.txt │ └── about.html ├── 51 │ ├── TEDLIUM_release-3.tgz │ └── info.txt ├── 52 │ ├── asr_sinhala_0.zip │ ├── asr_sinhala_1.zip │ ├── asr_sinhala_2.zip │ ├── asr_sinhala_3.zip │ ├── asr_sinhala_4.zip │ ├── asr_sinhala_5.zip │ ├── asr_sinhala_6.zip │ ├── asr_sinhala_7.zip │ ├── asr_sinhala_8.zip │ ├── asr_sinhala_9.zip │ ├── asr_sinhala_a.zip │ ├── asr_sinhala_b.zip │ ├── asr_sinhala_c.zip │ ├── asr_sinhala_d.zip │ ├── asr_sinhala_e.zip │ ├── asr_sinhala_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ └── about.html ├── 53 │ ├── asr_bengali_0.zip │ ├── asr_bengali_1.zip │ ├── asr_bengali_2.zip │ ├── asr_bengali_3.zip │ ├── asr_bengali_4.zip │ ├── asr_bengali_5.zip │ ├── asr_bengali_6.zip │ ├── asr_bengali_7.zip │ ├── asr_bengali_8.zip │ ├── asr_bengali_9.zip │ ├── asr_bengali_a.zip │ ├── asr_bengali_b.zip │ ├── asr_bengali_c.zip │ ├── asr_bengali_d.zip │ ├── asr_bengali_e.zip │ ├── asr_bengali_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ └── about.html ├── 54 │ ├── asr_nepali_0.zip │ ├── asr_nepali_1.zip │ ├── asr_nepali_2.zip │ ├── asr_nepali_3.zip │ ├── asr_nepali_4.zip │ ├── asr_nepali_5.zip │ ├── asr_nepali_6.zip │ ├── asr_nepali_7.zip │ ├── asr_nepali_8.zip │ ├── asr_nepali_9.zip │ ├── asr_nepali_a.zip │ ├── asr_nepali_b.zip │ ├── asr_nepali_c.zip │ ├── asr_nepali_d.zip │ ├── asr_nepali_e.zip │ ├── asr_nepali_f.zip │ ├── utt_spk_text.tsv │ ├── info.txt │ └── about.html ├── 55 │ ├── test.tgz │ ├── train.tgz │ ├── info.txt │ └── about.html ├── 56 │ ├── splits.zip │ ├── info.txt │ └── about.html ├── 57 │ ├── African_Accented_French.tar.gz │ ├── info.txt │ └── about.html ├── 58 │ ├── pansori-tedxkr-corpus-1.0.tar.gz │ └── info.txt ├── 59 │ ├── parlament_v1.0_clean.tar.gz │ ├── parlament_v1.0_other.tar.gz │ ├── info.txt │ └── about.html ├── 60 │ ├── dev-clean.tar.gz │ ├── dev-other.tar.gz │ ├── test-clean.tar.gz │ ├── test-other.tar.gz │ ├── train-clean-100.tar.gz │ ├── train-clean-360.tar.gz │ ├── train-other-500.tar.gz │ ├── 
info.txt │ └── about.html ├── 61 │ ├── LICENSE │ ├── es_ar_male.zip │ ├── es_ar_female.zip │ ├── line_index_male.tsv │ ├── es_weather_messages.zip │ ├── line_index_female.tsv │ ├── es_ar_line_index_weather.tsv │ ├── es_es_line_index_weather.tsv │ └── info.txt ├── 62 │ ├── aidatatang_200zh.tgz │ └── info.txt ├── 63 │ ├── LICENSE │ ├── ml_in_male.zip │ ├── ml_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 64 │ ├── LICENSE │ ├── line_index.tsv │ ├── mr_in_female.zip │ ├── info.txt │ └── about.html ├── 65 │ ├── LICENSE │ ├── ta_in_male.zip │ ├── ta_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 66 │ ├── LICENSE │ ├── te_in_male.zip │ ├── te_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 67 │ ├── tedx_spanish_corpus.tgz │ ├── info.txt │ └── about.html ├── 68 │ ├── dev_set.tar.gz │ ├── metadata.tar.gz │ ├── test_set.tar.gz │ ├── train_set.tar.gz │ └── info.txt ├── 69 │ ├── LICENSE │ ├── ca_es_male.zip │ ├── ca_es_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 70 │ ├── LICENSE │ ├── en_ng_male.zip │ ├── en_ng_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 71 │ ├── LICENSE │ ├── es_cl_male.zip │ ├── es_cl_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 72 │ ├── LICENSE │ ├── es_co_male.zip │ ├── es_co_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 73 │ ├── LICENSE │ ├── es_pe_male.zip │ ├── es_pe_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 74 │ ├── LICENSE │ ├── es_pr_female.zip │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 75 │ ├── LICENSE │ ├── es_ve_male.zip │ ├── es_ve_female.zip │ ├── line_index_male.tsv │ ├── 
line_index_female.tsv │ ├── info.txt │ └── about.html ├── 76 │ ├── LICENSE │ ├── eu_es_male.zip │ ├── eu_es_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 77 │ ├── LICENSE │ ├── gl_es_male.zip │ ├── gl_es_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 78 │ ├── LICENSE │ ├── gu_in_male.zip │ ├── gu_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 79 │ ├── LICENSE │ ├── kn_in_male.zip │ ├── kn_in_female.zip │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 80 │ ├── LICENSE │ ├── my_mm_female.zip │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 81 │ ├── samples.tar.gz │ ├── info.txt │ └── about.html ├── 82 │ ├── cn-celeb.tgz │ └── info.txt ├── 83 │ ├── dialect_info.txt │ ├── line_index_all.csv │ ├── irish_english_male.zip │ ├── welsh_english_male.zip │ ├── midlands_english_male.zip │ ├── northern_english_male.zip │ ├── scottish_english_male.zip │ ├── southern_english_male.zip │ ├── welsh_english_female.zip │ ├── midlands_english_female.zip │ ├── northern_english_female.zip │ ├── scottish_english_female.zip │ ├── southern_english_female.zip │ └── info.txt ├── 84 │ ├── scribblelens.corpus.v1.2.zip │ └── info.txt ├── 85 │ ├── dev.tar.gz │ ├── test.tar.gz │ ├── train.tar.gz │ ├── test_v2.tar.gz │ ├── filename_mapping.tar.gz │ ├── info.txt │ └── about.html ├── 86 │ ├── LICENSE │ ├── yo_ng_male.zip │ ├── yo_ng_female.zip │ ├── annotation_info.txt │ ├── line_index_male.tsv │ ├── line_index_female.tsv │ ├── info.txt │ └── about.html ├── 87 │ ├── mobvoi_hotword_dataset.tgz │ ├── mobvoi_hotword_dataset_resources.tgz │ ├── info.txt │ └── about.html ├── 89 │ ├── Yoloxochitl-Mixtec-Data.tgz │ └── info.txt ├── 90 │ ├── speechocean.zip │ ├── info.txt │ └── about.html └── 91 │ ├── speechoceanfreedata2.zip │ └── info.txt ├── favicon.ico ├── openslr.png ├── 
openslr_ico.png ├── config ├── OpenslrLogo.key └── README ├── robots.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#*\# 3 | 4 | -------------------------------------------------------------------------------- /resources/35/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/LICENSE -------------------------------------------------------------------------------- /resources/41/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/41/LICENSE -------------------------------------------------------------------------------- /resources/42/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/42/LICENSE -------------------------------------------------------------------------------- /resources/43/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/43/LICENSE -------------------------------------------------------------------------------- /resources/44/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/44/LICENSE -------------------------------------------------------------------------------- /resources/55/test.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/55/test.tgz -------------------------------------------------------------------------------- /resources/61/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/LICENSE -------------------------------------------------------------------------------- /resources/63/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/LICENSE 
-------------------------------------------------------------------------------- /resources/64/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/64/LICENSE -------------------------------------------------------------------------------- /resources/65/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/LICENSE -------------------------------------------------------------------------------- /resources/66/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/LICENSE -------------------------------------------------------------------------------- /resources/69/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/LICENSE -------------------------------------------------------------------------------- /resources/70/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/LICENSE -------------------------------------------------------------------------------- /resources/71/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/LICENSE -------------------------------------------------------------------------------- /resources/72/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/LICENSE -------------------------------------------------------------------------------- /resources/73/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/LICENSE -------------------------------------------------------------------------------- /resources/74/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/74/LICENSE 
-------------------------------------------------------------------------------- /resources/75/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/LICENSE -------------------------------------------------------------------------------- /resources/76/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/LICENSE -------------------------------------------------------------------------------- /resources/77/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/LICENSE -------------------------------------------------------------------------------- /resources/78/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/LICENSE -------------------------------------------------------------------------------- /resources/79/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/LICENSE -------------------------------------------------------------------------------- /resources/8/info.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/info.txt -------------------------------------------------------------------------------- /resources/80/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/80/LICENSE -------------------------------------------------------------------------------- /resources/86/LICENSE: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/LICENSE -------------------------------------------------------------------------------- /resources/12/md5sum.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/md5sum.txt 
-------------------------------------------------------------------------------- /resources/30/README.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/README.txt -------------------------------------------------------------------------------- /resources/30/si_lk.tar: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/si_lk.tar -------------------------------------------------------------------------------- /resources/37/bn_bd.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/37/bn_bd.zip -------------------------------------------------------------------------------- /resources/37/bn_in.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/37/bn_in.zip -------------------------------------------------------------------------------- /resources/55/train.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/55/train.tgz -------------------------------------------------------------------------------- /resources/56/splits.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/56/splits.zip -------------------------------------------------------------------------------- /resources/8/about.html: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/about.html -------------------------------------------------------------------------------- /resources/85/dev.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/dev.tar.gz -------------------------------------------------------------------------------- /resources/10/sre04_key.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/10/sre04_key.tgz 
-------------------------------------------------------------------------------- /resources/11/g2p-model-5: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/g2p-model-5 -------------------------------------------------------------------------------- /resources/13/RWCP.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/13/RWCP.tar.gz -------------------------------------------------------------------------------- /resources/14/beep.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/14/beep.tar.gz -------------------------------------------------------------------------------- /resources/17/musan.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/17/musan.tar.gz -------------------------------------------------------------------------------- /resources/18/resource.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/18/resource.tgz -------------------------------------------------------------------------------- /resources/23/lre07_key.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/23/lre07_key.txt -------------------------------------------------------------------------------- /resources/24/iban.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/24/iban.tar.gz -------------------------------------------------------------------------------- /resources/30/LICENSE.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/LICENSE.txt -------------------------------------------------------------------------------- /resources/30/si_lk.tar.gz: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/30/si_lk.tar.gz -------------------------------------------------------------------------------- /resources/32/af_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/af_za.tar.gz -------------------------------------------------------------------------------- /resources/32/st_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/st_za.tar.gz -------------------------------------------------------------------------------- /resources/32/tn_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/tn_za.tar.gz -------------------------------------------------------------------------------- /resources/32/xh_za.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/32/xh_za.tar.gz -------------------------------------------------------------------------------- /resources/49/vox1_meta.csv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/vox1_meta.csv -------------------------------------------------------------------------------- /resources/49/vox2_meta.csv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/vox2_meta.csv -------------------------------------------------------------------------------- /resources/8/lexicon-da.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/lexicon-da.tgz -------------------------------------------------------------------------------- /resources/82/cn-celeb.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/82/cn-celeb.tgz -------------------------------------------------------------------------------- /resources/85/test.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/85/test.tar.gz -------------------------------------------------------------------------------- /resources/85/train.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/train.tar.gz -------------------------------------------------------------------------------- /resources/11/3-gram.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/3-gram.arpa.gz -------------------------------------------------------------------------------- /resources/11/4-gram.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/4-gram.arpa.gz -------------------------------------------------------------------------------- /resources/11/librispeech-vocab.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/vocab.txt -------------------------------------------------------------------------------- /resources/16/headset.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/headset.tar.gz -------------------------------------------------------------------------------- /resources/18/test-noise.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/18/test-noise.tgz -------------------------------------------------------------------------------- /resources/22/resource.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/resource.tar.gz -------------------------------------------------------------------------------- /resources/26/sim_rir_16k.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/26/sim_rir_16k.zip 
-------------------------------------------------------------------------------- /resources/26/sim_rir_8k.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/26/sim_rir_8k.zip -------------------------------------------------------------------------------- /resources/28/rirs_noises.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/28/rirs_noises.zip -------------------------------------------------------------------------------- /resources/29/lexicon-sv.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/29/lexicon-sv.tgz -------------------------------------------------------------------------------- /resources/30/si_lk.lines.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/30/si_lk.lines.txt -------------------------------------------------------------------------------- /resources/34/santiago.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/34/santiago.tar.gz -------------------------------------------------------------------------------- /resources/41/jv_id_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/41/jv_id_male.zip -------------------------------------------------------------------------------- /resources/42/km_kh_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/42/km_kh_male.zip -------------------------------------------------------------------------------- /resources/44/su_id_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/44/su_id_male.zip -------------------------------------------------------------------------------- /resources/6/data_voip_cs.tgz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/6/data_voip_cs.tgz -------------------------------------------------------------------------------- /resources/6/data_voip_en.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/6/data_voip_en.tgz -------------------------------------------------------------------------------- /resources/61/es_ar_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_ar_male.zip -------------------------------------------------------------------------------- /resources/63/ml_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/ml_in_male.zip -------------------------------------------------------------------------------- /resources/64/line_index.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/64/line_index.tsv -------------------------------------------------------------------------------- /resources/65/ta_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/ta_in_male.zip -------------------------------------------------------------------------------- /resources/66/te_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/te_in_male.zip -------------------------------------------------------------------------------- /resources/68/dev_set.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/68/dev_set.tar.gz -------------------------------------------------------------------------------- /resources/68/metadata.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/68/metadata.tar.gz 
-------------------------------------------------------------------------------- /resources/68/test_set.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/68/test_set.tar.gz -------------------------------------------------------------------------------- /resources/69/ca_es_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/ca_es_male.zip -------------------------------------------------------------------------------- /resources/70/en_ng_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/en_ng_male.zip -------------------------------------------------------------------------------- /resources/71/es_cl_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/es_cl_male.zip -------------------------------------------------------------------------------- /resources/72/es_co_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/es_co_male.zip -------------------------------------------------------------------------------- /resources/73/es_pe_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/es_pe_male.zip -------------------------------------------------------------------------------- /resources/75/es_ve_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/es_ve_male.zip -------------------------------------------------------------------------------- /resources/76/eu_es_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/eu_es_male.zip -------------------------------------------------------------------------------- /resources/77/gl_es_male.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/77/gl_es_male.zip -------------------------------------------------------------------------------- /resources/78/gu_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/gu_in_male.zip -------------------------------------------------------------------------------- /resources/79/kn_in_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/kn_in_male.zip -------------------------------------------------------------------------------- /resources/81/samples.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/81/samples.tar.gz -------------------------------------------------------------------------------- /resources/85/test_v2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/test_v2.tar.gz -------------------------------------------------------------------------------- /resources/86/yo_ng_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/yo_ng_male.zip -------------------------------------------------------------------------------- /resources/9/wordlist.50k.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/9/wordlist.50k.gz -------------------------------------------------------------------------------- /resources/90/speechocean.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/90/speechocean.zip -------------------------------------------------------------------------------- /resources/1/waves_yesno.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./1/waves_yesno.tar.gz 
-------------------------------------------------------------------------------- /resources/12/dev-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/dev-clean.tar.gz -------------------------------------------------------------------------------- /resources/12/dev-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/dev-other.tar.gz -------------------------------------------------------------------------------- /resources/12/test-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/test-clean.tar.gz -------------------------------------------------------------------------------- /resources/12/test-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/test-other.tar.gz -------------------------------------------------------------------------------- /resources/15/speaker_list.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/15/speaker_list.tgz -------------------------------------------------------------------------------- /resources/16/Array1-01.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-01.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-02.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-02.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-03.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-03.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-04.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-04.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-05.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-05.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-06.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-06.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-07.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-07.tar.gz -------------------------------------------------------------------------------- /resources/16/Array1-08.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/Array1-08.tar.gz -------------------------------------------------------------------------------- /resources/18/data_thchs30.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/18/data_thchs30.tgz -------------------------------------------------------------------------------- /resources/22/test_noise.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/test_noise.tar.gz -------------------------------------------------------------------------------- /resources/31/dev-clean-2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/31/dev-clean-2.tar.gz -------------------------------------------------------------------------------- /resources/33/data_aishell.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/33/data_aishell.tgz 
-------------------------------------------------------------------------------- /resources/35/asr_javanese_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_0.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_1.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_1.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_2.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_3.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_4.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_5.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_6.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_7.zip -------------------------------------------------------------------------------- 
/resources/35/asr_javanese_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_8.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_9.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_9.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_a.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_b.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_c.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_d.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_e.zip -------------------------------------------------------------------------------- /resources/35/asr_javanese_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/35/asr_javanese_f.zip -------------------------------------------------------------------------------- /resources/35/utt_spk_text.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/35/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/39/LDC2006S37.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/39/LDC2006S37.tar.gz -------------------------------------------------------------------------------- /resources/41/jv_id_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/41/jv_id_female.zip -------------------------------------------------------------------------------- /resources/43/ne_np_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/43/ne_np_female.zip -------------------------------------------------------------------------------- /resources/44/su_id_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/44/su_id_female.zip -------------------------------------------------------------------------------- /resources/49/voxceleb1_test.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/voxceleb1_test.txt -------------------------------------------------------------------------------- /resources/5/sw-ms98-dict.text: -------------------------------------------------------------------------------- 1 | /mnt/resources1/5/sw-ms98-dict.text -------------------------------------------------------------------------------- /resources/52/asr_sinhala_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_0.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_1.zip: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/52/asr_sinhala_1.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_2.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_3.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_4.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_5.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_6.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_7.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_8.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_9.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_9.zip 
-------------------------------------------------------------------------------- /resources/52/asr_sinhala_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_a.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_b.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_c.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_d.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_e.zip -------------------------------------------------------------------------------- /resources/52/asr_sinhala_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/asr_sinhala_f.zip -------------------------------------------------------------------------------- /resources/52/utt_spk_text.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/52/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/53/asr_bengali_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_0.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_1.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_1.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_2.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_3.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_4.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_5.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_6.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_7.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_8.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_9.zip: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/53/asr_bengali_9.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_a.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_b.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_c.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_d.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_e.zip -------------------------------------------------------------------------------- /resources/53/asr_bengali_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/asr_bengali_f.zip -------------------------------------------------------------------------------- /resources/53/utt_spk_text.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/53/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/54/asr_nepali_0.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_0.zip 
-------------------------------------------------------------------------------- /resources/54/asr_nepali_1.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_1.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_2.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_3.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_4.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_5.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_6.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_7.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_8.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_8.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_9.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_9.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_a.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_b.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_c.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_d.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_e.zip -------------------------------------------------------------------------------- /resources/54/asr_nepali_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/asr_nepali_f.zip -------------------------------------------------------------------------------- /resources/54/utt_spk_text.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/54/utt_spk_text.tsv -------------------------------------------------------------------------------- /resources/60/dev-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/dev-clean.tar.gz 
-------------------------------------------------------------------------------- /resources/60/dev-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/dev-other.tar.gz -------------------------------------------------------------------------------- /resources/60/test-clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/test-clean.tar.gz -------------------------------------------------------------------------------- /resources/60/test-other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/test-other.tar.gz -------------------------------------------------------------------------------- /resources/61/es_ar_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_ar_female.zip -------------------------------------------------------------------------------- /resources/63/ml_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/ml_in_female.zip -------------------------------------------------------------------------------- /resources/64/mr_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/64/mr_in_female.zip -------------------------------------------------------------------------------- /resources/65/ta_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/ta_in_female.zip -------------------------------------------------------------------------------- /resources/66/te_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/te_in_female.zip -------------------------------------------------------------------------------- /resources/68/train_set.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/68/train_set.tar.gz -------------------------------------------------------------------------------- /resources/69/ca_es_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/ca_es_female.zip -------------------------------------------------------------------------------- /resources/70/en_ng_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/en_ng_female.zip -------------------------------------------------------------------------------- /resources/71/es_cl_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/es_cl_female.zip -------------------------------------------------------------------------------- /resources/72/es_co_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/es_co_female.zip -------------------------------------------------------------------------------- /resources/73/es_pe_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/es_pe_female.zip -------------------------------------------------------------------------------- /resources/74/es_pr_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/74/es_pr_female.zip -------------------------------------------------------------------------------- /resources/75/es_ve_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/es_ve_female.zip -------------------------------------------------------------------------------- /resources/76/eu_es_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/eu_es_female.zip 
-------------------------------------------------------------------------------- /resources/77/gl_es_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/gl_es_female.zip -------------------------------------------------------------------------------- /resources/78/gu_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/gu_in_female.zip -------------------------------------------------------------------------------- /resources/79/kn_in_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/kn_in_female.zip -------------------------------------------------------------------------------- /resources/80/my_mm_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/80/my_mm_female.zip -------------------------------------------------------------------------------- /resources/83/dialect_info.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/dialect_info.txt -------------------------------------------------------------------------------- /resources/83/line_index_all.csv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/line_index_all.csv -------------------------------------------------------------------------------- /resources/86/yo_ng_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/yo_ng_female.zip -------------------------------------------------------------------------------- /resources/10/sre04_key-v2.txt.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/10/sre04_key-v2.txt.gz -------------------------------------------------------------------------------- /resources/10/sre05-key-v7b.txt.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/10/sre05-key-v7b.txt.gz -------------------------------------------------------------------------------- /resources/12/original-mp3.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/original-mp3.tar.gz -------------------------------------------------------------------------------- /resources/12/raw-metadata.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/raw-metadata.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.3.2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./2/openfst-1.3.2.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.3.3.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./2/openfst-1.3.3.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.3.4.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./2/openfst-1.3.4.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.4.1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1//2/openfst-1.4.1.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.5.4.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1//2/openfst-1.5.4.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.6.2.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1//2/openfst-1.6.2.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.6.5.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/2/openfst-1.6.5.tar.gz -------------------------------------------------------------------------------- /resources/2/openfst-1.6.7.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/2/openfst-1.6.7.tar.gz -------------------------------------------------------------------------------- /resources/21/es_wordlist.json.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/21/es_wordlist.json.tgz -------------------------------------------------------------------------------- /resources/22/data_thuyg20.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/data_thuyg20.tar.gz -------------------------------------------------------------------------------- /resources/3/sph2pipe_v2.5.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./3/sph2pipe_v2.5.tar.gz -------------------------------------------------------------------------------- /resources/31/train-clean-5.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/31/train-clean-5.tar.gz -------------------------------------------------------------------------------- /resources/33/resource_aishell.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/33/resource_aishell.tgz -------------------------------------------------------------------------------- /resources/36/asr_sundanese_0.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_0.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_1.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_1.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_2.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_3.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_3.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_4.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_5.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_5.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_6.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_6.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_7.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_7.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_8.zip: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_8.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_9.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_9.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_a.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_a.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_b.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_b.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_c.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_c.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_d.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_d.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_e.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_e.zip -------------------------------------------------------------------------------- /resources/36/asr_sundanese_f.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/36/asr_sundanese_f.zip -------------------------------------------------------------------------------- /resources/40/zeroth_korean.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/40/zeroth_korean.tar.gz -------------------------------------------------------------------------------- /resources/46/Tunisian_MSA.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/46/Tunisian_MSA.tar.gz -------------------------------------------------------------------------------- /resources/61/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/line_index_male.tsv -------------------------------------------------------------------------------- /resources/62/aidatatang_200zh.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/62/aidatatang_200zh.tgz -------------------------------------------------------------------------------- /resources/63/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/line_index_male.tsv -------------------------------------------------------------------------------- /resources/65/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/line_index_male.tsv -------------------------------------------------------------------------------- /resources/66/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/line_index_male.tsv -------------------------------------------------------------------------------- /resources/69/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/69/line_index_male.tsv -------------------------------------------------------------------------------- /resources/70/line_index_male.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/70/line_index_male.tsv -------------------------------------------------------------------------------- /resources/71/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/line_index_male.tsv -------------------------------------------------------------------------------- /resources/72/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/line_index_male.tsv -------------------------------------------------------------------------------- /resources/73/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/line_index_male.tsv -------------------------------------------------------------------------------- /resources/75/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/line_index_male.tsv -------------------------------------------------------------------------------- /resources/76/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/76/line_index_male.tsv -------------------------------------------------------------------------------- /resources/77/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/line_index_male.tsv -------------------------------------------------------------------------------- /resources/78/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/line_index_male.tsv -------------------------------------------------------------------------------- /resources/79/line_index_male.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/79/line_index_male.tsv -------------------------------------------------------------------------------- /resources/8/lexicon-da-nonorm.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/8/lexicon-da-nonorm.tgz -------------------------------------------------------------------------------- /resources/86/annotation_info.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/annotation_info.txt -------------------------------------------------------------------------------- /resources/86/line_index_male.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/86/line_index_male.tsv -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/favicon.ico -------------------------------------------------------------------------------- /openslr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/openslr.png -------------------------------------------------------------------------------- /resources/12/original-books.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/original-books.tar.gz -------------------------------------------------------------------------------- /resources/12/train-clean-100.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/train-clean-100.tar.gz -------------------------------------------------------------------------------- /resources/12/train-clean-360.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/12/train-clean-360.tar.gz -------------------------------------------------------------------------------- /resources/12/train-other-500.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/train-other-500.tar.gz -------------------------------------------------------------------------------- /resources/16/ami_manual_1.6.1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/16/ami_manual_1.6.1.tar.gz -------------------------------------------------------------------------------- /resources/19/TEDLIUM_release2.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/19/TEDLIUM_release2.tar.gz -------------------------------------------------------------------------------- /resources/22/data_thuyg20_sre.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/data_thuyg20_sre.tar.gz -------------------------------------------------------------------------------- /resources/22/test_noise_sre.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/22/test_noise_sre.tar.gz -------------------------------------------------------------------------------- /resources/27/cantab-TEDLIUM.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/27/cantab-TEDLIUM.tar.bz2 -------------------------------------------------------------------------------- /resources/48/madcat.dev.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/48/madcat.dev.raw.lineid -------------------------------------------------------------------------------- /resources/48/madcat.test.raw.lineid: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/48/madcat.test.raw.lineid -------------------------------------------------------------------------------- /resources/48/madcat.train.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/48/madcat.train.raw.lineid -------------------------------------------------------------------------------- /resources/49/voxceleb1_test_v2.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/voxceleb1_test_v2.txt -------------------------------------------------------------------------------- /resources/50/madcat.dev.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/50/madcat.dev.raw.lineid -------------------------------------------------------------------------------- /resources/50/madcat.test.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/50/madcat.test.raw.lineid -------------------------------------------------------------------------------- /resources/50/madcat.train.raw.lineid: -------------------------------------------------------------------------------- 1 | /mnt/resources1/50/madcat.train.raw.lineid -------------------------------------------------------------------------------- /resources/51/TEDLIUM_release-3.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/51/TEDLIUM_release-3.tgz -------------------------------------------------------------------------------- /resources/60/train-clean-100.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/train-clean-100.tar.gz -------------------------------------------------------------------------------- /resources/60/train-clean-360.tar.gz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/60/train-clean-360.tar.gz -------------------------------------------------------------------------------- /resources/60/train-other-500.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/60/train-other-500.tar.gz -------------------------------------------------------------------------------- /resources/61/es_weather_messages.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_weather_messages.zip -------------------------------------------------------------------------------- /resources/61/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/line_index_female.tsv -------------------------------------------------------------------------------- /resources/63/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/63/line_index_female.tsv -------------------------------------------------------------------------------- /resources/65/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/65/line_index_female.tsv -------------------------------------------------------------------------------- /resources/66/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/66/line_index_female.tsv -------------------------------------------------------------------------------- /resources/67/tedx_spanish_corpus.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/67/tedx_spanish_corpus.tgz -------------------------------------------------------------------------------- /resources/69/line_index_female.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/69/line_index_female.tsv -------------------------------------------------------------------------------- /resources/7/TEDLIUM_release1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/7/TEDLIUM_release1.tar.gz -------------------------------------------------------------------------------- /resources/70/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/70/line_index_female.tsv -------------------------------------------------------------------------------- /resources/71/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/71/line_index_female.tsv -------------------------------------------------------------------------------- /resources/72/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/72/line_index_female.tsv -------------------------------------------------------------------------------- /resources/73/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/73/line_index_female.tsv -------------------------------------------------------------------------------- /resources/74/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/74/line_index_female.tsv -------------------------------------------------------------------------------- /resources/75/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/75/line_index_female.tsv -------------------------------------------------------------------------------- /resources/76/line_index_female.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/76/line_index_female.tsv -------------------------------------------------------------------------------- /resources/77/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/77/line_index_female.tsv -------------------------------------------------------------------------------- /resources/78/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/78/line_index_female.tsv -------------------------------------------------------------------------------- /resources/79/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/79/line_index_female.tsv -------------------------------------------------------------------------------- /resources/80/line_index_female.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/80/line_index_female.tsv -------------------------------------------------------------------------------- /resources/83/irish_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/irish_english_male.zip -------------------------------------------------------------------------------- /resources/83/welsh_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/welsh_english_male.zip -------------------------------------------------------------------------------- /resources/85/filename_mapping.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/85/filename_mapping.tar.gz -------------------------------------------------------------------------------- /resources/86/line_index_female.tsv: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/86/line_index_female.tsv -------------------------------------------------------------------------------- /openslr_ico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/openslr_ico.png -------------------------------------------------------------------------------- /resources/11/librispeech-lm-corpus.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/librispeech-lm-corpus.tgz -------------------------------------------------------------------------------- /resources/12/intro-disclaimers.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/12/intro-disclaimers.tar.gz -------------------------------------------------------------------------------- /resources/45/ST-AEDS-20180100_1-OS.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/45/ST-AEDS-20180100_1-OS.tgz -------------------------------------------------------------------------------- /resources/83/midlands_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/midlands_english_male.zip -------------------------------------------------------------------------------- /resources/83/northern_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/northern_english_male.zip -------------------------------------------------------------------------------- /resources/83/scottish_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/scottish_english_male.zip -------------------------------------------------------------------------------- 
/resources/83/southern_english_male.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/southern_english_male.zip -------------------------------------------------------------------------------- /resources/83/welsh_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/welsh_english_female.zip -------------------------------------------------------------------------------- /resources/91/speechoceanfreedata2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/91/speechoceanfreedata2.zip -------------------------------------------------------------------------------- /resources/11/3-gram.pruned.1e-7.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/3-gram.pruned.1e-7.arpa.gz -------------------------------------------------------------------------------- /resources/11/3-gram.pruned.3e-7.arpa.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/3-gram.pruned.3e-7.arpa.gz -------------------------------------------------------------------------------- /resources/11/librispeech-lexicon.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/librispeech-lexicon-complete.txt -------------------------------------------------------------------------------- /resources/11/librispeech-lm-norm.txt.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/11/librispeech-lm-normtext.txt.gz -------------------------------------------------------------------------------- /resources/20/air_database_release_1_4.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/20/air_database_release_1_4.zip 
-------------------------------------------------------------------------------- /resources/25/data_readspeech_am.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/25/data_readspeech_am.tar.bz2 -------------------------------------------------------------------------------- /resources/25/data_readspeech_wo.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/25/data_readspeech_wo.tar.bz2 -------------------------------------------------------------------------------- /resources/38/ST-CMDS-20170001_1-OS.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/38/ST-CMDS-20170001_1-OS.tar.gz -------------------------------------------------------------------------------- /resources/49/voxceleb1_sitw_overlap.txt: -------------------------------------------------------------------------------- 1 | /mnt/resources1/49/voxceleb1_sitw_overlap.txt -------------------------------------------------------------------------------- /resources/59/parlament_v1.0_clean.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/59/parlament_v1.0_clean.tar.gz -------------------------------------------------------------------------------- /resources/59/parlament_v1.0_other.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/59/parlament_v1.0_other.tar.gz -------------------------------------------------------------------------------- /resources/61/es_ar_line_index_weather.tsv: -------------------------------------------------------------------------------- 1 | /mnt/resources1/61/es_ar_line_index_weather.tsv -------------------------------------------------------------------------------- /resources/61/es_es_line_index_weather.tsv: -------------------------------------------------------------------------------- 1 | 
/mnt/resources1/61/es_es_line_index_weather.tsv -------------------------------------------------------------------------------- /resources/83/midlands_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/midlands_english_female.zip -------------------------------------------------------------------------------- /resources/83/northern_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/northern_english_female.zip -------------------------------------------------------------------------------- /resources/83/scottish_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/scottish_english_female.zip -------------------------------------------------------------------------------- /resources/83/southern_english_female.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/83/southern_english_female.zip -------------------------------------------------------------------------------- /resources/84/scribblelens.corpus.v1.2.zip: -------------------------------------------------------------------------------- 1 | /mnt/resources1/84/scribblelens.corpus.v1.2.zip -------------------------------------------------------------------------------- /resources/87/mobvoi_hotword_dataset.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/87/mobvoi_hotword_dataset.tgz -------------------------------------------------------------------------------- /resources/89/Yoloxochitl-Mixtec-Data.tgz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/89/Yoloxochitl-Mixtec-Data.tgz -------------------------------------------------------------------------------- /resources/9/about.html: 
-------------------------------------------------------------------------------- 1 | This data is downloaded and used by the Kaldi AMI recipe. 2 | -------------------------------------------------------------------------------- /resources/25/data_broadcastnews_sw.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/25/data_broadcastnews_sw.tar.bz2 -------------------------------------------------------------------------------- /resources/27/cantab-TEDLIUM-partial.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/27/cantab-TEDLIUM-partial.tar.bz2 -------------------------------------------------------------------------------- /resources/47/primewords_md_2018_set1.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/47/primewords_md_2018_set1.tar.gz -------------------------------------------------------------------------------- /resources/57/African_Accented_French.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/57/African_Accented_French.tar.gz -------------------------------------------------------------------------------- /config/OpenslrLogo.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/config/OpenslrLogo.key -------------------------------------------------------------------------------- /resources/10/about.html: -------------------------------------------------------------------------------- 1 | These files define the tests for some of NIST's SRE evaluations. 2 | 3 | -------------------------------------------------------------------------------- /resources/31/about.html: -------------------------------------------------------------------------------- 1 | A subset of LibriSpeech created for the purpose of regression testing. 
2 | -------------------------------------------------------------------------------- /resources/4/sctk-2.4.0-20091110-0958.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./4/sctk-2.4.0-20091110-0958.tar.bz2 -------------------------------------------------------------------------------- /resources/4/sctk-2.4.0-20091110-0958.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/./4/sctk-2.4.0-20091110-0958.tar.gz -------------------------------------------------------------------------------- /resources/4/sctk-2.4.10-20151007-1312Z.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/4/sctk-2.4.10-20151007-1312Z.tar.bz2 -------------------------------------------------------------------------------- /resources/4/sctk-2.4.8-20130429-2145.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/4/sctk-2.4.8-20130429-2145.tar.bz2 -------------------------------------------------------------------------------- /resources/4/sctk-2.4.9-20141015-1634Z.tar.bz2: -------------------------------------------------------------------------------- 1 | /mnt/resources1/4/sctk-2.4.9-20141015-1634Z.tar.bz2 -------------------------------------------------------------------------------- /resources/5/switchboard_word_alignments.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/5/switchboard_word_alignments.tar.gz -------------------------------------------------------------------------------- /resources/58/pansori-tedxkr-corpus-1.0.tar.gz: -------------------------------------------------------------------------------- 1 | /mnt/resources1/58/pansori-tedxkr-corpus-1.0.tar.gz -------------------------------------------------------------------------------- /resources/87/mobvoi_hotword_dataset_resources.tgz: 
-------------------------------------------------------------------------------- 1 | /mnt/resources1/87/mobvoi_hotword_dataset_resources.tgz -------------------------------------------------------------------------------- /resources/10/sre2000-key.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danpovey/openslr/HEAD/resources/10/sre2000-key.tar.gz -------------------------------------------------------------------------------- /resources/49/about.html: -------------------------------------------------------------------------------- 1 | This resource contains files for the VoxCeleb corpora that are helpful in speaker recognition recipes. 2 | -------------------------------------------------------------------------------- /resources/31/md5sum.txt: -------------------------------------------------------------------------------- 1 | 6d7ab67ac6a1d2c993d050e16d61080d dev-clean-2.tar.gz 2 | 5df7d4e78065366204ca6845bb08f490 train-clean-5.tar.gz 3 | -------------------------------------------------------------------------------- /robots.txt: -------------------------------------------------------------------------------- 1 | # robots.txt generated at http://www.mcanerin.com 2 | User-agent: * 3 | Disallow: 4 | Crawl-delay: 5 5 | Disallow: /cgi-bin/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | openslr 2 | ======= 3 | 4 | Repository for the web pages and scripts associated with OpenSLR: the open speech and language repository 5 | -------------------------------------------------------------------------------- /resources/46/info.txt: -------------------------------------------------------------------------------- 1 | name: Tunisian_MSA 2 | summary: Tunisian Modern Standard Arabic 3 | category: speech 4 | license: Apache 2.0 5 | file: Tunisian_MSA.tar.gz Data 6 | 
-------------------------------------------------------------------------------- /resources/23/about.html: -------------------------------------------------------------------------------- 1 | This file contains metadata for the NIST LRE 2007 dataset. It was originally 2 | available at http://www.itl.nist.gov/iad/mig/tests/lang/2007/lid07key_v5.txt. 3 | -------------------------------------------------------------------------------- /resources/17/info.txt: -------------------------------------------------------------------------------- 1 | name: MUSAN 2 | summary: A corpus of music, speech, and noise 3 | category: audio 4 | license: Attribution 4.0 International (CC BY 4.0) 5 | file: musan.tar.gz The corpus 6 | -------------------------------------------------------------------------------- /resources/29/info.txt: -------------------------------------------------------------------------------- 1 | name: Sprakbanken_Swe 2 | summary: Swedish pronunciation dictionary 3 | category: text 4 | license: Creative Commons ZERO (CC-ZERO) 5 | file: lexicon-sv.tgz Lexicon 6 | 7 | -------------------------------------------------------------------------------- /resources/34/info.txt: -------------------------------------------------------------------------------- 1 | name: Santiago Spanish Lexicon 2 | summary: A pronouncing dictionary for the Spanish language. 3 | category: text 4 | license: apache 2.0 5 | file: santiago.tar.gz lexicon 6 | 7 | -------------------------------------------------------------------------------- /resources/56/info.txt: -------------------------------------------------------------------------------- 1 | name: IAM Aachen splits 2 | summary: Aachen data splits (train/test/val) for the IAM dataset. 
3 | category: Other 4 | license: n/a 5 | file: splits.zip train/test/val splits 6 | 7 | -------------------------------------------------------------------------------- /resources/57/info.txt: -------------------------------------------------------------------------------- 1 | name: African Accented French 2 | summary: Recordings of African Accented French speech. 3 | category: speech 4 | license: Apache 2.0 5 | file: African_Accented_French.tar.gz The whole corpus 6 | -------------------------------------------------------------------------------- /resources/9/info.txt: -------------------------------------------------------------------------------- 1 | name: The AMI pack 2 | summary: Some auxiliary non-speech data used to build AMI systems with Kaldi 3 | category: text 4 | license: TBD 5 | file: wordlist.50k.gz predefined 50k words vocabulary 6 | -------------------------------------------------------------------------------- /resources/24/info.txt: -------------------------------------------------------------------------------- 1 | name: Iban 2 | summary: Iban language text and speech corpora for ASR 3 | category: speech 4 | license: Attribution-ShareAlike 2.0 Generic (CC BY-SA 2.0) 5 | file: iban.tar.gz Iban language corpora 6 | 7 | -------------------------------------------------------------------------------- /resources/39/info.txt: -------------------------------------------------------------------------------- 1 | name: Heroico 2 | summary: Spanish data, mirrored from the LDC 3 | category: speech 4 | license: apache 2.0 5 | file: LDC2006S37.tar.gz Speech and transcripts 6 | alternate_url: https://catalog.ldc.upenn.edu/LDC2006S37 7 | -------------------------------------------------------------------------------- /resources/81/info.txt: -------------------------------------------------------------------------------- 1 | name: Small Audio Clips 2 | summary: Contains 20 one-second audio clips from various sources, for testing compression algorithms 3 | 
category: speech 4 | license: CC BY 4.0 5 | file: samples.tar.gz Archive containing audio samples 6 | -------------------------------------------------------------------------------- /resources/20/about.html: -------------------------------------------------------------------------------- 1 | 2 | Please see the original project page http://www.iks.rwth-aachen.de/en/research/tools-downloads/aachen-impulse-response-database/ for more information. 3 | We are mirroring here as a backup just in case the original site goes down. 4 | -------------------------------------------------------------------------------- /resources/51/info.txt: -------------------------------------------------------------------------------- 1 | name: TED-LIUM Release 3 2 | summary: TED-LIUM corpus release 3 3 | category: speech 4 | license: Creative Commons BY-NC-ND 3.0 5 | file: TEDLIUM_release-3.tgz (data) 6 | alternate_url: https://lium.univ-lemans.fr/download/ted-lium_release3 7 | -------------------------------------------------------------------------------- /resources/67/info.txt: -------------------------------------------------------------------------------- 1 | name: TEDx Spanish Corpus 2 | summary: Spanish data taken from the TEDx Talks 3 | category: speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: tedx_spanish_corpus.tgz Spanish speech and transcripts 6 | -------------------------------------------------------------------------------- /resources/89/info.txt: -------------------------------------------------------------------------------- 1 | name: Yoloxóchitl-Mixtec 2 | summary: Yoloxóchitl Mixtec Speech with Transcription 3 | category: speech 4 | license: Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0 US) 5 | file: Yoloxochitl-Mixtec-Data.tgz Yoloxóchitl Mixtec Speech and Transcription 6 | -------------------------------------------------------------------------------- /resources/55/info.txt:
-------------------------------------------------------------------------------- 1 | name: CLMAD 2 | summary: A Chinese Language Model Adaptation Dataset (CLMAD). 3 | category: text 4 | license: Creative Commons BY-NC-ND 3.0 (attribution/non-commercial/no-derivatives). 5 | file: train.tgz Training set 6 | file: test.tgz Testing set 7 | 8 | -------------------------------------------------------------------------------- /resources/82/info.txt: -------------------------------------------------------------------------------- 1 | name: CN-Celeb 2 | summary: A Free Chinese Speaker Recognition Corpus Released by CSLT@Tsinghua University 3 | category: Speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: cn-celeb.tgz Audios with speaker ids for training and evaluation -------------------------------------------------------------------------------- /resources/15/info.txt: -------------------------------------------------------------------------------- 1 | name: SRE Speaker List 2 | summary: A list linking speakers across NIST SRE corpora 3 | category: Misc 4 | license: Not copyrighted (derived from a work prepared by a US government employee in the course of their official duties) 5 | file: speaker_list.tgz 6 | 7 | -------------------------------------------------------------------------------- /resources/2/about.html: -------------------------------------------------------------------------------- 1 | 2 | This resource is a mirror of the OpenFST toolkit, whose 3 | primary home is at www.openfst.org . 4 | We mirror it here in order to provide a failover location when 5 | OpenFST is unavailable from its primary server. 6 | -------------------------------------------------------------------------------- /resources/42/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Khmer.
2 | summary: Multi-speaker TTS data for Khmer (km-KH) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: km_kh_male.zip Khmer data from male speakers 6 | file: LICENSE License information 7 | -------------------------------------------------------------------------------- /resources/3/info.txt: -------------------------------------------------------------------------------- 1 | name: sph2pipe 2 | summary: A mirror of the sph2pipe software 3 | category: software 4 | license: One-off 5 | file: sph2pipe_v2.5.tar.gz Version 2.5, probably the last one. 6 | alternate_url: ftp://ftp.ldc.upenn.edu/pub/ldc/misc_sw/sph2pipe_v2.5.tar.gz Used to work 7 | -------------------------------------------------------------------------------- /resources/26/info.txt: -------------------------------------------------------------------------------- 1 | name: Simulated Room Impulse Response Database 2 | summary: A database of simulated room impulse responses 3 | category: Audio 4 | license: Apache 2.0 5 | file: sim_rir_8k.zip The database in 8k sampling rate 6 | file: sim_rir_16k.zip The database in 16k sampling rate 7 | -------------------------------------------------------------------------------- /resources/43/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Nepali. 
2 | summary: Multi-speaker TTS data for Nepali (ne-NP) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: ne_np_female.zip Nepali data from female speakers 6 | file: LICENSE License information 7 | -------------------------------------------------------------------------------- /resources/23/info.txt: -------------------------------------------------------------------------------- 1 | name: NIST LRE 2007 Key 2 | summary: A file containing metadata for the utterances in the LRE 2007 evaluation 3 | category: Misc 4 | license: Not copyrighted (derived from a work prepared by a US government employee in the course of their official duties) 5 | file: lre07_key.txt 6 | -------------------------------------------------------------------------------- /resources/21/info.txt: -------------------------------------------------------------------------------- 1 | name: Spanish Word list 2 | summary: A list of words in Spanish with frequency derived from a large corpus (Spanish Gigaword). 3 | category: text 4 | license: Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0 US) 5 | file: es_wordlist.json.tgz JSON file containing words with frequency. 6 | -------------------------------------------------------------------------------- /resources/84/info.txt: -------------------------------------------------------------------------------- 1 | name: ScribbleLens 2 | summary: Dutch cursive, 16-18th century handwritings, pages and lines, for (un)supervised AI and other research. 3 | category: Handwriting 4 | license: CC-BY-NC-ND (details in LICENSE.txt) 5 | file: scribblelens.corpus.v1.2.zip Dutch historical handwritings 6 | 7 | -------------------------------------------------------------------------------- /resources/15/about.html: -------------------------------------------------------------------------------- 1 | This data contains a list which links speakers across various NIST SRE datasets. 
2 | It is derived from information distributed with the 2013 CLSP Speaker and Language 3 | Recognition workshop. 4 | 5 | -------------------------------------------------------------------------------- /resources/13/info.txt: -------------------------------------------------------------------------------- 1 | name: RWCP Sound Scene Database 2 | summary: A database of recordings of real-world sounds and measured room impulse responses 3 | category: Speech + Software 4 | license: Research and development use only 5 | file: RWCP.tar.gz 6 | alternate_url: http://research.nii.ac.jp/src/en/RWCP-SSD.html 7 | -------------------------------------------------------------------------------- /resources/27/info.txt: -------------------------------------------------------------------------------- 1 | name: Cantab-TEDLIUM Release 1.1 (February 2015) 2 | summary: Cantab Research Language models for the TEDLIUM database 3 | category: text 4 | license: unspecified 5 | file: cantab-TEDLIUM.tar.bz2 Original archive 6 | file: cantab-TEDLIUM-partial.tar.bz2 Partial archive for Kaldi TEDLIUM recipe 7 | -------------------------------------------------------------------------------- /resources/48/info.txt: -------------------------------------------------------------------------------- 1 | name: MADCAT Arabic data splits 2 | summary: Unofficial data splits (dev/train/test) for the MADCAT Arabic LDC corpus 3 | category: other 4 | license: Apache 2.0 5 | file: madcat.dev.raw.lineid dev set 6 | file: madcat.test.raw.lineid test set 7 | file: madcat.train.raw.lineid train set 8 | 9 | -------------------------------------------------------------------------------- /resources/50/info.txt: -------------------------------------------------------------------------------- 1 | name: MADCAT Chinese data splits 2 | summary: Unofficial data splits (dev/train/test) for the MADCAT Chinese LDC corpus 3 | category: other 4 | license: Apache 2.0 5 | file: madcat.dev.raw.lineid dev set 6 | file: 
madcat.test.raw.lineid test set 7 | file: madcat.train.raw.lineid train set 8 | 9 | -------------------------------------------------------------------------------- /resources/14/info.txt: -------------------------------------------------------------------------------- 1 | name: BEEP Dictionary 2 | summary: Phonemic transcriptions of over 250,000 English words. (British English pronunciations) 3 | category: Text 4 | license: Research and development use only 5 | file: beep.tar.gz 6 | alternate_url: http://svr-www.eng.cam.ac.uk/comp.speech/Section1/Lexical/beep.html 7 | -------------------------------------------------------------------------------- /resources/4/about.html: -------------------------------------------------------------------------------- 1 | This resource is a mirror of NIST's sctk speech-recognition scoring software, which 2 | is originally available from here . 3 | This is public-domain and not subject to copyright, since it was written by 4 | US government employees. 5 | -------------------------------------------------------------------------------- /resources/90/info.txt: -------------------------------------------------------------------------------- 1 | name: Speechocean 10 Hours Chinese Mandarin Speech Recognition Corpus 2 | summary: Free 10.33 Hours Chinese Mandarin Speech Recognition Corpus Provided by Speechocean 3 | category: Speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: speechocean.zip Corpus 6 | -------------------------------------------------------------------------------- /resources/28/info.txt: -------------------------------------------------------------------------------- 1 | name: Room Impulse Response and Noise Database 2 | summary: A database of simulated and real room impulse responses, isotropic and point-source noises. The audio files in this data are all in 16k sampling rate and 16-bit precision. 
3 | category: Audio 4 | license: Apache 2.0 5 | file: rirs_noises.zip The database 6 | -------------------------------------------------------------------------------- /resources/31/info.txt: -------------------------------------------------------------------------------- 1 | name: Mini LibriSpeech ASR corpus 2 | summary: Subset of LibriSpeech corpus for purpose of regression testing 3 | category: speech 4 | license: CC BY 4.0 5 | file: dev-clean-2.tar.gz development set, "clean" speech 6 | file: train-clean-5.tar.gz training set, "clean" speech 7 | file: md5sum.txt md5 checksums of files 8 | -------------------------------------------------------------------------------- /resources/30/info.txt: -------------------------------------------------------------------------------- 1 | name: Sinhala TTS 2 | summary: Sinhalese multi-speaker TTS corpora 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) 5 | file: si_lk.tar.gz Audio files 6 | file: si_lk.lines.txt Transcription of the audio 7 | file: README.txt Additional readme 8 | file: LICENSE.txt Licensing information 9 | -------------------------------------------------------------------------------- /resources/41/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Javanese. 2 | summary: Multi-speaker TTS data for Javanese (jv-ID) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: jv_id_female.zip Javanese data from female speakers 6 | file: jv_id_male.zip Javanese data from male speakers 7 | file: LICENSE License information 8 | -------------------------------------------------------------------------------- /resources/44/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Sundanese.
2 | summary: Multi-speaker TTS data for Sundanese (su-ID) 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: su_id_female.zip Sundanese data from female speakers 6 | file: su_id_male.zip Sundanese data from male speakers 7 | file: LICENSE License information 8 | -------------------------------------------------------------------------------- /resources/87/info.txt: -------------------------------------------------------------------------------- 1 | name: MobvoiHotwords 2 | summary: Chinese hotwords detection dataset, provided by Mobvoi CO.,LTD 3 | category: Speech 4 | license: Apache License v.2.0 5 | file: mobvoi_hotword_dataset.tgz Wave files of keyword and non-keyword data 6 | file: mobvoi_hotword_dataset_resources.tgz Label, speaker and channel information of above wave files 7 | -------------------------------------------------------------------------------- /resources/1/info.txt: -------------------------------------------------------------------------------- 1 | name: Yesno 2 | summary: Sixty recordings of one individual saying yes or no in Hebrew; each recording is eight words long. 3 | category: speech 4 | license: No formal license but free to use for any purpose. 5 | file: waves_yesno.tar.gz This is the entire dataset. 6 | alternate_url: http://sourceforge.net/projects/kaldi/files/waves_yesno.tar.gz 7 | -------------------------------------------------------------------------------- /resources/3/about.html: -------------------------------------------------------------------------------- 1 | This resource is a mirror of LDC's sph2pipe software, which 2 | used to be available from here . 3 | The license (available 0readme.1st in the archive) only permits 4 | using it to read sphere files, but since that is the purpose of the 5 | program it should not be a problem. 
6 | -------------------------------------------------------------------------------- /resources/11/about.html: -------------------------------------------------------------------------------- 1 | Language modeling resources to be used in conjunction with the (soon-to-be-released) LibriSpeech ASR corpus. 2 |
3 | This corpus and these resources were prepared by Vassil Panayotov with 4 | the assistance of Daniel Povey and Sanjeev Khudanpur. We hope to finalize 5 | this and release the corpus here by the ICASSP deadline (early October 2014). 6 | 7 | -------------------------------------------------------------------------------- /resources/47/info.txt: -------------------------------------------------------------------------------- 1 | name: Primewords Chinese Corpus Set 1 2 | summary: Chinese Mandarin corpus released by Shanghai Primewords Co. Ltd. (www.primewords.cn), containing 100 hours of speech data. 3 | category: speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: primewords_md_2018_set1.tar.gz speech data and transcripts 6 | -------------------------------------------------------------------------------- /resources/19/info.txt: -------------------------------------------------------------------------------- 1 | name: TED-LIUMv2 2 | summary: TED-LIUM corpus release 2, English speech recognition training corpus from TED talks, created by Laboratoire d’Informatique de l’Université du Maine (LIUM) (mirrored here) 3 | category: audio 4 | license: Creative Commons BY-NC-ND 3.0 (http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en) 5 | file: TEDLIUM_release2.tar.gz The corpus 6 | -------------------------------------------------------------------------------- /resources/58/info.txt: -------------------------------------------------------------------------------- 1 | name: Pansori-TEDxKR 2 | summary: Korean speech corpus generated from Korean language TEDx talks 3 | category: speech 4 | license: Creative Commons BY-NC-ND 4.0 (attribution/non-commercial/no-derivatives) 5 | file: pansori-tedxkr-corpus-1.0.tar.gz Korean speech and transcripts 6 | alternate_url: https://storage.googleapis.com/pansori/corpus/pansori-tedxkr-corpus-1.0.tar.gz 7 |
-------------------------------------------------------------------------------- /resources/33/info.txt: -------------------------------------------------------------------------------- 1 | name: Aishell 2 | summary: Mandarin data, provided by Beijing Shell Shell Technology Co.,Ltd 3 | category: speech 4 | license: Apache License v.2.0 5 | file: data_aishell.tgz speech data and transcripts 6 | file: resource_aishell.tgz supplementary resources, incl. lexicon, speaker info 7 | alternate_url: http://www.aishelltech.com/kysjcp Full description from the company website 8 | -------------------------------------------------------------------------------- /resources/26/about.html: -------------------------------------------------------------------------------- 1 | This data includes simulated room impulse responses with various room configs. 2 | It is intended for use when comparing the performance 3 | of acoustic models trained with data reverberated with real and simulated 4 | impulse responses. 5 | They were used in our paper 6 | "A Study on Data Augmentation of Reverberant Speech for Robust Speech Recognition" 7 | submitted to ICASSP 2017 8 |
9 | -------------------------------------------------------------------------------- /resources/37/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for Bengali languages 2 | summary: Multi-speaker TTS data for Bangladesh Bengali (bn-BD) and Indian Bengali (bn-IN). 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 (CC BY-SA 4.0) 5 | file: bn_bd.zip Bangladesh Bengali data 6 | file: bn_in.zip Indian Bengali data 7 | file: README.txt Information about the data 8 | file: LICENSE.txt License information 9 | -------------------------------------------------------------------------------- /resources/91/info.txt: -------------------------------------------------------------------------------- 1 | name: Free English Corpus and Language Challenge -- Speechocean 2 | summary: A free 8.2 hours English speech recognition corpus provided by speechocean and an oriental language recognition challenge co-organized by speechocean and Tsinghua University.
3 | category: Speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: speechoceanfreedata2.zip Corpus 6 | -------------------------------------------------------------------------------- /resources/45/info.txt: -------------------------------------------------------------------------------- 1 | name: Free ST American English Corpus 2 | summary: A free American English corpus by Surfingtech (www.surfing.ai), containing utterances from 10 speakers; each speaker has about 350 utterances. 3 | category: speech 4 | license: Creative Commons BY-NC-ND 4.0 (Attribution-NonCommercial-NoDerivatives 4.0 International) 5 | file: ST-AEDS-20180100_1-OS.tgz speech audios and transcripts 6 | alternate_url: https://www.surfing.ai 7 | -------------------------------------------------------------------------------- /resources/7/info.txt: -------------------------------------------------------------------------------- 1 | name: TED-LIUM 2 | summary: English speech recognition training corpus from TED talks, created by Laboratoire d’Informatique de l’Université du Maine (LIUM) (mirrored here) 3 | category: speech 4 | license: Creative Commons BY-NC-ND 3.0 (attribution/non-commercial/no-derivatives). 5 | file: TEDLIUM_release1.tar.gz The first release 6 | alternate_url: http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus Original source 7 | -------------------------------------------------------------------------------- /resources/62/info.txt: -------------------------------------------------------------------------------- 1 | name: aidatatang_200zh 2 | summary: A Chinese Mandarin speech corpus by Beijing DataTang Technology Co., Ltd, containing 200 hours of speech data from 600 speakers. The transcription accuracy for each sentence is larger than 98%.
3 | category: speech 4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0) 5 | file: aidatatang_200zh.tgz Corpus 6 | alternate_url: https://www.datatang.com/webfront/opensource.html 7 | 8 | -------------------------------------------------------------------------------- /config/README: -------------------------------------------------------------------------------- 1 | This directory is used to store the Apache config file "openslr"; we made a soft link like this: 2 | cd /etc/apache2/sites-available 3 | ln -s /var/www/openslr/config/openslr . 4 | This is done so that we can put it in the same git repository as the html and php of the site's 5 | contents. 6 | (note: for reasons of size, we can't do the same for the directory "/var/www/openslr/resources/" 7 | where the actual resources we export are kept.) 8 | 9 | I'm also adding openslr.key 10 | -------------------------------------------------------------------------------- /resources/38/info.txt: -------------------------------------------------------------------------------- 1 | root@www:/var/www/openslr# cat /var/www/openslr/resources/6/info.txt 2 | name: Free ST Chinese Mandarin Corpus 3 | summary: A free Chinese Mandarin corpus by Surfingtech (www.surfing.ai), containing utterances from 855 speakers, 102600 utterances; 4 | category: speech 5 | license: Creative Commons BY-NC-ND 4.0 (Attribution-NonCommercial-NoDerivatives 4.0 International) 6 | file: ST-CMDS-20170001_1-OS.tar.gz speech audios and transcripts 7 | alternate_url: https://www.surfing.ai 8 | -------------------------------------------------------------------------------- /resources/40/info.txt: -------------------------------------------------------------------------------- 1 | name: Zeroth-Korean 2 | summary: Korean Open-source Speech Corpus for Speech Recognition by Zeroth Project (https://github.com/goodatlas/zeroth) 3 | category: Speech Corpus for Automatic Speech Recognition 4 | license: Attribution 4.0
International (CC BY 4.0) 5 | file: zeroth_korean.tar.gz Korean Speech data, transcription, lexicon and language model 6 | alternate_url: https://storage.googleapis.com/zeroth_project/zeroth_korean.tar.gz Korean Speech data, transcription and language model 7 | -------------------------------------------------------------------------------- /resources/20/info.txt: -------------------------------------------------------------------------------- 1 | name: Aachen Impulse Response Database 2 | summary: Aachen Impulse Response database (AIR): a database of room impulse responses (mirrored here) 3 | category: audio 4 | license: Not stated in the download. 5 | alternate_url: https://www2.iks.rwth-aachen.de/air/air_database_release_1_4.zip Original download link 6 | alternate_url: http://www.iks.rwth-aachen.de/en/research/tools-downloads/aachen-impulse-response-database/ Project page 7 | file: air_database_release_1_4.zip Version 1.4 of the database 8 | -------------------------------------------------------------------------------- /resources/59/info.txt: -------------------------------------------------------------------------------- 1 | name: ParlamentParla 2 | summary: Catalan speech corpus generated from Catalan Parliamentary sessions 3 | category: speech 4 | license: CC Attribution 4.0 (CC BY 4.0) 5 | file: parlament_v1.0_clean.tar.gz 90 hours of "clean" speech and transcripts 6 | file: parlament_v1.0_other.tar.gz 230 hours of "other" speech and transcripts 7 | alternate_url: http://laklak.eu/share/parlament_v1.0_clean.tar.gz clean data 8 | alternate_url: http://laklak.eu/share/parlament_v1.0_other.tar.gz other data 9 | 10 | -------------------------------------------------------------------------------- /resources/80/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Burmese speech data set. 2 | summary: Data set which contains recordings of Burmese. 
3 | There are 2530 recordings from female speakers. 4 | category: speech 5 | license: Attribution-ShareAlike 4.0 International 6 | file: about.html Information about the data set 7 | file: LICENSE License information for the data set 8 | file: line_index_female.tsv All utterances for the female speakers. 9 | file: my_mm_female.zip Archive file with all audio for the female speakers. 10 | -------------------------------------------------------------------------------- /resources/6/info.txt: -------------------------------------------------------------------------------- 1 | name: Vystadial 2 | summary: English and Czech data, mirrored from the Vystadial project 3 | category: speech 4 | license: Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0 US) 5 | file: data_voip_cs.tgz Czech speech and transcripts 6 | file: data_voip_en.tgz English speech and transcripts 7 | alternate_url: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4670-6 Czech data 8 | alternate_url: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-4671-4 English data 9 | -------------------------------------------------------------------------------- /resources/28/about.html: -------------------------------------------------------------------------------- 1 | This data includes all the room impulse responses (RIRs) and noises 2 | we used in our paper "A Study on Data Augmentation of Reverberant Speech for Robust Speech Recognition" 3 | submitted to ICASSP 2017. 4 | It includes the real RIRs and isotropic noises from 5 | the RWCP sound scene database, the 2014 REVERB challenge 6 | database and the Aachen impulse response database (AIR); 7 | the simulated RIRs generated by ourselves 8 | and also the point-source noises that were extracted from the MUSAN corpus. 9 |
10 | -------------------------------------------------------------------------------- /resources/18/info.txt: -------------------------------------------------------------------------------- 1 | name: THCHS-30 2 | summary: A Free Chinese Speech Corpus Released by CSLT@Tsinghua University 3 | category: speech 4 | license: Apache License v.2.0 5 | file: data_thchs30.tgz speech data and transcripts 6 | file: test-noise.tgz standard 0db noisy test data 7 | file: resource.tgz supplementary resources, incl. lexicon for training data, noise samples 8 | alternate_url: http://data.cslt.org/thchs30/README.html Original URL from CSLT 9 | alternate_url: http://pan.baidu.com/s/1hqKwE00 Baidu disk 10 | 11 | -------------------------------------------------------------------------------- /resources/30/about.html: -------------------------------------------------------------------------------- 1 | This data set contains multi-speaker high quality transcribed audio data for Sinhalese. The data set consists of wave files, and a TSV file. The file si_lk.lines.txt contains a FileID, which in turn contains the UserID and the Transcription of audio in the file. 2 |
3 | The data set has been manually quality checked, but there might still be errors. 4 |
5 | This dataset was collected by Google in Sri Lanka. 6 |
7 | See LICENSE.txt file for license information. 8 |
9 | Copyright 2015, 2016 Google, Inc. 10 | -------------------------------------------------------------------------------- /resources/74/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality Puerto Rico Spanish speech data set. 2 | summary: Data set which contains recordings of Puerto Rico Spanish. 3 | There are 617 recordings from female speakers. 4 | category: speech 5 | license: Attribution-ShareAlike 4.0 International 6 | file: about.html Information about the data set 7 | file: LICENSE License information for the data set 8 | file: line_index_female.tsv All utterances for the female speakers. 9 | file: es_pr_female.zip Archive file with all audio for the female speakers. 10 | -------------------------------------------------------------------------------- /resources/56/about.html: -------------------------------------------------------------------------------- 1 |
2 | Most research papers reporting results on IAM database use different splits of the data into train/test/val than those provided with the database. These are those data splits shared by Théodore Bluche from Human Language Technology and Pattern Recognition lab, RWTH Aachen University. It will create 6482 train line images, 2915 test line images and 976 validation line images. 3 |
4 | 5 |6 | It contains the page XML name. For example: 7 |
8 | 9 |10 | a01-000u 11 | a01-000x 12 | a01-003 13 |14 | -------------------------------------------------------------------------------- /resources/7/about.html: -------------------------------------------------------------------------------- 1 | 2 | The TED-LIUM corpus (mirrored here) 3 | is English-language TED talks, with transcriptions, sampled at 16kHz. It 4 | contains about 118 hours of speech. 5 |
6 | The original page requests that you cite the following paper if you make use of this 7 | corpus: 8 |
9 | A. Rousseau, P. Deléglise, and Y. Estève, "TED-LIUM: an automatic speech recognition dedicated corpus",
10 |
11 | in Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12), May 2012.
12 |
13 | -------------------------------------------------------------------------------- /resources/32/info.txt: -------------------------------------------------------------------------------- 1 | name: High quality TTS data for four South African languages (af, st, tn, xh) 2 | summary: Multi-speaker TTS data for four South African languages, Afrikaans, Sesotho, Setswana and isiXhosa. 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) 5 | file: af_za.tar.gz Audio files and transcriptions for Afrikaans 6 | file: st_za.tar.gz Audio files and transcriptions for Sesotho 7 | file: tn_za.tar.gz Audio files and transcriptions for Setswana 8 | file: xh_za.tar.gz Audio files and transcriptions for isiXhosa 9 | -------------------------------------------------------------------------------- /resources/17/about.html: -------------------------------------------------------------------------------- 1 | MUSAN is a corpus of music, speech, and noise recordings. 2 |
3 | This work was supported by the National Science Foundation Graduate Research 4 | Fellowship under Grant No. 1232825 and by Spoken Communications. 5 |
6 | 7 | You can cite the data using the following BibTeX entry: 8 |
9 | @misc{musan2015,
10 | author = {David Snyder and Guoguo Chen and Daniel Povey},
11 | title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus},
12 | year = {2015},
13 | eprint = {1510.08484},
14 | note = {arXiv:1510.08484v1}
15 | }
16 |
--------------------------------------------------------------------------------
/resources/37/README.txt:
--------------------------------------------------------------------------------
1 | This data set contains multi-speaker high quality transcribed audio data for
2 | Bengali. The data set consists of wave files, and a TSV file. There are two
3 | zip files, one for each locale, each of which contains a file line_index.tsv and the wave
4 | files. Line index has a fileID and the transcription.
5 |
6 | The data set has been manually quality checked, but there might still be errors.
7 |
8 | This data set was collected by Google.
9 |
10 | See LICENSE.txt file for license information.
11 |
12 | Copyright 2015, 2016, 2017, 2018 Google, Inc.
13 |
--------------------------------------------------------------------------------
/resources/5/info.txt:
--------------------------------------------------------------------------------
1 | name: MSU Switchboard transcripts
2 | summary: A mirror of the Mississippi State transcripts and lexicon for Switchboard.
3 | category: text
4 | license: Unrestricted
5 | file: switchboard_word_alignments.tar.gz The transcripts
6 | file: sw-ms98-dict.text The lexicon
7 | alternate_url: http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz The transcripts in their original location
8 | alternate_url: http://www.isip.piconepress.com/projects/switchboard/releases/sw-ms98-dict.text The lexicon in its original location
9 |
--------------------------------------------------------------------------------
/resources/64/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Marathi multi-speaker speech data set.
2 | summary: Data set which contains recordings of native speakers of Marathi
3 | There are 1596 recordings from female speakers.
4 | The data set has recordings from multiple female speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index.tsv All utterances in the data set
10 | file: mr_in_female.zip Archive file containing all the data.
11 |
--------------------------------------------------------------------------------
/resources/81/about.html:
--------------------------------------------------------------------------------
1 | Contains 25 wav files containing audio sampled at 44.1kHz. The files are stored as 16-bit signed-integer PCM. Each file is of duration 1 second.
2 | There are 5 files from each of 5 different sources: ‘piano’ contains fragments of piano-only music, ‘lmiserables’ contains parts of the movie Les Misérables, ‘joliver’ is taken from John Oliver’s show “Last Week Tonight”, ‘simons’ contains short clips from Simons Institute videos, and ‘bmaher’ contains parts of Bill Maher’s show “In Our Time”.
3 |
--------------------------------------------------------------------------------
/resources/85/info.txt:
--------------------------------------------------------------------------------
1 | name: HI-MIA
2 | summary: A far-field text-dependent speaker verification database for AISHELL Speaker Verification Challenge 2019
3 | category: Speech
4 | license: Apache License v.2.0
5 | file: train.tar.gz Training set with speaker dependent sub folders
6 | file: dev.tar.gz Dev set with speaker dependent sub folders
7 | file: test.tar.gz Test set with target/non-target answer
8 | file: test_v2.tar.gz Updated test set fixing corrupted audio files
9 | file: filename_mapping.tar.gz Filename mapping rules for multi-channel information
10 | alternate_url: http://aishelltech.com/wakeup_data
11 |
--------------------------------------------------------------------------------
/resources/34/about.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | This is a pronouncing dictionary for the Spanish language.
10 | It is intended to be used as the lexicon for an automatic speech recognition system.
11 |
12 | John Morgan
13 |
14 |
15 | Last modified: Thu Sep 7 14:59:16 EDT 2017
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/resources/21/about.html:
--------------------------------------------------------------------------------
1 | This is a list of words in Spanish with frequency counts.
2 |
3 | This data was derived from the LDC Spanish Gigaword Corpus (LDC2011T12).
4 | The list is used as a part of the Kaldi Spanish Fisher recipe and is used to augment
5 | the pronunciation lexicon with additional words. The actual pronunciation is generated
6 | using the Spanish rule based lexicon (LDC96L16).
7 |
8 |
9 |
10 | NOTE : No components of the LDC datasets LDC2011T12 and LDC96L16 are included with
11 | this dataset.
12 |
13 |
14 | Details of how this word list is used can be found in this paper : http://cs.jhu.edu/~gkumar/papers/kumar2014some.pdf
15 |
--------------------------------------------------------------------------------
/resources/29/about.html:
--------------------------------------------------------------------------------
1 | This data is a pronunciation dictionary consisting of approximately 822 000 Swedish words with their corresponding phonetic transcription.
2 |
3 | The license is a CC0 which is unrestricted. This particular version of the lexicon is an improved and updated version produced by Emelie Kullmann and the original unmodified version of the lexicon can be found on the website of the National Library of Norway.
4 |
5 | The updated version of the lexicon, which can be downloaded directly from here, is recommended for use with the Swedish recipe and yields better ASR results.
6 |
--------------------------------------------------------------------------------
/resources/76/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Basque speech data set.
2 | summary: Data set which contains recordings of Basque.
3 | There are 3858 recordings from female speakers, and 3278 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: eu_es_female.zip Archive file with all audio for the female speakers.
12 | file: eu_es_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/25/info.txt:
--------------------------------------------------------------------------------
1 | name: ALFFA (African Languages in the Field: speech Fundamentals and Automation)
2 | summary: Amharic, Swahili and Wolof data, mirrored from the ALFFA git repository
3 | category: speech
4 | license: MIT
5 | file: data_readspeech_am.tar.bz2 Amharic speech and transcripts
6 | file: data_broadcastnews_sw.tar.bz2 Swahili speech and transcripts
7 | file: data_readspeech_wo.tar.bz2 Wolof speech and transcripts
8 | alternate_url: https://github.com/besacier/ALFFA_PUBLIC/tree/master/ASR/AMHARIC Amharic data
9 | alternate_url: https://github.com/besacier/ALFFA_PUBLIC/tree/master/ASR/SWAHILI Swahili data
10 | alternate_url: https://github.com/besacier/ALFFA_PUBLIC/tree/master/ASR/WOLOF Wolof data
--------------------------------------------------------------------------------
/resources/69/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Catalan speech data set.
2 | summary: Data set which contains recordings of Catalan.
3 | There are 2321 recordings from female speakers, and 1919 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: ca_es_female.zip Archive file with all audio for the female speakers.
12 | file: ca_es_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/77/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Galician speech data set.
2 | summary: Data set which contains recordings of Galician.
3 | There are 4264 recordings from female speakers, and 1323 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: gl_es_female.zip Archive file with all audio for the female speakers.
12 | file: gl_es_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/10/info.txt:
--------------------------------------------------------------------------------
1 | name: SRE Data
2 | summary: Various files from SRE data that NIST used to host online
3 | category: misc
4 | license: Public domain, I believe
5 | file: sre2000-key.tar.gz An archive containing a key file for SRE 2000, plus some other things.
6 | file: sre04_key.tgz An archive containing the key file for SRE04, plus some other things.
7 | file: sre04_key-v2.txt.gz The key file for SRE04
8 | file: sre05-key-v7b.txt.gz The key file for SRE05
9 | alternate_url: http://www.itl.nist.gov/iad/894.01/tests/sre/2006/sre04_key.tgz Previously working location for SRE04 data
10 | alternate_url: http://www.itl.nist.gov/iad/894.01/tests/sre/2006/sre05_key.tgz Previously working location for SRE05 data
11 |
--------------------------------------------------------------------------------
/resources/65/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Tamil multi-speaker speech data set.
2 | summary: Data set which contains recordings of native speakers of Tamil.
3 | There are 2335 recordings from female speakers, and 1956 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv Lines recorded by the female speakers
10 | file: line_index_male.tsv Lines recorded by the male speakers
11 | file: ta_in_female.zip Archive containing recordings from female speakers
12 | file: ta_in_male.zip Archive containing recordings from male speakers
13 |
--------------------------------------------------------------------------------
/resources/66/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Telugu multi-speaker speech data set.
2 | summary: Data set which contains recordings of native speakers of Telugu.
3 | There are 2294 recordings from female speakers, and 2154 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv Lines recorded by the female speakers
10 | file: line_index_male.tsv Lines recorded by the male speakers
11 | file: te_in_female.zip Archive containing recordings from female speakers
12 | file: te_in_male.zip Archive containing recordings from male speakers
13 |
--------------------------------------------------------------------------------
/resources/70/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Nigerian English speech data set.
2 | summary: Data set which contains recordings of Nigerian English.
3 | There are 2045 recordings from female speakers, and 1314 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: en_ng_female.zip Archive file with all audio for the female speakers.
12 | file: en_ng_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/71/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Chilean Spanish speech data set.
2 | summary: Data set which contains recordings of Chilean Spanish.
3 | There are 1738 recordings from female speakers, and 2636 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: es_cl_female.zip Archive file with all audio for the female speakers.
12 | file: es_cl_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/73/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Peruvian Spanish speech data set.
2 | summary: Data set which contains recordings of Peruvian Spanish.
3 | There are 2529 recordings from female speakers, and 2918 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: es_pe_female.zip Archive file with all audio for the female speakers.
12 | file: es_pe_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/79/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Kannada multi-speaker speech data set.
2 | summary: Data set which contains recordings of native speakers of Kannada.
3 | There are 2186 recordings from female speakers, and 2214 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv Lines recorded by the female speakers
10 | file: line_index_male.tsv Lines recorded by the male speakers
11 | file: kn_in_female.zip Archive containing recordings from female speakers
12 | file: kn_in_male.zip Archive containing recordings from male speakers
13 |
--------------------------------------------------------------------------------
/resources/63/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Malayalam multi-speaker speech data set.
2 | summary: Data set which contains recordings of native speakers of Malayalam.
3 | There are 2103 recordings from female speakers, and 2023 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv Lines recorded by the female speakers
10 | file: line_index_male.tsv Lines recorded by the male speakers
11 | file: ml_in_female.zip Archive containing recordings from female speakers
12 | file: ml_in_male.zip Archive containing recordings from male speakers
13 |
--------------------------------------------------------------------------------
/resources/72/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Colombian Spanish speech data set.
2 | summary: Data set which contains recordings of Colombian Spanish.
3 | There are 2369 recordings from female speakers, and 2534 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: es_co_female.zip Archive file with all audio for the female speakers.
12 | file: es_co_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/75/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Venezuelan Spanish speech data set.
2 | summary: Data set which contains recordings of Venezuelan Spanish.
3 | There are 1603 recordings from female speakers, and 1754 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: es_ve_female.zip Archive file with all audio for the female speakers.
12 | file: es_ve_male.zip Archive file with all audio for the male speakers.
13 |
--------------------------------------------------------------------------------
/resources/78/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Gujarati multi-speaker speech data set.
2 | summary: Data set which contains recordings of native speakers of Gujarati.
3 | There are 2219 recordings from female speakers, and 2053 recordings from male
4 | speakers.
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set
8 | file: LICENSE License information for the data set
9 | file: line_index_female.tsv Lines recorded by the female speakers
10 | file: line_index_male.tsv Lines recorded by the male speakers
11 | file: gu_in_female.zip Archive containing recordings from female speakers
12 | file: gu_in_male.zip Archive containing recordings from male speakers
13 |
--------------------------------------------------------------------------------
/resources/70/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Nigerian English
2 | sentences recorded by volunteers in Lagos, Nigeria, and in London. The data set
3 | consists of wave files, and a TSV file (line_index.tsv). The file line_index.tsv
4 | contains an anonymized FileID and the transcription of audio in the file.
5 |
6 | The data set has been manually quality checked, but there might still be errors.
7 |
8 | Please report any issues in the following issue tracker on GitHub.
9 |
10 | https://github.com/googlei18n/language-resources/issues
11 |
12 |
13 | See LICENSE file for license information.
14 |
15 | Copyright 2018, 2019 Google, Inc.
16 |
--------------------------------------------------------------------------------
/resources/45/about.html:
--------------------------------------------------------------------------------
1 |
2 | This corpus was recorded in a quiet indoor environment using a cellphone. It has 10 speakers. Each speaker has about 350 utterances. All utterances were carefully transcribed and checked by humans. Transcription accuracy is guaranteed. If there are any problems, we agree to correct them for you.
3 |
4 | Please cite the data as “ST-AEDS-20180100_1, Free ST American English Corpus”.
5 |
6 | The data set is a subset of a much bigger data set (about 1000 hours) which was recorded in the same environment as this open source data. Please visit our website www.surfing.ai or contact us at contact@surfingtech.cn for details.
7 |
8 |
--------------------------------------------------------------------------------
/resources/49/info.txt:
--------------------------------------------------------------------------------
1 | name: VoxCeleb Data
2 | summary: Various files for the VoxCeleb datasets
3 | category: Misc
4 | license: Not copyrighted
5 | file: voxceleb1_test.txt A file containing a list of trial pairs for the verification task of the old version of VoxCeleb1
6 | file: voxceleb1_test_v2.txt A file containing a list of trial pairs for the verification task of the new version of VoxCeleb1
7 | file: voxceleb1_sitw_overlap.txt A list of 60 speakers in VoxCeleb1 that overlap with the Speakers in the Wild (SITW) dataset
8 | file: vox1_meta.csv A list which provides identity, gender and nationality labels for VoxCeleb1
9 | file: vox2_meta.csv A list which provides identity, gender and nationality labels for VoxCeleb2
10 | alternate_url: http://www.robots.ox.ac.uk/~vgg/data/voxceleb
11 |
--------------------------------------------------------------------------------
/resources/86/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Yoruba speech data set.
2 | summary: Data set which contains recordings of Yoruba.
3 | There are 1892 recordings from female speakers, and 1691 recordings from male
4 | speakers
5 | category: speech
6 | license: Attribution-ShareAlike 4.0 International
7 | file: about.html Information about the data set.
8 | file: LICENSE License information for the data set.
9 | file: line_index_female.tsv All utterances for the female speakers.
10 | file: line_index_male.tsv All utterances for the male speakers.
11 | file: yo_ng_female.zip Archive file with all audio for the female speakers.
12 | file: yo_ng_male.zip Archive file with all audio for the male speakers.
13 | file: annotation_info.txt A file listing additional annotations in the text.
14 |
--------------------------------------------------------------------------------
/resources/86/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Yoruba sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file. The file annotation_info.txt contains
5 | information about the annotations included in the data set.
6 |
7 | The data set has been manually quality checked, but there might still be errors.
8 |
9 | Please report any issues in the following issue tracker on GitHub.
10 |
11 | https://github.com/google/language-resources/issues
12 |
13 |
14 | See LICENSE file for license information.
15 |
16 | Copyright 2018, 2019, 2020 Google, Inc.
17 |
--------------------------------------------------------------------------------
/resources/60/info.txt:
--------------------------------------------------------------------------------
1 | name: LibriTTS corpus
2 | summary: Large-scale corpus of English speech derived from the original materials of the LibriSpeech corpus
3 | category: Speech
4 | license: CC BY 4.0
5 | file: dev-clean.tar.gz Development set, clean speech
6 | file: dev-other.tar.gz Development set, more challenging speech
7 | file: test-clean.tar.gz Test set, "clean" speech
8 | file: test-other.tar.gz Test set, "other" speech
9 | file: train-clean-100.tar.gz Training set derived from the original materials of the train-clean-100 subset of LibriSpeech
10 | file: train-clean-360.tar.gz Training set derived from the original materials of the train-clean-360 subset of LibriSpeech
11 | file: train-other-500.tar.gz Training set derived from the original materials of the train-other-500 subset of LibriSpeech
12 |
--------------------------------------------------------------------------------
/resources/46/about.html:
--------------------------------------------------------------------------------
1 | The Tunisian_MSA corpus was originally collected to train acoustic models for pronunciation modeling in Arabic language learning applications.
2 |
3 | The data collection took place near Tunis, the capital of the Republic of Tunisia, in 2003.
4 |
5 | The Tunisian_MSA corpus is divided into recited and prompted speech subcorpora.
6 | The recited speech is stored under the recordings directory.
7 | The prompted speech is stored under the answers directory.
8 | Each of the 118 informants contributed to both subcorpora by reciting sentences and providing answers to prompted questions.
9 | The Tunisian_MSA corpus has 11.2 hours of speech.
10 |
11 | A small corpus was collected in 2017 for testing.
12 | It consists of speech from 4 speakers, 3 male Libyans and 1 female from Tunisia.
13 |
14 |
15 |
--------------------------------------------------------------------------------
/resources/22/info.txt:
--------------------------------------------------------------------------------
1 | name: THUYG-20
2 | summary: A free Uyghur speech database Released by CSLT@Tsinghua University & Xinjiang University
3 | category: speech
4 | license: Apache License v.2.0
5 | file: data_thuyg20.tar.gz speech data and transcripts for speech recognition
6 | file: data_thuyg20_sre.tar.gz speech data for speaker recognition
7 | file: test_noise.tar.gz standard 0db noisy test data for speech recognition
8 | file: test_noise_sre.tar.gz standard 0db noisy test data for speaker recognition
9 | file: resource.tar.gz supplementary resources, incl. lexicon for training data, noise samples
10 | alternate_url: http://data.cslt.org/thuyg20-openslr/README.html CSLT local storage
11 | alternate_url: http://pan.baidu.com/s/1hqKwE00 Baidu disk
12 | alternate_url: https://mega.nz/#F!idRSjL4A!cnCY0R2NjU77Jr0soe9OgQ Mega disk
13 |
--------------------------------------------------------------------------------
/resources/1/about.html:
--------------------------------------------------------------------------------
1 |
2 | This dataset was created for the Kaldi project (see kaldi.sf.net),
3 | by a contributor who prefers to remain anonymous. The main point of the dataset is
4 | to provide an easy and fast way to test out the Kaldi scripts for free.
5 |
6 | The archive "waves_yesno.tar.gz" contains 60 .wav files, sampled at 8 kHz. All were recorded
7 | by the same male speaker, in Hebrew.
8 | In each file, the individual says 8 words; each word is either the Hebrew for "yes" or "no", so each
9 | file is a random sequence of 8 yes-es or noes. There is no separate transcription provided; the
10 | sequence is encoded in the filename, with 1 for yes and 0 for no, for instance:
11 |
12 | # tar -xvzf waves_yesno.tar.gz
13 | waves_yesno/1_0_1_1_1_0_1_0.wav
14 | waves_yesno/0_1_1_0_0_1_1_0.wav
15 | ...
16 |
17 |
18 |
--------------------------------------------------------------------------------
/resources/68/info.txt:
--------------------------------------------------------------------------------
1 | name: MAGICDATA Mandarin Chinese Read Speech Corpus
2 | summary: The corpus by Magic Data Technology Co., Ltd., containing 755 hours of scripted read speech data from 1080 native speakers of the Mandarin Chinese spoken in mainland China. The sentence transcription accuracy is higher than 98%.
3 | category: Speech
4 | license: Attribution-NonCommercial-NoDerivatives 4.0 International Public License (CC BY-NC-ND 4.0)
5 | file: train_set.tar.gz Training set speech and transcripts
6 | file: dev_set.tar.gz Development set speech and transcripts
7 | file: test_set.tar.gz Test set speech and transcripts
8 | file: metadata.tar.gz supplementary resources, incl. data introduction (in English and Chinese) and speaker information
9 | alternate_url: http://www.imagicdatatech.com/index.php/home/dataopensource/data_info/id/101 Full description from the company website
10 |
--------------------------------------------------------------------------------
/resources/40/about.html:
--------------------------------------------------------------------------------
1 | This is the Zeroth-Korean corpus,
2 | licensed under Attribution 4.0 International (CC BY 4.0)
3 |
4 |
5 | The data set contains transcribed audio data for Korean. There are 51.6 hours of transcribed Korean audio for training data (22,263 utterances, 105 people, 3000 sentences) and 1.2 hours of transcribed Korean audio for testing data (457 utterances, 10 people). This corpus also contains a pre-trained/designed language model, lexicon and morpheme-based segmenter (morfessor).
6 |
7 | Zeroth project introduces free Korean speech corpus and aims to make Korean speech recognition more broadly accessible to everyone.
8 |
9 | This project was developed in collaboration between Lucas Jo(@Atlas Guide Inc.) and Wonkyum Lee(@Gridspace Inc.).
10 |
11 | Contact: Lucas Jo(lucasjo@goodatlas.com), Wonkyum Lee(wonkyum@gridspace.com)
12 |
13 |
14 |
--------------------------------------------------------------------------------
/resources/11/info.txt:
--------------------------------------------------------------------------------
1 | name: LibriSpeech language models, vocabulary and G2P models
2 | summary: Language modelling resources, for use with the LibriSpeech ASR corpus
3 | category: text
4 | license: Public domain
5 | file: librispeech-lm-corpus.tgz 14500 public domain books, used as training material for the LibriSpeech's LM
6 | file: librispeech-lm-norm.txt.gz Normalized LM training text
7 | file: librispeech-vocab.txt 200K word vocabulary for the LM
8 | file: librispeech-lexicon.txt Pronunciations, some of which G2P auto-generated, for all words in the vocabulary
9 | file: 3-gram.arpa.gz 3-gram ARPA LM, not pruned
10 | file: 3-gram.pruned.1e-7.arpa.gz 3-gram ARPA LM, pruned with threshold 1e-7
11 | file: 3-gram.pruned.3e-7.arpa.gz 3-gram ARPA LM, pruned with threshold 3e-7
12 | file: 4-gram.arpa.gz 4-gram ARPA LM, usually used for rescoring
13 | file: g2p-model-5 Fifth order Sequitur G2P model
14 |
--------------------------------------------------------------------------------
/resources/13/about.html:
--------------------------------------------------------------------------------
1 | The data includes non-speech sounds recorded in an anechoic room,
2 | reconstructed signals in various rooms, impulse responses for a
3 | microphone array, speech data recorded with the same array, and
4 | recordings of background noises. It is intended for use when
5 | simulating sound scenes. It was developed by the Real Acoustic
6 | Environments Working Group of the Real World Computing Partnership
7 | (RWCP). The data was recorded from 1998 to 2000.
8 |
9 |
10 | You can cite the data using the following BibTeX entry:
11 |
12 |
13 | @inproceedings{nakamura2000acoustical,
14 | title={Acoustical Sound Database in Real Environments for Sound Scene Understanding and Hands-Free Speech Recognition.},
15 | author={Nakamura, Satoshi and Hiyane, Kazuo and Asano, Futoshi and Nishiura, Takanobu and Yamada, Takeshi},
16 | booktitle={LREC},
17 | year={2000}
18 | }
19 |
--------------------------------------------------------------------------------
/resources/12/about.html:
--------------------------------------------------------------------------------
1 | LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech, prepared by
2 | Vassil Panayotov with the assistance of Daniel Povey.
3 | The data is derived from read audiobooks from the LibriVox project, and has been carefully segmented and aligned.
4 |
5 | Acoustic models, trained on this data set, are available at
6 | kaldi-asr.org and language models, suitable for evaluation can be found at
7 | http://www.openslr.org/11/.
8 |
9 | For more information, see the paper
10 | "LibriSpeech: an ASR corpus based on public domain audio books",
11 | Vassil Panayotov, Guoguo Chen, Daniel Povey and Sanjeev Khudanpur, ICASSP 2015 (submitted)
12 | (pdf)
13 |
14 |
--------------------------------------------------------------------------------
/resources/37/about.html:
--------------------------------------------------------------------------------
1 | This data is transcribed high-quality speech data for Bengali.
2 |
3 | The data collection was performed by Google.
4 |
5 | If you use this data in publications, please cite it as follows:
6 |
7 | @inproceedings{kjartansson-etal-tts-sltu2018,
8 | title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
9 | author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
10 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
11 | year = {2018},
12 | address = {Gurugram, India},
13 | month = aug,
14 | pages = {66--70},
15 | URL = {http://dx.doi.org/10.21437/SLTU.2018-14}
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/resources/6/about.html:
--------------------------------------------------------------------------------
1 |
2 | This data is transcribed telephone conversation data, in English and Czech.
3 |
4 | The data collection process and development of these training scripts was partly
5 | funded by the Ministry of Education, Youth and Sports of the Czech Republic
6 | under the grant agreement LK11221 and core research funding of Charles
7 | University in Prague.
8 |
9 |
10 | You can cite the data using the following BibTeX entry:
11 |
12 |
13 | @inproceedings{korvas_2014,
14 | title={{Free English and Czech telephone speech corpus shared under the CC-BY-SA 3.0 license}},
15 | author={Korvas, Mat\v{e}j and Pl\'{a}tek, Ond\v{r}ej and Du\v{s}ek, Ond\v{r}ej and \v{Z}ilka, Luk\'{a}\v{s} and Jur\v{c}\'{i}\v{c}ek, Filip},
16 | booktitle={Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC 2014)},
17 | pages={To Appear},
18 | year={2014},
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/resources/4/info.txt:
--------------------------------------------------------------------------------
1 | name: sctk
2 | summary: A mirror of the sctk scoring software
3 | category: software
4 | license: Public domain
5 | file: sctk-2.4.0-20091110-0958.tar.bz2 The original, bzipped version
6 | file: sctk-2.4.0-20091110-0958.tar.gz A gzipped version of the archive
7 | file: sctk-2.4.8-20130429-2145.tar.bz2 Version 2.4.8 of the software, as bz2
8 | file: sctk-2.4.9-20141015-1634Z.tar.bz2 Version 2.4.9 of the software, as bz2
9 | file: sctk-2.4.10-20151007-1312Z.tar.bz2 Version 2.4.10 of the software, as bz2
10 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.0-20091110-0958.tar.bz2 (for version 2.4.0)
11 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.8-20130429-2145.tar.bz2 (for version 2.4.8)
12 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.9-20141015-1634Z.tar.bz2 (for version 2.4.9)
13 | alternate_url: ftp://jaguar.ncsl.nist.gov/pub/sctk-2.4.10-20151007-1312Z.tar.bz2 (for version 2.4.10)
14 |
15 |
--------------------------------------------------------------------------------
/resources/54/info.txt:
--------------------------------------------------------------------------------
1 | name: Large Nepali ASR training data set
2 | summary: Nepali ASR training data set containing ~157K utterances.
3 | category: speech
4 | license: Attribution-ShareAlike 4.0 International
5 | file: about.html Information about the data set
6 | file: LICENSE License information for the data set
7 | file: utt_spk_text.tsv All utterances in the data set
8 | file: asr_nepali_0.zip Data set
9 | file: asr_nepali_1.zip Data set
10 | file: asr_nepali_2.zip Data set
11 | file: asr_nepali_3.zip Data set
12 | file: asr_nepali_4.zip Data set
13 | file: asr_nepali_5.zip Data set
14 | file: asr_nepali_6.zip Data set
15 | file: asr_nepali_7.zip Data set
16 | file: asr_nepali_8.zip Data set
17 | file: asr_nepali_9.zip Data set
18 | file: asr_nepali_a.zip Data set
19 | file: asr_nepali_b.zip Data set
20 | file: asr_nepali_c.zip Data set
21 | file: asr_nepali_d.zip Data set
22 | file: asr_nepali_e.zip Data set
23 | file: asr_nepali_f.zip Data set
24 |
--------------------------------------------------------------------------------
/resources/38/about.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | This corpus was recorded in a quiet indoor environment using cellphones. It has 855 speakers. Each speaker has 120 utterances. All utterances were carefully transcribed and checked by humans. Transcription accuracy is guaranteed. If there is any problem, we agree to correct it for you. The corpus contains:
9 | audio files;
10 | transcriptions;
11 | metadata;
12 |
13 | Please cite the data as “ST-CMDS-20170001_1, Free ST Chinese Mandarin Corpus”.
14 |
15 | The data set is a subset of a much bigger data set which was recorded in the same environment as this open source data. Please visit our website www.surfing.ai for details.
16 |
17 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/resources/16/info.txt:
--------------------------------------------------------------------------------
1 | name: The AMI Corpus
2 | summary: Acoustic speech data and meta-data from The AMI corpus.
3 | category: speech
4 | license: THE CREATIVE COMMONS ATTRIBUTION-NONCOMMERCIAL-SHAREALIKE v2.0 LICENCE (modified, look for more details in the licence file and/or the AMI webpage)
5 | file: ami_manual_1.6.1.tar.gz AMI annotation files (ver. 1.6.1)
6 | file: headset.tar.gz Close-talking acoustic data
7 | file: Array1-01.tar.gz Array1 distant acoustic data
8 | file: Array1-02.tar.gz Array1 distant acoustic data
9 | file: Array1-03.tar.gz Array1 distant acoustic data
10 | file: Array1-04.tar.gz Array1 distant acoustic data
11 | file: Array1-05.tar.gz Array1 distant acoustic data
12 | file: Array1-06.tar.gz Array1 distant acoustic data
13 | file: Array1-07.tar.gz Array1 distant acoustic data
14 | file: Array1-08.tar.gz Array1 distant acoustic data
15 | alternate_url: http://groups.inf.ed.ac.uk/ami/corpus The official AMI corpus webpage
16 |
17 |
18 |
--------------------------------------------------------------------------------
/resources/50/about.html:
--------------------------------------------------------------------------------
1 | These are unofficial data splits for the corpus MADCAT Chinese Pilot Training Set (LDC2014T13).
2 | LDC is providing only training data for this corpus and not the original dev/eval sets, so the original
3 | training data have been split into three different disjoint parts (i.e. there shouldn't be sentences/lines
4 | from the same document in different sets -- as each document is handwritten/transcribed
5 | by a different author in the MADCAT data) to allow for evaluation of the performance in the usual way.
6 |
7 | Also, please note that the license applies only to the splits. You still need to obtain the original databases
8 | and respect the databases' license!
9 |
10 | It contains the madcat xml name and segment id (s{1,2,3,4}). For example:
11 |
12 | GMW_CMN_20070118.0014_001_LDC0632.madcat.xml s1
13 | GMW_CMN_20070118.0014_001_LDC0632.madcat.xml s2
14 | GMW_CMN_20070118.0014_001_LDC0632.madcat.xml s3
15 |
16 |
17 |
--------------------------------------------------------------------------------
/resources/52/info.txt:
--------------------------------------------------------------------------------
1 | name: Large Sinhala ASR training data set
2 | summary: Sinhala ASR training data set containing ~185K utterances.
3 | category: speech
4 | license: Attribution-ShareAlike 4.0 International
5 | file: about.html Information about the data set
6 | file: LICENSE License information for the data set
7 | file: utt_spk_text.tsv All utterances in the data set
8 | file: asr_sinhala_0.zip Data set
9 | file: asr_sinhala_1.zip Data set
10 | file: asr_sinhala_2.zip Data set
11 | file: asr_sinhala_3.zip Data set
12 | file: asr_sinhala_4.zip Data set
13 | file: asr_sinhala_5.zip Data set
14 | file: asr_sinhala_6.zip Data set
15 | file: asr_sinhala_7.zip Data set
16 | file: asr_sinhala_8.zip Data set
17 | file: asr_sinhala_9.zip Data set
18 | file: asr_sinhala_a.zip Data set
19 | file: asr_sinhala_b.zip Data set
20 | file: asr_sinhala_c.zip Data set
21 | file: asr_sinhala_d.zip Data set
22 | file: asr_sinhala_e.zip Data set
23 | file: asr_sinhala_f.zip Data set
24 |
--------------------------------------------------------------------------------
/resources/53/info.txt:
--------------------------------------------------------------------------------
1 | name: Large Bengali ASR training data set
2 | summary: Bengali ASR training data set containing ~196K utterances.
3 | category: speech
4 | license: Attribution-ShareAlike 4.0 International
5 | file: about.html Information about the data set
6 | file: LICENSE License information for the data set
7 | file: utt_spk_text.tsv All utterances in the data set
8 | file: asr_bengali_0.zip Data set
9 | file: asr_bengali_1.zip Data set
10 | file: asr_bengali_2.zip Data set
11 | file: asr_bengali_3.zip Data set
12 | file: asr_bengali_4.zip Data set
13 | file: asr_bengali_5.zip Data set
14 | file: asr_bengali_6.zip Data set
15 | file: asr_bengali_7.zip Data set
16 | file: asr_bengali_8.zip Data set
17 | file: asr_bengali_9.zip Data set
18 | file: asr_bengali_a.zip Data set
19 | file: asr_bengali_b.zip Data set
20 | file: asr_bengali_c.zip Data set
21 | file: asr_bengali_d.zip Data set
22 | file: asr_bengali_e.zip Data set
23 | file: asr_bengali_f.zip Data set
24 |
--------------------------------------------------------------------------------
/resources/33/about.html:
--------------------------------------------------------------------------------
1 | Aishell is an open-source Chinese Mandarin speech corpus published by
2 | Beijing Shell Shell Technology Co.,Ltd.
3 |
4 | 400 people from different accent areas in China are invited to
5 | participate in the recording, which is conducted in a quiet indoor
6 | environment using high fidelity microphone and downsampled to 16kHz.
7 | The manual transcription accuracy is above 95%, through professional
8 | speech annotation and strict quality inspection. The data is free
9 | for academic use. We hope to provide moderate amount of data for new
10 | researchers in the field of speech recognition.
11 |
12 |
13 | You can cite the data
14 | using the following BibTeX entry:
15 |
16 |
17 | @inproceedings{aishell_2017,
18 | title={AIShell-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline},
19 | author={Hui Bu and Jiayu Du and Xingyu Na and Bengu Wu and Hao Zheng},
20 | booktitle={Oriental COCOSDA 2017},
21 | pages={Submitted},
22 | year={2017}
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/resources/39/about.html:
--------------------------------------------------------------------------------
1 | Heroico
2 |
3 | The Heroico corpus (LDC2006S37) was originally collected to train acoustic models for pronunciation modeling in Spanish language learning applications.
4 | The corpus consists of two main subcorpora:
5 |
6 | 1. A subcorpus collected at Mexico's Military Academy called Heroico.
7 |
8 | 2. A subcorpus collected at the United States Military Academy (USMA) in West Point New York.
9 |
10 | The Heroico corpus is further divided into recited and prompted speech subcorpora.
11 | The recited speech appears under the recordings directory and the prompted speech under the answers directory.
12 |
13 | The USMA subcorpus includes 1.2 hours of speech from nonnative informants and 1 hour of speech from native speakers.
14 | All the speech in the USMA corpus was recited.
15 |
16 | The Heroico subcorpus has 11.8 hours of speech.
17 | One hour segment of speech in the Heroico corpus was recited from the same set of prompts that was used in the USMA collection.
18 |
--------------------------------------------------------------------------------
/resources/12/info.txt:
--------------------------------------------------------------------------------
1 | name: LibriSpeech ASR corpus
2 | summary: Large-scale (1000 hours) corpus of read English speech
3 | category: speech
4 | license: CC BY 4.0
5 | file: dev-clean.tar.gz development set, "clean" speech
6 | file: dev-other.tar.gz development set, "other", more challenging, speech
7 | file: test-clean.tar.gz test set, "clean" speech
8 | file: test-other.tar.gz test set, "other" speech
9 | file: train-clean-100.tar.gz training set of 100 hours "clean" speech
10 | file: train-clean-360.tar.gz training set of 360 hours "clean" speech
11 | file: train-other-500.tar.gz training set of 500 hours "other" speech
12 | file: intro-disclaimers.tar.gz extracted LibriVox announcements for some of the speakers
13 | file: original-mp3.tar.gz LibriVox mp3 files, from which corpus' audio was extracted
14 | file: original-books.tar.gz Project Gutenberg texts, against which the audio in the corpus was aligned
15 | file: raw-metadata.tar.gz Some extra meta-data produced during the creation of the corpus
16 | file: md5sum.txt MD5 checksums for the archive files
17 |
--------------------------------------------------------------------------------
/resources/47/about.html:
--------------------------------------------------------------------------------
1 | This free Chinese Mandarin speech corpus set is released by Shanghai Primewords Information Technology Co., Ltd.
2 |
3 |
4 | The corpus is recorded by smart mobile phones from 296 native Chinese speakers. The transcription accuracy is larger than 98%, at the confidence level of 95%. It is free for academic use.
5 |
6 |
7 |
8 | The mapping between the transcript and utterance is given in JSON format.
9 |
10 |
11 |
12 | You can cite the data using the following BibTeX entry:
13 |
14 | @misc{primewords_201801,
15 | title={Primewords Chinese Corpus Set 1},
16 | author={Primewords Information Technology Co., Ltd.},
17 | year={2018},
18 | note={\url{https://www.primewords.cn}}
19 | }
20 |
21 |
22 |
23 |
24 | CONTACT
25 |
26 | Yinghui Liu, yinghui_liu@primewords.cn
27 |
28 |
29 |
30 | External URLs: https://www.primewords.cn
31 |
32 |
--------------------------------------------------------------------------------
/resources/48/about.html:
--------------------------------------------------------------------------------
1 | These are unofficial data splits for the corpus MADCAT Arabic (LDC2013T15, LDC2013T09, LDC2012T15).
2 | LDC is providing only training data for these corpora and not the original dev/eval sets, so the original
3 | training data have been split into three different disjoint parts (i.e. there shouldn't be sentences/lines
4 | from the same document in different sets -- as each document is handwritten/transcribed
5 | by a different author in the MADCAT data) to allow for evaluation of the performance in the usual way.
6 |
7 | Also, please note that the license applies only to the splits. You still need to obtain the original databases
8 | and respect the databases' license!
9 |
10 | It contains the madcat xml name and segment id (s{1,2,3,4}). For example:
11 |
12 | groups.google.com_women1000_508c404bd84f8ba3_ARB_20060426_124900_3_LDC0188.madcat.xml s1
13 | groups.google.com_women1000_508c404bd84f8ba3_ARB_20060426_124900_3_LDC0188.madcat.xml s2
14 | groups.google.com_women1000_508c404bd84f8ba3_ARB_20060426_124900_3_LDC0188.madcat.xml s3
15 |
16 |
17 |
--------------------------------------------------------------------------------
/resources/54/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed audio data for Nepali. The data set consists of wave files, and a TSV file.
2 | The file utt_spk_text.tsv contains a FileID, anonymized UserID and the transcription of audio in the file.
3 |
4 | The data set has been manually quality checked, but there might still be errors.
5 |
6 | See LICENSE.txt file for license information.
7 |
8 | Copyright 2016, 2017, 2018 Google, Inc.
9 |
10 | If you use this data in publications, please cite it as follows:
11 |
12 | @inproceedings{kjartansson-etal-sltu2018,
13 | title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},
14 | author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
15 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
16 | year = {2018},
17 | address = {Gurugram, India},
18 | month = aug,
19 | pages = {52--55},
20 | URL = {http://dx.doi.org/10.21437/SLTU.2018-11}
21 | }
22 |
23 |
--------------------------------------------------------------------------------
/resources/52/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed audio data for Sinhala. The data set consists
2 | of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID,
3 | anonymized UserID and the transcription of audio in the file.
4 |
5 | The data set has been manually quality checked, but there might still be errors.
6 |
7 | See LICENSE.txt file for license information.
8 |
9 | Copyright 2016, 2017, 2018 Google, Inc.
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-sltu2018,
14 | title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},
15 | author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {52--55},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-11}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/53/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed audio data for Bengali. The data set consists
2 | of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID,
3 | anonymized UserID and the transcription of audio in the file.
4 |
5 | The data set has been manually quality checked, but there might still be errors.
6 |
7 | See LICENSE.txt file for license information.
8 |
9 | Copyright 2016, 2017, 2018 Google, Inc.
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-sltu2018,
14 | title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},
15 | author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {52--55},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-11}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/61/info.txt:
--------------------------------------------------------------------------------
1 | name: Crowdsourced high-quality Argentinian Spanish speech data set.
2 | summary: Data set which contains 5739 recordings of native speakers of Spanish
3 | recorded in Buenos Aires, Argentina. The data set has both male and female
4 | recordings. Part of the data set is a small section of weather messages,
5 | recorded both in Peninsular Spanish (90 messages) as well as Argentinian Spanish
6 | (90 messages).
7 | category: speech
8 | license: Attribution-ShareAlike 4.0 International
9 | file: about.html Information about the data set
10 | file: LICENSE License information for the data set
11 | file: line_index_female.tsv All utterances for the female speakers.
12 | file: line_index_male.tsv All utterances for the male speakers.
13 | file: es_es_line_index_weather.tsv Weather messages in Peninsular Spanish.
14 | file: es_ar_line_index_weather.tsv Weather messages in Argentinian Spanish.
15 | file: es_ar_female.zip Archive file with all audio for the female speakers.
16 | file: es_ar_male.zip Archive file with all audio for the male speakers.
17 | file: es_weather_messages.zip Archive file with the weather messages.
18 |
--------------------------------------------------------------------------------
/resources/36/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed audio data for Sundanese. The data set consists of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file.
2 |
3 | The data set has been manually quality checked, but there might still be errors.
4 |
5 | This dataset was collected by Google in Indonesia.
6 |
7 | See LICENSE.txt file for license information.
8 |
9 | Copyright 2016, 2017 Google, Inc.
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-sltu2018,
14 | title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},
15 | author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {52--55},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-11},
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/35/info.txt:
--------------------------------------------------------------------------------
1 | name: Large Javanese ASR training data set
2 | summary: Javanese ASR training data set containing ~185K utterances.
3 | category: speech
4 | license: Attribution-ShareAlike 4.0 International
5 | file: asr_javanese.sha256 Checksum for the files
6 | file: LICENSE License information for the data set
7 | file: utt_spk_text.tsv All utterances in the data set
8 | file: asr_javanese_0.zip Data set, file 0/15
9 | file: asr_javanese_1.zip Data set, file 1/15
10 | file: asr_javanese_2.zip Data set, file 2/15
11 | file: asr_javanese_3.zip Data set, file 3/15
12 | file: asr_javanese_4.zip Data set, file 4/15
13 | file: asr_javanese_5.zip Data set, file 5/15
14 | file: asr_javanese_6.zip Data set, file 6/15
15 | file: asr_javanese_7.zip Data set, file 7/15
16 | file: asr_javanese_8.zip Data set, file 8/15
17 | file: asr_javanese_9.zip Data set, file 9/15
18 | file: asr_javanese_a.zip Data set, file 10/15
19 | file: asr_javanese_b.zip Data set, file 11/15
20 | file: asr_javanese_c.zip Data set, file 12/15
21 | file: asr_javanese_d.zip Data set, file 13/15
22 | file: asr_javanese_e.zip Data set, file 14/15
23 | file: asr_javanese_f.zip Data set, file 15/15
24 |
25 |
--------------------------------------------------------------------------------
/resources/36/info.txt:
--------------------------------------------------------------------------------
1 | name: Large Sundanese ASR training data set
2 | summary: Sundanese ASR training data set containing ~220K utterances.
3 | category: speech
4 | license: Attribution-ShareAlike 4.0 International
5 | file: asr_sundanese.sha256 Checksum for the files
6 | file: LICENSE License information for the data set
7 | file: utt_spk_text.tsv All utterances in the data set
8 | file: asr_sundanese_0.zip Data set, file 0/15
9 | file: asr_sundanese_1.zip Data set, file 1/15
10 | file: asr_sundanese_2.zip Data set, file 2/15
11 | file: asr_sundanese_3.zip Data set, file 3/15
12 | file: asr_sundanese_4.zip Data set, file 4/15
13 | file: asr_sundanese_5.zip Data set, file 5/15
14 | file: asr_sundanese_6.zip Data set, file 6/15
15 | file: asr_sundanese_7.zip Data set, file 7/15
16 | file: asr_sundanese_8.zip Data set, file 8/15
17 | file: asr_sundanese_9.zip Data set, file 9/15
18 | file: asr_sundanese_a.zip Data set, file 10/15
19 | file: asr_sundanese_b.zip Data set, file 11/15
20 | file: asr_sundanese_c.zip Data set, file 12/15
21 | file: asr_sundanese_d.zip Data set, file 13/15
22 | file: asr_sundanese_e.zip Data set, file 14/15
23 | file: asr_sundanese_f.zip Data set, file 15/15
24 |
25 |
--------------------------------------------------------------------------------
/resources/85/about.html:
--------------------------------------------------------------------------------
1 | The data is used in AISHELL Speaker Verification Challenge 2019. It is extracted from a larger database called AISHELL-WakeUp-1.
2 |
3 |
4 | The contents are wake-up words "Hi, Mia" in both Chinese and English.
5 | The data is collected in real home environment using microphone arrays and Hi-Fi microphone.
6 | The collection process and development of a baseline system was described in the paper below.
7 | The data used in the challenge is extracted from 1 Hi-Fi microphone and 16-channel circular microphone arrays for 1/3/5 meters.
8 | And the contents are the Chinese wake-up words. The whole set is divided into train (254 people), dev (42 people) and test (44 people) subsets.
9 | Test subset is provided with paired target/non-target answer to evaluate verification results.
10 |
11 |
12 | You can cite the data using the following BibTeX entry:
13 |
14 |
15 | @misc{himia,
16 | title={HI-MIA : A Far-field Text-Dependent Speaker Verification Database and the Baselines},
17 | author={Xiaoyi Qin and Hui Bu and Ming Li},
18 | year={2019},
19 | eprint={1912.01231},
20 | archivePrefix={arXiv},
21 | primaryClass={cs.SD}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/59/about.html:
--------------------------------------------------------------------------------
1 | ParlamentParla is a speech corpus for Catalan, published by the workers cooperative Col·lectivaT. The audio segments were extracted from recordings of the Catalan Parliament (Parlament de Catalunya) plenary sessions. The recordings were aligned with their transcripts, and the cleanest 320 hours of segments were extracted. The content belongs to the Catalan Parliament and the data is released conforming to their terms of use.
2 |
3 | Preparation of this corpus was supported by the Department of Culture of the Catalan autonomous government.
4 |
5 | The audio files are PCM 16bit mono, little endian with the sample rate 16 kHz. As of release version 1.0, the corpus is separated into 90 hours of clean and 230 hours of other quality segments.
6 |
7 | For contact, write to info@collectivat.cat
8 |
9 | https://collectivat.cat/asr The official ParlamentParla corpus webpage, with other resources and updates
10 |
11 |
--------------------------------------------------------------------------------
/resources/67/about.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | TEDx Spanish Corpus
4 |
5 |
6 |
7 |
8 | The TEDx Spanish Corpus is a gender unbalanced corpus of 24 hours of duration.
9 |
10 | It contains spontaneous speech of several expositors in TEDx events; most of them are men.
11 |
12 | Transcriptions are presented in lowercase with no punctuation marks.
13 |
14 | The data collection process was partly developed by the social service program "Desarrollo de Tecnologías del Habla" that depends on the National Autonomous University of Mexico and partly by the CIEMPIESS-UNAM project (http://www.ciempiess.org/)
15 |
16 | Special thanks to the TED-Talks team for allowing us to share this dataset.
17 |
18 | You can cite the data using the following BibTeX entry:
19 |
20 |
21 | @misc{mena_2019,
22 | title = "{TEDx Spanish Corpus. Audio and transcripts in Spanish taken from the TEDx Talks; shared under the CC BY-NC-ND 4.0 license}",
23 | author = "Hernandez-Mena, Carlos D.",
24 | howpublished = "Web Download",
25 | institution = "Universidad Nacional Autonoma de Mexico",
26 | location = "Mexico City",
27 | year = "2019"
28 | }
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/resources/35/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed audio data for Javanese. The data set consists of wave files, and a TSV file. The file utt_spk_text.tsv contains a FileID, UserID and the transcription of audio in the file.
2 |
3 | The data set has been manually quality checked, but there might still be errors.
4 |
5 | This dataset was collected by Google in collaboration with Reykjavik University and Universitas Gadjah Mada in Indonesia.
6 |
7 | See LICENSE.txt file for license information.
8 |
9 | Copyright 2016, 2017 Google, Inc.
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-sltu2018,
14 | title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},
15 | author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {52--55},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-11},
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/32/about.html:
--------------------------------------------------------------------------------
1 | This data set contains multi-speaker high quality transcribed audio data for four languages of South Africa. The data set consists of wave files, and a TSV file transcribing the audio. In each folder, the file line_index.tsv contains a FileID, which in turn contains the UserID and the Transcription of audio in the file.
2 |
3 | The data set has had some quality checks, but there might still be errors.
4 |
5 | This data set was collected as a collaboration between North West University and Google.
6 |
7 | See LICENSE.txt file for license information.
8 |
9 | Copyright 2017 Google, Inc.
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{van-niekerk-etal-2017,
14 | title = {{Rapid development of TTS corpora for four South African languages}},
15 | author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson and Martin Jansche and Linne Ha},
16 | booktitle = {Proc. Interspeech 2017},
17 | pages = {2178--2182},
18 | address = {Stockholm, Sweden},
19 | month = aug,
20 | year = {2017},
21 | URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/42/about.html:
--------------------------------------------------------------------------------
1 | This data set contains high-quality transcribed audio data for Khmer. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number.
2 |
3 | The data set has been manually quality checked, but there might still be errors.
4 |
5 | This dataset was collected by Google.
6 |
7 | See LICENSE file for license information.
8 |
9 | Copyright 2016, 2017, 2018 Google LLC
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-tts-sltu2018,
14 | title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 | author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {66--70},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/43/about.html:
--------------------------------------------------------------------------------
1 | This data set contains high-quality transcribed audio data for Nepali. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number.
2 |
3 | The data set has been manually quality checked, but there might still be errors.
4 |
5 | This dataset was collected by Google in Nepal.
6 |
7 | See LICENSE.txt file for license information.
8 |
9 | Copyright 2016, 2017, 2018 Google LLC
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-tts-sltu2018,
14 | title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 | author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {66--70},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/2/info.txt:
--------------------------------------------------------------------------------
1 | name: OpenFST
2 | summary: A mirror of the OpenFst toolkit
3 | category: software
4 | license: Apache 2.0
5 | file: openfst-1.3.2.tar.gz Version 1.3.2
6 | file: openfst-1.3.3.tar.gz Version 1.3.3
7 | file: openfst-1.3.4.tar.gz Version 1.3.4
8 | file: openfst-1.4.1.tar.gz Version 1.4.1
9 | file: openfst-1.5.4.tar.gz Version 1.5.4
10 | file: openfst-1.6.2.tar.gz Version 1.6.2
11 | file: openfst-1.6.5.tar.gz Version 1.6.5
12 | file: openfst-1.6.7.tar.gz Version 1.6.7
13 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.3.2.tar.gz Version 1.3.2
14 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.3.3.tar.gz Version 1.3.3
15 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.3.4.tar.gz Version 1.3.4
16 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.4.1.tar.gz Version 1.4.1
17 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.5.4.tar.gz Version 1.5.4
18 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.6.2.tar.gz Version 1.6.2
19 | alternate_url: http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-1.6.5.tar.gz Version 1.6.5
20 | alternate_url: http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.7.tar.gz Version 1.6.7
21 |
--------------------------------------------------------------------------------
/resources/44/about.html:
--------------------------------------------------------------------------------
1 | This data set contains high-quality transcribed audio data for Sundanese. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number.
2 |
3 | The data set has been manually quality checked, but there might still be errors.
4 |
5 | This dataset was collected by Google in collaboration with Universitas Pendidikan Indonesia.
6 |
7 | See LICENSE file for license information.
8 |
9 | Copyright 2016, 2017, 2018 Google LLC
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-tts-sltu2018,
14 | title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 | author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {66--70},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/41/about.html:
--------------------------------------------------------------------------------
1 | This data set contains high-quality transcribed audio data for Javanese. The data set consists of wave files, and a TSV file. The file line_index.tsv contains a filename and the transcription of audio in the file. Each filename is prepended with a speaker identification number.
2 |
3 | The data set has been manually quality checked, but there might still be errors.
4 |
5 | This dataset was collected by Google in collaboration with Gadjah Mada University in Indonesia.
6 |
7 | See LICENSE file for license information.
8 |
9 | Copyright 2016, 2017, 2018 Google LLC
10 |
11 | If you use this data in publications, please cite it as follows:
12 |
13 | @inproceedings{kjartansson-etal-tts-sltu2018,
14 | title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese, Khmer, Nepali, Sinhala, and Sundanese}},
15 | author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu De Silva and Supheakmungkol Sarin},
16 | booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
17 | year = {2018},
18 | address = {Gurugram, India},
19 | month = aug,
20 | pages = {66--70},
21 | URL = {http://dx.doi.org/10.21437/SLTU.2018-14}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/resources/57/about.html:
--------------------------------------------------------------------------------
1 | African Accented French Corpus
2 |
3 | This corpus consists of approximately 22 hours of speech recordings.
4 | Transcripts are provided for all the recordings.
5 | The corpus can be divided into 3 parts:
6 |
7 | 1. Yaounde
8 |
9 | Collected by a team from the U.S. Military Academy's Center for Technology Enhanced Language Learning (CTELL) in 2003 in Yaoundé, Cameroon. It has recordings from 84 speakers, 48 male and 36 female.
10 |
11 | 2. CA16
12 |
13 | This part was collected by a RDECOM Science Team who participated in the United Nations exercise Central Accord 16 (CA16) in Libreville, Gabon in June 2016. The Science Team included DARPA's Dr. Boyan Onyshkevich and Dr. Aaron Lawson (SRI International), as well as RDECOM scientists.
14 | It has recordings from 125 speakers from Cameroon, Chad, Congo and Gabon.
15 |
16 | 3. Niger
17 |
18 | This part was collected from 23 speakers in Niamey, Niger, Oct. 26-30 2015. These speakers were students in a course for officers and sergeants presented by Army trainers assigned to U.S. Army Africa. The data was collected by RDECOM Science & Technology Advisors Major Eddie Strimel and Mr. Bill Bergen.
19 |
20 | Visit this page for further info
21 |
--------------------------------------------------------------------------------
/resources/87/about.html:
--------------------------------------------------------------------------------
1 | The MobvoiHotwords is a corpus of wake-up words collected from a commercial smart speaker of Mobvoi. It consists of keyword and non-keyword utterances.
2 |
3 | For keyword data, utterances containing either 'Hi xiaowen' or 'Nihao Wenwen' were collected. For each keyword, there are about 36k utterances. All keyword data is collected from 788 subjects, ages 3-65, with different distances from the smart speaker (1, 3 and 5 meters). Different noises (typical home environment noises like music and TV) with varying sound pressure levels are played in the background during the collection. The keyword data is identical to the keyword data used in the paper below:
4 |
5 | @article{DBLP:journals/spl/HouSOHX19,
6 | author = {Jingyong Hou and
7 | Yangyang Shi and
8 | Mari Ostendorf and
9 | Mei{-}Yuh Hwang and
10 | Lei Xie},
11 | title = {Region Proposal Network Based Small-Footprint Keyword Spotting},
12 | journal = {{IEEE} Signal Process. Lett.},
13 | volume = {26},
14 | number = {10},
15 | pages = {1471--1475},
16 | year = {2019},
17 | url = {https://doi.org/10.1109/LSP.2019.2936282},
18 | doi = {10.1109/LSP.2019.2936282}
19 | }
20 |
21 | There are also ~220 hours of non-keyword data that can be used as negative training samples, collected from the same smart speaker.
22 |
--------------------------------------------------------------------------------
/resources/16/about.html:
--------------------------------------------------------------------------------
1 | This is a mirror of The AMI Corpus acoustic data originally hosted on http://groups.inf.ed.ac.uk/ami/corpus/
2 |
3 |
4 | The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written. The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers.
5 |
6 |
7 | The associated paper(s) describing the data:
8 |
9 |
2 | This is the TED-LIUM corpus release 2, 3 | licensed under Creative Commons BY-NC-ND 3.0 (http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en). 4 | 5 | All talks and text are property of TED Conferences LLC. 6 | 7 | --- 8 | 9 | The TED-LIUM corpus was made from audio talks and their transcriptions available on the TED website. We have prepared and filtered these data in order to train acoustic models to participate to the International Workshop on Spoken Language Translation 2011 (the LIUM English/French SLT system reached the first rank in the SLT task). 10 | 11 | More details are given in this paper: 12 | 13 | A. Rousseau, P. Deléglise, and Y. Estève, "Enhancing the TED-LIUM Corpus with Selected Data for Language Modeling and More TED Talks", 14 | in Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14), May 2014. 15 | 16 | 17 | Please cite this reference if you use these data in your research work. 18 | 19 | --- 20 | 21 | Contents: 22 | 23 | - 1495 audio talks in NIST sphere format (SPH) 24 | - 1495 transcripts in STM format 25 | - Dictionary with pronunciation (159848 entries) 26 | - Selected monolingual data for language modeling from WMT12 publicly available corpora 27 | 28 | 29 | SPH format info: 30 | 31 | Channels : 1 32 | Sample Rate : 16000 33 | Precision : 16-bit 34 | Bit Rate : 256k 35 | Sample Encoding : 16-bit Signed Integer PCM 36 | 37 |38 | -------------------------------------------------------------------------------- /resources/55/about.html: -------------------------------------------------------------------------------- 1 | CLMAD is an open Chinese Language Model Adaptation Dataset. The dataset contains 14 classes of 740,000 news. Several necessary preprocessing steps are adopted on the dataset for language model training. Documents are split into sentences in terms of punctuations, and then all punctuations are removed. 
The ICTLACS word segmentation tool is used to segment continuous character sequences into word sequences. Each class of text is split into a training set and a testing set. The testing set consists of 7000 randomly selected sentences. The training and testing sets do not overlap. Detailed comparative experiments on four selected domains (fashion, finance, sport, and stock) are shown in our paper "CLMAD: A Chinese Language Model Adaptation Dataset", Ye Bai, Jianhua Tao, Jiangyan Yi, Zhengqi Wen, Cunhang Fan, ISCSLP 2018 (submitted). 2 | 3 |
4 | The dataset is extended from the THUCNews text classification dataset. We thank the NLP lab of Tsinghua University for providing the THUCNews corpus, and Dr. Zhiyuan Liu for permitting us to extend this corpus. 5 |
6 | 7 | You can cite the data using the following BibTeX entry: 8 |
9 |
10 | @inproceedings{yebai2018clmad,
11 | title={CLMAD: A Chinese Language Model Adaptation Dataset},
12 | author={Ye Bai and Jianhua Tao and Jiangyan Yi and Zhengqi Wen and Cunhang Fan},
13 | booktitle={The Eleventh International Symposium on Chinese Spoken Language Processing (ISCSLP 2018)},
14 | pages={To Appear},
15 | year={2018},
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/resources/90/about.html:
--------------------------------------------------------------------------------
1 | 5 | The Switchboard (SWB) corpus 6 | is one of the most important historical benchmarks 7 | for recognition tasks involving large vocabulary conversational speech (LVCSR). 8 | It contains 2430 conversations averaging 6 minutes in length; in other words, 9 | over 240 hours of recorded speech, and about 3 million words of text, spoken by 10 | over 500 speakers of both sexes from every major dialect of American English. 11 |
12 | The initial transcriptions for SWB have error rates above 10%, resulting in poor 13 | recognition performance, particularly on hard-to-recognize words such as 14 | monosyllabic words. This release of the SWB transcriptions, which was developed 15 | by the Institute for Signal and Information Processing at Mississippi State 16 | University in the late 1990's, includes transcriptions that were manually 17 | corrected to have error rates below 1%. The release also includes 18 | manually-adjusted segmentations and word alignments. 19 | 20 | 21 | 28 | -------------------------------------------------------------------------------- /resources/76/about.html: -------------------------------------------------------------------------------- 1 | This data set contains transcribed high-quality audio of Basque sentences 2 | recorded by volunteers. The data set consists of wave files, and a TSV file 3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the 4 | transcription of audio in the file. 5 |
6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{kjartansson-etal-2020-open,
20 | title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
21 | author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
22 | booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
23 | year = {2020},
24 | pages = {21--27},
25 | month = may,
26 | address = {Marseille, France},
27 | publisher = {European Language Resources Association (ELRA)},
28 | url = {https://www.aclweb.org/anthology/2020.sltu-1.3},
29 | ISBN = {979-10-95546-35-1},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/69/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Catalan sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{kjartansson-etal-2020-open,
20 | title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
21 | author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
22 | booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
23 | year = {2020},
24 | pages = {21--27},
25 | month = may,
26 | address = {Marseille, France},
27 | publisher = {European Language Resources Association (ELRA)},
28 | url = {https://www.aclweb.org/anthology/2020.sltu-1.3},
29 | ISBN = {979-10-95546-35-1},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/77/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Galician sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{kjartansson-etal-2020-open,
20 | title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},
21 | author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
22 | booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
23 | year = {2020},
24 | pages = {21--27},
25 | month = may,
26 | address = {Marseille, France},
27 | publisher = {European Language Resources Association (ELRA)},
28 | url = {https://www.aclweb.org/anthology/2020.sltu-1.3},
29 | ISBN = {979-10-95546-35-1},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/71/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Chilean Spanish
2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV
3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and
4 | the transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 | title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 | author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | year = {2020},
24 | month = may,
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 | pages = {6504--6513},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/72/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Colombian Spanish
2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV
3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and
4 | the transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 | title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 | author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | year = {2020},
24 | month = may,
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 | pages = {6504--6513},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/73/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Peruvian Spanish
2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV
3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and
4 | the transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 | title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 | author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | year = {2020},
24 | month = may,
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 | pages = {6504--6513},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/74/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Puerto Rico Spanish
2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV
3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and
4 | the transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 | title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 | author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | year = {2020},
24 | month = may,
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 | pages = {6504--6513},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/75/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Venezuelan Spanish
2 | sentences recorded by volunteers. The data set consists of wave files, and a TSV
3 | file (line_index.tsv). The file line_index.tsv contains an anonymized FileID and
4 | the transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
20 | title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},
21 | author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | year = {2020},
24 | month = may,
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | url = {https://www.aclweb.org/anthology/2020.lrec-1.801},
28 | pages = {6504--6513},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/80/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Burmese sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{oo-etal-2020-burmese,
20 | title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application to Text-to-Speech}},
21 | author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | month = may,
24 | year = {2020},
25 | pages = "6328--6339",
26 | address = {Marseille, France},
27 | publisher = {European Language Resources Association (ELRA)},
28 | url = {https://www.aclweb.org/anthology/2020.lrec-1.777},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/22/about.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | INTRODUCTION
4 | -----------
5 |
6 | THUYG-20 is an open Uyghur speech database published by Center for Speech and Language Technology (CSLT)
7 | at Tsinghua University, Signal and Information Processing Lab at Xinjiang University, and the AI cloud
8 | research center (AICRC). It involves the full set of speech and language resources required to establish
9 | an Uyghur speech recognition system and an Uyghur speaker recognition system.
10 |
11 | You can cite the data using the following BibTeX entry:
12 |
13 |
14 | @inproceedings{THUGY20_2015,
15 | title={THUYG-20: A free Uyghur speech database},
16 | author={Askar Roze and Shi Yin and Zhiyong Zhang and Dong Wang and Askar Hamdulla},
17 | booktitle={NCMMSC'15},
18 | year={2015}
19 | }
20 |
21 | @inproceedings{THUGY20_sre_2015,
22 | title={AN OPEN/FREE DATABASE AND BENCHMARK FOR UYGHUR SPEAKER RECOGNITION},
23 | author={Askar Rozi and Dong Wang and Zhiyong Zhang},
24 | booktitle={O-COCOSDA'15},
25 | year={2015}
26 | }
27 |
28 |
29 | PEOPLE
30 | -----------
31 | Dong Wang, Zhiyong Zhang, Shi Yin, Askar Roze @CSLT, Tsinghua Univ.
32 | Askar Hamdulla @Xinjiang Univ.
33 |
34 | CONTACT
35 | -----------
36 | Dong Wang
37 | Xuewei Zhang
38 | Zhiyong Zhang
39 |
40 | CSLT, Tsinghua University
41 | wangdong99@mails.tsinghua.edu.cn
42 | {zxw,zhangzy}@cslt.riit.tsinghua.edu.cn
43 |
44 |
45 | ROOM1-303, BLDG FIT
46 | Tsinghua University
47 |
48 | http://cslt.org
49 | http://cslt.riit.tsinghua.edu.cn
50 |
51 | Askar Hamdulla
52 | Xinjiang University
53 | askarhamdulla@gmail.com
54 | http://erj1.xju.edu.cn/znxx/index.htm
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/resources/65/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Tamil sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{he-etal-2020-open,
20 | title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 | author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | month = may,
24 | year = {2020},
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | pages = {6494--6503},
28 | url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/66/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Telugu sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{he-etal-2020-open,
20 | title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 | author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | month = may,
24 | year = {2020},
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | pages = {6494--6503},
28 | url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/63/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Malayalam sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{he-etal-2020-open,
20 | title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 | author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | month = may,
24 | year = {2020},
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | pages = {6494--6503},
28 | url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/64/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Marathi sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{he-etal-2020-open,
20 | title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 | author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | month = may,
24 | year = {2020},
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | pages = {6494--6503},
28 | url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/78/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Gujarati sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{he-etal-2020-open,
20 | title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 | author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | month = may,
24 | year = {2020},
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | pages = {6494--6503},
28 | url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/79/about.html:
--------------------------------------------------------------------------------
1 | This data set contains transcribed high-quality audio of Kannada sentences
2 | recorded by volunteers. The data set consists of wave files, and a TSV file
3 | (line_index.tsv). The file line_index.tsv contains an anonymized FileID and the
4 | transcription of audio in the file.
5 | 6 | The data set has been manually quality checked, but there might still be errors. 7 |
8 | Please report any issues in the following issue tracker on GitHub. 9 | 10 | https://github.com/googlei18n/language-resources/issues 11 | 12 |
13 | See LICENSE file for license information. 14 |
15 | Copyright 2018, 2019 Google, Inc. 16 |
17 | If you use this data in publications, please cite it as follows: 18 |
19 | @inproceedings{he-etal-2020-open,
20 | title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and Telugu Speech Synthesis Systems}},
21 | author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
22 | booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
23 | month = may,
24 | year = {2020},
25 | address = {Marseille, France},
26 | publisher = {European Language Resources Association (ELRA)},
27 | pages = {6494--6503},
28 | url = {https://www.aclweb.org/anthology/2020.lrec-1.800},
29 | ISBN = {979-10-95546-34-4},
30 | }
31 |
32 |
--------------------------------------------------------------------------------
/resources/60/about.html:
--------------------------------------------------------------------------------
1 | LibriTTS is a multi-speaker English corpus of approximately 585 hours of read English speech at 24kHz sampling rate, prepared by Heiga Zen with the assistance of Google Speech and Google Brain team members.
2 |
3 | The LibriTTS corpus is designed for TTS research. It is derived from the original materials (mp3 audio files from LibriVox and text files from Project Gutenberg) of the LibriSpeech corpus.
4 | The main differences from the LibriSpeech corpus are listed below:
5 | 16 | The MD5 checksums of the downloads are as follows (note: not everyone will want to know this). 17 |
18 | 0c3076c1e5245bb3f0af7d82087ee207 dev-clean.tar.gz 19 | 815555d8d75995782ac3ccd7f047213d dev-other.tar.gz 20 | 7bed3bdb047c4c197f1ad3bc412db59f test-clean.tar.gz 21 | ae3258249472a13b5abef2a816f733e4 test-other.tar.gz 22 | 4a8c202b78fe1bc0c47916a98f3a2ea8 train-clean-100.tar.gz 23 | a84ef10ddade5fd25df69596a2767b2d train-clean-360.tar.gz 24 | 7b181dd5ace343a5f38427999684aa6f train-other-500.tar.gz 25 |26 | 27 | -------------------------------------------------------------------------------- /resources/83/info.txt: -------------------------------------------------------------------------------- 1 | name: Crowdsourced high-quality UK and Ireland English Dialect speech data set. 2 | summary: Data set which contains male and female recordings of English from various dialects of the UK and Ireland. 3 | category: speech 4 | license: Attribution-ShareAlike 4.0 International 5 | file: about.html Information about the data set 6 | file: LICENSE License information for the data set 7 | file: line_index_all.csv All utterances in the data set. 8 | file: dialect_info.txt Information about the dialects represented in the data 9 | set. 10 | file: irish_english_male.zip Archive file with recordings from the speakers of 11 | Irish English. 12 | file: midlands_english_female.zip Archive file with recordings from the female 13 | midlands English speakers. 14 | file: midlands_english_male.zip Archive file with recordings from the male 15 | midlands English speakers. 16 | file: northern_english_female.zip Archive file with recordings from the female 17 | northern English speakers. 18 | file: northern_english_male.zip Archive file with recordings from the male 19 | northern English speakers. 20 | file: scottish_english_female.zip Archive file with recordings from the female 21 | Scottish English speakers. 22 | file: scottish_english_male.zip Archive file with recordings from the male 23 | Scottish English speakers. 
24 | file: southern_english_female.zip Archive file with recordings from the female 25 | southern English speakers. 26 | file: southern_english_male.zip Archive file with recordings from the male 27 | southern English speakers. 28 | file: welsh_english_female.zip Archive file with recordings from the female 29 | Welsh english speakers. 30 | file: welsh_english_male.zip Archive file with recordings from the male Welsh 31 | English speakers. 32 | --------------------------------------------------------------------------------