├── data ├── validation │ ├── swc │ │ ├── invalid_tracks.json │ │ ├── invalid_character_ratio.json │ │ ├── invalid_all.json │ │ └── invalid_transcripts.json │ ├── mailabs │ │ ├── invalid_tracks.json │ │ ├── invalid_transcripts.json │ │ ├── invalid_all.json │ │ └── invalid_character_ratio.json │ ├── tuda │ │ ├── invalid_transcripts.json │ │ ├── invalid_tracks.json │ │ ├── invalid_all.json │ │ └── invalid_character_ratio.json │ ├── voxforge │ │ ├── invalid_tracks.json │ │ ├── invalid_character_ratio.json │ │ ├── invalid_all.json │ │ └── invalid_transcripts.json │ └── common_voice │ │ ├── invalid_character_ratio.json │ │ ├── invalid_all.json │ │ ├── invalid_tracks.json │ │ └── invalid_transcripts.json ├── .gitignore ├── state.json └── corpus_stats.json ├── requirements.txt ├── custom_formats.sh ├── scripts ├── jasperize.py ├── waverize.py ├── normalize_text.py ├── corpus_infos.py ├── download.py ├── equivalence.py ├── validate.py └── merge_and_subset.py ├── LICENSE ├── .gitignore ├── recreate.sh ├── src └── validators.py ├── create.sh ├── README.md └── notebooks ├── durations.ipynb └── analyze_validation.ipynb /data/validation/swc/invalid_tracks.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /data/validation/mailabs/invalid_tracks.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /data/validation/tuda/invalid_transcripts.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /data/validation/voxforge/invalid_tracks.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- 
/data/validation/mailabs/invalid_transcripts.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==7.0 2 | tqdm==4.38.0 3 | spoteno==0.1.1 4 | audiomate==5.1.0 5 | -------------------------------------------------------------------------------- /data/validation/tuda/invalid_tracks.json: -------------------------------------------------------------------------------- 1 | {"2014-03-24-13-39-24_Kinect-RAW": "Invalid shape: (0,)"} -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | download/ 2 | full/ 3 | full_normalized/ 4 | full_waverized/ 5 | full_jasperized/ 6 | -------------------------------------------------------------------------------- /data/validation/voxforge/invalid_character_ratio.json: -------------------------------------------------------------------------------- 1 | {"anonymous-20081112-ssu-de10-024": 29.53846153846154, "john_doe-20160503-gaj-de7-082": 130.66666666666666, "rwunsch-20090706-any-de2-26": 52.0} -------------------------------------------------------------------------------- /data/validation/voxforge/invalid_all.json: -------------------------------------------------------------------------------- 1 | ["Manu-20140327-m37-deM37-47", "Manu-20140328-cc1-deCC1-03", "Manu-20140328-cc1-deCC1-12", "Manu-20140328-cc1-deCC1-21", "Manu-20140421-MUS-deManu-201-01", "anonymous-20080405-phz-de5-088", "anonymous-20081112-ssu-de10-024", "john_doe-20160503-gaj-de7-082", "justmoon-20080204-hbp-de5-088", "rwunsch-20090706-any-de2-26"] -------------------------------------------------------------------------------- /custom_formats.sh: 
# Make the project-local modules in src/ importable by the scripts.
# The ${VAR:+...} form avoids a leading ':' (an empty entry) when
# PYTHONPATH is not set yet.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$(pwd)/src"

# This is the path where all output is stored
out_path=data

echo "##############################################################"
echo "# Jasperize"
echo "##############################################################"
# Input is the waverized corpus, output the jasperized one.
wave_path="$out_path/full_waverized"
jasper_path="$out_path/full_jasperized"
# Quote the paths so that a value containing whitespace stays one argument.
python scripts/jasperize.py "$wave_path" "$jasper_path"
import os
import click

import audiomate
from audiomate.corpus import io


@click.command()
@click.argument('in_folder', type=click.Path(exists=True))
@click.argument('out_folder', type=click.Path())
@click.option('--base-folder', default=None, type=click.Path())
def run(in_folder, out_folder, base_folder):
    """Write the corpus loaded from IN_FOLDER to OUT_FOLDER using audiomate's NvidiaJasperWriter."""
    # An existing output folder is treated as "work already done".
    if os.path.exists(out_folder):
        print('Already jasperized')
        return

    # Without an explicit --base-folder the parent of the output folder is used.
    if base_folder is None:
        base_folder = os.path.dirname(out_folder)

    w = io.NvidiaJasperWriter(
        num_workers=8,
        no_check=True,
        data_base_path=base_folder
    )

    print('Load source corpus')
    ds = audiomate.Corpus.load(in_folder)

    # Create the output folder only after the source corpus was loaded.
    # Creating it earlier would leave an empty folder behind if loading fails,
    # and the existence check above would then skip the conversion on the
    # next run.
    target_audio_path = os.path.join(out_folder, 'audio')
    os.makedirs(target_audio_path)

    print('Save jasper corpus')
    w.save(ds, out_folder)


if __name__ == '__main__':
    run()
@click.argument('out_folder', type=click.Path()) 11 | def run(full_folder, out_folder): 12 | if not os.path.exists(out_folder): 13 | converter = conversion.WavAudioFileConverter( 14 | num_workers=8, 15 | sampling_rate=16000, 16 | separate_file_per_utterance=True, 17 | force_conversion=False 18 | ) 19 | 20 | target_audio_path = os.path.join(out_folder, 'audio') 21 | os.makedirs(target_audio_path) 22 | 23 | print('Load source corpus') 24 | ds = audiomate.Corpus.load(full_folder) 25 | print('Convert') 26 | waverized_ds = converter.convert(ds, target_audio_path) 27 | print('Save converted corpus') 28 | waverized_ds.save_at(out_folder) 29 | else: 30 | print('Already waverized') 31 | 32 | 33 | if __name__ == '__main__': 34 | run() 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2019 Matthias Büchi, https://github.com/ynop/megs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | bin/ 26 | man/ 27 | pip-selfcheck.json 28 | pyvenv.cfg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | docs/build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | # OSX 65 | .DS_Store 66 | 67 | # JetBrains 68 | .idea 69 | 70 | # venv 71 | venv/ 72 | .venv/ 73 | .35venv/ 74 | 75 | playground 76 | 77 | pages-clone/ 78 | 79 | .benchmarks/ 80 | 81 | test.ipynb 82 | 83 | playground/ 84 | -------------------------------------------------------------------------------- /recreate.sh: -------------------------------------------------------------------------------- 1 | # This is the path where all output is stored 2 | export PYTHONPATH=$PYTHONPATH:$(pwd)/src 3 | out_path=data 4 | mkdir -p $out_path 5 | 6 | echo "##############################################################" 7 | echo "# Download corpora" 8 | echo "##############################################################" 9 | dl_path=$out_path/download 10 | python scripts/download.py $dl_path 11 | 12 | echo "##############################################################" 13 | echo "# Merge and subset" 14 | echo "##############################################################" 15 | dl_path=$out_path/download 16 | full_path=$out_path/full 17 | python scripts/merge_and_subset.py $dl_path $full_path 18 | 19 | echo "##############################################################" 20 | echo "# Check equivalence state" 21 | echo "##############################################################" 22 | python scripts/equivalence.py check $out_path 23 | 24 | echo "##############################################################" 25 | echo "# Normalize transcripts" 26 | echo "##############################################################" 27 | 
import spoteno

import audiomate
from audiomate.corpus.validation import base


class TextNormalizationValidator(base.Validator):
    """
    Validator that reports utterances whose transcript
    cannot be normalized cleanly with the German spoteno normalizer.
    """

    def __init__(self):
        self.normalizer = spoteno.Normalizer.de()

    def name(self):
        return 'Normalization-Validator'

    def validate(self, corpus):
        """
        Run the normalizer in debug mode over all transcripts of the corpus.

        An utterance is reported as invalid if the normalizer returns
        invalid characters for it or if the normalized output is empty.

        Args:
            corpus (Corpus): The corpus to test/validate.

        Returns:
            InvalidItemsResult: Maps the id of every invalid utterance to
            a tuple ``(transcript, list of invalid characters)``.
        """
        label_key = audiomate.corpus.LL_WORD_TRANSCRIPT

        # Collect ids and transcripts in parallel lists, so that the
        # position in the normalizer output identifies the utterance.
        ids = []
        texts = []

        for utterance in corpus.utterances.values():
            texts.append(utterance.label_lists[label_key].join())
            ids.append(utterance.idx)

        debug_output = self.normalizer.debug_list(texts)

        rejected = {}

        for position, (normalized, bad_chars) in enumerate(debug_output):
            # Clean result: nothing invalid and a non-empty output.
            if len(bad_chars) <= 0 and len(normalized) > 0:
                continue

            rejected[ids[position]] = (texts[position], list(bad_chars))

        return base.InvalidItemsResult(
            len(rejected) <= 0,
            rejected,
            name=self.name(),
            info={}
        )
39 | ll_normalized = annotations.LabelList.create_single( 40 | normalized, 41 | audiomate.corpus.LL_WORD_TRANSCRIPT 42 | ) 43 | 44 | ds.utterances[utt_idx].set_label_list(ll_orig) 45 | ds.utterances[utt_idx].set_label_list(ll_normalized) 46 | 47 | print('Save normalized corpus') 48 | os.makedirs(out_folder, exist_ok=True) 49 | ds.save_at(out_folder) 50 | else: 51 | print('Already normalized') 52 | 53 | 54 | if __name__ == '__main__': 55 | run() 56 | -------------------------------------------------------------------------------- /data/state.json: -------------------------------------------------------------------------------- 1 | {"meta_files": {"subview_train.txt": "88baba2420194ce0a8dfa6d0227f18bd", "files.txt": "cd191451cc4517faaeb1042b5fb75e48", "subview_dev_tuda.txt": "ab23c25d2b9fa294e43c35416b73eb01", "subview_dev.txt": "5ee1ac86796163d6a34d235cf44da84f", "subview_test_common_voice.txt": "6082d635e4cf89a1599e54c99fa8830b", "subview_full_mailabs.txt": "69402eb088e6a8318b2ebbc84a6ece79", "subview_full_voxforge.txt": "7852af57463b057692d6b06616cbda16", "subview_full_tuda.txt": "22e87bbea53d14758c083c1ed87bdf64", "labels_word-transcript-raw.txt": "4527b125a20415e645f5c0e80c0f2aff", "utterances.txt": "171ce6aa2745de9d892f82cb135ef5e6", "subview_test.txt": "a89f7a7fdb000e4f942bde8800a2dce4", "subview_full_common_voice.txt": "811134413761723187b6ef82e54f41e3", "features.txt": "d41d8cd98f00b204e9800998ecf8427e", "subview_train_tuda.txt": "50676e20310c519ec529c6dc0d000333", "subview_dev_common_voice.txt": "d58e188a69000e89be364c146d332125", "subview_test_tuda.txt": "790b4324e01ab7385f90d5dc01f23bf2", "issuers.json": "8c54bd69a25fa1e5e9bff27dbece50b2", "subview_test_swc.txt": "26c89014aad3d3991fad387e162d3133", "subview_dev_mailabs.txt": "1560e8fea53f1528d1329fac88a17349", "subview_dev_voxforge.txt": "34bae59b59e38dec9a1bbc9d1a06ed57", "subview_train_common_voice.txt": "9578c8f4f12f609179a00d2785e6c7c6", "subview_train_mailabs.txt": "8e3b374c5ce2066b6acd93977d39c984", 
"subview_full_swc.txt": "8f360a9821da137981194e7dfaec5216", "subview_dev_swc.txt": "721edb2033ce5b42a7b9b2d6004ca6cc", "subview_test_mailabs.txt": "1560e8fea53f1528d1329fac88a17349", "labels_word-transcript.txt": "85f53d39fb380d9dc41c5e5d1a07c79c", "utt_issuers.txt": "bf6bb1a4f21b78247ff44620f073fd99", "subview_train_swc.txt": "79dccdd43e2bbd76729e119425216ad4", "subview_test_voxforge.txt": "5a2d067fa3cf92154d63775e67a880fd", "subview_train_voxforge.txt": "72cba887532f5a7189c597acb262369e", "audio.txt": "d41d8cd98f00b204e9800998ecf8427e"}, "audio_files": "67f9042b7ddcbe434e141240986ea1fa"} -------------------------------------------------------------------------------- /scripts/corpus_infos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | 4 | import audiomate 5 | from audiomate.utils import jsonfile 6 | 7 | 8 | @click.group() 9 | def cli(): 10 | pass 11 | 12 | 13 | @cli.command() 14 | @click.argument('download_folder', type=click.Path(exists=True)) 15 | @click.argument('output_path', type=click.Path()) 16 | def downloaded(download_folder, output_path): 17 | corpora_names = [ 18 | ('common_voice', 'common-voice'), 19 | ('mailabs', 'mailabs'), 20 | ('swc', 'swc'), 21 | ('tuda', 'tuda'), 22 | ('voxforge', 'voxforge'), 23 | ] 24 | 25 | infos = {} 26 | 27 | if os.path.isfile(output_path): 28 | print('Info file already there') 29 | return 30 | 31 | for name, reader_type in corpora_names: 32 | full_path = os.path.join(download_folder, name) 33 | cinfo = get_corpus_info(name, full_path, reader_type) 34 | infos[name] = cinfo 35 | 36 | jsonfile.write_json_to_file(output_path, infos) 37 | 38 | 39 | @cli.command() 40 | @click.argument('full_path', type=click.Path(exists=True)) 41 | @click.argument('output_path', type=click.Path()) 42 | def full(full_path, output_path): 43 | if os.path.isfile(output_path): 44 | print('Info file already there') 45 | return 46 | 47 | cinfo = get_corpus_info('full', full_path, 
import os
import click
from audiomate.corpus import io


def _download_if_missing(label, target_path, make_downloader):
    """Download one corpus to ``target_path`` unless that folder already exists.

    Args:
        label (str): Corpus name used in the progress messages.
        target_path (str): Folder the corpus is downloaded to.
        make_downloader (callable): Zero-argument factory returning the
            downloader; it is only called when a download is needed.
    """
    if os.path.isdir(target_path):
        print('{} already exists'.format(label))
        return

    print('Download {}'.format(label))
    os.makedirs(target_path, exist_ok=True)
    make_downloader().download(target_path)


@click.command()
@click.argument('download_folder', type=click.Path())
def run(download_folder):
    """Download Voxforge, TUDA, SWC and M-AILABS into DOWNLOAD_FOLDER.

    Common-Voice has to be placed in DOWNLOAD_FOLDER/common_voice manually.
    """

    #
    #   Common-Voice
    #

    common_voice_path = os.path.join(download_folder, 'common_voice')

    if not os.path.isdir(common_voice_path):
        # Report the path that was actually checked, not a hard-coded one.
        print(
            'You have to download common-voice de corpus manually '
            'to {}!'.format(common_voice_path)
        )
        # A missing prerequisite is an error, so exit with a non-zero status.
        # SystemExit is raised directly because the ``exit`` helper is only
        # injected by the ``site`` module.
        raise SystemExit(1)

    #
    #   Voxforge
    #

    _download_if_missing(
        'Voxforge',
        os.path.join(download_folder, 'voxforge'),
        lambda: io.VoxforgeDownloader(lang='de', num_workers=8)
    )

    #
    #   TUDA
    #

    _download_if_missing(
        'TUDA',
        os.path.join(download_folder, 'tuda'),
        lambda: io.TudaDownloader()
    )

    #
    #   SWC
    #

    _download_if_missing(
        'SWC',
        os.path.join(download_folder, 'swc'),
        lambda: io.SWCDownloader()
    )

    #
    #   M-AILABS
    #

    _download_if_missing(
        'Mailabs',
        os.path.join(download_folder, 'mailabs'),
        lambda: io.MailabsDownloader(tags=['de_DE'])
    )


if __name__ == '__main__':
    run()
scripts/download.py $dl_path 11 | 12 | echo "##############################################################" 13 | echo "# Get infos about downloaded corpora" 14 | echo "##############################################################" 15 | dl_path=$out_path/download 16 | info_path=$dl_path/infos.json 17 | python scripts/corpus_infos.py downloaded $dl_path $info_path 18 | 19 | echo "##############################################################" 20 | echo "# Validate corpora" 21 | echo "##############################################################" 22 | dl_path=$out_path/download 23 | val_path=$out_path/validation 24 | python scripts/validate.py $dl_path $val_path 25 | 26 | 27 | # The results from the validation step (invalid utterances) 28 | # have to be incorporated to audiomate manually. 29 | 30 | 31 | echo "##############################################################" 32 | echo "# Merge and subset" 33 | echo "##############################################################" 34 | dl_path=$out_path/download 35 | full_path=$out_path/full 36 | python scripts/merge_and_subset.py $dl_path $full_path 37 | 38 | echo "##############################################################" 39 | echo "# Get infos about merged corpus" 40 | echo "##############################################################" 41 | full_path=$out_path/full 42 | info_path=$out_path/corpus_stats.json 43 | python scripts/corpus_infos.py full $full_path $info_path 44 | 45 | echo "##############################################################" 46 | echo "# Generate equivalence state" 47 | echo "##############################################################" 48 | python scripts/equivalence.py generate $out_path 49 | 50 | echo "##############################################################" 51 | echo "# Normalize transcripts" 52 | echo "##############################################################" 53 | full_path=$out_path/full 54 | norm_path=$out_path/full_normalized 55 | python scripts/normalize_text.py 
$full_path $norm_path 56 | 57 | echo "##############################################################" 58 | echo "# Waverize" 59 | echo "##############################################################" 60 | in_path=$out_path/full_normalized 61 | wave_path=$out_path/full_waverized 62 | python scripts/waverize.py $in_path $wave_path 63 | -------------------------------------------------------------------------------- /data/validation/common_voice/invalid_all.json: -------------------------------------------------------------------------------- 1 | ["common_voice_de_17304025", "common_voice_de_17304237", "common_voice_de_17312340", "common_voice_de_17312459", "common_voice_de_17318558", "common_voice_de_17337465", "common_voice_de_17359556", "common_voice_de_17427362", "common_voice_de_17430374", "common_voice_de_17430388", "common_voice_de_17431076", "common_voice_de_17507821", "common_voice_de_17517255", "common_voice_de_17551990", "common_voice_de_17619678", "common_voice_de_17623181", "common_voice_de_17635068", "common_voice_de_17637960", "common_voice_de_17639623", "common_voice_de_17649508", "common_voice_de_17651927", "common_voice_de_17661976", "common_voice_de_17663955", "common_voice_de_17670257", "common_voice_de_17678251", "common_voice_de_17686222", "common_voice_de_17700040", "common_voice_de_17705921", "common_voice_de_17707923", "common_voice_de_17712524", "common_voice_de_17738082", "common_voice_de_17747749", "common_voice_de_17772419", "common_voice_de_17800214", "common_voice_de_17816688", "common_voice_de_17816892", "common_voice_de_17823454", "common_voice_de_17839106", "common_voice_de_17858947", "common_voice_de_17876304", "common_voice_de_17876483", "common_voice_de_17876495", "common_voice_de_17876535", "common_voice_de_17876537", "common_voice_de_17876539", "common_voice_de_17876540", "common_voice_de_17876541", "common_voice_de_17876542", "common_voice_de_17876543", "common_voice_de_17876544", "common_voice_de_17876545", 
"common_voice_de_17876546", "common_voice_de_17876547", "common_voice_de_17876548", "common_voice_de_17876549", "common_voice_de_17876550", "common_voice_de_17876551", "common_voice_de_17876552", "common_voice_de_17876553", "common_voice_de_17876554", "common_voice_de_17876555", "common_voice_de_17876556", "common_voice_de_17876557", "common_voice_de_17876716", "common_voice_de_17906038", "common_voice_de_17986369", "common_voice_de_17990660", "common_voice_de_17993400", "common_voice_de_17993841", "common_voice_de_17999384", "common_voice_de_17999886", "common_voice_de_18002815", "common_voice_de_18024223", "common_voice_de_18034794", "common_voice_de_18042126", "common_voice_de_18099491", "common_voice_de_18101719", "common_voice_de_18110716", "common_voice_de_18154909", "common_voice_de_18193859", "common_voice_de_18200892", "common_voice_de_18203387", "common_voice_de_18219580", "common_voice_de_18223710", "common_voice_de_18227509", "common_voice_de_18233607", "common_voice_de_18235404", "common_voice_de_18235827", "common_voice_de_18265766", "common_voice_de_18289592", "common_voice_de_18326542", "common_voice_de_18359373", "common_voice_de_18366292", "common_voice_de_18366389", "common_voice_de_18381641", "common_voice_de_18384106", "common_voice_de_18385886", "common_voice_de_18389904", "common_voice_de_18397436"] -------------------------------------------------------------------------------- /data/validation/common_voice/invalid_tracks.json: -------------------------------------------------------------------------------- 1 | {"common_voice_de_17852719": "Invalid shape: (0,)", "common_voice_de_17623418": "Invalid shape: (0,)", "common_voice_de_17623419": "Invalid shape: (0,)", "common_voice_de_17723124": "Invalid shape: (0,)", "common_voice_de_17876495": "EOFError", "common_voice_de_17876535": "EOFError", "common_voice_de_17876537": "EOFError", "common_voice_de_17876539": "EOFError", "common_voice_de_17876550": "EOFError", "common_voice_de_17876551": 
"EOFError", "common_voice_de_17876553": "EOFError", "common_voice_de_17876555": "EOFError", "common_voice_de_17876557": "EOFError", "common_voice_de_18222520": "Invalid shape: (0,)", "common_voice_de_17876540": "EOFError", "common_voice_de_17876541": "EOFError", "common_voice_de_17876542": "EOFError", "common_voice_de_17876543": "EOFError", "common_voice_de_17876544": "EOFError", "common_voice_de_17876552": "EOFError", "common_voice_de_17876554": "EOFError", "common_voice_de_17876556": "EOFError", "common_voice_de_17309021": "Invalid shape: (0,)", "common_voice_de_17309026": "Invalid shape: (0,)", "common_voice_de_18211735": "Invalid shape: (0,)", "common_voice_de_17876483": "EOFError", "common_voice_de_17876545": "EOFError", "common_voice_de_17876546": "EOFError", "common_voice_de_17876547": "EOFError", "common_voice_de_17876548": "EOFError", "common_voice_de_17876549": "EOFError", "common_voice_de_18908331": "Invalid shape: (0,)", "common_voice_de_18265024": "Invalid shape: (0,)", "common_voice_de_17337870": "Invalid shape: (0,)", "common_voice_de_18166806": "Invalid shape: (0,)", "common_voice_de_18644789": "Invalid shape: (0,)", "common_voice_de_18644790": "Invalid shape: (0,)", "common_voice_de_17554604": "Invalid shape: (0,)", "common_voice_de_17300056": "Invalid shape: (0,)", "common_voice_de_17300186": "Invalid shape: (0,)", "common_voice_de_17301890": "Invalid shape: (0,)", "common_voice_de_17315454": "Invalid shape: (0,)", "common_voice_de_17315455": "Invalid shape: (0,)", "common_voice_de_17317509": "Invalid shape: (0,)", "common_voice_de_17318246": "Invalid shape: (0,)", "common_voice_de_17318270": "Invalid shape: (0,)", "common_voice_de_17318274": "Invalid shape: (0,)", "common_voice_de_17318421": "Invalid shape: (0,)", "common_voice_de_17318454": "Invalid shape: (0,)", "common_voice_de_17318564": "Invalid shape: (0,)", "common_voice_de_17318592": "Invalid shape: (0,)", "common_voice_de_17349179": "Invalid shape: (0,)", "common_voice_de_17349187": 
"Invalid shape: (0,)", "common_voice_de_17356806": "Invalid shape: (0,)", "common_voice_de_17356807": "Invalid shape: (0,)", "common_voice_de_17356808": "Invalid shape: (0,)", "common_voice_de_17364873": "Invalid shape: (0,)", "common_voice_de_17364879": "Invalid shape: (0,)", "common_voice_de_17365333": "Invalid shape: (0,)", "common_voice_de_17365334": "Invalid shape: (0,)", "common_voice_de_17365335": "Invalid shape: (0,)", "common_voice_de_17372424": "Invalid shape: (0,)", "common_voice_de_18366727": "Invalid shape: (0,)", "common_voice_de_18510751": "Invalid shape: (0,)", "common_voice_de_17774338": "Invalid shape: (0,)", "common_voice_de_17780288": "Invalid shape: (0,)", "common_voice_de_17987930": "Invalid shape: (0,)", "common_voice_de_18348514": "Invalid shape: (0,)"} -------------------------------------------------------------------------------- /data/corpus_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "duration": 3676716.407468239, 3 | "num_utterances": 797627, 4 | "num_issuers": 32374, 5 | "subviews": { 6 | "train": { 7 | "duration": 1932846.8184240141, 8 | "num_utterances": 444185, 9 | "num_issuers": 27792 10 | }, 11 | "dev_tuda": { 12 | "duration": 8692.799999999996, 13 | "num_utterances": 1079, 14 | "num_issuers": 16 15 | }, 16 | "dev": { 17 | "duration": 63890.773125000036, 18 | "num_utterances": 14678, 19 | "num_issuers": 1151 20 | }, 21 | "test_common_voice": { 22 | "duration": 27754.056000000062, 23 | "num_utterances": 5632, 24 | "num_issuers": 1901 25 | }, 26 | "full_mailabs": { 27 | "duration": 841174.3460313546, 28 | "num_utterances": 118390, 29 | "num_issuers": 26441 30 | }, 31 | "full_voxforge": { 32 | "duration": 114093.00250000002, 33 | "num_utterances": 24088, 34 | "num_issuers": 328 35 | }, 36 | "full_tuda": { 37 | "duration": 659869.612937507, 38 | "num_utterances": 79092, 39 | "num_issuers": 179 40 | }, 41 | "test": { 42 | "duration": 65586.5639374999, 43 | "num_utterances": 
14412, 44 | "num_issuers": 2037 45 | }, 46 | "full_common_voice": { 47 | "duration": 1167084.4559999288, 48 | "num_utterances": 281112, 49 | "num_issuers": 4852 50 | }, 51 | "train_tuda": { 52 | "duration": 113364.99999999981, 53 | "num_utterances": 14109, 54 | "num_issuers": 146 55 | }, 56 | "dev_common_voice": { 57 | "duration": 25343.375999999935, 58 | "num_utterances": 5631, 59 | "num_issuers": 1010 60 | }, 61 | "test_tuda": { 62 | "duration": 8559.810000000003, 63 | "num_utterances": 1020, 64 | "num_issuers": 17 65 | }, 66 | "test_swc": { 67 | "duration": 15030.600000000035, 68 | "num_utterances": 4918, 69 | "num_issuers": 16 70 | }, 71 | "dev_mailabs": { "duration": 0, "num_utterances": 0, "num_issuers": 0 }, 72 | "dev_voxforge": { 73 | "duration": 14527.937125000017, 74 | "num_utterances": 2846, 75 | "num_issuers": 99 76 | }, 77 | "train_common_voice": { 78 | "duration": 36725.85599999993, 79 | "num_utterances": 8518, 80 | "num_issuers": 552 81 | }, 82 | "train_mailabs": { 83 | "duration": 840589.144986287, 84 | "num_utterances": 118371, 85 | "num_issuers": 26441 86 | }, 87 | "full_swc": { 88 | "duration": 894494.990000059, 89 | "num_utterances": 294945, 90 | "num_issuers": 569 91 | }, 92 | "dev_swc": { 93 | "duration": 15326.66000000006, 94 | "num_utterances": 5122, 95 | "num_issuers": 26 96 | }, 97 | "test_mailabs": { "duration": 0, "num_utterances": 0, "num_issuers": 0 }, 98 | "train_swc": { 99 | "duration": 856843.8500000545, 100 | "num_utterances": 284787, 101 | "num_issuers": 527 102 | }, 103 | "test_voxforge": { 104 | "duration": 14242.097937500053, 105 | "num_utterances": 2842, 106 | "num_issuers": 103 107 | }, 108 | "train_voxforge": { 109 | "duration": 85322.96743750015, 110 | "num_utterances": 18400, 111 | "num_issuers": 126 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /scripts/equivalence.py: -------------------------------------------------------------------------------- 1 | 
# Root command group; the ``generate`` and ``check`` commands below are
# registered on it. (Comments are used instead of docstrings on the click
# callbacks so that the generated --help text stays exactly as it was.)
@click.group()
def cli():
    pass


# ``generate``: compute the state of ``<data_folder>/full`` and store it
# as JSON in ``<data_folder>/state.json``.
@cli.command()
@click.argument('data_folder', type=click.Path(exists=True))
def generate(data_folder):
    corpus_dir = os.path.join(data_folder, 'full')
    snapshot_file = os.path.join(data_folder, 'state.json')

    snapshot = generate_state(corpus_dir)

    with open(snapshot_file, 'w') as handle:
        json.dump(snapshot, handle)


# ``check``: load the stored state, recompute the state of
# ``<data_folder>/full`` and print whether both are equal.
@cli.command()
@click.argument('data_folder', type=click.Path(exists=True))
def check(data_folder):
    corpus_dir = os.path.join(data_folder, 'full')

    # Read the reference first, so a missing state.json fails before the
    # (slow) hashing of the corpus starts.
    with open(os.path.join(data_folder, 'state.json'), 'r') as handle:
        expected = json.load(handle)

    matches = compare(expected, generate_state(corpus_dir))

    if not matches:
        print('NOT OK - Your data differs from the state of the repository')
        return

    print('OK - Your data matches the state of the repository')
def compare(reference, actual):
    """
    Compare a stored reference state with a freshly generated state.

    Both states are dicts with a ``'meta_files'`` mapping
    (filename -> hash value) and an ``'audio_files'`` hash value.
    Every difference found is printed.

    Args:
        reference (dict): The state to compare against (e.g. state.json).
        actual (dict): The state computed from the local data.

    Returns:
        bool: ``True`` if every meta file of the reference is present with
        the same hash in ``actual`` and the audio hash is equal,
        ``False`` otherwise. Meta files that exist only in ``actual``
        are ignored.
    """
    ok = True
    actual_meta = actual.get('meta_files', {})

    for meta_file, hash_value in reference['meta_files'].items():
        if meta_file not in actual_meta:
            # A missing file is a difference, not a reason to crash
            # with a KeyError before the remaining files are checked.
            print('Meta file {} is missing'.format(meta_file))
            ok = False
        elif hash_value != actual_meta[meta_file]:
            print('Hash value of {} differs'.format(meta_file))
            ok = False

    if reference['audio_files'] != actual.get('audio_files'):
        print('Hash value of audio files differs')
        ok = False

    return ok
def run_validation(corpus, output_path):
    """
    Run all validations for ``corpus`` and store the reports.

    Every check writes its own JSON report into ``output_path``.
    The sorted ids of all utterances flagged by any check are written
    to ``invalid_all.json``.

    Args:
        corpus: The corpus to validate.
        output_path (str): Folder for the reports (created if needed).
    """
    os.makedirs(output_path, exist_ok=True)

    all_invalid = set()
    all_invalid.update(find_invalid_audio_tracks(output_path, corpus))
    all_invalid.update(find_invalid_character_ratios(output_path, corpus))
    all_invalid.update(find_invalid_transcripts(output_path, corpus))

    all_report_path = os.path.join(output_path, 'invalid_all.json')
    write_report(all_report_path, sorted(all_invalid))


def _load_or_create_report(report_path, title, validate):
    """
    Return the report stored at ``report_path``; if that file does not
    exist, call ``validate()``, store its ``invalid_items`` and return them.
    """
    if os.path.isfile(report_path):
        report = read_report(report_path)
        print('{} - Already done'.format(title))
    else:
        print('{} ...'.format(title))
        report = validate().invalid_items
        write_report(report_path, report)

    return report


def find_invalid_audio_tracks(output_path, corpus):
    """
    Find utterances that belong to a track that failed track validation.

    The report is read from / written to ``invalid_tracks.json``.

    Returns:
        list: Ids of all utterances whose track is listed in the report.
    """
    report_path = os.path.join(output_path, 'invalid_tracks.json')

    invalid_tracks = _load_or_create_report(
        report_path,
        'Validate tracks',
        lambda: validation.TrackReadValidator(num_workers=4).validate(corpus)
    )

    invalid_utts = []

    for utt in corpus.utterances.values():
        # The report is keyed by id strings (it round-trips through JSON),
        # so look up the id of the track and not the track object itself.
        track_idx = getattr(utt.track, 'idx', utt.track)

        if track_idx in invalid_tracks:
            invalid_utts.append(utt.idx)

    return invalid_utts


def find_invalid_character_ratios(output_path, corpus):
    """
    Find utterances that exceed 25 transcript characters per second.

    The report is read from / written to ``invalid_character_ratio.json``.

    Returns:
        The keys (utterance ids) of the report.
    """
    report_path = os.path.join(output_path, 'invalid_character_ratio.json')

    invalid_utts = _load_or_create_report(
        report_path,
        'Validate character ratio',
        lambda: validation.UtteranceTranscriptionRatioValidator(
            max_characters_per_second=25,
            label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
            num_threads=4
        ).validate(corpus)
    )

    return invalid_utts.keys()


def find_invalid_transcripts(output_path, corpus):
    """
    Find utterances with transcripts that can't be normalized.

    The report is read from / written to ``invalid_transcripts.json``.

    Returns:
        The keys (utterance ids) of the report.
    """
    report_path = os.path.join(output_path, 'invalid_transcripts.json')

    invalid_utts = _load_or_create_report(
        report_path,
        'Validate transcript normalization',
        lambda: validators.TextNormalizationValidator().validate(corpus)
    )

    return invalid_utts.keys()


def write_report(path, report):
    """Write ``report`` as UTF-8 JSON (non-ASCII kept as-is) to ``path``."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False)


def read_report(path):
    """Load and return the UTF-8 JSON report stored at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
8 | 9 | ## Recreate 10 | In order to recreate the same corpus as in this repository, 11 | execute the commands in the script ``recreate.sh``. 12 | The script performs the following steps. 13 | 14 | 1. Downloads all corpora to ``data/download``. Only the common-voice corpus has to be downloaded manually and placed 15 | inside ``data/download/common_voice``. 16 | 17 | 2. Merges all corpora into a single one. It also creates specific subsets for train/dev/test. 18 | 19 | 3. Checks if the created corpus is equal to the given state of the repository. 20 | This is done by comparing hash values against the hash values in the file ``data/state.json``. 21 | 22 | 4. If needed, the corpus can be converted to wave files only. 23 | This will make sure every utterance is in a separate wave file with a sampling rate of 16000 Hz. 24 | 25 | ## Corpus usage 26 | The final corpus is stored in ``data/full``. 27 | The format of the corpus is the default format of the [audiomate library](https://github.com/ynop/audiomate). 28 | It is described in [audiomate default format](https://audiomate.readthedocs.io/en/latest/documentation/default_format.html). 29 | 30 | Audiomate can also be used to read the corpus: 31 | ```python 32 | import audiomate 33 | 34 | corpus = audiomate.Corpus.load('data/full') 35 | utt = corpus.utterances['utt-idx'] 36 | transcript = utt.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT].join() 37 | samples = utt.read_samples(sr=16000) 38 | ``` 39 | Check out [https://github.com/ynop/audiomate](https://github.com/ynop/audiomate) for more information.
40 | 41 | ## Corpus Statistics 42 | 43 | | Part | h | Speakers | 44 | | -----------| -------| ----------------------------------------------------| 45 | | unfiltered | 1021.31 | not known due to the absence of info in M-Ailabs | 46 | | train | 536.90 | not known due to the absence of info in M-Ailabs | 47 | | dev | 17.75 | 1151 | 48 | | test | 18.22 | 2037 | 49 | | full_common_voice | 324.19 | 4852 | 50 | | train_common_voice | 10.20 | 552 | 51 | | dev_common_voice | 7.04 | 1010 | 52 | | test_common_voice | 7.71 | 1901 | 53 | | full_mailabs | 233.66 | - | 54 | | train_mailabs | 233.50 | - | 55 | | dev_mailabs | 0.00 | 0 | 56 | | test_mailabs | 0.00 | 0 | 57 | | full_swc | 248.47 | 569 | 58 | | train_swc | 238.01 | 527 | 59 | | dev_swc | 4.26 | 26 | 60 | | test_swc | 4.18 | 16 | 61 | | full_tuda | 183.30 | 179 | 62 | | train_tuda | 31.49 | 146 | 63 | | dev_tuda | 2.41 | 16 | 64 | | test_tuda | 2.38 | 17 | 65 | | full_voxforge | 31.69 | 328 | 66 | | train_voxforge | 23.70 | 126 | 67 | | dev_voxforge | 4.04 | 99 | 68 | | test_voxforge | 3.96 | 103 | 69 | 70 | ## Corpus sources 71 | 72 | | Name | URL | License | 73 | | --------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------| 74 | | Common-Voice | [https://voice.mozilla.org/en/datasets](https://voice.mozilla.org/en/datasets) | CC-0 | 75 | | TuDa | [https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/acoustic-models.html](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/acoustic-models.html) | CC-BY | 76 | | M-AILabs | [https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/](https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/) | See Page | 77 | | VoxForge | [http://www.voxforge.org/de](http://www.voxforge.org/de) | GPL | 78 | | SWC | [https://nats.gitlab.io/swc/](https://nats.gitlab.io/swc/) | CC BY-SA 4.0 | 79 | 80 | ## Create a new version 81 | The 
scripts ``create.sh`` contains the commands to create a new version of the corpus. 82 | 83 | ## Changelog 84 | 85 | | Version | Changes | 86 | | ----------|----------------------------| 87 | | v1 | Initial version | 88 | | v2 | Smaller test sets, Filter long utterances (> 25s) | 89 | -------------------------------------------------------------------------------- /data/validation/tuda/invalid_all.json: -------------------------------------------------------------------------------- 1 | ["2014-03-17-11-10-01_Microsoft-Kinect-Raw", "2014-03-17-11-10-01_Realtek", "2014-03-17-11-10-01_Yamaha", "2014-03-17-14-05-00_Microsoft-Kinect-Raw", "2014-03-17-14-05-00_Realtek", "2014-03-17-14-05-00_Yamaha", "2014-03-17-14-06-43_Microsoft-Kinect-Raw", "2014-03-17-14-06-43_Realtek", "2014-03-17-14-06-43_Yamaha", "2014-03-17-14-09-48_Microsoft-Kinect-Raw", "2014-03-17-14-09-48_Realtek", "2014-03-17-14-09-48_Yamaha", "2014-03-17-14-16-19_Microsoft-Kinect-Raw", "2014-03-17-14-16-19_Realtek", "2014-03-17-14-16-19_Yamaha", "2014-03-17-14-17-43_Microsoft-Kinect-Raw", "2014-03-17-14-17-43_Realtek", "2014-03-17-14-17-43_Yamaha", "2014-03-17-14-20-46_Microsoft-Kinect-Raw", "2014-03-17-14-20-46_Realtek", "2014-03-17-14-20-46_Yamaha", "2014-03-17-14-32-49_Microsoft-Kinect-Raw", "2014-03-17-14-32-49_Realtek", "2014-03-17-14-32-49_Yamaha", "2014-03-17-14-34-48_Microsoft-Kinect-Raw", "2014-03-17-14-34-48_Realtek", "2014-03-17-14-34-48_Yamaha", "2014-03-17-14-35-31_Microsoft-Kinect-Raw", "2014-03-17-14-35-31_Realtek", "2014-03-17-14-35-31_Yamaha", "2014-03-17-14-36-22_Microsoft-Kinect-Raw", "2014-03-17-14-36-22_Realtek", "2014-03-17-14-36-22_Yamaha", "2014-03-17-14-37-39_Microsoft-Kinect-Raw", "2014-03-17-14-37-39_Realtek", "2014-03-17-14-37-39_Yamaha", "2014-03-17-14-38-03_Microsoft-Kinect-Raw", "2014-03-17-14-38-03_Realtek", "2014-03-17-14-38-03_Yamaha", "2014-03-17-14-39-38_Microsoft-Kinect-Raw", "2014-03-17-14-39-38_Realtek", "2014-03-17-14-39-38_Yamaha", 
"2014-03-17-14-40-14_Microsoft-Kinect-Raw", "2014-03-17-14-40-14_Realtek", "2014-03-17-14-40-14_Yamaha", "2014-03-17-14-41-00_Microsoft-Kinect-Raw", "2014-03-17-14-41-00_Realtek", "2014-03-17-14-41-00_Yamaha", "2014-03-17-14-41-54_Microsoft-Kinect-Raw", "2014-03-17-14-41-54_Realtek", "2014-03-17-14-41-54_Yamaha", "2014-03-17-14-42-10_Microsoft-Kinect-Raw", "2014-03-17-14-42-10_Realtek", "2014-03-17-14-42-10_Yamaha", "2014-03-17-14-43-06_Microsoft-Kinect-Raw", "2014-03-17-14-43-06_Realtek", "2014-03-17-14-43-06_Yamaha", "2014-03-17-14-43-41_Microsoft-Kinect-Raw", "2014-03-17-14-43-41_Realtek", "2014-03-17-14-43-41_Yamaha", "2014-03-17-14-44-50_Microsoft-Kinect-Raw", "2014-03-17-14-44-50_Realtek", "2014-03-17-14-44-50_Yamaha", "2014-03-17-14-46-51_Microsoft-Kinect-Raw", "2014-03-17-14-46-51_Realtek", "2014-03-17-14-46-51_Yamaha", "2014-03-17-14-47-36_Microsoft-Kinect-Raw", "2014-03-17-14-47-36_Realtek", "2014-03-17-14-47-36_Yamaha", "2014-03-17-14-48-42_Microsoft-Kinect-Raw", "2014-03-17-14-48-42_Realtek", "2014-03-17-14-48-42_Yamaha", "2014-03-17-14-49-24_Microsoft-Kinect-Raw", "2014-03-17-14-49-24_Realtek", "2014-03-17-14-49-24_Yamaha", "2014-03-17-15-07-14_Microsoft-Kinect-Raw", "2014-03-17-15-07-14_Realtek", "2014-03-17-15-07-14_Yamaha", "2014-03-17-15-12-53_Microsoft-Kinect-Raw", "2014-03-17-15-12-53_Realtek", "2014-03-17-15-12-53_Yamaha", "2014-03-17-15-13-53_Microsoft-Kinect-Raw", "2014-03-17-15-13-53_Realtek", "2014-03-17-15-13-53_Yamaha", "2014-03-17-15-14-56_Microsoft-Kinect-Raw", "2014-03-17-15-14-56_Realtek", "2014-03-17-15-14-56_Yamaha", "2014-03-17-15-15-23_Microsoft-Kinect-Raw", "2014-03-17-15-15-23_Realtek", "2014-03-17-15-15-23_Yamaha", "2014-03-17-15-23-49_Microsoft-Kinect-Raw", "2014-03-17-15-23-49_Realtek", "2014-03-17-15-23-49_Yamaha", "2014-03-18-15-28-52_Kinect-Beam", "2014-03-18-15-28-52_Kinect-RAW", "2014-03-18-15-28-52_Realtek", "2014-03-18-15-28-52_Yamaha", "2014-03-18-15-34-19_Realtek", "2014-03-21-11-40-39_Samson", 
"2014-03-24-13-39-24_Kinect-Beam", "2014-03-24-13-39-24_Kinect-RAW", "2014-03-24-13-39-24_Realtek", "2014-03-24-13-39-24_Samson", "2014-03-24-13-39-24_Yamaha", "2014-08-04-13-06-09_Kinect-Beam", "2014-08-04-13-06-09_Kinect-RAW", "2014-08-04-13-06-09_Realtek", "2014-08-04-13-06-09_Samson", "2014-08-04-13-06-09_Yamaha", "2014-08-04-13-22-37_Kinect-Beam", "2014-08-04-13-22-37_Kinect-RAW", "2014-08-04-13-22-37_Realtek", "2014-08-04-13-22-37_Samson", "2014-08-04-13-22-37_Yamaha", "2014-08-04-13-22-49_Kinect-Beam", "2014-08-04-13-22-49_Kinect-RAW", "2014-08-04-13-22-49_Realtek", "2014-08-04-13-22-49_Samson", "2014-08-04-13-22-49_Yamaha", "2014-08-04-13-37-12_Kinect-Beam", "2014-08-04-13-37-12_Kinect-RAW", "2014-08-04-13-37-12_Realtek", "2014-08-04-13-37-12_Samson", "2014-08-04-13-37-12_Yamaha", "2014-08-27-11-05-29_Kinect-Beam", "2014-08-27-11-05-29_Kinect-RAW", "2014-08-27-11-05-29_Realtek", "2014-08-27-11-05-29_Samson", "2014-08-27-11-05-29_Yamaha", "2015-01-27-11-40-44_Samson", "2015-01-27-14-37-33_Kinect-Beam", "2015-01-27-14-37-33_Kinect-RAW", "2015-01-27-14-37-33_Realtek", "2015-01-27-14-37-33_Samson", "2015-01-27-14-37-33_Yamaha", "2015-01-28-11-49-53_Kinect-Beam", "2015-01-28-11-49-53_Kinect-RAW", "2015-01-28-11-49-53_Realtek", "2015-02-04-12-29-49_Kinect-Beam", "2015-02-04-12-29-49_Kinect-RAW", "2015-02-04-12-29-49_Realtek", "2015-02-04-12-29-49_Samson", "2015-02-04-12-29-49_Yamaha", "2015-02-04-12-36-32_Kinect-Beam", "2015-02-04-12-36-32_Kinect-RAW", "2015-02-04-12-36-32_Realtek", "2015-02-04-12-36-32_Samson", "2015-02-04-12-36-32_Yamaha", "2015-02-09-11-42-09_Realtek", "2015-02-09-11-42-09_Samson", "2015-02-09-12-36-46_Kinect-Beam", "2015-02-09-12-36-46_Kinect-RAW", "2015-02-09-12-36-46_Realtek", "2015-02-09-12-36-46_Samson", "2015-02-09-12-36-46_Yamaha", "2015-02-09-13-48-26_Kinect-Beam", "2015-02-09-13-48-26_Kinect-RAW", "2015-02-09-13-48-26_Realtek", "2015-02-09-13-48-26_Samson", "2015-02-09-13-48-26_Yamaha", "2015-02-09-15-07-19_Kinect-RAW", 
"2015-02-09-15-07-19_Realtek", "2015-02-09-15-07-19_Samson", "2015-02-09-15-07-19_Yamaha", "2015-02-10-13-45-07_Kinect-Beam", "2015-02-10-13-45-07_Kinect-RAW", "2015-02-10-13-45-07_Realtek", "2015-02-10-13-45-07_Samson", "2015-02-10-13-45-07_Yamaha", "2015-02-10-14-18-26_Kinect-Beam", "2015-02-10-14-18-26_Kinect-RAW", "2015-02-10-14-18-26_Realtek", "2015-02-10-14-18-26_Samson", "2015-02-10-14-18-26_Yamaha"] -------------------------------------------------------------------------------- /data/validation/mailabs/invalid_all.json: -------------------------------------------------------------------------------- 1 | ["angela_merkel-20081231_neujahrsansprache_f000079", "angela_merkel-Die_Kanzlerin_direkt_10_13_f000004", "angela_merkel-Die_Kanzlerin_direkt_12_07_f000014", "angela_merkel-Die_Kanzlerin_direkt_15_07_f000006", "angela_merkel-Die_Kanzlerin_direkt_15_07_f000009", "angela_merkel-Die_Kanzlerin_direkt_16_07_f000013", "angela_merkel-Die_Kanzlerin_direkt_24_07_f000008", "angela_merkel-Die_Kanzlerin_direkt_24_07_f000027", "angela_merkel-Die_Kanzlerin_direkt_26_07_f000003", "angela_merkel-Die_Kanzlerin_direkt_34_08_f000027", "angela_merkel-Die_Kanzlerin_direkt_40_14_f000022", "angela_merkel-Die_Kanzlerin_direkt_43_15_f000004", "angela_merkel-Die_Kanzlerin_direkt_45_13_f000043", "angela_merkel-Kanzlerin_01_14_f000013", "angela_merkel-Kanzlerin_01_17_f000017", "angela_merkel-Kanzlerin_02_13_f000048", "angela_merkel-Kanzlerin_04_17_f000031", "angela_merkel-Kanzlerin_04_17_f000035", "angela_merkel-Kanzlerin_05_12_f000011", "angela_merkel-Kanzlerin_05_15_f000045", "angela_merkel-Kanzlerin_06_14_f000023", "angela_merkel-Kanzlerin_06_14_f000029", "angela_merkel-Kanzlerin_06_15_f000044", "angela_merkel-Kanzlerin_07_12_f000022", "angela_merkel-Kanzlerin_07_13_f000030", "angela_merkel-Kanzlerin_07_14_f000013", "angela_merkel-Kanzlerin_07_14_f000020", "angela_merkel-Kanzlerin_09_12_f000009", "angela_merkel-Kanzlerin_11_13_f000024", "angela_merkel-Kanzlerin_11_14_f000018", 
"angela_merkel-Kanzlerin_12_13_f000045", "angela_merkel-Kanzlerin_12_14_f000015", "angela_merkel-Kanzlerin_12_14_f000019", "angela_merkel-Kanzlerin_12_15_f000033", "angela_merkel-Kanzlerin_12_16_f000041", "angela_merkel-Kanzlerin_13_12_f000025", "angela_merkel-Kanzlerin_14_13_f000034", "angela_merkel-Kanzlerin_15_15_f000025", "angela_merkel-Kanzlerin_16_14_f000038", "angela_merkel-Kanzlerin_18_13_f000045", "angela_merkel-Kanzlerin_18_14_f000036", "angela_merkel-Kanzlerin_18_17_f000034", "angela_merkel-Kanzlerin_18_17_f000051", "angela_merkel-Kanzlerin_21_15_f000051", "angela_merkel-Kanzlerin_21_15_f000060", "angela_merkel-Kanzlerin_21_16_f000023", "angela_merkel-Kanzlerin_22_15_f000012", "angela_merkel-Kanzlerin_23_15_f000034", "angela_merkel-Kanzlerin_23_15_f000036", "angela_merkel-Kanzlerin_23_17_f000049", "angela_merkel-Kanzlerin_24_14_f000006", "angela_merkel-Kanzlerin_24_17_f000026", "angela_merkel-Kanzlerin_25_12_f000040", "angela_merkel-Kanzlerin_28_17_f000049", "angela_merkel-Kanzlerin_29_12_f000028", "angela_merkel-Kanzlerin_30_12_f000002", "angela_merkel-Kanzlerin_30_12_f000008", "angela_merkel-Kanzlerin_30_12_f000022", "angela_merkel-Kanzlerin_31_13_f000043", "angela_merkel-Kanzlerin_31_14_f000035", "angela_merkel-Kanzlerin_31_14_f000037", "angela_merkel-Kanzlerin_32_16_f000047", "angela_merkel-Kanzlerin_33_13_f000018", "angela_merkel-Kanzlerin_34_12_f000016", "angela_merkel-Kanzlerin_34_12_f000027", "angela_merkel-Kanzlerin_34_14_f000030", "angela_merkel-Kanzlerin_35_16_f000053", "angela_merkel-Kanzlerin_36_14_f000060", "angela_merkel-Kanzlerin_36_15_f000038", "angela_merkel-Kanzlerin_37_14_f000035", "angela_merkel-Kanzlerin_38_13_f000006", "angela_merkel-Kanzlerin_38_13_f000021", "angela_merkel-Kanzlerin_38_16_f000018", "angela_merkel-Kanzlerin_39_16_f000046", "angela_merkel-Kanzlerin_40_12_f000040", "angela_merkel-Kanzlerin_40_15_f000034", "angela_merkel-Kanzlerin_42_12_f000007", "angela_merkel-Kanzlerin_42_12_f000024", 
"angela_merkel-Kanzlerin_43_12_f000029", "angela_merkel-Kanzlerin_43_13_f000013", "eva_k-grune_haus_19_f000097", "eva_k-grune_haus_19_f000099", "karlsson-undine_01_f000001", "ramona_deininger-tschun_04_f000054", "ramona_deininger-tschun_07_f000027", "rebecca_braunert_plunkett-das_letzte_marchen_001_f000223", "rebecca_braunert_plunkett-das_letzte_marchen_002_f000077", "rebecca_braunert_plunkett-das_letzte_marchen_003_f000448", "rebecca_braunert_plunkett-das_letzte_marchen_004_f000251", "rebecca_braunert_plunkett-das_letzte_marchen_006_f000303", "rebecca_braunert_plunkett-das_letzte_marchen_007_f000014", "rebecca_braunert_plunkett-das_letzte_marchen_007_f000288", "rebecca_braunert_plunkett-das_letzte_marchen_007_f000430", "rebecca_braunert_plunkett-das_letzte_marchen_007_f000484", "rebecca_braunert_plunkett-das_letzte_marchen_008_f000279", "rebecca_braunert_plunkett-das_letzte_marchen_008_f000294", "rebecca_braunert_plunkett-das_letzte_marchen_008_f000299", "rebecca_braunert_plunkett-das_letzte_marchen_009_f000053", "rebecca_braunert_plunkett-das_letzte_marchen_009_f000143", "rebecca_braunert_plunkett-das_letzte_marchen_009_f000163", "rebecca_braunert_plunkett-das_letzte_marchen_009_f000190", "rebecca_braunert_plunkett-das_letzte_marchen_010_f000025", "rebecca_braunert_plunkett-das_letzte_marchen_010_f000178", "rebecca_braunert_plunkett-das_letzte_marchen_010_f000205", "rebecca_braunert_plunkett-das_letzte_marchen_012_f000461", "rebecca_braunert_plunkett-das_letzte_marchen_012_f000526", "rebecca_braunert_plunkett-das_letzte_marchen_012_f000566", "rebecca_braunert_plunkett-ferien_vom_ich_002_f000130", "rebecca_braunert_plunkett-ferien_vom_ich_002_f000162", "rebecca_braunert_plunkett-ferien_vom_ich_002_f000323", "rebecca_braunert_plunkett-ferien_vom_ich_003_f000233", "rebecca_braunert_plunkett-ferien_vom_ich_005_f000264", "rebecca_braunert_plunkett-ferien_vom_ich_005_f000292", "rebecca_braunert_plunkett-ferien_vom_ich_010_f000164", 
# Merge all downloaded corpora into a single corpus with train/dev/test
# subviews and save it to ``output_folder``.
# (Documented with comments instead of a docstring so that the text click
# prints for --help is not changed.)
@click.command()
@click.argument('download_folder', type=click.Path(exists=True))
@click.argument('output_folder', type=click.Path())
def run(download_folder, output_folder):
    corpora_names = [
        ('common_voice', 'common-voice'),
        ('mailabs', 'mailabs'),
        ('swc', 'swc'),
        ('tuda', 'tuda'),
        ('voxforge', 'voxforge'),
    ]

    print('Load corpora')
    corpora = {}

    for name, reader_type in corpora_names:
        print(' - {} ...'.format(name))
        full_path = os.path.join(download_folder, name)
        c = audiomate.Corpus.load(
            full_path,
            reader=reader_type
        )
        corpora[name] = c

    print('Create Train/Dev/Test - if not already exist')
    for name, corpus in corpora.items():
        prepare_corpus(corpus, name)

    print('Insert full subviews')
    #
    # Insert subviews containing all utterances
    # so we have a reference when merged
    #
    for corpus in corpora.values():
        all_utts = set(corpus.utterances.keys())
        full_filter = subset.MatchingUtteranceIdxFilter(all_utts)
        full_subview = subset.Subview(corpus, filter_criteria=[full_filter])
        corpus.import_subview('full', full_subview)

    print('Suffix subviews')
    #
    # Suffix subviews to have the correct names when merging
    #
    for name, corpus in corpora.items():
        print(' - {} ...'.format(name))
        original_subview_names = list(corpus.subviews.keys())

        for subview_name in original_subview_names:
            new_subview_name = '{}_{}'.format(subview_name, name)
            corpus.subviews[new_subview_name] = corpus.subviews[subview_name]
            del corpus.subviews[subview_name]

    print('Merge corpora ...')
    full_corpus = audiomate.Corpus.merge_corpora(list(corpora.values()))

    print('Create merged train/test/dev subviews ...')
    for part in ['train', 'dev', 'test']:
        utt_ids = set()

        for name in corpora:
            sv = full_corpus.subviews['{}_{}'.format(part, name)]
            utt_ids.update(sv.utterances.keys())

        part_filter = subset.MatchingUtteranceIdxFilter(utt_ids)
        # The subview has to be created on the merged corpus. The previous
        # code passed ``corpus``, i.e. whatever single source corpus was
        # left over in the loop variable of the loop above.
        part_subview = subset.Subview(
            full_corpus,
            filter_criteria=[part_filter]
        )
        full_corpus.import_subview(part, part_subview)

    print('Save ...')
    os.makedirs(output_folder)
    full_corpus.save_at(output_folder)


def prepare_corpus(corpus, name):
    """
    Replace all subviews of ``corpus`` with ``train``/``dev``/``test``.

    How the three parts are selected depends on ``name``:

    * ``mailabs``: everything goes to train, since the speakers are unknown.
    * ``tuda``: the existing ``*_kinect-raw`` subviews are used.
    * ``common_voice``: the existing train/dev/test subviews are used;
      no duration filtering is done for this corpus.
    * any other corpus: too long utterances are removed, the rest is
      split with ``create_train_dev_test``.

    Args:
        corpus: The corpus to modify in place.
        name (str): Name of the corpus, selects the strategy.
    """
    if name != 'common_voice':
        print(' - {}: Find utterances that are too long'.format(name))
        too_long = utts_too_long(corpus)
    else:
        too_long = set()

    if name == 'mailabs':
        # we only use mailabs for training
        # since we don't know the speakers
        train_utts = set(corpus.utterances.keys())
        train_utts = train_utts - too_long
        dev_utts = set()
        test_utts = set()

    elif name == 'tuda':
        # we only use kinect-raw files
        # otherwise sentence of the tuda would occur multiple times
        # in contrast to other datasets
        train_utts = set(corpus.subviews['train_kinect-raw'].utterances.keys())
        train_utts = train_utts - too_long
        dev_utts = set(corpus.subviews['dev_kinect-raw'].utterances.keys())
        test_utts = set(corpus.subviews['test_kinect-raw'].utterances.keys())

    elif name == 'common_voice':
        train_utts = set(corpus.subviews['train'].utterances.keys())
        train_utts = train_utts - too_long
        dev_utts = set(corpus.subviews['dev'].utterances.keys())
        test_utts = set(corpus.subviews['test'].utterances.keys())

    else:
        dur_filter = subset.MatchingUtteranceIdxFilter(too_long, inverse=True)
        dur_subview = subset.Subview(corpus, filter_criteria=[dur_filter])
        train, dev, test = create_train_dev_test(dur_subview)

        train_utts = set(train.utterances.keys())
        dev_utts = set(dev.utterances.keys())
        test_utts = set(test.utterances.keys())

    # Remove all subviews
    for subname in list(corpus.subviews.keys()):
        del corpus.subviews[subname]

    # Add new subviews (same order as before: train, dev, test)
    new_parts = (
        ('train', train_utts),
        ('dev', dev_utts),
        ('test', test_utts),
    )

    for part_name, part_utts in new_parts:
        part_filter = subset.MatchingUtteranceIdxFilter(part_utts)
        part_subview = subset.Subview(corpus, filter_criteria=[part_filter])
        corpus.import_subview(part_name, part_subview)


def utts_too_long(corpus):
    """
    Return the set of ids of all utterances in ``corpus`` that are
    longer than ``MAX_TRAIN_UTT_DURATION``.
    """
    utts = set()

    for utt in tqdm(corpus.utterances.values()):
        if utt.duration > MAX_TRAIN_UTT_DURATION:
            utts.add(utt.idx)

    return utts


def create_train_dev_test(corpus):
    """
    Create train/dev/test subsets of the given corpus.
    Size is computed using length of the transcriptions.

    Dev and test each get the proportion
    ``MAX_DEV_TEST_DURATION / corpus.total_duration``, capped at 0.15;
    train gets the remainder. Issuers are kept separate between the parts.

    Returns:
        tuple: The train, dev and test subviews (in this order).
    """

    total_duration = corpus.total_duration
    test_dev_train_ratio = min(MAX_DEV_TEST_DURATION / total_duration, 0.15)

    splitter = subset.Splitter(corpus, SEED)
    subviews = splitter.split_by_label_length(
        proportions={
            'train': 1.0 - (2 * test_dev_train_ratio),
            'dev': test_dev_train_ratio,
            'test': test_dev_train_ratio,
        },
        label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
        separate_issuers=True
    )

    return subviews['train'], subviews['dev'], subviews['test']
kundenservice@firma.de.", ["@"]], "common_voice_de_17993841": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17876716": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17661976": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17816892": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17312340": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18397436": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18384106": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17431076": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17304025": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17678251": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17359556": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17427362": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17517255": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18265766": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_18366292": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18233607": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18227509": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17637960": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18223710": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17839106": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17993400": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], 
"common_voice_de_18235827": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18359373": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17507821": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17430374": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17430388": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18326542": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18381641": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18389904": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17686222": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18154909": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17635068": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17663955": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17999384": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17619678": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17707923": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18193859": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17551990": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_18289592": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17823454": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17337465": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17649508": ["Außerdem klang der Alarm irgendwie nach … 
Wecker!", ["…"]], "common_voice_de_17705921": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18042126": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17906038": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_18235404": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17858947": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18200892": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_18110716": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_18099491": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18101719": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17623181": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17800214": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17712524": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17318558": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_18366389": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17772419": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17700040": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], "common_voice_de_17639623": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17651927": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17986369": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_17999886": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]], "common_voice_de_17738082": ["Nukuʻalofa ist die Hauptstadt von Tonga.", ["ʻ"]], 
"common_voice_de_18024223": ["Außerdem klang der Alarm irgendwie nach … Wecker!", ["…"]], "common_voice_de_18034794": ["Fragen, Anregungen, Lob und Kritik richten Sie bitte an kundenservice@firma.de.", ["@"]]} -------------------------------------------------------------------------------- /data/validation/mailabs/invalid_character_ratio.json: -------------------------------------------------------------------------------- 1 | {"zweiplaneten_15_lasswitz_f000126": 35.117005162447064, "karlsson-undine_01_f000001": 29.536643898836992, "eva_k-grune_haus_19_f000097": 30.71376735257626, "eva_k-grune_haus_19_f000099": 32.37169644308759, "ramona_deininger-tschun_04_f000054": 25.38371848538716, "ramona_deininger-tschun_07_f000027": 36.43376629411902, "angela_merkel-20081231_neujahrsansprache_f000079": 26.32983454901263, "angela_merkel-Die_Kanzlerin_direkt_10_13_f000004": 26.72269813048866, "angela_merkel-Die_Kanzlerin_direkt_12_07_f000014": 36.02444515921518, "angela_merkel-Die_Kanzlerin_direkt_15_07_f000006": 28.066779578998307, "angela_merkel-Die_Kanzlerin_direkt_15_07_f000009": 29.57095709570957, "angela_merkel-Die_Kanzlerin_direkt_16_07_f000013": 25.353984562694883, "angela_merkel-Die_Kanzlerin_direkt_24_07_f000008": 26.021827519794567, "angela_merkel-Die_Kanzlerin_direkt_24_07_f000027": 25.940103580274716, "angela_merkel-Die_Kanzlerin_direkt_26_07_f000003": 26.79917369214449, "angela_merkel-Die_Kanzlerin_direkt_34_08_f000027": 38.0725758477097, "angela_merkel-Die_Kanzlerin_direkt_40_14_f000022": 37.041800643086816, "angela_merkel-Die_Kanzlerin_direkt_43_15_f000004": 37.79952750590618, "angela_merkel-Die_Kanzlerin_direkt_45_13_f000043": 28.65158479078374, "angela_merkel-Kanzlerin_01_14_f000013": 25.181979146173518, "angela_merkel-Kanzlerin_01_17_f000017": 35.01616513305148, "angela_merkel-Kanzlerin_02_13_f000048": 26.094420600858367, "angela_merkel-Kanzlerin_04_17_f000031": 25.63154651879236, "angela_merkel-Kanzlerin_04_17_f000035": 29.57212826910637, 
"angela_merkel-Kanzlerin_05_12_f000011": 27.64067127344521, "angela_merkel-Kanzlerin_05_15_f000045": 34.355828220858896, "angela_merkel-Kanzlerin_06_14_f000023": 28.83773623770534, "angela_merkel-Kanzlerin_06_14_f000029": 30.727210652099693, "angela_merkel-Kanzlerin_06_15_f000044": 29.917415467719316, "angela_merkel-Kanzlerin_07_12_f000022": 25.81192952663978, "angela_merkel-Kanzlerin_07_13_f000030": 31.93916349809886, "angela_merkel-Kanzlerin_07_14_f000013": 36.41360357265545, "angela_merkel-Kanzlerin_07_14_f000020": 25.527065527065528, "angela_merkel-Kanzlerin_09_12_f000009": 26.024940568044375, "angela_merkel-Kanzlerin_11_13_f000024": 25.546352659415227, "angela_merkel-Kanzlerin_11_14_f000018": 25.506288008502093, "angela_merkel-Kanzlerin_12_13_f000045": 28.965829373161352, "angela_merkel-Kanzlerin_12_14_f000015": 25.02415014490087, "angela_merkel-Kanzlerin_12_14_f000019": 25.622930351229577, "angela_merkel-Kanzlerin_12_15_f000033": 27.542372881355934, "angela_merkel-Kanzlerin_12_16_f000041": 46.13953488372093, "angela_merkel-Kanzlerin_13_12_f000025": 27.80601370626619, "angela_merkel-Kanzlerin_14_13_f000034": 27.058450480921678, "angela_merkel-Kanzlerin_15_15_f000025": 27.281534586320483, "angela_merkel-Kanzlerin_16_14_f000038": 25.23460294929422, "angela_merkel-Kanzlerin_18_13_f000045": 29.715017611271215, "angela_merkel-Kanzlerin_18_14_f000036": 41.68475900998697, "angela_merkel-Kanzlerin_18_17_f000034": 28.759197887996404, "angela_merkel-Kanzlerin_18_17_f000051": 43.6306973120374, "angela_merkel-Kanzlerin_21_15_f000051": 36.6333184012742, "angela_merkel-Kanzlerin_21_15_f000060": 26.0586319218241, "angela_merkel-Kanzlerin_21_16_f000023": 26.229508196721312, "angela_merkel-Kanzlerin_22_15_f000012": 25.08002508002508, "angela_merkel-Kanzlerin_23_15_f000034": 32.29919252018699, "angela_merkel-Kanzlerin_23_15_f000036": 25.046519183625247, "angela_merkel-Kanzlerin_23_17_f000049": 29.05728908700677, "angela_merkel-Kanzlerin_24_14_f000006": 29.931407191852006, 
"angela_merkel-Kanzlerin_24_17_f000026": 27.378725775000994, "angela_merkel-Kanzlerin_25_12_f000040": 33.372335554764206, "angela_merkel-Kanzlerin_28_17_f000049": 33.56462804582863, "angela_merkel-Kanzlerin_29_12_f000028": 25.958223484080307, "angela_merkel-Kanzlerin_30_12_f000002": 26.575130107407816, "angela_merkel-Kanzlerin_30_12_f000008": 53.754653591210385, "angela_merkel-Kanzlerin_30_12_f000022": 26.47507075985195, "angela_merkel-Kanzlerin_31_13_f000043": 27.710426047800485, "angela_merkel-Kanzlerin_31_14_f000035": 27.57587111277632, "angela_merkel-Kanzlerin_31_14_f000037": 25.06548872566644, "angela_merkel-Kanzlerin_32_16_f000047": 27.13749478659808, "angela_merkel-Kanzlerin_33_13_f000018": 29.611351017890193, "angela_merkel-Kanzlerin_34_12_f000016": 32.80200912305879, "angela_merkel-Kanzlerin_34_12_f000027": 27.757333613710443, "angela_merkel-Kanzlerin_34_14_f000030": 27.520963233713182, "angela_merkel-Kanzlerin_35_16_f000053": 25.875643699195642, "angela_merkel-Kanzlerin_36_14_f000060": 33.78316349949509, "angela_merkel-Kanzlerin_36_15_f000038": 43.56912165157368, "angela_merkel-Kanzlerin_37_14_f000035": 35.883722936254635, "angela_merkel-Kanzlerin_38_13_f000006": 29.28752464094621, "angela_merkel-Kanzlerin_38_13_f000021": 26.24216111915099, "angela_merkel-Kanzlerin_38_16_f000018": 25.442980463425712, "angela_merkel-Kanzlerin_39_16_f000046": 30.372664601858325, "angela_merkel-Kanzlerin_40_12_f000040": 27.377636291285317, "angela_merkel-Kanzlerin_40_15_f000034": 28.79368396609776, "angela_merkel-Kanzlerin_42_12_f000007": 28.092353612501096, "angela_merkel-Kanzlerin_42_12_f000024": 33.506099157112196, "angela_merkel-Kanzlerin_43_12_f000029": 25.33783783783784, "angela_merkel-Kanzlerin_43_13_f000013": 25.16499105655955, "rebecca_braunert_plunkett-ferien_vom_ich_002_f000130": 27.183867558999648, "rebecca_braunert_plunkett-ferien_vom_ich_002_f000162": 27.44813278008299, "rebecca_braunert_plunkett-ferien_vom_ich_002_f000323": 26.618744512730466, 
"rebecca_braunert_plunkett-ferien_vom_ich_003_f000233": 25.41223404255319, "rebecca_braunert_plunkett-ferien_vom_ich_005_f000264": 31.545064377682404, "rebecca_braunert_plunkett-ferien_vom_ich_005_f000292": 26.123771691407068, "rebecca_braunert_plunkett-ferien_vom_ich_010_f000164": 29.963310232368528, "rebecca_braunert_plunkett-ferien_vom_ich_011_f000099": 27.37883365754729, "rebecca_braunert_plunkett-ferien_vom_ich_011_f000195": 39.253501068122475, "rebecca_braunert_plunkett-ferien_vom_ich_011_f000221": 25.32299741602067, "rebecca_braunert_plunkett-ferien_vom_ich_011_f000295": 28.52522639068564, "rebecca_braunert_plunkett-ferien_vom_ich_012_f000124": 25.012603982858582, "rebecca_braunert_plunkett-ferien_vom_ich_012_f000244": 25.042589437819423, "rebecca_braunert_plunkett-ferien_vom_ich_013_f000144": 25.128205128205128, "rebecca_braunert_plunkett-ferien_vom_ich_013_f000290": 28.595512903644146, "rebecca_braunert_plunkett-ferien_vom_ich_013_f000319": 25.772249123392886, "rebecca_braunert_plunkett-ferien_vom_ich_013_f000495": 25.31572904707233, "rebecca_braunert_plunkett-ferien_vom_ich_014_f000111": 25.869240614334473, "rebecca_braunert_plunkett-ferien_vom_ich_014_f000135": 25.52821997105644, "rebecca_braunert_plunkett-das_letzte_marchen_004_f000251": 31.574102485424977, "rebecca_braunert_plunkett-das_letzte_marchen_003_f000448": 25.99163081275417, "rebecca_braunert_plunkett-das_letzte_marchen_002_f000077": 26.98583833556734, "rebecca_braunert_plunkett-das_letzte_marchen_010_f000025": 28.785900783289815, "rebecca_braunert_plunkett-das_letzte_marchen_010_f000178": 25.286697247706424, "rebecca_braunert_plunkett-das_letzte_marchen_010_f000205": 30.429532516819044, "rebecca_braunert_plunkett-das_letzte_marchen_007_f000014": 31.47591129237828, "rebecca_braunert_plunkett-das_letzte_marchen_007_f000288": 25.810317438198492, "rebecca_braunert_plunkett-das_letzte_marchen_007_f000430": 25.350493405294927, "rebecca_braunert_plunkett-das_letzte_marchen_007_f000484": 
25.022696323195643, "rebecca_braunert_plunkett-das_letzte_marchen_009_f000053": 25.786812506449284, "rebecca_braunert_plunkett-das_letzte_marchen_009_f000143": 27.80360942548664, "rebecca_braunert_plunkett-das_letzte_marchen_009_f000163": 39.10730121194206, "rebecca_braunert_plunkett-das_letzte_marchen_009_f000190": 25.244974239822202, "rebecca_braunert_plunkett-das_letzte_marchen_008_f000279": 27.71144903858238, "rebecca_braunert_plunkett-das_letzte_marchen_008_f000294": 25.76208178438662, "rebecca_braunert_plunkett-das_letzte_marchen_008_f000299": 30.227401016047097, "rebecca_braunert_plunkett-das_letzte_marchen_006_f000303": 25.720713214464258, "rebecca_braunert_plunkett-das_letzte_marchen_001_f000223": 25.624636839047064, "rebecca_braunert_plunkett-das_letzte_marchen_012_f000461": 33.74569570207882, "rebecca_braunert_plunkett-das_letzte_marchen_012_f000526": 25.353570196619525, "rebecca_braunert_plunkett-das_letzte_marchen_012_f000566": 25.15797788309637} -------------------------------------------------------------------------------- /notebooks/durations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import audiomate\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 13, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "path = '../data/full'" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 14, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "ds = audiomate.Corpus.load(path)\n", 30 | "train_durations = [(u.idx, u.duration) for u in ds.subviews['train'].utterances.values()]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 18, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | 
"(array([9.30000e+02, 1.08549e+05, 8.59740e+04, 6.32840e+04, 4.54570e+04,\n", 42 | " 3.41960e+04, 2.62460e+04, 2.10970e+04, 1.67040e+04, 1.29230e+04,\n", 43 | " 9.61600e+03, 6.76400e+03, 4.29300e+03, 2.68700e+03, 1.83100e+03,\n", 44 | " 1.25400e+03, 8.22000e+02, 5.29000e+02, 3.57000e+02, 2.05000e+02,\n", 45 | " 1.35000e+02, 9.50000e+01, 8.20000e+01, 8.30000e+01]),\n", 46 | " array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", 47 | " 17, 18, 19, 20, 21, 22, 23, 24]),\n", 48 | " )" 49 | ] 50 | }, 51 | "execution_count": 18, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | }, 55 | { 56 | "data": { 57 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlgAAAFlCAYAAAA3apYyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAQYklEQVR4nO3dX4jm11kH8O9j1ipUHSuVUpKsEyUUgxcqQ3phkV74Z+MaU4vUrF5YKFkrRvSuiwhWRFhFRcSirBhSQROC1pplV6oXlvSiSJIiNmmILmFLNtSmJTJaEErt48VOcVizm5mdZ/J7Z+bzudl5z/vOeR9y+GW/e87vd051dwAAmPN1SxcAAHDYCFgAAMMELACAYQIWAMAwAQsAYJiABQAw7NjSBSTJm9/85l5fX1+6DACA1/T0009/sbu//UafWYmAtb6+nqeeemrpMgAAXlNVffa1PmOJEABgmIAFADBMwAIAGCZgAQAME7AAAIYJWAAAwwQsAIBhAhYAwDABCwBgmIAFADBMwAIAGCZgAQAMGz/suaremeQ3kzyb5NHu/vj0d6yS9TMXxvu8fPbkeJ8AwOtnRzNYVfVQVb1cVc9c036iqp6vqktVdWaruZN8Kck3JrkyWy4AwOrb6RLhw0lObG+oqluSfCjJPUnuSnKqqu5K8onuvifJB5L8xlypAAAHw44CVnc/keSVa5rvTnKpu1/o7i8neTTJfd391a33/yPJN4xVCgBwQOzlHqxbk7y47fWVJG+vqncn+dEk35rkj673y1V1OsnpJDl+/PgeygAAWC3jN7l390eSfGQHnzuX5FySbGxs9HQdAABL2cs2DS8luX3b69u22gAAjrS9BKwnk9xZVXdU1RuS3J/k8ZmyAAAOrp1u0/BIkk8meVtVXamq93X3V5I8mORjSZ5L8lh3P7ubL6+qe6vq3Obm5m7rBgBYWTu6B6u7T12n/WKSizf75d19Psn5jY2NB262DwCAVTN+kzt7Z3d4ADjYnEUIADBMwAIAGLZowHKTOwBwGC0asLr7fHefXltbW7IMAIBRlggBAIYJWAAAwwQsAIBhAhYAwDBPEQIADPMUIQDAMEuEAADDBCwAgGEOez4iHCANAK8fM1gAAMM8RQgAMMxThAAAwywRAgAME7AAAIYJWAAAwwQsAIBhAhYAwDDbNAAADLNNAwDAMEuEAADDBCwAgGEOe+amOUAaAF6dGSwAgGECFgDAMAELAGCYgAUAMEzAAgAYZid3AIBhdnIHABhmiRAAYJiABQAwTMACABgmYAEADHMWISvF+YYAHAZmsAAAhglYAADDBCwAgGECFgD
AMAELAGCYswgBAIY5ixAAYJglQgCAYQIWAMAwAQsAYJijcjj0HL8DwOvNDBYAwDABCwBgmIAFADBMwAIAGCZgAQAME7AAAIYJWAAAwwQsAIBhNhqFm2DzUgBuxAwWAMCwRQNWVd1bVec2NzeXLAMAYNSiAau7z3f36bW1tSXLAAAYZYkQAGCYgAUAMEzAAgAYJmABAAwTsAAAhglYAADD7OQOK2I/dodP7BAPsAQzWAAAwwQsAIBhAhYAwDABCwBgmIAFADBMwAIAGCZgAQAME7AAAIYJWAAAw+zkDofcfuwQb3d4gBszgwUAMEzAAgAYJmABAAwTsAAAhglYAADDBCwAgGH7ErCq6o1V9VRV/fh+9A8AsMp2FLCq6qGqermqnrmm/URVPV9Vl6rqzLa3PpDksclCAQAOip3OYD2c5MT2hqq6JcmHktyT5K4kp6rqrqr64SSfSfLyYJ0AAAfGjnZy7+4nqmr9mua7k1zq7heSpKoeTXJfkm9K8sZcDV3/XVUXu/urYxUDAKy4vRyVc2uSF7e9vpLk7d39YJJU1XuTfPF64aqqTic5nSTHjx/fQxkAAKtl384i7O6HX+P9c0nOJcnGxkbvVx3APOcbAtzYXp4ifCnJ7dte37bVBgBwpO0lYD2Z5M6quqOq3pDk/iSPz5QFAHBw7XSbhkeSfDLJ26rqSlW9r7u/kuTBJB9L8lySx7r72d18eVXdW1XnNjc3d1s3AMDK2ulThKeu034xycWb/fLuPp/k/MbGxgM32wcAwKpxVA4AwDABCwBgmIAFADBs0YDlJncA4DBaNGB19/nuPr22trZkGQAAoywRAgAME7AAAIYJWAAAwwQsAIBhniIEABi2o6Ny9oujcoCvWT9zYbzPy2dPjvcJsBOWCAEAhglYAADDBCwAgGECFgDAME8RAgAMcxYhAMAwS4QAAMMELACAYQIWAMAwAQsAYJiABQAwzDYNAADDbNMAADDMEiEAwDABCwBgmIAFADBMwAIAGCZgAQAMO7Z0AQD7Zf3MhfE+L589Od4ncPiYwQIAGCZgAQAMs5M7AMAwO7kDAAyzRAgAMEzAAgAYJmABAAwTsAAAhglYAADDBCwAgGECFgDAMAELAGCYgAUAMMxROQAAwxyVAwAwzBIhAMAwAQsAYJiABQAw7NjSBQAcJOtnLoz3efnsyfE+gWWZwQIAGCZgAQAME7AAAIYJWAAAwwQsAIBhAhYAwDABCwBgmIAFADBMwAIAGCZgAQAME7AAAIYtGrCq6t6qOre5ublkGQAAoxYNWN19vrtPr62tLVkGAMAoS4QAAMMELACAYceWLgDgqFs/c2G8z8tnT473CeycGSwAgGECFgDAMAELAGCYgAUAMEzAAgAYJmABAAwTsAAAhglYAADDBCwAgGECFgDAMAELAGCYgAUAMEzAAgAYJmABAAw7tnQBAMxbP3NhX/q9fPbkvvQLh40ZLACAYQIWAMAwAQsAYJiABQAwTMACABg2HrCq6rur6k+q6q+q6hem+wcAWHU7ClhV9VBVvVxVz1zTfqKqnq+qS1V1Jkm6+7nufn+S9yT5gfmSAQBW205nsB5OcmJ7Q1XdkuRDSe5JcleSU1V119Z7P5HkQpKLY5UCABwQOwpY3f1Ekleuab47yaXufqG7v5zk0ST3bX3+8e6+J8nPThYLAHAQ7GUn91uTvLjt9ZUkb6+qdyZ5d5JvyA1msKrqdJLTSXL8+PE9lAEAsFrGj8rp7o8n+fgOPncuybkk2djY6Ok6AACWspenCF9Kcvu217dttQEAHGl7mcF6MsmdVXVHrgar+5P8zEhV+2C/Dj4FALjWTrdpeCTJJ5O8raquVNX7uvsrSR5M8rEkzyV5rLuf3c2XV9W9VXVuc3Nzt3UDAKysHc1gdfep67RfzB62Yuju80nOb2xsPHCzfQAArBpH5QAADBOwAACGjW/TAMDhtR8PDF0+e3K8T1jaojNYbnIHAA6jRQNWd5/v7tNra2t
LlgEAMMo9WAAAwwQsAIBhAhYAwDABCwBgmKcIAQCGeYoQAGCYJUIAgGECFgDAMAELAGCYgAUAMMxThAAAwzxFCAAwzBIhAMAwAQsAYJiABQAw7NjSBQBwtK2fuTDe5+WzJ8f7hN0wgwUAMMw2DQAAw2zTAAAwzBIhAMAwAQsAYJiABQAwTMACABgmYAEADBOwAACGCVgAAMMELACAYXZyBwAYZid3AIBhlggBAIYJWAAAw44tXQAATFs/c2G8z8tnT473yeFlBgsAYJiABQAwTMACABgmYAEADBOwAACGCVgAAMMclQMAMMxROQAAwywRAgAME7AAAIYJWAAAwwQsAIBhAhYAwDABCwBgmIAFADBMwAIAGCZgAQAME7AAAIYJWAAAwwQsAIBhAhYAwDABCwBg2LGlCwCAg2D9zIXxPi+fPTneJ6th0Rmsqrq3qs5tbm4uWQYAwKhFA1Z3n+/u02tra0uWAQAwyj1YAADDBCwAgGECFgDAMAELAGCYgAUAMEzAAgAYJmABAAwTsAAAhglYAADDBCwAgGECFgDAMAELAGCYgAUAMEzAAgAYdmzpAgDgqFo/c2G8z8tnT473ye6ZwQIAGCZgAQAME7AAAIYJWAAAwwQsAIBhAhYAwDABCwBgmIAFADBsXzYarap3JTmZ5FuS/Fl3//1+fA8AwCra8QxWVT1UVS9X1TPXtJ+oquer6lJVnUmS7v5odz+Q5P1Jfnq2ZACA1babJcKHk5zY3lBVtyT5UJJ7ktyV5FRV3bXtI7+29T4AwJGx44DV3U8keeWa5ruTXOruF7r7y0keTXJfXfXbSf6uuz81Vy4AwOrb603utyZ5cdvrK1ttv5Tkh5L8VFW9/9V+sapOV9VTVfXUF77whT2WAQCwOvblJvfu/sMkf/ganzmX5FySbGxs9H7UAQCwhL0GrJeS3L7t9W1bbQDAAtbPXBjv8/LZk+N9HnZ7XSJ8MsmdVXVHVb0hyf1JHt97WQAAB9dutml4JMknk7ytqq5U1fu6+ytJHkzysSTPJXmsu5/dRZ/3VtW5zc3N3dYNALCydrxE2N2nrtN+McnFm/ny7j6f5PzGxsYDN/P7AACryFE5AADD9uUpQgCA13KYb8gXsACAG9qPIHTYLbpE6CZ3AOAwWjRgdff57j69tra2ZBkAAKPc5A4AMEzAAgAYJmABAAwTsAAAhnmKEABgmKcIAQCGWSIEABgmYAEADBOwAACGCVgAAMM8RQgAMMxThAAAw6q7l64hVfWFJJ/d5695c5Iv7vN3sHfG6WAwTqvPGB0MxulguHacvqO7v/1Gv7ASAev1UFVPdffG0nVwY8bpYDBOq88YHQzG6WC4mXFykzsAwDABCwBg2FEKWOeWLoAdMU4Hg3FafcboYDBOB8Oux+nI3IMFAPB6OUozWAAAr4sjEbCq6kRVPV9Vl6rqzNL18Oqq6nJVfbqq/rmqnlq6Hq6qqoeq6uWqemZb27dV1T9U1b9t/fmmJWs86q4zRh+sqpe2rqd/rqofW7JGkqq6var+sao+U1XPVtUvb7W7nlbEDcZo19fToV8irKpbkvxrkh9OciXJk0lOdfdnFi2M/6eqLifZ6G57wqyQqvrBJF9K8ufd/T1bbb+T5JXuPrv1j5Y3dfcHlqzzKLvOGH0wyZe6+3eXrI3/U1VvTfLW7v5UVX1zkqeTvCvJe+N6Wgk3GKP3ZJfX01GYwbo7yaXufqG7v5zk0ST3LVwTHBjd/USSV65pvi/Jh7d+/nCu/g+IhVxnjFgx3f257v7U1s//leS5JLfG9bQybjBGu3YUAtatSV7c9vpKbvI/Fvuuk/x9VT1dVaeXLoYbekt3f27r539P8pYli+G6Hqyqf9laQrTstEKqaj3J9yX5p7ieVtI1Y5Ts8no6CgGLg+Md3f39Se5J8otbyx6suL56n8HhvtfgYPrjJN+V5HuTfC7J7y1bDl9TVd+U5K+T/Ep3/+f291xPq+FVxmjX19NRCFgvJbl
92+vbttpYMd390tafLyf5m1xd3mU1fX7rXoWv3bPw8sL1cI3u/nx3/093fzXJn8b1tBKq6utz9S/uv+juj2w1u55WyKuN0c1cT0chYD2Z5M6quqOq3pDk/iSPL1wT16iqN27dUJiqemOSH0nyzI1/iwU9nuTntn7+uSR/u2AtvIqv/YW95SfjelpcVVWSP0vyXHf//ra3XE8r4npjdDPX06F/ijBJth6n/IMktyR5qLt/a+GSuEZVfWeuzlolybEkf2mcVkNVPZLknbl6mvznk/x6ko8meSzJ8SSfTfKe7naT9UKuM0bvzNXljE5yOcnPb7vPhwVU1TuSfCLJp5N8dav5V3P1Hh/X0wq4wRidyi6vpyMRsAAAXk9HYYkQAOB1JWABAAwTsAAAhglYAADDBCwAgGECFgDAMAELAGCYgAUAMOx/Abs/Ky3Bm76LAAAAAElFTkSuQmCC\n", 58 | "text/plain": [ 59 | "
" 60 | ] 61 | }, 62 | "metadata": { 63 | "needs_background": "light" 64 | }, 65 | "output_type": "display_data" 66 | } 67 | ], 68 | "source": [ 69 | "%matplotlib inline\n", 70 | "bins = np.arange(25)\n", 71 | "plt.figure(figsize=(10, 6))\n", 72 | "plt.yscale('log', nonposy='clip')\n", 73 | "plt.hist([d[1] for d in train_durations], bins=bins)" 74 | ] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 3", 80 | "language": "python", 81 | "name": "python3" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 3 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython3", 93 | "version": "3.7.4" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 4 98 | } 99 | -------------------------------------------------------------------------------- /data/validation/tuda/invalid_character_ratio.json: -------------------------------------------------------------------------------- 1 | {"2014-03-17-14-39-38_Realtek": 33.44827586206897, "2014-03-17-14-39-38_Yamaha": 33.44827586206897, "2014-03-17-14-39-38_Microsoft-Kinect-Raw": 33.44827586206897, "2014-03-17-14-37-39_Realtek": 37.93103448275862, "2014-03-17-14-37-39_Yamaha": 37.93103448275862, "2014-03-17-14-37-39_Microsoft-Kinect-Raw": 37.93103448275862, "2014-03-17-14-34-48_Realtek": 25.223880597014926, "2014-03-17-14-34-48_Yamaha": 25.223880597014926, "2014-03-17-14-34-48_Microsoft-Kinect-Raw": 25.223880597014926, "2014-03-17-14-32-49_Realtek": 85.14705882352942, "2014-03-17-14-32-49_Yamaha": 85.14705882352942, "2014-03-17-14-32-49_Microsoft-Kinect-Raw": 85.14705882352942, "2014-03-17-14-41-00_Realtek": 28.0, "2014-03-17-14-41-00_Yamaha": 28.0, "2014-03-17-14-41-00_Microsoft-Kinect-Raw": 28.0, "2014-03-17-14-40-14_Realtek": 25.217391304347828, "2014-03-17-14-40-14_Yamaha": 25.217391304347828, "2014-03-17-14-40-14_Microsoft-Kinect-Raw": 
25.77777777777778, "2014-03-17-15-07-14_Realtek": 40.806451612903224, "2014-03-17-15-07-14_Yamaha": 40.806451612903224, "2014-03-17-15-07-14_Microsoft-Kinect-Raw": 40.806451612903224, "2014-03-17-14-42-10_Realtek": 37.15596330275229, "2014-03-17-14-42-10_Yamaha": 37.15596330275229, "2014-03-17-14-42-10_Microsoft-Kinect-Raw": 37.15596330275229, "2014-03-18-15-28-52_Kinect-Beam": "float division by zero", "2014-03-18-15-28-52_Kinect-RAW": "float division by zero", "2014-03-18-15-28-52_Realtek": "float division by zero", "2014-03-18-15-28-52_Yamaha": "float division by zero", "2014-03-17-14-46-51_Realtek": 44.1304347826087, "2014-03-17-14-46-51_Yamaha": 44.1304347826087, "2014-03-17-14-46-51_Microsoft-Kinect-Raw": 44.1304347826087, "2014-08-04-13-22-37_Kinect-Beam": 42.666666666666664, "2014-08-04-13-22-37_Kinect-RAW": 43.63636363636363, "2014-08-04-13-22-37_Realtek": 43.63636363636363, "2014-08-04-13-22-37_Samson": 43.63636363636363, "2014-08-04-13-22-37_Yamaha": 43.63636363636363, "2014-03-18-15-34-19_Realtek": "float division by zero", "2014-03-17-14-47-36_Realtek": 44.48979591836734, "2014-03-17-14-47-36_Yamaha": 44.48979591836734, "2014-03-17-14-47-36_Microsoft-Kinect-Raw": 44.48979591836734, "2014-03-17-14-44-50_Realtek": 73.82352941176471, "2014-03-17-14-44-50_Yamaha": 73.82352941176471, "2014-03-17-14-44-50_Microsoft-Kinect-Raw": 76.06060606060606, "2014-08-04-13-37-12_Kinect-Beam": 37.22222222222222, "2014-08-04-13-37-12_Kinect-RAW": 37.22222222222222, "2014-08-04-13-37-12_Realtek": 37.22222222222222, "2014-08-04-13-37-12_Samson": 37.22222222222222, "2014-08-04-13-37-12_Yamaha": 37.22222222222222, "2014-03-21-11-40-39_Samson": "float division by zero", "2014-03-24-13-39-24_Kinect-Beam": "float division by zero", "2014-03-24-13-39-24_Kinect-RAW": "", "2014-03-24-13-39-24_Realtek": "float division by zero", "2014-03-24-13-39-24_Samson": "float division by zero", "2014-03-24-13-39-24_Yamaha": "float division by zero", "2014-03-17-14-48-42_Realtek": 
67.66666666666667, "2014-03-17-14-48-42_Yamaha": 67.66666666666667, "2014-03-17-14-48-42_Microsoft-Kinect-Raw": 67.66666666666667, "2014-03-17-14-16-19_Realtek": 36.486486486486484, "2014-03-17-14-16-19_Yamaha": 36.486486486486484, "2014-03-17-14-16-19_Microsoft-Kinect-Raw": 36.486486486486484, "2014-03-17-14-05-00_Realtek": 28.03571428571429, "2014-03-17-14-05-00_Yamaha": 28.03571428571429, "2014-03-17-14-05-00_Microsoft-Kinect-Raw": 28.03571428571429, "2014-03-17-15-23-49_Realtek": 63.92857142857143, "2014-03-17-15-23-49_Yamaha": 63.92857142857143, "2014-03-17-15-23-49_Microsoft-Kinect-Raw": 63.92857142857143, "2014-03-17-15-15-23_Realtek": 26.18556701030928, "2014-03-17-15-15-23_Yamaha": 26.18556701030928, "2014-03-17-15-15-23_Microsoft-Kinect-Raw": 26.18556701030928, "2014-03-17-14-43-06_Realtek": 75.84905660377359, "2014-03-17-14-43-06_Yamaha": 75.84905660377359, "2014-03-17-14-43-06_Microsoft-Kinect-Raw": 75.84905660377359, "2014-03-17-14-41-54_Realtek": 36.80851063829787, "2014-03-17-14-41-54_Yamaha": 36.80851063829787, "2014-03-17-14-41-54_Microsoft-Kinect-Raw": 36.80851063829787, "2014-03-17-14-06-43_Realtek": 32.407407407407405, "2014-03-17-14-06-43_Yamaha": 32.407407407407405, "2014-03-17-14-06-43_Microsoft-Kinect-Raw": 32.407407407407405, "2014-03-17-11-10-01_Realtek": 270.0, "2014-03-17-11-10-01_Yamaha": 270.0, "2014-03-17-11-10-01_Microsoft-Kinect-Raw": 270.0, "2014-03-17-15-12-53_Realtek": 36.92307692307692, "2014-03-17-15-12-53_Yamaha": 36.92307692307692, "2014-03-17-15-12-53_Microsoft-Kinect-Raw": 36.92307692307692, "2014-03-17-14-36-22_Realtek": 45.116279069767444, "2014-03-17-14-36-22_Yamaha": 45.116279069767444, "2014-03-17-14-36-22_Microsoft-Kinect-Raw": 45.116279069767444, "2014-08-04-13-22-49_Kinect-Beam": 38.22222222222222, "2014-08-04-13-22-49_Kinect-RAW": 39.090909090909086, "2014-08-04-13-22-49_Realtek": 39.090909090909086, "2014-08-04-13-22-49_Samson": 39.090909090909086, "2014-08-04-13-22-49_Yamaha": 39.090909090909086, 
"2014-03-17-14-38-03_Realtek": 25.25, "2014-03-17-14-38-03_Yamaha": 25.25, "2014-03-17-14-38-03_Microsoft-Kinect-Raw": 25.25, "2014-08-27-11-05-29_Kinect-Beam": 34.32835820895522, "2014-08-27-11-05-29_Kinect-RAW": 34.84848484848485, "2014-08-27-11-05-29_Realtek": 34.84848484848485, "2014-08-27-11-05-29_Samson": 34.84848484848485, "2014-08-27-11-05-29_Yamaha": 34.84848484848485, "2014-03-17-14-49-24_Realtek": 31.833333333333332, "2014-03-17-14-49-24_Yamaha": 31.833333333333332, "2014-03-17-14-49-24_Microsoft-Kinect-Raw": 31.833333333333332, "2014-03-17-14-35-31_Realtek": 32.30769230769231, "2014-03-17-14-35-31_Yamaha": 32.30769230769231, "2014-03-17-14-35-31_Microsoft-Kinect-Raw": 32.72727272727273, "2014-03-17-15-13-53_Realtek": 38.958333333333336, "2014-03-17-15-13-53_Yamaha": 38.958333333333336, "2014-03-17-15-13-53_Microsoft-Kinect-Raw": 38.958333333333336, "2014-03-17-14-09-48_Realtek": 26.923076923076923, "2014-03-17-14-09-48_Yamaha": 26.923076923076923, "2014-03-17-14-09-48_Microsoft-Kinect-Raw": 26.923076923076923, "2014-08-04-13-06-09_Kinect-Beam": 48.0, "2014-08-04-13-06-09_Kinect-RAW": 48.0, "2014-08-04-13-06-09_Realtek": 48.0, "2014-08-04-13-06-09_Samson": 48.0, "2014-08-04-13-06-09_Yamaha": 48.0, "2014-03-17-14-17-43_Realtek": 26.09756097560976, "2014-03-17-14-17-43_Yamaha": 26.09756097560976, "2014-03-17-14-17-43_Microsoft-Kinect-Raw": 26.09756097560976, "2014-03-17-14-43-41_Realtek": 109.72972972972973, "2014-03-17-14-43-41_Yamaha": 109.72972972972973, "2014-03-17-14-43-41_Microsoft-Kinect-Raw": 109.72972972972973, "2014-03-17-15-14-56_Realtek": 30.476190476190478, "2014-03-17-15-14-56_Yamaha": 30.476190476190478, "2014-03-17-15-14-56_Microsoft-Kinect-Raw": 30.476190476190478, "2014-03-17-14-20-46_Realtek": 39.473684210526315, "2014-03-17-14-20-46_Yamaha": 39.473684210526315, "2014-03-17-14-20-46_Microsoft-Kinect-Raw": 39.473684210526315, "2015-02-09-15-07-19_Kinect-RAW": "float division by zero", "2015-02-09-15-07-19_Realtek": "float division by 
zero", "2015-02-09-15-07-19_Samson": "float division by zero", "2015-02-09-15-07-19_Yamaha": "float division by zero", "2015-02-09-13-48-26_Kinect-Beam": "float division by zero", "2015-02-09-13-48-26_Kinect-RAW": "float division by zero", "2015-02-09-13-48-26_Realtek": "float division by zero", "2015-02-09-13-48-26_Samson": "float division by zero", "2015-02-09-13-48-26_Yamaha": "float division by zero", "2015-02-09-12-36-46_Kinect-Beam": "float division by zero", "2015-02-09-12-36-46_Kinect-RAW": "float division by zero", "2015-02-09-12-36-46_Realtek": "float division by zero", "2015-02-09-12-36-46_Samson": "float division by zero", "2015-02-09-12-36-46_Yamaha": "float division by zero", "2015-02-09-11-42-09_Realtek": "float division by zero", "2015-02-09-11-42-09_Samson": "float division by zero", "2015-01-27-11-40-44_Samson": 36.7816091954023, "2015-01-28-11-49-53_Kinect-Beam": "float division by zero", "2015-01-28-11-49-53_Kinect-RAW": "float division by zero", "2015-01-28-11-49-53_Realtek": "float division by zero", "2015-02-04-12-29-49_Kinect-Beam": "float division by zero", "2015-02-04-12-29-49_Kinect-RAW": "float division by zero", "2015-02-04-12-29-49_Realtek": "float division by zero", "2015-02-04-12-29-49_Samson": "float division by zero", "2015-02-04-12-29-49_Yamaha": "float division by zero", "2015-01-27-14-37-33_Kinect-Beam": "float division by zero", "2015-01-27-14-37-33_Kinect-RAW": "float division by zero", "2015-01-27-14-37-33_Realtek": "float division by zero", "2015-01-27-14-37-33_Samson": "float division by zero", "2015-01-27-14-37-33_Yamaha": "float division by zero", "2015-02-04-12-36-32_Kinect-Beam": "float division by zero", "2015-02-04-12-36-32_Kinect-RAW": "float division by zero", "2015-02-04-12-36-32_Realtek": "float division by zero", "2015-02-04-12-36-32_Samson": "float division by zero", "2015-02-04-12-36-32_Yamaha": "float division by zero", "2015-02-10-13-45-07_Kinect-Beam": "float division by zero", 
"2015-02-10-13-45-07_Kinect-RAW": "float division by zero", "2015-02-10-13-45-07_Realtek": "float division by zero", "2015-02-10-13-45-07_Samson": "float division by zero", "2015-02-10-13-45-07_Yamaha": "float division by zero", "2015-02-10-14-18-26_Kinect-Beam": "float division by zero", "2015-02-10-14-18-26_Kinect-RAW": "float division by zero", "2015-02-10-14-18-26_Realtek": "float division by zero", "2015-02-10-14-18-26_Samson": "float division by zero", "2015-02-10-14-18-26_Yamaha": "float division by zero"} -------------------------------------------------------------------------------- /data/validation/swc/invalid_character_ratio.json: -------------------------------------------------------------------------------- 1 | {"00000002_0000000002_401810_404760": 28.135593220339093, "00000003_0000000005_2785110_2786660": 25.806451612907768, "00000003_0000000006_3198570_3199890": 25.000000000005514, "00000003_0000000006_3224900_3226230": 25.563909774437487, "00000003_0000000006_3617670_3620200": 28.853754940714367, "00000004_0000000008_193940_194960": 27.450980392156588, "00000004_0000000008_198930_200300": 25.54744525547437, "00000022_0000000029_271060_272130": 28.971962616822616, "00000022_0000000029_695640_696680": 27.88461538461636, "00000028_0000000040_2669840_2671090": 30.4, "00000032_0000000044_789870_791530": 31.927710843374108, "00000042_0000000058_397890_398970": 32.40740740740618, "00000009_0000000067_186870_188230": 33.82352941176507, "00000009_0000000067_369070_370970": 26.842105263157414, "00000059_0000000090_1704230_1705320": 25.68807339449734, "00000061_0000000093_1334740_1336970": 25.56053811659172, "00000008_0000000098_36190_37830": 25.609756097560968, "00000065_0000000099_467240_468290": 27.61904761904732, "00000079_0000000115_117180_118480": 33.07692307692315, "00000087_0000000131_757010_758070": 27.35849056603621, "00000087_0000000141_441790_442970": 28.813559322033733, "00000087_0000000141_1263370_1265180": 25.41436464088155, 
"00000100_0000000144_386950_389050": 28.09523809523779, "00000100_0000000144_419020_420190": 25.64102564102529, "00000100_0000000144_692900_693920": 26.47058823529459, "00000105_0000000153_186510_187820": 25.190839694656447, "00000106_0000000155_99640_100920": 26.562499999999975, "00000110_0000000160_2199170_2201200": 25.615763546801244, "00000093_0000000173_2886580_2887650": 33.644859813078966, "00000119_0000000174_61920_63250": 27.819548872180487, "00000119_0000000174_103480_104770": 27.131782945736603, "00000119_0000000174_662310_663350": 32.69230769230526, "00000122_0000000181_2562110_2563190": 25.000000000001684, "00000134_0000000196_177910_178950": 28.846153846154067, "00000022_0000000197_1018860_1019970": 28.828828828828474, "00000022_0000000197_1023280_1025380": 26.190476190474488, "00000137_0000000205_5205660_5206670": 27.722772277221733, "00000087_0000000221_875470_876800": 28.571428571430136, "00000100_0000000228_1106300_1107750": 29.655172413792172, "00000087_0000000231_2959060_2960230": 29.05982905982725, "00000087_0000000231_2961380_2962570": 25.21008403361229, "00000028_0000000260_915510_916610": 26.36363636363582, "00000169_0000000266_2084250_2085449": 25.83333333333725, "00000169_0000000266_2489130_2490250": 25.892857142859665, "00000169_0000000267_3439280_3440520": 25.000000000004402, "00000169_0000000267_4717590_4719320": 26.589595375729253, "00000169_0000000268_5760610_5761620": 28.712871287122507, "00000169_0000000268_6524390_6525720": 27.06766917293381, "00000169_0000000269_7921160_7922430": 25.196850393692127, "00000169_0000000270_9690710_9692120": 26.241134751741907, "00000169_0000000271_10932830_10934180": 28.148148148140564, "00000177_0000000291_493660_494800": 27.192982456140676, "00000003_0000000295_48640_49730": 26.605504587156055, "00000003_0000000295_323830_325270": 26.388888888888932, "00000180_0000000296_131180_132320": 28.070175438596827, "00000017_0000000304_154700_155960": 29.365079365078916, "00000195_0000000323_346970_348450": 
25.000000000000654, "00000199_0000000331_2279480_2280520": 25.000000000000874, "00000215_0000000362_51900_52940": 26.923076923076945, "00000098_0000000373_54560_56380": 25.27472527472527, "00000220_0000000374_269350_270750": 27.142857142857583, "00000223_0000000383_180850_182880": 26.108374384236438, "00000223_0000000383_708710_709750": 34.61538461538583, "00000223_0000000383_1406320_1408300": 25.25252525252502, "00000223_0000000384_1772450_1773490": 25.000000000000874, "00000000_0000000392_25280_26620": 28.358208955223883, "00000087_0000000417_373920_375280": 25.735294117647875, "00000062_0000000420_475610_477140": 25.490196078431826, "00000062_0000000420_481430_482480": 29.523809523809206, "00000242_0000000424_685470_686780": 32.061068702291415, "00000242_0000000424_1172510_1174280": 27.11864406779689, "00000242_0000000424_1406210_1407250": 29.80769230769335, "00000026_0000000427_425130_426350": 26.229508196720726, "00000230_0000000428_174410_175440": 33.00970873786404, "00000244_0000000430_689560_691330": 25.42372881355795, "00000019_0000000448_1190040_1191400": 27.94117647058562, "00000019_0000000449_3734780_3736300": 25.0000000000003, "00000007_0000000452_76200_77490": 27.131782945736603, "00000087_0000000456_1963100_1964460": 27.94117647058562, "00000267_0000000469_415820_417140": 25.000000000000128, "00000267_0000000469_465730_467430": 25.294117647058993, "00000269_0000000471_96360_97900": 26.623376623376515, "00000270_0000000472_633020_634280": 27.777777777777978, "00000036_0000000478_40430_42520": 25.358851674641105, "00000028_0000000479_397170_398680": 26.490066225165723, "00000028_0000000479_677020_678110": 29.357798165136757, "00000028_0000000479_838550_840280": 26.589595375722265, "00000275_0000000480_179710_180790": 25.925925925926308, "00000092_0000000504_127810_129050": 27.419354838709477, "00000022_0000000506_1386150_1388500": 25.53191489361801, "00000022_0000000506_1389270_1391230": 27.04081632653011, "00000100_0000000526_93570_94870": 
26.153846153845926, "00000100_0000000526_755810_758020": 25.339366515836687, "00000219_0000000528_67460_68720": 26.190476190476083, "00000219_0000000528_291400_292840": 25.694444444444485, "00000141_0000000531_385600_387860": 26.106194690265593, "00000104_0000000540_380430_381590": 25.000000000000686, "00000319_0000000576_156990_158480": 28.187919463087614, "00000319_0000000576_219660_220820": 25.000000000000075, "00000304_0000000579_835640_836720": 26.851851851850835, "00000028_0000000591_23810_26350": 25.98425196850391, "00000041_0000000607_203460_204500": 25.000000000000192, "00000041_0000000607_336910_338460": 25.806451612903984, "00000038_0000000616_1058540_1060380": 28.80434782608468, "00000038_0000000616_2386270_2387310": 33.65384615384733, "00000038_0000000616_2781170_2782450": 29.687500000005908, "00000348_0000000627_1085970_1087080": 29.72972972973241, "00000348_0000000631_5229400_5230670": 32.283464566918035, "00000222_0000000635_206920_207960": 25.961538461537952, "00000222_0000000637_3367130_3368920": 25.698324022346892, "00000222_0000000638_5128600_5129830": 25.20325203252927, "00000222_0000000638_5575140_5576940": 27.222222222233228, "00000222_0000000638_5626160_5627240": 25.92592592592767, "00000353_0000000645_1343310_1344580": 25.98425196850431, "00000356_0000000651_128949_130030": 25.925925925925625, "00000358_0000000653_157410_159080": 25.149700598802156, "00000359_0000000655_2942940_2944590": 27.27272727272577, "00000363_0000000663_390750_392090": 27.611940298507978, "00000104_0000000666_389310_390750": 25.00000000000004, "00000127_0000000672_79790_81100": 28.24427480916056, "00000127_0000000672_462760_464140": 25.362318840579793, "00000127_0000000672_1012680_1014100": 26.76056338028032, "00000127_0000000672_1326600_1327710": 26.12612612612313, "00000100_0000000677_587370_588750": 28.98550724637691, "00000374_0000000680_3392940_3393970": 30.09708737864822, "00000100_0000000681_1216130_1217670": 27.922077922078582, 
"00000219_0000000700_363960_365090": 25.663716814159397, "00000219_0000000700_468080_469120": 37.49999999999926, "00000399_0000000729_62210_63490": 27.343749999999975, "00000406_0000000742_40030_41740": 25.146198830409343, "00000106_0000000752_514520_515710": 26.050420168066033, "00000106_0000000752_613180_614520": 25.373134328357605, "00000106_0000000752_1068400_1069910": 25.165562913907436, "00000106_0000000752_1083600_1084800": 26.666666666665655, "00000106_0000000752_1749690_1750730": 32.69230769230884, "00000044_0000000757_1228370_1229590": 27.049180327868246, "00000044_0000000757_1890780_1891890": 28.82882882882552, "00000100_0000000771_186280_187480": 26.66666666666692, "00000100_0000000771_316610_317660": 33.33333333333297, "00000100_0000000771_859810_863450": 26.923076923076184, "00000418_0000000773_154360_155470": 29.729729729730124, "00000100_0000000776_150050_151090": 25.000000000000192, "00000100_0000000776_188800_190410": 25.465838509317003, "00000100_0000000776_205990_207070": 25.00000000000037, "00000100_0000000776_222990_224500": 25.165562913907436, "00000098_0000000779_123360_126010": 25.283018867924476, "00000421_0000000781_312940_314250": 26.71755725190835, "00000422_0000000784_635440_636480": 33.65384615384733, "00000428_0000000801_1098190_1099750": 30.128205128206183, "00000429_0000000803_99630_100770": 25.438596491228058, "00000009_0000000808_64080_65250": 31.62393162393158, "00000142_0000000814_615630_617770": 25.233644859813246, "00000044_0000000818_875290_876660": 27.007299270072902, "00000104_0000000824_1473880_1475110": 25.203252032524613, "00000104_0000000824_1732300_1733510": 28.099173553718163, "00000437_0000000826_49250_51600": 25.10638297872339, "00000441_0000000833_206420_207710": 25.581395348836804, "00000443_0000000837_10396330_10397490": 25.000000000003137, "00000448_0000000846_211940_212980": 25.96153846153866, "00000169_0000000847_29320_30670": 25.18518518518516, "00000169_0000000847_905500_906850": 28.888888888888403, 
"00000106_0000000866_510870_512340": 26.530612244897466, "00000106_0000000866_1507650_1508730": 25.92592592592767, "00000106_0000000870_1215030_1217130": 26.666666666664934, "00000106_0000000870_1832890_1833910": 27.45098039215735, "00000106_0000000870_2037650_2039310": 30.120481927713485, "00000106_0000000870_2103440_2104830": 25.89928057554194, "00000106_0000000870_2105500_2106570": 27.10280373831361, "00000193_0000000881_4854810_4856510": 28.82352941176779, "00000193_0000000881_5386160_5387210": 32.38095238094677, "00000106_0000000888_838620_839850": 27.642276422763818, "00000106_0000000888_974970_976420": 26.896551724139197, "00000106_0000000888_1185500_1186960": 26.027397260273325, "00000106_0000000889_2614240_2615440": 27.499999999993747, "00000106_0000000889_2678710_2679830": 25.000000000002437, "00000106_0000000889_2703300_2704820": 25.0000000000003, "00000106_0000000889_2840550_2842170": 27.160493827162323, "00000466_0000000897_2344640_2345680": 28.846153846154856, "00000136_0000000907_245620_246790": 26.495726495726778, "00000434_0000000909_116810_118300": 29.53020134228198, "00000193_0000000929_1271890_1273790": 25.78947368421238, "00000483_0000000930_1513220_1514530": 29.007633587787467, "00000483_0000000930_1566750_1567780": 28.15533980582599, "00000022_0000000940_1425490_1426890": 30.71428571428372, "00000490_0000000941_241830_242980": 31.304347826087575, "00000400_0000000944_288800_290000": 25.000000000000238, "00000496_0000000950_56780_59060": 25.877192982456126, "00000503_0000000971_625420_626890": 30.612244897958615, "00000193_0000000972_1477990_1480000": 25.373134328358322, "00000059_0000000981_622400_623490": 27.52293577981571, "00000000_0000000986_333770_335020": 28.8, "00000000_0000000986_378920_379930": 31.683168316831967, "00000087_0000001003_6449170_6450260": 28.440366972473267, "00000523_0000001017_974550_976600": 26.34146341463327, "00000523_0000001017_2348040_2349240": 30.833333333338008, "00000106_0000001019_219310_220320": 
29.70297029702997, "00000106_0000001019_1392100_1393410": 27.480916030530725, "00000525_0000001025_139170_140720": 25.16129032258046, "00000526_0000001026_160140_161690": 26.451612903225612, "00000528_0000001029_564540_565670": 25.663716814159397, "00000093_0000001037_2966940_2968270": 25.563909774437487, "00000533_0000001044_1895170_1896280": 27.927927927930444, "00000535_0000001049_1302410_1303670": 30.952380952381176, "00000007_0000001065_678780_679860": 30.5555555555544, "00000007_0000001065_3282660_3283690": 29.12621359222735, "00000548_0000001083_48550_50200": 25.454545454545368, "00000548_0000001083_1782480_1783620": 25.43859649123091, "00000553_0000001096_885280_886780": 25.333333333333332, "00000553_0000001098_3170850_3172130": 26.562499999995847, "00000557_0000001103_1952370_1955220": 25.263157894735635, "00000557_0000001103_2080920_2082179": 28.571428571433934, "00000022_0000001116_672290_673510": 29.508196721310817, "00000022_0000001116_1847570_1848710": 25.438596491225837, "00000566_0000001118_501500_502900": 25.000000000000405} -------------------------------------------------------------------------------- /data/validation/swc/invalid_all.json: -------------------------------------------------------------------------------- 1 | ["00000000_0000000392_25280_26620", "00000000_0000000986_333770_335020", "00000000_0000000986_378920_379930", "00000002_0000000002_401810_404760", "00000003_0000000005_2785110_2786660", "00000003_0000000006_3198570_3199890", "00000003_0000000006_3224900_3226230", "00000003_0000000006_3617670_3620200", "00000003_0000000295_323830_325270", "00000003_0000000295_48640_49730", "00000004_0000000008_193940_194960", "00000004_0000000008_198930_200300", "00000004_0000000341_184400_186980", "00000005_0000000009_226920_235040", "00000005_0000000009_315530_318630", "00000005_0000000009_328830_334480", "00000005_0000000009_339740_344400", "00000005_0000000009_347200_353590", "00000005_0000000009_383330_393100", 
"00000005_0000000009_433160_440690", "00000005_0000000009_458600_464070", "00000005_0000000009_548600_553100", "00000005_0000000009_601570_605490", "00000005_0000000009_606060_614140", "00000005_0000000009_621180_625920", "00000005_0000000009_655070_661610", "00000005_0000000009_661600_668910", "00000005_0000000009_84130_92790", "00000005_0000000009_889550_896390", "00000006_0000000010_143500_148520", "00000006_0000000010_184520_189900", "00000006_0000000010_322860_325020", "00000006_0000000010_328480_332760", "00000006_0000000010_337610_343000", "00000006_0000000010_76060_78500", "00000007_0000000036_209240_211610", "00000007_0000000180_1390_3630", "00000007_0000000180_228730_231990", "00000007_0000000180_250410_256709", "00000007_0000000180_25630_32960", "00000007_0000000180_60460_70450", "00000007_0000000180_77580_80550", "00000007_0000000182_40970_47740", "00000007_0000000208_204880_206130", "00000007_0000000208_207640_210860", "00000007_0000000209_2262370_2266860", "00000007_0000000209_2767010_2775320", "00000007_0000000209_3615930_3617680", "00000007_0000000209_790700_813330", "00000007_0000000254_2295760_2296880", "00000007_0000000298_1923760_1929320", "00000007_0000000298_343060_350620", "00000007_0000000452_76200_77490", "00000007_0000000491_181670_195220", "00000007_0000000603_70350_75430", "00000007_0000000711_278400_280640", "00000007_0000000719_2870760_2877430", "00000007_0000000719_359080_364040", "00000007_0000000918_1507150_1516030", "00000007_0000000918_3122990_3128940", "00000007_0000001065_1105500_1108660", "00000007_0000001065_3282660_3283690", "00000007_0000001065_678780_679860", "00000007_0000001065_777750_779610", "00000007_0000001067_249460_251820", "00000007_0000001080_1317840_1325750", "00000007_0000001080_279210_284060", "00000007_0000001143_589450_595400", "00000007_0000001143_642050_646060", "00000007_0000001143_965770_969590", "00000007_0000001144_4041090_4044610", "00000008_0000000098_36190_37830", "00000009_0000000014_31520_35960", 
"00000009_0000000067_186870_188230", "00000009_0000000067_369070_370970", "00000009_0000000808_64080_65250", "00000011_0000000016_223670_226330", "00000011_0000000016_250510_254570", "00000011_0000000016_384280_396070", "00000012_0000000017_584680_591670", "00000012_0000000017_951160_952270", "00000017_0000000039_124400_127330", "00000017_0000000071_1496050_1500720", "00000017_0000000304_154700_155960", "00000017_0000000304_836220_839770", "00000017_0000000321_2385370_2390580", "00000017_0000000321_4329820_4336150", "00000017_0000000321_5708310_5710140", "00000017_0000000954_24850_27570", "00000019_0000000025_75740_81190", "00000019_0000000448_1190040_1191400", "00000019_0000000449_3734780_3736300", "00000019_0000000450_4536840_4540930", "00000019_0000000450_4551240_4553980", "00000019_0000000450_5251120_5257410", "00000022_0000000029_20650_24550", "00000022_0000000029_271060_272130", "00000022_0000000029_695640_696680", "00000022_0000000197_1018860_1019970", "00000022_0000000197_1023280_1025380", "00000022_0000000197_33370_36900", "00000022_0000000402_21320_24920", "00000022_0000000506_1386150_1388500", "00000022_0000000506_1389270_1391230", "00000022_0000000940_1425490_1426890", "00000022_0000001116_1847570_1848710", "00000022_0000001116_21380_24990", "00000022_0000001116_672290_673510", "00000023_0000000031_20900_24090", "00000026_0000000427_425130_426350", "00000026_0000000509_37430_39470", "00000026_0000000780_39680_44250", "00000028_0000000040_2669840_2671090", "00000028_0000000148_989470_991020", "00000028_0000000260_915510_916610", "00000028_0000000479_397170_398680", "00000028_0000000479_677020_678110", "00000028_0000000479_838550_840280", "00000028_0000000591_23810_26350", "00000031_0000000043_103550_109590", "00000031_0000000043_56240_58430", "00000031_0000000043_82840_89740", "00000031_0000000043_98630_102090", "00000032_0000000044_1012250_1014010", "00000032_0000000044_789870_791530", "00000034_0000000047_1588560_1589870", 
"00000034_0000000047_2116290_2119600", "00000034_0000000047_3020120_3031280", "00000034_0000000047_3074490_3078650", "00000034_0000000048_3576270_3577970", "00000034_0000000048_3779290_3783580", "00000034_0000000048_4344540_4347940", "00000034_0000000048_4367420_4371860", "00000034_0000000048_4773780_4777500", "00000034_0000000048_6749220_6753320", "00000036_0000000478_40430_42520", "00000038_0000000616_1058540_1060380", "00000038_0000000616_2386270_2387310", "00000038_0000000616_2781170_2782450", "00000038_0000000616_29420_33070", "00000038_0000000616_692560_694440", "00000038_0000000852_699690_701150", "00000038_0000000852_782610_783660", "00000041_0000000607_203460_204500", "00000041_0000000607_336910_338460", "00000041_0000000885_146960_149350", "00000042_0000000058_397890_398970", "00000044_0000000166_1427370_1428810", "00000044_0000000213_3492500_3503710", "00000044_0000000213_3618920_3622750", "00000044_0000000213_3781280_3786270", "00000044_0000000327_1027210_1030609", "00000044_0000000327_2603550_2608890", "00000044_0000000327_385790_388590", "00000044_0000000327_438660_444440", "00000044_0000000334_382350_385410", "00000044_0000000425_1373970_1377460", "00000044_0000000425_1394940_1396560", "00000044_0000000425_3674950_3679210", "00000044_0000000425_3735010_3737060", "00000044_0000000425_3794450_3796920", "00000044_0000000425_736630_742200", "00000044_0000000425_774050_776770", "00000044_0000000516_2194520_2202080", "00000044_0000000516_2321900_2323340", "00000044_0000000516_2338500_2341990", "00000044_0000000516_2977590_2980110", "00000044_0000000516_3061140_3062810", "00000044_0000000516_319750_321230", "00000044_0000000516_3321840_3324160", "00000044_0000000516_3324420_3327870", "00000044_0000000516_3330900_3335300", "00000044_0000000516_947060_948200", "00000044_0000000682_1392640_1393960", "00000044_0000000683_1718800_1723790", "00000044_0000000757_1228370_1229590", "00000044_0000000757_1890780_1891890", "00000044_0000000818_2843970_2851190", 
"00000044_0000000818_875290_876660", "00000045_0000000062_30570_35440", "00000048_0000000065_1780430_1784460", "00000048_0000000414_532840_533930", "00000048_0000000414_616180_617410", "00000048_0000000521_80060_83940", "00000048_0000000521_822910_825010", "00000050_0000000070_75310_79740", "00000054_0000000084_1035359_1037290", "00000056_0000000086_1291730_1293420", "00000056_0000000086_31160_32740", "00000059_0000000090_1704230_1705320", "00000059_0000000090_2665730_2672390", "00000059_0000000090_2682520_2689000", "00000059_0000000462_106200_113260", "00000059_0000000868_1179660_1181010", "00000059_0000000981_622400_623490", "00000061_0000000093_1334740_1336970", "00000062_0000000420_475610_477140", "00000062_0000000420_481430_482480", "00000065_0000000099_467240_468290", "00000070_0000000104_363670_368000", "00000077_0000000112_1548490_1552340", "00000077_0000000112_2390360_2391610", "00000078_0000000348_79430_83820", "00000079_0000000115_117180_118480", "00000083_0000000119_1165140_1168090", "00000083_0000000119_1477480_1483720", "00000083_0000000119_217180_223740", "00000087_0000000131_757010_758070", "00000087_0000000141_1263370_1265180", "00000087_0000000141_441790_442970", "00000087_0000000221_875470_876800", "00000087_0000000231_2959060_2960230", "00000087_0000000231_2961380_2962570", "00000087_0000000232_3564370_3565630", "00000087_0000000232_4138439_4140279", "00000087_0000000376_284870_286180", "00000087_0000000417_373920_375280", "00000087_0000000456_1192410_1197710", "00000087_0000000456_1963100_1964460", "00000087_0000000456_291720_296860", "00000087_0000000468_18100_21160", "00000087_0000000511_26520_29990", "00000087_0000000859_23700_25100", "00000087_0000000902_533770_535570", "00000087_0000001001_3913000_3916440", "00000087_0000001003_5617400_5621810", "00000087_0000001003_6449170_6450260", "00000087_0000001030_276350_280520", "00000087_0000001031_281810_285540", "00000087_0000001124_163270_164640", "00000088_0000000124_26150_28230", 
"00000092_0000000504_127810_129050", "00000093_0000000173_2886580_2887650", "00000093_0000000263_1371840_1373820", "00000093_0000000263_1475380_1477400", "00000093_0000000263_1533950_1536090", "00000093_0000000263_2253680_2255700", "00000093_0000000263_2688090_2689910", "00000093_0000000263_557980_559910", "00000093_0000001037_2966940_2968270", "00000096_0000000134_60770_62760", "00000096_0000000134_995930_1000740", "00000098_0000000373_20270_23720", "00000098_0000000373_54560_56380", "00000098_0000000779_123360_126010", "00000100_0000000144_386950_389050", "00000100_0000000144_419020_420190", "00000100_0000000144_692900_693920", "00000100_0000000228_1106300_1107750", "00000100_0000000526_755810_758020", "00000100_0000000526_93570_94870", "00000100_0000000677_587370_588750", "00000100_0000000681_1216130_1217670", "00000100_0000000681_395010_397820", "00000100_0000000771_186280_187480", "00000100_0000000771_316610_317660", "00000100_0000000771_859810_863450", "00000100_0000000776_150050_151090", "00000100_0000000776_188800_190410", "00000100_0000000776_205990_207070", "00000100_0000000776_222990_224500", "00000103_0000000150_259540_263060", "00000104_0000000151_1889430_1892080", "00000104_0000000151_2826700_2827780", "00000104_0000000540_29700_32920", "00000104_0000000540_380430_381590", "00000104_0000000666_389310_390750", "00000104_0000000824_1201380_1206880", "00000104_0000000824_1259540_1262760", "00000104_0000000824_1473880_1475110", "00000104_0000000824_1732300_1733510", "00000104_0000000824_944490_948390", "00000104_0000000824_961390_963280", "00000104_0000000824_964220_967590", "00000104_0000000824_967670_970090", "00000104_0000000824_970300_971880", "00000104_0000000882_1060450_1062890", "00000104_0000000882_1101720_1105430", "00000104_0000000882_1275740_1277340", "00000104_0000000882_1801220_1805530", "00000104_0000000882_1807650_1813750", "00000104_0000000882_250150_255550", "00000104_0000000882_862610_864780", "00000105_0000000153_186510_187820", 
"00000106_0000000155_1356910_1362170", "00000106_0000000155_99640_100920", "00000106_0000000523_14350_17020", "00000106_0000000752_1068400_1069910", "00000106_0000000752_1083600_1084800", "00000106_0000000752_1749690_1750730", "00000106_0000000752_514520_515710", "00000106_0000000752_613180_614520", "00000106_0000000866_1507650_1508730", "00000106_0000000866_510870_512340", "00000106_0000000869_19950_22570", "00000106_0000000869_666180_668190", "00000106_0000000870_1215030_1217130", "00000106_0000000870_1832890_1833910", "00000106_0000000870_2037650_2039310", "00000106_0000000870_2103440_2104830", "00000106_0000000870_2105500_2106570", "00000106_0000000888_1185500_1186960", "00000106_0000000888_838620_839850", "00000106_0000000888_974970_976420", "00000106_0000000889_2614240_2615440", "00000106_0000000889_2678710_2679830", "00000106_0000000889_2703300_2704820", "00000106_0000000889_2840550_2842170", "00000106_0000001019_1392100_1393410", "00000106_0000001019_219310_220320", "00000110_0000000160_1384070_1387670", "00000110_0000000160_2199170_2201200", "00000110_0000000360_335500_337170", "00000110_0000000360_351070_352550", "00000113_0000000790_2364410_2365640", "00000113_0000000790_894050_896900", "00000117_0000000171_79510_81380", "00000117_0000000225_25190_29240", "00000118_0000000172_1496420_1500380", "00000118_0000000172_1640130_1645500", "00000118_0000000172_995940_1001240", "00000119_0000000174_103480_104770", "00000119_0000000174_61920_63250", "00000119_0000000174_662310_663350", "00000119_0000000507_29970_36720", "00000119_0000000507_53140_58700", "00000122_0000000181_1823290_1826570", "00000122_0000000181_2562110_2563190", "00000122_0000000181_86870_89630", "00000123_0000000183_178350_191720", "00000123_0000000183_247750_253310", "00000126_0000000187_173200_180940", "00000126_0000000274_4833540_4835880", "00000126_0000000274_5152270_5154070", "00000126_0000000274_5157140_5159040", "00000126_0000000275_8156690_8158140", 
"00000127_0000000672_1012680_1014100", "00000127_0000000672_1326600_1327710", "00000127_0000000672_462760_464140", "00000127_0000000672_79790_81100", "00000128_0000000189_23490_27460", "00000129_0000000190_29360_33310", "00000132_0000000210_1400870_1403950", "00000134_0000000196_177910_178950", "00000136_0000000863_49790_54340", "00000136_0000000907_245620_246790", "00000136_0000000907_432400_434740", "00000136_0000000907_434980_436410", "00000136_0000000907_437140_439450", "00000136_0000000907_439510_440760", "00000136_0000000907_441330_442980", "00000137_0000000201_122970_130680", "00000137_0000000201_534580_540690", "00000137_0000000202_1244640_1255260", "00000137_0000000202_1383300_1386510", "00000137_0000000202_1389350_1391400", "00000137_0000000202_1410620_1413460", "00000137_0000000202_1416850_1419580", "00000137_0000000202_1442060_1445160", "00000137_0000000202_1463380_1467240", "00000137_0000000202_1489860_1492750", "00000137_0000000202_1503180_1507540", "00000137_0000000202_1539340_1542940", "00000137_0000000202_1644950_1650120", "00000137_0000000202_1650790_1657420", "00000137_0000000203_2504180_2505890", "00000137_0000000203_2769450_2771820", "00000137_0000000203_2797520_2799710", "00000137_0000000203_3255190_3258400", "00000137_0000000205_5205660_5206670", "00000137_0000000205_5515500_5519480", "00000139_0000000215_316940_318590", "00000141_0000000531_385600_387860", "00000142_0000000814_615630_617770", "00000145_0000000819_1601770_1607510", "00000145_0000000819_213840_216840", "00000145_0000000819_307570_316150", "00000145_0000000819_436250_440690", "00000145_0000000819_833080_839570", "00000145_0000001014_2506230_2507270", "00000145_0000001014_2519850_2527860", "00000145_0000001014_2540500_2544200", "00000149_0000000238_28990_32380", "00000150_0000000239_21880_26390", "00000153_0000000243_104510_109790", "00000168_0000000264_303220_305660", "00000169_0000000266_2084250_2085449", "00000169_0000000266_2489130_2490250", 
"00000169_0000000267_3439280_3440520", "00000169_0000000267_4717590_4719320", "00000169_0000000268_5760610_5761620", "00000169_0000000268_6524390_6525720", "00000169_0000000269_7921160_7922430", "00000169_0000000270_9690710_9692120", "00000169_0000000271_10932830_10934180", "00000169_0000000847_29320_30670", "00000169_0000000847_905500_906850", "00000173_0000000283_312750_314070", "00000174_0000001114_29300_33200", "00000174_0000001114_512740_514390", "00000177_0000000291_493660_494800", "00000178_0000000498_111660_115720", "00000180_0000000296_131180_132320", "00000181_0000000297_1705900_1708950", "00000187_0000000309_268980_276060", "00000189_0000000311_23390_26890", "00000189_0000000311_599320_603330", "00000189_0000000311_988580_990460", "00000193_0000000881_4854810_4856510", "00000193_0000000881_5386160_5387210", "00000193_0000000929_1271890_1273790", "00000193_0000000972_1477990_1480000", "00000193_0000000972_1772870_1775670", "00000195_0000000323_346970_348450", "00000198_0000001062_786560_791110", "00000199_0000000331_2279480_2280520", "00000201_0000000338_447020_458200", "00000202_0000000339_369940_373370", "00000202_0000000339_375130_377530", "00000204_0000000880_832240_836110", "00000204_0000000970_455440_460030", "00000206_0000000345_510010_515260", "00000206_0000000345_518659_523690", "00000206_0000000345_538940_542780", "00000206_0000000556_1976880_1979610", "00000206_0000000556_1982280_1984700", "00000206_0000000556_2013100_2015250", "00000215_0000000362_51900_52940", "00000219_0000000528_291400_292840", "00000219_0000000528_67460_68720", "00000219_0000000700_123900_126250", "00000219_0000000700_18760_22880", "00000219_0000000700_363960_365090", "00000219_0000000700_468080_469120", "00000220_0000000374_269350_270750", "00000222_0000000635_206920_207960", "00000222_0000000637_3367130_3368920", "00000222_0000000638_5128600_5129830", "00000222_0000000638_5575140_5576940", "00000222_0000000638_5626160_5627240", "00000223_0000000383_1406320_1408300", 
"00000223_0000000383_180850_182880", "00000223_0000000383_708710_709750", "00000223_0000000384_1772450_1773490", "00000226_0000000388_1259030_1263170", "00000230_0000000428_174410_175440", "00000230_0000000428_286190_290260", "00000235_0000000410_4470560_4472690", "00000236_0000000413_550640_552070", "00000237_0000000415_251920_256980", "00000237_0000000415_840160_841600", "00000241_0000000466_88390_89890", "00000242_0000000424_1172510_1174280", "00000242_0000000424_1406210_1407250", "00000242_0000000424_685470_686780", "00000244_0000000430_689560_691330", "00000247_0000000433_224420_226100", "00000247_0000000433_242680_245180", "00000247_0000000433_253300_258800", "00000247_0000000433_289200_294050", "00000253_0000000440_686200_693330", "00000259_0000000451_16650_18370", "00000259_0000000451_18360_19970", "00000267_0000000469_415820_417140", "00000267_0000000469_465730_467430", "00000267_0000000696_1533330_1535900", "00000267_0000000696_418140_420050", "00000267_0000000696_634320_635550", "00000269_0000000471_96360_97900", "00000270_0000000472_44410_45490", "00000270_0000000472_633020_634280", "00000275_0000000480_179710_180790", "00000277_0000000610_40170_46470", "00000278_0000000483_650630_651800", "00000280_0000000487_420710_421840", "00000292_0000000522_33170_38090", "00000298_0000000534_1856480_1857510", "00000298_0000000535_4479230_4493500", "00000298_0000000535_4758500_4764110", "00000301_0000000541_26210_29340", "00000304_0000000579_835640_836720", "00000318_0000000575_258589_265170", "00000318_0000000575_266030_270840", "00000318_0000000575_271160_275530", "00000319_0000000576_156990_158480", "00000319_0000000576_219660_220820", "00000330_0000000590_28650_32159", "00000331_0000000592_1964700_1976720", "00000331_0000000592_2010090_2020020", "00000331_0000000592_2054449_2059090", "00000331_0000000592_2259730_2260970", "00000331_0000000592_2329500_2330770", "00000335_0000000849_337170_342420", "00000338_0000000601_14870_20060", 
"00000338_0000000601_698840_710580", "00000342_0000000609_573930_581250", "00000344_0000000614_23940_29440", "00000348_0000000627_1085970_1087080", "00000348_0000000631_5229400_5230670", "00000348_0000000631_6434120_6439480", "00000353_0000000645_1343310_1344580", "00000353_0000000645_655630_662270", "00000356_0000000651_128949_130030", "00000357_0000000652_25530_28280", "00000358_0000000653_110990_114360", "00000358_0000000653_157410_159080", "00000359_0000000654_16010_20360", "00000359_0000000654_2248010_2254200", "00000359_0000000654_2276780_2282270", "00000359_0000000654_2291800_2293050", "00000359_0000000654_2305170_2309740", "00000359_0000000654_2362720_2367630", "00000359_0000000655_2942940_2944590", "00000359_0000000655_4819840_4827300", "00000359_0000000655_4863250_4866280", "00000363_0000000663_250310_251590", "00000363_0000000663_390750_392090", "00000363_0000000663_56340_60620", "00000364_0000000664_46510_51740", "00000364_0000000664_565860_581900", "00000371_0000000673_26470_27600", "00000372_0000000676_185660_188310", "00000372_0000000676_283970_288380", "00000373_0000000678_2735090_2738310", "00000373_0000000678_3605550_3616960", "00000374_0000000680_2553270_2556500", "00000374_0000000680_3392940_3393970", "00000379_0000000690_84660_89190", "00000380_0000000692_1020_2370", "00000380_0000000692_162680_165170", "00000380_0000000692_170150_175580", "00000380_0000000692_217850_220850", "00000380_0000000692_231050_233480", "00000380_0000000692_233680_234830", "00000380_0000000692_248110_251230", "00000380_0000000692_252000_256709", "00000380_0000000692_43250_46800", "00000388_0000000706_25890_33150", "00000392_0000000717_22210_24580", "00000398_0000000728_25510_29770", "00000399_0000000729_62210_63490", "00000400_0000000944_288800_290000", "00000406_0000000742_40030_41740", "00000412_0000000759_1062550_1065170", "00000412_0000000759_23900_28120", "00000412_0000000759_4779460_4781330", "00000416_0000000769_4569140_4574610", 
"00000418_0000000773_154360_155470", "00000421_0000000781_312940_314250", "00000422_0000000784_635440_636480", "00000424_0000000793_30850_33080", "00000425_0000000795_1884090_1888520", "00000427_0000000798_408160_410930", "00000427_0000000798_56010_58900", "00000427_0000000798_60830_64209", "00000428_0000000801_1098190_1099750", "00000429_0000000803_99630_100770", "00000430_0000000806_2191690_2197260", "00000430_0000000806_2197390_2200360", "00000430_0000000806_2506950_2512950", "00000431_0000000812_911020_919150", "00000431_0000000812_926070_929940", "00000434_0000000909_116810_118300", "00000437_0000000826_49250_51600", "00000438_0000000827_218740_221630", "00000441_0000000833_206420_207710", "00000443_0000000837_10101940_10108760", "00000443_0000000837_10396330_10397490", "00000445_0000000840_2674570_2675770", "00000445_0000000840_2678430_2679690", "00000448_0000000846_211940_212980", "00000449_0000000856_103910_107320", "00000449_0000000856_172130_176740", "00000449_0000000856_177640_182630", "00000449_0000000856_218990_220990", "00000449_0000000856_79690_83000", "00000449_0000000856_92750_95820", "00000450_0000000858_29130_32850", "00000452_0000000862_35360_39430", "00000452_0000000862_430620_435250", "00000456_0000000874_26330_30000", "00000460_0000000878_1070660_1071970", "00000460_0000000878_213270_218490", "00000466_0000000897_1076950_1082440", "00000466_0000000897_2344640_2345680", "00000468_0000000899_4446650_4448710", "00000468_0000000899_4596940_4602730", "00000479_0000000924_1401740_1411440", "00000483_0000000930_1513220_1514530", "00000483_0000000930_1566750_1567780", "00000490_0000000941_241830_242980", "00000492_0000000943_1540380_1547430", "00000492_0000000943_2078639_2080510", "00000492_0000000943_3062660_3063680", "00000492_0000000943_3083250_3088340", "00000492_0000000943_3135650_3138940", "00000492_0000000943_3148160_3152490", "00000492_0000000943_3169490_3175630", "00000495_0000000948_1288990_1292060", "00000495_0000000948_3193220_3199550", 
"00000495_0000000948_3225080_3227730", "00000495_0000000948_3228330_3232170", "00000496_0000000950_56780_59060", "00000503_0000000971_625420_626890", "00000510_0000000983_1744650_1751420", "00000510_0000000983_1764420_1768910", "00000510_0000000983_1785550_1791140", "00000510_0000000983_3332640_3338740", "00000519_0000001009_103870_106630", "00000523_0000001017_2348040_2349240", "00000523_0000001017_974550_976600", "00000525_0000001025_139170_140720", "00000526_0000001026_160140_161690", "00000528_0000001029_564540_565670", "00000531_0000001039_4600640_4604560", "00000531_0000001040_6406130_6417170", "00000531_0000001040_6433480_6437030", "00000531_0000001040_6484830_6487710", "00000531_0000001040_6489830_6502420", "00000532_0000001041_401470_405040", "00000533_0000001044_1895170_1896280", "00000535_0000001049_1302410_1303670", "00000548_0000001083_1415660_1417780", "00000548_0000001083_1782480_1783620", "00000548_0000001083_48550_50200", "00000553_0000001096_885280_886780", "00000553_0000001098_3170850_3172130", "00000553_0000001098_3463410_3466760", "00000554_0000001099_285020_287430", "00000555_0000001101_36620_40920", "00000556_0000001102_35000_38730", "00000557_0000001103_1952370_1955220", "00000557_0000001103_2080920_2082179", "00000562_0000001111_1772300_1774680", "00000562_0000001111_2751130_2756580", "00000563_0000001112_26070_29950", "00000566_0000001118_501500_502900"] -------------------------------------------------------------------------------- /notebooks/analyze_validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import audiomate\n", 10 | "import IPython.display as ipd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "path = 'data/download/swc'\n", 20 | "ds = 
audiomate.Corpus.load(path, reader='swc')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "path = 'data/download/common_voice'\n", 30 | "ds = audiomate.Corpus.load(path, reader='common-voice')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 26, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "finanzielle Unabhängigkeit\n" 43 | ] 44 | }, 45 | { 46 | "data": { 47 | "text/html": [ 48 | "\n", 49 | " \n", 53 | " " 54 | ], 55 | "text/plain": [ 56 | "" 57 | ] 58 | }, 59 | "execution_count": 26, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "utt = ds.utterances['common_voice_de_17737392']\n", 66 | "transcript = utt.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT].join()\n", 67 | "samples = utt.read_samples(sr=16000)\n", 68 | "\n", 69 | "print(transcript)\n", 70 | "ipd.Audio(samples, rate=16000)" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.7.4" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 4 95 | } 96 | -------------------------------------------------------------------------------- /data/validation/swc/invalid_transcripts.json: -------------------------------------------------------------------------------- 1 | {"00000005_0000000009_84130_92790": ["in den heutzutage üblichen Antiqua Schriftarten war jedoch auch eine Ligatur aus langem ſ und s", ["ſ"]], "00000005_0000000009_226920_235040": ["Dabei wurde für die häufig auftretende 
Buchstabenkombination aus langem ſ und z mit Unterschlinge", ["ſ"]], "00000005_0000000009_315530_318630": ["eine Ligatur von langem ſ und rundem", ["ſ"]], "00000005_0000000009_328830_334480": ["Dagegen gibt es für eine ſs Ligatur viel ältere Belegstellen", ["ſ"]], "00000005_0000000009_339740_344400": ["zu Eszett und ſs Ligatur ist umstritten", ["ſ"]], "00000005_0000000009_347200_353590": ["Die ſs Ligatur in der Antiqua", ["ſ"]], "00000005_0000000009_383330_393100": ["Sie erscheint sowohl in Handschriften als auch im Druck bis Ende des siebzehnte Jahrhunderts als eine Alternative für ſſ beziehungsweise", ["ſ"]], "00000005_0000000009_433160_440690": ["England und eingeschränkt in Deutschland als Äquivalent zu ß die Ligaturform ſs", ["ſ"]], "00000005_0000000009_458600_464070": ["wo sie in freier Variation zur ſſ-Ligatur steht", ["ſ"]], "00000005_0000000009_548600_553100": ["je nach orthografischer Konvention alternierend zu ſs", ["ſ"]], "00000005_0000000009_601570_605490": ["Verwendung von ſs erfolgte weiterhin", ["ſ"]], "00000005_0000000009_606060_614140": ["auch nachdem im Antiquasatz das gewöhnliche ſ im späten achtzehnte Jahrhundert unüblich geworden war", ["ſ"]], "00000005_0000000009_621180_625920": ["im Antiquasatz die Buchstabenfolge ſs verwendet werden sollte", ["ſ"]], "00000005_0000000009_655070_661610": ["wurde ß bis Ende des neunzehnte Jahrhunderts gerne durch ſs wiedergegeben", ["ſ"]], "00000005_0000000009_661600_668910": ["Da das Lang-ſ der Kursivschrift grafisch mit dem h der Kurrentschrift übereinstimmte", ["ſ"]], "00000005_0000000009_889550_896390": ["Das sogenannte lange Antiqua-ſ wird oben mit einem z verbunden", ["ſ"]], "00000006_0000000010_76060_78500": ["2½ D versus drei D", ["½", "2"]], "00000006_0000000010_143500_148520": ["Dieses Verfahren wird häufig als 2½ D bezeichnet", ["½", "2"]], "00000006_0000000010_184520_189900": ["Gegenüber 2½ D aber höhere Datenraten / Speicherplatz erforderlich", ["½", "2"]], "00000006_0000000010_322860_325020": 
["abgekürzt 2½D oder", ["½", "2"]], "00000006_0000000010_328480_332760": ["Der Unterschied zwischen 2½ D und drei D liegt in der Art der Höhe", ["½", "2"]], "00000006_0000000010_337610_343000": ["kann es bei 2½D Modellen zu unerwarteten optischen Effekten kommen", ["½", "2"]], "00000009_0000000014_31520_35960": ["Im Morsealphabet wird A oder a mit • – dargestellt", ["•"]], "00000011_0000000016_223670_226330": ["neunzehn Hundert neunzig wurde Photoshop S1.0 von", [".", "0", "1"]], "00000011_0000000016_250510_254570": ["Mit der Version S5.5 kam mit Photoshop", [".", "5"]], "00000011_0000000016_384280_396070": ["→ Siehe auch", ["→"]], "00000012_0000000017_584680_591670": ["Coortes bekanntes Œuvre ist mit etwas mehr als sechzig signierten Werken relativ klein", ["œ"]], "00000012_0000000017_951160_952270": ["* 1620", ["*"]], "00000019_0000000025_75740_81190": ["† zwölfte Oktober vierzehn Hundert fünfunddreißig bei Straubing", ["†"]], "00000022_0000000029_20650_24550": ["† achtzehnte April neunzehn Hundert fünfundfünfzig in Princeton", ["†"]], "00000023_0000000031_20900_24090": ["† November neunzehn Hundert dreißig in Grönland", ["†"]], "00000007_0000000036_209240_211610": ["sowie C D ³ ² Reviews", ["²", "³"]], "00000017_0000000039_124400_127330": ["griechischen Wort κέρας für Horn ab", ["ς", "α", "ρ", "κ", "έ"]], "00000031_0000000043_56240_58430": ["der magnetischen Feldkonstante μ0", ["0", "μ"]], "00000031_0000000043_82840_89740": ["ein Meter zwischen diesen Leitern eine Kraft von 2·10−7 Newton pro Meter Leiterlänge hervorrufen würde", ["0", "·", "2", "1"]], "00000031_0000000043_98630_102090": ["gleich eins C·s eins", ["·"]], "00000031_0000000043_103550_109590": ["Dies bedeutet einen Durchsatz von 6,24151·1018", ["4", "6", "·", "5", "2", ",", "8", "0", "1"]], "00000032_0000000044_1012250_1014010": ["→Chronologie der Sodomiegesetze", ["→"]], "00000034_0000000047_1588560_1589870": ["Insel Oʻahu", ["ʻ"]], "00000034_0000000047_2116290_2119600": ["So erhielten zum Beispiel 
die Befehlshaber auf Hawaiʻi", ["ʻ"]], "00000034_0000000047_3020120_3031280": ["So hatte man beispielsweise die Heeres Flak bei der neu gebauten Kāneʻohe Naval Air Station wenige Tage vorher wieder in die Kasernen verlegt", ["ʻ"]], "00000034_0000000047_3074490_3078650": ["Oktober neunzehn Hundert einundvierzig auf Hawaiʻi eingetroffen waren", ["ʻ"]], "00000034_0000000048_3576270_3577970": ["nördlich von Oʻahu", ["ʻ"]], "00000034_0000000048_3779290_3783580": ["die seit weniger als einem Monat auf Oʻahu eingesetzt wurden", ["ʻ"]], "00000034_0000000048_4344540_4347940": ["Ford Island und Kāneʻohe an", ["ʻ"]], "00000034_0000000048_4367420_4371860": ["blieb nur der kleine Flugplatz Haleʻiwa verschont", ["ʻ"]], "00000034_0000000048_4773780_4777500": ["Träger westlich oder südlich von Hawaiʻi standen", ["ʻ"]], "00000034_0000000048_6749220_6753320": ["einen Angriff auf Hawaiʻi in die Kriegsplanungen aufzunehmen", ["ʻ"]], "00000045_0000000062_30570_35440": ["† sechzehnte Mai neunzehn Hundert fünfundsechzig bei Bellinzona", ["†"]], "00000048_0000000065_1780430_1784460": ["Mindesteinbaudicke gleich Größtkorn x2,5", [",", "5", "2"]], "00000050_0000000070_75310_79740": ["beziehungsweise drei Hundert dreiundzwanzig Komma sechs Million km³", ["³"]], "00000017_0000000071_1496050_1500720": ["etwa eins Komma sechs sechs × 10−27", ["×"]], "00000054_0000000084_1035359_1037290": ["hier kann man einfach alles kaufen…", ["…"]], "00000056_0000000086_31160_32740": ["† dreiundzwanzigste Mai", ["†"]], "00000056_0000000086_1291730_1293420": ["Jahren in den Œuvres complètes", ["œ"]], "00000059_0000000090_2665730_2672390": ["Die ein Hundert größten Unternehmen des Landes haben zwei Tausend eins für rund fünfzig Milliarde A$ Waren", ["$"]], "00000059_0000000090_2682520_2689000": ["rund ein Hundert vierundfünfzig Milliarde A$ und macht damit über zwanzig Prozent des", ["$"]], "00000070_0000000104_363670_368000": ["Im Juli zwei Tausend dreizehn wurde das 100.000ste Fahrzeug ausgeliefert", ["0", 
".", "1"]], "00000077_0000000112_1548490_1552340": ["Das passiert alles durch Andeutungen … Ich frage mich", ["…"]], "00000077_0000000112_2390360_2391610": ["…", ["…"]], "00000083_0000000119_217180_223740": ["Die modernisierten Fahrzeuge werden als Typ R 2.2b bezeichnet", [".", "2"]], "00000083_0000000119_1165140_1168090": ["3×120 Kilowatt erhöht", ["2", "3", "×", "0", "1"]], "00000083_0000000119_1477480_1483720": ["Die modernisierten Fahrzeuge werden als Baureihe R 2.2b bezeichnet", [".", "2"]], "00000088_0000000124_26150_28230": ["† sechzehnte Juli zwei Tausend eins", ["†"]], "00000096_0000000134_60770_62760": ["† zwanzigste Juli", ["†"]], "00000096_0000000134_995930_1000740": ["/", []], "00000028_0000000148_989470_991020": ["→ Hauptartikel", ["→"]], "00000103_0000000150_259540_263060": ["Tonumfang beträgt nicht wesentlich mehr als 1½ Oktaven", ["½", "1"]], "00000104_0000000151_1889430_1892080": ["das an die α1 Rezeptoren anbindet", ["α", "1"]], "00000104_0000000151_2826700_2827780": ["–", []], "00000106_0000000155_1356910_1362170": ["in einen 3,4×5,5 m winzigen Käfig auf dem Achterdeck sperren", ["4", "5", ",", "3", "×"]], "00000110_0000000160_1384070_1387670": ["• einheimische Volksgruppen", ["•"]], "00000044_0000000166_1427370_1428810": ["→ Postfälschung", ["→"]], "00000117_0000000171_79510_81380": ["um bei Peder Severin Krøyer", ["ø"]], "00000118_0000000172_995940_1001240": ["etwa 3×3½ Zentimeter großes Einzelexemplar aus Blei ist", ["3", "½", "×"]], "00000118_0000000172_1496420_1500380": ["B C ” aus der Sammlung Melancholike Humours…", ["…"]], "00000118_0000000172_1640130_1645500": ["easie Introduction to Practical Musicke… ein Lied über das Hornbuch", ["…"]], "00000007_0000000180_1390_3630": ["Artikel C D ³ ²", ["²", "³"]], "00000007_0000000180_25630_32960": ["Das C D ³ ² war eine von Commodore International auf Basis des Amiga", ["²", "³"]], "00000007_0000000180_60460_70450": ["einen Einschub an der Rückseite konnte das C D ³ ² aber zu einem vollwertigen 
Amiga oder alternativ zu einem Video C D Player erweitert werden", ["²", "³"]], "00000007_0000000180_77580_80550": ["C D ³ ² besitzt statt des im Amiga", ["²", "³"]], "00000007_0000000180_228730_231990": ["C D ³ ² war die letzte Hoffnung Commodores", ["²", "³"]], "00000007_0000000180_250410_256709": ["den U S A durfte das C D ³ ² wegen eines Lizenzstreites offiziell nie verkauft werden", ["²", "³"]], "00000122_0000000181_86870_89630": ["siːˈɛn ˌtaʊəɹ", ["ə", "ː", "ˌ", "ˈ", "ʊ", "ɛ", "ɹ"]], "00000122_0000000181_1823290_1826570": ["einzelnen Glasplatten messen 1×1,5 Meter", [",", "5", "×", "1"]], "00000007_0000000182_40970_47740": ["† dreiundzwanzigste Februar achtzehn Hundert fünfundfünfzig in Göttingen", ["†"]], "00000123_0000000183_178350_191720": ["Die Behälter sind entsprechend den Zwischenlagergenehmigungen für die Aufnahme von maximal ein Hundert achtzig Kilogramm Schwermetall sowie ein Aktivitätsinventar von maximal 1,2·1018 B q zugelassen", ["·", "2", ",", "8", "0", "1"]], "00000123_0000000183_247750_253310": ["Jahr zwei Tausend zehn wurden erstmals Behälter des neuen Typs CASTOR HAW28M", ["8", "2"]], "00000126_0000000187_173200_180940": ["Forat umfasste etwa eins Komma acht × eins Komma drei Kilometer und war damit auch eine ansehnliche Stadt", ["×"]], "00000128_0000000189_23490_27460": ["† einunddreißigste März achtzehn Hundert fünfundfünfzig in Haworth", ["†"]], "00000129_0000000190_29360_33310": ["† neunte Oktober neunzehn Hundert siebenundsechzig in", ["†"]], "00000022_0000000197_33370_36900": ["† zwanzigste Mai fünfzehn Hundert sechs in Valladolid", ["†"]], "00000137_0000000201_122970_130680": ["wie dem Kassettenlaufwerk Datasette oder dem 5¼″ Diskettenlaufwerk V C ein Tausend fünf Hundert einundvierzig", ["″", "¼", "5"]], "00000137_0000000201_534580_540690": ["Viele Spiele waren gleichzeitig auf einer 5¼ Zoll Diskette für beide Systeme erhältlich", ["¼", "5"]], "00000137_0000000202_1244640_1255260": ["Die restlichen vier K B enthalten zwei 
Zeichensätze à zwei Hundert sechsundfünfzig Zeichen in 8×8 Matrixdarstellung für den Bildschirm", ["8", "×"]], "00000137_0000000202_1383300_1386510": ["40×25 Zeichen Textmodus", ["4", "5", "2", "×", "0"]], "00000137_0000000202_1389350_1391400": ["8×8 Pixel pro Zeichen", ["8", "×"]], "00000137_0000000202_1410620_1413460": ["40×25 Zeichen Textmodus", ["4", "5", "2", "×", "0"]], "00000137_0000000202_1416850_1419580": ["4×8 doppelt breite Pixel pro Zeichen", ["4", "8", "×"]], "00000137_0000000202_1442060_1445160": ["40×25 Zeichen Textmodus", ["4", "5", "2", "×", "0"]], "00000137_0000000202_1463380_1467240": ["160×200 doppelt breite Pixel", ["6", "2", "×", "0", "1"]], "00000137_0000000202_1489860_1492750": ["320×200 Pixel", ["0", "3", "×", "2"]], "00000137_0000000202_1503180_1507540": ["zwei individuelle Farben je 8×8 Pixel Block", ["8", "×"]], "00000137_0000000202_1539340_1542940": ["12×21 doppelt breite Pixel für Multicolor Sprites", ["1", "×", "2"]], "00000137_0000000202_1644950_1650120": ["$dbff eingeblendet werden kann", ["$"]], "00000137_0000000202_1650790_1657420": ["ist aus Geschwindigkeitsgründen ein einzelner 1024×4 Bit S R A M Chip", ["4", "2", "×", "0", "1"]], "00000137_0000000203_2504180_2505890": ["5¼ Zoll Diskettenlaufwerk", ["¼", "5"]], "00000137_0000000203_2769450_2771820": ["3½ Zoll Diskettenlaufwerk", ["3", "½"]], "00000137_0000000203_2797520_2799710": ["3½ Zoll D D Disketten", ["3", "½"]], "00000137_0000000203_3255190_3258400": ["mit Auflösungen von 300×300 Pixeln", ["0", "3", "×"]], "00000137_0000000205_5515500_5519480": ["der B A S I C V2.0 Code in einen kompakten", [".", "0", "2"]], "00000007_0000000208_204880_206130": ["IF…THEN", ["…"]], "00000007_0000000208_207640_210860": ["und ON…GOTO / ON…GOSUB", ["…"]], "00000007_0000000209_790700_813330": ["Einem breiteren Publikum öffentlich vorgestellt wurden die Prototypen der Commodore zwei Hundert vierundsechzig Modelle und des Commodore drei Hundert vierundsechzig mitsamt einem neuentwickelten 5¼ Zoll 
Diskettenlaufwerk S F S vier Hundert einundachtzig erstmals auf der Fachmesse Winter", ["¼", "5"]], "00000007_0000000209_2262370_2266860": ["besseren Unterscheidbarkeit üblicherweise ein $-Symbol vorangestellt", ["$"]], "00000007_0000000209_2767010_2775320": ["dieses zunächst S F S vier Hundert einundachtzig genannten 5¼ Zoll Diskettenlaufwerks erheblich", ["¼", "5"]], "00000007_0000000209_3615930_3617680": ["Octasoft B A S I C V7.0", [".", "7", "0"]], "00000132_0000000210_1400870_1403950": ["den Versionen 8.x bis 9.x eingesetzt", [".", "9", "8"]], "00000044_0000000213_3492500_3503710": ["neunzehn Hundert zweiundneunzig wurde mit Win16.Vir _ eins _ vier das erste Computervirus für das Betriebssystem Microsoft Windows drei Punkt eins eins registriert", [".", "6", "1"]], "00000044_0000000213_3618920_3622750": ["und W32.Marburg die ersten polymorphen", [".", "3", "2"]], "00000044_0000000213_3781280_3786270": ["Einige Monate später wurde der Virus Win64.Rugrad entdeckt", ["4", "6", "."]], "00000139_0000000215_316940_318590": ["† erste Februar 1733", ["†"]], "00000117_0000000225_25190_29240": ["† vierzehnte September dreizehn Hundert einundzwanzig in Ravenna", ["†"]], "00000087_0000000232_3564370_3565630": ["Geschichte konzentrieren…", ["…"]], "00000087_0000000232_4138439_4140279": ["die wir noch mehr hassen als die Römer…", ["…"]], "00000149_0000000238_28990_32380": ["† zweite August zwei Tausend sechzehn in Santa", ["†"]], "00000150_0000000239_21880_26390": ["† fünfundzwanzigste August siebzehn Hundert sechsundsiebzig in Edinburgh", ["†"]], "00000153_0000000243_104510_109790": ["Karte des vereinigten Koreas und der Aufschrift 3대헌장 emporhalten", ["헌", "3", "대", "장"]], "00000007_0000000254_2295760_2296880": ["‚", []], "00000093_0000000263_557980_559910": ["→ Hauptartikel", ["→"]], "00000093_0000000263_1371840_1373820": ["→ Hauptartikel", ["→"]], "00000093_0000000263_1475380_1477400": ["→ Hauptartikel", ["→"]], "00000093_0000000263_1533950_1536090": ["→ Hauptartikel", 
["→"]], "00000093_0000000263_2253680_2255700": ["→ Hauptartikel", ["→"]], "00000093_0000000263_2688090_2689910": ["→ Hauptartikel", ["→"]], "00000168_0000000264_303220_305660": ["Stunde noch nicht …", ["…"]], "00000126_0000000274_4833540_4835880": ["→ Historisches Territorium", ["→"]], "00000126_0000000274_5152270_5154070": ["→ Deutscher Dualismus", ["→"]], "00000126_0000000274_5157140_5159040": ["→ Drittes Deutschland", ["→"]], "00000126_0000000275_8156690_8158140": ["als auch Sønderjysk", ["ø"]], "00000173_0000000283_312750_314070": ["–", []], "00000181_0000000297_1705900_1708950": ["der E gleich mc² stand ein Jojo auslösen", ["²"]], "00000007_0000000298_343060_350620": ["In einem Interview vom sechzehnte Mai zwei Tausend eins auf der Spielemesse E³ sagte Miyamoto über den Namen", ["³"]], "00000007_0000000298_1923760_1929320": ["Für den Nintendo D S wurden auf der E³ zwei Tausend sechs Donkey Kong", ["³"]], "00000017_0000000304_836220_839770": ["sunt hec … in civitate nostra Dresdene", ["…"]], "00000187_0000000309_268980_276060": ["beanspruchte eine Fläche von zehn Meter × siebzehn Meter und wog siebenundzwanzig Tonnen", ["×"]], "00000189_0000000311_23390_26890": ["† siebte Oktober achtzehn Hundert neunundvierzig in Baltimore", ["†"]], "00000189_0000000311_599320_603330": ["Mein ererbter Reichtum ermöglichte mir ungewöhnlich ausgedehnte Studien…", ["…"]], "00000189_0000000311_988580_990460": ["bis die Zeitung abends in Druck ging…", ["…"]], "00000017_0000000321_2385370_2390580": ["Eine Toleranz von ±5 Prozent sollte daher eingehalten werden", ["±", "5"]], "00000017_0000000321_4329820_4336150": ["Bei Überlagerungsempfängern ist hierbei nur die Differenz f1−f2 von Bedeutung", ["2", "1"]], "00000017_0000000321_5708310_5710140": ["zum Beispiel 2Ж27Л", ["ж", "7", "2", "л"]], "00000044_0000000327_385790_388590": ["παιδεία verwies auf die", ["δ", "π", "ε", "ί", "α", "ι"]], "00000044_0000000327_438660_444440": ["παιδεία eine Vorbildung für die von ihnen vorgestellte 
Spezialisierung", ["δ", "π", "ε", "ί", "α", "ι"]], "00000044_0000000327_1027210_1030609": ["† siebenundzwanzig vor Christus", ["†"]], "00000044_0000000327_2603550_2608890": ["verwies auf sich selbst sowie auf die französische Encyclopædia Universalis", ["æ"]], "00000044_0000000334_382350_385410": ["Erde ist 23°26", ["°", "3", "6", "2"]], "00000201_0000000338_447020_458200": ["Sie wurde an der Küste Norwegens bis dreiundsechzig °N nachgewiesen und hat an küstennahen Standorten im südlichen Schweden und auf Bornholm Fuß gefasst", ["°"]], "00000202_0000000339_369940_373370": ["Nordeuropa westlich der Ostsee bis etwa 65°N", ["°", "6", "5"]], "00000202_0000000339_375130_377530": ["davon teils bis 66°N", ["°", "6"]], "00000004_0000000341_184400_186980": ["… hat denselben Vater wie", ["…"]], "00000206_0000000345_510010_515260": ["er sucht eine Kongruenz der Form x zwei ≡ y zwei", ["≡"]], "00000206_0000000345_518659_523690": ["multipliziert er geeignete Kongruenzen der Form x zwei ≡ y", ["≡"]], "00000206_0000000345_538940_542780": ["um Kongruenzen der Form x zwei ≡ y", ["≡"]], "00000078_0000000348_79430_83820": ["† sechsundzwanzigste Februar neunzehn Hundert dreizehn in Dresden", ["†"]], "00000110_0000000360_335500_337170": ["s wirklich lieber Du würdest nicht…", ["…"]], "00000110_0000000360_351070_352550": ["wirklich lieber Du würdest nicht…", ["…"]], "00000098_0000000373_20270_23720": ["† sechzehnte April 1998", ["†"]], "00000087_0000000376_284870_286180": ["15×12 Zentimeter", ["×", "5", "2", "1"]], "00000226_0000000388_1259030_1263170": ["Figuren sind dabei in der Größe 3¾ Zoll gehalten", ["¾", "3"]], "00000022_0000000402_21320_24920": ["† vierundzwanzigste Februar siebzehn Hundert neunundneunzig in Göttingen", ["†"]], "00000235_0000000410_4470560_4472690": ["… die Löhne", ["…"]], "00000236_0000000413_550640_552070": ["im Westen die Franken…", ["…"]], "00000048_0000000414_532840_533930": ["† 1731", ["†"]], "00000048_0000000414_616180_617410": ["† 1806", ["†"]], 
"00000237_0000000415_251920_256980": ["Pythagoras und als Näherung für die Kreiszahl π benutzten sie drei oder", ["π"]], "00000237_0000000415_840160_841600": ["etwas ganz anderes…", ["…"]], "00000044_0000000425_736630_742200": ["die ein Gitter von 19×19 gleich drei Hundert einundsechzig Schnittpunkten bilden", ["9", "×", "1"]], "00000044_0000000425_774050_776770": ["13×13 oder 9×9", ["9", "3", "×", "1"]], "00000044_0000000425_1373970_1377460": ["übliche Werte sind 5½ oder 6½", ["5", "½", "6"]], "00000044_0000000425_1394940_1396560": ["8½ Punkten entschädigt", ["8", "½"]], "00000044_0000000425_3674950_3679210": ["starken Amateur auf dem 19×19 Brett konkurrieren kann", ["9", "×", "1"]], "00000044_0000000425_3735010_3737060": ["5×5 Bretter gelöst hat", ["5", "×"]], "00000044_0000000425_3794450_3796920": ["die auf einem 19×19 Brett möglich sind", ["9", "×", "1"]], "00000230_0000000428_286190_290260": ["Arbeitsspeicher und ein 5¼ Zoll Diskettenlaufwerk mit", ["¼", "5"]], "00000247_0000000433_224420_226100": ["ANNUIT CŒPTIS", ["œ"]], "00000247_0000000433_242680_245180": ["audacibus annue cœptis", ["œ"]], "00000247_0000000433_253300_258800": ["O und das E aus CŒPTIS ohne gestalterische Notwendigkeit", ["œ"]], "00000247_0000000433_289200_294050": ["Magnus ab integro sæclorum nascitur ordo", ["æ"]], "00000253_0000000440_686200_693330": ["… auf den Höhenzügen und an deren Abhängen treten oft Felsen zutage", ["…"]], "00000019_0000000450_4536840_4540930": ["Cesar Godeffroy und Baron Ernst Merck… Carl Heine", ["…"]], "00000019_0000000450_4551240_4553980": ["Clubs… ward auf George Parish", ["…"]], "00000019_0000000450_5251120_5257410": ["… Die heutige Veranstaltung ist so stark besucht wie vielleicht noch keine Veranstaltung des Klubs", ["…"]], "00000259_0000000451_16650_18370": ["* 1630", ["*"]], "00000259_0000000451_18360_19970": ["† 1655", ["†"]], "00000087_0000000456_291720_296860": ["der die Hogwarts Schüler von Gleis neun ¾ des Londoner Bahnhofs", ["¾"]], 
"00000087_0000000456_1192410_1197710": ["Drehort für das fiktive Gleis neun ¾ waren die Bahnsteige vier und", ["¾"]], "00000059_0000000462_106200_113260": ["Rasse eine Schulterhöhe von neunzig bis ein Hundert sechzig Zentimeter und sind mit zwei bis 2½ Jahren geschlechtsreif", ["½", "2"]], "00000241_0000000466_88390_89890": ["† fünfte Oktober", ["†"]], "00000087_0000000468_18100_21160": ["† fünfte Juni neunzehn Hundert neunundsiebzig in Hamburg", ["†"]], "00000270_0000000472_44410_45490": ["* 1847", ["*"]], "00000278_0000000483_650630_651800": ["russisch Собачье", ["ч", "а", "б", "о", "ь", "е", "с"]], "00000280_0000000487_420710_421840": ["–", []], "00000007_0000000491_181670_195220": ["Ingolstadt liegt auf 48° 45′ 49″ nördlicher Breite und 11° 25′ 34″ östlicher Länge", ["4", "9", "″", "5", "2", "3", "′"]], "00000178_0000000498_111660_115720": ["So entstand in Berlin Radio multicult2.0 als Reaktion auf die", [".", "0", "2"]], "00000119_0000000507_29970_36720": ["und achtzehn Hundert achtundsechzig vom späteren Literatur Nobelpreisträger Bjørnstjerne Bjørnson in Riksmål", ["ø"]], "00000119_0000000507_53140_58700": ["Zuvor galt das patriotische Lied Sønner av Norge als Nationalhymne Norwegens", ["ø"]], "00000026_0000000509_37430_39470": ["† zwanzigste", ["†"]], "00000087_0000000511_26520_29990": ["† vierte Oktober neunzehn Hundert siebzig in Los Angeles", ["†"]], "00000044_0000000516_319750_321230": ["Ἰησοῦς Iēsūs", ["σ", "η", "ς", "ῦ", "ο", "ἰ"]], "00000044_0000000516_947060_948200": ["Jes 53,4f", ["4", ",", "5", "3"]], "00000044_0000000516_2194520_2202080": ["seiner Taufe empfing er laut Apg 22,16ff den Auftrag zur Völkermission", [",", "6", "2", "1"]], "00000044_0000000516_2321900_2323340": ["Apg 2,14ff", ["4", ",", "2", "1"]], "00000044_0000000516_2338500_2341990": ["Apg 5,12ff", [",", "5", "2", "1"]], "00000044_0000000516_2977590_2980110": ["zwei Sam 7,13f", [",", "3", "7", "1"]], "00000044_0000000516_3061140_3062810": ["Dan 7,13f", [",", "3", "7", "1"]], 
"00000044_0000000516_3321840_3324160": ["… in seinem Sohn Jesus Christus", ["…"]], "00000044_0000000516_3324420_3327870": ["Dieser ist der wahrhaftige Gott …", ["…"]], "00000044_0000000516_3330900_3335300": ["… Herrlichkeit unseres großen Gottes und Heilandes Jesus Christus", ["…"]], "00000048_0000000521_80060_83940": ["† vierzehnte Oktober neunzehn Hundert dreiundvierzig", ["†"]], "00000048_0000000521_822910_825010": ["dem Soldatenfriedhof in Chełm", ["ł"]], "00000292_0000000522_33170_38090": ["† neunundzwanzigste Juli neunzehn Hundert zwei bei Windisch Matrei", ["†"]], "00000106_0000000523_14350_17020": ["† achte Januar achtzehn Hundert achtzig in San Francisco", ["†"]], "00000298_0000000534_1856480_1857510": ["die فرقه", ["ق", "ف", "ر", "ه"]], "00000298_0000000535_4479230_4493500": ["Von ausstrahlender Wirkung waren die Streiks und zwischenzeitlichen Erfolge der unabhängigen Gewerkschaft Solidarność unter Lech Wałęsa in der Volksrepublik Polen 1980 / einundachtzig", ["ł"]], "00000298_0000000535_4758500_4764110": ["Dezember wurde das oppositionelle Bürgerkomitee unter Vorsitz Lech Wałęsas gegründet", ["ł"]], "00000104_0000000540_29700_32920": ["† neunzehnte April zwei Tausend sechs in Unterwössen", ["†"]], "00000301_0000000541_26210_29340": ["† zwanzigste Mai achtzehn Hundert zwanzig", ["†"]], "00000206_0000000556_1976880_1979610": ["Der Fall P ≠ N P", ["≠"]], "00000206_0000000556_1982280_1984700": ["Sinne von P ≠ N P gelöst", ["≠"]], "00000206_0000000556_2013100_2015250": ["dass P ≠ N P gilt", ["≠"]], "00000318_0000000575_258589_265170": ["wird die degenerative beziehungsweise maligne Myopie mit H44.2 kodiert", ["4", ".", "2"]], "00000318_0000000575_266030_270840": ["die einfache Kurzsichtigkeit mit H52.1 und die", [".", "5", "2", "1"]], "00000318_0000000575_271160_275530": ["Ciliarmuskelkrämpfe bedingte Pseudomyopie mit H52.5", [".", "5", "2"]], "00000330_0000000590_28650_32159": ["† vierundzwanzigste November neunzehn Hundert dreiundsechzig", ["†"]], 
"00000331_0000000592_1964700_1976720": ["Die Abteilung Biologie der Université de Technologie der Lothringer Stadt Vandœuvre lès Nancy suchte neunzehn Hundert dreiundsiebzig für den Studentenaustausch eine gleichartige Fachhochschule", ["œ"]], "00000331_0000000592_2010090_2020020": ["Im Oktober neunzehn Hundert achtundsiebzig wurde die Städtepartnerschaft in Vandœuvre und am neunzehnte Mai neunzehn Hundert neunundsiebzig in Lemgo offiziell vereinbart", ["œ"]], "00000331_0000000592_2054449_2059090": ["Lemgo die Partnerschaftsurkunden mit Vandœuvre und Beverley unterzeichnet", ["œ"]], "00000331_0000000592_2259730_2260970": ["→ Hauptartikel", ["→"]], "00000331_0000000592_2329500_2330770": ["→ Hauptartikel", ["→"]], "00000338_0000000601_14870_20060": ["† siebenundzwanzigste Oktober neunzehn Hundert achtundsechzig in Cambridge", ["†"]], "00000338_0000000601_698840_710580": ["Aus dieser Massendifferenz errechneten Lise Meitner und Otto Frisch mit Einsteins Formel E gleich mc² die bei der Spaltung freiwerdende Energie von etwa", ["²"]], "00000007_0000000603_70350_75430": ["verfeinerte die Version und kam schließlich mit der Firma Brøderbund ins Geschäft", ["ø"]], "00000342_0000000609_573930_581250": ["so aus der Vereinigung des Rheins mit der Donau vermittelst Schiffreichmachung und Vereinigung der Tauber und Wernitz…", ["…"]], "00000277_0000000610_40170_46470": ["† dreizehnte September achtzehn Hundert zweiundsiebzig in Rechenberg bei Nürnberg", ["†"]], "00000344_0000000614_23940_29440": ["† elfte Oktober dreizehn Hundert siebenundvierzig in Puch bei Fürstenfeldbruck", ["†"]], "00000038_0000000616_29420_33070": ["† erste September siebzehn Hundert fünfzehn in Versailles", ["†"]], "00000038_0000000616_692560_694440": ["für den mich die Höflinge gehalten haben…", ["…"]], "00000348_0000000631_6434120_6439480": ["der anderem die Sender E g o F M und M94.5 überträgt", ["4", "9", "5", "."]], "00000353_0000000645_655630_662270": ["die mittlerweile einen festen Bestandteil in 
der 1.Damen Basketball Bundesliga bilden sowie die ebenfalls", [".", "1"]], "00000357_0000000652_25530_28280": ["† neunundzwanzigste Juni", ["†"]], "00000358_0000000653_110990_114360": ["Eflatun Pınar und auf dem Karasis", ["ı"]], "00000359_0000000654_16010_20360": ["† achtzehnte Februar fünfzehn Hundert sechsundvierzig ebenda", ["†"]], "00000359_0000000654_2248010_2254200": ["† siebenundzwanzigste Oktober fünfzehn Hundert fünfundsiebzig in Königsberg", ["†"]], "00000359_0000000654_2276780_2282270": ["† zwanzigste September fünfzehn Hundert zweiundvierzig", ["†"]], "00000359_0000000654_2291800_2293050": ["† vierte", ["†"]], "00000359_0000000654_2305170_2309740": ["† achte März fünfzehn Hundert dreiundneunzig in Leipzig", ["†"]], "00000359_0000000654_2362720_2367630": ["† achtzehnte Januar fünfzehn Hundert einundsiebzig ebenda", ["†"]], "00000359_0000000655_4819840_4827300": ["können ein Kind verzaubern… Auch können sie geheimnisvolle Krankheiten im menschlichen Knie erzeugen", ["…"]], "00000359_0000000655_4863250_4866280": ["Mörder… Sie schaden mannigfaltig", ["…"]], "00000363_0000000663_56340_60620": ["† vierte Oktober neunzehn Hundert siebenundvierzig", ["†"]], "00000363_0000000663_250310_251590": ["–", []], "00000364_0000000664_46510_51740": ["† dreizehnte Oktober achtzehn Hundert fünfundzwanzig in München", ["†"]], "00000364_0000000664_565860_581900": ["Freifrau von Bayrstorff ∞ achtzehn Hundert neunundfünfzig Henriette Schoeller", ["∞"]], "00000371_0000000673_26470_27600": ["평양", ["평", "양"]], "00000372_0000000676_185660_188310": ["an einem Stück über Molière mit dem Titel Кабала", ["к", "а", "б", "л"]], "00000372_0000000676_283970_288380": ["an dem Libretto Рашель und an einem Stück über Stalin", ["а", "л", "ш", "ь", "р", "е"]], "00000373_0000000678_2735090_2738310": ["1.x parallel auf einem Rechner zu betreiben", [".", "1"]], "00000373_0000000678_3605550_3616960": ["bei der Nutzung solcher Programme unter Windows N T drei Punkt eins waren aber im Vergleich zu 
Windows 3.x Geschwindigkeitseinbußen zu verzeichnen", [".", "3"]], "00000374_0000000680_2553270_2556500": ["mit denen textbasierte O S / zwei 1.x", [".", "1"]], "00000100_0000000681_395010_397820": ["M S D O S oder Windows 3.x entwickelt worden sind", [".", "3"]], "00000044_0000000682_1392640_1393960": ["≥ drei", ["≥"]], "00000044_0000000683_1718800_1723790": ["Bei gegebenem Δt ist der Druckgradient umso höher", ["δ"]], "00000379_0000000690_84660_89190": ["niederschlagreichsten Monat fällt circa 2,9mal mehr Regen", [",", "9", "2"]], "00000380_0000000692_1020_2370": ["den Artikel Molokaʻi", ["ʻ"]], "00000380_0000000692_43250_46800": ["Auf Molokaʻi findet man die Geburtsstätte der Hulagöttin Laka", ["ʻ"]], "00000380_0000000692_162680_165170": ["grandma Kailiʻohe Kamaʻekua", ["ʻ"]], "00000380_0000000692_170150_175580": ["von der Insel Molokaʻi wurden Beschreibungen aus dem Alltagsleben der alten Hawaiier gegeben", ["ʻ"]], "00000380_0000000692_217850_220850": ["schönsten Strände befinden sich im Westen Molokaʻis", ["ʻ"]], "00000380_0000000692_231050_233480": ["zwischen Molokaʻi und Oʻahu verlaufende", ["ʻ"]], "00000380_0000000692_233680_234830": ["Molokaʻi Express", ["ʻ"]], "00000380_0000000692_248110_251230": ["eine Molokaʻi Ranch Wildlife Park Safari angeboten", ["ʻ"]], "00000380_0000000692_252000_256709": ["Molokaʻi Forest Reserve kann man per Fuß oder Geländewagen erkunden und auf", ["ʻ"]], "00000267_0000000696_418140_420050": ["≈45,72 Zentimeter", ["≈", "4", "5", "2", ",", "7"]], "00000267_0000000696_634320_635550": ["≈35 m", ["≈", "3", "5"]], "00000267_0000000696_1533330_1535900": ["≈1 drei Hundert Kilometer", ["≈", "1"]], "00000219_0000000700_18760_22880": ["† zehnte November neunzehn Hundert achtunddreißig in Istanbul", ["†"]], "00000219_0000000700_123900_126250": ["In dieser Zeit gewöhnte er sich auch an Rakı", ["ı"]], "00000388_0000000706_25890_33150": ["† fünfte Mai achtzehn Hundert einundzwanzig in Longwood House auf Stück Helena im Südatlantik", ["†"]], 
"00000007_0000000711_278400_280640": ["mit acht Hundert achtzig µg", ["µ"]], "00000392_0000000717_22210_24580": ["κνίδη knidē ‚ Nessel ‘", ["δ", "η", "ί", "κ", "ν"]], "00000007_0000000719_359080_364040": ["…wie an einen wundervollen Traum gedenke ich", ["…"]], "00000007_0000000719_2870760_2877430": ["der Herreninsel umschließt eine große rechteckige Fläche von etwa 120×400 Metern", ["4", "2", "×", "0", "1"]], "00000398_0000000728_25510_29770": ["† siebte Januar neunzehn Hundert dreiundvierzig in New York", ["†"]], "00000412_0000000759_23900_28120": ["† achtundzwanzigste Juli neunzehn Hundert achtundsechzig in Göttingen", ["†"]], "00000412_0000000759_1062550_1065170": ["… eine grundsätzliche", ["…"]], "00000412_0000000759_4779460_4781330": ["… Ein alter Trottel", ["…"]], "00000416_0000000769_4569140_4574610": ["fünfundsiebzig Prozent Körner mit einer Größe von weniger als zwei µm", ["µ"]], "00000026_0000000780_39680_44250": ["† achtzehnte März zwei Tausend dreizehn inBerlin", ["†"]], "00000113_0000000790_894050_896900": ["…eine spannungsgeladene Atmosphäre aufkommen zu lassen", ["…"]], "00000113_0000000790_2364410_2365640": ["…Blaupause für einen", ["…"]], "00000424_0000000793_30850_33080": ["† vermutlich am siebzehnte Mai", ["†"]], "00000425_0000000795_1884090_1888520": ["wie etwa der Vorläufer der existenzialistischen Philosophie Søren Kierkegaard", ["ø"]], "00000427_0000000798_56010_58900": ["für sichtbares Licht bis circa eins µm Wellenlänge", ["µ"]], "00000427_0000000798_60830_64209": ["für Infrarot bis etwa eins Komma acht µm Wellenlänge", ["µ"]], "00000427_0000000798_408160_410930": ["≤ null", ["≤"]], "00000430_0000000806_2191690_2197260": ["Sein Leben war ein einziger Dienst für Deutschland… Von dem Willen beseelt", ["…"]], "00000430_0000000806_2197390_2200360": ["sein Volk und Europa… zu erretten", ["…"]], "00000430_0000000806_2506950_2512950": ["Gefolgschaft… zum unbarmherzigen Widerstand gegen den Weltvergifter aller Völker", ["…"]], 
"00000431_0000000812_911020_919150": ["ihre Meinung zu verbreiten… Da die Herstellung von Zeitungen und Zeitschriften immer größeres Kapital erfordert", ["…"]], "00000431_0000000812_926070_929940": ["unsere Abhängigkeit immer größer und immer gefährlicher…", ["…"]], "00000044_0000000818_2843970_2851190": ["Zionismusforscher Juri Iwanow veröffentlichte neunzehn Hundert neunundsechzig ein Werk Осторожно", ["о", "т", "р", "ж", "н", "с"]], "00000145_0000000819_213840_216840": ["oder 43·1012", ["4", "·", "2", "3", "0", "1"]], "00000145_0000000819_307570_316150": ["Mit der Klassifikation M5.5 zählt er zu den späten M Zwergsternen und hat an seiner Oberfläche", [".", "5"]], "00000145_0000000819_436250_440690": ["im Infrarotbereich bei einer Wellenlänge von eins Komma zwei µm", ["µ"]], "00000145_0000000819_833080_839570": ["von der Erde aus beobachtbare Eigenbewegung am Himmel ist wegen der geringen Entfernung mit jährlich 3,85″", ["″", "5", ",", "8", "3"]], "00000145_0000000819_1601770_1607510": ["Science Fiction Autor Stanisław Lem veröffentlichte im Jahre neunzehn Hundert fünfundfünfzig", ["ł"]], "00000104_0000000824_944490_948390": ["} und {", ["{", "}"]], "00000104_0000000824_961390_963280": ["meist {", ["{"]], "00000104_0000000824_964220_967590": ["} und trigonale Bipyramiden", ["}"]], "00000104_0000000824_967670_970090": ["meist {", ["{"]], "00000104_0000000824_970300_971880": ["} auf", ["}"]], "00000104_0000000824_1201380_1206880": ["finden sich Brasilianer Zwillingslamellen konzentriert in den { eins null eins", ["{"]], "00000104_0000000824_1259540_1262760": ["der negativen Rhomboeder { null h", ["{"]], "00000438_0000000827_218740_221630": ["Richenza ∞ Ulrich I I", ["∞"]], "00000443_0000000837_10101940_10108760": ["bei I D W Publishing zwei Tausend zwölf erschienenen Graphic Novel Miniserie Assimilation²", ["²"]], "00000445_0000000840_2674570_2675770": ["→Schmalkaldischer Krieg", ["→"]], "00000445_0000000840_2678430_2679690": ["→Zweiter Kappelerkrieg", ["→"]], 
"00000335_0000000849_337170_342420": ["Masse m und zwar gemäß ΔE gleich Δm c zwei", ["δ"]], "00000038_0000000852_699690_701150": ["Restex ®", ["®"]], "00000038_0000000852_782610_783660": ["Sifrol ®", ["®"]], "00000449_0000000856_79690_83000": ["ist an die ε-Aminogruppe der Lysin Einheit", ["ε"]], "00000449_0000000856_92750_95820": ["Bereich liegt bei λ gleich fünf Hundert", ["λ"]], "00000449_0000000856_103910_107320": ["elf cis Retinal → all trans Retinal", ["→"]], "00000449_0000000856_172130_176740": ["eine Kristallstruktur des menschlichen β2 Adrenozeptors publiziert", ["β", "2"]], "00000449_0000000856_177640_182630": ["Im Juni zwei Tausend acht folgten Strukturen des β1 Adrenozeptors vom Truthuhn", ["β", "1"]], "00000449_0000000856_218990_220990": ["acht Hundert µg festgelegt", ["µ"]], "00000450_0000000858_29130_32850": ["† fünfte April neunzehn Hundert siebenundfünfzig bei", ["†"]], "00000087_0000000859_23700_25100": ["† fünfundzwanzigste Februar", ["†"]], "00000452_0000000862_35360_39430": ["† dreißigste Juni zwei Tausend sechs in Frankfurt am Main", ["†"]], "00000452_0000000862_430620_435250": ["Lützel Jeman gibt sein Pseudonym auf … Ich habe das dann in einem langen", ["…"]], "00000136_0000000863_49790_54340": ["† vierzehnte April achtzehn Hundert achtzehn in München", ["†"]], "00000059_0000000868_1179660_1181010": ["Piotr Michałowski", ["ł"]], "00000106_0000000869_19950_22570": ["† vierundzwanzigste Dezember neunzehn Hundert neunundsiebzig in", ["†"]], "00000106_0000000869_666180_668190": ["leider kann Bachmann daran nun nicht mehr teilnehmen…", ["…"]], "00000456_0000000874_26330_30000": ["· oder als ausgeschriebene Buchstabenfolge verwendet wird", ["·"]], "00000460_0000000878_213270_218490": ["ɸ /", ["ɸ"]], "00000460_0000000878_1070660_1071970": ["प्", ["्", "प"]], "00000204_0000000880_832240_836110": ["Der schwedische Dermatologe Cæsar Peter Møller Boeck", ["æ", "ø"]], "00000104_0000000882_250150_255550": ["Zahl der möglichen Stellungen wird auf zwei 
Komma zwei acht · ein Tausend sechsundvierzig geschätzt", ["·"]], "00000104_0000000882_862610_864780": ["Großmeister Larry Kaufman zu 93⁄4", ["4", "9", "⁄", "3"]], "00000104_0000000882_1060450_1062890": ["nach Larry Kaufman 3¼ Bauerneinheiten", ["¼", "3"]], "00000104_0000000882_1101720_1105430": ["zwei Mal 3¼ für die beiden Läufer und", ["¼", "3"]], "00000104_0000000882_1275740_1277340": ["3¼ nach Larry Kaufman", ["¼", "3"]], "00000104_0000000882_1801220_1805530": ["ein ½:½ für einen unentschiedenen Ausgang", ["½"]], "00000104_0000000882_1807650_1813750": ["Kampflos gewonnene Partien werden mit +:− beziehungsweise −:+ notiert", ["+"]], "00000041_0000000885_146960_149350": ["zum Aufbau der Hawaiʻi Inseln geführt haben", ["ʻ"]], "00000466_0000000897_1076950_1082440": ["…in jeder Hinsicht schöner und wohnlicher wird diese Burg werden als das untere", ["…"]], "00000468_0000000899_4446650_4448710": ["durch Helene Radziwiłł", ["ł"]], "00000468_0000000899_4596940_4602730": ["wie sie Franz von Anhalt Dessau und Helene Radziwiłł noch kopierten", ["ł"]], "00000087_0000000902_533770_535570": ["Zwischen Störkraft und den andern…", ["…"]], "00000136_0000000907_432400_434740": ["Chrankchəhüüsər iiglifərət woordə sind", ["ə"]], "00000136_0000000907_434980_436410": ["beträchtlich aaschtiigə", ["ə"]], "00000136_0000000907_437140_439450": ["statt mə hät Angscht", ["ə"]], "00000136_0000000907_439510_440760": ["das no mee Vərletzti", ["ə"]], "00000136_0000000907_441330_442980": ["praacht wäärdə chönntət", ["ə"]], "00000007_0000000918_1507150_1516030": ["Seit neunzehn Hundert neunundachtzig gibt es eine an den Dachverband DİTİB angeschlossene Moschee in Sendling", ["̇"]], "00000007_0000000918_3122990_3128940": ["… das Bewusstsein der Sendlinger", ["…"]], "00000479_0000000924_1401740_1411440": ["den romantischen Komödien Ranma ½ und Urusei Yatsura sowie ebenfalls Horrorgeschichten", ["½"]], "00000492_0000000943_1540380_1547430": ["eine Extraprämie in Höhe von acht Punkt null null null 
£", ["£"]], "00000492_0000000943_2078639_2080510": ["£ ein Allzeithoch", ["£"]], "00000492_0000000943_3062660_3063680": ["£ ausgeschüttet", ["£"]], "00000492_0000000943_3083250_3088340": ["und von zwei Tausend eins bis zwei Tausend drei sogar über eins Punkt fünf null null Punkt null null null £", ["£"]], "00000492_0000000943_3135650_3138940": ["ein Hundert siebenundvierzig Punkt null null null £ ausgeschüttet wurden", ["£"]], "00000492_0000000943_3148160_3152490": ["garantierten ein Hundert siebenundvierzig Punkt null null null £ für den Weltsnookerverband", ["£"]], "00000492_0000000943_3169490_3175630": ["bei dem sich die Prämie für ein Maximum Break bei jedem großen Ranglisten Turnier um fünf Punkt null null null £ erhöht", ["£"]], "00000495_0000000948_1288990_1292060": ["zwar ist ein Intervall dτ", ["τ"]], "00000495_0000000948_3193220_3199550": ["Myonen haben eine mittlere Lebensdauer von Δτ Tilde 2·10−6", ["δ", "τ", "·", "2", "0", "1"]], "00000495_0000000948_3225080_3227730": ["Erdoberfläche nicht die Eigenzeit Δτ", ["τ", "δ"]], "00000495_0000000948_3228330_3232170": ["sondern die Beobachterzeit Δt gleich", ["δ"]], "00000017_0000000954_24850_27570": ["russisch Спутник für Weggefährte", ["у", "и", "п", "т", "к", "н", "с"]], "00000204_0000000970_455440_460030": ["diese mit B85.5 kodierbare Erkrankung der", [".", "8", "5"]], "00000193_0000000972_1772870_1775670": ["wie zum Beispiel der Homunkulusnebel um η Carinae", ["η"]], "00000510_0000000983_1744650_1751420": ["Mai zwei Tausend sechs während seiner Pressekonferenz anlässlich der Spielemesse E³ vor", ["³"]], "00000510_0000000983_1764420_1768910": ["Fachbesuchern der E³ Messe stand die Gelegenheit offen", ["³"]], "00000510_0000000983_1785550_1791140": ["Spiel auf der E³ zwei Tausend sieben erneut vor und teilte den finalen", ["³"]], "00000510_0000000983_3332640_3338740": ["der E³ zwei Tausend neun kündigte Nintendo den Nachfolger Super Mario Galaxy zwei an", ["³"]], "00000087_0000001001_3913000_3916440": ["der 
Beat geht weiter…", ["…"]], "00000087_0000001003_5617400_5621810": ["Am siebzehnte November erschien unter dem Titel Let It Be… Naked eine", ["…"]], "00000519_0000001009_103870_106630": ["† 1820", ["†"]], "00000145_0000001014_2506230_2507270": ["¡No pasarán", ["¡"]], "00000145_0000001014_2519850_2527860": ["Bei der mexikanischen Revolution verwendeten die Anhänger des Rebellionsführers Emiliano Zapata den Wahlspruch ¡Tierra y Libertad", ["¡"]], "00000145_0000001014_2540500_2544200": ["im Liedtext in Form der Parole ¡Zapata vive", ["¡"]], "00000087_0000001030_276350_280520": ["Trio in kleiner Auflage auf eine 10″ Mini L P pressen", ["0", "″", "1"]], "00000087_0000001031_281810_285540": ["produzierte Trio in Eigenregie eine 10″ Demo Mini L P", ["0", "″", "1"]], "00000531_0000001039_4600640_4604560": ["bei wichtigen Veranstaltungen im 2½ Minuten Takt", ["½", "2"]], "00000531_0000001040_6406130_6417170": ["Insgesamt gibt es zehn Fahrzeuge vom Typ C1.9 sowie acht Fahrzeuge des lediglich geringfügig abgewandelten Typs C1.10", [".", "9", "0", "1"]], "00000531_0000001040_6433480_6437030": ["vom Typ C1.9 zeitweilig außer Betrieb", [".", "9", "1"]], "00000531_0000001040_6484830_6487710": ["Modellen C1.9 und C1.10", [".", "9", "0", "1"]], "00000531_0000001040_6489830_6502420": ["Mit dieser Bestellung will die Münchner Verkehrsgesellschaft ihre Angebotsoffensive mit dichteren Taktfolgen von 2½ auf zwei Minuten auf Teilstrecken umsetzen sowie die ältesten Fahrzeuge", ["½", "2"]], "00000532_0000001041_401470_405040": ["U einundzwanzig Röthenbach → Ziegelstein", ["→"]], "00000198_0000001062_786560_791110": ["der Karnivorie mit Verweis auf Genesis 1,29f", [",", "9", "2", "1"]], "00000007_0000001065_777750_779610": ["Geld zu verdienen …", ["…"]], "00000007_0000001065_1105500_1108660": ["etwa 1½ Stunden später aufgenommene Foto", ["½", "1"]], "00000007_0000001067_249460_251820": ["²Das Recht zur Auflösung erlischt", ["²"]], "00000007_0000001080_279210_284060": ["der ein 
durchschnittliches Wasservolumen von eins Komma drei km³ hat", ["³"]], "00000007_0000001080_1317840_1325750": ["Ein Jahr später war der See Schauplatz in dem Drama Bis dass das Geld Euch scheidet… mit Luise Ullrich und Gert", ["…"]], "00000548_0000001083_1415660_1417780": ["Grund eines Einspruchs Walfang betrieben wurde†", ["†"]], "00000553_0000001098_3463410_3466760": ["→2010 Übersiedlung in den Dreiklanghof siehe oben", ["0", "→", "2", "1"]], "00000554_0000001099_285020_287430": ["elf bis ¼ zwölf Uhr", ["¼"]], "00000555_0000001101_36620_40920": ["† erste Februar neunzehn Hundert sechsundsiebzig in München", ["†"]], "00000556_0000001102_35000_38730": ["† zweite Mai neunzehn Hundert fünfundneunzig in Hamburg", ["†"]], "00000562_0000001111_1772300_1774680": ["der Encyclopædia Britannica", ["æ"]], "00000562_0000001111_2751130_2756580": ["dass die Erstellung neuer Artikel seit zwei Tausend fünf um ⅔ gefallen ist", ["⅔"]], "00000563_0000001112_26070_29950": ["† zwanzigste Oktober achtzehn Hundert neunundachtzig ebenda", ["†"]], "00000174_0000001114_29300_33200": ["† achte Oktober neunzehn Hundert zweiundneunzig in Unkel", ["†"]], "00000174_0000001114_512740_514390": ["† zwei Tausend sechs", ["†"]], "00000022_0000001116_21380_24990": ["† vierundzwanzigste Januar neunzehn Hundert fünfundsechzig", ["†"]], "00000087_0000001124_163270_164640": ["the band begins to play…", ["…"]], "00000007_0000001143_589450_595400": ["…zur Verbesserung des Bodens im Zwingergarten und Baumgarten den Schlamm und", ["…"]], "00000007_0000001143_642050_646060": ["…auch in dem sogenannten Zwinger Garten", ["…"]], "00000007_0000001143_965770_969590": ["…daß der Landbaumeister Pöppelmann nachher", ["…"]], "00000007_0000001144_4041090_4044610": ["Bild Säule theils als eines Ober Aufsehers…", ["…"]]} --------------------------------------------------------------------------------