├── scripts ├── __init__.py ├── createDatasetConfig │ ├── __init__.py │ ├── Sonja.json │ ├── redaer.json │ ├── Bernd_Ungerer_tausendUndEineNacht.json │ ├── Eva.json │ ├── Friedrich.json │ └── Karlsson.json ├── generateAudioStatistic.py └── createDataset.py ├── huiAudioCorpus ├── __init__.py ├── error │ ├── __init__.py │ ├── MatchingNotFoundError.py │ └── DependencyInjectionError.py ├── filter │ ├── __init__.py │ └── AudioFilter.py ├── model │ ├── __init__.py │ ├── GutenbergBook.py │ ├── Credentials.py │ ├── SymbolSentence.py │ ├── PhoneticSentence.py │ ├── Histogram.py │ ├── AudioTranscriptPair.py │ ├── Statistic.py │ ├── PhoneticChars.py │ ├── SentenceAlignment.py │ ├── Transcripts.py │ ├── Sentence.py │ └── Audio.py ├── ui │ ├── __init__.py │ └── Plot.py ├── utils │ ├── __init__.py │ ├── FileListUtil.py │ ├── SecureFTP.py │ ├── DoneMarker.py │ ├── ModelToStringConverter.py │ └── PathUtil.py ├── calculator │ ├── __init__.py │ ├── AlignSentencesIntoTextCalculator.py │ └── TextNormalizer.py ├── components │ ├── __init__.py │ ├── AudioStatisticComponent.py │ └── TextStatisticComponent.py ├── converter │ ├── __init__.py │ ├── StringToSentencesConverter.py │ ├── TranscriptsToSentencesConverter.py │ ├── ListToStatisticConverter.py │ ├── PhoneticSentenceToSymbolSentenceConverter.py │ ├── ListToHistogramConverter.py │ ├── AudioToSentenceConverter.py │ └── SentenceToPhoneticSentenceConverter.py ├── persistenz │ ├── __init__.py │ ├── TranscriptsPersistenz.py │ ├── AudioPersistenz.py │ ├── AudiosFromLibrivoxPersistenz.py │ ├── AudioTranscriptPairPersistenz.py │ └── GutenbergBookPersistenz.py ├── testOutput │ └── __init__.py ├── transformer │ ├── __init__.py │ ├── AudioRemoveSilenceTransformer.py │ ├── TranscriptsSelectionTransformer.py │ ├── AudioLoudnessTransformer.py │ ├── AudioSamplingRateTransformer.py │ ├── SentenceDistanceTransformer.py │ ├── AudioAddSilenceTransformer.py │ ├── AudioFadeTransformer.py │ └── AudioSplitTransformer.py ├── sttInference │ ├── __init__.py │ ├── deepspeechModel │ │ └── __init__.py │ └── README.md ├── dependencyInjection │ ├── __init__.py │ └── DependencyInjection.py ├── workflows │ └── createDatasetWorkflow │ │ ├── __init__.py │ │ ├── Step3_DownloadText.py │ │ ├── Step1_DownloadAudio.py │ │ ├── Step2_1_AudioStatistic.py │ │ ├── Step2_SplitAudio.py │ │ ├── Step4_TranscriptAudio.py │ │ ├── Step7_AudioRawStatistic.py │ │ ├── Step9_GenerateCleanDataset.py │ │ ├── Step6_FinalizeDataset.py │ │ ├── Step5_AlignText.py │ │ ├── Step8_DatasetStatistic.py │ │ ├── Step0_Overview.py │ │ └── Step3_1_PrepareText.py └── enum │ └── PipelineReturnEnum.py ├── .vscode └── settings.json ├── setup.py ├── requirements.txt ├── .gitignore ├── README.md └── LICENSE /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/error/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/filter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/ui/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/calculator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/GutenbergBook.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/testOutput/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/sttInference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/createDatasetConfig/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/dependencyInjection/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/sttInference/deepspeechModel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/ppuchtler/anaconda3/envs/huiAudioCorpus/bin/python" 3 | } -------------------------------------------------------------------------------- /huiAudioCorpus/enum/PipelineReturnEnum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class 
PipelineReturnEnum(Enum): 4 | Ok = 0 5 | OkWithDoneMarker = 1 6 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/Credentials.py: -------------------------------------------------------------------------------- 1 | class Credentials: 2 | 3 | def __init__(self, username: str, password: str): 4 | self.username = username 5 | self.password = password -------------------------------------------------------------------------------- /huiAudioCorpus/sttInference/README.md: -------------------------------------------------------------------------------- 1 | # sttInference 2 | 3 | We can execute speech to text with this project 4 | 5 | 6 | # Copy the model 7 | 8 | Copy the files from: ***** unzipped into the deepspeechModel folder 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="huiAudioCorpus", 5 | version="0.1", 6 | packages=find_packages(), 7 | ) 8 | 9 | # To set up the package for development, run: 10 | # sudo python3 setup.py develop -------------------------------------------------------------------------------- /huiAudioCorpus/model/SymbolSentence.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 3 | 4 | class SymbolSentence(ToString): 5 | def __init__(self, sentence: List[int]): 6 | self.sentence = sentence -------------------------------------------------------------------------------- /huiAudioCorpus/utils/FileListUtil.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | class FileListUtil: 4 | def getFiles(self, path: str, ending: str): 5 | searchPath = path + '/**/*.'
+ ending 6 | files = glob.glob(searchPath, recursive=True) 7 | return files -------------------------------------------------------------------------------- /huiAudioCorpus/model/PhoneticSentence.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 3 | 4 | class PhoneticSentence(ToString): 5 | def __init__(self, sentence: str, subWords: List[str]): 6 | self.sentence = sentence 7 | self.subWords = subWords -------------------------------------------------------------------------------- /huiAudioCorpus/model/Histogram.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | from typing import List, TypeVar 3 | number = TypeVar('number', int, float) 4 | 5 | class Histogram(ToString): 6 | def __init__(self, bins: List[number], values: List[number]): 7 | self.bins = bins 8 | self.values = values 9 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/StringToSentencesConverter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Sentence import Sentence 2 | from textblob import TextBlob 3 | 4 | class StringToSentencesConverter: 5 | def convert(self, text: str): 6 | blob = TextBlob(text) 7 | sentences = [Sentence(str(sentence)) for sentence in blob.sentences] # type: ignore 8 | return sentences -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | matplotlib 3 | pandas 4 | librosa 5 | nptyping 6 | tqdm 7 | textblob 8 | torch 9 | adabound 10 | dependencies 11 | pysftp 12 | h5py 13 | pyyaml==5.3.1 14 | yq 15 | numba==0.48.0 16 | kaldiio 17 | frosch 18 | unidecode 19 | inflect 20 | bs4 21 | natsort 22 | python-Levenshtein 23 | deepspeech 24 | gutenberg 25 | pyloudnorm 26 | pandas_profiling 27 | lxml -------------------------------------------------------------------------------- /huiAudioCorpus/model/AudioTranscriptPair.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | from huiAudioCorpus.model.Sentence import Sentence 3 | from huiAudioCorpus.model.Audio import Audio 4 | 5 | class AudioTranscriptPair(ToString): 6 | 7 | def __init__(self, sentence: Sentence, audio: Audio): 8 | self.sentence = sentence 9 | self.audio = audio -------------------------------------------------------------------------------- /huiAudioCorpus/converter/TranscriptsToSentencesConverter.py: -------------------------------------------------------------------------------- 1 | 2 | from pathlib import Path 3 | from huiAudioCorpus.model.Sentence import Sentence 4 | from huiAudioCorpus.model.Transcripts import Transcripts 5 | 6 | class TranscriptsToSentencesConverter: 7 | def convert(self, transcripts: Transcripts): 8 | texts = transcripts.text 9 | ids = transcripts.keys 10 | sentences = [Sentence(text, Path(id).stem) for text, id in zip(texts, ids)] 11 | return sentences -------------------------------------------------------------------------------- /huiAudioCorpus/model/Statistic.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | 3 | 
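# A hedged usage sketch: a Statistic is normally produced by ListToStatisticConverter
# (defined further below in this repo) rather than constructed by hand, e.g.:
#   stats = ListToStatisticConverter().convert([1.0, 2.0, 4.0])
#   print(stats)   # ToString renders every attribute (count 3, average ~2.33, ...)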
4 | class Statistic(ToString): 5 | def __init__(self, count:int, max:float, min:float, median:float, average:float, sum:float, std: float, var: float): 6 | self.count = count 7 | self.max = max 8 | self.min = min 9 | self.median = median 10 | self.average = average 11 | self.sum = sum 12 | self.std = std 13 | self.var = var -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioRemoveSilenceTransformer.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | from huiAudioCorpus.model.Audio import Audio 3 | 4 | 5 | class AudioRemoveSilenceTransformer: 6 | 7 | def __init__(self, dezibel: int): 8 | self.dezibel = dezibel 9 | 10 | def transform(self, audio: Audio): 11 | newAudioTimeline,_ = librosa.effects.trim(audio.timeSeries, self.dezibel) 12 | newAudio = Audio(newAudioTimeline, audio.samplingRate, audio.id, audio.name) 13 | return newAudio -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/TranscriptsSelectionTransformer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.model.Transcripts import Transcripts 3 | 4 | class TranscriptsSelectionTransformer: 5 | 6 | def transform(self, transcripts: Transcripts, selectedKeys: List[str]): 7 | trans = transcripts.transcripts 8 | transformedTrans = trans[trans[0].isin(selectedKeys)]# type:ignore 9 | transformedTranscripts = Transcripts(transformedTrans, transcripts.id, transcripts.name) 10 | return transformedTranscripts 11 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/PhoneticChars.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | 3 | class PhoneticChars(ToString): 4 | 5 | def __init__(self): 6 | self.chars = ['ˈ', 'a', 'l', 'ə', 's', ' ', 'i', 'ʔ', 'ɛ', 'n', 'd', 'e', 'ː', 'ɐ', '̯', 'v', 't', 'ɪ', 'm', 'j', 'ɔ', 'x', '͡', 'u', ',', 'ʊ', 'z', 'p', 'ʁ', 'o', 'ʃ', 'ç', 'ɡ', '̩', '.', 'k', 'h', 'ˌ', 'f', 'b', 'ŋ', 'y', 'ʏ', 'œ', 'æ', 'ø', '!', 'ʒ', '…', ':', '̍', '?', '̥', '̃', 'r', 'ɑ', 'θ', "'", 'ð', 'ɱ', 'ʙ', 'ɺ', "ˑ", "ɒ",'‿'] 7 | 8 | @property 9 | def countChars(self): 10 | return len(self.chars) 11 | -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioLoudnessTransformer.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Audio import Audio 2 | import pyloudnorm as pyln 3 | 4 | 5 | class AudioLoudnessTransformer: 6 | 7 | def __init__(self, loudness: int): 8 | self.loudness = loudness 9 | 10 | def transform(self, audio: Audio): 11 | meter = pyln.Meter(audio.samplingRate) # create BS.1770 meter 12 | 13 | loudnessNormalizedAudio = pyln.normalize.loudness(audio.timeSeries, audio.loudness, self.loudness) 14 | newAudio = Audio(loudnessNormalizedAudio, audio.samplingRate, audio.id, audio.name) 15 | return newAudio -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step3_DownloadText.py: -------------------------------------------------------------------------------- 1 | 2 | from huiAudioCorpus.persistenz.GutenbergBookPersistenz import GutenbergBookPersistenz 3 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 4 | 5 | class 
Step3_DownloadText: 6 | 7 | def __init__(self, GutenbergBookPersistenz: GutenbergBookPersistenz, savePath: str): 8 | self.savePath = savePath 9 | self.GutenbergBookPersistenz = GutenbergBookPersistenz 10 | 11 | def run(self): 12 | return DoneMarker(self.savePath).run(self.script) 13 | 14 | def script(self): 15 | self.GutenbergBookPersistenz.save() -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step1_DownloadAudio.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from huiAudioCorpus.persistenz.AudiosFromLibrivoxPersistenz import AudiosFromLibrivoxPersistenz 4 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 5 | 6 | 7 | class Step1_DownloadAudio: 8 | 9 | def __init__(self, audiosFromLibrivoxPersistenz: AudiosFromLibrivoxPersistenz, savePath: str): 10 | self.savePath = savePath 11 | self.audiosFromLibrivoxPersistenz = audiosFromLibrivoxPersistenz 12 | 13 | def run(self): 14 | return DoneMarker(self.savePath).run(self.script) 15 | 16 | def script(self): 17 | self.audiosFromLibrivoxPersistenz.save() 18 | 19 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/ListToStatisticConverter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from huiAudioCorpus.model.Statistic import Statistic 3 | 4 | from typing import List, TypeVar 5 | 6 | number = TypeVar('number', int, float) 7 | 8 | class ListToStatisticConverter: 9 | 10 | def convert(self, list: List[number]): 11 | count = len(list) 12 | maximum = max(list) 13 | minimum = min(list) 14 | total = sum(list) 15 | median: float 16 | median = np.median(list) 17 | std = np.std(list) 18 | var = np.var(list) 19 | average = total/count 20 | statistic = Statistic(count,maximum,minimum,median,average,total, std, var) 21 | return statistic 22 | -------------------------------------------------------------------------------- /huiAudioCorpus/filter/AudioFilter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Audio import Audio 2 | from typing import List 3 | 4 | class AudioFilter: 5 | 6 | def __init__(self, maxDuration = None, names: List[str] = None): 7 | 8 | self.maxDuration = float('inf') if maxDuration is None else maxDuration 9 | self.names = names 10 | 11 | 12 | def isAllowed(self, audio: Audio): 13 | if audio.duration >= self.maxDuration: 14 | return False 15 | if self.names is not None and audio.name not in self.names: 16 | return False 17 | return True 18 | 19 | def filter(self, audios: List[Audio]): 20 | filteredAudios = [audio for audio in audios if self.isAllowed(audio)] 21 | return filteredAudios 22 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/PhoneticSentenceToSymbolSentenceConverter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.PhoneticChars import PhoneticChars 2 | from huiAudioCorpus.model.PhoneticSentence import PhoneticSentence 3 | from huiAudioCorpus.model.SymbolSentence import SymbolSentence 4 | 5 | class PhoneticSentenceToSymbolSentenceConverter: 6 | def __init__(self): 7 | self.symbols = PhoneticChars().chars 8 | self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)} 9 | 10 | def convert(self, phoneticSentence:PhoneticSentence): 11 | sentence = phoneticSentence.sentence 12 | symbols = 
[self.getId(char) for char in sentence] 13 | return SymbolSentence(symbols) 14 | 15 | def getId(self, char): 16 | return self.symbol_to_id[char] +1 -------------------------------------------------------------------------------- /huiAudioCorpus/converter/ListToHistogramConverter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Histogram import Histogram 2 | from typing import List, TypeVar 3 | 4 | import numpy as np 5 | number = TypeVar('number', int, float) 6 | 7 | class ListToHistogramConverter: 8 | def __init__(self, stepSize: int): 9 | self.stepSize =stepSize 10 | 11 | def convert(self, list: List[number]): 12 | bins = np.arange(round(min(1,min(list)))-1,max(list) + 2*self.stepSize,self.stepSize) 13 | exportBins: List[number] 14 | values : List[number] 15 | valuesNumpy, exportBinsNumpy = np.histogram(list, bins=bins) # type: ignore 16 | exportBins = exportBinsNumpy.tolist()# type: ignore 17 | values = valuesNumpy.tolist()# type: ignore 18 | histogram = Histogram(exportBins[:-1], values) 19 | return histogram -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioSamplingRateTransformer.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | from huiAudioCorpus.model.Audio import Audio 3 | 4 | 5 | class AudioSamplingRateTransformer(): 6 | 7 | def __init__(self, targetSamplingRate: int = None): 8 | self.targetSamplingRate = targetSamplingRate 9 | 10 | def transform(self, audio: Audio ): 11 | if self.targetSamplingRate is None: 12 | return audio 13 | if audio.samplingRate == self.targetSamplingRate: 14 | return audio 15 | audioTimeSeries = audio.timeSeries 16 | samplingRate = audio.samplingRate 17 | resampledTimeSeries = librosa.core.resample(audioTimeSeries, samplingRate, self.targetSamplingRate) 18 | resampledAudio = Audio(resampledTimeSeries, self.targetSamplingRate, audio.id, audio.name) # type:ignore 19 | return resampledAudio 20 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/SentenceAlignment.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | from huiAudioCorpus.model.Sentence import Sentence 3 | 4 | class SentenceAlignment(ToString): 5 | def __init__(self, sourceText: Sentence, alignedText: Sentence, start: int, end: int, distance: float, leftIsPerfekt:bool = False, rightIsPerfekt: bool = False, isFirst : bool = False, isLast: bool = False, isPerfect: bool = False, isSkipped: bool = False): 6 | self.sourceText = sourceText 7 | self.alignedText = alignedText 8 | self.start = start 9 | self.end = end 10 | self.distance = distance 11 | self.leftIsPerfekt = leftIsPerfekt 12 | self.rightIsPerfekt= rightIsPerfekt 13 | self.isFirst = isFirst 14 | self.isLast = isLast 15 | self.isPerfect = isPerfect 16 | self.isSkipped = isSkipped -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/SentenceDistanceTransformer.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Sentence import Sentence 2 | from Levenshtein import distance as LevensteinDistance 3 | 4 | class SentenceDistanceTransformer: 5 | 6 | def transform(self, sentence1: Sentence, sentence2: Sentence): 7 | 8 | baseDistance = self.distanceTwoSentences(sentence1, sentence2) 9 | return 
baseDistance 10 | 11 | 12 | def distanceTwoSentences(self, sentence1: Sentence, sentence2: Sentence): 13 | if sentence1.wordsCount == 0 or sentence2.wordsCount == 0: 14 | return 1 # maximal distance if one of the sentences is empty 15 | 16 | sentenceString1 = "".join(sentence1.wordsWithoutChars) 17 | sentenceString2 = "".join(sentence2.wordsWithoutChars) 18 | 19 | countCharsMax = max(len(sentenceString1), len(sentenceString2)) 20 | diff = LevensteinDistance(sentenceString1, sentenceString2) 21 | distance = diff / countCharsMax 22 | return distance 23 | -------------------------------------------------------------------------------- /huiAudioCorpus/error/MatchingNotFoundError.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | class MatchingNotFoundError(Exception): 5 | 6 | def __init__(self, missingIdsIn1: List[str], missingIdsIn2: List[str], namemissingIdsIn1: str, namemissingIdsIn2: str): 7 | self.missingIdsIn1 = missingIdsIn1 8 | self.missingIdsIn2 = missingIdsIn2 9 | self.namemissingIdsIn1 = namemissingIdsIn1 10 | self.namemissingIdsIn2 = namemissingIdsIn2 11 | 12 | super().__init__(f'Missing ids from matching {self.namemissingIdsIn1} and {self.namemissingIdsIn2}') 13 | 14 | def __str__(self): 15 | return self.getString() 16 | 17 | def getString(self): 18 | string = f'Exception: Missing ids from matching {self.namemissingIdsIn1} and {self.namemissingIdsIn2}\n' 19 | string += f'missing ids in {self.namemissingIdsIn1}: {self.missingIdsIn1}\n' 20 | string += f'missing ids in {self.namemissingIdsIn2}: {self.missingIdsIn2}\n' 21 | return string -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioAddSilenceTransformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from huiAudioCorpus.model.Audio import Audio 3 | 4 | 5 | class AudioAddSilenceTransformer: 6 | 7 | def __init__(self, startDurationSeconds: float, endDurationSeconds: float): 8 | self.startDurationSeconds = startDurationSeconds 9 | self.endDurationSeconds = endDurationSeconds 10 | 11 | def transform(self, audio: Audio): 12 | silenceAudioFront = self.generateSilence(self.startDurationSeconds, audio.samplingRate) 13 | silenceAudioBack = self.generateSilence(self.endDurationSeconds, audio.samplingRate) 14 | newAudio = silenceAudioFront + audio + silenceAudioBack 15 | return newAudio 16 | 17 | def generateSilence(self, duration: float, samplingRate: int): 18 | silenceDataPoints = int(duration * samplingRate) 19 | silence = np.zeros(silenceDataPoints) 20 | silenceAudio = Audio(silence, samplingRate, 's', 's') 21 | return silenceAudio 22 | -------------------------------------------------------------------------------- /scripts/generateAudioStatistic.py: -------------------------------------------------------------------------------- 1 | from numpy import source 2 | from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection 3 | 4 | loadPath = '/media/ppuchtler/LangsameSSD/Projekte/espnet/egs2/HUI_Tacotron/tts1/inferences' 5 | savePath = '/media/ppuchtler/LangsameSSD/Projekte/espnet/egs2/HUI_Tacotron/tts1/hokuspokus_statistic' 6 | 7 | diConfig = { 8 | 'step7_AudioRawStatistic': { 9 | 'savePath': savePath + '/raw', 10 | 'loadPath': loadPath 11 | } 12 | } 13 | DependencyInjection(diConfig).step7_AudioRawStatistic.run() 14 | 15 | diConfig = { 16 | 'step8_DatasetStatistic': { 17 | 'savePath': savePath + '/stats', 18 | 'loadPath': savePath + '/raw/overview.csv', 19 | 
'specialSpeackers': [], 20 | 'filter': None 21 | }, 22 | 'audioPersistenz': { 23 | 'loadPath': '' 24 | }, 25 | 'transcriptsPersistenz': { 26 | 'loadPath': '' 27 | }, 28 | 'plot': { 29 | 'showDuration': 0 30 | } 31 | } 32 | DependencyInjection(diConfig).step8_DatasetStatistic.run() -------------------------------------------------------------------------------- /huiAudioCorpus/error/DependencyInjectionError.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | class DependencyInjectionError(Exception): 5 | 6 | def __init__(self, exception: Exception, classConfig: Dict[str, str], className: str, requestedClassName: str): 7 | self.exception = exception 8 | self.classConfig = classConfig 9 | self.className = className 10 | self.requestedClassName = requestedClassName 11 | 12 | super().__init__(f'Dependent object {self.className} could not be injected for {self.requestedClassName}') 13 | 14 | def __str__(self): 15 | return self.getString() 16 | 17 | def getString(self): 18 | string = '\n+++++++++++++++++++++++++\n' 19 | string += 'Error during creation of dependencies. Maybe your config is wrong. \n' 20 | string += f'Dependent object "{self.className}" could not be injected for "{self.requestedClassName}" \n' 21 | string += f'with error message: {self.exception} \n' 22 | string += f'config parameters used are: {self.classConfig}\n' 23 | string += '+++++++++++++++++++++++++\n' 24 | return string -------------------------------------------------------------------------------- /huiAudioCorpus/utils/SecureFTP.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.persistenz.CredentialsPersistenz import CredentialsPersistenz 2 | from huiAudioCorpus.utils.PathUtil import PathUtil 3 | import pysftp 4 | 5 | # This class is hard to test. Because the risk is not that high, I decided not to cover this class with automated tests. 
Pascal 6 | class SecureFTP:# pragma: no cover 7 | def __init__(self, pathUtil: PathUtil, server: str, credentialsPersistenz: CredentialsPersistenz): 8 | cnopts = pysftp.CnOpts() 9 | credentials = credentialsPersistenz.load(server) 10 | cnopts.hostkeys = None 11 | self.connection = pysftp.Connection(server, username=credentials.username, password=credentials.password, cnopts=cnopts) 12 | self.pathUtil = pathUtil 13 | 14 | def getFiles(self, path: str): 15 | files = self.connection.listdir(path) 16 | return files 17 | 18 | def copyFile(self, sourcePath: str, targetPath: str): 19 | source = self.connection.open(sourcePath,'rb') 20 | self.pathUtil.copyFileWithStream(source, self.getSize(sourcePath), targetPath)# type:ignore 21 | source.close() 22 | 23 | def getSize(self, sourcePath: str): 24 | stats = self.connection.stat(sourcePath) 25 | size = stats.st_size 26 | return size -------------------------------------------------------------------------------- /huiAudioCorpus/model/Transcripts.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Sentence import Sentence 2 | from typing import List 3 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 4 | from pandas.core.frame import DataFrame 5 | class Transcripts(ToString): 6 | def __init__(self, transcripts: DataFrame, id: str, name: str): 7 | self.transcripts = transcripts 8 | self.id = id 9 | self.name = name 10 | 11 | 12 | @property 13 | def transcriptsCount(self): 14 | return self.transcripts.shape[0] 15 | 16 | @property 17 | def example(self): 18 | return self.transcripts.values[0][0] 19 | 20 | @property 21 | def keys(self) -> List[str]: 22 | #TODO: This is not generalizable at all! We should introduce column labels 23 | return list(self.transcripts[0].values) # type:ignore 24 | 25 | @property 26 | def text(self)-> List[str]: 27 | #TODO: This is not generalizable at all! 
We should introduce column labels 28 | return list(self.transcripts[self.transcripts.columns[-1]].values) # type:ignore 29 | 30 | 31 | def sentences(self) -> List[Sentence]: 32 | sentences = [] 33 | for key, text in zip(self.keys, self.text): 34 | if type(text) == str: 35 | sentences.append(Sentence(text,key)) 36 | return sentences -------------------------------------------------------------------------------- /huiAudioCorpus/ui/Plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from huiAudioCorpus.model.Histogram import Histogram 3 | from huiAudioCorpus.utils.PathUtil import PathUtil 4 | import logging 5 | logging.getLogger('matplotlib.font_manager').disabled = True 6 | logging.getLogger('matplotlib.colorbar').disabled = True 7 | 8 | class Plot: 9 | def __init__(self, showDuration: int, savePath: str = ''): 10 | self.showDuration = showDuration 11 | self.savePath = savePath 12 | self.pathUtil = PathUtil() 13 | 14 | 15 | def histogram(self, histogram:Histogram, name:str, logScaleY = False, logScaleX = False): 16 | plt.clf() 17 | _, ax = plt.subplots() 18 | 19 | 20 | ax.bar(histogram.bins,histogram.values, width=1) # type: ignore 21 | ax.set_ylabel('count') # type: ignore 22 | ax.set_xlabel('bins') # type: ignore 23 | ax.set_title(name) # type: ignore 24 | if logScaleY: 25 | ax.set_yscale('log') 26 | if logScaleX: 27 | ax.set_xscale('log') 28 | 29 | def show(self): 30 | plt.show(block=False) 31 | plt.pause(self.showDuration) 32 | plt.close() 33 | 34 | def save(self, filename: str): 35 | filename = self.savePath + '/' + filename 36 | self.pathUtil.createFolderForFile(filename) 37 | plt.savefig(filename, dpi=200) 38 | -------------------------------------------------------------------------------- /huiAudioCorpus/utils/DoneMarker.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from os import unlink 4 | from os.path import isfile 5 | from huiAudioCorpus.enum.PipelineReturnEnum import PipelineReturnEnum 6 | from huiAudioCorpus.utils.PathUtil import PathUtil 7 | 8 | class DoneMarker: 9 | doneFilename = '.done' 10 | 11 | def __init__(self, path: str): 12 | self.path = path 13 | self.doneFilePath = path + '/' + self.doneFilename 14 | self.pathUtil = PathUtil() 15 | 16 | def isDone(self): 17 | isDone = os.path.exists(self.doneFilePath) 18 | return isDone 19 | 20 | def setDone(self): 21 | self.pathUtil.createFolderForFile(self.doneFilePath) 22 | f = open(self.doneFilePath, "w") 23 | f.write(f'Done at: {datetime.now()}') 24 | f.close() 25 | 26 | def remove(self): 27 | if isfile(self.doneFilePath): 28 | unlink(self.doneFilePath) 29 | 30 | def getInfo(self): 31 | return 'Continue to next step because of done marker.' 
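    # A hedged usage sketch: every workflow step wraps its work in a DoneMarker so
    # finished steps are skipped on re-runs (mirroring e.g. Step1_DownloadAudio):
    #   result = DoneMarker(savePath).run(script)   # runs script() only when no '.done' file exists yet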
32 | 33 | def run(self, script, deleteFolder=True): 34 | if self.isDone(): 35 | print(self.getInfo()) 36 | return PipelineReturnEnum.OkWithDoneMarker 37 | 38 | if deleteFolder: 39 | self.pathUtil.deleteFolder(self.path) 40 | 41 | script() 42 | 43 | self.setDone() 44 | return PipelineReturnEnum.Ok -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioFadeTransformer.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Audio import Audio 2 | import numpy as np 3 | 4 | class AudioFadeTransformer: 5 | 6 | def __init__(self, fadeInDuration: float = 0.1, fadeOutDuration: float = 0.1): 7 | self.fadeInDuration = fadeInDuration 8 | self.fadeOutDuration = fadeOutDuration 9 | 10 | def transform(self, audio: Audio): 11 | audio = self.fadeOut(audio) 12 | audio = self.fadeIn(audio) 13 | return audio 14 | 15 | 16 | def fadeOut(self, audio: Audio) -> Audio: 17 | countOfSamples = int(self.fadeOutDuration * audio.samplingRate) 18 | end = audio.samples 19 | start = end - countOfSamples 20 | 21 | # compute fade out curve 22 | # linear fade 23 | fade_curve = np.linspace(1.0, 0.0, countOfSamples) 24 | 25 | # apply the curve 26 | audio.timeSeries[start:end] = audio.timeSeries[start:end] * fade_curve 27 | return audio 28 | 29 | def fadeIn(self, audio: Audio) -> Audio: 30 | countOfSamples = int(self.fadeInDuration * audio.samplingRate) 31 | end = countOfSamples 32 | start = 0 33 | 34 | # compute fade in curve 35 | # linear fade 36 | fade_curve = np.linspace(0.0, 1.0, countOfSamples) 37 | 38 | # apply the curve 39 | audio.timeSeries[start:end] = audio.timeSeries[start:end] * fade_curve 40 | return audio -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step2_1_AudioStatistic.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 2 | from huiAudioCorpus.components.AudioStatisticComponent import AudioStatisticComponent 3 | from huiAudioCorpus.ui.Plot import Plot 4 | 5 | 6 | class Step2_1_AudioStatistic: 7 | def __init__(self, savePath: str, audioStatisticComponent: AudioStatisticComponent, plot: Plot): 8 | self.savePath = savePath 9 | self.audioStatisticComponent = audioStatisticComponent 10 | self.plot = plot 11 | 12 | def run(self): 13 | doneMarker = DoneMarker(self.savePath) 14 | result = doneMarker.run(self.script, deleteFolder=False) 15 | return result 16 | 17 | def script(self): 18 | statistics, rawData = self.audioStatisticComponent.run() 19 | 20 | self.plot.histogram(statistics['duration']['histogram'], statistics['duration']['description']) 21 | self.plot.save('audioLength') 22 | self.plot.show() 23 | 24 | with open(self.savePath + '/statistic.txt', 'w') as textFile: 25 | for statistic in statistics.values(): 26 | print(statistic['description']) 27 | print(statistic['statistic']) 28 | textFile.write(statistic['description']) 29 | textFile.write('\n') 30 | textFile.write(str(statistic['statistic'])) 31 | textFile.write('\n') 32 | -------------------------------------------------------------------------------- /huiAudioCorpus/utils/ModelToStringConverter.py: -------------------------------------------------------------------------------- 1 | classHighlither = '###' 2 | endOfClass = '____' 3 | 4 | class ToString(): 5 | def __str__(self): 6 | return ModelToStringConverter().convert(self) # pragma: no cover 7 | 8 | class 
ModelToStringConverter: 9 | def convert(self, model): 10 | strings = [] 11 | strings.append(self.getClassText(model)) 12 | strings.append('') 13 | attributes = self.getAllAttributes(model) 14 | strings.extend(self.getMethodText(model, attr) for attr in attributes) 15 | strings.append(endOfClass) 16 | string = '\n'.join(strings) 17 | return string 18 | 19 | def getClassText(self, model): 20 | string = classHighlither + ' ' + model.__class__.__name__ + ' ' + classHighlither 21 | return string 22 | 23 | def getAllAttributes(self, model): 24 | attr: str 25 | allAttributes = dir(model) 26 | allAttributes = [attr for attr in allAttributes if not attr.startswith('__')] 27 | return allAttributes 28 | 29 | def getMethodText(self, model, methodName: str): 30 | value = getattr(model, methodName) 31 | valueString = self.getValueText(value) 32 | string = methodName + ' ' + str(type(value)) + ': ' + valueString 33 | return string 34 | 35 | def getValueText(self, value): 36 | if isinstance(value, float): 37 | return str(round(value, 2)) 38 | 39 | string = str(value) 40 | if len(string) > 20: 41 | return string[:20] + ' ...' 42 | return string -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/TranscriptsPersistenz.py: -------------------------------------------------------------------------------- 1 | from pandas.core.frame import DataFrame 2 | from huiAudioCorpus.model.Transcripts import Transcripts 3 | from huiAudioCorpus.utils.FileListUtil import FileListUtil 4 | from huiAudioCorpus.utils.PathUtil import PathUtil 5 | import pandas as pd 6 | 7 | class TranscriptsPersistenz: 8 | def __init__(self, loadPath: str, savePath: str = None, fileExtension: str = 'csv'): 9 | self.savePath = loadPath if savePath is None else savePath 10 | self.loadPath = loadPath 11 | self.fileExtension = fileExtension 12 | self.fileListUtil = FileListUtil() 13 | self.pathUtil = PathUtil() 14 | 15 | def getIds(self): 16 | transcriptsFiles = self.fileListUtil.getFiles(self.loadPath, self.fileExtension) 17 | transcriptsFiles = [file.replace(self.loadPath, '')[1:-len(self.fileExtension)-1] for file in transcriptsFiles] 18 | return transcriptsFiles 19 | 20 | def load(self, id: str): 21 | targetPath = self.loadPath + '/' + id + '.' + self.fileExtension 22 | csv: DataFrame 23 | csv = pd.read_csv(targetPath, sep='|', header=None) # type: ignore 24 | name = self.pathUtil.filenameWithoutExtension(targetPath) 25 | transcripts = Transcripts(csv, id, name) 26 | return transcripts 27 | 28 | def save(self, transcripts: Transcripts): 29 | targetPath = self.savePath + '/' + transcripts.id + '.' + self.fileExtension 30 | self.pathUtil.createFolderForFile(targetPath) 31 | trans = transcripts.transcripts 32 | trans.to_csv(targetPath, sep='|', header=None, index=False) # type: ignore 33 | 34 | def loadAll(self): 35 | ids = self.getIds() 36 | for id in ids: 37 | yield self.load(id) -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Sonja.json: -------------------------------------------------------------------------------- 1 | { 2 | "deutschland_ein_wintermrchen": { 3 | "title": "deutschland_ein_wintermrchen", 4 | "LibrivoxBookName": "Deutschland. 
Ein Wintermärchen", 5 | "GutenbergId": 6079, 6 | "GutenbergStart": "VORWORT", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "1844":"achtzehnhundertvierundvierzig", 10 | "17.":"siebzehnsten", 11 | "***":"Punkt Punkt Punkt", 12 | "CAPUT I\n":"CAPUT eins", 13 | "CAPUT II\n":"CAPUT zwei", 14 | "CAPUT III\n":"CAPUT drei", 15 | "CAPUT IV\n":"CAPUT vier", 16 | "CAPUT V\n":"CAPUT fünf", 17 | "CAPUT VI\n":"CAPUT sechs", 18 | "CAPUT VII\n":"CAPUT sieben", 19 | "CAPUT VIII\n":"CAPUT acht", 20 | "CAPUT IX\n":"CAPUT neun", 21 | "CAPUT X\n":"CAPUT zehn", 22 | "CAPUT XI\n":"CAPUT elf", 23 | "CAPUT XII\n":"CAPUT zwölf", 24 | "CAPUT XIII\n":"CAPUT dreizehn", 25 | "CAPUT XIV\n":"CAPUT vierzehn", 26 | "CAPUT XV\n":"CAPUT fünfzehn", 27 | "CAPUT XVI\n":"CAPUT sechszehn", 28 | "CAPUT XVII\n":"CAPUT siebzehn", 29 | "CAPUT XVIII\n":"CAPUT achtzehn", 30 | "CAPUT XIX\n":"CAPUT neunzehn", 31 | "CAPUT XX\n":"CAPUT zwanzig", 32 | "CAPUT XXI\n":"CAPUT einundzwanzig", 33 | "CAPUT XXII\n":"CAPUT zweiundzwanzig", 34 | "CAPUT XXIII\n":"CAPUT dreiundzwanzig", 35 | "CAPUT XXIV\n":"CAPUT vierundzwanzig", 36 | "CAPUT XXV\n":"CAPUT fünfunzwanzig", 37 | "CAPUT XXVI\n":"CAPUT sechundzwanzig", 38 | "CAPUT XXVII\n":"CAPUT siebenundzwanzig" 39 | } 40 | } 41 | } -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step2_SplitAudio.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from typing import List 4 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 5 | from huiAudioCorpus.transformer.AudioSplitTransformer import AudioSplitTransformer 6 | from huiAudioCorpus.transformer.AudioLoudnessTransformer import AudioLoudnessTransformer 7 | from huiAudioCorpus.model.Audio import Audio 8 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 9 | from joblib import Parallel, delayed 10 | 11 | class Step2_SplitAudio: 12 | 13 | def __init__(self, audioSplitTransformer:AudioSplitTransformer , audioPersistenz: AudioPersistenz, savePath: str, bookName: str, audioLoudnessTransformer: AudioLoudnessTransformer, remapSort: List[int] = None): 14 | self.audioPersistenz = audioPersistenz 15 | self.savePath = savePath 16 | self.audioSplitTransformer = audioSplitTransformer 17 | self.bookName = bookName 18 | self.audioLoudnessTransformer = audioLoudnessTransformer 19 | self.remapSort = remapSort 20 | 21 | def run(self): 22 | return DoneMarker(self.savePath).run(self.script) 23 | 24 | def script(self): 25 | audios = self.audioPersistenz.loadAll() 26 | if self.remapSort: 27 | audios = list(audios) 28 | audios = [audios[i] for i in self.remapSort] 29 | 30 | Parallel(n_jobs=1, verbose=10, batch_size= 100)(delayed(self.splitOneAudio)(audio, index) for index, audio in enumerate(audios)) 31 | 32 | 33 | def splitOneAudio(self, audio: Audio, index:int): 34 | splittedAudios = self.audioSplitTransformer.transform(audio, self.bookName, index+1) 35 | for splitAudio in splittedAudios: 36 | loudnessAudio = self.audioLoudnessTransformer.transform(splitAudio) 37 | self.audioPersistenz.save(loudnessAudio) -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/AudioPersistenz.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import soundfile 3 | from huiAudioCorpus.model.Audio import Audio 4 | from nptyping import NDArray 5 | from huiAudioCorpus.utils.FileListUtil import FileListUtil 6 | from 
huiAudioCorpus.utils.PathUtil import PathUtil 7 | from natsort import natsorted 8 | 9 | class AudioPersistenz: 10 | def __init__(self, loadPath:str, savePath: str = None , fileExtension:str = 'wav'): 11 | self.savePath = loadPath if savePath is None else savePath 12 | self.loadPath = loadPath 13 | self.fileExtension = fileExtension 14 | self.fileListUtil = FileListUtil() 15 | self.pathUtil = PathUtil() 16 | 17 | def load(self, id: str): 18 | audioTimeSeries: NDArray 19 | samplingRate: int 20 | targetPath = self.loadPath +'/' + id + '.' + self.fileExtension 21 | name = self.pathUtil.filenameWithoutExtension(targetPath) 22 | audioTimeSeries, samplingRate = librosa.core.load(targetPath, sr=None) # type: ignore 23 | audio = Audio(audioTimeSeries, samplingRate, id, name) 24 | return audio 25 | 26 | def save(self, audio: Audio): 27 | targetPath = self.savePath + '/' + audio.id + '.wav' 28 | self.pathUtil.createFolderForFile(targetPath) 29 | audioTimeSeries = audio.timeSeries 30 | samplingRate = audio.samplingRate 31 | soundfile.write(targetPath, audioTimeSeries, samplingRate) 32 | 33 | def getNames(self): 34 | names = [self.transformIdToName(id) for id in self.getIds()] 35 | return names 36 | 37 | def getIds(self): 38 | audioFiles = self.fileListUtil.getFiles(self.loadPath, self.fileExtension) 39 | audioFiles = [file.replace(self.loadPath,'')[1:-len(self.fileExtension)-1] for file in audioFiles] 40 | audioFiles = natsorted(audioFiles) 41 | 42 | return audioFiles 43 | 44 | def loadAll(self): 45 | ids = self.getIds() 46 | for id in ids: 47 | yield self.load(id) 48 | 49 | def transformIdToName(self, id: str): 50 | return self.pathUtil.filenameWithoutExtension(id) -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step4_TranscriptAudio.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import count 3 | from typing import List 4 | from huiAudioCorpus.model.Audio import Audio 5 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 6 | from pandas.core.frame import DataFrame 7 | from huiAudioCorpus.model.Transcripts import Transcripts 8 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 9 | from huiAudioCorpus.converter.AudioToSentenceConverter import AudioToSentenceConverter 10 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 11 | from tqdm import tqdm 12 | import numpy as np 13 | from joblib import Parallel, delayed 14 | 15 | class Step4_TranscriptAudio: 16 | 17 | def __init__(self, savePath: str, audioToSentenceConverter: AudioToSentenceConverter, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, numberWorker = 4): 18 | self.savePath = savePath 19 | self.audioToSentenceConverter = audioToSentenceConverter 20 | self.audioPersistenz = audioPersistenz 21 | self.transcriptsPersistenz = transcriptsPersistenz 22 | self.numberWorker = numberWorker 23 | 24 | 25 | def run(self): 26 | return DoneMarker(self.savePath).run(self.script) 27 | 28 | def script(self): 29 | ids = self.audioPersistenz.getIds() 30 | chunks = np.array_split(ids, self.numberWorker) 31 | 32 | parallelResult = Parallel(n_jobs=self.numberWorker)(delayed(self.loadOneChunk)(audioIds, chunkId) for chunkId, audioIds in enumerate(chunks)) 33 | 34 | results = [[sentence.id, sentence.sentence] for level in parallelResult for sentence in level] 35 | 36 | csv = DataFrame(results) 37 | transcripts = Transcripts(csv, 'transcripts', 
'transcripts') 38 | self.transcriptsPersistenz.save(transcripts) 39 | 40 | def loadOneChunk(self, ids: List[str], chunkId: int): 41 | sentences = [] 42 | for id in tqdm(ids, desc="Chunk " + str(chunkId) + ": "): 43 | audio = self.audioPersistenz.load(id) 44 | sentence = self.audioToSentenceConverter.convert(audio) 45 | sentences.append(sentence) 46 | return sentences 47 | -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step7_AudioRawStatistic.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 2 | from huiAudioCorpus.utils.PathUtil import PathUtil 3 | import pandas as pd 4 | import os 5 | 6 | 7 | class Step7_AudioRawStatistic: 8 | def __init__(self, savePath: str, loadPath: str, pathUtil: PathUtil): 9 | self.savePath = savePath 10 | self.pathUtil = pathUtil 11 | self.loadPath = loadPath 12 | 13 | def run(self): 14 | doneMarker = DoneMarker(self.savePath) 15 | result = doneMarker.run(self.script, deleteFolder=False) 16 | return result 17 | 18 | def script(self): 19 | from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection 20 | speackers = os.listdir(self.loadPath) 21 | audioInfos = [] 22 | for speacker in speackers: 23 | if speacker == '.done': 24 | continue 25 | print('finalSummary: ' + speacker) 26 | loadPath = self.loadPath + '/' + speacker 27 | savePath = self.savePath + '/' + speacker 28 | saveFile = savePath + '/overview.csv' 29 | self.pathUtil.createFolderForFile(saveFile) 30 | localDoneMarker = DoneMarker(savePath) 31 | if localDoneMarker.isDone(): 32 | rawDataAudio = pd.read_csv(saveFile, sep='|' , index_col='id') 33 | else: 34 | diConfig = { 35 | 'audioPersistenz': { 36 | 'loadPath': loadPath, 37 | } 38 | } 39 | rawDataAudio = DependencyInjection(diConfig).audioStatisticComponent.loadAudioFiles() 40 | rawDataAudio['speacker'] = speacker 41 | 42 | diConfig = { 43 | 'transcriptsPersistenz': { 44 | 'loadPath': loadPath, 45 | } 46 | } 47 | rawDataText = DependencyInjection(diConfig).textStatisticComponent.loadTextFiles() 48 | rawData = rawDataAudio.merge(rawDataText, how='outer', on='id' ) 49 | rawData.to_csv(saveFile , sep='|') 50 | 51 | localDoneMarker.setDone() 52 | 53 | audioInfos.append(rawDataAudio) 54 | 55 | audio = pd.concat(audioInfos) 56 | audio.to_csv(self.savePath + '/overview.csv', sep='|') -------------------------------------------------------------------------------- /huiAudioCorpus/components/AudioStatisticComponent.py: -------------------------------------------------------------------------------- 1 | 2 | from huiAudioCorpus.model.Audio import Audio 3 | from pandas.core.frame import DataFrame 4 | from huiAudioCorpus.converter.ListToHistogramConverter import ListToHistogramConverter 5 | from huiAudioCorpus.converter.ListToStatisticConverter import ListToStatisticConverter 6 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 7 | from joblib import Parallel, delayed 8 | 9 | class AudioStatisticComponent: 10 | def __init__(self, audioPersistenz: AudioPersistenz, listToStatisticConverter:ListToStatisticConverter, listToHistogramConverter: ListToHistogramConverter): 11 | self.audioPersistenz = audioPersistenz 12 | self.listToStatisticConverter = listToStatisticConverter 13 | self.listToHistogramConverter = listToHistogramConverter 14 | self.columns = ['id','duration', 'loudness', 'minSilenceDB', 'samplingrate', 'silencePercent', 'averageFrequency' ] 
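        # NOTE: getStatistic() below pairs these columns positionally with its
        # descriptions list, so the two orders must stay in sync.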
15 | 16 | def run(self): 17 | rawData = self.loadAudioFiles() 18 | return self.getStatistic(rawData) 19 | 20 | def getStatistic(self, rawData): 21 | descriptions = ['Length in seconds', 'Loudness in DB', 'Minimum silence in DB', 'Samplingrate in Hz', 'Silence in percent', 'Average Frequency in Hz'] 22 | statistics = {} 23 | for column in rawData: 24 | if column not in self.columns: 25 | continue 26 | statistics[column] = { 27 | 'name': column, 28 | 'statistic': self.listToStatisticConverter.convert(rawData[column].tolist()), 29 | 'histogram': self.listToHistogramConverter.convert(rawData[column].tolist()), 30 | 'description': descriptions[len(statistics)] 31 | } 32 | 33 | return statistics, rawData 34 | 35 | def loadAudioFiles(self): 36 | result = Parallel(n_jobs=12, verbose=10, batch_size=100)(delayed(self.loadAudio)(audio) for audio in self.audioPersistenz.getIds()) 37 | rawData = DataFrame(result, columns=self.columns) 38 | rawData = rawData.set_index('id') 39 | return rawData 40 | 41 | def loadAudio(self, audioId: str): 42 | audio = self.audioPersistenz.load(audioId) 43 | return [audio.id.split("\\")[-1].split("/")[-1], round(audio.duration, 1), round(audio.loudness, 1), round(audio.silenceDB, 1), audio.samplingRate, round(audio.silencePercent * 100), round(audio.averageFrequency)] 44 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/Sentence.py: -------------------------------------------------------------------------------- 1 | from os import error 2 | from textblob import TextBlob 3 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 4 | from typing import List 5 | 6 | class Sentence(ToString): 7 | def __init__(self, sentence: str, id: str = ''): 8 | sentence = self.cleanSpaces(sentence) 9 | sentence = self.cleanSpacesPunctuation(sentence) 10 | 11 | self.sentence = sentence 12 | self.id = id 13 | 14 | textBlob = TextBlob(self.sentence.replace('.', ' . 
')) 15 | self.words = self.generateWords(textBlob) 16 | self.wordsWithoutChars: List[str] = [word.lower() for word in textBlob.words] # type: ignore 17 | self.wordsWithoutCharsAndUpperChars: List[str] = [word for word in textBlob.words] # type: ignore 18 | self.wordsCount = len(self.wordsWithoutChars) 19 | self.charCount = len(self.sentence) 20 | self.wordsMatchingWithChars = self.generateWordsMatchingWithChars(self.words, self.wordsWithoutChars) 21 | self.rawChars = "".join(self.wordsWithoutChars) 22 | 23 | def generateWords(self, textBlob: TextBlob): 24 | words = list(textBlob.tokenize()) 25 | return words 26 | 27 | def __getitem__(self, k): 28 | return Sentence(" ".join(self.wordsMatchingWithChars[k])) 29 | 30 | 31 | def generateWordsMatchingWithChars(self, words: List[str], wordsWithoutChars: List[str]): 32 | wordMatching = [] 33 | wordPointer = 0 34 | for word in words: 35 | if wordPointer < len(wordsWithoutChars) and word.lower() == wordsWithoutChars[wordPointer]: # guard reconstructed; the original condition was garbled in this dump 36 | wordMatching.append(word) 37 | wordPointer += 1 38 | else: 39 | if len(wordMatching[-1]) > 1000: 40 | print(wordMatching[-1]) 41 | raise Exception("Problems during creation of word matchings.") 42 | wordMatching[-1] += ' ' + word 43 | return wordMatching 44 | 45 | 46 | def cleanSpaces(self, text: str): 47 | text = text.replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ') # collapse runs of spaces 48 | return text 49 | 50 | def cleanSpacesPunctuation(self, text: str): 51 | punctuations = '.,;?!:"' 52 | for char in punctuations: 53 | text = text.replace(char, char + ' ') 54 | for char in punctuations: 55 | text = text.replace(' ' + char, char) 56 | text = text.replace('  ', ' ').replace('  ', ' ') 57 | return text.strip() -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step9_GenerateCleanDataset.py: -------------------------------------------------------------------------------- 1 | 2 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 3 | from huiAudioCorpus.transformer.TranscriptsSelectionTransformer import TranscriptsSelectionTransformer 4 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 5 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 6 | from huiAudioCorpus.transformer.AudioSamplingRateTransformer import AudioSamplingRateTransformer 7 | from tqdm.std import tqdm 8 | import pandas as pd 9 | 10 | class Step9_GenerateCleanDataset: 11 | 12 | def __init__(self, savePath: str, infoFile: str, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, audioSamplingRateTransformer: AudioSamplingRateTransformer, transcriptsSelectionTransformer: TranscriptsSelectionTransformer, filter): 13 | self.audioSamplingRateTransformer = audioSamplingRateTransformer 14 | self.audioPersistenz = audioPersistenz 15 | self.transcriptsPersistenz = transcriptsPersistenz 16 | self.transcriptsSelectionTransformer = transcriptsSelectionTransformer 17 | self.savePath = savePath 18 | self.infoFile = infoFile 19 | self.filter = filter 20 | 21 | def run(self): 22 | doneMarker = DoneMarker(self.savePath) 23 | result = doneMarker.run(self.script, deleteFolder=False) 24 | return result 25 | 26 | def script(self): 27 | df = pd.read_csv(self.infoFile, sep='|', index_col=0) 28 | try: 29 | df = df.set_index('id') 30 | except KeyError: 31 | pass 32 | 33 | print('Audios before: ', df.shape[0]) 34 | filteredAudios = self.filter(df) 35 | print('Audios after: ', filteredAudios.shape[0]) 36 | audiosAllowed = filteredAudios.index.tolist() 37 | 38 | self.copyAudioFiles(audiosAllowed) 39 | self.copyAndFilterTranscripts(audiosAllowed) 40 | 41 | 42 | 43 | 44 | def copyAudioFiles(self, audiosAllowed): 
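        # Streams every clip via loadAll() and re-saves (into the configured savePath)
        # only those whose name survived the filter above.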
45 | countFiles = len(self.audioPersistenz.getIds()) 46 | for audio in tqdm(self.audioPersistenz.loadAll(), total= countFiles): 47 | if audio.name in audiosAllowed: 48 | self.audioPersistenz.save(audio) 49 | 50 | def copyAndFilterTranscripts(self, usedAudioFileNames): 51 | for transcripts in tqdm(self.transcriptsPersistenz.loadAll()): 52 | filteredTranscript = self.transcriptsSelectionTransformer.transform(transcripts, usedAudioFileNames) 53 | self.transcriptsPersistenz.save(filteredTranscript) -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step6_FinalizeDataset.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.transformer.TranscriptsSelectionTransformer import TranscriptsSelectionTransformer 2 | import pandas as pd 3 | from huiAudioCorpus.model.Audio import Audio 4 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 5 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 6 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 7 | from tqdm import tqdm 8 | 9 | class Step6_FinalizeDataset: 10 | 11 | def __init__(self, savePath: str,chapterPath: str, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, transcriptsSelectionTransformer: TranscriptsSelectionTransformer): 12 | self.savePath = savePath 13 | self.audioPersistenz = audioPersistenz 14 | self.transcriptsPersistenz = transcriptsPersistenz 15 | self.chapterPath = chapterPath 16 | self.transcriptsSelectionTransformer = transcriptsSelectionTransformer 17 | 18 | 19 | def run(self): 20 | doneMarker = DoneMarker(self.savePath) 21 | result = doneMarker.run(self.script, deleteFolder=False) 22 | return result 23 | 24 | def script(self): 25 | transcriptsIterator = list(self.transcriptsPersistenz.loadAll()) 26 | transcripts = transcriptsIterator[0] 27 | transcriptsIds = [sentence.id for sentence in transcripts.sentences()] 28 | chapters = pd.read_csv(self.chapterPath) 29 | 30 | transcriptsSelectedIds = {} 31 | 32 | ids = self.audioPersistenz.getIds() 33 | audios = self.audioPersistenz.loadAll() 34 | audio: Audio 35 | for audio in tqdm(audios, total=len(ids)): 36 | book, chapter, index = audio.id.rsplit('_',2) 37 | reader:str = chapters.loc[int(chapter)-1]['Reader'] # type:ignore 38 | reader = reader.replace(' ', '_') 39 | if audio.id in transcriptsIds: 40 | path = reader + '/' + book 41 | if path in transcriptsSelectedIds: 42 | transcriptsSelectedIds[path].append(audio.id) 43 | else: 44 | transcriptsSelectedIds[path] = [audio.id] 45 | audio.id = path + '/wavs/' + audio.id 46 | self.audioPersistenz.save(audio) 47 | for path, ids in transcriptsSelectedIds.items(): 48 | localTranscripts = self.transcriptsSelectionTransformer.transform(transcripts, ids) 49 | localTranscripts.id = path + '/metadata' 50 | self.transcriptsPersistenz.save(localTranscripts) -------------------------------------------------------------------------------- /huiAudioCorpus/converter/AudioToSentenceConverter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 3 | from huiAudioCorpus.transformer.AudioSamplingRateTransformer import AudioSamplingRateTransformer 4 | from huiAudioCorpus.model.Audio import Audio 5 | from huiAudioCorpus.model.Sentence import Sentence 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | try: 10 | 
from deepspeech import Model 11 | except Exception: 12 | print('failed to load deepspeech; if you need it, try installing it') 13 | 14 | from huiAudioCorpus.sttInference import deepspeechModel 15 | 16 | class AudioToSentenceConverter: 17 | def __init__(self): 18 | self.modelPath = deepspeechModel.__path__[0] 19 | self.model = None 20 | 21 | 22 | def convert(self, audio: Audio, samplingRate: int = 15000): # note: samplingRate is currently unused; the loaded model's own sampling rate is applied instead 23 | if self.model is None: 24 | self.model, self.samplingRate = self.loadDeepspeech(self.modelPath) 25 | audioSamplingRateTransformer = AudioSamplingRateTransformer(self.samplingRate) 26 | audioSampled = audioSamplingRateTransformer.transform(audio) 27 | timeSeries = audioSampled.timeSeries 28 | timeSeries /= 1.414 # 1.414 ~ sqrt(2): headroom so the scaled signal cannot clip 29 | timeSeries *= 32767 # scale the float waveform to the int16 range deepspeech expects 30 | audioNumpy = timeSeries.astype(np.int16) 31 | 32 | transcript = self.model.stt(audioNumpy) 33 | sentence = Sentence(transcript, audio.id) 34 | return sentence 35 | 36 | def loadDeepspeech(self, modelPath: str): 37 | model = Model(modelPath+"/output_graph.pb") 38 | model.enableExternalScorer(modelPath+"/kenlm.scorer") 39 | desiredSamplingRate = model.sampleRate() 40 | return model, desiredSamplingRate 41 | 42 | 43 | if __name__ == "__main__": 44 | import librosa 45 | path = '/media/ppuchtler/LangsameSSD/Projekte/textToSpeech/datasetWorkflow/Step2_SplitAudio/audio/' 46 | 47 | addAudio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000177') 48 | audio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000077') 49 | 50 | audio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000030') 51 | audio1 = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000105') 52 | audio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000166') 53 | 54 | #audioRemove = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000001') 55 | #audio = AudioAddSilenceTransformer(10, 10).transform(audio) 56 | #audio = audio + audio 57 | 58 | converter = AudioToSentenceConverter() 59 | transcript = converter.convert(addAudio + audio + addAudio) 60 | 61 | print(transcript.sentence) -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step5_AlignText.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Transcripts import Transcripts 2 | from pandas.core.frame import DataFrame 3 | from huiAudioCorpus.model.Sentence import Sentence 4 | from huiAudioCorpus.calculator.AlignSentencesIntoTextCalculator import AlignSentencesIntoTextCalculator 5 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 6 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 7 | 8 | class Step5_AlignText: 9 | 10 | def __init__(self, savePath: str, alignSentencesIntoTextCalculator: AlignSentencesIntoTextCalculator, transcriptsPersistenz: TranscriptsPersistenz, textToAlignPath: str): 11 | self.savePath = savePath 12 | self.alignSentencesIntoTextCalculator = alignSentencesIntoTextCalculator 13 | self.transcriptsPersistenz = transcriptsPersistenz 14 | self.textToAlignPath = textToAlignPath 15 | 16 | def run(self): 17 | doneMarker = DoneMarker(self.savePath) 18 | result = doneMarker.run(self.script, deleteFolder=False) 19 | return result 20 | 21 | def script(self): 22 | transcripts = list(self.transcriptsPersistenz.loadAll()) 23 | sentences = transcripts[0].sentences() 24 | with open(self.textToAlignPath, 'r', encoding='utf8') as f: 25 | inputText = f.read() 26 | inputSentence = Sentence(inputText) 27 | 28 |
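# (annotation) What follows aligns each STT transcript back into the full book
# text: AlignSentencesIntoTextCalculator searches a sliding word window for the
# best match, marks alignments whose distance exceeds 0.2 as skipped, and only
# alignments flagged isPerfect (seamless left and right boundaries with their
# neighbours) are written to the final transcripts file; the rest end up in
# transcriptsNotPerfect.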
alignments = self.alignSentencesIntoTextCalculator.calculate(inputSentence,sentences ) 29 | notPerfektAlignments = [align for align in alignments if not align.isPerfect and not align.isSkipped] 30 | for align in notPerfektAlignments: 31 | print('------------------') 32 | print(align.sourceText.id) 33 | print(align.alignedText.sentence) 34 | print(align.sourceText.sentence) 35 | print(align.leftIsPerfekt) 36 | print(align.rightIsPerfekt) 37 | print(align.distance) 38 | 39 | print("notPerfektAlignments Percent",len(notPerfektAlignments)/len(alignments)*100) 40 | 41 | results = [[align.sourceText.id, align.alignedText.sentence]for align in alignments if align.isPerfect] 42 | 43 | csv = DataFrame(results) 44 | transcripts = Transcripts(csv, 'transcripts', 'transcripts') 45 | self.transcriptsPersistenz.save(transcripts) 46 | 47 | resultsNotPerfect = [[align.sourceText.id, align.alignedText.sentence]for align in alignments if not align.isPerfect] 48 | 49 | csv = DataFrame(resultsNotPerfect) 50 | transcripts = Transcripts(csv, 'transcriptsNotPerfect', 'transcriptsNotPerfect') 51 | self.transcriptsPersistenz.save(transcripts) 52 | -------------------------------------------------------------------------------- /scripts/createDatasetConfig/redaer.json: -------------------------------------------------------------------------------- 1 | { 2 | "rmische_geschichte_buch_1": { 3 | "title": "rmische_geschichte_buch_1", 4 | "LibrivoxBookName": "Römische Geschichte Buch 1", 5 | "GutenbergId": 3060, 6 | "GutenbergStart": "Vorrede zu der zweiten Auflage\n\n\n", 7 | "GutenbergEnd": "", 8 | "remove": [{ 9 | 10 | "start": "\n——————————————————", 11 | "end": "\n——————————————————" 12 | } 13 | ], 14 | "textReplacement": { 15 | 16 | } 17 | }, 18 | "rmische_geschichte_buch_2": { 19 | "title": "rmische_geschichte_buch_2", 20 | "LibrivoxBookName": "Römische Geschichte Buch 2", 21 | "GutenbergId": 3061, 22 | "GutenbergStart": "", 23 | "GutenbergEnd": "", 24 | "textReplacement": {} 25 | }, 26 | "rmische_geschichte_buch_3": { 27 | "title": "rmische_geschichte_buch_3", 28 | "LibrivoxBookName": "Römische Geschichte Buch 3", 29 | "GutenbergId": 3062, 30 | "GutenbergStart": "", 31 | "GutenbergEnd": "", 32 | "textReplacement": {} 33 | }, 34 | "rmische_geschichte_buch_4": { 35 | "title": "rmische_geschichte_buch_4", 36 | "LibrivoxBookName": "Römische Geschichte Buch 4", 37 | "GutenbergId": 3063, 38 | "GutenbergStart": "", 39 | "GutenbergEnd": "", 40 | "textReplacement": {} 41 | }, 42 | "rmische_geschichte_buch_5": { 43 | "title": "rmische_geschichte_buch_5", 44 | "LibrivoxBookName": "Römische Geschichte Buch 5", 45 | "GutenbergId": 3064, 46 | "GutenbergStart": "", 47 | "GutenbergEnd": "", 48 | "textReplacement": {} 49 | }, 50 | "rmische_geschichte_buch_8": { 51 | "title": "rmische_geschichte_buch_8", 52 | "LibrivoxBookName": "Römische Geschichte Buch 8", 53 | "GutenbergId": 3065, 54 | "GutenbergStart": "", 55 | "GutenbergEnd": "", 56 | "textReplacement": {} 57 | }, 58 | "reineke_fuchs": { 59 | "title": "reineke_fuchs", 60 | "LibrivoxBookName": "Reineke Fuchs", 61 | "GutenbergId": 2228, 62 | "GutenbergStart": "", 63 | "GutenbergEnd": "", 64 | "textReplacement": {} 65 | }, 66 | "hermann_und_dorothea": { 67 | "title": "hermann_und_dorothea", 68 | "LibrivoxBookName": "Hermann und Dorothea", 69 | "GutenbergId": 2312, 70 | "GutenbergStart": "", 71 | "GutenbergEnd": "", 72 | "textReplacement": {} 73 | }, 74 | "fabeln": { 75 | "title": "fabeln", 76 | "LibrivoxBookName": "Fabeln", 77 | "GutenbergId": "lichtwer/lichtfab/lichtfab.html", 
78 | "GutenbergStart": "", 79 | "GutenbergEnd": "", 80 | "textReplacement": {} 81 | } 82 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | datasetWorkflow/ 140 | huiAudioCorpus/sttInference/deepspeechModel/alphabet.txt 141 | huiAudioCorpus/sttInference/deepspeechModel/kenlm.scorer 142 | huiAudioCorpus/sttInference/deepspeechModel/lm.binary 143 | huiAudioCorpus/sttInference/deepspeechModel/output_graph.pb 144 | -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/AudiosFromLibrivoxPersistenz.py: -------------------------------------------------------------------------------- 1 | import bs4 as bs 2 | import pandas as pd 3 | from huiAudioCorpus.utils.PathUtil import PathUtil 4 | import requests 5 | import json 6 | from tqdm import tqdm 7 | from joblib import Parallel, delayed 8 | 9 | class AudiosFromLibrivoxPersistenz: 10 | 11 | def __init__ (self, bookName: str, savePath: str, chapterPath: str, url:str = 'https://librivox.org/'): 12 | self.bookName = bookName 13 | self.url = url 14 | self.savePath = savePath 15 | self.chapterPath = chapterPath 16 | self.pathUtil = PathUtil() 17 | self.limitChapters = 1000 18 | self.rangeCapters = 20 19 | 20 | def save(self): 21 | chapters, chapterDownloadLinks = self.getChapter(self.bookName) 22 | Parallel(n_jobs=-2)(delayed(self.pathUtil.copyFileFromUrl)(link ,self.savePath+ '/' + link.split('/')[-1]) for link in chapterDownloadLinks) 23 | chapters.to_csv(self.chapterPath) 24 | 25 | 26 | def getChapter(self, bookName:str): 27 | searchUrl = self.getSearchUrl(bookName, self.url) 28 | response = self.loadSearchBook(searchUrl) 29 | chapterUrl = self.extractChapterUrl(response) 30 | chapterDownloadLinks = self.getChapterLinks(chapterUrl) 31 | chapters = pd.read_html(chapterUrl) 32 | return chapters[0], chapterDownloadLinks 33 | 34 | def loadSearchBook(self, url:str ): 35 | searchResult = requests.get(url) 36 | return searchResult.text 37 | 38 | def getSearchUrl(self, bookName: str, url:str): 39 | searchUrl = url + 'api/feed/audiobooks/?format=json&title=' + bookName 40 | return searchUrl 41 | 42 | def extractChapterUrl(self, response: str): 43 | jsonInput = json.loads(response)['books'] 44 | book = jsonInput[0] 45 | urlZipFile = book['url_librivox'] 46 | return urlZipFile 47 | 48 | def extractZipUrl(self, response: str): 49 | jsonInput = json.loads(response)['books'] 50 | book = jsonInput[0] 51 | urlZipFile = book['url_zip_file'] 52 | return urlZipFile 53 | 54 | def getChapterLinks(self, url: str): 55 | searchResult = requests.get(url) 56 | searchResult.encoding = "UTF-8" 57 | soup = bs.BeautifulSoup(searchResult.text, 'html.parser') 58 | parsed_table = soup.find_all('table')[0] 59 | data = [[td.a['href'] if td.find('a') else 60 | ''.join(td.stripped_strings) 61 | for td in row.find_all('td')] 62 | for row in parsed_table.find_all('tr')] 63 | downloadLinks = [chapter[1] for chapter in data if len(chapter)>0] 64 | return downloadLinks 65 | 66 | 67 | 
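# (annotation) getIds below pages through the public Librivox catalogue API
# rather than scraping HTML: limitChapters (1000) books per request, up to
# rangeCapters (20) offset pages, i.e. at most 20,000 catalogue entries; the
# loop stops early as soon as a response carries no 'books' key.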
def getIds(self): 68 | books = [] 69 | limit = self.limitChapters 70 | for i in tqdm(range(self.rangeCapters)): 71 | requestUrl = f'https://librivox.org/api/feed/audiobooks/?format=json&limit={limit}&offset={i*limit}' 72 | page = requests.get(requestUrl) 73 | page.encoding = "UTF-8" 74 | result= json.loads(page.text) 75 | if 'books' in result: 76 | books.extend(result['books']) 77 | else: 78 | print(result) 79 | break 80 | return books -------------------------------------------------------------------------------- /huiAudioCorpus/components/TextStatisticComponent.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from huiAudioCorpus.converter.ListToHistogramConverter import ListToHistogramConverter 3 | from huiAudioCorpus.converter.ListToStatisticConverter import ListToStatisticConverter 4 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 5 | from huiAudioCorpus.converter.TranscriptsToSentencesConverter import TranscriptsToSentencesConverter 6 | from pandas.core.frame import DataFrame 7 | from collections import Counter 8 | from huiAudioCorpus.model.Sentence import Sentence 9 | 10 | class TextStatisticComponent: 11 | def __init__(self, transcriptsPersistenz: TranscriptsPersistenz, transcriptsToSentencesConverter:TranscriptsToSentencesConverter, listToStatisticConverter:ListToStatisticConverter, listToHistogramConverter: ListToHistogramConverter): 12 | self.transcriptsPersistenz = transcriptsPersistenz 13 | self.transcriptsToSentencesConverter = transcriptsToSentencesConverter 14 | self.listToStatisticConverter = listToStatisticConverter 15 | self.listToHistogramConverter = listToHistogramConverter 16 | 17 | def run(self): 18 | rawData= self.loadTextFiles() 19 | return self.getStatistic(rawData) 20 | 21 | def getStatistic(self, rawData): 22 | descriptions = ['Words count in audio', 'Chars count in audio'] 23 | ids = ['wordCount', 'charCount'] 24 | statistics = {} 25 | for column in rawData: 26 | if column not in ids: 27 | continue 28 | statistics[column] = { 29 | 'name': column, 30 | 'statistic': self.listToStatisticConverter.convert(rawData[column].tolist()), 31 | 'histogram': self.listToHistogramConverter.convert(rawData[column].tolist()), 32 | 'description': descriptions[len(statistics)] 33 | } 34 | 35 | 36 | if 'text' not in rawData: 37 | counter = Counter() 38 | uniqeWordsWithMinimum = {} 39 | 40 | else: 41 | counter = Counter([word for sentence in tqdm(rawData['text']) for word in Sentence(sentence).wordsWithoutChars]) 42 | 43 | counterValues = counter.values() 44 | uniqeWordsWithMinimum = {} 45 | remainingCounts = counterValues 46 | for minWortOccurence in tqdm(list(range(1, max(counterValues)+1))): 47 | remainingCounts = [count for count in remainingCounts if count>=minWortOccurence] 48 | uniqeWordsWithMinimum[minWortOccurence] = len(remainingCounts) 49 | if(len(remainingCounts)==1): 50 | break 51 | 52 | return statistics, rawData, counter, uniqeWordsWithMinimum 53 | 54 | def loadTextFiles(self): 55 | allSentences =[sentence for transcripts in tqdm(self.transcriptsPersistenz.loadAll(), total=len(self.transcriptsPersistenz.getIds())) for sentence in self.transcriptsToSentencesConverter.convert(transcripts)] 56 | result = [[sentence.id.split("\\")[-1].split("/")[-1], sentence.wordsCount, sentence.charCount, sentence.sentence] for sentence in tqdm(allSentences)] 57 | rawData = DataFrame(result, columns = ['id','wordCount', 'charCount', 'text']) 58 | rawData = rawData.set_index('id') 59 | 
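# (annotation) rawData ends up with one row per sentence, indexed by the bare
# utterance id (any Windows or POSIX path prefix is stripped above), with
# wordCount, charCount and the raw text as columns.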
return rawData -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HUI-Audio-Corpus-German 2 | This is the official repository for the HUI-Audio-Corpus-German. The corresponding paper is in the process of publication. With this repository it is possible to automatically recreate the dataset. It is also possible to add more speakers to the processing pipeline. 3 | 4 | Dataset: https://opendata.iisys.de 5 | 6 | Live example: http://narvi.sysint.iisys.de/projects/tts 7 | 8 | Paper (presented at the 44th German Conference on Artificial Intelligence (KI2021)): https://arxiv.org/abs/2106.06309 9 | 10 | ## Speaker overview 11 | 12 | * bernd 13 | * hokuspokus 14 | * friedrich 15 | * eva 16 | * karlsson 17 | * sonja 18 | 19 | ### Not finished 20 | 21 | * redaer 22 | 23 | ## Installation 24 | 25 | ### Requirements 26 | 27 | * Linux 28 | * Anaconda 29 | 30 | ### Set up the Python environment with Anaconda 31 | 32 | Navigate to the cloned repository. 33 | 34 | Create a new conda environment (for more information: https://salishsea-meopar-docs.readthedocs.io/en/latest/work_env/python3_conda_environment.html) 35 | ``` 36 | conda create -n huiAudioCorpus python=3.8 37 | conda activate huiAudioCorpus 38 | ``` 39 | 40 | Install the package as a development Python package (for more information: http://naoko.github.io/your-project-install-pip-setup/) 41 | 42 | ``` 43 | python setup.py develop 44 | ``` 45 | 46 | Install the dependencies: 47 | ``` 48 | pip install -r requirements.txt 49 | ``` 50 | 51 | Download https://opendata.iisys.de/opendata/Datasets/deepspeechModel/deepspeechModel.zip and copy the contents of the downloaded zip into the folder: 52 | 53 | ``` 54 | /huiAudioCorpus/sttInference/deepspeechModel 55 | ``` 56 | ### Optional installation step 57 | The deepspeech model runs on the CPU by default. This can lead to long processing times for the pipeline. If you have a compatible GPU, you can install a special version of deepspeech. 58 | More information can be found at: 59 | ``` 60 | https://deepspeech.readthedocs.io/en/r0.9/USING.html 61 | ``` 62 | ## Recreate dataset 63 | 64 | ``` 65 | cd scripts 66 | 67 | python createDataset.py 68 | ``` 69 | 70 | The configurations can be viewed and adjusted inside createDataset.py: 71 | 72 | Inside the variable "allConfigs" all speaker configurations can be added. If you want to quickly test whether the pipeline is running, you can use: 73 | 74 | ``` 75 | allConfigs = sonja 76 | ``` 77 | 78 | For all speakers, you can use: 79 | 80 | ``` 81 | allConfigs = {**bernd, **hokuspokus, **friedrich, **eva, **karlsson, **sonja} 82 | ``` 83 | 84 | The processing files and the complete dataset with statistics are created at: 85 | ``` 86 | /datasetWorkflow 87 | ``` 88 | The directory can be changed inside createDataset.py: 89 | 90 | ``` 91 | externalPaths = [ 92 | "/path/to/the/folder" 93 | ] 94 | 95 | ``` 96 | 97 | ## Adding a new speaker 98 | 99 | If you want to add a new speaker, follow these steps: 100 | * Create a JSON file for your speaker inside scripts/createDatasetConfig. There you can find examples of how the file should look. Information about the speakers can be found at datasetWorkflow/overview 101 | * Validate the text replacements; the script guides you through the needed steps 102 | * Finish the dataset and create a pull request 103 | 104 | ## Creating statistics for other datasets 105 | 106 | We have a script for the creation of statistics only.
107 | For this, the variables "loadPath" and "savePath" inside the file "scripts/generateAudioStatistic.py" have to be adjusted. 108 | -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/AudioTranscriptPairPersistenz.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.AudioTranscriptPair import AudioTranscriptPair 2 | from huiAudioCorpus.error.MatchingNotFoundError import MatchingNotFoundError 3 | from typing import List 4 | from huiAudioCorpus.converter.TranscriptsToSentencesConverter import TranscriptsToSentencesConverter 5 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 6 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 7 | 8 | class AudioTranscriptPairPersistenz: 9 | 10 | def __init__(self, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, transcriptsToSentencesConverter: TranscriptsToSentencesConverter, checkForConsistency: bool = True): # note: this flag is currently not stored; getIds and loadAll take their own checkForConsistency parameter 11 | self.audioPersistenz = audioPersistenz 12 | self.transcriptsPersistenz = transcriptsPersistenz 13 | self.transcriptsToSentencesConverter = transcriptsToSentencesConverter 14 | 15 | def load(self, audioId: str, sentenceId:str): 16 | audio = self.audioPersistenz.load(audioId) 17 | sentence = self.getAllSentences()[sentenceId] 18 | elementPair = AudioTranscriptPair(sentence, audio) 19 | return elementPair 20 | 21 | 22 | def getIds(self, checkForConsistency = True): 23 | audioIds = self.audioPersistenz.getIds() 24 | audioNames = self.audioPersistenz.getNames() 25 | sentencesIds = list(self.getAllSentences().keys()) 26 | 27 | if checkForConsistency: 28 | self.checkeIds(audioNames, sentencesIds) 29 | else: 30 | audioIds, audioNames, sentencesIds = self.removeNonExistentIds(audioIds, audioNames, sentencesIds) 31 | 32 | ids = self.sortIds(audioIds, audioNames, sentencesIds) 33 | 34 | return ids 35 | 36 | def sortIds(self, audioIds, audioNames, sentencesIds): 37 | zippedAudios = list(zip(audioIds, audioNames)) 38 | zippedAudios.sort(key = lambda x: x[1]) 39 | audioIds = [element[0] for element in zippedAudios] 40 | sentencesIds.sort() 41 | return list(zip(audioIds, sentencesIds)) 42 | 43 | 44 | def loadAll(self, checkForConsistency = True): 45 | ids = self.getIds(checkForConsistency) 46 | for audioId, sentenceId in ids: 47 | yield self.load(audioId, sentenceId) 48 | 49 | 50 | def getAllSentences(self): 51 | transcripts = list(self.transcriptsPersistenz.loadAll()) 52 | sentences = [sentence for transcript in transcripts for sentence in self.transcriptsToSentencesConverter.convert(transcript)] 53 | sentenceDict = {sentence.id: sentence for sentence in sentences} 54 | return sentenceDict 55 | 56 | def checkeIds(self, audioIds: List[str], sentenceIds: List[str]): 57 | missingAudioIds = [id for id in sentenceIds if not id in audioIds] 58 | missingSentenceIds = [id for id in audioIds if not id in sentenceIds] 59 | if missingAudioIds or missingSentenceIds: 60 | raise MatchingNotFoundError(missingAudioIds, missingSentenceIds, 'audioFiles', 'Transcripts') 61 | 62 | def removeNonExistentIds(self, audioIds: List[str], audioNames: List[str], sentenceIds: List[str]): 63 | audioIds = [id for id, name in zip(audioIds, audioNames) if name in sentenceIds] 64 | audioNames = [name for name in audioNames if name in sentenceIds] 65 | sentenceIds = [id for id in sentenceIds if id in audioNames] 66 | return audioIds, audioNames, sentenceIds
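A minimal sketch of the two consistency modes of AudioTranscriptPairPersistenz.getIds; the ids below are invented purely for illustration:

```python
# Hypothetical ids, not taken from the corpus.
audioNames = ['kapitel_01_f000001', 'kapitel_01_f000002']
sentenceIds = ['kapitel_01_f000001']

# getIds(checkForConsistency=True) calls checkeIds, which raises
# MatchingNotFoundError because 'kapitel_01_f000002' has no transcript.
# getIds(checkForConsistency=False) calls removeNonExistentIds instead,
# silently dropping the orphaned audio, so loadAll() only yields the pair
# for 'kapitel_01_f000001'.
```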
-------------------------------------------------------------------------------- /huiAudioCorpus/model/Audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.lib.function_base import average 3 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 4 | from nptyping import NDArray 5 | import pyloudnorm as pyln 6 | import librosa 7 | 8 | class Audio(ToString): 9 | def __init__(self, audioTimeSeries: NDArray, samplingRate: int, id: str, name: str): 10 | self.timeSeries = audioTimeSeries 11 | self.samplingRate = samplingRate 12 | self.name = name 13 | self.id = id 14 | 15 | 16 | @property 17 | def samples(self)->int: 18 | return self.timeSeries.shape[0] 19 | 20 | @property 21 | def duration(self)-> float: 22 | return self.samples/ self.samplingRate 23 | 24 | 25 | def __add__(self, other: 'Audio') -> 'Audio': 26 | audioTimeSeries = self.timeSeries.tolist() + other.timeSeries.tolist() 27 | audioTimeSeries = np.array(audioTimeSeries) 28 | id = self.id + '&' + other.id 29 | name = self.name + '&' + other.name 30 | 31 | samplingRateSelf = self.samplingRate 32 | samplingRateOther = other.samplingRate 33 | if samplingRateOther != samplingRateSelf: 34 | raise ValueError(f"The samplingrates from the audio files are different sr1: {samplingRateSelf} sr2: {samplingRateOther} from the audio files with the combined id: {id} and name: {name}") 35 | 36 | audio = Audio(audioTimeSeries,samplingRateSelf, id, name) 37 | return audio 38 | 39 | def __radd__(self, other): 40 | return self 41 | 42 | @property 43 | def loudness(self)->float: 44 | meter = pyln.Meter(self.samplingRate) # create BS.1770 meter 45 | loudness = meter.integrated_loudness(self.timeSeries) 46 | return loudness 47 | 48 | @property 49 | def silenceDB(self)->float: 50 | silenceDurationInSeconds= 0.05 51 | frameLength = int(silenceDurationInSeconds* self.samplingRate) 52 | for silenceDezibel in range(100, 1,-1): 53 | splitted = librosa.effects.split(self.timeSeries,silenceDezibel , frame_length=frameLength, hop_length=int(frameLength/4)) 54 | if len(splitted)>1: 55 | return -silenceDezibel 56 | return 0 57 | 58 | @property 59 | def silencePercent(self)->float: 60 | states = self.isLoud() 61 | silencePercent = 1- sum(states)/len(states) 62 | return silencePercent 63 | 64 | def isLoud(self): 65 | #https://librosa.org/doc/latest/auto_examples/plot_viterbi.html#sphx-glr-auto-examples-plot-viterbi-py 66 | rms = librosa.feature.rms(y=self.timeSeries)[0]# type: ignore 67 | 68 | r_normalized = (rms - 0.02) / np.std(rms) 69 | p = np.exp(r_normalized) / (1 + np.exp(r_normalized))# type: ignore 70 | 71 | 72 | transition = librosa.sequence.transition_loop(2, [0.5, 0.6]) 73 | full_p = np.vstack([1 - p, p]) 74 | states = librosa.sequence.viterbi_discriminative(full_p, transition) 75 | return states 76 | 77 | @property 78 | def averageFrequency(self)->float: 79 | try: 80 | cent = librosa.feature.spectral_centroid(y=self.timeSeries, sr=self.samplingRate)[0] #type: ignore 81 | loudPositions = self.isLoud() 82 | 83 | centAtLoud = [cent[index] for index in range(len(cent)) if loudPositions[index]==1] 84 | averageFrequency = round(average(centAtLoud)) #type: ignore 85 | return averageFrequency 86 | except: 87 | return -1 88 | -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioSplitTransformer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import 
librosa 3 | from huiAudioCorpus.model.Audio import Audio 4 | from huiAudioCorpus.transformer.AudioFadeTransformer import AudioFadeTransformer 5 | import statistics 6 | 7 | class AudioSplitTransformer: 8 | 9 | def __init__(self, audioFadeTransformer: AudioFadeTransformer, maxAudioDuration: float, minAudioDuration: float, silenceDurationInSeconds:float= 0.2): 10 | self.maxAudioDuration = maxAudioDuration 11 | self.minAudioDuration = minAudioDuration 12 | self.silenceDurationInSeconds = silenceDurationInSeconds 13 | self.audioFadeTransformer = audioFadeTransformer 14 | 15 | def transform(self, audio: Audio, bookName: str, chapter: int): 16 | splitted = self.splitWithBestDezibel(audio, self.maxAudioDuration - self.minAudioDuration) 17 | splitted = self.mergeAudioToTargetDuration(splitted, self.minAudioDuration) 18 | merged = self.mergeLastAudioIfTooShort(splitted, self.minAudioDuration) 19 | withIds = self.setIds(merged, bookName, chapter) 20 | withFading = self.fade(withIds) 21 | return withIds 22 | 23 | def splitWithBestDezibel(self, audio: Audio, maxAudioDuration: float): 24 | # TODO: think about using recrusive and split just needed audio files???? 25 | splittedAudio:List[Audio]=[] 26 | maxDuration:float = 0 27 | for silenceDezibel in range(70, -20,-5): 28 | splittedAudio = self.split(audio, silenceDezibel) 29 | maxDuration = max([audio.duration for audio in splittedAudio]) 30 | if maxDuration< maxAudioDuration: 31 | print( audio.name, 'used DB:', silenceDezibel) 32 | return splittedAudio 33 | return splittedAudio 34 | 35 | def split(self, audio: Audio, silenceDezibel: int): 36 | frameLength = int(self.silenceDurationInSeconds* audio.samplingRate) 37 | splitted = librosa.effects.split(audio.timeSeries,silenceDezibel , frame_length=frameLength, hop_length=int(frameLength/4)) 38 | audios = [] 39 | for index in range(len(splitted)): 40 | (start,end) = splitted[index] 41 | isNextElementAvailable = len(splitted)> index+1 42 | if isNextElementAvailable: 43 | (nextStart, nextEnd) = splitted[index+1] 44 | betterEnd = int(statistics.mean([end, nextStart])) 45 | else: 46 | betterEnd = end 47 | 48 | isPreviousElementAvailable = not index == 0 49 | if isPreviousElementAvailable: 50 | (previousStart, previousEnd) = splitted[index-1] 51 | betterStart = int(statistics.mean([previousEnd, start])) 52 | else: 53 | betterStart = start 54 | 55 | newAudio = Audio(audio.timeSeries[betterStart:betterEnd], audio.samplingRate, 'id', 'name') 56 | audios.append(newAudio) 57 | return audios 58 | 59 | def mergeAudioToTargetDuration(self, audios: List[Audio], targetDuration: float): 60 | mergedAudios: List[Audio] = [] 61 | 62 | for audio in audios: 63 | if len(mergedAudios)>0 and mergedAudios[-1].duration>")))>0: 89 | 90 | directLink = pageSoup.find("a",text=re.compile("weiter\s*>>"))["href"] 91 | 92 | nextLink = page.url.split("/") 93 | 94 | nextLink.pop() 95 | 96 | nextLink.append(directLink) 97 | 98 | nextLink = "/".join(nextLink) 99 | 100 | return paragraphs, nextLink 101 | 102 | 103 | def prepareParagraph(self, paragraphs:List): 104 | extractedParagraphs = '' 105 | for paragraph in paragraphs: 106 | for footnote in paragraph.select('span'): 107 | footnote.extract() 108 | 109 | if len(paragraph.text) > 0: 110 | extractedParagraph = re.sub(r" +",r" ",paragraph.text.replace("\t"," ").replace("\n", " ")) 111 | 112 | extractedParagraphs += extractedParagraph.strip()+"\n" 113 | return extractedParagraphs 114 | 115 | 116 | class GuttenbergDownload: 117 | """ 118 | This class downloads a book from 
www.projekt-gutenberg.org 119 | The id has to be searched manual with the link http://gutendex.com/books/?search=ThisIsTheSearchText 120 | """ 121 | def downloadText(self, textId: int): 122 | text = strip_headers(load_etext(textId, mirror='http://eremita.di.uminho.pt/gutenberg/')).strip() 123 | return text 124 | 125 | -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Bernd_Ungerer_tausendUndEineNacht.json: -------------------------------------------------------------------------------- 1 | { 2 | "tausendUndEineNacht1": { 3 | "title": "tausend_und_eine_nacht_band_1", 4 | "LibrivoxBookName": "Tausend und eine Nacht, Band 1", 5 | "GutenbergId": "weil/band1/inhalt.html", 6 | "GutenbergStart": "", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "0, mein Teurer": "O, mein Teurer", 10 | " u. s. w.": " und so weiter. ", 11 | "100.000": "einhunderttausend", 12 | "90.000": "neunzigtausend", 13 | "10.000": "zehntausend", 14 | "50.000": "fünfzigtausend", 15 | "d. h.": " das heißt ", 16 | " z. B.": " zum Beispiel ", 17 | "H. v.": " Herr von ", 18 | "1786)": "siebzehnhundertsechsundachzig)", 19 | "1839)": "achtzehnhundertneununddreißig)", 20 | "15ten": "fünfzehnten", 21 | "16ten": "sechzehnten", 22 | "1001": "tausendundeine", 23 | "1837": "achzehnhundertsiebenunddreißig", 24 | "1000": "eintausend", 25 | "5000": "fünftausend", 26 | "261.": "zweihunderteinundsechzigte", 27 | "4500": "viertausendfünfhundert", 28 | "1200": "eintausendzweihundert", 29 | "7320": "siebentausenddreihundertzwanzig", 30 | "1226": "eintausendzweihundertsechsundzwanzig", 31 | "2500": "zweitausendfünfhundert", 32 | "4000": "viertausend", 33 | "8000": "achttausend", 34 | "3000": "dreitausend", 35 | "2000": "zweitausend", 36 | "6000": "sechstausend", 37 | "653": "sechshundertdreiundfünfzig", 38 | "636": "sechshundertsechsunddreißig", 39 | "103": "einhundertdrei", 40 | "700": "siebenhundert", 41 | "100": "einhundert", 42 | "200": "zweihundert", 43 | "800": "achthundert", 44 | "110": "einhundertzehn", 45 | "500": "fünfhundert", 46 | "400": "vierhundert", 47 | "14.": "vierzehnsten", 48 | "11.": "elften", 49 | "16.": "sechzehnsten", 50 | "13.": "dreizehnsten", 51 | "15.": "fünfzehnsten", 52 | "145": "einhundertfünfundvierzig", 53 | "170": "einhundertsiebzig", 54 | "40.": "vierzigsten", 55 | "41.": "einundvierzigsten", 56 | "98": "achtundneunzig", 57 | "39": "neununddreißig", 58 | "40": "vierzig", 59 | "70": "siebzig", 60 | "18": "achtzehn", 61 | "20": "zwanzig", 62 | "50": "fünfzig", 63 | "10": "zehn", 64 | "24": "vierundzwanzig", 65 | " v.": " von ", 66 | " H. ": " Herr ", 67 | " N.": " Nacht " 68 | } 69 | }, 70 | "tausendUndEineNacht2": { 71 | "title": "tausend_und_eine_nacht_band_2", 72 | "LibrivoxBookName": "Tausend und eine Nacht, Band 2", 73 | "GutenbergId": "weil/band2/inhalt.html", 74 | "GutenbergStart": "", 75 | "GutenbergEnd": "", 76 | "textReplacement": { 77 | "10.000": "zehntausend", 78 | " usw.": " und so weiter ", 79 | " u.s.w.": "und so weiter ", 80 | " N. N.": " so und so ", 81 | "d. 
h.": " das heißt ", 82 | " d.h.": " das heißt ", 83 | " N.N.": " so und so ", 84 | "1001": "tausendundeine", 85 | "1000": "eintausend", 86 | "1050": "eintausendfünfzig", 87 | " u.": " und ", 88 | "999": "neunhundertneunundneunzig", 89 | "12.": "zwölfsten", 90 | "2.": "zweite", 91 | "40": "vierzig" 92 | } 93 | }, 94 | "tausendUndEineNacht3": { 95 | "title": "tausend_und_eine_nacht_band_3", 96 | "LibrivoxBookName": "Tausend und eine Nacht, Band 3", 97 | "GutenbergId": "weil/band3/inhalt.html", 98 | "GutenbergStart": "", 99 | "GutenbergEnd": "", 100 | "textReplacement": { 101 | " u.s.f.": " und so fort", 102 | " Z.B.": " zum Beispiel ", 103 | " d. h.": " das heißt ", 104 | " z. B.": " zum Beispiel ", 105 | "1001": "tausendundeine", 106 | "1564": "fünfzenhundertvierundsechzig" 107 | } 108 | }, 109 | "tausendUndEineNacht4": { 110 | "title": "tausend_und_eine_nacht_band_4", 111 | "LibrivoxBookName": "Tausend und eine Nacht, Band 4", 112 | "GutenbergId": "weil/band4/inhalt.html", 113 | "GutenbergStart": "", 114 | "GutenbergEnd": "", 115 | "textReplacement": { 116 | " u.s.w.": " und so weiter ", 117 | " N. N.": " so und so ", 118 | " usw.": " und so weiter", 119 | "3,700,000": "dreimillionensiebenhundertausend", 120 | "1050000": "einemillionfünfzigtausend", 121 | "30,000": "dreißigtausend", 122 | "70.000": "siebzigtausend", 123 | "1001": "Tausendundeine", 124 | "1 1/6": "ein ein sechstel", 125 | "70000": "siebzigtausend", 126 | "75": "fünfundsiebzig", 127 | "40": "vierzig " 128 | } 129 | } 130 | } -------------------------------------------------------------------------------- /huiAudioCorpus/calculator/AlignSentencesIntoTextCalculator.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from nltk.sem.evaluate import Error 3 | from tqdm import tqdm 4 | from huiAudioCorpus.model.SentenceAlignment import SentenceAlignment 5 | from typing import List 6 | from huiAudioCorpus.model.Sentence import Sentence 7 | from huiAudioCorpus.transformer.SentenceDistanceTransformer import SentenceDistanceTransformer 8 | from joblib import Parallel, delayed 9 | 10 | rangeWords = 40 11 | class AlignSentencesIntoTextCalculator: 12 | 13 | def __init__(self, sentenceDistanceTransformer: SentenceDistanceTransformer): 14 | self.sentenceDistanceTransformer = sentenceDistanceTransformer 15 | 16 | def calculate(self, originalText: Sentence, textToAlign: List[Sentence]): 17 | 18 | alignments = self.calculateAlignments(originalText,textToAlign) 19 | alignments = self.evaluateIfPerfektStartAndEnd(alignments,originalText.wordsCount) 20 | alignments = self.getMissingWordsBetweenAlignments(alignments, originalText) 21 | return alignments 22 | 23 | def calculateAlignments(self, originalText: Sentence, textToAlign: List[Sentence]): 24 | with Parallel(n_jobs=15, batch_size=500) as parallel: 25 | alignments:List[SentenceAlignment] = [] 26 | start=0 27 | text: Sentence 28 | additionalRange = 0 29 | for text in tqdm(textToAlign): 30 | 31 | 32 | rangeStart= max(0,start-rangeWords - additionalRange) 33 | rangeEnd = min(rangeStart+2*(rangeWords + additionalRange)+text.wordsCount,originalText.wordsCount+1) 34 | 35 | if rangeEnd- rangeStart>2000: 36 | raise Exception('more than 2000 Words in search text') 37 | 38 | (newStart, end), distance = self.bestPosition(parallel,originalText[rangeStart: rangeEnd ], text, 0, rangeEnd- rangeStart) 39 | newStart += rangeStart 40 | end += rangeStart 41 | 42 | align = SentenceAlignment(text, originalText[newStart: end],newStart, end, distance) 43 | if 
distance>0.2: 44 | print('*****************') 45 | print('skip because of too high distance: ',text.id, distance) 46 | print('*****************') 47 | print(text.sentence) 48 | print('___________________') 49 | print(originalText[rangeStart: rangeEnd ].sentence) 50 | print('########') 51 | 52 | align.isSkipped = True 53 | additionalRange += 30 + text.wordsCount 54 | else: 55 | start = end 56 | additionalRange= 0 57 | alignments.append(align) 58 | return alignments 59 | 60 | def bestPosition(self,parallel:Parallel, originalText: Sentence, textToAlign: Sentence, rangeStart: int, rangeEnd: int): 61 | startEnds = [] 62 | for end in range(rangeStart, rangeEnd): 63 | for start in range(max(rangeStart,end-textToAlign.wordsCount-10), end): 64 | startEnds.append((start, end)) 65 | 66 | positionene = parallel(delayed(self.positionOneSentence)(originalText, textToAlign, start, end) for start, end in startEnds) 67 | #positionene = [self.positionOneSentence(originalText, textToAlign, start, end) for start, end in startEnds] 68 | 69 | bestPosition = min(positionene, key=operator.itemgetter(1)) # type: ignore 70 | return bestPosition 71 | 72 | def positionOneSentence(self, originalText: Sentence , textToAlign: Sentence, start: int, end: int): 73 | textToSearch = originalText[start:end] 74 | distance = self.sentenceDistanceTransformer.transform(textToSearch, textToAlign) 75 | return [(start, end), distance] 76 | 77 | 78 | def evaluateIfPerfektStartAndEnd(self,alignments: List[SentenceAlignment], originalTextLength: int): 79 | for index, align in enumerate(alignments): 80 | align.leftIsPerfekt = False 81 | align.rightIsPerfekt = False 82 | align.isFirst = index ==0 83 | align.isLast = index == len(alignments)-1 84 | 85 | if align.start==0: 86 | align.leftIsPerfekt=True 87 | if align.end == originalTextLength: 88 | align.rightIsPerfekt= True 89 | 90 | try: 91 | if align.start == alignments[index-1].end: 92 | align.leftIsPerfekt=True 93 | except: 94 | pass 95 | try: 96 | if align.end == alignments[index+1].start: 97 | align.rightIsPerfekt=True 98 | except: 99 | pass 100 | align.isPerfect = (align.leftIsPerfekt or align.isFirst) and (align.rightIsPerfekt or align.isLast) and not align.isSkipped 101 | return alignments 102 | 103 | def getMissingWordsBetweenAlignments(self, alignments: List[SentenceAlignment], originalText: Sentence): 104 | for index, aling in enumerate(alignments): 105 | if index == len(alignments)-1: 106 | continue 107 | 108 | if not aling.rightIsPerfekt: 109 | print(originalText[aling.end:alignments[index+1].start]) 110 | 111 | return alignments -------------------------------------------------------------------------------- /huiAudioCorpus/converter/SentenceToPhoneticSentenceConverter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from nltk.sem.evaluate import Error 3 | from huiAudioCorpus.model.Sentence import Sentence 4 | from huiAudioCorpus.model.PhoneticSentence import PhoneticSentence 5 | import pandas as pd 6 | 7 | class SentenceToPhoneticSentenceConverter: 8 | def __init__(self, libraryPath: str , useEmphasis: bool = True): 9 | self.library = self.createLibrary(libraryPath) 10 | self.useEmphasis = useEmphasis 11 | 12 | def convert(self, sentence: Sentence): 13 | words = sentence.words 14 | ipaWords, subWords = self.transformSentencesToIpa(words) 15 | ipaText = ' '.join(ipaWords) 16 | ipaText = self.removeEmphasis(ipaText) 17 | return PhoneticSentence(ipaText, subWords) 18 | 19 | 20 | def createLibrary(self, 
libraryPath: str): 21 | pointLibrary = pd.DataFrame({ 22 | "text": [",", ".", "?", "-", ";", "!", ":", "'", "s", "ste", "(", ")", ">", "<", '›', '‹', 'é','è', '&'], 23 | "ipa": [",", ".","?", ",", ",", "!", ":", "'", "s", "stə", ",", ",", "'", "'", "'", "'", 'e', 'e', 'ʊnt'] 24 | }) 25 | library = pd.read_csv(libraryPath,keep_default_na=False) 26 | 27 | libraryLowerCase = library.copy(deep=True) 28 | libraryLowerCase['text'] = libraryLowerCase['text'].apply(str.lower) 29 | library = library.append(pointLibrary) 30 | library = library.append(libraryLowerCase) 31 | 32 | library.set_index('text', inplace = True) 33 | library.sort_index(inplace = True) 34 | return library 35 | 36 | def transformSentencesToIpa(self, words:List[str]): 37 | ipaWords: List[str] = [] 38 | subWords: List[str] = [] 39 | index = 0 40 | while index < len(words): 41 | word = words[index] 42 | remainingWords = words[index:] 43 | countMultiwords, multiwords, multiWord = self.findMultiwordIpa(remainingWords) 44 | if countMultiwords>0 and multiwords is not None: 45 | index += countMultiwords 46 | subWords.append(multiWord) 47 | ipaWords.append(multiwords) 48 | continue 49 | ipa, subWord = self.transformWordToIpa(word) 50 | subWords.append(subWord) 51 | ipaWords.append(ipa) 52 | index +=1 53 | return ipaWords, subWords 54 | 55 | def findMultiwordIpa(self, words:List[str]): 56 | if len(words)<2: 57 | return 0, None, "" 58 | for count in range(5,1,-1): 59 | multiWord = ' '.join(words[:count]) 60 | multiwordIpa = self.getIpaFromLibrary(multiWord) 61 | if multiwordIpa is not None: 62 | return count, multiwordIpa, multiWord 63 | return 0, None, "" 64 | 65 | def transformWordToIpa(self, word:str): 66 | completeIpaLeft = '' 67 | completeIpaRight = '' 68 | completeWordLeft = [] 69 | completeWordRight = [] 70 | while word != '': 71 | remainingWordFirst, ipaFirst, firstPart = self.findFirstPartInWord(word) 72 | remainingWordLast, ipaLast, lastPart = self.findLastPartInWord(word) 73 | if len(remainingWordLast) < len(remainingWordFirst): 74 | completeIpaLeft = ipaLast + completeIpaLeft 75 | completeWordLeft.insert(0,lastPart) 76 | word = remainingWordLast 77 | else: 78 | completeIpaRight = completeIpaRight + ipaFirst 79 | completeWordRight.append(firstPart) 80 | word = remainingWordFirst 81 | completeIpa = completeIpaRight + completeIpaLeft 82 | completeWordRight.extend(completeWordLeft) 83 | completeWords = '|'.join(completeWordRight) 84 | return completeIpa, completeWords 85 | 86 | 87 | def findFirstPartInWord(self, word:str): 88 | for wordPart in range(len(word), 0, -1): 89 | part = word[:wordPart] 90 | ipa = self.getIpaFromLibrary(part) 91 | if ipa is not None: 92 | remainingWord = word[wordPart:] 93 | return remainingWord, ipa, part 94 | raise Error('we have no match for single char in library with char: ' + word[0] + 'with full text:' + word)# pragma: no cover 95 | 96 | def findLastPartInWord(self, word:str): 97 | for wordPart in range(0,len(word)): 98 | part = word[wordPart:] 99 | ipa = self.getIpaFromLibrary(part) 100 | if ipa is not None: 101 | remainingWord = word[:wordPart] 102 | return remainingWord, ipa, part 103 | raise Error('we have no match for single char in library with char: ' + word[-1])# pragma: no cover 104 | 105 | def getIpaFromLibrary(self, word:str): 106 | ipa = self.getIpaFromLibraryExcactString(word) 107 | if ipa is None: 108 | word = word.lower() 109 | ipa = self.getIpaFromLibraryExcactString(word) 110 | return ipa 111 | 112 | def getIpaFromLibraryExcactString(self,word:str): 113 | if word in 
self.library.index: 114 | ipa: str 115 | ipa = self.library.loc[word].values[0] 116 | if type(ipa) is not str: 117 | ipa = ipa[0] 118 | return ipa 119 | return None 120 | 121 | def removeEmphasis(self, text: str): 122 | if self.useEmphasis: 123 | return text 124 | withoutEmphasis = text.replace("ˈ","") 125 | return withoutEmphasis 126 | -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step0_Overview.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from huiAudioCorpus.utils.PathUtil import PathUtil 4 | from huiAudioCorpus.persistenz.AudiosFromLibrivoxPersistenz import AudiosFromLibrivoxPersistenz 5 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 6 | from tqdm import tqdm 7 | 8 | class Step0_Overview: 9 | 10 | def __init__(self, audiosFromLibrivoxPersistenz: AudiosFromLibrivoxPersistenz, savePath: str, pathUtil: PathUtil): 11 | self.savePath = savePath 12 | self.audiosFromLibrivoxPersistenz = audiosFromLibrivoxPersistenz 13 | self.pathUtil = pathUtil 14 | 15 | def run(self): 16 | return DoneMarker(self.savePath).run(self.script, deleteFolder=False) 17 | 18 | def script(self): 19 | booksLibrivox = self.downloadOverviewLibrivox() 20 | usableBooks = self.downloadChapters(booksLibrivox) 21 | speackerOverview = self.generateSpeackerOverview(usableBooks) 22 | speackerShort = self.generateSpeackerShort(speackerOverview) 23 | self.generateSpeackerTemplate(usableBooks) 24 | 25 | print('Usable books:', len(usableBooks)) 26 | print('Total hours:',sum([book['time'] for book in usableBooks])/60/60) 27 | print('Count of Speackers:', len(speackerShort)) 28 | print('bestSpeacker:', speackerShort[0]) 29 | 30 | def downloadOverviewLibrivox(self): 31 | librivoxPath = self.savePath + '/booksLibrivox.json' 32 | if not self.pathUtil.fileExists(librivoxPath): 33 | print('Download Overview from Librivox') 34 | booksLibrivox = self.audiosFromLibrivoxPersistenz.getIds() 35 | self.pathUtil.saveJson(librivoxPath, booksLibrivox) 36 | 37 | booksLibrivox = self.pathUtil.loadJson(librivoxPath) 38 | return booksLibrivox 39 | 40 | def downloadChapters(self, booksLibrivox): 41 | usableBookPath = self.savePath + '/usableBooks.json' 42 | if not self.pathUtil.fileExists(usableBookPath): 43 | print('Download Chapters from Librivox') 44 | usableBooks = [{'time': book['totaltimesecs'], 'title':book['title'], 'url': book['url_text_source']} for book in booksLibrivox if self.isBookUseable(book)] 45 | for book in tqdm(usableBooks): 46 | chapters, chapterDownloadLinks = self.audiosFromLibrivoxPersistenz.getChapter(book['title']) 47 | book['chapters'] = [] 48 | for _, chapter in chapters.iterrows(): 49 | book['chapters'].append({ 50 | 'title': chapter['Chapter'], 51 | 'reader': chapter['Reader'], 52 | 'time': convertToSeconds(chapter['Time']) 53 | }) 54 | self.pathUtil.saveJson(usableBookPath, usableBooks) 55 | 56 | usableBooks = self.pathUtil.loadJson(usableBookPath) 57 | return usableBooks 58 | 59 | def isBookUseable(self, book): 60 | if book['totaltimesecs']<=0: 61 | return False 62 | if book['language'] != "German": 63 | return False 64 | if 'www.projekt-gutenberg.org' in book['url_text_source']: 65 | return True 66 | 67 | if 'www.gutenberg.org/' in book['url_text_source']: 68 | return True 69 | return False 70 | 71 | def generateSpeackerTemplate(self, usableBooks): 72 | readerPath = self.savePath + '/readerTemplate.json' 73 | if not self.pathUtil.fileExists(readerPath): 74 | reader = {} 75 | for 
book in usableBooks: 76 | bookTitle = book['title'] 77 | for chapter in book['chapters']: 78 | if chapter['reader'] not in reader: 79 | reader[chapter['reader']] = {} 80 | 81 | title = ''.join([i for i in bookTitle.lower().replace(' ','_') if (i in 'abcdefghijklmonpqrstuvwxyz_' or i.isnumeric())]) 82 | guttenbergId = book['url'].replace('www.projekt-gutenberg.org/', '').replace('https://','').replace('http://','') 83 | if 'www.gutenberg.org/' in guttenbergId: 84 | guttenbergId = int(guttenbergId.replace('www.gutenberg.org/ebooks/', '').replace('www.gutenberg.org/etext/', '')) 85 | 86 | reader[chapter['reader']][title] = { 87 | 'title': title, 88 | 'LibrivoxBookName': bookTitle, 89 | 'GutenbergId': guttenbergId, 90 | 'GutenbergStart': '', 91 | 'GutenbergEnd': '', 92 | 'textReplacement':{} 93 | } 94 | 95 | 96 | 97 | self.pathUtil.saveJson(readerPath, reader) 98 | reader = self.pathUtil.loadJson(readerPath) 99 | return reader 100 | 101 | def generateSpeackerOverview(self, usableBooks): 102 | readerPath = self.savePath + '/readerLong.json' 103 | if not self.pathUtil.fileExists(readerPath): 104 | reader = {} 105 | for book in usableBooks: 106 | bookTitle = book['title'] 107 | for chapter in book['chapters']: 108 | if chapter['reader'] not in reader: 109 | reader[chapter['reader']] = [] 110 | 111 | reader[chapter['reader']].append({ 112 | 'title': chapter['title'], 113 | 'time': chapter['time'], 114 | 'book': bookTitle 115 | }) 116 | self.pathUtil.saveJson(readerPath, reader) 117 | reader = self.pathUtil.loadJson(readerPath) 118 | return reader 119 | 120 | def generateSpeackerShort(self, speackerOverview): 121 | readerPath = self.savePath + '/readerShort.json' 122 | if not self.pathUtil.fileExists(readerPath): 123 | readers = [] 124 | for speacker in speackerOverview: 125 | readers.append({ 126 | 'name': speacker, 127 | 'time': round(sum([chapter['time'] for chapter in speackerOverview[speacker]])/60/60,1) 128 | }) 129 | readers.sort(key=lambda x: x['time'], reverse=True) 130 | self.pathUtil.saveJson(readerPath, readers) 131 | readers = self.pathUtil.loadJson(readerPath) 132 | return readers 133 | 134 | 135 | def convertToSeconds(timeString: str): 136 | ftr = [3600,60,1] 137 | duration = sum([a*b for a,b in zip(ftr, map(int,timeString.split(':')))]) 138 | return duration -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Eva.json: -------------------------------------------------------------------------------- 1 | { 2 | "balladen": { 3 | "title": "balladen", 4 | "LibrivoxBookName": "Balladen", 5 | "GutenbergId": "spittelr/balladen/balladen.html", 6 | "GutenbergStart": "In finstrer Nacht, auf steilen Wolkenpfaden,", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "1.": " ", 10 | "2.": " ", 11 | "3.": " ", 12 | " St.": " Sankt ", 13 | "ï": "i", 14 | "*": " " 15 | } 16 | }, 17 | "ligeia_und_andere_novellen": { 18 | "title": "ligeia_und_andere_novellen", 19 | "LibrivoxBookName": "Ligeia und Andere Novellen", 20 | "GutenbergId": 50887, 21 | "GutenbergStart": "Und es liegt darin der Wille, der", 22 | "GutenbergEnd": "Genien und Gnomen.", 23 | "textReplacement": { 24 | "nennt[1],": "nennt, ", 25 | "[Fußnote 1: Denn da Jupiter während der Winterzeit zweimal sieben Tage": "", 26 | "Wärme schenkt, so haben die Menschen diese milde und gemäßigte Zeit die": "", 27 | "Amme des schönen Eisvogels genannt. -- Simonides]": "", 28 | "Gemach der Bibliothek": " Gemach der Bibliothek. 
Fussnote Denn da Jupiter während der Winterzeit zweimal sieben Tage Wärme schenkt, so haben die Menschen diese milde und gemäßigte Zeit die Amme des schönen Eisvogels genannt. Simonides. Ende der Fussnote.", 29 | "Mond[2],": "Mond Zwei,", 30 | "die Sonne ist;" : " die Sonne is. Fussnote zwei,m Mond im Englischen weiblich, Sonne männlich. Anmerkung der Übersetzerin. ", 31 | "[Fußnote 2: Mond im Englischen weiblich, Sonne männlich. A. d. Üb.]": "", 32 | "sieht[3].": "sieht drei. Fussnote drei, Wo Pomponius Mela in seiner Abhandlung De Situ Orbis von Flut und Ebbe spricht, sagt er: Entweder ist die Welt ein großes Tier, oder und so wieter. Ender der Fussnote drei. ", 33 | "[Fußnote 3: Wo Pomponius Mela in seiner Abhandlung »De Situ Orbis« von": "", 34 | "Flut und Ebbe spricht, sagt er: »Entweder ist die Welt ein großes Tier,": "", 35 | "oder« usw.]": "", 36 | "Franzose[4]": "Franzose vier", 37 | "belle chose«?": "belle chose? Fussnote vier, Balzac, dem Sinne nach, ich weiß nicht mehr die Worte. Ende der Fussnote vier.", 38 | "[Fußnote 4: Balzac, dem Sinne nach; ich weiß nicht mehr die Worte.]": "", 39 | "können.[5]": "können fünf. Fussnote fünf, Florem putares nare per liquidum aethera. P Commire. Ender der Fussnote fünf.", 40 | "[Fußnote 5: Florem putares nare per liquidum aethera. -- P. Commire]": "", 41 | " Mr.": " Mister ", 42 | " on.": " on .", 43 | "*": " ", 44 | "[": " ", 45 | "]": " ", 46 | "ë": "e" 47 | } 48 | }, 49 | "fabeln_und_erzhlungen": { 50 | "title": "fabeln_und_erzhlungen", 51 | "LibrivoxBookName": "Fabeln und Erzählungen", 52 | "GutenbergId": 9335, 53 | "GutenbergStart": "", 54 | "GutenbergEnd": "", 55 | "textReplacement": { 56 | "*": " " 57 | } 58 | }, 59 | "toten_seelen": { 60 | "title": "toten_seelen", 61 | "LibrivoxBookName": "toten Seelen", 62 | "GutenbergId": "gogol/toteseel/toteseel.html", 63 | "GutenbergStart": "", 64 | "GutenbergEnd": "irdischen Amtes zu achten, weil wir es schon alle dunkel ahnen und weil wir kaum", 65 | "textReplacement": { 66 | " von Ew.": " von euerer ", 67 | "d. h.": " das heißt ", 68 | " z. B.": " zum Beispiel ", 69 | " N. N.": " N N ", 70 | " usw.": " und so weiter", 71 | " Nr.": " Nummer ", 72 | " St.": " Sankt ", 73 | " Ew.": " Eure ", 74 | "a. 
D.": " a D ", 75 | "a.D.": " a D ", 76 | "1845": "achtzehnhundertfünfundvierzig", 77 | "1850": "achtzehnhundertfünfzig", 78 | "1812": "achtzehnhundertzwölf", 79 | "1814": "achtzehnhundertvierzehn", 80 | "1835": "achtzehnhundertfünfunddreißig", 81 | "1841": "achtzehnhunderteinundvierzig", 82 | "1840": "achtzehnhundertvierzig", 83 | "1842": "achtzehnhundertzweiundvierzig", 84 | "1852": "achtzehnhundertzweiundfünfzig", 85 | "10.": "zehnten", 86 | "21.": "einundzwanzigsten", 87 | "4a": "vier A ", 88 | "34": "vierunddreißig", 89 | "9.": "neunte", 90 | "I.": "Erstens ", 91 | "II.":"Zweitens " 92 | } 93 | }, 94 | "wir_fanden_einen_pfad_neue_gedichte": { 95 | "title": "wir_fanden_einen_pfad_neue_gedichte", 96 | "LibrivoxBookName": "Wir fanden einen Pfad: Neue Gedichte", 97 | "GutenbergId": 9623, 98 | "GutenbergStart": "", 99 | "GutenbergEnd": "", 100 | "textReplacement": { 101 | "F1ügelschuhn": "Flügelschuhn", 102 | "G1ühwürmchen": "Glühwürmchen", 103 | "B1üte,": "Blüte,", 104 | "1912": "neunzehnhundertzwölf" 105 | } 106 | }, 107 | "werde_die_du_bist": { 108 | "title": "werde_die_du_bist", 109 | "LibrivoxBookName": "Werde, die Du bist", 110 | "GutenbergId": "dohm/wiedu/wiedu.html", 111 | "GutenbergStart": "In der Irrenanstalt des Doktor Behrend,", 112 | "GutenbergEnd": "", 113 | "textReplacement": { 114 | " u.s.w.": " und so weiter ", 115 | " Z. B.": " zum Beispiel ", 116 | " z.B.": " zum Beispiel ", 117 | "10.000": "zehntausend", 118 | "1894": "achtzehnhundertvierundneunzig", 119 | "2500": "zweitausendfünfhundert", 120 | "3000": "dreitausend", 121 | "1500": "fünfzehnhundert", 122 | "54": "vierundfünfzig", 123 | "35": "fünfunddreißig", 124 | "18": "achtzehn", 125 | "*": " ", 126 | "ô": "o" 127 | } 128 | }, 129 | "kleine_lord_version_2": { 130 | "title": "kleine_lord_version_2", 131 | "LibrivoxBookName": "kleine Lord (version 2)", 132 | "GutenbergId": "burnett/lord/lord.html", 133 | "GutenbergStart": "Cedrik selbst wußte kein Sterbenswörtchen davon,", 134 | "GutenbergEnd": "", 135 | "textReplacement": { 136 | "4.": "vierte", 137 | "Mr.": " Mister ", 138 | "p. 
s.": " p s ", 139 | "·": " " 140 | } 141 | } 142 | } -------------------------------------------------------------------------------- /huiAudioCorpus/dependencyInjection/DependencyInjection.py: -------------------------------------------------------------------------------- 1 | def disableLog(): 2 | logging.getLogger('matplotlib').disabled = True 3 | logging.getLogger('matplotlib.font_manager').disabled = True 4 | logging.getLogger('matplotlib.colorbar').disabled = True 5 | logging.getLogger('numba.core.ssa').disabled = True 6 | logging.getLogger('numba.core.interpreter').disabled = True 7 | logging.getLogger('numba.core.byteflow').disabled = True 8 | logging.getLogger('numba.ssa').disabled = True 9 | logging.getLogger('numba.byteflow').disabled = True 10 | logging.getLogger('numba.interpreter').disabled = True 11 | logging.getLogger('paramiko.transport.sftp').disabled = True 12 | logging.getLogger('paramiko.transport').disabled = True 13 | logging.getLogger('h5py._conv').disabled = True 14 | logging.getLogger().setLevel(logging.WARNING) 15 | 16 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step8_DatasetStatistic import Step8_DatasetStatistic 17 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step9_GenerateCleanDataset import Step9_GenerateCleanDataset 18 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step7_AudioRawStatistic import Step7_AudioRawStatistic 19 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step3_1_PrepareText import Step3_1_PrepareText 20 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step0_Overview import Step0_Overview 21 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step2_1_AudioStatistic import Step2_1_AudioStatistic 22 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step6_FinalizeDataset import Step6_FinalizeDataset 23 | from huiAudioCorpus.transformer.SentenceDistanceTransformer import SentenceDistanceTransformer 24 | from huiAudioCorpus.calculator.AlignSentencesIntoTextCalculator import AlignSentencesIntoTextCalculator 25 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step5_AlignText import Step5_AlignText 26 | from huiAudioCorpus.converter.AudioToSentenceConverter import AudioToSentenceConverter 27 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step4_TranscriptAudio import Step4_TranscriptAudio 28 | from huiAudioCorpus.persistenz.GutenbergBookPersistenz import GutenbergBookPersistenz 29 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step3_DownloadText import Step3_DownloadText 30 | from huiAudioCorpus.transformer.AudioSplitTransformer import AudioSplitTransformer 31 | from huiAudioCorpus.transformer.AudioLoudnessTransformer import AudioLoudnessTransformer 32 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step2_SplitAudio import Step2_SplitAudio 33 | from huiAudioCorpus.persistenz.AudiosFromLibrivoxPersistenz import AudiosFromLibrivoxPersistenz 34 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step1_DownloadAudio import Step1_DownloadAudio 35 | from huiAudioCorpus.converter.StringToSentencesConverter import StringToSentencesConverter 36 | from frosch import hook 37 | hook(theme = 'paraiso_dark') 38 | import logging 39 | disableLog() 40 | 41 | from huiAudioCorpus.error.DependencyInjectionError import DependencyInjectionError 42 | from huiAudioCorpus.converter.ListToHistogramConverter import ListToHistogramConverter 43 | from huiAudioCorpus.converter.ListToStatisticConverter import ListToStatisticConverter 44 | from huiAudioCorpus.ui.Plot import Plot 45 | from 
huiAudioCorpus.components.TextStatisticComponent import TextStatisticComponent 46 | from huiAudioCorpus.components.AudioStatisticComponent import AudioStatisticComponent 47 | from huiAudioCorpus.utils.PathUtil import PathUtil 48 | from huiAudioCorpus.utils.FileListUtil import FileListUtil 49 | from huiAudioCorpus.converter.TranscriptsToSentencesConverter import TranscriptsToSentencesConverter 50 | from huiAudioCorpus.persistenz.AudioTranscriptPairPersistenz import AudioTranscriptPairPersistenz 51 | from huiAudioCorpus.converter.PhoneticSentenceToSymbolSentenceConverter import PhoneticSentenceToSymbolSentenceConverter 52 | from huiAudioCorpus.converter.SentenceToPhoneticSentenceConverter import SentenceToPhoneticSentenceConverter 53 | from huiAudioCorpus.transformer.AudioAddSilenceTransformer import AudioAddSilenceTransformer 54 | from huiAudioCorpus.transformer.TranscriptsSelectionTransformer import TranscriptsSelectionTransformer 55 | from huiAudioCorpus.transformer.AudioSamplingRateTransformer import AudioSamplingRateTransformer 56 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 57 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 58 | from huiAudioCorpus.filter.AudioFilter import AudioFilter 59 | from huiAudioCorpus.transformer.AudioFadeTransformer import AudioFadeTransformer 60 | from huiAudioCorpus.calculator.TextNormalizer import TextNormalizer 61 | 62 | import inspect 63 | 64 | disableLog() 65 | 66 | 67 | defaultConfig = { 68 | 'audioAddSilenceTransformer': { 69 | 'endDurationSeconds': 0.7, 70 | 'startDurationSeconds': 0 71 | }, 72 | 'listToHistogramConverter': { 73 | 'stepSize':1 74 | } 75 | } 76 | 77 | class DependencyInjection: 78 | #Calculators 79 | alignSentencesIntoTextCalculator: AlignSentencesIntoTextCalculator 80 | textNormalizer: TextNormalizer 81 | 82 | 83 | #Components 84 | audioStatisticComponent: AudioStatisticComponent 85 | textStatisticComponent: TextStatisticComponent 86 | 87 | #Converters 88 | phoneticSentenceToSymbolSentenceConverter:PhoneticSentenceToSymbolSentenceConverter 89 | sentenceToPhoneticSentenceConverter:SentenceToPhoneticSentenceConverter 90 | transcriptsToSentencesConverter:TranscriptsToSentencesConverter 91 | listToStatisticConverter:ListToStatisticConverter 92 | listToHistogramConverter: ListToHistogramConverter 93 | stringToSentencesConverter: StringToSentencesConverter 94 | audioToSentenceConverter: AudioToSentenceConverter 95 | 96 | 97 | #Filters 98 | audioFilter:AudioFilter 99 | 100 | #Persistence 101 | audioPersistenz:AudioPersistenz 102 | audioTranscriptPairPersistenz:AudioTranscriptPairPersistenz 103 | transcriptsPersistenz:TranscriptsPersistenz 104 | audiosFromLibrivoxPersistenz:AudiosFromLibrivoxPersistenz 105 | GutenbergBookPersistenz: GutenbergBookPersistenz 106 | 107 | #Transformers 108 | audioAddSilenceTransformer:AudioAddSilenceTransformer 109 | audioSamplingRateTransformer:AudioSamplingRateTransformer 110 | transcriptsSelectionTransformer:TranscriptsSelectionTransformer 111 | audioSplitTransformer: AudioSplitTransformer 112 | sentenceDistanceTransformer: SentenceDistanceTransformer 113 | audioLoudnessTransformer: AudioLoudnessTransformer 114 | audioFadeTransformer: AudioFadeTransformer 115 | 116 | 117 | #Utilities 118 | pathUtil:PathUtil 119 | fileListUtil: FileListUtil 120 | 121 | #Workflows 122 | step0_Overview: Step0_Overview 123 | step1_DownloadAudio: Step1_DownloadAudio 124 | step2_SplitAudio: Step2_SplitAudio 125 | step2_1_AudioStatistic: Step2_1_AudioStatistic 126 | 
step3_DowloadText: Step3_DownloadText
127 | step3_1_PrepareText: Step3_1_PrepareText
128 | step4_TranscriptAudio: Step4_TranscriptAudio
129 | step5_AlignText: Step5_AlignText
130 | step6_FinalizeDataset: Step6_FinalizeDataset
131 | step7_AudioRawStatistic: Step7_AudioRawStatistic
132 | step8_DatasetStatistic: Step8_DatasetStatistic
133 | step9_GenerateCleanDataset: Step9_GenerateCleanDataset
134 |
135 | #plot
136 | plot: Plot
137 |
138 | def __init__(self, config={}):
139 | configWithDefault = defaultConfig.copy()
140 | configWithDefault.update(config)
141 | self.allClassReferences = self.getAllClassReferences(configWithDefault)
142 | initialedClasses = {}
143 | for name, classInstance in self.allClassReferences.items():
144 | def getLambda (name, classInstance):
145 | return property(lambda _: self.initClass(name, classInstance, self.classConstructor, initialedClasses, configWithDefault, name ))
146 | setattr(DependencyInjection, name, getLambda(name, classInstance))
147 |
148 | def initClass(self, className, classReference , classConstructorMethod, initialedClasses, config , requestedClass = ''):
149 | if className in initialedClasses:
150 | return initialedClasses[className]
151 | arguments = self.getConstructorReferenceClasses(classReference)
152 | for argument in arguments:
153 | if argument not in initialedClasses and arguments[argument] is not None:  # check against the names, not the instances
154 | self.initClass(argument, arguments[argument], classConstructorMethod, initialedClasses, config, requestedClass)
155 |
156 | classConfig = config[className].copy() if className in config else {}
157 | if '#' in classConfig:
158 | classConfig.pop('#')
159 |
160 | try:
161 | newClassInstance = classConstructorMethod(classReference, initialedClasses, classConfig)
162 |
163 | except Exception as e:
164 | raise DependencyInjectionError(e, classConfig, classReference.__name__, requestedClass)
165 | initialedClasses[className] = newClassInstance
166 | return newClassInstance
167 |
168 |
169 | def classConstructor(self,classReference, initialedClasses , classConfig):
170 | classConstructor = classConfig.copy()
171 | references = self.getConstructorReferenceClasses(classReference)
172 | for ref in references:
173 | if references[ref] is not None:
174 | classConstructor[ref] = initialedClasses[ref]
175 | classInstance = classReference(**classConstructor)
176 |
177 | return classInstance
178 |
179 | def getConstructorReferenceClasses(self, classReference):
180 | arguments = self.getAllConstructorArguments(classReference)
181 |
182 | references = {}
183 | for argument in arguments:
184 | if argument in ["self","args","kwargs"]:
185 | continue
186 | references[argument] = self.allClassReferences[argument] if argument in self.allClassReferences.keys() else None
187 | return references
188 |
189 | def getAllConstructorArguments(self, classInstance):
190 | return list(inspect.signature(classInstance.__init__).parameters.keys())
191 |
192 | def getAllClassReferences(self,configWithDefault):
193 | classes = globalClassesAtImportTime.copy()
194 | for className in configWithDefault:
195 | if '#' in configWithDefault[className]:
196 | classes[className] = configWithDefault[className]['#']
197 | return classes
198 |
199 |
200 | globalClassesAtImportTime = DependencyInjection.__dict__.get("__annotations__")
--------------------------------------------------------------------------------
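A minimal usage sketch of the container above (an editorial addition based only on the code in this file): config keys must match the annotated attribute names, every class is resolved lazily on first attribute access and then cached, and the special '#' key swaps in an alternative class reference. Note that configWithDefault.update(config) replaces a class's whole per-class dict, so any defaults you still need must be repeated. CustomAudioFilter is a hypothetical stand-in, not part of the repository:

    from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection

    class CustomAudioFilter:  # hypothetical replacement with an argument-free constructor
        def __init__(self):
            pass

    # Override constructor arguments of one class; the per-class dict is replaced
    # wholesale, so startDurationSeconds is repeated although it equals the default.
    di = DependencyInjection({'audioAddSilenceTransformer': {'endDurationSeconds': 1.0, 'startDurationSeconds': 0}})
    transformer = di.audioAddSilenceTransformer  # built on first access, cached afterwards

    # Swap an implementation entirely via the '#' key (a class reference, not an instance).
    di2 = DependencyInjection({'audioFilter': {'#': CustomAudioFilter}})
    customFilter = di2.audioFilter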
/huiAudioCorpus/calculator/TextNormalizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | import argparse
3 | from pathlib import Path
4 |
5 | number_mappings = {
6 | "0" : "null",
7 | "1" : "ein",
8 | "2" : "zwei",
9 | "3" : "drei",
10 | "4" : "vier",
11 | "5" : "fünf",
12 | "6" : "sechs",
13 | "7" : "sieben",
14 | "8" : "acht",
15 | "9" : "neun",
16 | "10" : "zehn",
17 | "11" : "elf",
18 | "12" : "zwölf",
19 | "13" : "dreizehn",
20 | "14" : "vierzehn",
21 | "15" : "fünfzehn",
22 | "16" : "sechzehn",
23 | "17" : "siebzehn",
24 | "18" : "achtzehn",
25 | "19" : "neunzehn",
26 | "20" : "zwanzig",
27 | "30" : "dreißig",
28 | "60" : "sechzig",
29 | "70" : "siebzig",
30 | "100" : "einhundert"
31 | }
32 | ordinal_mappings = {
33 | "1" : "erste",
34 | "3" : "dritte",
35 | "7" : "siebte",
36 | "8" : "achte",
37 | }
38 | customs_mappings = {
39 | "¼" : "ein viertel",
40 | "½" : "einhalb",
41 | "¾" : "drei viertel",
42 | }
43 | '''
44 | ordinal_genders = {
45 | ["diese"] : "te",
46 | ["als"] : "ter",
47 | [""] : "tes",
48 | ["am", "zum", "en", "im", "die", "dieser", "diese", "em"] : "ten",
49 | }
50 | '''
51 | def number_literal(number):  # recursively spells a non-negative integer in German words
52 | x_str = str(number)
53 | if x_str in number_mappings:
54 | return number_mappings[x_str]
55 | x_str_left = x_str[0]
56 | x_str_right = x_str[1:].lstrip("0")
57 | if len(x_str) == 8:  # tens of millions
58 | x_str_left = x_str[0:2]
59 | x_str_right = x_str[2:].lstrip("0")
60 | if x_str_right != "":
61 | return number_literal(x_str_left)+"millionen"+number_literal(x_str_right)
62 | else:
63 | return number_literal(x_str_left)+"millionen"
64 | if len(x_str) == 7:  # millions
65 | x_str_left = x_str[0]
66 | x_str_right = x_str[1:].lstrip("0")
67 | if x_str_right != "":
68 | return number_literal(x_str_left)+"millionen"+number_literal(x_str_right)
69 | else:
70 | return number_literal(x_str_left)+"millionen"
71 | if len(x_str) == 6:  # hundreds of thousands
72 | x_str_left = x_str[0:3]
73 | x_str_right = x_str[3:].lstrip("0")
74 | if x_str_right != "":
75 | return number_literal(x_str_left)+"tausend"+number_literal(x_str_right)
76 | else:
77 | return number_literal(x_str_left)+"tausend"
78 | if len(x_str) == 5:  # tens of thousands
79 | x_str_left = x_str[0:2]
80 | x_str_right = x_str[2:].lstrip("0")
81 | if x_str_right != "":
82 | return number_literal(x_str_left)+"tausend"+number_literal(x_str_right)
83 | else:
84 | return number_literal(x_str_left)+"tausend"
85 |
86 | if len(x_str) == 4:  # years 1200-1999 are read as <hundreds>"hundert"<rest>
87 | if x_str_right != "":
88 | if int(number) >= 1200 and int(number) < 2000:
89 | decade = x_str[2:].lstrip("0")
90 | if decade != "":
91 | return number_literal(x_str[0:2])+"hundert"+number_literal(x_str[2:].lstrip("0"))
92 | else:
93 | return number_literal(x_str[0:2])+"hundert"
94 | else:
95 | return number_literal(x_str_left)+"tausend"+number_literal(x_str_right)
96 | else:
97 | return number_literal(x_str_left)+"tausend"
98 | if len(x_str) == 3:
99 | if x_str_right != "":
100 | return number_literal(x_str_left)+"hundert"+number_literal(x_str_right)
101 | else:
102 | return number_literal(x_str_left)+"hundert"
103 | if len(x_str) == 2:  # units + "und" + tens; regular tens fall back to <digit>"zig"
104 | if x_str_right != "":
105 | return number_literal(x_str_right)+"und"+number_literal(x_str_left+"0")
106 | else:
107 | return number_literal(x_str_left) + "zig"
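# Worked examples for orientation (editorial sketch; the outputs agree with
# the hand-written replacement tables in scripts/createDatasetConfig):
#   number_literal("34")   -> "vierunddreißig"
#   number_literal("2500") -> "zweitausendfünfhundert"
#   number_literal("1845") -> "achtzehnhundertfünfundvierzig"
# Inputs with more than 8 digits match none of the branches and return None.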
108 | class TextNormalizer:
109 | def __init__(self) -> None:
110 | pass
111 |
112 | def normalize_rationals(self, input_sentence:str):
113 | rationals = re.findall(r"(\d+[\. ']*\d*,\d+)",input_sentence)
114 | for rational in rationals:
115 | number, decimals = rational.split(",")
116 | normalized_number = self.normalize_integer(number)
117 | if number == "1":
118 | normalized_number = normalized_number + "s"
119 | decimals_list = []
120 | for decimal in decimals:
121 | normalized_decimal = self.normalize_integer(decimal)
122 | if decimal == "1":
123 | normalized_decimal = normalized_decimal + "s"
124 | decimals_list.append(normalized_decimal)
125 | normalized_rational = normalized_number + " komma " + " ".join(decimals_list)
126 | input_sentence = re.sub(re.escape(rational), normalized_rational, input_sentence)
127 | return input_sentence
128 |
129 | def normalize_time(self, input_sentence:str):
130 | times = re.findall(r"(\d{1,2}[\.:]\d{1,2}(?:( Uhr)?))(?!\d)",input_sentence)
131 |
132 | if not len(times) > 0:
133 | return input_sentence
134 | if type(times[0]) is tuple:
135 | temp_times = []
136 | for t, _ in times:
137 | temp_times.append(t)
138 | times = temp_times
139 | for time in times:
140 | hour, minute = time.split()[0].replace(".",":").split(":")
141 | if len(hour) > 2 or len(minute) > 2:
142 | print("time expression too long, skipped: " + time)
143 | continue
144 | if len(hour) == 2 and hour.startswith("0"):
145 | hour = hour[1]
146 | hour = self.normalize_integer(hour).capitalize()
147 |
148 | if len(minute) == 2 and minute.startswith("0"):
149 | minute = minute[1]
150 | if minute == "0":
151 | minute = ""
152 | else:
153 | minute = " "+self.normalize_integer(minute).capitalize()
154 | normalized_time = hour + " Uhr" + minute
155 | input_sentence = re.sub(re.escape(time), normalized_time, input_sentence)
156 |
157 | return input_sentence
158 |
159 | def normalize_date(self, input_sentence:str):
160 | dates = re.findall(r"(\d{1,2}\.\d{1,2}\.\d{2,4})",input_sentence)
161 | for date in dates:
162 | day, month, year = date.split(".")
163 | day = self.normalize_ordinal(day.lstrip("0")+".")
164 | month = self.normalize_ordinal(month.lstrip("0")+".")
165 | year = self.normalize_integer(year.lstrip("0"))
166 | normalized_date = " ".join([day, month, year])
167 | input_sentence = re.sub(re.escape(date), normalized_date, input_sentence)
168 |
169 | return input_sentence
170 |
171 | def normalize_ordinal(self, input_sentence:str):
172 | ordinals = re.findall(r"([\.]*\d+[\. ']*\d*)\.(?!\d)",input_sentence)
173 | for number in ordinals:
174 | normalized_number = number
175 | if len(normalized_number) > 2:
176 | if normalized_number[-2] == "0" and normalized_number[-1] in ordinal_mappings:
177 | temp_number = self.normalize_integer(normalized_number[:-2]+"00")
178 | normalized_number = temp_number + ordinal_mappings[normalized_number[-1]]  # e.g. "103." -> "einhundertdritte"
179 | else:
180 | normalized_number = self.normalize_integer(normalized_number)+"te"
181 | elif len(normalized_number) == 2:
182 | normalized_number = self.normalize_integer(number)
183 | normalized_number+="sten"
184 | else:
185 | if normalized_number in ordinal_mappings:
186 | normalized_number = ordinal_mappings[normalized_number]
187 | else:
188 | normalized_number = self.normalize_integer(normalized_number)+"te"
189 | input_sentence = re.sub(re.escape(number) + r"\.", normalized_number, input_sentence)
190 | return input_sentence
191 |
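# End-to-end sketch (editorial; obtained by tracing the methods above):
#   TextNormalizer().normalize("Am 21. Juni 1845 um 9.40")
#     -> "Am einundzwanzigsten Juni achtzehnhundertfünfundvierzig um Neun Uhr Vierzig"
# Grammatical case endings are not resolved (see the commented-out
# ordinal_genders sketch above): a bare "3." always becomes "dritte",
# whatever its context requires.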
192 | def normalize_integer(self, input_sentence:str):
193 | numbers = re.findall(r"(\d+[\. ']*\d*)",input_sentence)
194 | for number in numbers:
195 | number_cleaned = number.replace(" ","").replace(".", "").replace("'","")
196 | number = number.strip()
197 | normalized_number = number_literal(number_cleaned)
198 | input_sentence = re.sub(re.escape(number), normalized_number, input_sentence)
199 |
200 | return input_sentence
201 |
202 | def normalize_customs(self, input_sentence:str):
203 | for custom_character in customs_mappings:
204 | if custom_character in input_sentence:
205 | input_sentence = input_sentence.replace(" "+custom_character, " "+customs_mappings[custom_character])
206 | input_sentence = input_sentence.replace(custom_character, " "+customs_mappings[custom_character])
207 | return input_sentence
208 |
209 | def normalize_percent(self, input_sentence:str):
210 | numbers = re.findall(r"(\d+%)",input_sentence)
211 | for number in numbers:
212 | number_cleaned = number.replace(" ","").replace(".", "").replace("'","")
213 | number = number.strip()
214 | normalized_number = number_literal(number_cleaned[:-1]) + " prozent"
215 | input_sentence = re.sub(re.escape(number), normalized_number, input_sentence)
216 | return input_sentence
217 |
218 | def normalize(self, input_sentence:str):
219 | input_sentence = self.normalize_percent(input_sentence)
220 | input_sentence = self.normalize_rationals(input_sentence)
221 | input_sentence = self.normalize_time(input_sentence)
222 | input_sentence = self.normalize_date(input_sentence)
223 | input_sentence = self.normalize_ordinal(input_sentence)
224 | input_sentence = self.normalize_integer(input_sentence)
225 | input_sentence = self.normalize_customs(input_sentence)
226 |
227 | return input_sentence
228 |
229 | def main():
230 | parser = argparse.ArgumentParser(description="Normalizer control")
231 |
232 | parser.add_argument("--files", required=True, action="append")
233 | parser.add_argument("--save_path", required=True)
234 |
235 | args = parser.parse_args()
236 |
237 | normalizer = TextNormalizer()
238 |
239 | normalized_sentences = []
240 |
241 | for text_file in args.files:
242 | with open(text_file, encoding="UTF-8") as file:
243 | lines = file.readlines()
244 | for line in lines:
245 | normalized_line = normalizer.normalize(line)
246 | normalized_sentences.append(normalized_line)
247 | text_file_name = Path(text_file).name
248 | with open(args.save_path+text_file_name+"_normalized.txt", "w", encoding="UTF-8") as file:  # save_path is concatenated as-is and must end with a path separator
249 | file.writelines(normalized_sentences)
250 | normalized_sentences = []
251 |
252 | if __name__ == "__main__":
253 | main()
--------------------------------------------------------------------------------
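Step3_1_PrepareText below consumes one entry of the createDatasetConfig JSON files, routed through scripts/createDataset.py. A sketch of an entry's shape, with field names taken from those configs and every value invented for illustration (moves, remove and remapSort are optional):

    example_entry = {
        "example_title": {
            "title": "example_title",                  # also used as the book's folder name
            "LibrivoxBookName": "Example Book",        # exact Librivox title to download
            "GutenbergId": 12345,                      # numeric id, or an HTML path for the German Projekt Gutenberg mirror (as in the configs above)
            "GutenbergStart": "First kept sentence",   # "" keeps the text from the beginning
            "GutenbergEnd": "",                        # "" keeps the text to the end
            "textReplacement": {"1845": "achtzehnhundertfünfundvierzig"},
            "moves": [],                               # see move() below
            "remove": []                               # see remove() below
        }
    }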
/huiAudioCorpus/workflows/createDatasetWorkflow/Step3_1_PrepareText.py:
--------------------------------------------------------------------------------
1 |
2 | from huiAudioCorpus.utils.PathUtil import PathUtil
3 | from typing import Dict, List
4 | from huiAudioCorpus.utils.DoneMarker import DoneMarker
5 | from huiAudioCorpus.calculator.TextNormalizer import TextNormalizer
6 |
7 | import re
8 | import json
9 |
10 | class Step3_1_PrepareText:
11 |
12 | def __init__(self, savePath: str, loadFile: str, saveFile: str, startSentence: str, endSentence: str, textReplacement: Dict[str,str], textNormalizer: TextNormalizer, moves: List[Dict[str, str]], remove: List[Dict[str, str]]):
13 | self.savePath = savePath
14 | self.textNormalizer = textNormalizer
15 | self.loadFile = loadFile
16 | self.saveFile = saveFile
17 | self.textReplacement = textReplacement
18 | self.pathUtil = PathUtil()
19 | self.startSentence = startSentence
20 | self.endSentence = endSentence
21 | self.moves = moves
22 | self.removes = remove
23 |
24 | def run(self):
25 | return DoneMarker(self.savePath).run(self.script)
26 |
27 | def script(self):
28 | inputText = self.pathUtil.loadFile(self.loadFile)
29 | cuttedText = self.cutText(inputText, self.startSentence, self.endSentence)
30 | removedText = self.remove(cuttedText, self.removes)
31 | replacedText = self.replace(removedText, self.textReplacement)
32 | movedText = self.move(replacedText, self.moves)
33 | self.pathUtil.writeFile(movedText, self.saveFile)
34 |
35 | def move(self, text: str, moves: List[Dict[str, str]]):
36 | for move in moves:
37 | start = move['start']
38 | end = move['end']
39 | after = move['after']
40 | textToMove = text.partition(start)[-1].partition(end)[0] + end
41 | textWithoutMove = text.replace(textToMove, "")
42 | first, separator, last = textWithoutMove.partition(after)
43 | finalText = first + separator + textToMove + last
44 | text = finalText
45 | return text
46 |
47 | def remove(self, text: str, removes: List[Dict[str, str]]):
48 | for remove in removes:
49 | textToRemove = ""
50 | textToRemove_old = None
51 | start = remove['start']
52 | end = remove['end']
53 | while textToRemove != textToRemove_old:
54 | textToRemove_old = textToRemove
55 | textToRemove = start + text.partition(start)[-1].partition(end)[0] + end
56 | text = text.replace(textToRemove, "")
57 | print(textToRemove)
58 | return text
59 |
60 |
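# Sketch of the two helpers above, on toy strings (editorial illustration,
# not taken from a real config):
#   move("A B C D", [{'start': 'B', 'end': 'C', 'after': 'D'}]) -> "A B D C"
#     the span after 'start' up to and including 'end' is cut out and
#     re-inserted after the first occurrence of 'after'; 'start' itself stays.
#   remove("keep <x> junk <y> keep", [{'start': '<x>', 'end': '<y>'}]) -> "keep  keep"
#     the loop re-extracts until the start..end span stops changing, so
#     repeated occurrences of the same pair are removed one per pass.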
61 | def cutText(self, text: str, startSentence: str, endSentence: str):
62 | if startSentence =="":
63 | withoutFirst = text
64 | else:
65 | withoutFirst = startSentence + text.split(startSentence, 1)[1]
66 |
67 | if endSentence=="":
68 | withoutEnd = withoutFirst
69 | else:
70 | withoutEnd = withoutFirst.split(endSentence,1)[0] + endSentence
71 |
72 | stripped = withoutEnd.strip()
73 | prepared = stripped.replace('\r', '')
74 | return prepared
75 |
76 | def replace(self, text: str, textReplacement: Dict[str,str]):
77 | beforeReplacement = {
78 | '\xa0': ' '
79 | }
80 | baseReplacement = {
81 | '...': '.',
82 | '«': ' ',
83 | '»': ' ',
84 | "'": '',
85 | '"': ' ',
86 | '_': ' ',
87 | '-': ' ',
88 | '–': ' ',
89 | ';': ',',
90 | ':': ':',
91 | '’': ' ',
92 | '‘': ' ',
93 | '<': ' ',
94 | '>': ' ',
95 | '(': ' ',
96 | ')': ' ',
97 | '›': ' ',
98 | '‹': ' ',
99 | 'é': 'e',
100 | 'ê': 'e',
101 | '^': ' ',
102 | 'è': 'e',
103 | 'à': 'a',
104 | 'á': 'a'
105 |
106 | }
107 |
108 | abbreviations = {
109 | ' H. v.': ' Herr von ',
110 | '†': ' gestorben ',
111 | ' v.': ' von ',
112 | '§': ' Paragraph ',
113 | ' geb.': ' geboren ',
114 | ' u.': ' und ',
115 | '&': ' und ',
116 | ' o.': ' oder ',
117 | ' Nr.': ' Nummer ',
118 | ' Pf.': ' Pfennig ',
119 | ' Mk.': ' Mark ',
120 | " Sr. Exz.": " seiner exzellenz ",
121 | " Kgl.": " königlich ",
122 | " Dr.": ' Doktor ',
123 | ' Abb.': ' Abbildung ',
124 | ' Abh.': ' Abhandlung ',
125 | ' Abk.': ' Abkürzung ',
126 | ' allg.': ' allgemein ',
127 | ' bes.': ' besonders ',
128 | ' bzw.': ' beziehungsweise ',
129 |
130 | ' gegr.': ' gegründet ',
131 | ' jmd.': ' jemand ',
132 | ' o. Ä.': ' oder Ähnliches ',
133 | ' u. a.': ' unter anderem ',
134 | ' o.Ä.': ' oder Ähnliches ',
135 | ' u.a.': ' unter anderem ',
136 | ' ugs.': ' umgangssprachlich ',
137 | ' urspr.': ' ursprünglich ',
138 | ' usw.': ' und so weiter',
139 | ' u. s. w.': ' und so weiter ',
140 | ' u.s.w.': ' und so weiter ',
141 | ' zz.': ' zurzeit ',
142 | ' dt.': ' deutsch',
143 | ' ev.': ' evangelisch ',
144 | ' Jh.': ' Jahrhundert ',
145 | ' kath.': ' katholisch ',
146 | ' lat.': ' lateinisch ',
147 | ' luth.': ' lutherisch ',
148 | ' Myth.': ' Mythologie ',
149 | ' natsoz.': ' nationalsozialistisch ',
150 | ' n.Chr.': ' nach Christus ',
151 | ' n. Chr.': ' nach Christus ',
152 | ' relig.': ' religiös ',
153 | ' v. Chr.': ' vor Christus ',
154 | ' v.Chr.': ' vor Christus ',
155 | ' Med.': ' Medizin ',
156 | ' Mio.': ' Millionen ',
157 | ' d.h.': ' das heißt ',
158 | ' d. h.': ' das heißt ',
159 |
160 | ' f.': ' folgende ',
161 | ' ff.': ' folgende ',
162 | ' ggf.': ' gegebenenfalls ',
163 | ' i. Allg.': ' im Allgemeinen ',
164 | ' i. d. R.': ' in der Regel ',
165 | ' i.Allg.': ' im Allgemeinen ',
166 | ' i.d.R.': ' in der Regel ',
167 | ' lt.': ' laut ',
168 | ' m.': ' mit ',
169 | ' od.': ' oder ',
170 | ' s. o.': ' siehe oben ',
171 | ' s. u.': ' siehe unten ',
172 | ' s.o.': ' siehe oben ',
173 | ' s.u.': ' siehe unten ',
174 | ' Std.': ' Stunde ',
175 | ' tägl.': ' täglich ',
176 | ' Tsd.': ' Tausend ',
177 | ' tsd.': ' tausend ',
178 |
179 | ' z. B.': ' zum Beispiel ',
180 | ' z.B.': ' zum Beispiel ',
181 | ' Z. B.': ' zum Beispiel ',
182 | ' Z.B.': ' zum Beispiel ',
183 | ' Bsp.': ' Beispiel ',
184 | ' bzgl.': ' bezüglich ',
185 | ' ca.': ' circa ',
186 | ' dgl.': ' dergleichen ',
187 | ' etc.': ' et cetera ',
188 | ' evtl.': ' eventuell ',
189 | ' z.T.': ' zum Teil ',
190 | ' z. T.': ' zum Teil ',
191 | ' zit.': ' zitiert ',
192 | ' zzgl.': ' zuzüglich ',
193 | ' H. ': ' Herr ',
194 | ' N. N.': ' so und so ',
195 | ' N.N.': ' so und so ',
196 | ' u.s.f.': ' und so fort',
197 | ' u. s. f.': ' und so fort',
198 | ' von Ew.': ' von euerer ',
199 | ' Se.': ' seine ',
200 | ' St.': ' Sankt ',
201 | ' inkl.': ' inklusive ',
202 | 'U.S.A.': ' U S A ',
203 | ' d. J': 'des Jahres ',
204 | 'G.m.b.H.': ' GmbH ',
205 | ' Mr.': ' Mister ',
206 | '°': ' Grad ',
207 | ' m. E.': ' meines Erachtens ',
208 | ' m.E.': ' meines Erachtens ',
209 | ' Ew.': ' Eure ',
210 | ' a.O.': ' an der Oder ',
211 | ' d.': ' der ',
212 | ' Ev.': ' Evangelium ',
213 | ' Sr.': ' seiner ',
214 | ' hl.': ' heilige ',
215 | ' Hr.': ' Herr ',
216 | 'd.i.': ' das ist ',
217 | ' Aufl.': ' Auflage ',
218 | "A. d. Üb.":" Anmerkung der Übersetzerin ",
219 | " gest.": " gestorben "
220 |
221 |
222 |
223 | }
224 | for input, target in beforeReplacement.items():
225 | text = text.replace(input,target)
226 | for input, target in textReplacement.items():
227 | text = text.replace(input,target)
228 | for input, target in baseReplacement.items():
229 | text = text.replace(input,target)
230 |
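# Ordering note: the book-specific textReplacement runs after the
# non-breaking-space fix in beforeReplacement but before baseReplacement,
# so config entries still see the original punctuation. The abbreviations
# table above is never applied automatically; it only feeds the checks
# below, which print a suggested replacement dict and then raise, so that
# every expansion is reviewed and copied into the book's config by hand.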
231 | self.pathUtil.writeFile(text, self.saveFile)
232 |
233 | remainingNumbers = [s for s in text.split() if bool(re.search(r'\d', s))]
234 | if len(remainingNumbers)>0:
235 | print('there are remaining numbers inside the text')
236 | print(remainingNumbers)
237 | replacements = {}
238 | for word in remainingNumbers:
239 | replacements[word] = self.textNormalizer.normalize(word)
240 | replacements = dict(sorted(replacements.items(), key=lambda item: len(item[0]), reverse=True))
241 | print(json.dumps(replacements, indent=4, ensure_ascii=False))
242 |
243 | raise Exception('there are remaining numbers inside the text')
244 |
245 | remainingAbbreviations = [ab for ab in abbreviations.keys() if ab in text]
246 | if len(remainingAbbreviations)>0:
247 | print('there are remaining abbreviations inside the text')
248 | print(remainingAbbreviations)
249 | replacements = {key: value for (key,value) in abbreviations.items() if key in remainingAbbreviations}
250 | replacements = dict(sorted(replacements.items(), key=lambda item: len(item[0]), reverse=True))
251 | print(json.dumps(replacements, indent=4, ensure_ascii=False))
252 | raise Exception('there are remaining abbreviations inside the text')
253 |
254 | aToZ = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
255 | possibleAbbreviations = [' '+char+'.' for char in 'abcdefghijklmnopqrstuvwxyz' if ' '+char+'.' in text] + [' '+char+char2+'.' for char in aToZ for char2 in aToZ if ' '+char+char2+'.' in text]
256 | shortWords = [' Co.', ' go.', ' Da.',' na.',' ab.', ' an.', ' da.', ' du.', ' er.', ' es.', ' ja.', ' so.', ' um.', ' zu.', ' Ja.', ' Ad.', ' je.', ' Es.', ' ob.', ' is.', ' tu.', ' Hm.', ' So.', ' wo.', ' ha.', ' he.', ' Du.', ' du.', ' Nu.', ' in.']
257 | possibleAbbreviations = [ab for ab in possibleAbbreviations if ab not in shortWords]
258 | if len(possibleAbbreviations)>0:
259 | print('there are remaining possible abbreviations inside the text')
260 | print(possibleAbbreviations)
261 | raise Exception('there are remaining possible abbreviations inside the text')
262 |
263 | allowedChars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ äöüßÖÄÜ .,;?!:" \n'
264 | remainingNotAllowedChars = [char for char in text if char not in allowedChars]
265 | if len(remainingNotAllowedChars)>0:
266 | print('there are remaining disallowed chars inside the text')
267 | print(remainingNotAllowedChars)
268 | raise Exception('there are remaining disallowed chars inside the text')
269 | return text
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /scripts/createDataset.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection 3 | import datasetWorkflow 4 | import scripts.createDatasetConfig as createDatasetConfig 5 | from huiAudioCorpus.utils.PathUtil import PathUtil 6 | import os 7 | 8 | pathUtil = PathUtil() 9 | basePath = createDatasetConfig.__path__[0] # type: ignore 10 | 11 | externalPaths = [ 12 | ] 13 | 14 | dataBasePath = datasetWorkflow.__path__[0] # type: ignore 15 | for path in externalPaths: 16 | if pathUtil.fileExists(path): 17 | dataBasePath = path 18 | 19 | def logStep(name): 20 | print('') 21 | print('') 22 | print('#######################################################') 23 | print(name) 24 | print('#######################################################') 25 | print('') 26 | 27 | ### load all configurations 28 | bernd_1 = pathUtil.loadJson( 29 | basePath + '/Bernd_Ungerer_tausendUndEineNacht.json') 30 | bernd_2 = pathUtil.loadJson(basePath + '/Bernd_Ungerer_other.json') 31 | bernd = {**bernd_1, **bernd_2} 32 | hokuspokus = pathUtil.loadJson(basePath + '/Hokuspokus.json') 33 | redaer = pathUtil.loadJson(basePath + '/redaer.json') 34 | friedrich = pathUtil.loadJson(basePath + '/Friedrich.json') 35 | eva = pathUtil.loadJson(basePath + '/Eva.json') 36 | karlsson = pathUtil.loadJson(basePath + '/Karlsson.json') 37 | sonja = pathUtil.loadJson(basePath + '/Sonja.json') 38 | 39 | allLibriboxIds = [author[key]['LibrivoxBookName'] for author in [ 40 | bernd, hokuspokus, friedrich, eva, karlsson, redaer] for key in author] 41 | duplicatIds = set([x for x in allLibriboxIds if allLibriboxIds.count(x) > 1]) 42 | 43 | if len(duplicatIds) > 0: 44 | raise Exception("Duplicate Librivox ids: " + str(duplicatIds)) 45 | 46 | 47 | # configere this object to only create a single speacker 48 | allConfigs = {**bernd, **hokuspokus, **friedrich, **eva, **karlsson, **sonja} 49 | allConfigs = sonja 50 | #allConfigs = redaer 51 | 52 | # this is needed for the statistic and split into others 53 | specialSpeackers = ['Bernd_Ungerer', 'Eva_K', 'Friedrich', 'Hokuspokus', 'Karlsson'] 54 | 55 | workflowConfig = { 56 | 'continueOnError': False, 57 | 'prepareAudio': True, 58 | 'prepareText': True, 59 | 'transcriptText': True, 60 | 'alignText': True, 61 | 'finalize': True, 62 | 'audioRawStatistic': True, 63 | 'cleanStatistic': True, 64 | 'fullStatistic': True, 65 | 'generateClean': True 66 | } 67 | 68 | 69 | step0Path = dataBasePath + '/overview' 70 | logStep('Step0_Overview') 71 | config = { 72 | 'audiosFromLibrivoxPersistenz': { 73 | 'bookName': '', 74 | 'savePath': '', 75 | 'chapterPath': '' 76 | }, 77 | 'step0_Overview': { 78 | 'savePath': step0Path 79 | } 80 | } 81 | DependencyInjection(config).step0_Overview.run() 82 | 83 | finalDatasetPath = dataBasePath + '/finalDataset' 84 | finalDatasetPathClean = dataBasePath + '/finalDatasetClean' 85 | step7Path = dataBasePath + 
'/rawStatistic' 86 | setp8Path = dataBasePath + '/datasetStatistic' 87 | setp8Path_clean = dataBasePath + '/datasetStatisticClean' 88 | 89 | 90 | def cleanFilter(input): 91 | input = input[input['minSilenceDB'] < -50] 92 | input = input[input['silencePercent'] < 45] 93 | input = input[input['silencePercent'] > 10] 94 | return input 95 | 96 | def runWorkflow(params: Dict, workflowConfig: Dict): 97 | print(params) 98 | bookBasePath = dataBasePath + '/books/' 99 | 100 | step1Path = bookBasePath + params['title'] + '/Step1_DownloadAudio' 101 | step1PathAudio = step1Path + '/audio' 102 | step1PathChapter = step1Path + '/chapter.csv' 103 | step2Path = bookBasePath + params['title'] + '/Step2_SplitAudio' 104 | step2_1_Path = bookBasePath + params['title'] + '/Step2_1_AudioStatistic' 105 | 106 | step2PathAudio = step2Path + '/audio' 107 | step3Path = bookBasePath + params['title'] + '/Step3_DownloadText' 108 | step3PathText = step3Path + '/text.txt' 109 | step3_1_Path = bookBasePath + params['title'] + '/Step3_1_PrepareText' 110 | step3_1_PathText = step3_1_Path + '/text.txt' 111 | 112 | step4Path = bookBasePath + params['title'] + '/Step4_TranscriptAudio' 113 | step5Path = bookBasePath + params['title'] + '/Step5_AlignText' 114 | step6Path = bookBasePath + params['title'] + '/Step6_FinalizeDataset' 115 | 116 | if workflowConfig['prepareAudio']: 117 | logStep('Step1_DowloadAudio') 118 | config = { 119 | 'audiosFromLibrivoxPersistenz': { 120 | 'bookName': params['LibrivoxBookName'], 121 | 'savePath': step1PathAudio + '/', 122 | 'chapterPath': step1PathChapter 123 | }, 124 | 'step1_DownloadAudio': { 125 | 'savePath': step1Path 126 | } 127 | } 128 | DependencyInjection(config).step1_DownloadAudio.run() 129 | 130 | logStep('Step2_SplitAudio') 131 | config = { 132 | 'audioSplitTransformer': { 133 | 'minAudioDuration': 5, 134 | 'maxAudioDuration': 40 135 | }, 136 | 'audioPersistenz': { 137 | 'loadPath': step1PathAudio, 138 | 'savePath': step2PathAudio, 139 | 'fileExtension': 'mp3' 140 | }, 141 | 'audioLoudnessTransformer': { 142 | 'loudness': -20 143 | }, 144 | 'step2_SplitAudio': { 145 | 'bookName': params['title'], 146 | 'savePath': step2Path, 147 | 'remapSort': params['remapSort'] if 'remapSort' in params else None 148 | } 149 | } 150 | DependencyInjection(config).step2_SplitAudio.run() 151 | 152 | logStep('Step2_1_AudioStatistic') 153 | config = { 154 | 'step2_1_AudioStatistic': { 155 | 'savePath': step2_1_Path, 156 | }, 157 | 'audioPersistenz': { 158 | 'loadPath': step2PathAudio 159 | }, 160 | 'plot': { 161 | 'showDuration': 1, 162 | 'savePath': step2_1_Path 163 | } 164 | } 165 | DependencyInjection(config).step2_1_AudioStatistic.run() 166 | 167 | if workflowConfig['prepareText']: 168 | logStep('Step3_DowloadText') 169 | config = { 170 | 'GutenbergBookPersistenz': { 171 | 'textId': params['GutenbergId'], 172 | 'savePath': step3PathText 173 | }, 174 | 'step3_DowloadText': { 175 | 'savePath': step3Path 176 | } 177 | } 178 | DependencyInjection(config).step3_DowloadText.run() 179 | 180 | logStep('Step3_1_PrepareText') 181 | config = { 182 | 'step3_1_PrepareText': { 183 | 'savePath': step3_1_Path, 184 | 'loadFile': step3PathText, 185 | 'saveFile': step3_1_PathText, 186 | 'textReplacement': params['textReplacement'], 187 | 'startSentence': params['GutenbergStart'], 188 | 'endSentence': params['GutenbergEnd'], 189 | 'moves': params['moves'] if 'moves' in params else [], 190 | 'remove': params['remove'] if 'remove' in params else [] 191 | } 192 | } 193 | 
DependencyInjection(config).step3_1_PrepareText.run() 194 | 195 | if workflowConfig['transcriptText']: 196 | logStep('Step4_TranscriptAudio') 197 | config = { 198 | 'step4_TranscriptAudio': { 199 | 'savePath': step4Path, 200 | }, 201 | 'audioPersistenz': { 202 | 'loadPath': step2PathAudio 203 | }, 204 | 'transcriptsPersistenz': { 205 | 'loadPath': step4Path, 206 | } 207 | } 208 | DependencyInjection(config).step4_TranscriptAudio.run() 209 | 210 | if workflowConfig['alignText']: 211 | logStep('Step5_AlignText') 212 | config = { 213 | 'step5_AlignText': { 214 | 'savePath': step5Path, 215 | 'textToAlignPath': step3_1_PathText 216 | }, 217 | 'transcriptsPersistenz': { 218 | 'loadPath': step4Path, 219 | 'savePath': step5Path 220 | } 221 | } 222 | DependencyInjection(config).step5_AlignText.run() 223 | 224 | if workflowConfig['finalize']: 225 | logStep('Step6_FinalizeDataset') 226 | config = { 227 | 'step6_FinalizeDataset': { 228 | 'savePath': step6Path, 229 | 'chapterPath': step1PathChapter 230 | }, 231 | 'audioPersistenz': { 232 | 'loadPath': step2PathAudio, 233 | 'savePath': finalDatasetPath 234 | }, 235 | 'transcriptsPersistenz': { 236 | 'loadPath': step5Path, 237 | 'savePath': finalDatasetPath 238 | } 239 | } 240 | DependencyInjection(config).step6_FinalizeDataset.run() 241 | 242 | 243 | summary = {} 244 | for configName in allConfigs: 245 | print('+++++++++++++++++++++++++++++++++++++++++') 246 | print('+++++++++++++++++++++++++++++++++++++++++') 247 | print('+++++++++++++++++++++++++++++++++++++++++') 248 | logStep(configName) 249 | print('+++++++++++++++++++++++++++++++++++++++++') 250 | print('+++++++++++++++++++++++++++++++++++++++++') 251 | print('+++++++++++++++++++++++++++++++++++++++++') 252 | 253 | config = allConfigs[configName] 254 | if workflowConfig['continueOnError']: 255 | try: 256 | runWorkflow(config, workflowConfig) 257 | summary[config['title']] = 'finished' 258 | except: 259 | summary[config['title']] = 'error' 260 | else: 261 | runWorkflow(config, workflowConfig) 262 | print(summary) 263 | 264 | if workflowConfig['audioRawStatistic']: 265 | logStep('audioRawStatistic') 266 | diConfig = { 267 | 'step7_AudioRawStatistic': { 268 | 'savePath': step7Path, 269 | 'loadPath': finalDatasetPath 270 | } 271 | } 272 | DependencyInjection(diConfig).step7_AudioRawStatistic.run() 273 | 274 | if workflowConfig['fullStatistic']: 275 | logStep('fullStatistic') 276 | diConfig = { 277 | 'step8_DatasetStatistic': { 278 | 'savePath': setp8Path, 279 | 'loadPath': step7Path + '/overview.csv', 280 | 'specialSpeackers': specialSpeackers, 281 | 'filter': None 282 | }, 283 | 'audioPersistenz': { 284 | 'loadPath':'' 285 | }, 286 | 'transcriptsPersistenz': { 287 | 'loadPath':'' 288 | }, 289 | 'plot': { 290 | 'showDuration': 0 291 | } 292 | } 293 | DependencyInjection(diConfig).step8_DatasetStatistic.run() 294 | 295 | if workflowConfig['cleanStatistic']: 296 | logStep('cleanStatistic') 297 | diConfig = { 298 | 'step8_DatasetStatistic': { 299 | 'savePath': setp8Path_clean, 300 | 'loadPath': step7Path + '/overview.csv', 301 | 'specialSpeackers': specialSpeackers, 302 | 'filter': cleanFilter 303 | }, 304 | 'audioPersistenz': { 305 | 'loadPath':'' 306 | }, 307 | 'transcriptsPersistenz': { 308 | 'loadPath':'' 309 | }, 310 | 'plot': { 311 | 'showDuration': 0 312 | } 313 | } 314 | DependencyInjection(diConfig).step8_DatasetStatistic.run() 315 | 316 | if workflowConfig['generateClean']: 317 | logStep('generateClean') 318 | diConfig = { 319 | 'step9_GenerateCleanDataset': { 320 | 'savePath': 
finalDatasetPath, 321 | 'infoFile': step7Path +'/overview.csv', 322 | 'filter': cleanFilter 323 | }, 324 | 'transcriptsPersistenz': { 325 | 'loadPath': finalDatasetPath, 326 | 'savePath': finalDatasetPathClean 327 | }, 328 | 'audioPersistenz': { 329 | 'loadPath': finalDatasetPath, 330 | 'savePath': finalDatasetPathClean 331 | }, 332 | } 333 | DependencyInjection(diConfig).step9_GenerateCleanDataset.run() -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Friedrich.json: -------------------------------------------------------------------------------- 1 | { 2 | "vierzehnte_dezember": { 3 | "title": "vierzehnte_dezember", 4 | "LibrivoxBookName": "vierzehnte Dezember", 5 | "GutenbergId": "mereschk/14dezemb/14dezemb.html", 6 | "GutenbergStart": "", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "III.": "der dritte", 10 | "II.": "der zweite", 11 | "I.": "der erste", 12 | "No": "Nummer", 13 | " z. B.": " zum Beispiel ", 14 | "60 000": "sechzigtausend", 15 | "1817": "achtzehnhundertsiebzehn", 16 | "1812": "achtzehnhundertzwölf", 17 | "1825": "achtzehnhundertfünfundzwanzig", 18 | "1801": "achtzehnhundertein", 19 | "1809": "achtzehnhundertneun", 20 | "31.,": "einunddreißigste", 21 | "31.": "einunddreißigsten", 22 | "27.": "siebenundzwanzigsten", 23 | "13.": "dreizehnsten", 24 | "19.": "neunzehnsten", 25 | "14.": "vierzehnsten", 26 | "15.": "fünfzehnsten", 27 | "21.": "einundzwanzigsten", 28 | "22.": "zweiundzwanzigsten", 29 | "11.": "elfsten", 30 | "12.": "zwölfsten", 31 | "18.": "achtzehnsten", 32 | "28.": "achtundzwanzigsten", 33 | "29.": "neunundzwanzigsten", 34 | "30.": "dreißigsten", 35 | "700": "siebenhundert", 36 | "116": "einhundertsechzehn", 37 | "4.": "vierten", 38 | "45": "fünfundvierzig", 39 | "11": "elf", 40 | "2.": "zweiten", 41 | "3.": "dritten", 42 | "12": "zwölf", 43 | "7": "sieben", 44 | "9": "neun", 45 | "ù": "u", 46 | "â": "a", 47 | "œ": "oe", 48 | "ï": "i", 49 | "ç": "c", 50 | "î": "i" 51 | } 52 | }, 53 | "lustige_geschichten": { 54 | "title": "lustige_geschichten", 55 | "LibrivoxBookName": "Lustige Geschichten", 56 | "GutenbergId": "cechov/novel5/novel5.html", 57 | "GutenbergStart": "", 58 | "GutenbergEnd": "", 59 | "textReplacement": { 60 | " d. h.": " das heißt ", 61 | " z. B.": " zum Beispiel ", 62 | " usw.": " und so weiter", 63 | " Ew.": "Eure", 64 | " d.": " der ", 65 | "&": " und ", 66 | "a. D.": " a D ", 67 | " Co.": " Co ", 68 | " II. ": " zweite ", 69 | "75.000": "fünfundsiebzigtausend", 70 | "9499": "neuntausendvierhundertneunundneunzig", 71 | "1883": "achtzehnhundertdreiundachtzig", 72 | "35,8": "fünfunddreißig komma acht", 73 | "1842": "achtzehnhundertzweiundvierzig", 74 | "209": "zweihundertneun", 75 | "29.": "neunundzwanzigsten", 76 | "223": "zweihundertdreiundzwanzig", 77 | "219": "zweihundertneunzehn", 78 | "26": "sechsundzwanzig", 79 | "46": "sechsundvierzig", 80 | "Nr.": "Nummer" 81 | } 82 | }, 83 | "saemtliche_schriften6": { 84 | "title": "saemtliche_schriften6", 85 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 6", 86 | "GutenbergId": "ossietzk/schrift1/chap251.html", 87 | "GutenbergStart": "", 88 | "GutenbergEnd": "", 89 | "textReplacement": { 90 | " u.a.": " unter anderem ", 91 | " d.h.": " das heißt ", 92 | " z.B.": " zum Beispiel ", 93 | " Dr.": " Doktor ", 94 | " v.": " von ", 95 | " u.": " und ", 96 | " d.": " der ", 97 | "&": " und ", 98 | " Co.": " Co ", 99 | "a. 
D.": " a D ", 100 | "1911": "neunzehnhundertelf", 101 | "1919": "neunzehnhundertneunzehn", 102 | "1500": "fünfzehnhundert", 103 | "1914": "neunzehnhundertvierzehn", 104 | "1000": "eintausend", 105 | "1821": "achtzehnhunderteinundzwanzig", 106 | "1881": "achtzehnhunderteinundachtzig", 107 | "1915": "neunzehnhundertfünfzehn", 108 | "1857": "achtzehnhundertsiebenundfünfzig", 109 | "1880": "achtzehnhundertachtzig", 110 | "1935": "neunzehnhundertfünfunddreißig", 111 | "1908": "neunzehnhundertacht", 112 | "1920": "neunzehnhundertzwanzig", 113 | "1921": "neunzehnhunderteinundzwanzig", 114 | "21.": "einundzwanzigsten", 115 | "23.": "dreiundzwanzigsten", 116 | "27.": "siebenundzwanzigsten", 117 | "17.": "siebzehnsten", 118 | "18.": "achtzehnsten", 119 | "20.": "zwanzigsten", 120 | "184": "einhundertvierundachtzig", 121 | "11.": "elfsten", 122 | "22.": "zweiundzwanzigsten", 123 | "300": "dreihundert", 124 | "500": "fünfhundert", 125 | "100": "einhundert", 126 | "231": "zweihunderteinunddreißig", 127 | "24.": "vierundzwanzigsten", 128 | "6.": "sechste", 129 | "22": "zweiundzwanzig", 130 | "26": "sechsundzwanzig", 131 | "18": "achtzehn", 132 | "60": "sechzig", 133 | "80": "achtzig", 134 | "35": "fünfunddreißig", 135 | "15": "fünfzehn", 136 | "10": "zehn", 137 | "38": "achtunddreißig", 138 | "4.": "vierte", 139 | "6": "sechs", 140 | "§": "Paragraph", 141 | "ç": " ", 142 | "[": " ", 143 | "]": " " 144 | } 145 | }, 146 | "saemtliche_schriften5": { 147 | "title": "saemtliche_schriften5", 148 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 5", 149 | "GutenbergId": "ossietzk/schrift1/chap201.html", 150 | "GutenbergStart": "", 151 | "GutenbergEnd": "dieses Abends gelernt haben. Es wird noch einiges über diesen Trauerfall zu sagen sein.", 152 | "textReplacement": { 153 | " d. h.": " das heißt ", 154 | " usw.": " und so weiter", 155 | " dgl.": " dergleichen ", 156 | " Dr.": " Doktor ", 157 | " H. ": " Herr ", 158 | " St.": " Sankt ", 159 | " v.": " von ", 160 | " u.": " und ", 161 | " d.": " der ", 162 | "&": " und ", 163 | " Co.": " Co ", 164 | " II. 
": " zweite ", 165 | " z.": " z ", 166 | " CD.": " CD ", 167 | " Fr.": " Fr ", 168 | "100 000": "einhunderttausend", 169 | "30 000": "dreißigtausend", 170 | "1814": "achtzehnhundertvierzehnte", 171 | "1336": "dreizehnhundertsechsunddreißig", 172 | "1793": "siebzehnhundertdreiundneunzig", 173 | "1899": "achtzehnhundertneunundneunzig", 174 | "1916": "neunzehnhundertsechzehn", 175 | "1800": "achtzehnhundert", 176 | "1914": "neunzehnhundertvierzehn", 177 | "1918": "neunzehnhundertachtzehn", 178 | "1902": "neunzehnhundertzwei", 179 | "1490": "vierzehnhundertneunzig", 180 | "1921": "neunzehnhunderteinundzwanzig", 181 | "1898": "achtzehnhundertachtundneunzig", 182 | "1917": "neunzehnhundertsiebzehn", 183 | "1848": "achtzehnhundertachtundvierzig", 184 | "212": "zweihundertzwölf", 185 | "28.": "achtundzwanzigsten", 186 | "109": "einhundertneun", 187 | "118": "einhundertachtzehn", 188 | "47": "siebenundvierzig", 189 | "4.": "vierten", 190 | "17": "siebzehn", 191 | "15": "fünfzehn", 192 | "10": "zehn", 193 | "31": "einunddreißig", 194 | "30": "dreißig", 195 | "1.": "ersten", 196 | "54": "vierundfünfzig", 197 | "[": " ", 198 | "]": " " 199 | } 200 | }, 201 | "homo_sapiens": { 202 | "title": "homo_sapiens", 203 | "LibrivoxBookName": "Homo sapiens - Romantrilogie", 204 | "GutenbergId": "przybysz/homosapi/homosapi.html", 205 | "GutenbergStart": "", 206 | "GutenbergEnd": "", 207 | "textReplacement": { 208 | "IV.": " vierten ", 209 | "ô": "o", 210 | "ó": "o", 211 | " u. s. w.": " und so weiter ", 212 | " d. h.": " das heißt ", 213 | " z. B.": " zum Beispiel ", 214 | " dgl.": " dergleichen ", 215 | " u.": " und ", 216 | " d.": " der ", 217 | "1894": "achtzehnhundertvierundneunzig", 218 | "28.": "achtundzwanzigsten", 219 | "100": "einhundert", 220 | "21.": "einundzwanzigsten", 221 | "183": "einhundertdreiundachtzig", 222 | "10": "zehn", 223 | "13": "dreizehn", 224 | "26": "sechsundzwanzig", 225 | "1.": "erster", 226 | "30": "dreißig", 227 | "90": "neunzig" 228 | } 229 | }, 230 | "aus_allen_winkeln": { 231 | "title": "aus_allen_winkeln", 232 | "LibrivoxBookName": "Aus allen Winkeln - Erzählungen", 233 | "GutenbergId": "heiberg/erzaehlg/erzaehlg.html", 234 | "GutenbergStart": "In einer der besten Gegenden der Stadt", 235 | "GutenbergEnd": "", 236 | "remapSort": [ 237 | 5, 238 | 7, 239 | 13, 240 | 15, 241 | 16, 242 | 1, 243 | 10, 244 | 6, 245 | 11, 246 | 14, 247 | 4, 248 | 9, 249 | 12, 250 | 0, 251 | 8, 252 | 2, 253 | 3 254 | ], 255 | "textReplacement": { 256 | " H. ": " H ", 257 | " v.": " von ", 258 | "&": " und ", 259 | " geb.": " geboren ", 260 | " Dr.": " Doktor ", 261 | "250,000": "zweihundertfünfzigtausend", 262 | "1867": "achtzehnhundertsiebenundsechzig", 263 | "1868": "achtzehnhundertachtundsechzig", 264 | "1873": "achtzehnhundertdreiundsiebzig", 265 | "1729": "siebzehnhundertneunundzwanzig", 266 | "25.": "fünfundzwanzigsten", 267 | "10.": "zehnsten", 268 | "11.": "elfsten", 269 | "13.": "dreizehnsten", 270 | "800": "achthundert", 271 | "200": "zweihundert", 272 | "400": "vierhundert", 273 | "11": "elf", 274 | "18": "achtzehn", 275 | "30": "dreißig", 276 | "4.": "vierte", 277 | "*": " ", 278 | " af.": "af .", 279 | "[": " ", 280 | "]": " " 281 | } 282 | }, 283 | "falsches_geld": { 284 | "title": "falsches_geld", 285 | "LibrivoxBookName": "Falsches Geld", 286 | "GutenbergId": "zapp/falsgeld/falsgeld.html", 287 | "GutenbergStart": "", 288 | "GutenbergEnd": "", 289 | "textReplacement": { 290 | " d. h.": " das heißt ", 291 | " z. 
B.": " zum Beispiel ", 292 | " geb.": " geboren ", 293 | " usw.": " und so weiter", 294 | " p.": " P ", 295 | "4,40 Mk.": "Vier Mark vierzig", 296 | "30jährigen": "dreißigjährigen", 297 | "20jährigen": "zwanzigjährigen", 298 | "0459.": "null vier fünf neun", 299 | "4905": "viertausendneunhundertfünf", 300 | "9054": "neuntausendvierundfünfzig", 301 | "l0000": "zehntausend", 302 | "5049": "fünftausendneunundvierzig", 303 | "0246": "null zwei vier sechs", 304 | "4,40": "vier komma vier null", 305 | "2000": "zweitausend", 306 | "3000": "dreitausend", 307 | "5000": "fünftausend", 308 | "1000": "eintausend", 309 | "300": "dreihundert", 310 | "200": "zweihundert", 311 | "26.": "sechsundzwanzigsten", 312 | "100": "einhundert", 313 | "20.": "zwanzigsten", 314 | "5o,": "So,", 315 | "400": "vierhundert", 316 | "Pf.": "Pfennig", 317 | "Mk.": "Mark", 318 | "58": "achtundfünfzig", 319 | "12": "zwölf", 320 | "27": "siebenundzwanzig", 321 | "30": "dreißig", 322 | "24": "vierundzwanzig", 323 | "22": "zweiundzwanzig", 324 | "10": "zehn", 325 | "20": "zwanzig", 326 | "50": "fünfzig", 327 | "4O": "vierO", 328 | "14": "vierzehn", 329 | "90": "neunzig", 330 | "11": "elf", 331 | "60": "sechzig", 332 | "25": "fünfundzwanzig", 333 | "2": "zwei", 334 | "3": "drei", 335 | "4": "vier", 336 | "1": "ein", 337 | "8": "acht", 338 | "7": "sieben", 339 | "½": "halb" 340 | } 341 | }, 342 | "judith_trachtenberg": { 343 | "title": "judith_trachtenberg", 344 | "LibrivoxBookName": "Judith Trachtenberg", 345 | "GutenbergId": "franzos/trachten/trachten.html", 346 | "GutenbergStart": "", 347 | "GutenbergEnd": "", 348 | "textReplacement": { 349 | "&": " und " 350 | } 351 | }, 352 | "verbrechen": { 353 | "title": "verbrechen", 354 | "LibrivoxBookName": "Verbrechen", 355 | "GutenbergId": "gorki/verbrec1/verbrec1.html", 356 | "GutenbergStart": "", 357 | "GutenbergEnd": "", 358 | "textReplacement": {} 359 | }, 360 | "furchtbare_rache": { 361 | "title": "furchtbare_rache", 362 | "LibrivoxBookName": "Furchtbare Rache", 363 | "GutenbergId": "gogol/rache/rache.html", 364 | "GutenbergStart": "", 365 | "GutenbergEnd": "", 366 | "textReplacement": {} 367 | }, 368 | "gruene_nachtigall": { 369 | "title": "gruene_nachtigall", 370 | "LibrivoxBookName": "grüne Nachtigall und andere Novellen", 371 | "GutenbergId": "kusmin/grnachti/grnachti.html", 372 | "GutenbergStart": "Das grüne Haus glich so sehr", 373 | "GutenbergEnd": "", 374 | "textReplacement": { 375 | "1811": "achtzehnhundertelf", 376 | "1.": "erstens", 377 | "2.": "zweitens", 378 | "a. D.": " A D ", 379 | "1918": "neunzehnhundertachtzehn", 380 | "5.": "fünften", 381 | "7.": "siebten", 382 | "9.": "neunten" 383 | }, 384 | "moves": [ 385 | { 386 | "start": "Der Traum der letzten Nacht rief in meiner Erinnerung wieder alles wach, was ich so gerne vergessen möchte.", 387 | "end": "ich später in seinen Umarmungen niemals jene fremden Arme mit dem braunen Halbmond auf der blassen Haut wiedererkannte.", 388 | "after": "freundlich und still, wie eine echte Meisterin in ihrem Fach." 
389 | } 390 | ] 391 | } 392 | } -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Karlsson.json: -------------------------------------------------------------------------------- 1 | { 2 | "unterm_birnbaum": { 3 | "title": "unterm_birnbaum", 4 | "LibrivoxBookName": "Unterm Birnbaum", 5 | "GutenbergId": 26686, 6 | "GutenbergStart": "Vor dem in dem großen und reichen Oderbruchdorfe Tschechin um", 7 | "GutenbergEnd": "gesponnen, ’s kommt doch alles an die Sonnen._‹«", 8 | "textReplacement": { 9 | "d. M.": " des Monats ", 10 | " gest.": " gestorben ", 11 | " geb.": " geboren ", 12 | " etc.": " et cetera ", 13 | " Se.": " seine ", 14 | " v.": " von ", 15 | "&": " und ", 16 | "1831": "achtzehnhunderteinunddreißig", 17 | "1790": "siebzehnhundertneunzig", 18 | "1832": "achtzehnhundertzweiunddreißig", 19 | "80er": "achtziger", 20 | "30.": "dreißigsten", 21 | "29.": "neunundzwanzigsten", 22 | "14.": "vierzehnsten", 23 | "36": "sechsunddreißig", 24 | "20": "zwanzig", 25 | "9.": "neunte", 26 | "13": "dreizehn", 27 | "7.": "siebten", 28 | "15": "fünfzehn", 29 | "27": "siebenundzwanzig", 30 | "7": "sieben", 31 | "10": "zehn", 32 | "30": "dreißig", 33 | "14": "vierzehn", 34 | "3.": "dritten", 35 | "6": "sechs", 36 | "2": "zwei", 37 | "9": "neun", 38 | "1": "ein", 39 | "’n ": "n ", 40 | "’n.": "n. ", 41 | "’s.": "s.", 42 | "’s ": "s ", 43 | " mi.": " mi .", 44 | " pp.": " P P ", 45 | " se.": " se .", 46 | " to.": " to .", 47 | " ut.": " ut .", 48 | "XVIII.": " ", 49 | "XVII.": " ", 50 | "XIII.": " ", 51 | "VIII.": " ", 52 | "XII.": " ", 53 | "XVI.": " ", 54 | "XIV.": " ", 55 | "VII.": " ", 56 | "III.": " ", 57 | "IX.": " ", 58 | "VI.": " ", 59 | "II.": " ", 60 | "XV.": " ", 61 | "IV.": " ", 62 | "XI.": " ", 63 | "X.": " ", 64 | "V.": " ", 65 | "I.": " ", 66 | "*": " ", 67 | "ô": "o", 68 | "#": " ", 69 | "ç": "c" 70 | } 71 | }, 72 | "schwle_tage": { 73 | "title": "schwle_tage", 74 | "LibrivoxBookName": "Schwüle Tage", 75 | "GutenbergId": "keyserlg/schwuele/schwuele.html", 76 | "GutenbergStart": "", 77 | "GutenbergEnd": "", 78 | "textReplacement": { 79 | "-h.": " h .", 80 | "ç": "c", 81 | "ακτις αελιου": "aktisch ayileou" 82 | } 83 | }, 84 | "mdchen_vom_moorhof": { 85 | "title": "mdchen_vom_moorhof", 86 | "LibrivoxBookName": "Mädchen vom Moorhof", 87 | "GutenbergId": 20211, 88 | "GutenbergStart": "Es ist in einem Thingsaal, weit draußen auf dem Lande. Am Richtertisch,", 89 | "GutenbergEnd": "um sie stand. Jetzt konnte sie ihm nicht mehr entfliehen.", 90 | "textReplacement": { 91 | "2": " ", 92 | "3": " ", 93 | "4": " ", 94 | "5": " ", 95 | "6": " ", 96 | "À": "A", 97 | "å": "a", 98 | "*": " " 99 | } 100 | }, 101 | "sandmann": { 102 | "title": "sandmann", 103 | "LibrivoxBookName": "Sandmann", 104 | "GutenbergId": "etahoff/sandmann/sandmann.html", 105 | "GutenbergStart": "", 106 | "GutenbergEnd": "", 107 | "textReplacement": { 108 | "30.": "dreißigsten", 109 | "12": "zwölf", 110 | " z. B.": " zum Beispiel ", 111 | " usw.": " und so weiter", 112 | " etc.": " et cetera ", 113 | "[": " ", 114 | "]": " " 115 | } 116 | }, 117 | "spuk": { 118 | "title": "spuk", 119 | "LibrivoxBookName": "Spuk", 120 | "GutenbergId": "klabund/spuk/spuk.html", 121 | "GutenbergStart": "", 122 | "GutenbergEnd": "", 123 | "textReplacement": { 124 | " d. 
h.": " das heißt ", 125 | " Kgl.": " königlich ", 126 | "Abb.": " Abbildung ", 127 | " usw.": " und so weiter", 128 | " Nr.": " Nummer ", 129 | "1921": "neunzehnhunderteinundzwanzig", 130 | "38,9,": "achtunddreißig komma neun,", 131 | "0,02.": "null komma null zwei.", 132 | "1891": "achtzehnhunderteinundneunzig", 133 | "8000": "achttausend", 134 | "0,6.": "null komma sechs.", 135 | "39,1": "neununddreißig komma eins", 136 | "105": "einhundertfünf", 137 | "7314": "siebentausenddreihundertvierzehn", 138 | "2–3": "zwei bis drei", 139 | "2–4": "zwei bis vier", 140 | "5:4.": "Fünf zu Vier.", 141 | "23": "dreiundzwanzig", 142 | "20": "zwanzig", 143 | "38": "achtunddreißig", 144 | "28": "achtundzwanzig", 145 | "13,": "dreizehn,", 146 | "25:": "fünfundzwanzig:", 147 | "999": "neunhundertneunundneunzig", 148 | "2,": "zwei,", 149 | "3.": "dritte", 150 | "50": "fünfzig", 151 | "0,": "Oh, ", 152 | "25": "fünfundzwanzig", 153 | "91": "einundneunzig", 154 | "15": "fünfzehn", 155 | "13": "dreizehn", 156 | "35": "fünfunddreißig", 157 | "5": "fünf", 158 | "4": "vier", 159 | "0!": "Oh, ", 160 | "0": "Oh, ", 161 | "2": "zwei", 162 | "1": "eins", 163 | "3": "drei", 164 | "Yo.": "Yo ." 165 | } 166 | }, 167 | "odysseus": { 168 | "title": "odysseus", 169 | "LibrivoxBookName": "Odysseus", 170 | "GutenbergId": "beckerkf/altewelt/altewelt.html", 171 | "GutenbergStart": "", 172 | "GutenbergEnd": "und den Gedanken an die Todesgöttinnen nicht scheut!", 173 | "textReplacement": { 174 | " u.s.w.": " und so weiter ", 175 | "ë": "e" 176 | } 177 | }, 178 | "herr_und_knecht": { 179 | "title": "herr_und_knecht", 180 | "LibrivoxBookName": "Herr und Knecht", 181 | "GutenbergId": 33266, 182 | "GutenbergStart": "Es war in den siebziger Jahren, ", 183 | "GutenbergEnd": "", 184 | "textReplacement": { 185 | "7.": "siebten" 186 | } 187 | }, 188 | "smtliche_schriften_19111921_teil_1": { 189 | "title": "smtliche_schriften_19111921_teil_1", 190 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 1", 191 | "GutenbergId": "ossietzk/schrift1/schrift1.html", 192 | "GutenbergStart": "", 193 | "GutenbergEnd": "Mitteilungen der Deutschen Friedensgesellschaft. Januar 1920", 194 | "textReplacement": { 195 | "LA Berlin, N Madrasch Groschopp, Rep.200, Acc. 4288. Nr.22": "Erschienen neunzenhundertsechzen Literaturanstalt Berlin. N Madrasch Groschopp, Rep zweihundert, Acc viertausendzweihundertachtundachtzig Nummmer zweiundzwanzig", 196 | "Von Dr. med. M. von Kemnitz. Verlag Ernst Reinhardt, München. Brosch. Mk. 6.–, geb. Mk. 8,50.": "Von Doktor der Medizin M von Kemnitz. Verlag Ernst Reinhardt, München. Broschiert sechs Mark, gebunden acht Mark fünfzig. ", 197 | "brosch.": "broschiert", 198 | "Mk. -.80": " achtzig Pfennig", 199 | "M. 1.20": "eine Mark zwanzig", 200 | "G.m.b.H.": " GmbH ", 201 | " H. v.": " Herr von ", 202 | " bzw.": " beziehungsweise ", 203 | " usw.": " und so weiter", 204 | " d.h.": " das heißt ", 205 | " z.B.": " zum Beispiel ", 206 | " Nr.": " Nummer ", 207 | " Pf.": " Pfennig ", 208 | " Dr.": " Doktor ", 209 | " H. 
": " Herr ", 210 | " v.": " von ", 211 | " u.": " und ", 212 | " d.": " der ", 213 | "§": " Paragraph ", 214 | "a.D.": " A D ", 215 | "II.": " der zweite ", 216 | "IV.": " der vierte ", 217 | "1914/15": "neunzehnhundertvierzehn fünfzehn", 218 | "1870": "achtzehnhundertsiebzig", 219 | "1830": "achtzehnhundertdreißigte", 220 | "1911": "neunzehnhundertelf", 221 | "1912": "neunzehnhundertzwölf", 222 | "1913": "neunzehnhundertdreizehn", 223 | "78/8": "achtundsiebzig/acht", 224 | "1914": "neunzehnhundertvierzehn", 225 | "4002": "viertausendzwei", 226 | "1799": "siebzehnhundertneunundneunzig", 227 | "4003": "viertausenddrei", 228 | "1917": "neunzehnhundertsiebzehn", 229 | "1918": "neunzehnhundertachtzehn", 230 | "1500": "fünfzehnhundert", 231 | "1919": "neunzehnhundertneunzehn", 232 | "1348": "dreizehnhundertachtundvierzig", 233 | "1848": "achtzehnhundertachtundvierzig", 234 | "1915": "neunzehnhundertfünfzehn", 235 | "1916": "neunzehnhundertsechzehn", 236 | "1920": "neunzehnhundertzwanzig", 237 | "21.": "einundzwanzigsten", 238 | "16.": "sechzehnsten", 239 | "25.": "fünfundzwanzigsten", 240 | "31.": "einunddreißigsten", 241 | "28.": "achtundzwanzigsten", 242 | "33.": "dreiunddreißigsten", 243 | "30.": "dreißigsten", 244 | "18.": "achtzehnsten", 245 | "22.": "zweiundzwanzigsten", 246 | "19.": "neunzehnsten", 247 | "12.": "zwölfsten", 248 | "29.": "neunundzwanzigsten", 249 | "24.": "vierundzwanzigsten", 250 | "11.": "elfsten", 251 | "20,": "zwanzig,", 252 | "180": "einhundertachtzig", 253 | "850": "achthundertfünfzig", 254 | "200": "zweihundert", 255 | "100": "einhundert", 256 | "13.": "dreizehnsten", 257 | "15.": "fünfzehnsten", 258 | "10:": "zehn:", 259 | "20.": "zwanzigsten", 260 | "26.": "sechsundzwanzigsten", 261 | "27": "siebenundzwanzig", 262 | "50": "fünfzig", 263 | "19": "neunzehn", 264 | "5.": "fünfte", 265 | "8.": "achte", 266 | "40": "vierzig", 267 | "4.": "vierte", 268 | "2.": "zweite", 269 | "1.": "erste", 270 | "28": "achtundzwanzig", 271 | "57": "siebenundfünfzig", 272 | "30": "dreißig", 273 | "26": "sechsundzwanzig", 274 | "60": "sechzig", 275 | "9.": "neunte", 276 | "39": "neununddreißig", 277 | "67": "siebenundsechzig", 278 | "20": "zwanzig", 279 | "44": "vierundvierzig", 280 | "24": "vierundzwanzig", 281 | "10": "zehn", 282 | "6.": "sechste", 283 | "47": "siebenundvierzig", 284 | "32": "zweiunddreißig", 285 | "89": "neunundachtzig", 286 | "38": "achtunddreißig", 287 | "1": "ein", 288 | "5": "fünf", 289 | "4": "vier", 290 | "2": "zwei", 291 | "[": " ", 292 | "]": " ", 293 | "*": " ", 294 | "#": " ", 295 | "/": " ", 296 | "â": "a" 297 | } 298 | }, 299 | "smtliche_schriften_19111921_teil_2": { 300 | "title": "smtliche_schriften_19111921_teil_2", 301 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 2", 302 | "GutenbergId": "ossietzk/schrift1/chap051.html", 303 | "GutenbergStart": "", 304 | "GutenbergEnd": "Auf Wiedersehen beim nächsten Putsch ...!", 305 | "textReplacement": { 306 | "24jährige": "vierundzwanzigjährige", 307 | "1916/17.": "neunzehnhundertsechzehnn siebzehnsten", 308 | "2.8.1921": "zweiter achter neunzehnhunderteinundzwanzig", 309 | "159er,": "einhundertneunundfünfziger,", 310 | "Mk. 0,50.": "fünfzig Pfennige ", 311 | "G.m.b.H.": " GmbH ", 312 | " H. v.": " Herr von ", 313 | " d. h.": " das heißt ", 314 | " z. B.": " zum Beispiel ", 315 | " inkl.": " inklusive ", 316 | "U.S.A.": " U S A ", 317 | " u.a.": " unter anderem ", 318 | " usw.": " und so weiter", 319 | " d.h.": " das heißt ", 320 | " z.B.": " zum Beispiel ", 321 | " dgl.": " dergleichen ", 322 | " d. 
J": "des Jahres ", 323 | " Nr.": " Nummer ", 324 | " Mk.": " Mark ", 325 | "Dr.": " Doktor ", 326 | " H. ": " Herr ", 327 | " St.": " Sankt ", 328 | " v.": " von ", 329 | " u.": " und ", 330 | " d.": " der ", 331 | " a. D. ": " A D ", 332 | "a.M.": " am Main ", 333 | "§": " Paragraph ", 334 | "&": " und ", 335 | "3333": "dreitausenddreihundertdreiunddreißig,", 336 | "1348": "dreizehnhundertachtundvierzig", 337 | "1897": "achtzehnhundertsiebenundneunzig", 338 | "11/12": "elf zwölf", 339 | "1814": "achtzehnhundertvierzehn", 340 | "1336": "dreizehnhundertsechsunddreißig", 341 | "1793": "siebzehnhundertdreiundneunzig", 342 | "1899": "achtzehnhundertneunundneunzig", 343 | "1911": "neunzehnhundertelf", 344 | "1914": "neunzehnhundertvierzehn", 345 | "1847": "achtzehnhundertsiebenundvierzig", 346 | "1920": "neunzehnhundertzwanzig", 347 | "1918": "neunzehnhundertachtzehn", 348 | "1813": "achtzehnhundertdreizehn", 349 | "1848": "achtzehnhundertachtundvierzig", 350 | "1,80 Mark": "eine Mark achtzig", 351 | "1850": "achtzehnhundertfünfzig", 352 | "1880": "achtzehnhundertachtzig", 353 | "1870": "achtzehnhundertsiebzig", 354 | "1919": "neunzehnhundertneunzehn", 355 | "1916": "neunzehnhundertsechzehn", 356 | "3000": "dreitausend", 357 | "5000": "fünftausend", 358 | "1100": "eintausendeinhundert", 359 | "1913": "neunzehnhundertdreizehn", 360 | "1915": "neunzehnhundertfünfzehn", 361 | "1520": "fünfzehnhundertzwanzig", 362 | "1910": "neunzehnhundertzehn", 363 | "1921": "neunzehnhunderteinundzwanzig", 364 | "1871": "achtzehnhunderteinundsiebzig", 365 | "170:": "einhundertsiebzig:", 366 | "3,80": "drei komma acht null", 367 | "1896": "achtzehnhundertsechsundneunzig", 368 | "1909": "neunzehnhundertneun", 369 | "1521": "fünfzehnhunderteinundzwanzig", 370 | "1807": "achtzehnhundertsieben", 371 | "1866": "achtzehnhundertsechsundsechzig", 372 | "159.": "einhundertneunundfünfzigte", 373 | "1800": "achtzehnhundert", 374 | "1902": "neunzehnhundertzwei", 375 | "1490": "vierzehnhundertneunzig", 376 | "1891": "achtzehnhunderteinundneunzig", 377 | "1898": "achtzehnhundertachtundneunzig", 378 | "1917": "neunzehnhundertsiebzehn", 379 | "1500": "fünfzehnhundert", 380 | "184.": "einhundertvierundachtzigte", 381 | "1000": "eintausend", 382 | "1821": "achtzehnhunderteinundzwanzig", 383 | "1881": "achtzehnhunderteinundachtzig", 384 | "1857": "achtzehnhundertsiebenundfünfzig", 385 | "1935": "neunzehnhundertfünfunddreißig", 386 | "1908": "neunzehnhundertacht", 387 | "000": "tausend", 388 | "31.": "einunddreißigster", 389 | "27.": "siebenundzwanzigster", 390 | "28.": "achtundzwanzigster", 391 | "400": "vierhundert", 392 | "11.": "elfster", 393 | "65.": "fünfundsechzigster", 394 | "13.": "dreizehnster", 395 | "17.": "siebzehnster", 396 | "24.": "vierundzwanzigster", 397 | "200": "zweihundert", 398 | "100": "einhundert", 399 | "20.": "zwanzigster", 400 | "23.": "dreiundzwanzigster", 401 | "29.": "neunundzwanzigster", 402 | "125": "einhundertfünfundzwanzig", 403 | "600": "sechshundert", 404 | "150": "einhundertfünfzig", 405 | "12.": "zwölfster", 406 | "390": "dreihundertneunzig", 407 | "250": "zweihundertfünfzig", 408 | "21.": "einundzwanzigsten", 409 | "14.": "vierzehnster", 410 | "26.": "sechsundzwanzigster", 411 | "75,": "fünfundsiebzig,", 412 | "22.": "zweiundzwanzigster", 413 | "18.": "achtzehnster", 414 | "10.": "zehnster", 415 | "25.": "fünfundzwanzigster", 416 | "30.": "dreißigster", 417 | "19.": "neunzehnster", 418 | "16.": "sechzehnster", 419 | "39,": "neununddreißig,", 420 | "300": "dreihundert", 421 | "15.": 
"fünfzehnsten", 422 | "212": "zweihundertzwölf", 423 | "109": "einhundertneun", 424 | "118": "einhundertachtzehn", 425 | "184": "einhundertvierundachtzig", 426 | "500": "fünfhundert", 427 | "231": "zweihunderteinunddreißig", 428 | "3.": "dritter", 429 | "1.": "erster", 430 | "52": "zweiundfünfzig", 431 | "10": "zehn", 432 | "2.": "zweite", 433 | "6.": "sechste", 434 | "20": "zwanzig", 435 | "70": "siebzig", 436 | "4.": "vierter", 437 | "51": "einundfünfzig", 438 | "35": "fünfunddreißig", 439 | "48": "achtundvierzig", 440 | "84": "vierundachtzig", 441 | "9.": "neunter", 442 | "30": "dreißig", 443 | "50": "fünfzig", 444 | "40": "vierzig", 445 | "62": "zweiundsechzig", 446 | "80": "achtzig", 447 | "25": "fünfundzwanzig", 448 | "90": "neunzig", 449 | "95": "fünfundneunzig", 450 | "8.": "achter", 451 | "5.": "fünfter", 452 | "11": "elf", 453 | "7.": "siebter", 454 | "19": "neunzehn", 455 | "47": "siebenundvierzig", 456 | "17": "siebzehn", 457 | "15": "fünfzehn", 458 | "31": "einunddreißig", 459 | "36": "sechsunddreißig", 460 | "54": "vierundfünfzig", 461 | "22": "zweiundzwanzig", 462 | "26": "sechsundzwanzig", 463 | "18": "achtzehn", 464 | "60": "sechzig", 465 | "38": "achtunddreißig", 466 | "1": "ein", 467 | "3": "drei", 468 | "4": "vier", 469 | "5": "fünf", 470 | "2": "zwei", 471 | "6": "sechs", 472 | "9": "neun", 473 | "8": "acht", 474 | "IV.": "vier ", 475 | "III.": "drei", 476 | "II.": "zwei ", 477 | "I.": "eins", 478 | "[": " ", 479 | "]": " ", 480 | "*": " ", 481 | "#": " ", 482 | "/": " ", 483 | "â": "a", 484 | "ç": "c" 485 | 486 | } 487 | } 488 | } --------------------------------------------------------------------------------