├── scripts ├── __init__.py ├── createDatasetConfig │ ├── __init__.py │ ├── Sonja.json │ ├── redaer.json │ ├── Bernd_Ungerer_tausendUndEineNacht.json │ ├── Eva.json │ ├── Friedrich.json │ └── Karlsson.json ├── generateAudioStatistic.py └── createDataset.py ├── huiAudioCorpus ├── __init__.py ├── error │ ├── __init__.py │ ├── MatchingNotFoundError.py │ └── DependencyInjectionError.py ├── filter │ ├── __init__.py │ └── AudioFilter.py ├── model │ ├── __init__.py │ ├── GutenbergBook.py │ ├── Credentials.py │ ├── SymbolSentence.py │ ├── PhoneticSentence.py │ ├── Histogram.py │ ├── AudioTranscriptPair.py │ ├── Statistic.py │ ├── PhoneticChars.py │ ├── SentenceAlignment.py │ ├── Transcripts.py │ ├── Sentence.py │ └── Audio.py ├── ui │ ├── __init__.py │ └── Plot.py ├── utils │ ├── __init__.py │ ├── FileListUtil.py │ ├── SecureFTP.py │ ├── DoneMarker.py │ ├── ModelToStringConverter.py │ └── PathUtil.py ├── calculator │ ├── __init__.py │ ├── AlignSentencesIntoTextCalculator.py │ └── TextNormalizer.py ├── components │ ├── __init__.py │ ├── AudioStatisticComponent.py │ └── TextStatisticComponent.py ├── converter │ ├── __init__.py │ ├── StringToSentencesConverter.py │ ├── TranscriptsToSentencesConverter.py │ ├── ListToStatisticConverter.py │ ├── PhoneticSentenceToSymbolSentenceConverter.py │ ├── ListToHistogramConverter.py │ ├── AudioToSentenceConverter.py │ └── SentenceToPhoneticSentenceConverter.py ├── persistenz │ ├── __init__.py │ ├── TranscriptsPersistenz.py │ ├── AudioPersistenz.py │ ├── AudiosFromLibrivoxPersistenz.py │ ├── AudioTranscriptPairPersistenz.py │ └── GutenbergBookPersistenz.py ├── testOutput │ └── __init__.py ├── transformer │ ├── __init__.py │ ├── AudioRemoveSilenceTransformer.py │ ├── TranscriptsSelectionTransformer.py │ ├── AudioLoudnessTransformer.py │ ├── AudioSamplingRateTransformer.py │ ├── SentenceDistanceTransformer.py │ ├── AudioAddSilenceTransformer.py │ ├── AudioFadeTransformer.py │ └── AudioSplitTransformer.py ├── sttInference │ ├── __init__.py │ ├── deepspeechModel │ │ └── __init__.py │ └── README.md ├── dependencyInjection │ ├── __init__.py │ └── DependencyInjection.py ├── workflows │ └── createDatasetWorkflow │ │ ├── __init__.py │ │ ├── Step3_DownloadText.py │ │ ├── Step1_DownloadAudio.py │ │ ├── Step2_1_AudioStatistic.py │ │ ├── Step2_SplitAudio.py │ │ ├── Step4_TranscriptAudio.py │ │ ├── Step7_AudioRawStatistic.py │ │ ├── Step9_GenerateCleanDataset.py │ │ ├── Step6_FinalizeDataset.py │ │ ├── Step5_AlignText.py │ │ ├── Step8_DatasetStatistic.py │ │ ├── Step0_Overview.py │ │ └── Step3_1_PrepareText.py └── enum │ └── PipelineReturnEnum.py ├── .vscode └── settings.json ├── setup.py ├── requirements.txt ├── .gitignore ├── README.md └── LICENSE /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/error/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/filter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/ui/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/calculator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/GutenbergBook.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/testOutput/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/sttInference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/createDatasetConfig/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/dependencyInjection/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/sttInference/deepspeechModel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/ppuchtler/anaconda3/envs/huiAudioCorpus/bin/python" 3 | } -------------------------------------------------------------------------------- /huiAudioCorpus/enum/PipelineReturnEnum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class 
PipelineReturnEnum(Enum): 4 | Ok = 0 5 | OkWithDoneMarker = 1 6 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/Credentials.py: -------------------------------------------------------------------------------- 1 | class Credentials: 2 | 3 | def __init__(self, username: str, password: str): 4 | self.username = username 5 | self.password = password -------------------------------------------------------------------------------- /huiAudioCorpus/sttInference/README.md: -------------------------------------------------------------------------------- 1 | # sttInference 2 | 3 | We can execute speech to text with this project 4 | 5 | 6 | # Copy the model 7 | 8 | Copy the files from: ***** unzipped into the deepspeechModel folder 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="huiAudioCorpus", 5 | version="0.1", 6 | packages=find_packages(), 7 | ) 8 | 9 | # To set up the package for development, run: 10 | # sudo python3 setup.py develop -------------------------------------------------------------------------------- /huiAudioCorpus/model/SymbolSentence.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 3 | 4 | class SymbolSentence(ToString): 5 | def __init__(self, sentence: List[int]): 6 | self.sentence = sentence -------------------------------------------------------------------------------- /huiAudioCorpus/utils/FileListUtil.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | class FileListUtil: 4 | def getFiles(self, path: str, ending: str): 5 | searchPath = path + '/**/*.'
+ ending 6 | files = glob.glob(searchPath, recursive=True) 7 | return files -------------------------------------------------------------------------------- /huiAudioCorpus/model/PhoneticSentence.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 3 | 4 | class PhoneticSentence(ToString): 5 | def __init__(self, sentence: str, subWords: List[str]): 6 | self.sentence = sentence 7 | self.subWords = subWords -------------------------------------------------------------------------------- /huiAudioCorpus/model/Histogram.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | from typing import List, TypeVar 3 | number = TypeVar('number', int, float) 4 | 5 | class Histogram(ToString): 6 | def __init__(self, bins: List[number], values: List[number]): 7 | self.bins = bins 8 | self.values = values 9 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/StringToSentencesConverter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Sentence import Sentence 2 | from textblob import TextBlob 3 | 4 | class StringToSentencesConverter: 5 | def convert(self, text: str): 6 | blob = TextBlob(text) 7 | sentences = [Sentence(str(sentence)) for sentence in blob.sentences] # type: ignore 8 | return sentences -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | matplotlib 3 | pandas 4 | librosa 5 | nptyping 6 | tqdm 7 | textblob 8 | torch 9 | adabound 10 | dependencies 11 | pysftp 12 | h5py 13 | pyyaml==5.3.1 14 | yq 15 | numba==0.48.0 16 | kaldiio 17 | frosch 18 | unidecode 19 | inflect 20 | bs4 21 | natsort 22 | python-Levenshtein 23 | deepspeech 24 | gutenberg 25 | pyloudnorm 26 | pandas_profiling 27 | lxml -------------------------------------------------------------------------------- /huiAudioCorpus/model/AudioTranscriptPair.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | from huiAudioCorpus.model.Sentence import Sentence 3 | from huiAudioCorpus.model.Audio import Audio 4 | 5 | class AudioTranscriptPair(ToString): 6 | 7 | def __init__(self, sentence: Sentence, audio: Audio): 8 | self.sentence = sentence 9 | self.audio = audio -------------------------------------------------------------------------------- /huiAudioCorpus/converter/TranscriptsToSentencesConverter.py: -------------------------------------------------------------------------------- 1 | 2 | from pathlib import Path 3 | from huiAudioCorpus.model.Sentence import Sentence 4 | from huiAudioCorpus.model.Transcripts import Transcripts 5 | 6 | class TranscriptsToSentencesConverter: 7 | def convert(self, transcripts: Transcripts): 8 | texts = transcripts.text 9 | ids = transcripts.keys 10 | sentences = [Sentence(text, Path(id).stem) for text, id in zip(texts, ids)] 11 | return sentences -------------------------------------------------------------------------------- /huiAudioCorpus/model/Statistic.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | 3 | 
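# A hedged usage sketch: a Statistic is normally produced by ListToStatisticConverter
# (defined further below in this repo) rather than constructed by hand, e.g.:
#   stats = ListToStatisticConverter().convert([1.0, 2.0, 4.0])
#   print(stats)   # ToString renders every attribute (count 3, average ~2.33, ...)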
4 | class Statistic(ToString): 5 | def __init__(self, count:int, max:float, min:float, median:float, average:float, sum:float, std: float, var: float): 6 | self.count = count 7 | self.max = max 8 | self.min = min 9 | self.median = median 10 | self.average = average 11 | self.sum = sum 12 | self.std = std 13 | self.var = var -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioRemoveSilenceTransformer.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | from huiAudioCorpus.model.Audio import Audio 3 | 4 | 5 | class AudioRemoveSilenceTransformer: 6 | 7 | def __init__(self, dezibel: int): 8 | self.dezibel = dezibel 9 | 10 | def transform(self, audio: Audio): 11 | newAudioTimeline,_ = librosa.effects.trim(audio.timeSeries, self.dezibel) 12 | newAudio = Audio(newAudioTimeline, audio.samplingRate, audio.id, audio.name) 13 | return newAudio -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/TranscriptsSelectionTransformer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.model.Transcripts import Transcripts 3 | 4 | class TranscriptsSelectionTransformer: 5 | 6 | def transform(self, transcripts: Transcripts, selectedKeys: List[str]): 7 | trans = transcripts.transcripts 8 | transformedTrans = trans[trans[0].isin(selectedKeys)]# type:ignore 9 | transformedTranscripts = Transcripts(transformedTrans, transcripts.id, transcripts.name) 10 | return transformedTranscripts 11 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/PhoneticChars.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | 3 | class PhoneticChars(ToString): 4 | 5 | def __init__(self): 6 | self.chars = ['ˈ', 'a', 'l', 'ə', 's', ' ', 'i', 'ʔ', 'ɛ', 'n', 'd', 'e', 'ː', 'ɐ', '̯', 'v', 't', 'ɪ', 'm', 'j', 'ɔ', 'x', '͡', 'u', ',', 'ʊ', 'z', 'p', 'ʁ', 'o', 'ʃ', 'ç', 'ɡ', '̩', '.', 'k', 'h', 'ˌ', 'f', 'b', 'ŋ', 'y', 'ʏ', 'œ', 'æ', 'ø', '!', 'ʒ', '…', ':', '̍', '?', '̥', '̃', 'r', 'ɑ', 'θ', "'", 'ð', 'ɱ', 'ʙ', 'ɺ', "ˑ", "ɒ",'‿'] 7 | 8 | @property 9 | def countChars(self): 10 | return len(self.chars) 11 | -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioLoudnessTransformer.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Audio import Audio 2 | import pyloudnorm as pyln 3 | 4 | 5 | class AudioLoudnessTransformer: 6 | 7 | def __init__(self, loudness: int): 8 | self.loudness = loudness 9 | 10 | def transform(self, audio: Audio): 11 | meter = pyln.Meter(audio.samplingRate) # create BS.1770 meter 12 | 13 | loudnessNormalizedAudio = pyln.normalize.loudness(audio.timeSeries, audio.loudness, self.loudness) 14 | newAudio = Audio(loudnessNormalizedAudio, audio.samplingRate, audio.id, audio.name) 15 | return newAudio -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step3_DownloadText.py: -------------------------------------------------------------------------------- 1 | 2 | from huiAudioCorpus.persistenz.GutenbergBookPersistenz import GutenbergBookPersistenz 3 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 4 | 5 | class 
Step3_DownloadText: 6 | 7 | def __init__(self, GutenbergBookPersistenz: GutenbergBookPersistenz, savePath: str): 8 | self.savePath = savePath 9 | self.GutenbergBookPersistenz = GutenbergBookPersistenz 10 | 11 | def run(self): 12 | return DoneMarker(self.savePath).run(self.script) 13 | 14 | def script(self): 15 | self.GutenbergBookPersistenz.save() -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step1_DownloadAudio.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from huiAudioCorpus.persistenz.AudiosFromLibrivoxPersistenz import AudiosFromLibrivoxPersistenz 4 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 5 | 6 | 7 | class Step1_DownloadAudio: 8 | 9 | def __init__(self, audiosFromLibrivoxPersistenz: AudiosFromLibrivoxPersistenz, savePath: str): 10 | self.savePath = savePath 11 | self.audiosFromLibrivoxPersistenz = audiosFromLibrivoxPersistenz 12 | 13 | def run(self): 14 | return DoneMarker(self.savePath).run(self.script) 15 | 16 | def script(self): 17 | self.audiosFromLibrivoxPersistenz.save() 18 | 19 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/ListToStatisticConverter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from huiAudioCorpus.model.Statistic import Statistic 3 | 4 | from typing import List, TypeVar 5 | 6 | number = TypeVar('number', int, float) 7 | 8 | class ListToStatisticConverter: 9 | 10 | def convert(self, list: List[number]): 11 | count = len(list) 12 | maximum = max(list) 13 | minimum = min(list) 14 | total = sum(list) 15 | median: float 16 | median = np.median(list) 17 | std = np.std(list) 18 | var = np.var(list) 19 | average = total/count 20 | statistic = Statistic(count,maximum,minimum,median,average,total, std, var) 21 | return statistic 22 | -------------------------------------------------------------------------------- /huiAudioCorpus/filter/AudioFilter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Audio import Audio 2 | from typing import List 3 | 4 | class AudioFilter: 5 | 6 | def __init__(self, maxDuration = None, names: List[str] = None): 7 | 8 | self.maxDuration = float('inf') if maxDuration is None else maxDuration 9 | self.names = names 10 | 11 | 12 | def isAllowed(self, audio: Audio): 13 | if audio.duration >= self.maxDuration: 14 | return False 15 | if self.names is not None and audio.name not in self.names: 16 | return False 17 | return True 18 | 19 | def filter(self, audios: List[Audio]): 20 | filteredAudios = [audio for audio in audios if self.isAllowed(audio)] 21 | return filteredAudios 22 | -------------------------------------------------------------------------------- /huiAudioCorpus/converter/PhoneticSentenceToSymbolSentenceConverter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.PhoneticChars import PhoneticChars 2 | from huiAudioCorpus.model.PhoneticSentence import PhoneticSentence 3 | from huiAudioCorpus.model.SymbolSentence import SymbolSentence 4 | 5 | class PhoneticSentenceToSymbolSentenceConverter: 6 | def __init__(self): 7 | self.symbols = PhoneticChars().chars 8 | self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)} 9 | 10 | def convert(self, phoneticSentence:PhoneticSentence): 11 | sentence = phoneticSentence.sentence 12 | symbols = 
[self.getId(char) for char in sentence] 13 | return SymbolSentence(symbols) 14 | 15 | def getId(self, char): 16 | return self.symbol_to_id[char] +1 -------------------------------------------------------------------------------- /huiAudioCorpus/converter/ListToHistogramConverter.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Histogram import Histogram 2 | from typing import List, TypeVar 3 | 4 | import numpy as np 5 | number = TypeVar('number', int, float) 6 | 7 | class ListToHistogramConverter: 8 | def __init__(self, stepSize: int): 9 | self.stepSize =stepSize 10 | 11 | def convert(self, list: List[number]): 12 | bins = np.arange(round(min(1,min(list)))-1,max(list) + 2*self.stepSize,self.stepSize) 13 | exportBins: List[number] 14 | values : List[number] 15 | valuesNumpy, exportBinsNumpy = np.histogram(list, bins=bins) # type: ignore 16 | exportBins = exportBinsNumpy.tolist()# type: ignore 17 | values = valuesNumpy.tolist()# type: ignore 18 | histogram = Histogram(exportBins[:-1], values) 19 | return histogram -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioSamplingRateTransformer.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | from huiAudioCorpus.model.Audio import Audio 3 | 4 | 5 | class AudioSamplingRateTransformer(): 6 | 7 | def __init__(self, targetSamplingRate: int = None): 8 | self.targetSamplingRate = targetSamplingRate 9 | 10 | def transform(self, audio: Audio ): 11 | if self.targetSamplingRate is None: 12 | return audio 13 | if audio.samplingRate == self.targetSamplingRate: 14 | return audio 15 | audioTimeSeries = audio.timeSeries 16 | samplingRate = audio.samplingRate 17 | resampledTimeSeries = librosa.core.resample(audioTimeSeries, samplingRate, self.targetSamplingRate) 18 | resampledAudio = Audio(resampledTimeSeries, self.targetSamplingRate, audio.id, audio.name) # type:ignore 19 | return resampledAudio 20 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/SentenceAlignment.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 2 | from huiAudioCorpus.model.Sentence import Sentence 3 | 4 | class SentenceAlignment(ToString): 5 | def __init__(self, sourceText: Sentence, alignedText: Sentence, start: int, end: int, distance: float, leftIsPerfekt:bool = False, rightIsPerfekt: bool = False, isFirst : bool = False, isLast: bool = False, isPerfect: bool = False, isSkipped: bool = False): 6 | self.sourceText = sourceText 7 | self.alignedText = alignedText 8 | self.start = start 9 | self.end = end 10 | self.distance = distance 11 | self.leftIsPerfekt = leftIsPerfekt 12 | self.rightIsPerfekt= rightIsPerfekt 13 | self.isFirst = isFirst 14 | self.isLast = isLast 15 | self.isPerfect = isPerfect 16 | self.isSkipped = isSkipped -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/SentenceDistanceTransformer.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Sentence import Sentence 2 | from Levenshtein import distance as LevensteinDistance 3 | 4 | class SentenceDistanceTransformer: 5 | 6 | def transform(self, sentence1: Sentence, sentence2: Sentence): 7 | 8 | baseDistance = self.distanceTwoSentences(sentence1, sentence2) 9 | return 
baseDistance 10 | 11 | 12 | def distanceTwoSentences(self, sentence1: Sentence, sentence2: Sentence): 13 | if sentence1.wordsCount == 0 or sentence2.wordsCount == 0: 14 | return 1 # maximal distance if one of the sentences is empty 15 | 16 | sentenceString1 = "".join(sentence1.wordsWithoutChars) 17 | sentenceString2 = "".join(sentence2.wordsWithoutChars) 18 | 19 | countCharsMax = max(len(sentenceString1), len(sentenceString2)) 20 | diff = LevensteinDistance(sentenceString1, sentenceString2) 21 | distance = diff / countCharsMax 22 | return distance 23 | -------------------------------------------------------------------------------- /huiAudioCorpus/error/MatchingNotFoundError.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | class MatchingNotFoundError(Exception): 5 | 6 | def __init__(self, missingIdsIn1: List[str], missingIdsIn2: List[str], namemissingIdsIn1: str, namemissingIdsIn2: str): 7 | self.missingIdsIn1 = missingIdsIn1 8 | self.missingIdsIn2 = missingIdsIn2 9 | self.namemissingIdsIn1 = namemissingIdsIn1 10 | self.namemissingIdsIn2 = namemissingIdsIn2 11 | 12 | super().__init__(f'Missing ids from matching {self.namemissingIdsIn1} and {self.namemissingIdsIn2}') 13 | 14 | def __str__(self): 15 | return self.getString() 16 | 17 | def getString(self): 18 | string = f'Exception: Missing ids from matching {self.namemissingIdsIn1} and {self.namemissingIdsIn2}\n' 19 | string += f'missing ids in {self.namemissingIdsIn1}: {self.missingIdsIn1}\n' 20 | string += f'missing ids in {self.namemissingIdsIn2}: {self.missingIdsIn2}\n' 21 | return string -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioAddSilenceTransformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from huiAudioCorpus.model.Audio import Audio 3 | 4 | 5 | class AudioAddSilenceTransformer: 6 | 7 | def __init__(self, startDurationSeconds: float, endDurationSeconds: float): 8 | self.startDurationSeconds = startDurationSeconds 9 | self.endDurationSeconds = endDurationSeconds 10 | 11 | def transform(self, audio: Audio): 12 | silenceAudioFront = self.generateSilence(self.startDurationSeconds, audio.samplingRate) 13 | silenceAudioBack = self.generateSilence(self.endDurationSeconds, audio.samplingRate) 14 | newAudio = silenceAudioFront + audio + silenceAudioBack 15 | return newAudio 16 | 17 | def generateSilence(self, duration: float, samplingRate: int): 18 | silenceDataPoints = int(duration * samplingRate) 19 | silence = np.zeros(silenceDataPoints) 20 | silenceAudio = Audio(silence, samplingRate, 's', 's') 21 | return silenceAudio 22 | -------------------------------------------------------------------------------- /scripts/generateAudioStatistic.py: -------------------------------------------------------------------------------- 1 | from numpy import source 2 | from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection 3 | 4 | loadPath = '/media/ppuchtler/LangsameSSD/Projekte/espnet/egs2/HUI_Tacotron/tts1/inferences' 5 | savePath = '/media/ppuchtler/LangsameSSD/Projekte/espnet/egs2/HUI_Tacotron/tts1/hokuspokus_statistic' 6 | 7 | diConfig = { 8 | 'step7_AudioRawStatistic': { 9 | 'savePath': savePath + '/raw', 10 | 'loadPath': loadPath 11 | } 12 | } 13 | DependencyInjection(diConfig).step7_AudioRawStatistic.run() 14 | 15 | diConfig = { 16 | 'step8_DatasetStatistic': { 17 | 'savePath': savePath + '/stats', 18 | 'loadPath': savePath + '/raw/overview.csv', 19 | 
'specialSpeackers': [], 20 | 'filter': None 21 | }, 22 | 'audioPersistenz': { 23 | 'loadPath': '' 24 | }, 25 | 'transcriptsPersistenz': { 26 | 'loadPath': '' 27 | }, 28 | 'plot': { 29 | 'showDuration': 0 30 | } 31 | } 32 | DependencyInjection(diConfig).step8_DatasetStatistic.run() -------------------------------------------------------------------------------- /huiAudioCorpus/error/DependencyInjectionError.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | 4 | class DependencyInjectionError(Exception): 5 | 6 | def __init__(self, exception: Exception, classConfig: Dict[str, str], className: str, requestedClassName: str): 7 | self.exception = exception 8 | self.classConfig = classConfig 9 | self.className = className 10 | self.requestedClassName = requestedClassName 11 | 12 | super().__init__(f'Dependent object {self.className} could not be injected for {self.requestedClassName}') 13 | 14 | def __str__(self): 15 | return self.getString() 16 | 17 | def getString(self): 18 | string = '\n+++++++++++++++++++++++++\n' 19 | string += 'Error during creation of dependencies. Maybe your config is wrong. \n' 20 | string += f'Dependent object "{self.className}" could not be injected for "{self.requestedClassName}" \n' 21 | string += f'with error message: {self.exception} \n' 22 | string += f'config parameters used are: {self.classConfig}\n' 23 | string += '+++++++++++++++++++++++++\n' 24 | return string -------------------------------------------------------------------------------- /huiAudioCorpus/utils/SecureFTP.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.persistenz.CredentialsPersistenz import CredentialsPersistenz 2 | from huiAudioCorpus.utils.PathUtil import PathUtil 3 | import pysftp 4 | 5 | # This class is hard to test. Because the risk is not that high, I decided not to cover this class with automated tests. 
Pascal 6 | class SecureFTP:# pragma: no cover 7 | def __init__(self, pathUtil: PathUtil, server: str, credentialsPersistenz: CredentialsPersistenz): 8 | cnopts = pysftp.CnOpts() 9 | credentials = credentialsPersistenz.load(server) 10 | cnopts.hostkeys = None 11 | self.connection = pysftp.Connection(server, username=credentials.username, password=credentials.password, cnopts=cnopts) 12 | self.pathUtil = pathUtil 13 | 14 | def getFiles(self, path: str): 15 | files = self.connection.listdir(path) 16 | return files 17 | 18 | def copyFile(self, sourcePath: str, targetPath: str): 19 | source = self.connection.open(sourcePath,'rb') 20 | self.pathUtil.copyFileWithStream(source, self.getSize(sourcePath), targetPath)# type:ignore 21 | source.close() 22 | 23 | def getSize(self, sourcePath: str): 24 | stats = self.connection.stat(sourcePath) 25 | size = stats.st_size 26 | return size -------------------------------------------------------------------------------- /huiAudioCorpus/model/Transcripts.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Sentence import Sentence 2 | from typing import List 3 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 4 | from pandas.core.frame import DataFrame 5 | class Transcripts(ToString): 6 | def __init__(self, transcripts: DataFrame, id: str, name: str): 7 | self.transcripts = transcripts 8 | self.id = id 9 | self.name = name 10 | 11 | 12 | @property 13 | def transcriptsCount(self): 14 | return self.transcripts.shape[0] 15 | 16 | @property 17 | def example(self): 18 | return self.transcripts.values[0][0] 19 | 20 | @property 21 | def keys(self) -> List[str]: 22 | #TODO: This is not generalizable at all! We should introduce column labels 23 | return list(self.transcripts[0].values) # type:ignore 24 | 25 | @property 26 | def text(self)-> List[str]: 27 | #TODO: This is not generalizable at all! 
We should introduce column labels 28 | return list(self.transcripts[self.transcripts.columns[-1]].values) # type:ignore 29 | 30 | 31 | def sentences(self) -> List[Sentence]: 32 | sentences = [] 33 | for key, text in zip(self.keys, self.text): 34 | if type(text) == str: 35 | sentences.append(Sentence(text,key)) 36 | return sentences -------------------------------------------------------------------------------- /huiAudioCorpus/ui/Plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from huiAudioCorpus.model.Histogram import Histogram 3 | from huiAudioCorpus.utils.PathUtil import PathUtil 4 | import logging 5 | logging.getLogger('matplotlib.font_manager').disabled = True 6 | logging.getLogger('matplotlib.colorbar').disabled = True 7 | 8 | class Plot: 9 | def __init__(self, showDuration: int, savePath: str = ''): 10 | self.showDuration = showDuration 11 | self.savePath = savePath 12 | self.pathUtil = PathUtil() 13 | 14 | 15 | def histogram(self, histogram:Histogram, name:str, logScaleY = False, logScaleX = False): 16 | plt.clf() 17 | _, ax = plt.subplots() 18 | 19 | 20 | ax.bar(histogram.bins,histogram.values, width=1) # type: ignore 21 | ax.set_ylabel('count') # type: ignore 22 | ax.set_xlabel('bins') # type: ignore 23 | ax.set_title(name) # type: ignore 24 | if logScaleY: 25 | ax.set_yscale('log') 26 | if logScaleX: 27 | ax.set_xscale('log') 28 | 29 | def show(self): 30 | plt.show(block=False) 31 | plt.pause(self.showDuration) 32 | plt.close() 33 | 34 | def save(self, filename: str): 35 | filename = self.savePath + '/' + filename 36 | self.pathUtil.createFolderForFile(filename) 37 | plt.savefig(filename, dpi=200) 38 | -------------------------------------------------------------------------------- /huiAudioCorpus/utils/DoneMarker.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from os import unlink 4 | from os.path import isfile 5 | from huiAudioCorpus.enum.PipelineReturnEnum import PipelineReturnEnum 6 | from huiAudioCorpus.utils.PathUtil import PathUtil 7 | 8 | class DoneMarker: 9 | doneFilename = '.done' 10 | 11 | def __init__(self, path: str): 12 | self.path = path 13 | self.doneFilePath = path + '/' + self.doneFilename 14 | self.pathUtil = PathUtil() 15 | 16 | def isDone(self): 17 | isDone = os.path.exists(self.doneFilePath) 18 | return isDone 19 | 20 | def setDone(self): 21 | self.pathUtil.createFolderForFile(self.doneFilePath) 22 | f = open(self.doneFilePath, "w") 23 | f.write(f'Done at: {datetime.now()}') 24 | f.close() 25 | 26 | def remove(self): 27 | if isfile(self.doneFilePath): 28 | unlink(self.doneFilePath) 29 | 30 | def getInfo(self): 31 | return 'Continue to next step because of done marker.' 
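    # A hedged usage sketch: every workflow step wraps its work in a DoneMarker so
    # finished steps are skipped on re-runs (mirroring e.g. Step1_DownloadAudio):
    #   result = DoneMarker(savePath).run(script)   # runs script() only when no '.done' file exists yet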
32 | 33 | def run(self, script, deleteFolder=True): 34 | if self.isDone(): 35 | print(self.getInfo()) 36 | return PipelineReturnEnum.OkWithDoneMarker 37 | 38 | if deleteFolder: 39 | self.pathUtil.deleteFolder(self.path) 40 | 41 | script() 42 | 43 | self.setDone() 44 | return PipelineReturnEnum.Ok -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioFadeTransformer.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Audio import Audio 2 | import numpy as np 3 | 4 | class AudioFadeTransformer: 5 | 6 | def __init__(self, fadeInDuration: float = 0.1, fadeOutDuration: float = 0.1): 7 | self.fadeInDuration = fadeInDuration 8 | self.fadeOutDuration = fadeOutDuration 9 | 10 | def transform(self, audio: Audio): 11 | audio = self.fadeOut(audio) 12 | audio = self.fadeIn(audio) 13 | return audio 14 | 15 | 16 | def fadeOut(self, audio: Audio) -> Audio: 17 | countOfSamples = int(self.fadeOutDuration * audio.samplingRate) 18 | end = audio.samples 19 | start = end - countOfSamples 20 | 21 | # compute fade out curve 22 | # linear fade 23 | fade_curve = np.linspace(1.0, 0.0, countOfSamples) 24 | 25 | # apply the curve 26 | audio.timeSeries[start:end] = audio.timeSeries[start:end] * fade_curve 27 | return audio 28 | 29 | def fadeIn(self, audio: Audio) -> Audio: 30 | countOfSamples = int(self.fadeInDuration * audio.samplingRate) 31 | end = countOfSamples 32 | start = 0 33 | 34 | # compute fade in curve 35 | # linear fade 36 | fade_curve = np.linspace(0.0, 1.0, countOfSamples) 37 | 38 | # apply the curve 39 | audio.timeSeries[start:end] = audio.timeSeries[start:end] * fade_curve 40 | return audio -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step2_1_AudioStatistic.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 2 | from huiAudioCorpus.components.AudioStatisticComponent import AudioStatisticComponent 3 | from huiAudioCorpus.ui.Plot import Plot 4 | 5 | 6 | class Step2_1_AudioStatistic: 7 | def __init__(self, savePath: str, audioStatisticComponent: AudioStatisticComponent, plot: Plot): 8 | self.savePath = savePath 9 | self.audioStatisticComponent = audioStatisticComponent 10 | self.plot = plot 11 | 12 | def run(self): 13 | doneMarker = DoneMarker(self.savePath) 14 | result = doneMarker.run(self.script, deleteFolder=False) 15 | return result 16 | 17 | def script(self): 18 | statistics, rawData = self.audioStatisticComponent.run() 19 | 20 | self.plot.histogram(statistics['duration']['histogram'], statistics['duration']['description']) 21 | self.plot.save('audioLength') 22 | self.plot.show() 23 | 24 | with open(self.savePath + '/statistic.txt', 'w') as textFile: 25 | for statistic in statistics.values(): 26 | print(statistic['description']) 27 | print(statistic['statistic']) 28 | textFile.write(statistic['description']) 29 | textFile.write('\n') 30 | textFile.write(str(statistic['statistic'])) 31 | textFile.write('\n') 32 | -------------------------------------------------------------------------------- /huiAudioCorpus/utils/ModelToStringConverter.py: -------------------------------------------------------------------------------- 1 | classHighlither = '###' 2 | endOfClass = '____' 3 | 4 | class ToString(): 5 | def __str__(self): 6 | return ModelToStringConverter().convert(self) # pragma: no cover 7 | 8 | class 
ModelToStringConverter: 9 | def convert(self, model): 10 | strings = [] 11 | strings.append(self.getClassText(model)) 12 | strings.append('') 13 | attributes = self.getAllAttributes(model) 14 | strings.extend(self.getMethodText(model, attr) for attr in attributes) 15 | strings.append(endOfClass) 16 | string = '\n'.join(strings) 17 | return string 18 | 19 | def getClassText(self, model): 20 | string = classHighlither + ' ' + model.__class__.__name__ + ' ' + classHighlither 21 | return string 22 | 23 | def getAllAttributes(self, model): 24 | attr: str 25 | allAttributes = dir(model) 26 | allAttributes = [attr for attr in allAttributes if not attr.startswith('__')] 27 | return allAttributes 28 | 29 | def getMethodText(self, model, methodName: str): 30 | value = getattr(model, methodName) 31 | valueString = self.getValueText(value) 32 | string = methodName + ' ' + str(type(value)) + ': ' + valueString 33 | return string 34 | 35 | def getValueText(self, value): 36 | if isinstance(value, float): 37 | return str(round(value, 2)) 38 | 39 | string = str(value) 40 | if len(string) > 20: 41 | return string[:20] + ' ...' 42 | return string -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/TranscriptsPersistenz.py: -------------------------------------------------------------------------------- 1 | from pandas.core.frame import DataFrame 2 | from huiAudioCorpus.model.Transcripts import Transcripts 3 | from huiAudioCorpus.utils.FileListUtil import FileListUtil 4 | from huiAudioCorpus.utils.PathUtil import PathUtil 5 | import pandas as pd 6 | 7 | class TranscriptsPersistenz: 8 | def __init__(self, loadPath: str, savePath: str = None, fileExtension: str = 'csv'): 9 | self.savePath = loadPath if savePath is None else savePath 10 | self.loadPath = loadPath 11 | self.fileExtension = fileExtension 12 | self.fileListUtil = FileListUtil() 13 | self.pathUtil = PathUtil() 14 | 15 | def getIds(self): 16 | transcriptsFiles = self.fileListUtil.getFiles(self.loadPath, self.fileExtension) 17 | transcriptsFiles = [file.replace(self.loadPath, '')[1:-len(self.fileExtension)-1] for file in transcriptsFiles] 18 | return transcriptsFiles 19 | 20 | def load(self, id: str): 21 | targetPath = self.loadPath + '/' + id + '.' + self.fileExtension 22 | csv: DataFrame 23 | csv = pd.read_csv(targetPath, sep='|', header=None) # type: ignore 24 | name = self.pathUtil.filenameWithoutExtension(targetPath) 25 | transcripts = Transcripts(csv, id, name) 26 | return transcripts 27 | 28 | def save(self, transcripts: Transcripts): 29 | targetPath = self.savePath + '/' + transcripts.id + '.' + self.fileExtension 30 | self.pathUtil.createFolderForFile(targetPath) 31 | trans = transcripts.transcripts 32 | trans.to_csv(targetPath, sep='|', header=None, index=False) # type: ignore 33 | 34 | def loadAll(self): 35 | ids = self.getIds() 36 | for id in ids: 37 | yield self.load(id) -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Sonja.json: -------------------------------------------------------------------------------- 1 | { 2 | "deutschland_ein_wintermrchen": { 3 | "title": "deutschland_ein_wintermrchen", 4 | "LibrivoxBookName": "Deutschland. 
Ein Wintermärchen", 5 | "GutenbergId": 6079, 6 | "GutenbergStart": "VORWORT", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "1844":"achtzehnhundertvierundvierzig", 10 | "17.":"siebzehnsten", 11 | "***":"Punkt Punkt Punkt", 12 | "CAPUT I\n":"CAPUT eins", 13 | "CAPUT II\n":"CAPUT zwei", 14 | "CAPUT III\n":"CAPUT drei", 15 | "CAPUT IV\n":"CAPUT vier", 16 | "CAPUT V\n":"CAPUT fünf", 17 | "CAPUT VI\n":"CAPUT sechs", 18 | "CAPUT VII\n":"CAPUT sieben", 19 | "CAPUT VIII\n":"CAPUT acht", 20 | "CAPUT IX\n":"CAPUT neun", 21 | "CAPUT X\n":"CAPUT zehn", 22 | "CAPUT XI\n":"CAPUT elf", 23 | "CAPUT XII\n":"CAPUT zwölf", 24 | "CAPUT XIII\n":"CAPUT dreizehn", 25 | "CAPUT XIV\n":"CAPUT vierzehn", 26 | "CAPUT XV\n":"CAPUT fünfzehn", 27 | "CAPUT XVI\n":"CAPUT sechszehn", 28 | "CAPUT XVII\n":"CAPUT siebzehn", 29 | "CAPUT XVIII\n":"CAPUT achtzehn", 30 | "CAPUT XIX\n":"CAPUT neunzehn", 31 | "CAPUT XX\n":"CAPUT zwanzig", 32 | "CAPUT XXI\n":"CAPUT einundzwanzig", 33 | "CAPUT XXII\n":"CAPUT zweiundzwanzig", 34 | "CAPUT XXIII\n":"CAPUT dreiundzwanzig", 35 | "CAPUT XXIV\n":"CAPUT vierundzwanzig", 36 | "CAPUT XXV\n":"CAPUT fünfunzwanzig", 37 | "CAPUT XXVI\n":"CAPUT sechundzwanzig", 38 | "CAPUT XXVII\n":"CAPUT siebenundzwanzig" 39 | } 40 | } 41 | } -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step2_SplitAudio.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from typing import List 4 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 5 | from huiAudioCorpus.transformer.AudioSplitTransformer import AudioSplitTransformer 6 | from huiAudioCorpus.transformer.AudioLoudnessTransformer import AudioLoudnessTransformer 7 | from huiAudioCorpus.model.Audio import Audio 8 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 9 | from joblib import Parallel, delayed 10 | 11 | class Step2_SplitAudio: 12 | 13 | def __init__(self, audioSplitTransformer:AudioSplitTransformer , audioPersistenz: AudioPersistenz, savePath: str, bookName: str, audioLoudnessTransformer: AudioLoudnessTransformer, remapSort: List[int] = None): 14 | self.audioPersistenz = audioPersistenz 15 | self.savePath = savePath 16 | self.audioSplitTransformer = audioSplitTransformer 17 | self.bookName = bookName 18 | self.audioLoudnessTransformer = audioLoudnessTransformer 19 | self.remapSort = remapSort 20 | 21 | def run(self): 22 | return DoneMarker(self.savePath).run(self.script) 23 | 24 | def script(self): 25 | audios = self.audioPersistenz.loadAll() 26 | if self.remapSort: 27 | audios = list(audios) 28 | audios = [audios[i] for i in self.remapSort] 29 | 30 | Parallel(n_jobs=1, verbose=10, batch_size= 100)(delayed(self.splitOneAudio)(audio, index) for index, audio in enumerate(audios)) 31 | 32 | 33 | def splitOneAudio(self, audio: Audio, index:int): 34 | splittedAudios = self.audioSplitTransformer.transform(audio, self.bookName, index+1) 35 | for splitAudio in splittedAudios: 36 | loudnessAudio = self.audioLoudnessTransformer.transform(splitAudio) 37 | self.audioPersistenz.save(loudnessAudio) -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/AudioPersistenz.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import soundfile 3 | from huiAudioCorpus.model.Audio import Audio 4 | from nptyping import NDArray 5 | from huiAudioCorpus.utils.FileListUtil import FileListUtil 6 | from 
huiAudioCorpus.utils.PathUtil import PathUtil 7 | from natsort import natsorted 8 | 9 | class AudioPersistenz: 10 | def __init__(self, loadPath:str, savePath: str = None , fileExtension:str = 'wav'): 11 | self.savePath = loadPath if savePath is None else savePath 12 | self.loadPath = loadPath 13 | self.fileExtension = fileExtension 14 | self.fileListUtil = FileListUtil() 15 | self.pathUtil = PathUtil() 16 | 17 | def load(self, id: str): 18 | audioTimeSeries: NDArray 19 | samplingRate: int 20 | targetPath = self.loadPath +'/' + id + '.' + self.fileExtension 21 | name = self.pathUtil.filenameWithoutExtension(targetPath) 22 | audioTimeSeries, samplingRate = librosa.core.load(targetPath, sr=None) # type: ignore 23 | audio = Audio(audioTimeSeries, samplingRate, id, name) 24 | return audio 25 | 26 | def save(self, audio: Audio): 27 | targetPath = self.savePath + '/' + audio.id + '.wav' 28 | self.pathUtil.createFolderForFile(targetPath) 29 | audioTimeSeries = audio.timeSeries 30 | samplingRate = audio.samplingRate 31 | soundfile.write(targetPath, audioTimeSeries, samplingRate) 32 | 33 | def getNames(self): 34 | names = [self.transformIdToName(id) for id in self.getIds()] 35 | return names 36 | 37 | def getIds(self): 38 | audioFiles = self.fileListUtil.getFiles(self.loadPath, self.fileExtension) 39 | audioFiles = [file.replace(self.loadPath,'')[1:-len(self.fileExtension)-1] for file in audioFiles] 40 | audioFiles = natsorted(audioFiles) 41 | 42 | return audioFiles 43 | 44 | def loadAll(self): 45 | ids = self.getIds() 46 | for id in ids: 47 | yield self.load(id) 48 | 49 | def transformIdToName(self, id: str): 50 | return self.pathUtil.filenameWithoutExtension(id) -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step4_TranscriptAudio.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import count 3 | from typing import List 4 | from huiAudioCorpus.model.Audio import Audio 5 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 6 | from pandas.core.frame import DataFrame 7 | from huiAudioCorpus.model.Transcripts import Transcripts 8 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 9 | from huiAudioCorpus.converter.AudioToSentenceConverter import AudioToSentenceConverter 10 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 11 | from tqdm import tqdm 12 | import numpy as np 13 | from joblib import Parallel, delayed 14 | 15 | class Step4_TranscriptAudio: 16 | 17 | def __init__(self, savePath: str, audioToSentenceConverter: AudioToSentenceConverter, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, numberWorker = 4): 18 | self.savePath = savePath 19 | self.audioToSentenceConverter = audioToSentenceConverter 20 | self.audioPersistenz = audioPersistenz 21 | self.transcriptsPersistenz = transcriptsPersistenz 22 | self.numberWorker = numberWorker 23 | 24 | 25 | def run(self): 26 | return DoneMarker(self.savePath).run(self.script) 27 | 28 | def script(self): 29 | ids = self.audioPersistenz.getIds() 30 | chunks = np.array_split(ids, self.numberWorker) 31 | 32 | parallelResult = Parallel(n_jobs=self.numberWorker)(delayed(self.loadOneChunk)(audioIds, chunkId) for chunkId, audioIds in enumerate(chunks)) 33 | 34 | results = [[sentence.id, sentence.sentence] for level in parallelResult for sentence in level] 35 | 36 | csv = DataFrame(results) 37 | transcripts = Transcripts(csv, 'transcripts', 
'transcripts') 38 | self.transcriptsPersistenz.save(transcripts) 39 | 40 | def loadOneChunk(self, ids: List[str], chunkId: int): 41 | sentences = [] 42 | for id in tqdm(ids, desc="Chunk " + str(chunkId) + ": "): 43 | audio = self.audioPersistenz.load(id) 44 | sentence = self.audioToSentenceConverter.convert(audio) 45 | sentences.append(sentence) 46 | return sentences 47 | -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step7_AudioRawStatistic.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 2 | from huiAudioCorpus.utils.PathUtil import PathUtil 3 | import pandas as pd 4 | import os 5 | 6 | 7 | class Step7_AudioRawStatistic: 8 | def __init__(self, savePath: str, loadPath: str, pathUtil: PathUtil): 9 | self.savePath = savePath 10 | self.pathUtil = pathUtil 11 | self.loadPath = loadPath 12 | 13 | def run(self): 14 | doneMarker = DoneMarker(self.savePath) 15 | result = doneMarker.run(self.script, deleteFolder=False) 16 | return result 17 | 18 | def script(self): 19 | from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection 20 | speackers = os.listdir(self.loadPath) 21 | audioInfos = [] 22 | for speacker in speackers: 23 | if speacker == '.done': 24 | continue 25 | print('finalSummary: ' + speacker) 26 | loadPath = self.loadPath + '/' + speacker 27 | savePath = self.savePath + '/' + speacker 28 | saveFile = savePath + '/overview.csv' 29 | self.pathUtil.createFolderForFile(saveFile) 30 | localDoneMarker = DoneMarker(savePath) 31 | if localDoneMarker.isDone(): 32 | rawDataAudio = pd.read_csv(saveFile, sep='|' , index_col='id') 33 | else: 34 | diConfig = { 35 | 'audioPersistenz': { 36 | 'loadPath': loadPath, 37 | } 38 | } 39 | rawDataAudio = DependencyInjection(diConfig).audioStatisticComponent.loadAudioFiles() 40 | rawDataAudio['speacker'] = speacker 41 | 42 | diConfig = { 43 | 'transcriptsPersistenz': { 44 | 'loadPath': loadPath, 45 | } 46 | } 47 | rawDataText = DependencyInjection(diConfig).textStatisticComponent.loadTextFiles() 48 | rawData = rawDataAudio.merge(rawDataText, how='outer', on='id' ) 49 | rawData.to_csv(saveFile , sep='|') 50 | 51 | localDoneMarker.setDone() 52 | 53 | audioInfos.append(rawDataAudio) 54 | 55 | audio = pd.concat(audioInfos) 56 | audio.to_csv(self.savePath + '/overview.csv', sep='|') -------------------------------------------------------------------------------- /huiAudioCorpus/components/AudioStatisticComponent.py: -------------------------------------------------------------------------------- 1 | 2 | from huiAudioCorpus.model.Audio import Audio 3 | from pandas.core.frame import DataFrame 4 | from huiAudioCorpus.converter.ListToHistogramConverter import ListToHistogramConverter 5 | from huiAudioCorpus.converter.ListToStatisticConverter import ListToStatisticConverter 6 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 7 | from joblib import Parallel, delayed 8 | 9 | class AudioStatisticComponent: 10 | def __init__(self, audioPersistenz: AudioPersistenz, listToStatisticConverter:ListToStatisticConverter, listToHistogramConverter: ListToHistogramConverter): 11 | self.audioPersistenz = audioPersistenz 12 | self.listToStatisticConverter = listToStatisticConverter 13 | self.listToHistogramConverter = listToHistogramConverter 14 | self.columns = ['id','duration', 'loudness', 'minSilenceDB', 'samplingrate', 'silencePercent', 'averageFrequency' ] 
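        # NOTE: getStatistic() below pairs these columns positionally with its
        # descriptions list, so the two orders must stay in sync.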
15 | 16 | def run(self): 17 | rawData = self.loadAudioFiles() 18 | return self.getStatistic(rawData) 19 | 20 | def getStatistic(self, rawData): 21 | descriptions = ['Length in seconds', 'Loudness in DB', 'Minimum silence in DB', 'Samplingrate in Hz', 'Silence in percent', 'Average Frequency in Hz'] 22 | statistics = {} 23 | for column in rawData: 24 | if column not in self.columns: 25 | continue 26 | statistics[column] = { 27 | 'name': column, 28 | 'statistic': self.listToStatisticConverter.convert(rawData[column].tolist()), 29 | 'histogram': self.listToHistogramConverter.convert(rawData[column].tolist()), 30 | 'description': descriptions[len(statistics)] 31 | } 32 | 33 | return statistics, rawData 34 | 35 | def loadAudioFiles(self): 36 | result = Parallel(n_jobs=12, verbose=10, batch_size=100)(delayed(self.loadAudio)(audio) for audio in self.audioPersistenz.getIds()) 37 | rawData = DataFrame(result, columns=self.columns) 38 | rawData = rawData.set_index('id') 39 | return rawData 40 | 41 | def loadAudio(self, audioId: str): 42 | audio = self.audioPersistenz.load(audioId) 43 | return [audio.id.split("\\")[-1].split("/")[-1], round(audio.duration, 1), round(audio.loudness, 1), round(audio.silenceDB, 1), audio.samplingRate, round(audio.silencePercent * 100), round(audio.averageFrequency)] 44 | -------------------------------------------------------------------------------- /huiAudioCorpus/model/Sentence.py: -------------------------------------------------------------------------------- 1 | from os import error 2 | from textblob import TextBlob 3 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 4 | from typing import List 5 | 6 | class Sentence(ToString): 7 | def __init__(self, sentence: str, id: str = ''): 8 | sentence = self.cleanSpaces(sentence) 9 | sentence = self.cleanSpacesPunctuation(sentence) 10 | 11 | self.sentence = sentence 12 | self.id = id 13 | 14 | textBlob = TextBlob(self.sentence.replace('.', ' . 
')) 15 | self.words = self.generateWords(textBlob) 16 | self.wordsWithoutChars: List[str] = [word.lower() for word in textBlob.words] # type: ignore 17 | self.wordsWithoutCharsAndUpperChars: List[str] = [word for word in textBlob.words] # type: ignore 18 | self.wordsCount = len(self.wordsWithoutChars) 19 | self.charCount = len(self.sentence) 20 | self.wordsMatchingWithChars = self.generateWordsMatchingWithChars(self.words, self.wordsWithoutChars) 21 | self.rawChars = "".join(self.wordsWithoutChars) 22 | 23 | def generateWords(self, textBlob: TextBlob): 24 | words = list(textBlob.tokenize()) 25 | return words 26 | 27 | def __getitem__(self, k): 28 | return Sentence(" ".join(self.wordsMatchingWithChars[k])) 29 | 30 | 31 | def generateWordsMatchingWithChars(self, words: List[str], wordsWithoutChars: List[str]): 32 | wordMatching = [] 33 | wordPointer = 0 34 | for word in words: 35 | if wordPointer < len(wordsWithoutChars) and word.lower() == wordsWithoutChars[wordPointer]: # guard reconstructed; the original condition was garbled in this dump 36 | wordMatching.append(word) 37 | wordPointer += 1 38 | else: 39 | if len(wordMatching[-1]) > 1000: 40 | print(wordMatching[-1]) 41 | raise Exception("Problems during creation of word matchings.") 42 | wordMatching[-1] += ' ' + word 43 | return wordMatching 44 | 45 | 46 | def cleanSpaces(self, text: str): 47 | text = text.replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ') # collapse runs of spaces 48 | return text 49 | 50 | def cleanSpacesPunctuation(self, text: str): 51 | punctuations = '.,;?!:"' 52 | for char in punctuations: 53 | text = text.replace(char, char + ' ') 54 | for char in punctuations: 55 | text = text.replace(' ' + char, char) 56 | text = text.replace('  ', ' ').replace('  ', ' ') 57 | return text.strip() -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step9_GenerateCleanDataset.py: -------------------------------------------------------------------------------- 1 | 2 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 3 | from huiAudioCorpus.transformer.TranscriptsSelectionTransformer import TranscriptsSelectionTransformer 4 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 5 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 6 | from huiAudioCorpus.transformer.AudioSamplingRateTransformer import AudioSamplingRateTransformer 7 | from tqdm.std import tqdm 8 | import pandas as pd 9 | 10 | class Step9_GenerateCleanDataset: 11 | 12 | def __init__(self, savePath: str, infoFile: str, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, audioSamplingRateTransformer: AudioSamplingRateTransformer, transcriptsSelectionTransformer: TranscriptsSelectionTransformer, filter): 13 | self.audioSamplingRateTransformer = audioSamplingRateTransformer 14 | self.audioPersistenz = audioPersistenz 15 | self.transcriptsPersistenz = transcriptsPersistenz 16 | self.transcriptsSelectionTransformer = transcriptsSelectionTransformer 17 | self.savePath = savePath 18 | self.infoFile = infoFile 19 | self.filter = filter 20 | 21 | def run(self): 22 | doneMarker = DoneMarker(self.savePath) 23 | result = doneMarker.run(self.script, deleteFolder=False) 24 | return result 25 | 26 | def script(self): 27 | df = pd.read_csv(self.infoFile, sep='|', index_col=0) 28 | try: 29 | df = df.set_index('id') 30 | except KeyError: 31 | pass 32 | 33 | print('Audios before: ', df.shape[0]) 34 | filteredAudios = self.filter(df) 35 | print('Audios after: ', filteredAudios.shape[0]) 36 | audiosAllowed = filteredAudios.index.tolist() 37 | 38 | self.copyAudioFiles(audiosAllowed) 39 | self.copyAndFilterTranscripts(audiosAllowed) 40 | 41 | 42 | 43 | 44 | def copyAudioFiles(self, audiosAllowed): 
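        # Streams every clip via loadAll() and re-saves (into the configured savePath)
        # only those whose name survived the filter above.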
45 | countFiles = len(self.audioPersistenz.getIds()) 46 | for audio in tqdm(self.audioPersistenz.loadAll(), total= countFiles): 47 | if audio.name in audiosAllowed: 48 | self.audioPersistenz.save(audio) 49 | 50 | def copyAndFilterTranscripts(self, usedAudioFileNames): 51 | for transcripts in tqdm(self.transcriptsPersistenz.loadAll()): 52 | filteredTranscript = self.transcriptsSelectionTransformer.transform(transcripts, usedAudioFileNames) 53 | self.transcriptsPersistenz.save(filteredTranscript) -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step6_FinalizeDataset.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.transformer.TranscriptsSelectionTransformer import TranscriptsSelectionTransformer 2 | import pandas as pd 3 | from huiAudioCorpus.model.Audio import Audio 4 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 5 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 6 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 7 | from tqdm import tqdm 8 | 9 | class Step6_FinalizeDataset: 10 | 11 | def __init__(self, savePath: str,chapterPath: str, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, transcriptsSelectionTransformer: TranscriptsSelectionTransformer): 12 | self.savePath = savePath 13 | self.audioPersistenz = audioPersistenz 14 | self.transcriptsPersistenz = transcriptsPersistenz 15 | self.chapterPath = chapterPath 16 | self.transcriptsSelectionTransformer = transcriptsSelectionTransformer 17 | 18 | 19 | def run(self): 20 | doneMarker = DoneMarker(self.savePath) 21 | result = doneMarker.run(self.script, deleteFolder=False) 22 | return result 23 | 24 | def script(self): 25 | transcriptsIterator = list(self.transcriptsPersistenz.loadAll()) 26 | transcripts = transcriptsIterator[0] 27 | transcriptsIds = [sentence.id for sentence in transcripts.sentences()] 28 | chapters = pd.read_csv(self.chapterPath) 29 | 30 | transcriptsSelectedIds = {} 31 | 32 | ids = self.audioPersistenz.getIds() 33 | audios = self.audioPersistenz.loadAll() 34 | audio: Audio 35 | for audio in tqdm(audios, total=len(ids)): 36 | book, chapter, index = audio.id.rsplit('_',2) 37 | reader:str = chapters.loc[int(chapter)-1]['Reader'] # type:ignore 38 | reader = reader.replace(' ', '_') 39 | if audio.id in transcriptsIds: 40 | path = reader + '/' + book 41 | if path in transcriptsSelectedIds: 42 | transcriptsSelectedIds[path].append(audio.id) 43 | else: 44 | transcriptsSelectedIds[path] = [audio.id] 45 | audio.id = path + '/wavs/' + audio.id 46 | self.audioPersistenz.save(audio) 47 | for path, ids in transcriptsSelectedIds.items(): 48 | localTranscripts = self.transcriptsSelectionTransformer.transform(transcripts, ids) 49 | localTranscripts.id = path + '/metadata' 50 | self.transcriptsPersistenz.save(localTranscripts) -------------------------------------------------------------------------------- /huiAudioCorpus/converter/AudioToSentenceConverter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 3 | from huiAudioCorpus.transformer.AudioSamplingRateTransformer import AudioSamplingRateTransformer 4 | from huiAudioCorpus.model.Audio import Audio 5 | from huiAudioCorpus.model.Sentence import Sentence 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | try: 10 | 
from deepspeech import Model 11 | except Exception: 12 | print('failed to load deepspeech; if you need it, try installing it') 13 | 14 | from huiAudioCorpus.sttInference import deepspeechModel 15 | 16 | class AudioToSentenceConverter: 17 | def __init__(self): 18 | self.modelPath = deepspeechModel.__path__[0] 19 | self.model = None 20 | 21 | 22 | def convert(self, audio: Audio, samplingRate: int = 15000): # note: samplingRate is currently unused; the loaded model's own sampling rate is applied instead 23 | if self.model is None: 24 | self.model, self.samplingRate = self.loadDeepspeech(self.modelPath) 25 | audioSamplingRateTransformer = AudioSamplingRateTransformer(self.samplingRate) 26 | audioSampled = audioSamplingRateTransformer.transform(audio) 27 | timeSeries = audioSampled.timeSeries 28 | timeSeries /= 1.414 # 1.414 ~ sqrt(2): headroom so the scaled signal cannot clip 29 | timeSeries *= 32767 # scale the float waveform to the int16 range deepspeech expects 30 | audioNumpy = timeSeries.astype(np.int16) 31 | 32 | transcript = self.model.stt(audioNumpy) 33 | sentence = Sentence(transcript, audio.id) 34 | return sentence 35 | 36 | def loadDeepspeech(self, modelPath: str): 37 | model = Model(modelPath+"/output_graph.pb") 38 | model.enableExternalScorer(modelPath+"/kenlm.scorer") 39 | desiredSamplingRate = model.sampleRate() 40 | return model, desiredSamplingRate 41 | 42 | 43 | if __name__ == "__main__": 44 | import librosa 45 | path = '/media/ppuchtler/LangsameSSD/Projekte/textToSpeech/datasetWorkflow/Step2_SplitAudio/audio/' 46 | 47 | addAudio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000177') 48 | audio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000077') 49 | 50 | audio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000030') 51 | audio1 = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000105') 52 | audio = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000166') 53 | 54 | #audioRemove = AudioPersistenz(path).load('acht_gesichter_am_biwasee_01_f000001') 55 | #audio = AudioAddSilenceTransformer(10, 10).transform(audio) 56 | #audio = audio + audio 57 | 58 | converter = AudioToSentenceConverter() 59 | transcript = converter.convert(addAudio + audio + addAudio) 60 | 61 | print(transcript.sentence) -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step5_AlignText.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.Transcripts import Transcripts 2 | from pandas.core.frame import DataFrame 3 | from huiAudioCorpus.model.Sentence import Sentence 4 | from huiAudioCorpus.calculator.AlignSentencesIntoTextCalculator import AlignSentencesIntoTextCalculator 5 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 6 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 7 | 8 | class Step5_AlignText: 9 | 10 | def __init__(self, savePath: str, alignSentencesIntoTextCalculator: AlignSentencesIntoTextCalculator, transcriptsPersistenz: TranscriptsPersistenz, textToAlignPath: str): 11 | self.savePath = savePath 12 | self.alignSentencesIntoTextCalculator = alignSentencesIntoTextCalculator 13 | self.transcriptsPersistenz = transcriptsPersistenz 14 | self.textToAlignPath = textToAlignPath 15 | 16 | def run(self): 17 | doneMarker = DoneMarker(self.savePath) 18 | result = doneMarker.run(self.script, deleteFolder=False) 19 | return result 20 | 21 | def script(self): 22 | transcripts = list(self.transcriptsPersistenz.loadAll()) 23 | sentences = transcripts[0].sentences() 24 | with open(self.textToAlignPath, 'r', encoding='utf8') as f: 25 | inputText = f.read() 26 | inputSentence = Sentence(inputText) 27 | 28 |
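# (annotation) What follows aligns each STT transcript back into the full book
# text: AlignSentencesIntoTextCalculator searches a sliding word window for the
# best match, marks alignments whose distance exceeds 0.2 as skipped, and only
# alignments flagged isPerfect (seamless left and right boundaries with their
# neighbours) are written to the final transcripts file; the rest end up in
# transcriptsNotPerfect.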
alignments = self.alignSentencesIntoTextCalculator.calculate(inputSentence,sentences ) 29 | notPerfektAlignments = [align for align in alignments if not align.isPerfect and not align.isSkipped] 30 | for align in notPerfektAlignments: 31 | print('------------------') 32 | print(align.sourceText.id) 33 | print(align.alignedText.sentence) 34 | print(align.sourceText.sentence) 35 | print(align.leftIsPerfekt) 36 | print(align.rightIsPerfekt) 37 | print(align.distance) 38 | 39 | print("notPerfektAlignments Percent",len(notPerfektAlignments)/len(alignments)*100) 40 | 41 | results = [[align.sourceText.id, align.alignedText.sentence]for align in alignments if align.isPerfect] 42 | 43 | csv = DataFrame(results) 44 | transcripts = Transcripts(csv, 'transcripts', 'transcripts') 45 | self.transcriptsPersistenz.save(transcripts) 46 | 47 | resultsNotPerfect = [[align.sourceText.id, align.alignedText.sentence]for align in alignments if not align.isPerfect] 48 | 49 | csv = DataFrame(resultsNotPerfect) 50 | transcripts = Transcripts(csv, 'transcriptsNotPerfect', 'transcriptsNotPerfect') 51 | self.transcriptsPersistenz.save(transcripts) 52 | -------------------------------------------------------------------------------- /scripts/createDatasetConfig/redaer.json: -------------------------------------------------------------------------------- 1 | { 2 | "rmische_geschichte_buch_1": { 3 | "title": "rmische_geschichte_buch_1", 4 | "LibrivoxBookName": "Römische Geschichte Buch 1", 5 | "GutenbergId": 3060, 6 | "GutenbergStart": "Vorrede zu der zweiten Auflage\n\n\n", 7 | "GutenbergEnd": "", 8 | "remove": [{ 9 | 10 | "start": "\n——————————————————", 11 | "end": "\n——————————————————" 12 | } 13 | ], 14 | "textReplacement": { 15 | 16 | } 17 | }, 18 | "rmische_geschichte_buch_2": { 19 | "title": "rmische_geschichte_buch_2", 20 | "LibrivoxBookName": "Römische Geschichte Buch 2", 21 | "GutenbergId": 3061, 22 | "GutenbergStart": "", 23 | "GutenbergEnd": "", 24 | "textReplacement": {} 25 | }, 26 | "rmische_geschichte_buch_3": { 27 | "title": "rmische_geschichte_buch_3", 28 | "LibrivoxBookName": "Römische Geschichte Buch 3", 29 | "GutenbergId": 3062, 30 | "GutenbergStart": "", 31 | "GutenbergEnd": "", 32 | "textReplacement": {} 33 | }, 34 | "rmische_geschichte_buch_4": { 35 | "title": "rmische_geschichte_buch_4", 36 | "LibrivoxBookName": "Römische Geschichte Buch 4", 37 | "GutenbergId": 3063, 38 | "GutenbergStart": "", 39 | "GutenbergEnd": "", 40 | "textReplacement": {} 41 | }, 42 | "rmische_geschichte_buch_5": { 43 | "title": "rmische_geschichte_buch_5", 44 | "LibrivoxBookName": "Römische Geschichte Buch 5", 45 | "GutenbergId": 3064, 46 | "GutenbergStart": "", 47 | "GutenbergEnd": "", 48 | "textReplacement": {} 49 | }, 50 | "rmische_geschichte_buch_8": { 51 | "title": "rmische_geschichte_buch_8", 52 | "LibrivoxBookName": "Römische Geschichte Buch 8", 53 | "GutenbergId": 3065, 54 | "GutenbergStart": "", 55 | "GutenbergEnd": "", 56 | "textReplacement": {} 57 | }, 58 | "reineke_fuchs": { 59 | "title": "reineke_fuchs", 60 | "LibrivoxBookName": "Reineke Fuchs", 61 | "GutenbergId": 2228, 62 | "GutenbergStart": "", 63 | "GutenbergEnd": "", 64 | "textReplacement": {} 65 | }, 66 | "hermann_und_dorothea": { 67 | "title": "hermann_und_dorothea", 68 | "LibrivoxBookName": "Hermann und Dorothea", 69 | "GutenbergId": 2312, 70 | "GutenbergStart": "", 71 | "GutenbergEnd": "", 72 | "textReplacement": {} 73 | }, 74 | "fabeln": { 75 | "title": "fabeln", 76 | "LibrivoxBookName": "Fabeln", 77 | "GutenbergId": "lichtwer/lichtfab/lichtfab.html", 
78 | "GutenbergStart": "", 79 | "GutenbergEnd": "", 80 | "textReplacement": {} 81 | } 82 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | datasetWorkflow/ 140 | huiAudioCorpus/sttInference/deepspeechModel/alphabet.txt 141 | huiAudioCorpus/sttInference/deepspeechModel/kenlm.scorer 142 | huiAudioCorpus/sttInference/deepspeechModel/lm.binary 143 | huiAudioCorpus/sttInference/deepspeechModel/output_graph.pb 144 | -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/AudiosFromLibrivoxPersistenz.py: -------------------------------------------------------------------------------- 1 | import bs4 as bs 2 | import pandas as pd 3 | from huiAudioCorpus.utils.PathUtil import PathUtil 4 | import requests 5 | import json 6 | from tqdm import tqdm 7 | from joblib import Parallel, delayed 8 | 9 | class AudiosFromLibrivoxPersistenz: 10 | 11 | def __init__ (self, bookName: str, savePath: str, chapterPath: str, url:str = 'https://librivox.org/'): 12 | self.bookName = bookName 13 | self.url = url 14 | self.savePath = savePath 15 | self.chapterPath = chapterPath 16 | self.pathUtil = PathUtil() 17 | self.limitChapters = 1000 18 | self.rangeCapters = 20 19 | 20 | def save(self): 21 | chapters, chapterDownloadLinks = self.getChapter(self.bookName) 22 | Parallel(n_jobs=-2)(delayed(self.pathUtil.copyFileFromUrl)(link ,self.savePath+ '/' + link.split('/')[-1]) for link in chapterDownloadLinks) 23 | chapters.to_csv(self.chapterPath) 24 | 25 | 26 | def getChapter(self, bookName:str): 27 | searchUrl = self.getSearchUrl(bookName, self.url) 28 | response = self.loadSearchBook(searchUrl) 29 | chapterUrl = self.extractChapterUrl(response) 30 | chapterDownloadLinks = self.getChapterLinks(chapterUrl) 31 | chapters = pd.read_html(chapterUrl) 32 | return chapters[0], chapterDownloadLinks 33 | 34 | def loadSearchBook(self, url:str ): 35 | searchResult = requests.get(url) 36 | return searchResult.text 37 | 38 | def getSearchUrl(self, bookName: str, url:str): 39 | searchUrl = url + 'api/feed/audiobooks/?format=json&title=' + bookName 40 | return searchUrl 41 | 42 | def extractChapterUrl(self, response: str): 43 | jsonInput = json.loads(response)['books'] 44 | book = jsonInput[0] 45 | urlZipFile = book['url_librivox'] 46 | return urlZipFile 47 | 48 | def extractZipUrl(self, response: str): 49 | jsonInput = json.loads(response)['books'] 50 | book = jsonInput[0] 51 | urlZipFile = book['url_zip_file'] 52 | return urlZipFile 53 | 54 | def getChapterLinks(self, url: str): 55 | searchResult = requests.get(url) 56 | searchResult.encoding = "UTF-8" 57 | soup = bs.BeautifulSoup(searchResult.text, 'html.parser') 58 | parsed_table = soup.find_all('table')[0] 59 | data = [[td.a['href'] if td.find('a') else 60 | ''.join(td.stripped_strings) 61 | for td in row.find_all('td')] 62 | for row in parsed_table.find_all('tr')] 63 | downloadLinks = [chapter[1] for chapter in data if len(chapter)>0] 64 | return downloadLinks 65 | 66 | 67 | 
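# (annotation) getIds below pages through the public Librivox catalogue API
# rather than scraping HTML: limitChapters (1000) books per request, up to
# rangeCapters (20) offset pages, i.e. at most 20,000 catalogue entries; the
# loop stops early as soon as a response carries no 'books' key.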
def getIds(self): 68 | books = [] 69 | limit = self.limitChapters 70 | for i in tqdm(range(self.rangeCapters)): 71 | requestUrl = f'https://librivox.org/api/feed/audiobooks/?format=json&limit={limit}&offset={i*limit}' 72 | page = requests.get(requestUrl) 73 | page.encoding = "UTF-8" 74 | result= json.loads(page.text) 75 | if 'books' in result: 76 | books.extend(result['books']) 77 | else: 78 | print(result) 79 | break 80 | return books -------------------------------------------------------------------------------- /huiAudioCorpus/components/TextStatisticComponent.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from huiAudioCorpus.converter.ListToHistogramConverter import ListToHistogramConverter 3 | from huiAudioCorpus.converter.ListToStatisticConverter import ListToStatisticConverter 4 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 5 | from huiAudioCorpus.converter.TranscriptsToSentencesConverter import TranscriptsToSentencesConverter 6 | from pandas.core.frame import DataFrame 7 | from collections import Counter 8 | from huiAudioCorpus.model.Sentence import Sentence 9 | 10 | class TextStatisticComponent: 11 | def __init__(self, transcriptsPersistenz: TranscriptsPersistenz, transcriptsToSentencesConverter:TranscriptsToSentencesConverter, listToStatisticConverter:ListToStatisticConverter, listToHistogramConverter: ListToHistogramConverter): 12 | self.transcriptsPersistenz = transcriptsPersistenz 13 | self.transcriptsToSentencesConverter = transcriptsToSentencesConverter 14 | self.listToStatisticConverter = listToStatisticConverter 15 | self.listToHistogramConverter = listToHistogramConverter 16 | 17 | def run(self): 18 | rawData= self.loadTextFiles() 19 | return self.getStatistic(rawData) 20 | 21 | def getStatistic(self, rawData): 22 | descriptions = ['Words count in audio', 'Chars count in audio'] 23 | ids = ['wordCount', 'charCount'] 24 | statistics = {} 25 | for column in rawData: 26 | if column not in ids: 27 | continue 28 | statistics[column] = { 29 | 'name': column, 30 | 'statistic': self.listToStatisticConverter.convert(rawData[column].tolist()), 31 | 'histogram': self.listToHistogramConverter.convert(rawData[column].tolist()), 32 | 'description': descriptions[len(statistics)] 33 | } 34 | 35 | 36 | if 'text' not in rawData: 37 | counter = Counter() 38 | uniqeWordsWithMinimum = {} 39 | 40 | else: 41 | counter = Counter([word for sentence in tqdm(rawData['text']) for word in Sentence(sentence).wordsWithoutChars]) 42 | 43 | counterValues = counter.values() 44 | uniqeWordsWithMinimum = {} 45 | remainingCounts = counterValues 46 | for minWortOccurence in tqdm(list(range(1, max(counterValues)+1))): 47 | remainingCounts = [count for count in remainingCounts if count>=minWortOccurence] 48 | uniqeWordsWithMinimum[minWortOccurence] = len(remainingCounts) 49 | if(len(remainingCounts)==1): 50 | break 51 | 52 | return statistics, rawData, counter, uniqeWordsWithMinimum 53 | 54 | def loadTextFiles(self): 55 | allSentences =[sentence for transcripts in tqdm(self.transcriptsPersistenz.loadAll(), total=len(self.transcriptsPersistenz.getIds())) for sentence in self.transcriptsToSentencesConverter.convert(transcripts)] 56 | result = [[sentence.id.split("\\")[-1].split("/")[-1], sentence.wordsCount, sentence.charCount, sentence.sentence] for sentence in tqdm(allSentences)] 57 | rawData = DataFrame(result, columns = ['id','wordCount', 'charCount', 'text']) 58 | rawData = rawData.set_index('id') 59 | 
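# (annotation) rawData ends up with one row per sentence, indexed by the bare
# utterance id (any Windows or POSIX path prefix is stripped above), with
# wordCount, charCount and the raw text as columns.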
return rawData -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HUI-Audio-Corpus-German 2 | This is the official repository for the HUI-Audio-Corpus-German. The corresponding paper is in the process of publication. With this repository it is possible to automatically recreate the dataset. It is also possible to add more speakers to the processing pipeline. 3 | 4 | Dataset: https://opendata.iisys.de 5 | 6 | Live example: http://narvi.sysint.iisys.de/projects/tts 7 | 8 | Paper (presented at the 44th German Conference on Artificial Intelligence (KI2021)): https://arxiv.org/abs/2106.06309 9 | 10 | ## Speaker overview 11 | 12 | * bernd 13 | * hokuspokus 14 | * friedrich 15 | * eva 16 | * karlsson 17 | * sonja 18 | 19 | ### Not finished 20 | 21 | * redaer 22 | 23 | ## Installation 24 | 25 | ### Requirements 26 | 27 | * Linux 28 | * Anaconda 29 | 30 | ### Set up the Python environment with Anaconda 31 | 32 | Navigate to the cloned repository. 33 | 34 | Create a new conda environment (for more information: https://salishsea-meopar-docs.readthedocs.io/en/latest/work_env/python3_conda_environment.html) 35 | ``` 36 | conda create -n huiAudioCorpus python=3.8 37 | conda activate huiAudioCorpus 38 | ``` 39 | 40 | Install the package as a development Python package (for more information: http://naoko.github.io/your-project-install-pip-setup/) 41 | 42 | ``` 43 | python setup.py develop 44 | ``` 45 | 46 | Install the dependencies: 47 | ``` 48 | pip install -r requirements.txt 49 | ``` 50 | 51 | Download https://opendata.iisys.de/opendata/Datasets/deepspeechModel/deepspeechModel.zip and copy the contents of the downloaded zip into the folder: 52 | 53 | ``` 54 | /huiAudioCorpus/sttInference/deepspeechModel 55 | ``` 56 | ### Optional installation step 57 | The deepspeech model runs on the CPU by default. This can lead to long processing times for the pipeline. If you have a compatible GPU, you can install a special version of deepspeech. 58 | More information can be found at: 59 | ``` 60 | https://deepspeech.readthedocs.io/en/r0.9/USING.html 61 | ``` 62 | ## Recreate dataset 63 | 64 | ``` 65 | cd scripts 66 | 67 | python createDataset.py 68 | ``` 69 | 70 | The configurations can be viewed and adjusted inside createDataset.py: 71 | 72 | Inside the variable "allConfigs" all speaker configurations can be added. If you want to quickly test whether the pipeline is running, you can use: 73 | 74 | ``` 75 | allConfigs = sonja 76 | ``` 77 | 78 | For all speakers, you can use: 79 | 80 | ``` 81 | allConfigs = {**bernd, **hokuspokus, **friedrich, **eva, **karlsson, **sonja} 82 | ``` 83 | 84 | The processing files and the complete dataset with statistics are created at: 85 | ``` 86 | /datasetWorkflow 87 | ``` 88 | The directory can be changed inside createDataset.py: 89 | 90 | ``` 91 | externalPaths = [ 92 | "/path/to/the/folder" 93 | ] 94 | 95 | ``` 96 | 97 | ## Adding a new speaker 98 | 99 | If you want to add a new speaker, follow these steps: 100 | * Create a JSON file for your speaker inside scripts/createDatasetConfig. There you can find examples of how the file should look. Information about the speakers can be found at datasetWorkflow/overview 101 | * Validate the text replacements; the script guides you through the needed steps 102 | * Finish the dataset and create a pull request 103 | 104 | ## Creating statistics for other datasets 105 | 106 | We have a script for the creation of statistics only.
107 | For this, the variables "loadPath" and "savePath" inside the file "scripts/generateAudioStatistic.py" have to be adjusted. 108 | -------------------------------------------------------------------------------- /huiAudioCorpus/persistenz/AudioTranscriptPairPersistenz.py: -------------------------------------------------------------------------------- 1 | from huiAudioCorpus.model.AudioTranscriptPair import AudioTranscriptPair 2 | from huiAudioCorpus.error.MatchingNotFoundError import MatchingNotFoundError 3 | from typing import List 4 | from huiAudioCorpus.converter.TranscriptsToSentencesConverter import TranscriptsToSentencesConverter 5 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 6 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 7 | 8 | class AudioTranscriptPairPersistenz: 9 | 10 | def __init__(self, audioPersistenz: AudioPersistenz, transcriptsPersistenz: TranscriptsPersistenz, transcriptsToSentencesConverter: TranscriptsToSentencesConverter, checkForConsistency: bool = True): # note: this flag is currently not stored; getIds and loadAll take their own checkForConsistency parameter 11 | self.audioPersistenz = audioPersistenz 12 | self.transcriptsPersistenz = transcriptsPersistenz 13 | self.transcriptsToSentencesConverter = transcriptsToSentencesConverter 14 | 15 | def load(self, audioId: str, sentenceId:str): 16 | audio = self.audioPersistenz.load(audioId) 17 | sentence = self.getAllSentences()[sentenceId] 18 | elementPair = AudioTranscriptPair(sentence, audio) 19 | return elementPair 20 | 21 | 22 | def getIds(self, checkForConsistency = True): 23 | audioIds = self.audioPersistenz.getIds() 24 | audioNames = self.audioPersistenz.getNames() 25 | sentencesIds = list(self.getAllSentences().keys()) 26 | 27 | if checkForConsistency: 28 | self.checkeIds(audioNames, sentencesIds) 29 | else: 30 | audioIds, audioNames, sentencesIds = self.removeNonExistentIds(audioIds, audioNames, sentencesIds) 31 | 32 | ids = self.sortIds(audioIds, audioNames, sentencesIds) 33 | 34 | return ids 35 | 36 | def sortIds(self, audioIds, audioNames, sentencesIds): 37 | zippedAudios = list(zip(audioIds, audioNames)) 38 | zippedAudios.sort(key = lambda x: x[1]) 39 | audioIds = [element[0] for element in zippedAudios] 40 | sentencesIds.sort() 41 | return list(zip(audioIds, sentencesIds)) 42 | 43 | 44 | def loadAll(self, checkForConsistency = True): 45 | ids = self.getIds(checkForConsistency) 46 | for audioId, sentenceId in ids: 47 | yield self.load(audioId, sentenceId) 48 | 49 | 50 | def getAllSentences(self): 51 | transcripts = list(self.transcriptsPersistenz.loadAll()) 52 | sentences = [sentence for transcript in transcripts for sentence in self.transcriptsToSentencesConverter.convert(transcript)] 53 | sentenceDict = {sentence.id: sentence for sentence in sentences} 54 | return sentenceDict 55 | 56 | def checkeIds(self, audioIds: List[str], sentenceIds: List[str]): 57 | missingAudioIds = [id for id in sentenceIds if not id in audioIds] 58 | missingSentenceIds = [id for id in audioIds if not id in sentenceIds] 59 | if missingAudioIds or missingSentenceIds: 60 | raise MatchingNotFoundError(missingAudioIds, missingSentenceIds, 'audioFiles', 'Transcripts') 61 | 62 | def removeNonExistentIds(self, audioIds: List[str], audioNames: List[str], sentenceIds: List[str]): 63 | audioIds = [id for id, name in zip(audioIds, audioNames) if name in sentenceIds] 64 | audioNames = [name for name in audioNames if name in sentenceIds] 65 | sentenceIds = [id for id in sentenceIds if id in audioNames] 66 | return audioIds, audioNames, sentenceIds
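A minimal sketch of the two consistency modes of AudioTranscriptPairPersistenz.getIds; the ids below are invented purely for illustration:

```python
# Hypothetical ids, not taken from the corpus.
audioNames = ['kapitel_01_f000001', 'kapitel_01_f000002']
sentenceIds = ['kapitel_01_f000001']

# getIds(checkForConsistency=True) calls checkeIds, which raises
# MatchingNotFoundError because 'kapitel_01_f000002' has no transcript.
# getIds(checkForConsistency=False) calls removeNonExistentIds instead,
# silently dropping the orphaned audio, so loadAll() only yields the pair
# for 'kapitel_01_f000001'.
```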
-------------------------------------------------------------------------------- /huiAudioCorpus/model/Audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.lib.function_base import average 3 | from huiAudioCorpus.utils.ModelToStringConverter import ToString 4 | from nptyping import NDArray 5 | import pyloudnorm as pyln 6 | import librosa 7 | 8 | class Audio(ToString): 9 | def __init__(self, audioTimeSeries: NDArray, samplingRate: int, id: str, name: str): 10 | self.timeSeries = audioTimeSeries 11 | self.samplingRate = samplingRate 12 | self.name = name 13 | self.id = id 14 | 15 | 16 | @property 17 | def samples(self)->int: 18 | return self.timeSeries.shape[0] 19 | 20 | @property 21 | def duration(self)-> float: 22 | return self.samples/ self.samplingRate 23 | 24 | 25 | def __add__(self, other: 'Audio') -> 'Audio': 26 | audioTimeSeries = self.timeSeries.tolist() + other.timeSeries.tolist() 27 | audioTimeSeries = np.array(audioTimeSeries) 28 | id = self.id + '&' + other.id 29 | name = self.name + '&' + other.name 30 | 31 | samplingRateSelf = self.samplingRate 32 | samplingRateOther = other.samplingRate 33 | if samplingRateOther != samplingRateSelf: 34 | raise ValueError(f"The samplingrates from the audio files are different sr1: {samplingRateSelf} sr2: {samplingRateOther} from the audio files with the combined id: {id} and name: {name}") 35 | 36 | audio = Audio(audioTimeSeries,samplingRateSelf, id, name) 37 | return audio 38 | 39 | def __radd__(self, other): 40 | return self 41 | 42 | @property 43 | def loudness(self)->float: 44 | meter = pyln.Meter(self.samplingRate) # create BS.1770 meter 45 | loudness = meter.integrated_loudness(self.timeSeries) 46 | return loudness 47 | 48 | @property 49 | def silenceDB(self)->float: 50 | silenceDurationInSeconds= 0.05 51 | frameLength = int(silenceDurationInSeconds* self.samplingRate) 52 | for silenceDezibel in range(100, 1,-1): 53 | splitted = librosa.effects.split(self.timeSeries,silenceDezibel , frame_length=frameLength, hop_length=int(frameLength/4)) 54 | if len(splitted)>1: 55 | return -silenceDezibel 56 | return 0 57 | 58 | @property 59 | def silencePercent(self)->float: 60 | states = self.isLoud() 61 | silencePercent = 1- sum(states)/len(states) 62 | return silencePercent 63 | 64 | def isLoud(self): 65 | #https://librosa.org/doc/latest/auto_examples/plot_viterbi.html#sphx-glr-auto-examples-plot-viterbi-py 66 | rms = librosa.feature.rms(y=self.timeSeries)[0]# type: ignore 67 | 68 | r_normalized = (rms - 0.02) / np.std(rms) 69 | p = np.exp(r_normalized) / (1 + np.exp(r_normalized))# type: ignore 70 | 71 | 72 | transition = librosa.sequence.transition_loop(2, [0.5, 0.6]) 73 | full_p = np.vstack([1 - p, p]) 74 | states = librosa.sequence.viterbi_discriminative(full_p, transition) 75 | return states 76 | 77 | @property 78 | def averageFrequency(self)->float: 79 | try: 80 | cent = librosa.feature.spectral_centroid(y=self.timeSeries, sr=self.samplingRate)[0] #type: ignore 81 | loudPositions = self.isLoud() 82 | 83 | centAtLoud = [cent[index] for index in range(len(cent)) if loudPositions[index]==1] 84 | averageFrequency = round(average(centAtLoud)) #type: ignore 85 | return averageFrequency 86 | except: 87 | return -1 88 | -------------------------------------------------------------------------------- /huiAudioCorpus/transformer/AudioSplitTransformer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import 
librosa 3 | from huiAudioCorpus.model.Audio import Audio 4 | from huiAudioCorpus.transformer.AudioFadeTransformer import AudioFadeTransformer 5 | import statistics 6 | 7 | class AudioSplitTransformer: 8 | 9 | def __init__(self, audioFadeTransformer: AudioFadeTransformer, maxAudioDuration: float, minAudioDuration: float, silenceDurationInSeconds:float= 0.2): 10 | self.maxAudioDuration = maxAudioDuration 11 | self.minAudioDuration = minAudioDuration 12 | self.silenceDurationInSeconds = silenceDurationInSeconds 13 | self.audioFadeTransformer = audioFadeTransformer 14 | 15 | def transform(self, audio: Audio, bookName: str, chapter: int): 16 | splitted = self.splitWithBestDezibel(audio, self.maxAudioDuration - self.minAudioDuration) 17 | splitted = self.mergeAudioToTargetDuration(splitted, self.minAudioDuration) 18 | merged = self.mergeLastAudioIfTooShort(splitted, self.minAudioDuration) 19 | withIds = self.setIds(merged, bookName, chapter) 20 | withFading = self.fade(withIds) 21 | return withIds 22 | 23 | def splitWithBestDezibel(self, audio: Audio, maxAudioDuration: float): 24 | # TODO: think about using recrusive and split just needed audio files???? 25 | splittedAudio:List[Audio]=[] 26 | maxDuration:float = 0 27 | for silenceDezibel in range(70, -20,-5): 28 | splittedAudio = self.split(audio, silenceDezibel) 29 | maxDuration = max([audio.duration for audio in splittedAudio]) 30 | if maxDuration< maxAudioDuration: 31 | print( audio.name, 'used DB:', silenceDezibel) 32 | return splittedAudio 33 | return splittedAudio 34 | 35 | def split(self, audio: Audio, silenceDezibel: int): 36 | frameLength = int(self.silenceDurationInSeconds* audio.samplingRate) 37 | splitted = librosa.effects.split(audio.timeSeries,silenceDezibel , frame_length=frameLength, hop_length=int(frameLength/4)) 38 | audios = [] 39 | for index in range(len(splitted)): 40 | (start,end) = splitted[index] 41 | isNextElementAvailable = len(splitted)> index+1 42 | if isNextElementAvailable: 43 | (nextStart, nextEnd) = splitted[index+1] 44 | betterEnd = int(statistics.mean([end, nextStart])) 45 | else: 46 | betterEnd = end 47 | 48 | isPreviousElementAvailable = not index == 0 49 | if isPreviousElementAvailable: 50 | (previousStart, previousEnd) = splitted[index-1] 51 | betterStart = int(statistics.mean([previousEnd, start])) 52 | else: 53 | betterStart = start 54 | 55 | newAudio = Audio(audio.timeSeries[betterStart:betterEnd], audio.samplingRate, 'id', 'name') 56 | audios.append(newAudio) 57 | return audios 58 | 59 | def mergeAudioToTargetDuration(self, audios: List[Audio], targetDuration: float): 60 | mergedAudios: List[Audio] = [] 61 | 62 | for audio in audios: 63 | if len(mergedAudios)>0 and mergedAudios[-1].duration>")))>0: 89 | 90 | directLink = pageSoup.find("a",text=re.compile("weiter\s*>>"))["href"] 91 | 92 | nextLink = page.url.split("/") 93 | 94 | nextLink.pop() 95 | 96 | nextLink.append(directLink) 97 | 98 | nextLink = "/".join(nextLink) 99 | 100 | return paragraphs, nextLink 101 | 102 | 103 | def prepareParagraph(self, paragraphs:List): 104 | extractedParagraphs = '' 105 | for paragraph in paragraphs: 106 | for footnote in paragraph.select('span'): 107 | footnote.extract() 108 | 109 | if len(paragraph.text) > 0: 110 | extractedParagraph = re.sub(r" +",r" ",paragraph.text.replace("\t"," ").replace("\n", " ")) 111 | 112 | extractedParagraphs += extractedParagraph.strip()+"\n" 113 | return extractedParagraphs 114 | 115 | 116 | class GuttenbergDownload: 117 | """ 118 | This class downloads a book from 
www.projekt-gutenberg.org 119 | The id has to be searched manual with the link http://gutendex.com/books/?search=ThisIsTheSearchText 120 | """ 121 | def downloadText(self, textId: int): 122 | text = strip_headers(load_etext(textId, mirror='http://eremita.di.uminho.pt/gutenberg/')).strip() 123 | return text 124 | 125 | -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Bernd_Ungerer_tausendUndEineNacht.json: -------------------------------------------------------------------------------- 1 | { 2 | "tausendUndEineNacht1": { 3 | "title": "tausend_und_eine_nacht_band_1", 4 | "LibrivoxBookName": "Tausend und eine Nacht, Band 1", 5 | "GutenbergId": "weil/band1/inhalt.html", 6 | "GutenbergStart": "", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "0, mein Teurer": "O, mein Teurer", 10 | " u. s. w.": " und so weiter. ", 11 | "100.000": "einhunderttausend", 12 | "90.000": "neunzigtausend", 13 | "10.000": "zehntausend", 14 | "50.000": "fünfzigtausend", 15 | "d. h.": " das heißt ", 16 | " z. B.": " zum Beispiel ", 17 | "H. v.": " Herr von ", 18 | "1786)": "siebzehnhundertsechsundachzig)", 19 | "1839)": "achtzehnhundertneununddreißig)", 20 | "15ten": "fünfzehnten", 21 | "16ten": "sechzehnten", 22 | "1001": "tausendundeine", 23 | "1837": "achzehnhundertsiebenunddreißig", 24 | "1000": "eintausend", 25 | "5000": "fünftausend", 26 | "261.": "zweihunderteinundsechzigte", 27 | "4500": "viertausendfünfhundert", 28 | "1200": "eintausendzweihundert", 29 | "7320": "siebentausenddreihundertzwanzig", 30 | "1226": "eintausendzweihundertsechsundzwanzig", 31 | "2500": "zweitausendfünfhundert", 32 | "4000": "viertausend", 33 | "8000": "achttausend", 34 | "3000": "dreitausend", 35 | "2000": "zweitausend", 36 | "6000": "sechstausend", 37 | "653": "sechshundertdreiundfünfzig", 38 | "636": "sechshundertsechsunddreißig", 39 | "103": "einhundertdrei", 40 | "700": "siebenhundert", 41 | "100": "einhundert", 42 | "200": "zweihundert", 43 | "800": "achthundert", 44 | "110": "einhundertzehn", 45 | "500": "fünfhundert", 46 | "400": "vierhundert", 47 | "14.": "vierzehnsten", 48 | "11.": "elften", 49 | "16.": "sechzehnsten", 50 | "13.": "dreizehnsten", 51 | "15.": "fünfzehnsten", 52 | "145": "einhundertfünfundvierzig", 53 | "170": "einhundertsiebzig", 54 | "40.": "vierzigsten", 55 | "41.": "einundvierzigsten", 56 | "98": "achtundneunzig", 57 | "39": "neununddreißig", 58 | "40": "vierzig", 59 | "70": "siebzig", 60 | "18": "achtzehn", 61 | "20": "zwanzig", 62 | "50": "fünfzig", 63 | "10": "zehn", 64 | "24": "vierundzwanzig", 65 | " v.": " von ", 66 | " H. ": " Herr ", 67 | " N.": " Nacht " 68 | } 69 | }, 70 | "tausendUndEineNacht2": { 71 | "title": "tausend_und_eine_nacht_band_2", 72 | "LibrivoxBookName": "Tausend und eine Nacht, Band 2", 73 | "GutenbergId": "weil/band2/inhalt.html", 74 | "GutenbergStart": "", 75 | "GutenbergEnd": "", 76 | "textReplacement": { 77 | "10.000": "zehntausend", 78 | " usw.": " und so weiter ", 79 | " u.s.w.": "und so weiter ", 80 | " N. N.": " so und so ", 81 | "d. 
h.": " das heißt ", 82 | " d.h.": " das heißt ", 83 | " N.N.": " so und so ", 84 | "1001": "tausendundeine", 85 | "1000": "eintausend", 86 | "1050": "eintausendfünfzig", 87 | " u.": " und ", 88 | "999": "neunhundertneunundneunzig", 89 | "12.": "zwölfsten", 90 | "2.": "zweite", 91 | "40": "vierzig" 92 | } 93 | }, 94 | "tausendUndEineNacht3": { 95 | "title": "tausend_und_eine_nacht_band_3", 96 | "LibrivoxBookName": "Tausend und eine Nacht, Band 3", 97 | "GutenbergId": "weil/band3/inhalt.html", 98 | "GutenbergStart": "", 99 | "GutenbergEnd": "", 100 | "textReplacement": { 101 | " u.s.f.": " und so fort", 102 | " Z.B.": " zum Beispiel ", 103 | " d. h.": " das heißt ", 104 | " z. B.": " zum Beispiel ", 105 | "1001": "tausendundeine", 106 | "1564": "fünfzenhundertvierundsechzig" 107 | } 108 | }, 109 | "tausendUndEineNacht4": { 110 | "title": "tausend_und_eine_nacht_band_4", 111 | "LibrivoxBookName": "Tausend und eine Nacht, Band 4", 112 | "GutenbergId": "weil/band4/inhalt.html", 113 | "GutenbergStart": "", 114 | "GutenbergEnd": "", 115 | "textReplacement": { 116 | " u.s.w.": " und so weiter ", 117 | " N. N.": " so und so ", 118 | " usw.": " und so weiter", 119 | "3,700,000": "dreimillionensiebenhundertausend", 120 | "1050000": "einemillionfünfzigtausend", 121 | "30,000": "dreißigtausend", 122 | "70.000": "siebzigtausend", 123 | "1001": "Tausendundeine", 124 | "1 1/6": "ein ein sechstel", 125 | "70000": "siebzigtausend", 126 | "75": "fünfundsiebzig", 127 | "40": "vierzig " 128 | } 129 | } 130 | } -------------------------------------------------------------------------------- /huiAudioCorpus/calculator/AlignSentencesIntoTextCalculator.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from nltk.sem.evaluate import Error 3 | from tqdm import tqdm 4 | from huiAudioCorpus.model.SentenceAlignment import SentenceAlignment 5 | from typing import List 6 | from huiAudioCorpus.model.Sentence import Sentence 7 | from huiAudioCorpus.transformer.SentenceDistanceTransformer import SentenceDistanceTransformer 8 | from joblib import Parallel, delayed 9 | 10 | rangeWords = 40 11 | class AlignSentencesIntoTextCalculator: 12 | 13 | def __init__(self, sentenceDistanceTransformer: SentenceDistanceTransformer): 14 | self.sentenceDistanceTransformer = sentenceDistanceTransformer 15 | 16 | def calculate(self, originalText: Sentence, textToAlign: List[Sentence]): 17 | 18 | alignments = self.calculateAlignments(originalText,textToAlign) 19 | alignments = self.evaluateIfPerfektStartAndEnd(alignments,originalText.wordsCount) 20 | alignments = self.getMissingWordsBetweenAlignments(alignments, originalText) 21 | return alignments 22 | 23 | def calculateAlignments(self, originalText: Sentence, textToAlign: List[Sentence]): 24 | with Parallel(n_jobs=15, batch_size=500) as parallel: 25 | alignments:List[SentenceAlignment] = [] 26 | start=0 27 | text: Sentence 28 | additionalRange = 0 29 | for text in tqdm(textToAlign): 30 | 31 | 32 | rangeStart= max(0,start-rangeWords - additionalRange) 33 | rangeEnd = min(rangeStart+2*(rangeWords + additionalRange)+text.wordsCount,originalText.wordsCount+1) 34 | 35 | if rangeEnd- rangeStart>2000: 36 | raise Exception('more than 2000 Words in search text') 37 | 38 | (newStart, end), distance = self.bestPosition(parallel,originalText[rangeStart: rangeEnd ], text, 0, rangeEnd- rangeStart) 39 | newStart += rangeStart 40 | end += rangeStart 41 | 42 | align = SentenceAlignment(text, originalText[newStart: end],newStart, end, distance) 43 | if 
distance>0.2: 44 | print('*****************') 45 | print('skip because of too high distance: ',text.id, distance) 46 | print('*****************') 47 | print(text.sentence) 48 | print('___________________') 49 | print(originalText[rangeStart: rangeEnd ].sentence) 50 | print('########') 51 | 52 | align.isSkipped = True 53 | additionalRange += 30 + text.wordsCount 54 | else: 55 | start = end 56 | additionalRange= 0 57 | alignments.append(align) 58 | return alignments 59 | 60 | def bestPosition(self,parallel:Parallel, originalText: Sentence, textToAlign: Sentence, rangeStart: int, rangeEnd: int): 61 | startEnds = [] 62 | for end in range(rangeStart, rangeEnd): 63 | for start in range(max(rangeStart,end-textToAlign.wordsCount-10), end): 64 | startEnds.append((start, end)) 65 | 66 | positionene = parallel(delayed(self.positionOneSentence)(originalText, textToAlign, start, end) for start, end in startEnds) 67 | #positionene = [self.positionOneSentence(originalText, textToAlign, start, end) for start, end in startEnds] 68 | 69 | bestPosition = min(positionene, key=operator.itemgetter(1)) # type: ignore 70 | return bestPosition 71 | 72 | def positionOneSentence(self, originalText: Sentence , textToAlign: Sentence, start: int, end: int): 73 | textToSearch = originalText[start:end] 74 | distance = self.sentenceDistanceTransformer.transform(textToSearch, textToAlign) 75 | return [(start, end), distance] 76 | 77 | 78 | def evaluateIfPerfektStartAndEnd(self,alignments: List[SentenceAlignment], originalTextLength: int): 79 | for index, align in enumerate(alignments): 80 | align.leftIsPerfekt = False 81 | align.rightIsPerfekt = False 82 | align.isFirst = index ==0 83 | align.isLast = index == len(alignments)-1 84 | 85 | if align.start==0: 86 | align.leftIsPerfekt=True 87 | if align.end == originalTextLength: 88 | align.rightIsPerfekt= True 89 | 90 | try: 91 | if align.start == alignments[index-1].end: 92 | align.leftIsPerfekt=True 93 | except: 94 | pass 95 | try: 96 | if align.end == alignments[index+1].start: 97 | align.rightIsPerfekt=True 98 | except: 99 | pass 100 | align.isPerfect = (align.leftIsPerfekt or align.isFirst) and (align.rightIsPerfekt or align.isLast) and not align.isSkipped 101 | return alignments 102 | 103 | def getMissingWordsBetweenAlignments(self, alignments: List[SentenceAlignment], originalText: Sentence): 104 | for index, aling in enumerate(alignments): 105 | if index == len(alignments)-1: 106 | continue 107 | 108 | if not aling.rightIsPerfekt: 109 | print(originalText[aling.end:alignments[index+1].start]) 110 | 111 | return alignments -------------------------------------------------------------------------------- /huiAudioCorpus/converter/SentenceToPhoneticSentenceConverter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from nltk.sem.evaluate import Error 3 | from huiAudioCorpus.model.Sentence import Sentence 4 | from huiAudioCorpus.model.PhoneticSentence import PhoneticSentence 5 | import pandas as pd 6 | 7 | class SentenceToPhoneticSentenceConverter: 8 | def __init__(self, libraryPath: str , useEmphasis: bool = True): 9 | self.library = self.createLibrary(libraryPath) 10 | self.useEmphasis = useEmphasis 11 | 12 | def convert(self, sentence: Sentence): 13 | words = sentence.words 14 | ipaWords, subWords = self.transformSentencesToIpa(words) 15 | ipaText = ' '.join(ipaWords) 16 | ipaText = self.removeEmphasis(ipaText) 17 | return PhoneticSentence(ipaText, subWords) 18 | 19 | 20 | def createLibrary(self, 
libraryPath: str): 21 | pointLibrary = pd.DataFrame({ 22 | "text": [",", ".", "?", "-", ";", "!", ":", "'", "s", "ste", "(", ")", ">", "<", '›', '‹', 'é','è', '&'], 23 | "ipa": [",", ".","?", ",", ",", "!", ":", "'", "s", "stə", ",", ",", "'", "'", "'", "'", 'e', 'e', 'ʊnt'] 24 | }) 25 | library = pd.read_csv(libraryPath,keep_default_na=False) 26 | 27 | libraryLowerCase = library.copy(deep=True) 28 | libraryLowerCase['text'] = libraryLowerCase['text'].apply(str.lower) 29 | library = library.append(pointLibrary) 30 | library = library.append(libraryLowerCase) 31 | 32 | library.set_index('text', inplace = True) 33 | library.sort_index(inplace = True) 34 | return library 35 | 36 | def transformSentencesToIpa(self, words:List[str]): 37 | ipaWords: List[str] = [] 38 | subWords: List[str] = [] 39 | index = 0 40 | while index < len(words): 41 | word = words[index] 42 | remainingWords = words[index:] 43 | countMultiwords, multiwords, multiWord = self.findMultiwordIpa(remainingWords) 44 | if countMultiwords>0 and multiwords is not None: 45 | index += countMultiwords 46 | subWords.append(multiWord) 47 | ipaWords.append(multiwords) 48 | continue 49 | ipa, subWord = self.transformWordToIpa(word) 50 | subWords.append(subWord) 51 | ipaWords.append(ipa) 52 | index +=1 53 | return ipaWords, subWords 54 | 55 | def findMultiwordIpa(self, words:List[str]): 56 | if len(words)<2: 57 | return 0, None, "" 58 | for count in range(5,1,-1): 59 | multiWord = ' '.join(words[:count]) 60 | multiwordIpa = self.getIpaFromLibrary(multiWord) 61 | if multiwordIpa is not None: 62 | return count, multiwordIpa, multiWord 63 | return 0, None, "" 64 | 65 | def transformWordToIpa(self, word:str): 66 | completeIpaLeft = '' 67 | completeIpaRight = '' 68 | completeWordLeft = [] 69 | completeWordRight = [] 70 | while word != '': 71 | remainingWordFirst, ipaFirst, firstPart = self.findFirstPartInWord(word) 72 | remainingWordLast, ipaLast, lastPart = self.findLastPartInWord(word) 73 | if len(remainingWordLast) < len(remainingWordFirst): 74 | completeIpaLeft = ipaLast + completeIpaLeft 75 | completeWordLeft.insert(0,lastPart) 76 | word = remainingWordLast 77 | else: 78 | completeIpaRight = completeIpaRight + ipaFirst 79 | completeWordRight.append(firstPart) 80 | word = remainingWordFirst 81 | completeIpa = completeIpaRight + completeIpaLeft 82 | completeWordRight.extend(completeWordLeft) 83 | completeWords = '|'.join(completeWordRight) 84 | return completeIpa, completeWords 85 | 86 | 87 | def findFirstPartInWord(self, word:str): 88 | for wordPart in range(len(word), 0, -1): 89 | part = word[:wordPart] 90 | ipa = self.getIpaFromLibrary(part) 91 | if ipa is not None: 92 | remainingWord = word[wordPart:] 93 | return remainingWord, ipa, part 94 | raise Error('we have no match for single char in library with char: ' + word[0] + 'with full text:' + word)# pragma: no cover 95 | 96 | def findLastPartInWord(self, word:str): 97 | for wordPart in range(0,len(word)): 98 | part = word[wordPart:] 99 | ipa = self.getIpaFromLibrary(part) 100 | if ipa is not None: 101 | remainingWord = word[:wordPart] 102 | return remainingWord, ipa, part 103 | raise Error('we have no match for single char in library with char: ' + word[-1])# pragma: no cover 104 | 105 | def getIpaFromLibrary(self, word:str): 106 | ipa = self.getIpaFromLibraryExcactString(word) 107 | if ipa is None: 108 | word = word.lower() 109 | ipa = self.getIpaFromLibraryExcactString(word) 110 | return ipa 111 | 112 | def getIpaFromLibraryExcactString(self,word:str): 113 | if word in 
self.library.index: 114 | ipa: str 115 | ipa = self.library.loc[word].values[0] 116 | if type(ipa) is not str: 117 | ipa = ipa[0] 118 | return ipa 119 | return None 120 | 121 | def removeEmphasis(self, text: str): 122 | if self.useEmphasis: 123 | return text 124 | withoutEmphasis = text.replace("ˈ","") 125 | return withoutEmphasis 126 | -------------------------------------------------------------------------------- /huiAudioCorpus/workflows/createDatasetWorkflow/Step0_Overview.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from huiAudioCorpus.utils.PathUtil import PathUtil 4 | from huiAudioCorpus.persistenz.AudiosFromLibrivoxPersistenz import AudiosFromLibrivoxPersistenz 5 | from huiAudioCorpus.utils.DoneMarker import DoneMarker 6 | from tqdm import tqdm 7 | 8 | class Step0_Overview: 9 | 10 | def __init__(self, audiosFromLibrivoxPersistenz: AudiosFromLibrivoxPersistenz, savePath: str, pathUtil: PathUtil): 11 | self.savePath = savePath 12 | self.audiosFromLibrivoxPersistenz = audiosFromLibrivoxPersistenz 13 | self.pathUtil = pathUtil 14 | 15 | def run(self): 16 | return DoneMarker(self.savePath).run(self.script, deleteFolder=False) 17 | 18 | def script(self): 19 | booksLibrivox = self.downloadOverviewLibrivox() 20 | usableBooks = self.downloadChapters(booksLibrivox) 21 | speackerOverview = self.generateSpeackerOverview(usableBooks) 22 | speackerShort = self.generateSpeackerShort(speackerOverview) 23 | self.generateSpeackerTemplate(usableBooks) 24 | 25 | print('Usable books:', len(usableBooks)) 26 | print('Total hours:',sum([book['time'] for book in usableBooks])/60/60) 27 | print('Count of Speackers:', len(speackerShort)) 28 | print('bestSpeacker:', speackerShort[0]) 29 | 30 | def downloadOverviewLibrivox(self): 31 | librivoxPath = self.savePath + '/booksLibrivox.json' 32 | if not self.pathUtil.fileExists(librivoxPath): 33 | print('Download Overview from Librivox') 34 | booksLibrivox = self.audiosFromLibrivoxPersistenz.getIds() 35 | self.pathUtil.saveJson(librivoxPath, booksLibrivox) 36 | 37 | booksLibrivox = self.pathUtil.loadJson(librivoxPath) 38 | return booksLibrivox 39 | 40 | def downloadChapters(self, booksLibrivox): 41 | usableBookPath = self.savePath + '/usableBooks.json' 42 | if not self.pathUtil.fileExists(usableBookPath): 43 | print('Download Chapters from Librivox') 44 | usableBooks = [{'time': book['totaltimesecs'], 'title':book['title'], 'url': book['url_text_source']} for book in booksLibrivox if self.isBookUseable(book)] 45 | for book in tqdm(usableBooks): 46 | chapters, chapterDownloadLinks = self.audiosFromLibrivoxPersistenz.getChapter(book['title']) 47 | book['chapters'] = [] 48 | for _, chapter in chapters.iterrows(): 49 | book['chapters'].append({ 50 | 'title': chapter['Chapter'], 51 | 'reader': chapter['Reader'], 52 | 'time': convertToSeconds(chapter['Time']) 53 | }) 54 | self.pathUtil.saveJson(usableBookPath, usableBooks) 55 | 56 | usableBooks = self.pathUtil.loadJson(usableBookPath) 57 | return usableBooks 58 | 59 | def isBookUseable(self, book): 60 | if book['totaltimesecs']<=0: 61 | return False 62 | if book['language'] != "German": 63 | return False 64 | if 'www.projekt-gutenberg.org' in book['url_text_source']: 65 | return True 66 | 67 | if 'www.gutenberg.org/' in book['url_text_source']: 68 | return True 69 | return False 70 | 71 | def generateSpeackerTemplate(self, usableBooks): 72 | readerPath = self.savePath + '/readerTemplate.json' 73 | if not self.pathUtil.fileExists(readerPath): 74 | reader = {} 75 | for 
book in usableBooks: 76 | bookTitle = book['title'] 77 | for chapter in book['chapters']: 78 | if chapter['reader'] not in reader: 79 | reader[chapter['reader']] = {} 80 | 81 | title = ''.join([i for i in bookTitle.lower().replace(' ','_') if (i in 'abcdefghijklmonpqrstuvwxyz_' or i.isnumeric())]) 82 | guttenbergId = book['url'].replace('www.projekt-gutenberg.org/', '').replace('https://','').replace('http://','') 83 | if 'www.gutenberg.org/' in guttenbergId: 84 | guttenbergId = int(guttenbergId.replace('www.gutenberg.org/ebooks/', '').replace('www.gutenberg.org/etext/', '')) 85 | 86 | reader[chapter['reader']][title] = { 87 | 'title': title, 88 | 'LibrivoxBookName': bookTitle, 89 | 'GutenbergId': guttenbergId, 90 | 'GutenbergStart': '', 91 | 'GutenbergEnd': '', 92 | 'textReplacement':{} 93 | } 94 | 95 | 96 | 97 | self.pathUtil.saveJson(readerPath, reader) 98 | reader = self.pathUtil.loadJson(readerPath) 99 | return reader 100 | 101 | def generateSpeackerOverview(self, usableBooks): 102 | readerPath = self.savePath + '/readerLong.json' 103 | if not self.pathUtil.fileExists(readerPath): 104 | reader = {} 105 | for book in usableBooks: 106 | bookTitle = book['title'] 107 | for chapter in book['chapters']: 108 | if chapter['reader'] not in reader: 109 | reader[chapter['reader']] = [] 110 | 111 | reader[chapter['reader']].append({ 112 | 'title': chapter['title'], 113 | 'time': chapter['time'], 114 | 'book': bookTitle 115 | }) 116 | self.pathUtil.saveJson(readerPath, reader) 117 | reader = self.pathUtil.loadJson(readerPath) 118 | return reader 119 | 120 | def generateSpeackerShort(self, speackerOverview): 121 | readerPath = self.savePath + '/readerShort.json' 122 | if not self.pathUtil.fileExists(readerPath): 123 | readers = [] 124 | for speacker in speackerOverview: 125 | readers.append({ 126 | 'name': speacker, 127 | 'time': round(sum([chapter['time'] for chapter in speackerOverview[speacker]])/60/60,1) 128 | }) 129 | readers.sort(key=lambda x: x['time'], reverse=True) 130 | self.pathUtil.saveJson(readerPath, readers) 131 | readers = self.pathUtil.loadJson(readerPath) 132 | return readers 133 | 134 | 135 | def convertToSeconds(timeString: str): 136 | ftr = [3600,60,1] 137 | duration = sum([a*b for a,b in zip(ftr, map(int,timeString.split(':')))]) 138 | return duration -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Eva.json: -------------------------------------------------------------------------------- 1 | { 2 | "balladen": { 3 | "title": "balladen", 4 | "LibrivoxBookName": "Balladen", 5 | "GutenbergId": "spittelr/balladen/balladen.html", 6 | "GutenbergStart": "In finstrer Nacht, auf steilen Wolkenpfaden,", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "1.": " ", 10 | "2.": " ", 11 | "3.": " ", 12 | " St.": " Sankt ", 13 | "ï": "i", 14 | "*": " " 15 | } 16 | }, 17 | "ligeia_und_andere_novellen": { 18 | "title": "ligeia_und_andere_novellen", 19 | "LibrivoxBookName": "Ligeia und Andere Novellen", 20 | "GutenbergId": 50887, 21 | "GutenbergStart": "Und es liegt darin der Wille, der", 22 | "GutenbergEnd": "Genien und Gnomen.", 23 | "textReplacement": { 24 | "nennt[1],": "nennt, ", 25 | "[Fußnote 1: Denn da Jupiter während der Winterzeit zweimal sieben Tage": "", 26 | "Wärme schenkt, so haben die Menschen diese milde und gemäßigte Zeit die": "", 27 | "Amme des schönen Eisvogels genannt. -- Simonides]": "", 28 | "Gemach der Bibliothek": " Gemach der Bibliothek. 
Fussnote Denn da Jupiter während der Winterzeit zweimal sieben Tage Wärme schenkt, so haben die Menschen diese milde und gemäßigte Zeit die Amme des schönen Eisvogels genannt. Simonides. Ende der Fussnote.", 29 | "Mond[2],": "Mond Zwei,", 30 | "die Sonne ist;" : " die Sonne is. Fussnote zwei,m Mond im Englischen weiblich, Sonne männlich. Anmerkung der Übersetzerin. ", 31 | "[Fußnote 2: Mond im Englischen weiblich, Sonne männlich. A. d. Üb.]": "", 32 | "sieht[3].": "sieht drei. Fussnote drei, Wo Pomponius Mela in seiner Abhandlung De Situ Orbis von Flut und Ebbe spricht, sagt er: Entweder ist die Welt ein großes Tier, oder und so wieter. Ender der Fussnote drei. ", 33 | "[Fußnote 3: Wo Pomponius Mela in seiner Abhandlung »De Situ Orbis« von": "", 34 | "Flut und Ebbe spricht, sagt er: »Entweder ist die Welt ein großes Tier,": "", 35 | "oder« usw.]": "", 36 | "Franzose[4]": "Franzose vier", 37 | "belle chose«?": "belle chose? Fussnote vier, Balzac, dem Sinne nach, ich weiß nicht mehr die Worte. Ende der Fussnote vier.", 38 | "[Fußnote 4: Balzac, dem Sinne nach; ich weiß nicht mehr die Worte.]": "", 39 | "können.[5]": "können fünf. Fussnote fünf, Florem putares nare per liquidum aethera. P Commire. Ender der Fussnote fünf.", 40 | "[Fußnote 5: Florem putares nare per liquidum aethera. -- P. Commire]": "", 41 | " Mr.": " Mister ", 42 | " on.": " on .", 43 | "*": " ", 44 | "[": " ", 45 | "]": " ", 46 | "ë": "e" 47 | } 48 | }, 49 | "fabeln_und_erzhlungen": { 50 | "title": "fabeln_und_erzhlungen", 51 | "LibrivoxBookName": "Fabeln und Erzählungen", 52 | "GutenbergId": 9335, 53 | "GutenbergStart": "", 54 | "GutenbergEnd": "", 55 | "textReplacement": { 56 | "*": " " 57 | } 58 | }, 59 | "toten_seelen": { 60 | "title": "toten_seelen", 61 | "LibrivoxBookName": "toten Seelen", 62 | "GutenbergId": "gogol/toteseel/toteseel.html", 63 | "GutenbergStart": "", 64 | "GutenbergEnd": "irdischen Amtes zu achten, weil wir es schon alle dunkel ahnen und weil wir kaum", 65 | "textReplacement": { 66 | " von Ew.": " von euerer ", 67 | "d. h.": " das heißt ", 68 | " z. B.": " zum Beispiel ", 69 | " N. N.": " N N ", 70 | " usw.": " und so weiter", 71 | " Nr.": " Nummer ", 72 | " St.": " Sankt ", 73 | " Ew.": " Eure ", 74 | "a. 
D.": " a D ", 75 | "a.D.": " a D ", 76 | "1845": "achtzehnhundertfünfundvierzig", 77 | "1850": "achtzehnhundertfünfzig", 78 | "1812": "achtzehnhundertzwölf", 79 | "1814": "achtzehnhundertvierzehn", 80 | "1835": "achtzehnhundertfünfunddreißig", 81 | "1841": "achtzehnhunderteinundvierzig", 82 | "1840": "achtzehnhundertvierzig", 83 | "1842": "achtzehnhundertzweiundvierzig", 84 | "1852": "achtzehnhundertzweiundfünfzig", 85 | "10.": "zehnten", 86 | "21.": "einundzwanzigsten", 87 | "4a": "vier A ", 88 | "34": "vierunddreißig", 89 | "9.": "neunte", 90 | "I.": "Erstens ", 91 | "II.":"Zweitens " 92 | } 93 | }, 94 | "wir_fanden_einen_pfad_neue_gedichte": { 95 | "title": "wir_fanden_einen_pfad_neue_gedichte", 96 | "LibrivoxBookName": "Wir fanden einen Pfad: Neue Gedichte", 97 | "GutenbergId": 9623, 98 | "GutenbergStart": "", 99 | "GutenbergEnd": "", 100 | "textReplacement": { 101 | "F1ügelschuhn": "Flügelschuhn", 102 | "G1ühwürmchen": "Glühwürmchen", 103 | "B1üte,": "Blüte,", 104 | "1912": "neunzehnhundertzwölf" 105 | } 106 | }, 107 | "werde_die_du_bist": { 108 | "title": "werde_die_du_bist", 109 | "LibrivoxBookName": "Werde, die Du bist", 110 | "GutenbergId": "dohm/wiedu/wiedu.html", 111 | "GutenbergStart": "In der Irrenanstalt des Doktor Behrend,", 112 | "GutenbergEnd": "", 113 | "textReplacement": { 114 | " u.s.w.": " und so weiter ", 115 | " Z. B.": " zum Beispiel ", 116 | " z.B.": " zum Beispiel ", 117 | "10.000": "zehntausend", 118 | "1894": "achtzehnhundertvierundneunzig", 119 | "2500": "zweitausendfünfhundert", 120 | "3000": "dreitausend", 121 | "1500": "fünfzehnhundert", 122 | "54": "vierundfünfzig", 123 | "35": "fünfunddreißig", 124 | "18": "achtzehn", 125 | "*": " ", 126 | "ô": "o" 127 | } 128 | }, 129 | "kleine_lord_version_2": { 130 | "title": "kleine_lord_version_2", 131 | "LibrivoxBookName": "kleine Lord (version 2)", 132 | "GutenbergId": "burnett/lord/lord.html", 133 | "GutenbergStart": "Cedrik selbst wußte kein Sterbenswörtchen davon,", 134 | "GutenbergEnd": "", 135 | "textReplacement": { 136 | "4.": "vierte", 137 | "Mr.": " Mister ", 138 | "p. 
s.": " p s ", 139 | "·": " " 140 | } 141 | } 142 | } -------------------------------------------------------------------------------- /huiAudioCorpus/dependencyInjection/DependencyInjection.py: -------------------------------------------------------------------------------- 1 | def disableLog(): 2 | logging.getLogger('matplotlib').disabled = True 3 | logging.getLogger('matplotlib.font_manager').disabled = True 4 | logging.getLogger('matplotlib.colorbar').disabled = True 5 | logging.getLogger('numba.core.ssa').disabled = True 6 | logging.getLogger('numba.core.interpreter').disabled = True 7 | logging.getLogger('numba.core.byteflow').disabled = True 8 | logging.getLogger('numba.ssa').disabled = True 9 | logging.getLogger('numba.byteflow').disabled = True 10 | logging.getLogger('numba.interpreter').disabled = True 11 | logging.getLogger('paramiko.transport.sftp').disabled = True 12 | logging.getLogger('paramiko.transport').disabled = True 13 | logging.getLogger('h5py._conv').disabled = True 14 | logging.getLogger().setLevel(logging.WARNING) 15 | 16 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step8_DatasetStatistic import Step8_DatasetStatistic 17 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step9_GenerateCleanDataset import Step9_GenerateCleanDataset 18 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step7_AudioRawStatistic import Step7_AudioRawStatistic 19 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step3_1_PrepareText import Step3_1_PrepareText 20 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step0_Overview import Step0_Overview 21 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step2_1_AudioStatistic import Step2_1_AudioStatistic 22 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step6_FinalizeDataset import Step6_FinalizeDataset 23 | from huiAudioCorpus.transformer.SentenceDistanceTransformer import SentenceDistanceTransformer 24 | from huiAudioCorpus.calculator.AlignSentencesIntoTextCalculator import AlignSentencesIntoTextCalculator 25 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step5_AlignText import Step5_AlignText 26 | from huiAudioCorpus.converter.AudioToSentenceConverter import AudioToSentenceConverter 27 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step4_TranscriptAudio import Step4_TranscriptAudio 28 | from huiAudioCorpus.persistenz.GutenbergBookPersistenz import GutenbergBookPersistenz 29 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step3_DownloadText import Step3_DownloadText 30 | from huiAudioCorpus.transformer.AudioSplitTransformer import AudioSplitTransformer 31 | from huiAudioCorpus.transformer.AudioLoudnessTransformer import AudioLoudnessTransformer 32 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step2_SplitAudio import Step2_SplitAudio 33 | from huiAudioCorpus.persistenz.AudiosFromLibrivoxPersistenz import AudiosFromLibrivoxPersistenz 34 | from huiAudioCorpus.workflows.createDatasetWorkflow.Step1_DownloadAudio import Step1_DownloadAudio 35 | from huiAudioCorpus.converter.StringToSentencesConverter import StringToSentencesConverter 36 | from frosch import hook 37 | hook(theme = 'paraiso_dark') 38 | import logging 39 | disableLog() 40 | 41 | from huiAudioCorpus.error.DependencyInjectionError import DependencyInjectionError 42 | from huiAudioCorpus.converter.ListToHistogramConverter import ListToHistogramConverter 43 | from huiAudioCorpus.converter.ListToStatisticConverter import ListToStatisticConverter 44 | from huiAudioCorpus.ui.Plot import Plot 45 | from 
huiAudioCorpus.components.TextStatisticComponent import TextStatisticComponent 46 | from huiAudioCorpus.components.AudioStatisticComponent import AudioStatisticComponent 47 | from huiAudioCorpus.utils.PathUtil import PathUtil 48 | from huiAudioCorpus.utils.FileListUtil import FileListUtil 49 | from huiAudioCorpus.converter.TranscriptsToSentencesConverter import TranscriptsToSentencesConverter 50 | from huiAudioCorpus.persistenz.AudioTranscriptPairPersistenz import AudioTranscriptPairPersistenz 51 | from huiAudioCorpus.converter.PhoneticSentenceToSymbolSentenceConverter import PhoneticSentenceToSymbolSentenceConverter 52 | from huiAudioCorpus.converter.SentenceToPhoneticSentenceConverter import SentenceToPhoneticSentenceConverter 53 | from huiAudioCorpus.transformer.AudioAddSilenceTransformer import AudioAddSilenceTransformer 54 | from huiAudioCorpus.transformer.TranscriptsSelectionTransformer import TranscriptsSelectionTransformer 55 | from huiAudioCorpus.transformer.AudioSamplingRateTransformer import AudioSamplingRateTransformer 56 | from huiAudioCorpus.persistenz.TranscriptsPersistenz import TranscriptsPersistenz 57 | from huiAudioCorpus.persistenz.AudioPersistenz import AudioPersistenz 58 | from huiAudioCorpus.filter.AudioFilter import AudioFilter 59 | from huiAudioCorpus.transformer.AudioFadeTransformer import AudioFadeTransformer 60 | from huiAudioCorpus.calculator.TextNormalizer import TextNormalizer 61 | 62 | import inspect 63 | 64 | disableLog() 65 | 66 | 67 | defaultConfig = { 68 | 'audioAddSilenceTransformer': { 69 | 'endDurationSeconds': 0.7, 70 | 'startDurationSeconds': 0 71 | }, 72 | 'listToHistogramConverter': { 73 | 'stepSize':1 74 | } 75 | } 76 | 77 | class DependencyInjection: 78 | #Calculators 79 | alignSentencesIntoTextCalculator: AlignSentencesIntoTextCalculator 80 | textNormalizer: TextNormalizer 81 | 82 | 83 | #Components 84 | audioStatisticComponent: AudioStatisticComponent 85 | textStatisticComponent: TextStatisticComponent 86 | 87 | #Converters 88 | phoneticSentenceToSymbolSentenceConverter:PhoneticSentenceToSymbolSentenceConverter 89 | sentenceToPhoneticSentenceConverter:SentenceToPhoneticSentenceConverter 90 | transcriptsToSentencesConverter:TranscriptsToSentencesConverter 91 | listToStatisticConverter:ListToStatisticConverter 92 | listToHistogramConverter: ListToHistogramConverter 93 | stringToSentencesConverter: StringToSentencesConverter 94 | audioToSentenceConverter: AudioToSentenceConverter 95 | 96 | 97 | #Filters 98 | audioFilter:AudioFilter 99 | 100 | #Persistence 101 | audioPersistenz:AudioPersistenz 102 | audioTranscriptPairPersistenz:AudioTranscriptPairPersistenz 103 | transcriptsPersistenz:TranscriptsPersistenz 104 | audiosFromLibrivoxPersistenz:AudiosFromLibrivoxPersistenz 105 | GutenbergBookPersistenz: GutenbergBookPersistenz 106 | 107 | #Transformers 108 | audioAddSilenceTransformer:AudioAddSilenceTransformer 109 | audioSamplingRateTransformer:AudioSamplingRateTransformer 110 | transcriptsSelectionTransformer:TranscriptsSelectionTransformer 111 | audioSplitTransformer: AudioSplitTransformer 112 | sentenceDistanceTransformer: SentenceDistanceTransformer 113 | audioLoudnessTransformer: AudioLoudnessTransformer 114 | audioFadeTransformer: AudioFadeTransformer 115 | 116 | 117 | #Utilities 118 | pathUtil:PathUtil 119 | fileListUtil: FileListUtil 120 | 121 | #Workflows 122 | step0_Overview: Step0_Overview 123 | step1_DownloadAudio: Step1_DownloadAudio 124 | step2_SplitAudio: Step2_SplitAudio 125 | step2_1_AudioStatistic: Step2_1_AudioStatistic 126 | 
step3_DowloadText: Step3_DownloadText
127 | step3_1_PrepareText: Step3_1_PrepareText
128 | step4_TranscriptAudio: Step4_TranscriptAudio
129 | step5_AlignText: Step5_AlignText
130 | step6_FinalizeDataset: Step6_FinalizeDataset
131 | step7_AudioRawStatistic: Step7_AudioRawStatistic
132 | step8_DatasetStatistic: Step8_DatasetStatistic
133 | step9_GenerateCleanDataset: Step9_GenerateCleanDataset
134 |
135 | #plot
136 | plot: Plot
137 |
138 | def __init__(self, config={}):
139 | configWithDefault = defaultConfig.copy()
140 | configWithDefault.update(config)
141 | self.allClassReferences = self.getAllClassReferences(configWithDefault)
142 | initialedClasses = {}
143 | for name, classInstance in self.allClassReferences.items():
144 | def getLambda (name, classInstance):
145 | return property(lambda _: self.initClass(name, classInstance, self.classConstructor, initialedClasses, configWithDefault, name ))
146 | setattr(DependencyInjection, name, getLambda(name, classInstance))
147 |
148 | def initClass(self, className, classReference , classConstructorMethod, initialedClasses, config , requestedClass = ''):
149 | if className in initialedClasses:
150 | return initialedClasses[className]
151 | arguments = self.getConstructorReferenceClasses(classReference)
152 | for argument in arguments:
153 | if argument not in initialedClasses and arguments[argument] is not None:  # check against the names, not the instances
154 | self.initClass(argument, arguments[argument], classConstructorMethod, initialedClasses, config, requestedClass)
155 |
156 | classConfig = config[className].copy() if className in config else {}
157 | if '#' in classConfig:
158 | classConfig.pop('#')
159 |
160 | try:
161 | newClassInstance = classConstructorMethod(classReference, initialedClasses, classConfig)
162 |
163 | except Exception as e:
164 | raise DependencyInjectionError(e, classConfig, classReference.__name__, requestedClass)
165 | initialedClasses[className] = newClassInstance
166 | return newClassInstance
167 |
168 |
169 | def classConstructor(self,classReference, initialedClasses , classConfig):
170 | classConstructor = classConfig.copy()
171 | references = self.getConstructorReferenceClasses(classReference)
172 | for ref in references:
173 | if references[ref] is not None:
174 | classConstructor[ref] = initialedClasses[ref]
175 | classInstance = classReference(**classConstructor)
176 |
177 | return classInstance
178 |
179 | def getConstructorReferenceClasses(self, classReference):
180 | arguments = self.getAllConstructorArguments(classReference)
181 |
182 | references = {}
183 | for argument in arguments:
184 | if argument in ["self","args","kwargs"]:
185 | continue
186 | references[argument] = self.allClassReferences[argument] if argument in self.allClassReferences.keys() else None
187 | return references
188 |
189 | def getAllConstructorArguments(self, classInstance):
190 | return list(inspect.signature(classInstance.__init__).parameters.keys())
191 |
192 | def getAllClassReferences(self,configWithDefault):
193 | classes = globalClassesAtImportTime.copy()
194 | for className in configWithDefault:
195 | if '#' in configWithDefault[className]:
196 | classes[className] = configWithDefault[className]['#']
197 | return classes
198 |
199 |
200 | globalClassesAtImportTime = DependencyInjection.__dict__.get("__annotations__")
--------------------------------------------------------------------------------
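A minimal usage sketch of the container above (an editorial addition based only on the code in this file): config keys must match the annotated attribute names, every class is resolved lazily on first attribute access and then cached, and the special '#' key swaps in an alternative class reference. Note that configWithDefault.update(config) replaces a class's whole per-class dict, so any defaults you still need must be repeated. CustomAudioFilter is a hypothetical stand-in, not part of the repository:

    from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection

    class CustomAudioFilter:  # hypothetical replacement with an argument-free constructor
        def __init__(self):
            pass

    # Override constructor arguments of one class; the per-class dict is replaced
    # wholesale, so startDurationSeconds is repeated although it equals the default.
    di = DependencyInjection({'audioAddSilenceTransformer': {'endDurationSeconds': 1.0, 'startDurationSeconds': 0}})
    transformer = di.audioAddSilenceTransformer  # built on first access, cached afterwards

    # Swap an implementation entirely via the '#' key (a class reference, not an instance).
    di2 = DependencyInjection({'audioFilter': {'#': CustomAudioFilter}})
    customFilter = di2.audioFilter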
/huiAudioCorpus/calculator/TextNormalizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | import argparse
3 | from pathlib import Path
4 |
5 | number_mappings = {
6 | "0" : "null",
7 | "1" : "ein",
8 | "2" : "zwei",
9 | "3" : "drei",
10 | "4" : "vier",
11 | "5" : "fünf",
12 | "6" : "sechs",
13 | "7" : "sieben",
14 | "8" : "acht",
15 | "9" : "neun",
16 | "10" : "zehn",
17 | "11" : "elf",
18 | "12" : "zwölf",
19 | "13" : "dreizehn",
20 | "14" : "vierzehn",
21 | "15" : "fünfzehn",
22 | "16" : "sechzehn",
23 | "17" : "siebzehn",
24 | "18" : "achtzehn",
25 | "19" : "neunzehn",
26 | "20" : "zwanzig",
27 | "30" : "dreißig",
28 | "60" : "sechzig",
29 | "70" : "siebzig",
30 | "100" : "einhundert"
31 | }
32 | ordinal_mappings = {
33 | "1" : "erste",
34 | "3" : "dritte",
35 | "7" : "siebte",
36 | "8" : "achte",
37 | }
38 | customs_mappings = {
39 | "¼" : "ein viertel",
40 | "½" : "einhalb",
41 | "¾" : "drei viertel",
42 | }
43 | '''
44 | ordinal_genders = {
45 | ["diese"] : "te",
46 | ["als"] : "ter",
47 | [""] : "tes",
48 | ["am", "zum", "en", "im", "die", "dieser", "diese", "em"] : "ten",
49 | }
50 | '''
51 | def number_literal(number):  # recursively spells a non-negative integer in German words
52 | x_str = str(number)
53 | if x_str in number_mappings:
54 | return number_mappings[x_str]
55 | x_str_left = x_str[0]
56 | x_str_right = x_str[1:].lstrip("0")
57 | if len(x_str) == 8:  # tens of millions
58 | x_str_left = x_str[0:2]
59 | x_str_right = x_str[2:].lstrip("0")
60 | if x_str_right != "":
61 | return number_literal(x_str_left)+"millionen"+number_literal(x_str_right)
62 | else:
63 | return number_literal(x_str_left)+"millionen"
64 | if len(x_str) == 7:  # millions
65 | x_str_left = x_str[0]
66 | x_str_right = x_str[1:].lstrip("0")
67 | if x_str_right != "":
68 | return number_literal(x_str_left)+"millionen"+number_literal(x_str_right)
69 | else:
70 | return number_literal(x_str_left)+"millionen"
71 | if len(x_str) == 6:  # hundreds of thousands
72 | x_str_left = x_str[0:3]
73 | x_str_right = x_str[3:].lstrip("0")
74 | if x_str_right != "":
75 | return number_literal(x_str_left)+"tausend"+number_literal(x_str_right)
76 | else:
77 | return number_literal(x_str_left)+"tausend"
78 | if len(x_str) == 5:  # tens of thousands
79 | x_str_left = x_str[0:2]
80 | x_str_right = x_str[2:].lstrip("0")
81 | if x_str_right != "":
82 | return number_literal(x_str_left)+"tausend"+number_literal(x_str_right)
83 | else:
84 | return number_literal(x_str_left)+"tausend"
85 |
86 | if len(x_str) == 4:  # years 1200-1999 are read as <hundreds>"hundert"<rest>
87 | if x_str_right != "":
88 | if int(number) >= 1200 and int(number) < 2000:
89 | decade = x_str[2:].lstrip("0")
90 | if decade != "":
91 | return number_literal(x_str[0:2])+"hundert"+number_literal(x_str[2:].lstrip("0"))
92 | else:
93 | return number_literal(x_str[0:2])+"hundert"
94 | else:
95 | return number_literal(x_str_left)+"tausend"+number_literal(x_str_right)
96 | else:
97 | return number_literal(x_str_left)+"tausend"
98 | if len(x_str) == 3:
99 | if x_str_right != "":
100 | return number_literal(x_str_left)+"hundert"+number_literal(x_str_right)
101 | else:
102 | return number_literal(x_str_left)+"hundert"
103 | if len(x_str) == 2:  # units + "und" + tens; regular tens fall back to <digit>"zig"
104 | if x_str_right != "":
105 | return number_literal(x_str_right)+"und"+number_literal(x_str_left+"0")
106 | else:
107 | return number_literal(x_str_left) + "zig"
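# Worked examples for orientation (editorial sketch; the outputs agree with
# the hand-written replacement tables in scripts/createDatasetConfig):
#   number_literal("34")   -> "vierunddreißig"
#   number_literal("2500") -> "zweitausendfünfhundert"
#   number_literal("1845") -> "achtzehnhundertfünfundvierzig"
# Inputs with more than 8 digits match none of the branches and return None.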
108 | class TextNormalizer:
109 | def __init__(self) -> None:
110 | pass
111 |
112 | def normalize_rationals(self, input_sentence:str):
113 | rationals = re.findall(r"(\d+[\. ']*\d*,\d+)",input_sentence)
114 | for rational in rationals:
115 | number, decimals = rational.split(",")
116 | normalized_number = self.normalize_integer(number)
117 | if number == "1":
118 | normalized_number = normalized_number + "s"
119 | decimals_list = []
120 | for decimal in decimals:
121 | normalized_decimal = self.normalize_integer(decimal)
122 | if decimal == "1":
123 | normalized_decimal = normalized_decimal + "s"
124 | decimals_list.append(normalized_decimal)
125 | normalized_rational = normalized_number + " komma " + " ".join(decimals_list)
126 | input_sentence = re.sub(re.escape(rational), normalized_rational, input_sentence)
127 | return input_sentence
128 |
129 | def normalize_time(self, input_sentence:str):
130 | times = re.findall(r"(\d{1,2}[\.:]\d{1,2}(?:( Uhr)?))(?!\d)",input_sentence)
131 |
132 | if not len(times) > 0:
133 | return input_sentence
134 | if type(times[0]) is tuple:
135 | temp_times = []
136 | for t, _ in times:
137 | temp_times.append(t)
138 | times = temp_times
139 | for time in times:
140 | hour, minute = time.split()[0].replace(".",":").split(":")
141 | if len(hour) > 2 or len(minute) > 2:
142 | print("time expression too long, skipped: " + time)
143 | continue
144 | if len(hour) == 2 and hour.startswith("0"):
145 | hour = hour[1]
146 | hour = self.normalize_integer(hour).capitalize()
147 |
148 | if len(minute) == 2 and minute.startswith("0"):
149 | minute = minute[1]
150 | if minute == "0":
151 | minute = ""
152 | else:
153 | minute = " "+self.normalize_integer(minute).capitalize()
154 | normalized_time = hour + " Uhr" + minute
155 | input_sentence = re.sub(re.escape(time), normalized_time, input_sentence)
156 |
157 | return input_sentence
158 |
159 | def normalize_date(self, input_sentence:str):
160 | dates = re.findall(r"(\d{1,2}\.\d{1,2}\.\d{2,4})",input_sentence)
161 | for date in dates:
162 | day, month, year = date.split(".")
163 | day = self.normalize_ordinal(day.lstrip("0")+".")
164 | month = self.normalize_ordinal(month.lstrip("0")+".")
165 | year = self.normalize_integer(year.lstrip("0"))
166 | normalized_date = " ".join([day, month, year])
167 | input_sentence = re.sub(re.escape(date), normalized_date, input_sentence)
168 |
169 | return input_sentence
170 |
171 | def normalize_ordinal(self, input_sentence:str):
172 | ordinals = re.findall(r"([\.]*\d+[\. ']*\d*)\.(?!\d)",input_sentence)
173 | for number in ordinals:
174 | normalized_number = number
175 | if len(normalized_number) > 2:
176 | if normalized_number[-2] == "0" and normalized_number[-1] in ordinal_mappings:
177 | temp_number = self.normalize_integer(normalized_number[:-2]+"00")
178 | normalized_number = temp_number + ordinal_mappings[normalized_number[-1]]  # e.g. "103." -> "einhundertdritte"
179 | else:
180 | normalized_number = self.normalize_integer(normalized_number)+"te"
181 | elif len(normalized_number) == 2:
182 | normalized_number = self.normalize_integer(number)
183 | normalized_number+="sten"
184 | else:
185 | if normalized_number in ordinal_mappings:
186 | normalized_number = ordinal_mappings[normalized_number]
187 | else:
188 | normalized_number = self.normalize_integer(normalized_number)+"te"
189 | input_sentence = re.sub(re.escape(number) + r"\.", normalized_number, input_sentence)
190 | return input_sentence
191 |
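# End-to-end sketch (editorial; obtained by tracing the methods above):
#   TextNormalizer().normalize("Am 21. Juni 1845 um 9.40")
#     -> "Am einundzwanzigsten Juni achtzehnhundertfünfundvierzig um Neun Uhr Vierzig"
# Grammatical case endings are not resolved (see the commented-out
# ordinal_genders sketch above): a bare "3." always becomes "dritte",
# whatever its context requires.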
192 | def normalize_integer(self, input_sentence:str):
193 | numbers = re.findall(r"(\d+[\. ']*\d*)",input_sentence)
194 | for number in numbers:
195 | number_cleaned = number.replace(" ","").replace(".", "").replace("'","")
196 | number = number.strip()
197 | normalized_number = number_literal(number_cleaned)
198 | input_sentence = re.sub(re.escape(number), normalized_number, input_sentence)
199 |
200 | return input_sentence
201 |
202 | def normalize_customs(self, input_sentence:str):
203 | for custom_character in customs_mappings:
204 | if custom_character in input_sentence:
205 | input_sentence = input_sentence.replace(" "+custom_character, " "+customs_mappings[custom_character])
206 | input_sentence = input_sentence.replace(custom_character, " "+customs_mappings[custom_character])
207 | return input_sentence
208 |
209 | def normalize_percent(self, input_sentence:str):
210 | numbers = re.findall(r"(\d+%)",input_sentence)
211 | for number in numbers:
212 | number_cleaned = number.replace(" ","").replace(".", "").replace("'","")
213 | number = number.strip()
214 | normalized_number = number_literal(number_cleaned[:-1]) + " prozent"
215 | input_sentence = re.sub(re.escape(number), normalized_number, input_sentence)
216 | return input_sentence
217 |
218 | def normalize(self, input_sentence:str):
219 | input_sentence = self.normalize_percent(input_sentence)
220 | input_sentence = self.normalize_rationals(input_sentence)
221 | input_sentence = self.normalize_time(input_sentence)
222 | input_sentence = self.normalize_date(input_sentence)
223 | input_sentence = self.normalize_ordinal(input_sentence)
224 | input_sentence = self.normalize_integer(input_sentence)
225 | input_sentence = self.normalize_customs(input_sentence)
226 |
227 | return input_sentence
228 |
229 | def main():
230 | parser = argparse.ArgumentParser(description="Normalizer control")
231 |
232 | parser.add_argument("--files", required=True, action="append")
233 | parser.add_argument("--save_path", required=True)
234 |
235 | args = parser.parse_args()
236 |
237 | normalizer = TextNormalizer()
238 |
239 | normalized_sentences = []
240 |
241 | for text_file in args.files:
242 | with open(text_file, encoding="UTF-8") as file:
243 | lines = file.readlines()
244 | for line in lines:
245 | normalized_line = normalizer.normalize(line)
246 | normalized_sentences.append(normalized_line)
247 | text_file_name = Path(text_file).name
248 | with open(args.save_path+text_file_name+"_normalized.txt", "w", encoding="UTF-8") as file:  # save_path is concatenated as-is and must end with a path separator
249 | file.writelines(normalized_sentences)
250 | normalized_sentences = []
251 |
252 | if __name__ == "__main__":
253 | main()
--------------------------------------------------------------------------------
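Step3_1_PrepareText below consumes one entry of the createDatasetConfig JSON files, routed through scripts/createDataset.py. A sketch of an entry's shape, with field names taken from those configs and every value invented for illustration (moves, remove and remapSort are optional):

    example_entry = {
        "example_title": {
            "title": "example_title",                  # also used as the book's folder name
            "LibrivoxBookName": "Example Book",        # exact Librivox title to download
            "GutenbergId": 12345,                      # numeric id, or an HTML path for the German Projekt Gutenberg mirror (as in the configs above)
            "GutenbergStart": "First kept sentence",   # "" keeps the text from the beginning
            "GutenbergEnd": "",                        # "" keeps the text to the end
            "textReplacement": {"1845": "achtzehnhundertfünfundvierzig"},
            "moves": [],                               # see move() below
            "remove": []                               # see remove() below
        }
    }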
/huiAudioCorpus/workflows/createDatasetWorkflow/Step3_1_PrepareText.py:
--------------------------------------------------------------------------------
1 |
2 | from huiAudioCorpus.utils.PathUtil import PathUtil
3 | from typing import Dict, List
4 | from huiAudioCorpus.utils.DoneMarker import DoneMarker
5 | from huiAudioCorpus.calculator.TextNormalizer import TextNormalizer
6 |
7 | import re
8 | import json
9 |
10 | class Step3_1_PrepareText:
11 |
12 | def __init__(self, savePath: str, loadFile: str, saveFile: str, startSentence: str, endSentence: str, textReplacement: Dict[str,str], textNormalizer: TextNormalizer, moves: List[Dict[str, str]], remove: List[Dict[str, str]]):
13 | self.savePath = savePath
14 | self.textNormalizer = textNormalizer
15 | self.loadFile = loadFile
16 | self.saveFile = saveFile
17 | self.textReplacement = textReplacement
18 | self.pathUtil = PathUtil()
19 | self.startSentence = startSentence
20 | self.endSentence = endSentence
21 | self.moves = moves
22 | self.removes = remove
23 |
24 | def run(self):
25 | return DoneMarker(self.savePath).run(self.script)
26 |
27 | def script(self):
28 | inputText = self.pathUtil.loadFile(self.loadFile)
29 | cuttedText = self.cutText(inputText, self.startSentence, self.endSentence)
30 | removedText = self.remove(cuttedText, self.removes)
31 | replacedText = self.replace(removedText, self.textReplacement)
32 | movedText = self.move(replacedText, self.moves)
33 | self.pathUtil.writeFile(movedText, self.saveFile)
34 |
35 | def move(self, text: str, moves: List[Dict[str, str]]):
36 | for move in moves:
37 | start = move['start']
38 | end = move['end']
39 | after = move['after']
40 | textToMove = text.partition(start)[-1].partition(end)[0] + end
41 | textWithoutMove = text.replace(textToMove, "")
42 | first, separator, last = textWithoutMove.partition(after)
43 | finalText = first + separator + textToMove + last
44 | text = finalText
45 | return text
46 |
47 | def remove(self, text: str, removes: List[Dict[str, str]]):
48 | for remove in removes:
49 | textToRemove = ""
50 | textToRemove_old = None
51 | start = remove['start']
52 | end = remove['end']
53 | while textToRemove != textToRemove_old:
54 | textToRemove_old = textToRemove
55 | textToRemove = start + text.partition(start)[-1].partition(end)[0] + end
56 | text = text.replace(textToRemove, "")
57 | print(textToRemove)
58 | return text
59 |
60 |
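# Sketch of the two helpers above, on toy strings (editorial illustration,
# not taken from a real config):
#   move("A B C D", [{'start': 'B', 'end': 'C', 'after': 'D'}]) -> "A B D C"
#     the span after 'start' up to and including 'end' is cut out and
#     re-inserted after the first occurrence of 'after'; 'start' itself stays.
#   remove("keep <x> junk <y> keep", [{'start': '<x>', 'end': '<y>'}]) -> "keep  keep"
#     the loop re-extracts until the start..end span stops changing, so
#     repeated occurrences of the same pair are removed one per pass.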
61 | def cutText(self, text: str, startSentence: str, endSentence: str):
62 | if startSentence =="":
63 | withoutFirst = text
64 | else:
65 | withoutFirst = startSentence + text.split(startSentence, 1)[1]
66 |
67 | if endSentence=="":
68 | withoutEnd = withoutFirst
69 | else:
70 | withoutEnd = withoutFirst.split(endSentence,1)[0] + endSentence
71 |
72 | stripped = withoutEnd.strip()
73 | prepared = stripped.replace('\r', '')
74 | return prepared
75 |
76 | def replace(self, text: str, textReplacement: Dict[str,str]):
77 | beforeReplacement = {
78 | '\xa0': ' '
79 | }
80 | baseReplacement = {
81 | '...': '.',
82 | '«': ' ',
83 | '»': ' ',
84 | "'": '',
85 | '"': ' ',
86 | '_': ' ',
87 | '-': ' ',
88 | '–': ' ',
89 | ';': ',',
90 | ':': ':',
91 | '’': ' ',
92 | '‘': ' ',
93 | '<': ' ',
94 | '>': ' ',
95 | '(': ' ',
96 | ')': ' ',
97 | '›': ' ',
98 | '‹': ' ',
99 | 'é': 'e',
100 | 'ê': 'e',
101 | '^': ' ',
102 | 'è': 'e',
103 | 'à': 'a',
104 | 'á': 'a'
105 |
106 | }
107 |
108 | abbreviations = {
109 | ' H. v.': ' Herr von ',
110 | '†': ' gestorben ',
111 | ' v.': ' von ',
112 | '§': ' Paragraph ',
113 | ' geb.': ' geboren ',
114 | ' u.': ' und ',
115 | '&': ' und ',
116 | ' o.': ' oder ',
117 | ' Nr.': ' Nummer ',
118 | ' Pf.': ' Pfennig ',
119 | ' Mk.': ' Mark ',
120 | " Sr. Exz.": " seiner exzellenz ",
121 | " Kgl.": " königlich ",
122 | " Dr.": ' Doktor ',
123 | ' Abb.': ' Abbildung ',
124 | ' Abh.': ' Abhandlung ',
125 | ' Abk.': ' Abkürzung ',
126 | ' allg.': ' allgemein ',
127 | ' bes.': ' besonders ',
128 | ' bzw.': ' beziehungsweise ',
129 |
130 | ' gegr.': ' gegründet ',
131 | ' jmd.': ' jemand ',
132 | ' o. Ä.': ' oder Ähnliches ',
133 | ' u. a.': ' unter anderem ',
134 | ' o.Ä.': ' oder Ähnliches ',
135 | ' u.a.': ' unter anderem ',
136 | ' ugs.': ' umgangssprachlich ',
137 | ' urspr.': ' ursprünglich ',
138 | ' usw.': ' und so weiter',
139 | ' u. s. w.': ' und so weiter ',
140 | ' u.s.w.': ' und so weiter ',
141 | ' zz.': ' zurzeit ',
142 | ' dt.': ' deutsch',
143 | ' ev.': ' evangelisch ',
144 | ' Jh.': ' Jahrhundert ',
145 | ' kath.': ' katholisch ',
146 | ' lat.': ' lateinisch ',
147 | ' luth.': ' lutherisch ',
148 | ' Myth.': ' Mythologie ',
149 | ' natsoz.': ' nationalsozialistisch ',
150 | ' n.Chr.': ' nach Christus ',
151 | ' n. Chr.': ' nach Christus ',
152 | ' relig.': ' religiös ',
153 | ' v. Chr.': ' vor Christus ',
154 | ' v.Chr.': ' vor Christus ',
155 | ' Med.': ' Medizin ',
156 | ' Mio.': ' Millionen ',
157 | ' d.h.': ' das heißt ',
158 | ' d. h.': ' das heißt ',
159 |
160 | ' f.': ' folgende ',
161 | ' ff.': ' folgende ',
162 | ' ggf.': ' gegebenenfalls ',
163 | ' i. Allg.': ' im Allgemeinen ',
164 | ' i. d. R.': ' in der Regel ',
165 | ' i.Allg.': ' im Allgemeinen ',
166 | ' i.d.R.': ' in der Regel ',
167 | ' lt.': ' laut ',
168 | ' m.': ' mit ',
169 | ' od.': ' oder ',
170 | ' s. o.': ' siehe oben ',
171 | ' s. u.': ' siehe unten ',
172 | ' s.o.': ' siehe oben ',
173 | ' s.u.': ' siehe unten ',
174 | ' Std.': ' Stunde ',
175 | ' tägl.': ' täglich ',
176 | ' Tsd.': ' Tausend ',
177 | ' tsd.': ' tausend ',
178 |
179 | ' z. B.': ' zum Beispiel ',
180 | ' z.B.': ' zum Beispiel ',
181 | ' Z. B.': ' zum Beispiel ',
182 | ' Z.B.': ' zum Beispiel ',
183 | ' Bsp.': ' Beispiel ',
184 | ' bzgl.': ' bezüglich ',
185 | ' ca.': ' circa ',
186 | ' dgl.': ' dergleichen ',
187 | ' etc.': ' et cetera ',
188 | ' evtl.': ' eventuell ',
189 | ' z.T.': ' zum Teil ',
190 | ' z. T.': ' zum Teil ',
191 | ' zit.': ' zitiert ',
192 | ' zzgl.': ' zuzüglich ',
193 | ' H. ': ' Herr ',
194 | ' N. N.': ' so und so ',
195 | ' N.N.': ' so und so ',
196 | ' u.s.f.': ' und so fort',
197 | ' u. s. f.': ' und so fort',
198 | ' von Ew.': ' von euerer ',
199 | ' Se.': ' seine ',
200 | ' St.': ' Sankt ',
201 | ' inkl.': ' inklusive ',
202 | 'U.S.A.': ' U S A ',
203 | ' d. J': 'des Jahres ',
204 | 'G.m.b.H.': ' GmbH ',
205 | ' Mr.': ' Mister ',
206 | '°': ' Grad ',
207 | ' m. E.': ' meines Erachtens ',
208 | ' m.E.': ' meines Erachtens ',
209 | ' Ew.': ' Eure ',
210 | ' a.O.': ' an der Oder ',
211 | ' d.': ' der ',
212 | ' Ev.': ' Evangelium ',
213 | ' Sr.': ' seiner ',
214 | ' hl.': ' heilige ',
215 | ' Hr.': ' Herr ',
216 | 'd.i.': ' das ist ',
217 | ' Aufl.': ' Auflage ',
218 | "A. d. Üb.":" Anmerkung der Übersetzerin ",
219 | " gest.": " gestorben "
220 |
221 |
222 |
223 | }
224 | for input, target in beforeReplacement.items():
225 | text = text.replace(input,target)
226 | for input, target in textReplacement.items():
227 | text = text.replace(input,target)
228 | for input, target in baseReplacement.items():
229 | text = text.replace(input,target)
230 |
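# Ordering note: the book-specific textReplacement runs after the
# non-breaking-space fix in beforeReplacement but before baseReplacement,
# so config entries still see the original punctuation. The abbreviations
# table above is never applied automatically; it only feeds the checks
# below, which print a suggested replacement dict and then raise, so that
# every expansion is reviewed and copied into the book's config by hand.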
231 | self.pathUtil.writeFile(text, self.saveFile)
232 |
233 | remainingNumbers = [s for s in text.split() if bool(re.search(r'\d', s))]
234 | if len(remainingNumbers)>0:
235 | print('there are remaining numbers inside the text')
236 | print(remainingNumbers)
237 | replacements = {}
238 | for word in remainingNumbers:
239 | replacements[word] = self.textNormalizer.normalize(word)
240 | replacements = dict(sorted(replacements.items(), key=lambda item: len(item[0]), reverse=True))
241 | print(json.dumps(replacements, indent=4, ensure_ascii=False))
242 |
243 | raise Exception('there are remaining numbers inside the text')
244 |
245 | remainingAbbreviations = [ab for ab in abbreviations.keys() if ab in text]
246 | if len(remainingAbbreviations)>0:
247 | print('there are remaining abbreviations inside the text')
248 | print(remainingAbbreviations)
249 | replacements = {key: value for (key,value) in abbreviations.items() if key in remainingAbbreviations}
250 | replacements = dict(sorted(replacements.items(), key=lambda item: len(item[0]), reverse=True))
251 | print(json.dumps(replacements, indent=4, ensure_ascii=False))
252 | raise Exception('there are remaining abbreviations inside the text')
253 |
254 | aToZ = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
255 | possibleAbbreviations = [' '+char+'.' for char in 'abcdefghijklmnopqrstuvwxyz' if ' '+char+'.' in text] + [' '+char+char2+'.' for char in aToZ for char2 in aToZ if ' '+char+char2+'.' in text]
256 | shortWords = [' Co.', ' go.', ' Da.',' na.',' ab.', ' an.', ' da.', ' du.', ' er.', ' es.', ' ja.', ' so.', ' um.', ' zu.', ' Ja.', ' Ad.', ' je.', ' Es.', ' ob.', ' is.', ' tu.', ' Hm.', ' So.', ' wo.', ' ha.', ' he.', ' Du.', ' du.', ' Nu.', ' in.']
257 | possibleAbbreviations = [ab for ab in possibleAbbreviations if ab not in shortWords]
258 | if len(possibleAbbreviations)>0:
259 | print('there are remaining possible abbreviations inside the text')
260 | print(possibleAbbreviations)
261 | raise Exception('there are remaining possible abbreviations inside the text')
262 |
263 | allowedChars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ äöüßÖÄÜ .,;?!:" \n'
264 | remainingNotAllowedChars = [char for char in text if char not in allowedChars]
265 | if len(remainingNotAllowedChars)>0:
266 | print('there are remaining disallowed chars inside the text')
267 | print(remainingNotAllowedChars)
268 | raise Exception('there are remaining disallowed chars inside the text')
269 | return text
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /scripts/createDataset.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from huiAudioCorpus.dependencyInjection.DependencyInjection import DependencyInjection 3 | import datasetWorkflow 4 | import scripts.createDatasetConfig as createDatasetConfig 5 | from huiAudioCorpus.utils.PathUtil import PathUtil 6 | import os 7 | 8 | pathUtil = PathUtil() 9 | basePath = createDatasetConfig.__path__[0] # type: ignore 10 | 11 | externalPaths = [ 12 | ] 13 | 14 | dataBasePath = datasetWorkflow.__path__[0] # type: ignore 15 | for path in externalPaths: 16 | if pathUtil.fileExists(path): 17 | dataBasePath = path 18 | 19 | def logStep(name): 20 | print('') 21 | print('') 22 | print('#######################################################') 23 | print(name) 24 | print('#######################################################') 25 | print('') 26 | 27 | ### load all configurations 28 | bernd_1 = pathUtil.loadJson( 29 | basePath + '/Bernd_Ungerer_tausendUndEineNacht.json') 30 | bernd_2 = pathUtil.loadJson(basePath + '/Bernd_Ungerer_other.json') 31 | bernd = {**bernd_1, **bernd_2} 32 | hokuspokus = pathUtil.loadJson(basePath + '/Hokuspokus.json') 33 | redaer = pathUtil.loadJson(basePath + '/redaer.json') 34 | friedrich = pathUtil.loadJson(basePath + '/Friedrich.json') 35 | eva = pathUtil.loadJson(basePath + '/Eva.json') 36 | karlsson = pathUtil.loadJson(basePath + '/Karlsson.json') 37 | sonja = pathUtil.loadJson(basePath + '/Sonja.json') 38 | 39 | allLibriboxIds = [author[key]['LibrivoxBookName'] for author in [ 40 | bernd, hokuspokus, friedrich, eva, karlsson, redaer] for key in author] 41 | duplicatIds = set([x for x in allLibriboxIds if allLibriboxIds.count(x) > 1]) 42 | 43 | if len(duplicatIds) > 0: 44 | raise Exception("Duplicate Librivox ids: " + str(duplicatIds)) 45 | 46 | 47 | # configere this object to only create a single speacker 48 | allConfigs = {**bernd, **hokuspokus, **friedrich, **eva, **karlsson, **sonja} 49 | allConfigs = sonja 50 | #allConfigs = redaer 51 | 52 | # this is needed for the statistic and split into others 53 | specialSpeackers = ['Bernd_Ungerer', 'Eva_K', 'Friedrich', 'Hokuspokus', 'Karlsson'] 54 | 55 | workflowConfig = { 56 | 'continueOnError': False, 57 | 'prepareAudio': True, 58 | 'prepareText': True, 59 | 'transcriptText': True, 60 | 'alignText': True, 61 | 'finalize': True, 62 | 'audioRawStatistic': True, 63 | 'cleanStatistic': True, 64 | 'fullStatistic': True, 65 | 'generateClean': True 66 | } 67 | 68 | 69 | step0Path = dataBasePath + '/overview' 70 | logStep('Step0_Overview') 71 | config = { 72 | 'audiosFromLibrivoxPersistenz': { 73 | 'bookName': '', 74 | 'savePath': '', 75 | 'chapterPath': '' 76 | }, 77 | 'step0_Overview': { 78 | 'savePath': step0Path 79 | } 80 | } 81 | DependencyInjection(config).step0_Overview.run() 82 | 83 | finalDatasetPath = dataBasePath + '/finalDataset' 84 | finalDatasetPathClean = dataBasePath + '/finalDatasetClean' 85 | step7Path = dataBasePath + 
'/rawStatistic' 86 | setp8Path = dataBasePath + '/datasetStatistic' 87 | setp8Path_clean = dataBasePath + '/datasetStatisticClean' 88 | 89 | 90 | def cleanFilter(input): 91 | input = input[input['minSilenceDB'] < -50] 92 | input = input[input['silencePercent'] < 45] 93 | input = input[input['silencePercent'] > 10] 94 | return input 95 | 96 | def runWorkflow(params: Dict, workflowConfig: Dict): 97 | print(params) 98 | bookBasePath = dataBasePath + '/books/' 99 | 100 | step1Path = bookBasePath + params['title'] + '/Step1_DownloadAudio' 101 | step1PathAudio = step1Path + '/audio' 102 | step1PathChapter = step1Path + '/chapter.csv' 103 | step2Path = bookBasePath + params['title'] + '/Step2_SplitAudio' 104 | step2_1_Path = bookBasePath + params['title'] + '/Step2_1_AudioStatistic' 105 | 106 | step2PathAudio = step2Path + '/audio' 107 | step3Path = bookBasePath + params['title'] + '/Step3_DownloadText' 108 | step3PathText = step3Path + '/text.txt' 109 | step3_1_Path = bookBasePath + params['title'] + '/Step3_1_PrepareText' 110 | step3_1_PathText = step3_1_Path + '/text.txt' 111 | 112 | step4Path = bookBasePath + params['title'] + '/Step4_TranscriptAudio' 113 | step5Path = bookBasePath + params['title'] + '/Step5_AlignText' 114 | step6Path = bookBasePath + params['title'] + '/Step6_FinalizeDataset' 115 | 116 | if workflowConfig['prepareAudio']: 117 | logStep('Step1_DowloadAudio') 118 | config = { 119 | 'audiosFromLibrivoxPersistenz': { 120 | 'bookName': params['LibrivoxBookName'], 121 | 'savePath': step1PathAudio + '/', 122 | 'chapterPath': step1PathChapter 123 | }, 124 | 'step1_DownloadAudio': { 125 | 'savePath': step1Path 126 | } 127 | } 128 | DependencyInjection(config).step1_DownloadAudio.run() 129 | 130 | logStep('Step2_SplitAudio') 131 | config = { 132 | 'audioSplitTransformer': { 133 | 'minAudioDuration': 5, 134 | 'maxAudioDuration': 40 135 | }, 136 | 'audioPersistenz': { 137 | 'loadPath': step1PathAudio, 138 | 'savePath': step2PathAudio, 139 | 'fileExtension': 'mp3' 140 | }, 141 | 'audioLoudnessTransformer': { 142 | 'loudness': -20 143 | }, 144 | 'step2_SplitAudio': { 145 | 'bookName': params['title'], 146 | 'savePath': step2Path, 147 | 'remapSort': params['remapSort'] if 'remapSort' in params else None 148 | } 149 | } 150 | DependencyInjection(config).step2_SplitAudio.run() 151 | 152 | logStep('Step2_1_AudioStatistic') 153 | config = { 154 | 'step2_1_AudioStatistic': { 155 | 'savePath': step2_1_Path, 156 | }, 157 | 'audioPersistenz': { 158 | 'loadPath': step2PathAudio 159 | }, 160 | 'plot': { 161 | 'showDuration': 1, 162 | 'savePath': step2_1_Path 163 | } 164 | } 165 | DependencyInjection(config).step2_1_AudioStatistic.run() 166 | 167 | if workflowConfig['prepareText']: 168 | logStep('Step3_DowloadText') 169 | config = { 170 | 'GutenbergBookPersistenz': { 171 | 'textId': params['GutenbergId'], 172 | 'savePath': step3PathText 173 | }, 174 | 'step3_DowloadText': { 175 | 'savePath': step3Path 176 | } 177 | } 178 | DependencyInjection(config).step3_DowloadText.run() 179 | 180 | logStep('Step3_1_PrepareText') 181 | config = { 182 | 'step3_1_PrepareText': { 183 | 'savePath': step3_1_Path, 184 | 'loadFile': step3PathText, 185 | 'saveFile': step3_1_PathText, 186 | 'textReplacement': params['textReplacement'], 187 | 'startSentence': params['GutenbergStart'], 188 | 'endSentence': params['GutenbergEnd'], 189 | 'moves': params['moves'] if 'moves' in params else [], 190 | 'remove': params['remove'] if 'remove' in params else [] 191 | } 192 | } 193 | 
DependencyInjection(config).step3_1_PrepareText.run() 194 | 195 | if workflowConfig['transcriptText']: 196 | logStep('Step4_TranscriptAudio') 197 | config = { 198 | 'step4_TranscriptAudio': { 199 | 'savePath': step4Path, 200 | }, 201 | 'audioPersistenz': { 202 | 'loadPath': step2PathAudio 203 | }, 204 | 'transcriptsPersistenz': { 205 | 'loadPath': step4Path, 206 | } 207 | } 208 | DependencyInjection(config).step4_TranscriptAudio.run() 209 | 210 | if workflowConfig['alignText']: 211 | logStep('Step5_AlignText') 212 | config = { 213 | 'step5_AlignText': { 214 | 'savePath': step5Path, 215 | 'textToAlignPath': step3_1_PathText 216 | }, 217 | 'transcriptsPersistenz': { 218 | 'loadPath': step4Path, 219 | 'savePath': step5Path 220 | } 221 | } 222 | DependencyInjection(config).step5_AlignText.run() 223 | 224 | if workflowConfig['finalize']: 225 | logStep('Step6_FinalizeDataset') 226 | config = { 227 | 'step6_FinalizeDataset': { 228 | 'savePath': step6Path, 229 | 'chapterPath': step1PathChapter 230 | }, 231 | 'audioPersistenz': { 232 | 'loadPath': step2PathAudio, 233 | 'savePath': finalDatasetPath 234 | }, 235 | 'transcriptsPersistenz': { 236 | 'loadPath': step5Path, 237 | 'savePath': finalDatasetPath 238 | } 239 | } 240 | DependencyInjection(config).step6_FinalizeDataset.run() 241 | 242 | 243 | summary = {} 244 | for configName in allConfigs: 245 | print('+++++++++++++++++++++++++++++++++++++++++') 246 | print('+++++++++++++++++++++++++++++++++++++++++') 247 | print('+++++++++++++++++++++++++++++++++++++++++') 248 | logStep(configName) 249 | print('+++++++++++++++++++++++++++++++++++++++++') 250 | print('+++++++++++++++++++++++++++++++++++++++++') 251 | print('+++++++++++++++++++++++++++++++++++++++++') 252 | 253 | config = allConfigs[configName] 254 | if workflowConfig['continueOnError']: 255 | try: 256 | runWorkflow(config, workflowConfig) 257 | summary[config['title']] = 'finished' 258 | except: 259 | summary[config['title']] = 'error' 260 | else: 261 | runWorkflow(config, workflowConfig) 262 | print(summary) 263 | 264 | if workflowConfig['audioRawStatistic']: 265 | logStep('audioRawStatistic') 266 | diConfig = { 267 | 'step7_AudioRawStatistic': { 268 | 'savePath': step7Path, 269 | 'loadPath': finalDatasetPath 270 | } 271 | } 272 | DependencyInjection(diConfig).step7_AudioRawStatistic.run() 273 | 274 | if workflowConfig['fullStatistic']: 275 | logStep('fullStatistic') 276 | diConfig = { 277 | 'step8_DatasetStatistic': { 278 | 'savePath': setp8Path, 279 | 'loadPath': step7Path + '/overview.csv', 280 | 'specialSpeackers': specialSpeackers, 281 | 'filter': None 282 | }, 283 | 'audioPersistenz': { 284 | 'loadPath':'' 285 | }, 286 | 'transcriptsPersistenz': { 287 | 'loadPath':'' 288 | }, 289 | 'plot': { 290 | 'showDuration': 0 291 | } 292 | } 293 | DependencyInjection(diConfig).step8_DatasetStatistic.run() 294 | 295 | if workflowConfig['cleanStatistic']: 296 | logStep('cleanStatistic') 297 | diConfig = { 298 | 'step8_DatasetStatistic': { 299 | 'savePath': setp8Path_clean, 300 | 'loadPath': step7Path + '/overview.csv', 301 | 'specialSpeackers': specialSpeackers, 302 | 'filter': cleanFilter 303 | }, 304 | 'audioPersistenz': { 305 | 'loadPath':'' 306 | }, 307 | 'transcriptsPersistenz': { 308 | 'loadPath':'' 309 | }, 310 | 'plot': { 311 | 'showDuration': 0 312 | } 313 | } 314 | DependencyInjection(diConfig).step8_DatasetStatistic.run() 315 | 316 | if workflowConfig['generateClean']: 317 | logStep('generateClean') 318 | diConfig = { 319 | 'step9_GenerateCleanDataset': { 320 | 'savePath': 
finalDatasetPath, 321 | 'infoFile': step7Path +'/overview.csv', 322 | 'filter': cleanFilter 323 | }, 324 | 'transcriptsPersistenz': { 325 | 'loadPath': finalDatasetPath, 326 | 'savePath': finalDatasetPathClean 327 | }, 328 | 'audioPersistenz': { 329 | 'loadPath': finalDatasetPath, 330 | 'savePath': finalDatasetPathClean 331 | }, 332 | } 333 | DependencyInjection(diConfig).step9_GenerateCleanDataset.run() -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Friedrich.json: -------------------------------------------------------------------------------- 1 | { 2 | "vierzehnte_dezember": { 3 | "title": "vierzehnte_dezember", 4 | "LibrivoxBookName": "vierzehnte Dezember", 5 | "GutenbergId": "mereschk/14dezemb/14dezemb.html", 6 | "GutenbergStart": "", 7 | "GutenbergEnd": "", 8 | "textReplacement": { 9 | "III.": "der dritte", 10 | "II.": "der zweite", 11 | "I.": "der erste", 12 | "No": "Nummer", 13 | " z. B.": " zum Beispiel ", 14 | "60 000": "sechzigtausend", 15 | "1817": "achtzehnhundertsiebzehn", 16 | "1812": "achtzehnhundertzwölf", 17 | "1825": "achtzehnhundertfünfundzwanzig", 18 | "1801": "achtzehnhundertein", 19 | "1809": "achtzehnhundertneun", 20 | "31.,": "einunddreißigste", 21 | "31.": "einunddreißigsten", 22 | "27.": "siebenundzwanzigsten", 23 | "13.": "dreizehnsten", 24 | "19.": "neunzehnsten", 25 | "14.": "vierzehnsten", 26 | "15.": "fünfzehnsten", 27 | "21.": "einundzwanzigsten", 28 | "22.": "zweiundzwanzigsten", 29 | "11.": "elfsten", 30 | "12.": "zwölfsten", 31 | "18.": "achtzehnsten", 32 | "28.": "achtundzwanzigsten", 33 | "29.": "neunundzwanzigsten", 34 | "30.": "dreißigsten", 35 | "700": "siebenhundert", 36 | "116": "einhundertsechzehn", 37 | "4.": "vierten", 38 | "45": "fünfundvierzig", 39 | "11": "elf", 40 | "2.": "zweiten", 41 | "3.": "dritten", 42 | "12": "zwölf", 43 | "7": "sieben", 44 | "9": "neun", 45 | "ù": "u", 46 | "â": "a", 47 | "œ": "oe", 48 | "ï": "i", 49 | "ç": "c", 50 | "î": "i" 51 | } 52 | }, 53 | "lustige_geschichten": { 54 | "title": "lustige_geschichten", 55 | "LibrivoxBookName": "Lustige Geschichten", 56 | "GutenbergId": "cechov/novel5/novel5.html", 57 | "GutenbergStart": "", 58 | "GutenbergEnd": "", 59 | "textReplacement": { 60 | " d. h.": " das heißt ", 61 | " z. B.": " zum Beispiel ", 62 | " usw.": " und so weiter", 63 | " Ew.": "Eure", 64 | " d.": " der ", 65 | "&": " und ", 66 | "a. D.": " a D ", 67 | " Co.": " Co ", 68 | " II. ": " zweite ", 69 | "75.000": "fünfundsiebzigtausend", 70 | "9499": "neuntausendvierhundertneunundneunzig", 71 | "1883": "achtzehnhundertdreiundachtzig", 72 | "35,8": "fünfunddreißig komma acht", 73 | "1842": "achtzehnhundertzweiundvierzig", 74 | "209": "zweihundertneun", 75 | "29.": "neunundzwanzigsten", 76 | "223": "zweihundertdreiundzwanzig", 77 | "219": "zweihundertneunzehn", 78 | "26": "sechsundzwanzig", 79 | "46": "sechsundvierzig", 80 | "Nr.": "Nummer" 81 | } 82 | }, 83 | "saemtliche_schriften6": { 84 | "title": "saemtliche_schriften6", 85 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 6", 86 | "GutenbergId": "ossietzk/schrift1/chap251.html", 87 | "GutenbergStart": "", 88 | "GutenbergEnd": "", 89 | "textReplacement": { 90 | " u.a.": " unter anderem ", 91 | " d.h.": " das heißt ", 92 | " z.B.": " zum Beispiel ", 93 | " Dr.": " Doktor ", 94 | " v.": " von ", 95 | " u.": " und ", 96 | " d.": " der ", 97 | "&": " und ", 98 | " Co.": " Co ", 99 | "a. 
D.": " a D ", 100 | "1911": "neunzehnhundertelf", 101 | "1919": "neunzehnhundertneunzehn", 102 | "1500": "fünfzehnhundert", 103 | "1914": "neunzehnhundertvierzehn", 104 | "1000": "eintausend", 105 | "1821": "achtzehnhunderteinundzwanzig", 106 | "1881": "achtzehnhunderteinundachtzig", 107 | "1915": "neunzehnhundertfünfzehn", 108 | "1857": "achtzehnhundertsiebenundfünfzig", 109 | "1880": "achtzehnhundertachtzig", 110 | "1935": "neunzehnhundertfünfunddreißig", 111 | "1908": "neunzehnhundertacht", 112 | "1920": "neunzehnhundertzwanzig", 113 | "1921": "neunzehnhunderteinundzwanzig", 114 | "21.": "einundzwanzigsten", 115 | "23.": "dreiundzwanzigsten", 116 | "27.": "siebenundzwanzigsten", 117 | "17.": "siebzehnsten", 118 | "18.": "achtzehnsten", 119 | "20.": "zwanzigsten", 120 | "184": "einhundertvierundachtzig", 121 | "11.": "elfsten", 122 | "22.": "zweiundzwanzigsten", 123 | "300": "dreihundert", 124 | "500": "fünfhundert", 125 | "100": "einhundert", 126 | "231": "zweihunderteinunddreißig", 127 | "24.": "vierundzwanzigsten", 128 | "6.": "sechste", 129 | "22": "zweiundzwanzig", 130 | "26": "sechsundzwanzig", 131 | "18": "achtzehn", 132 | "60": "sechzig", 133 | "80": "achtzig", 134 | "35": "fünfunddreißig", 135 | "15": "fünfzehn", 136 | "10": "zehn", 137 | "38": "achtunddreißig", 138 | "4.": "vierte", 139 | "6": "sechs", 140 | "§": "Paragraph", 141 | "ç": " ", 142 | "[": " ", 143 | "]": " " 144 | } 145 | }, 146 | "saemtliche_schriften5": { 147 | "title": "saemtliche_schriften5", 148 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 5", 149 | "GutenbergId": "ossietzk/schrift1/chap201.html", 150 | "GutenbergStart": "", 151 | "GutenbergEnd": "dieses Abends gelernt haben. Es wird noch einiges über diesen Trauerfall zu sagen sein.", 152 | "textReplacement": { 153 | " d. h.": " das heißt ", 154 | " usw.": " und so weiter", 155 | " dgl.": " dergleichen ", 156 | " Dr.": " Doktor ", 157 | " H. ": " Herr ", 158 | " St.": " Sankt ", 159 | " v.": " von ", 160 | " u.": " und ", 161 | " d.": " der ", 162 | "&": " und ", 163 | " Co.": " Co ", 164 | " II. 
": " zweite ", 165 | " z.": " z ", 166 | " CD.": " CD ", 167 | " Fr.": " Fr ", 168 | "100 000": "einhunderttausend", 169 | "30 000": "dreißigtausend", 170 | "1814": "achtzehnhundertvierzehnte", 171 | "1336": "dreizehnhundertsechsunddreißig", 172 | "1793": "siebzehnhundertdreiundneunzig", 173 | "1899": "achtzehnhundertneunundneunzig", 174 | "1916": "neunzehnhundertsechzehn", 175 | "1800": "achtzehnhundert", 176 | "1914": "neunzehnhundertvierzehn", 177 | "1918": "neunzehnhundertachtzehn", 178 | "1902": "neunzehnhundertzwei", 179 | "1490": "vierzehnhundertneunzig", 180 | "1921": "neunzehnhunderteinundzwanzig", 181 | "1898": "achtzehnhundertachtundneunzig", 182 | "1917": "neunzehnhundertsiebzehn", 183 | "1848": "achtzehnhundertachtundvierzig", 184 | "212": "zweihundertzwölf", 185 | "28.": "achtundzwanzigsten", 186 | "109": "einhundertneun", 187 | "118": "einhundertachtzehn", 188 | "47": "siebenundvierzig", 189 | "4.": "vierten", 190 | "17": "siebzehn", 191 | "15": "fünfzehn", 192 | "10": "zehn", 193 | "31": "einunddreißig", 194 | "30": "dreißig", 195 | "1.": "ersten", 196 | "54": "vierundfünfzig", 197 | "[": " ", 198 | "]": " " 199 | } 200 | }, 201 | "homo_sapiens": { 202 | "title": "homo_sapiens", 203 | "LibrivoxBookName": "Homo sapiens - Romantrilogie", 204 | "GutenbergId": "przybysz/homosapi/homosapi.html", 205 | "GutenbergStart": "", 206 | "GutenbergEnd": "", 207 | "textReplacement": { 208 | "IV.": " vierten ", 209 | "ô": "o", 210 | "ó": "o", 211 | " u. s. w.": " und so weiter ", 212 | " d. h.": " das heißt ", 213 | " z. B.": " zum Beispiel ", 214 | " dgl.": " dergleichen ", 215 | " u.": " und ", 216 | " d.": " der ", 217 | "1894": "achtzehnhundertvierundneunzig", 218 | "28.": "achtundzwanzigsten", 219 | "100": "einhundert", 220 | "21.": "einundzwanzigsten", 221 | "183": "einhundertdreiundachtzig", 222 | "10": "zehn", 223 | "13": "dreizehn", 224 | "26": "sechsundzwanzig", 225 | "1.": "erster", 226 | "30": "dreißig", 227 | "90": "neunzig" 228 | } 229 | }, 230 | "aus_allen_winkeln": { 231 | "title": "aus_allen_winkeln", 232 | "LibrivoxBookName": "Aus allen Winkeln - Erzählungen", 233 | "GutenbergId": "heiberg/erzaehlg/erzaehlg.html", 234 | "GutenbergStart": "In einer der besten Gegenden der Stadt", 235 | "GutenbergEnd": "", 236 | "remapSort": [ 237 | 5, 238 | 7, 239 | 13, 240 | 15, 241 | 16, 242 | 1, 243 | 10, 244 | 6, 245 | 11, 246 | 14, 247 | 4, 248 | 9, 249 | 12, 250 | 0, 251 | 8, 252 | 2, 253 | 3 254 | ], 255 | "textReplacement": { 256 | " H. ": " H ", 257 | " v.": " von ", 258 | "&": " und ", 259 | " geb.": " geboren ", 260 | " Dr.": " Doktor ", 261 | "250,000": "zweihundertfünfzigtausend", 262 | "1867": "achtzehnhundertsiebenundsechzig", 263 | "1868": "achtzehnhundertachtundsechzig", 264 | "1873": "achtzehnhundertdreiundsiebzig", 265 | "1729": "siebzehnhundertneunundzwanzig", 266 | "25.": "fünfundzwanzigsten", 267 | "10.": "zehnsten", 268 | "11.": "elfsten", 269 | "13.": "dreizehnsten", 270 | "800": "achthundert", 271 | "200": "zweihundert", 272 | "400": "vierhundert", 273 | "11": "elf", 274 | "18": "achtzehn", 275 | "30": "dreißig", 276 | "4.": "vierte", 277 | "*": " ", 278 | " af.": "af .", 279 | "[": " ", 280 | "]": " " 281 | } 282 | }, 283 | "falsches_geld": { 284 | "title": "falsches_geld", 285 | "LibrivoxBookName": "Falsches Geld", 286 | "GutenbergId": "zapp/falsgeld/falsgeld.html", 287 | "GutenbergStart": "", 288 | "GutenbergEnd": "", 289 | "textReplacement": { 290 | " d. h.": " das heißt ", 291 | " z. 
B.": " zum Beispiel ", 292 | " geb.": " geboren ", 293 | " usw.": " und so weiter", 294 | " p.": " P ", 295 | "4,40 Mk.": "Vier Mark vierzig", 296 | "30jährigen": "dreißigjährigen", 297 | "20jährigen": "zwanzigjährigen", 298 | "0459.": "null vier fünf neun", 299 | "4905": "viertausendneunhundertfünf", 300 | "9054": "neuntausendvierundfünfzig", 301 | "l0000": "zehntausend", 302 | "5049": "fünftausendneunundvierzig", 303 | "0246": "null zwei vier sechs", 304 | "4,40": "vier komma vier null", 305 | "2000": "zweitausend", 306 | "3000": "dreitausend", 307 | "5000": "fünftausend", 308 | "1000": "eintausend", 309 | "300": "dreihundert", 310 | "200": "zweihundert", 311 | "26.": "sechsundzwanzigsten", 312 | "100": "einhundert", 313 | "20.": "zwanzigsten", 314 | "5o,": "So,", 315 | "400": "vierhundert", 316 | "Pf.": "Pfennig", 317 | "Mk.": "Mark", 318 | "58": "achtundfünfzig", 319 | "12": "zwölf", 320 | "27": "siebenundzwanzig", 321 | "30": "dreißig", 322 | "24": "vierundzwanzig", 323 | "22": "zweiundzwanzig", 324 | "10": "zehn", 325 | "20": "zwanzig", 326 | "50": "fünfzig", 327 | "4O": "vierO", 328 | "14": "vierzehn", 329 | "90": "neunzig", 330 | "11": "elf", 331 | "60": "sechzig", 332 | "25": "fünfundzwanzig", 333 | "2": "zwei", 334 | "3": "drei", 335 | "4": "vier", 336 | "1": "ein", 337 | "8": "acht", 338 | "7": "sieben", 339 | "½": "halb" 340 | } 341 | }, 342 | "judith_trachtenberg": { 343 | "title": "judith_trachtenberg", 344 | "LibrivoxBookName": "Judith Trachtenberg", 345 | "GutenbergId": "franzos/trachten/trachten.html", 346 | "GutenbergStart": "", 347 | "GutenbergEnd": "", 348 | "textReplacement": { 349 | "&": " und " 350 | } 351 | }, 352 | "verbrechen": { 353 | "title": "verbrechen", 354 | "LibrivoxBookName": "Verbrechen", 355 | "GutenbergId": "gorki/verbrec1/verbrec1.html", 356 | "GutenbergStart": "", 357 | "GutenbergEnd": "", 358 | "textReplacement": {} 359 | }, 360 | "furchtbare_rache": { 361 | "title": "furchtbare_rache", 362 | "LibrivoxBookName": "Furchtbare Rache", 363 | "GutenbergId": "gogol/rache/rache.html", 364 | "GutenbergStart": "", 365 | "GutenbergEnd": "", 366 | "textReplacement": {} 367 | }, 368 | "gruene_nachtigall": { 369 | "title": "gruene_nachtigall", 370 | "LibrivoxBookName": "grüne Nachtigall und andere Novellen", 371 | "GutenbergId": "kusmin/grnachti/grnachti.html", 372 | "GutenbergStart": "Das grüne Haus glich so sehr", 373 | "GutenbergEnd": "", 374 | "textReplacement": { 375 | "1811": "achtzehnhundertelf", 376 | "1.": "erstens", 377 | "2.": "zweitens", 378 | "a. D.": " A D ", 379 | "1918": "neunzehnhundertachtzehn", 380 | "5.": "fünften", 381 | "7.": "siebten", 382 | "9.": "neunten" 383 | }, 384 | "moves": [ 385 | { 386 | "start": "Der Traum der letzten Nacht rief in meiner Erinnerung wieder alles wach, was ich so gerne vergessen möchte.", 387 | "end": "ich später in seinen Umarmungen niemals jene fremden Arme mit dem braunen Halbmond auf der blassen Haut wiedererkannte.", 388 | "after": "freundlich und still, wie eine echte Meisterin in ihrem Fach." 
389 | } 390 | ] 391 | } 392 | } -------------------------------------------------------------------------------- /scripts/createDatasetConfig/Karlsson.json: -------------------------------------------------------------------------------- 1 | { 2 | "unterm_birnbaum": { 3 | "title": "unterm_birnbaum", 4 | "LibrivoxBookName": "Unterm Birnbaum", 5 | "GutenbergId": 26686, 6 | "GutenbergStart": "Vor dem in dem großen und reichen Oderbruchdorfe Tschechin um", 7 | "GutenbergEnd": "gesponnen, ’s kommt doch alles an die Sonnen._‹«", 8 | "textReplacement": { 9 | "d. M.": " des Monats ", 10 | " gest.": " gestorben ", 11 | " geb.": " geboren ", 12 | " etc.": " et cetera ", 13 | " Se.": " seine ", 14 | " v.": " von ", 15 | "&": " und ", 16 | "1831": "achtzehnhunderteinunddreißig", 17 | "1790": "siebzehnhundertneunzig", 18 | "1832": "achtzehnhundertzweiunddreißig", 19 | "80er": "achtziger", 20 | "30.": "dreißigsten", 21 | "29.": "neunundzwanzigsten", 22 | "14.": "vierzehnsten", 23 | "36": "sechsunddreißig", 24 | "20": "zwanzig", 25 | "9.": "neunte", 26 | "13": "dreizehn", 27 | "7.": "siebten", 28 | "15": "fünfzehn", 29 | "27": "siebenundzwanzig", 30 | "7": "sieben", 31 | "10": "zehn", 32 | "30": "dreißig", 33 | "14": "vierzehn", 34 | "3.": "dritten", 35 | "6": "sechs", 36 | "2": "zwei", 37 | "9": "neun", 38 | "1": "ein", 39 | "’n ": "n ", 40 | "’n.": "n. ", 41 | "’s.": "s.", 42 | "’s ": "s ", 43 | " mi.": " mi .", 44 | " pp.": " P P ", 45 | " se.": " se .", 46 | " to.": " to .", 47 | " ut.": " ut .", 48 | "XVIII.": " ", 49 | "XVII.": " ", 50 | "XIII.": " ", 51 | "VIII.": " ", 52 | "XII.": " ", 53 | "XVI.": " ", 54 | "XIV.": " ", 55 | "VII.": " ", 56 | "III.": " ", 57 | "IX.": " ", 58 | "VI.": " ", 59 | "II.": " ", 60 | "XV.": " ", 61 | "IV.": " ", 62 | "XI.": " ", 63 | "X.": " ", 64 | "V.": " ", 65 | "I.": " ", 66 | "*": " ", 67 | "ô": "o", 68 | "#": " ", 69 | "ç": "c" 70 | } 71 | }, 72 | "schwle_tage": { 73 | "title": "schwle_tage", 74 | "LibrivoxBookName": "Schwüle Tage", 75 | "GutenbergId": "keyserlg/schwuele/schwuele.html", 76 | "GutenbergStart": "", 77 | "GutenbergEnd": "", 78 | "textReplacement": { 79 | "-h.": " h .", 80 | "ç": "c", 81 | "ακτις αελιου": "aktisch ayileou" 82 | } 83 | }, 84 | "mdchen_vom_moorhof": { 85 | "title": "mdchen_vom_moorhof", 86 | "LibrivoxBookName": "Mädchen vom Moorhof", 87 | "GutenbergId": 20211, 88 | "GutenbergStart": "Es ist in einem Thingsaal, weit draußen auf dem Lande. Am Richtertisch,", 89 | "GutenbergEnd": "um sie stand. Jetzt konnte sie ihm nicht mehr entfliehen.", 90 | "textReplacement": { 91 | "2": " ", 92 | "3": " ", 93 | "4": " ", 94 | "5": " ", 95 | "6": " ", 96 | "À": "A", 97 | "å": "a", 98 | "*": " " 99 | } 100 | }, 101 | "sandmann": { 102 | "title": "sandmann", 103 | "LibrivoxBookName": "Sandmann", 104 | "GutenbergId": "etahoff/sandmann/sandmann.html", 105 | "GutenbergStart": "", 106 | "GutenbergEnd": "", 107 | "textReplacement": { 108 | "30.": "dreißigsten", 109 | "12": "zwölf", 110 | " z. B.": " zum Beispiel ", 111 | " usw.": " und so weiter", 112 | " etc.": " et cetera ", 113 | "[": " ", 114 | "]": " " 115 | } 116 | }, 117 | "spuk": { 118 | "title": "spuk", 119 | "LibrivoxBookName": "Spuk", 120 | "GutenbergId": "klabund/spuk/spuk.html", 121 | "GutenbergStart": "", 122 | "GutenbergEnd": "", 123 | "textReplacement": { 124 | " d. 
h.": " das heißt ", 125 | " Kgl.": " königlich ", 126 | "Abb.": " Abbildung ", 127 | " usw.": " und so weiter", 128 | " Nr.": " Nummer ", 129 | "1921": "neunzehnhunderteinundzwanzig", 130 | "38,9,": "achtunddreißig komma neun,", 131 | "0,02.": "null komma null zwei.", 132 | "1891": "achtzehnhunderteinundneunzig", 133 | "8000": "achttausend", 134 | "0,6.": "null komma sechs.", 135 | "39,1": "neununddreißig komma eins", 136 | "105": "einhundertfünf", 137 | "7314": "siebentausenddreihundertvierzehn", 138 | "2–3": "zwei bis drei", 139 | "2–4": "zwei bis vier", 140 | "5:4.": "Fünf zu Vier.", 141 | "23": "dreiundzwanzig", 142 | "20": "zwanzig", 143 | "38": "achtunddreißig", 144 | "28": "achtundzwanzig", 145 | "13,": "dreizehn,", 146 | "25:": "fünfundzwanzig:", 147 | "999": "neunhundertneunundneunzig", 148 | "2,": "zwei,", 149 | "3.": "dritte", 150 | "50": "fünfzig", 151 | "0,": "Oh, ", 152 | "25": "fünfundzwanzig", 153 | "91": "einundneunzig", 154 | "15": "fünfzehn", 155 | "13": "dreizehn", 156 | "35": "fünfunddreißig", 157 | "5": "fünf", 158 | "4": "vier", 159 | "0!": "Oh, ", 160 | "0": "Oh, ", 161 | "2": "zwei", 162 | "1": "eins", 163 | "3": "drei", 164 | "Yo.": "Yo ." 165 | } 166 | }, 167 | "odysseus": { 168 | "title": "odysseus", 169 | "LibrivoxBookName": "Odysseus", 170 | "GutenbergId": "beckerkf/altewelt/altewelt.html", 171 | "GutenbergStart": "", 172 | "GutenbergEnd": "und den Gedanken an die Todesgöttinnen nicht scheut!", 173 | "textReplacement": { 174 | " u.s.w.": " und so weiter ", 175 | "ë": "e" 176 | } 177 | }, 178 | "herr_und_knecht": { 179 | "title": "herr_und_knecht", 180 | "LibrivoxBookName": "Herr und Knecht", 181 | "GutenbergId": 33266, 182 | "GutenbergStart": "Es war in den siebziger Jahren, ", 183 | "GutenbergEnd": "", 184 | "textReplacement": { 185 | "7.": "siebten" 186 | } 187 | }, 188 | "smtliche_schriften_19111921_teil_1": { 189 | "title": "smtliche_schriften_19111921_teil_1", 190 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 1", 191 | "GutenbergId": "ossietzk/schrift1/schrift1.html", 192 | "GutenbergStart": "", 193 | "GutenbergEnd": "Mitteilungen der Deutschen Friedensgesellschaft. Januar 1920", 194 | "textReplacement": { 195 | "LA Berlin, N Madrasch Groschopp, Rep.200, Acc. 4288. Nr.22": "Erschienen neunzenhundertsechzen Literaturanstalt Berlin. N Madrasch Groschopp, Rep zweihundert, Acc viertausendzweihundertachtundachtzig Nummmer zweiundzwanzig", 196 | "Von Dr. med. M. von Kemnitz. Verlag Ernst Reinhardt, München. Brosch. Mk. 6.–, geb. Mk. 8,50.": "Von Doktor der Medizin M von Kemnitz. Verlag Ernst Reinhardt, München. Broschiert sechs Mark, gebunden acht Mark fünfzig. ", 197 | "brosch.": "broschiert", 198 | "Mk. -.80": " achtzig Pfennig", 199 | "M. 1.20": "eine Mark zwanzig", 200 | "G.m.b.H.": " GmbH ", 201 | " H. v.": " Herr von ", 202 | " bzw.": " beziehungsweise ", 203 | " usw.": " und so weiter", 204 | " d.h.": " das heißt ", 205 | " z.B.": " zum Beispiel ", 206 | " Nr.": " Nummer ", 207 | " Pf.": " Pfennig ", 208 | " Dr.": " Doktor ", 209 | " H. 
": " Herr ", 210 | " v.": " von ", 211 | " u.": " und ", 212 | " d.": " der ", 213 | "§": " Paragraph ", 214 | "a.D.": " A D ", 215 | "II.": " der zweite ", 216 | "IV.": " der vierte ", 217 | "1914/15": "neunzehnhundertvierzehn fünfzehn", 218 | "1870": "achtzehnhundertsiebzig", 219 | "1830": "achtzehnhundertdreißigte", 220 | "1911": "neunzehnhundertelf", 221 | "1912": "neunzehnhundertzwölf", 222 | "1913": "neunzehnhundertdreizehn", 223 | "78/8": "achtundsiebzig/acht", 224 | "1914": "neunzehnhundertvierzehn", 225 | "4002": "viertausendzwei", 226 | "1799": "siebzehnhundertneunundneunzig", 227 | "4003": "viertausenddrei", 228 | "1917": "neunzehnhundertsiebzehn", 229 | "1918": "neunzehnhundertachtzehn", 230 | "1500": "fünfzehnhundert", 231 | "1919": "neunzehnhundertneunzehn", 232 | "1348": "dreizehnhundertachtundvierzig", 233 | "1848": "achtzehnhundertachtundvierzig", 234 | "1915": "neunzehnhundertfünfzehn", 235 | "1916": "neunzehnhundertsechzehn", 236 | "1920": "neunzehnhundertzwanzig", 237 | "21.": "einundzwanzigsten", 238 | "16.": "sechzehnsten", 239 | "25.": "fünfundzwanzigsten", 240 | "31.": "einunddreißigsten", 241 | "28.": "achtundzwanzigsten", 242 | "33.": "dreiunddreißigsten", 243 | "30.": "dreißigsten", 244 | "18.": "achtzehnsten", 245 | "22.": "zweiundzwanzigsten", 246 | "19.": "neunzehnsten", 247 | "12.": "zwölfsten", 248 | "29.": "neunundzwanzigsten", 249 | "24.": "vierundzwanzigsten", 250 | "11.": "elfsten", 251 | "20,": "zwanzig,", 252 | "180": "einhundertachtzig", 253 | "850": "achthundertfünfzig", 254 | "200": "zweihundert", 255 | "100": "einhundert", 256 | "13.": "dreizehnsten", 257 | "15.": "fünfzehnsten", 258 | "10:": "zehn:", 259 | "20.": "zwanzigsten", 260 | "26.": "sechsundzwanzigsten", 261 | "27": "siebenundzwanzig", 262 | "50": "fünfzig", 263 | "19": "neunzehn", 264 | "5.": "fünfte", 265 | "8.": "achte", 266 | "40": "vierzig", 267 | "4.": "vierte", 268 | "2.": "zweite", 269 | "1.": "erste", 270 | "28": "achtundzwanzig", 271 | "57": "siebenundfünfzig", 272 | "30": "dreißig", 273 | "26": "sechsundzwanzig", 274 | "60": "sechzig", 275 | "9.": "neunte", 276 | "39": "neununddreißig", 277 | "67": "siebenundsechzig", 278 | "20": "zwanzig", 279 | "44": "vierundvierzig", 280 | "24": "vierundzwanzig", 281 | "10": "zehn", 282 | "6.": "sechste", 283 | "47": "siebenundvierzig", 284 | "32": "zweiunddreißig", 285 | "89": "neunundachtzig", 286 | "38": "achtunddreißig", 287 | "1": "ein", 288 | "5": "fünf", 289 | "4": "vier", 290 | "2": "zwei", 291 | "[": " ", 292 | "]": " ", 293 | "*": " ", 294 | "#": " ", 295 | "/": " ", 296 | "â": "a" 297 | } 298 | }, 299 | "smtliche_schriften_19111921_teil_2": { 300 | "title": "smtliche_schriften_19111921_teil_2", 301 | "LibrivoxBookName": "Sämtliche Schriften 1911-1921, Teil 2", 302 | "GutenbergId": "ossietzk/schrift1/chap051.html", 303 | "GutenbergStart": "", 304 | "GutenbergEnd": "Auf Wiedersehen beim nächsten Putsch ...!", 305 | "textReplacement": { 306 | "24jährige": "vierundzwanzigjährige", 307 | "1916/17.": "neunzehnhundertsechzehnn siebzehnsten", 308 | "2.8.1921": "zweiter achter neunzehnhunderteinundzwanzig", 309 | "159er,": "einhundertneunundfünfziger,", 310 | "Mk. 0,50.": "fünfzig Pfennige ", 311 | "G.m.b.H.": " GmbH ", 312 | " H. v.": " Herr von ", 313 | " d. h.": " das heißt ", 314 | " z. B.": " zum Beispiel ", 315 | " inkl.": " inklusive ", 316 | "U.S.A.": " U S A ", 317 | " u.a.": " unter anderem ", 318 | " usw.": " und so weiter", 319 | " d.h.": " das heißt ", 320 | " z.B.": " zum Beispiel ", 321 | " dgl.": " dergleichen ", 322 | " d. 
J": "des Jahres ", 323 | " Nr.": " Nummer ", 324 | " Mk.": " Mark ", 325 | "Dr.": " Doktor ", 326 | " H. ": " Herr ", 327 | " St.": " Sankt ", 328 | " v.": " von ", 329 | " u.": " und ", 330 | " d.": " der ", 331 | " a. D. ": " A D ", 332 | "a.M.": " am Main ", 333 | "§": " Paragraph ", 334 | "&": " und ", 335 | "3333": "dreitausenddreihundertdreiunddreißig,", 336 | "1348": "dreizehnhundertachtundvierzig", 337 | "1897": "achtzehnhundertsiebenundneunzig", 338 | "11/12": "elf zwölf", 339 | "1814": "achtzehnhundertvierzehn", 340 | "1336": "dreizehnhundertsechsunddreißig", 341 | "1793": "siebzehnhundertdreiundneunzig", 342 | "1899": "achtzehnhundertneunundneunzig", 343 | "1911": "neunzehnhundertelf", 344 | "1914": "neunzehnhundertvierzehn", 345 | "1847": "achtzehnhundertsiebenundvierzig", 346 | "1920": "neunzehnhundertzwanzig", 347 | "1918": "neunzehnhundertachtzehn", 348 | "1813": "achtzehnhundertdreizehn", 349 | "1848": "achtzehnhundertachtundvierzig", 350 | "1,80 Mark": "eine Mark achtzig", 351 | "1850": "achtzehnhundertfünfzig", 352 | "1880": "achtzehnhundertachtzig", 353 | "1870": "achtzehnhundertsiebzig", 354 | "1919": "neunzehnhundertneunzehn", 355 | "1916": "neunzehnhundertsechzehn", 356 | "3000": "dreitausend", 357 | "5000": "fünftausend", 358 | "1100": "eintausendeinhundert", 359 | "1913": "neunzehnhundertdreizehn", 360 | "1915": "neunzehnhundertfünfzehn", 361 | "1520": "fünfzehnhundertzwanzig", 362 | "1910": "neunzehnhundertzehn", 363 | "1921": "neunzehnhunderteinundzwanzig", 364 | "1871": "achtzehnhunderteinundsiebzig", 365 | "170:": "einhundertsiebzig:", 366 | "3,80": "drei komma acht null", 367 | "1896": "achtzehnhundertsechsundneunzig", 368 | "1909": "neunzehnhundertneun", 369 | "1521": "fünfzehnhunderteinundzwanzig", 370 | "1807": "achtzehnhundertsieben", 371 | "1866": "achtzehnhundertsechsundsechzig", 372 | "159.": "einhundertneunundfünfzigte", 373 | "1800": "achtzehnhundert", 374 | "1902": "neunzehnhundertzwei", 375 | "1490": "vierzehnhundertneunzig", 376 | "1891": "achtzehnhunderteinundneunzig", 377 | "1898": "achtzehnhundertachtundneunzig", 378 | "1917": "neunzehnhundertsiebzehn", 379 | "1500": "fünfzehnhundert", 380 | "184.": "einhundertvierundachtzigte", 381 | "1000": "eintausend", 382 | "1821": "achtzehnhunderteinundzwanzig", 383 | "1881": "achtzehnhunderteinundachtzig", 384 | "1857": "achtzehnhundertsiebenundfünfzig", 385 | "1935": "neunzehnhundertfünfunddreißig", 386 | "1908": "neunzehnhundertacht", 387 | "000": "tausend", 388 | "31.": "einunddreißigster", 389 | "27.": "siebenundzwanzigster", 390 | "28.": "achtundzwanzigster", 391 | "400": "vierhundert", 392 | "11.": "elfster", 393 | "65.": "fünfundsechzigster", 394 | "13.": "dreizehnster", 395 | "17.": "siebzehnster", 396 | "24.": "vierundzwanzigster", 397 | "200": "zweihundert", 398 | "100": "einhundert", 399 | "20.": "zwanzigster", 400 | "23.": "dreiundzwanzigster", 401 | "29.": "neunundzwanzigster", 402 | "125": "einhundertfünfundzwanzig", 403 | "600": "sechshundert", 404 | "150": "einhundertfünfzig", 405 | "12.": "zwölfster", 406 | "390": "dreihundertneunzig", 407 | "250": "zweihundertfünfzig", 408 | "21.": "einundzwanzigsten", 409 | "14.": "vierzehnster", 410 | "26.": "sechsundzwanzigster", 411 | "75,": "fünfundsiebzig,", 412 | "22.": "zweiundzwanzigster", 413 | "18.": "achtzehnster", 414 | "10.": "zehnster", 415 | "25.": "fünfundzwanzigster", 416 | "30.": "dreißigster", 417 | "19.": "neunzehnster", 418 | "16.": "sechzehnster", 419 | "39,": "neununddreißig,", 420 | "300": "dreihundert", 421 | "15.": 
"fünfzehnsten", 422 | "212": "zweihundertzwölf", 423 | "109": "einhundertneun", 424 | "118": "einhundertachtzehn", 425 | "184": "einhundertvierundachtzig", 426 | "500": "fünfhundert", 427 | "231": "zweihunderteinunddreißig", 428 | "3.": "dritter", 429 | "1.": "erster", 430 | "52": "zweiundfünfzig", 431 | "10": "zehn", 432 | "2.": "zweite", 433 | "6.": "sechste", 434 | "20": "zwanzig", 435 | "70": "siebzig", 436 | "4.": "vierter", 437 | "51": "einundfünfzig", 438 | "35": "fünfunddreißig", 439 | "48": "achtundvierzig", 440 | "84": "vierundachtzig", 441 | "9.": "neunter", 442 | "30": "dreißig", 443 | "50": "fünfzig", 444 | "40": "vierzig", 445 | "62": "zweiundsechzig", 446 | "80": "achtzig", 447 | "25": "fünfundzwanzig", 448 | "90": "neunzig", 449 | "95": "fünfundneunzig", 450 | "8.": "achter", 451 | "5.": "fünfter", 452 | "11": "elf", 453 | "7.": "siebter", 454 | "19": "neunzehn", 455 | "47": "siebenundvierzig", 456 | "17": "siebzehn", 457 | "15": "fünfzehn", 458 | "31": "einunddreißig", 459 | "36": "sechsunddreißig", 460 | "54": "vierundfünfzig", 461 | "22": "zweiundzwanzig", 462 | "26": "sechsundzwanzig", 463 | "18": "achtzehn", 464 | "60": "sechzig", 465 | "38": "achtunddreißig", 466 | "1": "ein", 467 | "3": "drei", 468 | "4": "vier", 469 | "5": "fünf", 470 | "2": "zwei", 471 | "6": "sechs", 472 | "9": "neun", 473 | "8": "acht", 474 | "IV.": "vier ", 475 | "III.": "drei", 476 | "II.": "zwei ", 477 | "I.": "eins", 478 | "[": " ", 479 | "]": " ", 480 | "*": " ", 481 | "#": " ", 482 | "/": " ", 483 | "â": "a", 484 | "ç": "c" 485 | 486 | } 487 | } 488 | } --------------------------------------------------------------------------------