├── deepasr
    ├── utils
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── utils.cpython-36.pyc
    │   │   ├── utils.cpython-37.pyc
    │   │   ├── __init__.cpython-36.pyc
    │   │   └── __init__.cpython-37.pyc
    │   ├── getmeta.py
    │   └── utils.py
    ├── vocab
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── alphabet.cpython-36.pyc
    │   │   └── alphabet.cpython-37.pyc
    │   ├── alphabet-en.txt
    │   └── alphabet.py
    ├── evaluate
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── distance.cpython-37.pyc
    │   │   └── evaluate.cpython-37.pyc
    │   ├── evaluate.py
    │   ├── activations.py
    │   └── distance.py
    ├── decoder
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── decoder.cpython-36.pyc
    │   │   └── decoder.cpython-37.pyc
    │   └── decoder.py
    ├── augmentation
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── augmentation.cpython-36.pyc
    │   │   ├── augmentation.cpython-37.pyc
    │   │   ├── spec_augment.cpython-36.pyc
    │   │   └── spec_augment.cpython-37.pyc
    │   ├── augmentation.py
    │   └── spec_augment.py
    ├── __pycache__
    │   └── __init__.cpython-37.pyc
    ├── features
    │   ├── __pycache__
    │   │   ├── mfcc.cpython-36.pyc
    │   │   ├── mfcc.cpython-37.pyc
    │   │   ├── sigproc.cpython-36.pyc
    │   │   ├── sigproc.cpython-37.pyc
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── filter_banks.cpython-36.pyc
    │   │   ├── filter_banks.cpython-37.pyc
    │   │   ├── get_features.cpython-37.pyc
    │   │   ├── spectrogram.cpython-36.pyc
    │   │   ├── spectrogram.cpython-37.pyc
    │   │   ├── feature_extractor.cpython-36.pyc
    │   │   └── feature_extractor.cpython-37.pyc
    │   ├── __init__.py
    │   ├── filter_banks.py
    │   ├── get_features.py
    │   ├── feature_extractor.py
    │   ├── spectrogram.py
    │   ├── sigproc.py
    │   └── mfcc.py
    ├── model
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── deepspeech2.cpython-37.pyc
    │   │   └── deepasrnetwork1.cpython-37.pyc
    │   ├── __init__.py
    │   ├── compilemodel.py
    │   ├── deepasrnetwork1.py
    │   └── deepspeech2.py
    ├── pipeline
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── __init__.cpython-37.pyc
    │   │   ├── pipeline.cpython-36.pyc
    │   │   ├── pipeline.cpython-37.pyc
    │   │   ├── ctc_pipeline.cpython-36.pyc
    │   │   ├── ctc_pipeline.cpython-37.pyc
    │   │   └── get_pipeline.cpython-37.pyc
    │   ├── __init__.py
    │   ├── pipeline.py
    │   ├── get_pipeline.py
    │   └── ctc_pipeline.py
    └── __init__.py
├── setup.cfg
├── MANIFEST.in
├── setup.py
├── app.py
├── README.md
├── LICENSE
└── DeepAsr_CTC_Pipeline.ipynb


/deepasr/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 | 


--------------------------------------------------------------------------------
/deepasr/vocab/__init__.py:
--------------------------------------------------------------------------------
1 | from .alphabet import Alphabet
2 | 


--------------------------------------------------------------------------------
/deepasr/evaluate/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluate import calculate_error_rates
2 | 


--------------------------------------------------------------------------------
/deepasr/decoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .decoder import Decoder, GreedyDecoder, BeamSearchDecoder
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [egg_info]
2 | tag_build =
3 | tag_date = 0
4 | [metadata]
5 | description-file = README.md


--------------------------------------------------------------------------------
/deepasr/augmentation/__init__.py:
--------------------------------------------------------------------------------
1 | from .augmentation import Augmentation
2 | from .spec_augment import SpecAugment
3 | 


--------------------------------------------------------------------------------
/deepasr/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/utils/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/utils/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/utils/__pycache__/utils.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/mfcc.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/mfcc.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/mfcc.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/mfcc.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/model/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/model/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/model/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/model/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/utils/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/utils/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/utils/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/vocab/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/vocab/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/vocab/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/vocab/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/vocab/__pycache__/alphabet.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/vocab/__pycache__/alphabet.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/vocab/__pycache__/alphabet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/vocab/__pycache__/alphabet.cpython-37.pyc


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include app.py
4 | 
5 | include deepasr/vocab/*.txt      # Alphabets
6 | recursive-include deepasr *.py


--------------------------------------------------------------------------------
/deepasr/decoder/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/decoder/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/decoder/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/decoder/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/decoder/__pycache__/decoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/decoder/__pycache__/decoder.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/decoder/__pycache__/decoder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/decoder/__pycache__/decoder.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/sigproc.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/sigproc.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/sigproc.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/sigproc.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/evaluate/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/evaluate/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/evaluate/__pycache__/distance.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/evaluate/__pycache__/distance.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/evaluate/__pycache__/evaluate.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/evaluate/__pycache__/evaluate.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/model/__pycache__/deepspeech2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/model/__pycache__/deepspeech2.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/pipeline/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/pipeline/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__pycache__/pipeline.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/pipeline/__pycache__/pipeline.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__pycache__/pipeline.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/pipeline/__pycache__/pipeline.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/augmentation/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/augmentation/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/augmentation/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/augmentation/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/filter_banks.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/filter_banks.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/filter_banks.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/filter_banks.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/get_features.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/get_features.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/spectrogram.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/spectrogram.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/spectrogram.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/spectrogram.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .compilemodel import compile_model
2 | from .deepspeech2 import get_deepspeech2
3 | from .deepasrnetwork1 import get_deepasrnetwork1
4 | 


--------------------------------------------------------------------------------
/deepasr/model/__pycache__/deepasrnetwork1.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/model/__pycache__/deepasrnetwork1.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__pycache__/ctc_pipeline.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/pipeline/__pycache__/ctc_pipeline.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__pycache__/ctc_pipeline.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/pipeline/__pycache__/ctc_pipeline.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__pycache__/get_pipeline.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/pipeline/__pycache__/get_pipeline.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/augmentation/__pycache__/augmentation.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/augmentation/__pycache__/augmentation.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/augmentation/__pycache__/augmentation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/augmentation/__pycache__/augmentation.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/augmentation/__pycache__/spec_augment.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/augmentation/__pycache__/spec_augment.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/augmentation/__pycache__/spec_augment.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/augmentation/__pycache__/spec_augment.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/feature_extractor.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/feature_extractor.cpython-36.pyc


--------------------------------------------------------------------------------
/deepasr/features/__pycache__/feature_extractor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scionoftech/DeepAsr/HEAD/deepasr/features/__pycache__/feature_extractor.cpython-37.pyc


--------------------------------------------------------------------------------
/deepasr/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline import Pipeline
2 | from .ctc_pipeline import CTCPipeline
3 | from .get_pipeline import load
4 | # from .get_pipeline import load_checkpoint
5 | 


--------------------------------------------------------------------------------
/deepasr/augmentation/augmentation.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import numpy as np
 3 | 
 4 | 
 5 | class Augmentation:
 6 | 
 7 |     @abc.abstractmethod
 8 |     def __call__(self, batch_features: np.ndarray) -> np.ndarray:
 9 |         pass
10 | 


--------------------------------------------------------------------------------
/deepasr/features/__init__.py:
--------------------------------------------------------------------------------
1 | from .feature_extractor import FeaturesExtractor
2 | from .filter_banks import FilterBanks
3 | from .spectrogram import Spectrogram
4 | from . import mfcc
5 | from . import sigproc
6 | from .get_features import preprocess
7 | 


--------------------------------------------------------------------------------
/deepasr/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import augmentation
 2 | from . import decoder
 3 | from . import evaluate
 4 | from . import features
 5 | from . import model
 6 | from . import pipeline
 7 | from . import utils
 8 | from . import vocab
 9 | 
10 | # Version of the deepasr package
11 | __version__ = "0.1.1"
12 | 


--------------------------------------------------------------------------------
/deepasr/utils/getmeta.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from tinytag import TinyTag
 3 | 
 4 | 
 5 | # https://pypi.org/project/tinytag/
 6 | 
 7 | def get_file_tags(audio_file):
 8 |     tag = TinyTag.get(audio_file)
 9 |     print(tag.filesize, '|', tag.audio_offest, "|", tag.bitrate, "|", tag.channels, "|", tag.duration, "|",
10 |           tag.samplerate, "|", tag.audio_offset)
11 | 
12 | 
13 | if __name__ == "__main__":
14 |     get_file_tags(sys.argv[1])
15 | 


--------------------------------------------------------------------------------
/deepasr/vocab/alphabet-en.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | #   Alphabet is the list of valid characters. There are two special characters:
 3 | #       - space: at the beginning
 4 | #       - blank: default added as the last char
 5 | #
 6 | #   To comment the line use `#`
 7 | #
 8 |  
 9 | a
10 | b
11 | c
12 | d
13 | e
14 | f
15 | g
16 | h
17 | i
18 | j
19 | k
20 | l
21 | m
22 | n
23 | o
24 | p
25 | q
26 | r
27 | s
28 | t
29 | u
30 | v
31 | w
32 | x
33 | y
34 | z
35 | '
36 | 
37 | # End of vocabulary
38 | # The last (non-comment) blank line represents the blank token


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open('README.md') as f:
 4 |     long_description = f.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="deepasr",
 8 |     version="0.1.2",
 9 |     author="Sai Kumar Yava",
10 |     author_email="saikumar.geek@gmail.com",
11 |     description="Keras(Tensorflow) implementations of Automatic Speech Recognition",
12 |     long_description=long_description,
13 |     long_description_content_type='text/markdown',
14 |     url="https://github.com/scionoftech/DeepAsr",
15 |     include_package_data=True,
16 |     packages=['deepasr'],
17 |     keywords=['deepspeech', 'asr', 'speech recognition', 'speech to text'],
18 |     license='GNU',
19 |     install_requires=['tensorflow>=2.0', 'pandas', 'tables', 'scipy', 'librosa'],
20 |     python_requires='>=3.6',
21 | )
22 | 


--------------------------------------------------------------------------------
/deepasr/features/filter_banks.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from .mfcc import fbank
 3 | from . import feature_extractor
 4 | 
 5 | 
 6 | class FilterBanks(feature_extractor.FeaturesExtractor):
 7 | 
 8 |     def __init__(self, features_num: int, samplerate: int = 16000, is_standardization=True, **kwargs):
 9 |         self.features_num = features_num
10 |         self.is_standardization = is_standardization
11 |         self.params = kwargs
12 |         self.samplerate = samplerate
13 | 
14 |     def make_features(self, audio: np.ndarray) -> np.ndarray:
15 |         """ Use `python_speech_features` lib to extract log filter banks from
16 |         the features file. """
17 |         audio = self.normalize(audio.astype(np.float32))
18 |         audio = (audio * np.iinfo(np.int16).max).astype(np.int16)
19 |         feat, energy = fbank(
20 |             audio, nfilt=self.features_num, samplerate=self.samplerate, **self.params
21 |         )
22 |         features = np.log(feat)
23 |         return self.standardize(features) if self.is_standardization else features
24 | 


--------------------------------------------------------------------------------
/deepasr/features/get_features.py:
--------------------------------------------------------------------------------
 1 | from .filter_banks import FilterBanks
 2 | from .spectrogram import Spectrogram
 3 | 
 4 | 
 5 | def preprocess(feature_type: str = 'fbank', features_num: int = 161,
 6 |                samplerate: int = 16000,
 7 |                winlen: float = 0.02,
 8 |                winstep: float = 0.01,
 9 |                winfunc=None,
10 |                is_standardization=True,
11 |                pad_audio_to: int = 0):
12 |     ''' This method extracts the audio features based on fbank or spectrogram '''
13 |     if feature_type == 'fbank':
14 |         features_extractor = FilterBanks(features_num=features_num, samplerate=samplerate, winlen=winlen,
15 |                                          winstep=winstep, winfunc=winfunc,
16 |                                          is_standardization=is_standardization)
17 |         return features_extractor
18 |     elif feature_type == 'spectrogram':
19 |         features_extractor = Spectrogram(
20 |             features_num=features_num,
21 |             samplerate=samplerate,
22 |             winlen=winlen,
23 |             winstep=winstep,
24 |             winfunc=winfunc,
25 |             pad_audio_to=pad_audio_to
26 |         )
27 |         return features_extractor
28 | 


--------------------------------------------------------------------------------
/deepasr/pipeline/pipeline.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | from typing import List
 3 | import numpy as np
 4 | import pandas as pd
 5 | from tensorflow import keras
 6 | import sys
 7 | 
 8 | sys.path.append("..")
 9 | from deepasr.decoder import Decoder
10 | from deepasr.features import FeaturesExtractor
11 | from deepasr.vocab import Alphabet
12 | 
13 | 
class Pipeline:
    """ Interface listing the components and operations of a pipeline.

    The class does not derive from `abc.ABC`, so the `abstractmethod`
    markers document intent but are not enforced at instantiation time.
    Every member below returns `None` unless a subclass overrides it.
    """

    @property
    @abc.abstractmethod
    def alphabet(self) -> Alphabet:
        """ The `Alphabet` component of the pipeline. """

    @property
    @abc.abstractmethod
    def features_extractor(self) -> FeaturesExtractor:
        """ The `FeaturesExtractor` component of the pipeline. """

    @property
    @abc.abstractmethod
    def model(self) -> keras.Model:
        """ The Keras model of the pipeline. """

    @property
    @abc.abstractmethod
    def decoder(self) -> Decoder:
        """ The `Decoder` component of the pipeline. """

    @abc.abstractmethod
    def fit(self,
            train_dataset: pd.DataFrame,
            val_dataset: pd.DataFrame,
            prepared_features=False,
            **kwargs) -> keras.callbacks.History:
        """ Train on `train_dataset`, validating on `val_dataset`. """

    @abc.abstractmethod
    def predict(self, batch_audio: List[np.ndarray], **kwargs) -> List[str]:
        """ Produce one string per entry of `batch_audio`. """

    @abc.abstractmethod
    def save(self, directory: str):
        """ Persist the pipeline under `directory`. """
51 | 


--------------------------------------------------------------------------------
/deepasr/pipeline/get_pipeline.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import os
 3 | from deepasr.utils import load_data
 4 | from deepasr.pipeline import CTCPipeline
 5 | from deepasr.model import compile_model
 6 | 
 7 | 
 8 | def load(directory: str):
 9 |     """ Load each component of the CTC pipeline. """
10 | 
11 |     _label_len = load_data(os.path.join(directory, 'label_len.bin'))
12 |     _optimizer = load_data(os.path.join(directory, 'optimizer.bin'))
13 |     _network = tf.keras.models.load_model(os.path.join(directory, 'network.h5'))
14 |     _model = _network
15 |     _model = compile_model(_model, _optimizer, _label_len)
16 |     _model.load_weights(os.path.join(directory, 'model_weights.h5'))
17 |     _alphabet = load_data(os.path.join(directory, 'alphabet.bin'))
18 |     _decoder = load_data(os.path.join(directory, 'decoder.bin'))
19 |     _features_extractor = load_data(
20 |         os.path.join(directory, 'feature_extractor.bin'))
21 |     _multi_gpu_flag = load_data(os.path.join(directory, 'multi_gpu_flag.bin'))
22 |     _sample_rate = load_data(os.path.join(directory, 'sample_rate.bin'))
23 |     _mono = load_data(os.path.join(directory, 'mono.bin'))
24 | 
25 |     pipeline = CTCPipeline(
26 |         alphabet=_alphabet, features_extractor=_features_extractor, model=_model, optimizer=_optimizer,
27 |         decoder=_decoder, sample_rate=_sample_rate, mono=_mono, label_len=_label_len, multi_gpu=_multi_gpu_flag,
28 |         temp_model=_network
29 |     )
30 |     return pipeline
31 | 


--------------------------------------------------------------------------------
/deepasr/model/compilemodel.py:
--------------------------------------------------------------------------------
 1 | from tensorflow.keras.layers import *
 2 | from tensorflow.keras.models import Model
 3 | import tensorflow.keras.backend as K
 4 | import logging
 5 | 
 6 | logger = logging.getLogger('asr.pipeline')
 7 | 
 8 | 
 9 | def ctc_loss(args):
10 |     """ The CTC loss using TensorFlow's `ctc_loss`. """
11 |     y_pred, labels, input_length, label_length = args
12 |     return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
13 | 
14 | 
def compile_model(_model, _optimizer, label_len=None):
    """ Wrap `_model` in a compiled training model whose output is the CTC loss.

    The returned model takes four inputs — the first input of `_model`, the
    labels, the input lengths and the label lengths — and is compiled with
    `_optimizer`. `label_len` sets the shape of the labels input.
    """
    acoustic_input = _model.inputs[0]
    acoustic_output = _model.outputs[0]

    # Ground-truth labels that the network output is compared against.
    labels_in = Input(name='the_labels', shape=[label_len], dtype='float32')
    # Length of each sample in the prediction tensor.
    input_length_in = Input(name='input_length', shape=[1], dtype='float32')
    # Length of each sample in the label tensor.
    label_length_in = Input(name='label_length', shape=[1], dtype='float32')

    ctc_layer = Lambda(ctc_loss, output_shape=(1,), name='ctc')
    loss_out = ctc_layer([acoustic_output, labels_in, input_length_in, label_length_in])

    trainable = Model(
        inputs=[acoustic_input, labels_in, input_length_in, label_length_in],
        outputs=loss_out,
        name="DeepAsr",
    )
    # The 'ctc' layer already outputs the loss value, so the compiled loss
    # function simply passes the layer output through.
    trainable.compile(
        loss={'ctc': lambda y_true, y_pred: y_pred},
        optimizer=_optimizer,
        metrics=['accuracy'],
    )

    logger.info("Model is successfully compiled")
    return trainable
36 | 


--------------------------------------------------------------------------------
/deepasr/decoder/decoder.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | # import itertools
 3 | from typing import List
 4 | import numpy as np
 5 | from tensorflow.keras import backend as K
 6 | 
 7 | 
 8 | # https://www.tensorflow.org/api_docs/python/tf/keras/backend/ctc_decode
 9 | 
10 | class Decoder:
11 | 
12 |     @abc.abstractmethod
13 |     def __call__(self, batch_logits: np.ndarray, input_length: int) -> List[np.ndarray]:
14 |         pass
15 | 
16 | 
17 | class GreedyDecoder:
18 | 
19 |     def __call__(self, batch_logits: np.ndarray, input_length: int) -> List[np.ndarray]:
20 |         """ Decode the best guess from logits using greedy algorithm. """
21 |         # Choose the class with maximum probability
22 |         # best_candidates = np.argmax(batch_logits, axis=2)
23 |         # Merge repeated chars
24 |         # decoded = [np.array([k for k, _ in itertools.groupby(best_candidate)])
25 |         #            for best_candidate in best_candidates]
26 |         decoded = np.array(
27 |             (K.eval(K.ctc_decode(batch_logits, [input_length], greedy=True)[0][0])).flatten().tolist())
28 |         return [decoded]
29 | 
30 | 
31 | class BeamSearchDecoder:
32 | 
33 |     def __init__(self, beam_width: int, top_paths: int):
34 |         self.beam_width = beam_width
35 |         self.top_paths = top_paths
36 | 
37 |     def __call__(self, batch_logits: np.ndarray, input_length: int, **kwargs) -> List[
38 |         np.ndarray]:
39 |         """ Decode the best guess from logits using beam search algorithm. """
40 |         decoded = np.array((K.eval(
41 |             K.ctc_decode(batch_logits, [input_length], greedy=False, beam_width=self.beam_width,
42 |                          top_paths=self.top_paths)[0][
43 |                 0])).flatten().tolist())
44 |         return [decoded]
45 | 


--------------------------------------------------------------------------------
/deepasr/features/feature_extractor.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | from typing import List, Tuple
 3 | import numpy as np
 4 | 
 5 | 
 6 | class FeaturesExtractor:
 7 | 
 8 |     def __index__(self):
 9 |         self.features_shape = None
10 | 
11 |     def __call__(self, batch_audio: List[np.ndarray]) -> np.ndarray:
12 |         """ Extract features from the file list. """
13 |         features = [self.make_features(audio) for audio in batch_audio]
14 |         self.features_shape = max(features, key=len).shape
15 |         X = self.align(features, self.features_shape)
16 |         return X.astype(np.float16)
17 | 
18 |     @abc.abstractmethod
19 |     def make_features(self, audio: np.ndarray) -> np.ndarray:
20 |         pass
21 | 
22 |     @staticmethod
23 |     def standardize(features: np.ndarray) -> np.ndarray:
24 |         """ Standardize globally, independently of features. """
25 |         mean = np.mean(features)
26 |         std = np.std(features)
27 |         return (features - mean) / std
28 | 
29 |     @staticmethod
30 |     def normalize(audio: np.ndarray):
31 |         """ Normalize float32 signal to [-1, 1] range. """
32 |         gain = 1.0 / (np.max(np.abs(audio)) + 1e-5)
33 |         return audio * gain
34 | 
35 |     @staticmethod
36 |     def align(arrays: list, features_shape: Tuple, default=0) -> np.ndarray:
37 |         """ Pad arrays (default along time dimensions). Return the single
38 |         array (batch_size, time, features). """
39 |         # max_array = max(arrays, key=len)
40 |         X = np.full(shape=[len(arrays), *features_shape],
41 |                     fill_value=default, dtype=float)
42 |         for index, array in enumerate(arrays):
43 |             time_dim, features_dim = array.shape
44 |             X[index, :time_dim] = array
45 |         return X
46 | 


--------------------------------------------------------------------------------
/deepasr/features/spectrogram.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import numpy as np
 3 | from . import sigproc
 4 | from . import feature_extractor
 5 | 
 6 | 
 7 | class Spectrogram(feature_extractor.FeaturesExtractor):
 8 | 
 9 |     def __init__(self,
10 |                  features_num: int,
11 |                  samplerate: int,
12 |                  winlen: float,
13 |                  winstep: float,
14 |                  winfunc=None,
15 |                  is_standardization=True,
16 |                  pad_audio_to: int = 0):
17 |         self.features_num = features_num
18 |         self.winfunc = winfunc
19 |         self.frame_len = int(winlen * samplerate)
20 |         self.frame_step = int(winstep * samplerate)
21 |         self.is_standardization = is_standardization
22 |         self.pad_to = pad_audio_to
23 | 
24 |     def make_features(self, audio: np.ndarray) -> np.ndarray:
25 |         """ Use `python_speech_features` lib to extract log-spectrogram's. """
26 |         audio = self.normalize(audio.astype(np.float32))
27 |         audio = (audio * np.iinfo(np.int16).max).astype(np.int16)
28 |         audio = self.pad(audio) if self.pad_to else audio
29 |         frames = sigproc.framesig(
30 |             audio, self.frame_len, self.frame_step, self.winfunc
31 |         )
32 |         features = sigproc.logpowspec(
33 |             frames, self.frame_len, norm=1
34 |         )
35 |         features = features[:, :self.features_num]  # Cut high frequency part
36 |         return self.standardize(features) if self.is_standardization else features
37 | 
38 |     def pad(self, audio: np.ndarray) -> np.ndarray:
39 |         """ Padding signal is required if you play with mixed precession. """
40 |         length = 1 + int((len(audio) - self.frame_len) // self.frame_step + 1)
41 |         pad_size = (self.pad_to - length % self.pad_to) * self.frame_step
42 |         return np.pad(audio, (0, pad_size), mode='constant')
43 | 


--------------------------------------------------------------------------------
/deepasr/model/deepasrnetwork1.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from tensorflow import keras
 4 | from tensorflow.keras.models import Model
 5 | from tensorflow.keras.layers import *
 6 | from tensorflow.keras.mixed_precision import experimental as mixed_precision
 7 | 
 8 | 
 9 | def get_deepasrnetwork1(input_dim=None, output_dim=29,
10 |                         is_mixed_precision=True, random_state=1) -> keras.Model:
11 |     """ Build the network: BatchNorm -> 2x Conv1D -> forward/backward GRU -> Dense softmax.
12 | 
13 |     input_dim: int and a multiple of 4
14 |     output_dim: number of letters in the vocabulary
15 |     random_state: seed passed to both NumPy and TensorFlow
16 |     """
17 |     if is_mixed_precision:  # NOTE(review): the policy set below is 'float32', not a mixed one - confirm
18 |         policy = mixed_precision.Policy('float32')
19 |         mixed_precision.set_policy(policy)
20 | 
21 |     np.random.seed(random_state)
22 |     tf.random.set_seed(random_state)
23 | 
24 |     # the input: sequences of any length with `input_dim` features per step
25 |     input_data = Input(name='the_input', shape=(None, input_dim), dtype='float32')
26 | 
27 |     # Batch normalize
28 |     bn1 = BatchNormalization(axis=-1, name='BN_1')(input_data)
29 | 
30 |     # 1D Convs ('valid' padding, so each one shortens the time axis)
31 |     conv = Conv1D(filters=220, kernel_size=5, strides=1, padding='valid', activation='relu', name='Conv1D_1')(bn1)
32 |     conv = BatchNormalization(name="CNBN_1")(conv)
33 |     conv1 = Conv1D(filters=220, kernel_size=5, strides=1, padding='valid', activation='relu', name='Conv1D_2')(conv)
34 |     conv1 = BatchNormalization(name="CNBN_2")(conv1)
35 | 
36 |     # RNN: one GRU reads the sequence forwards, the other one backwards (go_backwards=True)
37 |     gru_1 = GRU(512, return_sequences=True, name='gru_1')(conv1)
38 |     gru_2 = GRU(512, return_sequences=True, go_backwards=True, name='gru_2')(conv1)
39 | 
40 |     # merge the two GRU outputs; NOTE(review): gru_2 presumably returns reversed time order - confirm alignment
41 |     merged = concatenate([gru_1, gru_2])
42 |     # Batch normalize
43 |     bn2 = BatchNormalization(axis=-1, name="BN_2")(merged)
44 | 
45 |     dense = TimeDistributed(Dense(30))(bn2)
46 |     y_pred = TimeDistributed(Dense(output_dim, activation='softmax', name='y_pred'), name='the_output')(dense)
47 | 
48 |     model = Model(inputs=input_data, outputs=y_pred)
49 | 
50 |     return model
51 | 


--------------------------------------------------------------------------------
/deepasr/augmentation/spec_augment.py:
--------------------------------------------------------------------------------
 1 | from typing import Tuple
 2 | import numpy as np
 3 | 
 4 | 
 5 | class SpecAugment:
 6 | 
 7 |     def __init__(self,
 8 |                  F: int = None,
 9 |                  mf: int = None,
10 |                  Tmin: int = None,
11 |                  Tmax: int = None,
12 |                  mt: int = None):
13 |         """ SpecAugment: A Simple Data Augmentation Method. """
14 |         self.F = F
15 |         self.mf = mf
16 |         self.Tmin = Tmin
17 |         self.Tmax = Tmax
18 |         self.mt = mt
19 | 
20 |     def __call__(self, batch_features: np.ndarray) -> np.ndarray:
21 |         return np.stack([self.mask_features(features) for features in batch_features], axis=0)
22 | 
23 |     def mask_features(self, features: np.ndarray) -> np.ndarray:
24 |         features = features.copy()
25 |         time, channels = features.shape
26 |         means = features.mean(axis=0)  # The mean should be zero if features are normalized
27 |         if self.F and self.mf:
28 |             features = self.mask_frequencies(features, means, channels, self.F, self.mf)
29 |         if self.Tmax and self.mt:
30 |             features = self.mask_time(features, means, time, (self.Tmin, self.Tmax), self.mt)
31 |         return features
32 | 
33 |     @staticmethod
34 |     def mask_frequencies(features: np.ndarray, means: np.ndarray, channels: int, F: int, mf: int):
35 |         for i in range(mf):
36 |             f = np.random.random_integers(low=0, high=F)
37 |             f0 = np.random.random_integers(low=0, high=channels - F)
38 |             features[:, f0:f0 + f] = means[f0:f0 + f]
39 |         return features
40 | 
41 |     @staticmethod
42 |     def mask_time(features: np.ndarray, means: np.ndarray, time: int, T_range: Tuple[int, int], mt: int):
43 |         Tmin, Tmax = T_range
44 |         for i in range(mt):
45 |             t = np.random.random_integers(low=Tmin, high=Tmax)
46 |             t0 = np.random.random_integers(low=0, high=time - Tmax)
47 |             features[t0:t0 + t, :] = means
48 |         return features
49 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import tensorflow as tf
 4 | import deepasr as asr
 5 | 
 6 | 
 7 | # get CTCPipeline
 8 | def get_config(feature_type: str = 'spectrogram', multi_gpu: bool = False):
 9 |     # audio feature extractor
10 |     features_extractor = asr.features.preprocess(feature_type=feature_type, features_num=161,
11 |                                                  samplerate=16000,
12 |                                                  winlen=0.02,
13 |                                                  winstep=0.025,
14 |                                                  winfunc=np.hanning)
15 | 
16 |     # input label encoder
17 |     alphabet_en = asr.vocab.Alphabet(lang='en')
18 |     # training model
19 |     model = asr.model.get_deepasrnetwork1(
20 |         input_dim=161,
21 |         output_dim=29,
22 |         is_mixed_precision=True
23 |     )
24 |     # model optimizer
25 |     optimizer = tf.keras.optimizers.Adam(
26 |         lr=1e-4,
27 |         beta_1=0.9,
28 |         beta_2=0.999,
29 |         epsilon=1e-8
30 |     )
31 |     # output label deocder
32 |     decoder = asr.decoder.GreedyDecoder()
33 |     # CTCPipeline
34 |     pipeline = asr.pipeline.ctc_pipeline.CTCPipeline(
35 |         alphabet=alphabet_en, features_extractor=features_extractor, model=model, optimizer=optimizer, decoder=decoder,
36 |         sample_rate=16000, mono=True, multi_gpu=multi_gpu
37 |     )
38 |     return pipeline
39 | 
40 | 
41 | def run():
42 | 
43 |     train_data = pd.read_csv('train_data.csv')
44 | 
45 |     pipeline = get_config(feature_type = 'fbank', multi_gpu=False)
46 | 
47 |     # train asr model
48 |     history = pipeline.fit(train_dataset=train_data, batch_size=128, epochs=500)
49 |     # history = pipeline.fit_generator(train_dataset = train_data, batch_size=32, epochs=500)
50 | 
51 |     pipeline.save('./checkpoints')
52 | 
53 |     return history
54 | 
55 | 
56 | def test_model(test_data):
57 |     test_data = pd.read_csv('test_data.csv')
58 |     pipeline = asr.pipeline.load('checkpoints')
59 |     print("Truth:", test_data['transcripts'].to_list()[0])
60 |     print("Prediction", pipeline.predict(test_data['path'].to_list()[0]))
61 | 
62 | 
63 | if __name__ == "__main__":  # script entry point: train the model and save the pipeline
64 |     run()
65 |     # test_model(test)
66 | 


--------------------------------------------------------------------------------
/deepasr/evaluate/evaluate.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Iterable, Tuple, Union
 2 | from collections import namedtuple
 3 | import pandas as pd
 4 | from . import distance
 5 | # from .. import dataset
 6 | from .. import pipeline
 7 | 
 8 | Metric = namedtuple('Metric', ['transcript', 'prediction', 'wer', 'cer'])
 9 | 
10 | 
11 | def calculate_error_rates(ctc_pipeline: pipeline.Pipeline,
12 |                           data: pd.DataFrame,
13 |                           return_metrics: bool = False
14 |                           ) -> Union[Tuple[float, float], pd.DataFrame]:
15 |     """ Calculate base metrics: WER and CER. """
16 |     metrics = []
17 |     for audio, transcript in zip(data['path'].values, data['transcripts'].values):
18 |         prediction = ctc_pipeline.predict(audio)
19 |         batch_metrics = get_metrics(sources=prediction,
20 |                                     destinations=[transcript])
21 |         metrics.extend(batch_metrics)
22 |     metrics = pd.DataFrame(metrics)
23 |     return metrics if return_metrics else (metrics.wer.mean(), metrics.cer.mean())
24 | 
25 | 
26 | def get_metrics(sources: List[str],
27 |                 destinations: List[str]) -> Iterable[Metric]:
28 |     """ Calculate base metrics in one batch: WER and CER. """
29 |     for source, destination in zip(sources, destinations):
30 |         wer_distance, *_ = distance.edit_distance(source.split(),
31 |                                                   destination.split())
32 |         wer = wer_distance / len(destination.split())
33 | 
34 |         cer_distance, *_ = distance.edit_distance(list(source),
35 |                                                   list(destination))
36 |         cer = cer_distance / len(destination)
37 |         yield Metric(destination, source, wer, cer)
38 | 
39 | 
40 | def get_cer(source: str, destination: str) -> float:
41 |     cer_distance, *_ = distance.edit_distance(list(source),
42 |                                               list(destination))
43 |     cer = cer_distance / len(destination)
44 | 
45 |     return cer
46 | 
47 | 
48 | def get_wer(source: str, destination: str) -> float:
49 |     wer_distance, *_ = distance.edit_distance(source.split(),
50 |                                               destination.split())
51 |     wer = wer_distance / len(destination.split())
52 | 
53 |     return wer
54 | 


--------------------------------------------------------------------------------
/deepasr/evaluate/activations.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | import operator
 3 | from typing import Callable, List, Union, Tuple
 4 | import h5py
 5 | import numpy as np
 6 | import pandas as pd
 7 | import tensorflow as tf
 8 | from tensorflow import keras
 9 | from . import evaluate
10 | # from .. import dataset
11 | from .. import pipeline
12 | from .. import utils
13 | 
14 | 
15 | def save_metrics_and_activations(pipeline: pipeline.Pipeline,
16 |                                  data: pd.DataFrame,
17 |                                  store_path: str,
18 |                                  prepared_features: bool = False,
19 |                                  return_metrics: bool = False
20 |                                  ) -> Union[Tuple[float, float], pd.DataFrame]:
21 |     columns = ['sample_id', 'transcript', 'prediction', 'wer', 'cer']
22 |     references = pd.DataFrame(columns=columns).set_index('sample_id')
23 |     get_activations = get_activations_function(pipeline.model)
24 | 
25 |     with h5py.File(store_path, mode='w') as store:
26 |         for audio, transcript in zip(data['path'].values, data['transcripts'].values):
27 |             features = audio if prepared_features else pipeline.features_extractor([utils.read_audio(audio)])
28 |             *activations, y_hat = get_activations([features, 0])
29 |             decoded_labels = pipeline.decoder(y_hat)
30 |             predictions = pipeline.alphabet.get_batch_transcripts(decoded_labels)
31 |             batch_metrics = list(evaluate.get_metrics(sources=predictions,
32 |                                                       destinations=transcript))
33 | 
34 |             save_in_store(store, [*activations, y_hat], batch_metrics, references)
35 | 
36 |     with pd.HDFStore(store_path, mode='r+') as store:
37 |         store.put('references', references)
38 |     metrics = pd.DataFrame(functools.reduce(operator.concat, batch_metrics))
39 |     return metrics if return_metrics else (metrics.wer.mean(), metrics.cer.mean())
40 | 
41 | 
42 | def get_activations_function(model: keras.Model) -> Callable:
43 |     """ Function which handle all activations through one pass. """
44 |     inputs = [model.input, tf.keras.learning_phase()]
45 |     outputs = [layer.output for layer in model.layers][1:]
46 |     return tf.keras.function(inputs, outputs)
47 | 
48 | 
49 | def save_in_store(store: h5py.File,
50 |                   layer_outputs: List[np.ndarray],
51 |                   metrics: List[evaluate.Metric],
52 |                   references: pd.DataFrame):
53 |     """ Save batch data into HDF5 file. """
54 |     for index, metric in enumerate(metrics):
55 |         sample_id = len(references)
56 |         references.loc[sample_id] = metric
57 |         for output_index, batch_layer_outputs in enumerate(layer_outputs):
58 |             layer_output = batch_layer_outputs[index]
59 |             store.create_dataset(f'outputs/{output_index}/{sample_id}', data=layer_output)
60 | 


--------------------------------------------------------------------------------
/deepasr/utils/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import logging
 4 | from functools import reduce
 5 | from logging import Logger
 6 | from typing import Any
 7 | import numpy as np
 8 | import librosa
 9 | # from scipy.io import wavfile
10 | from tensorflow import keras
11 | 
12 | # from google.cloud import storage
13 | 
14 | logger = logging.getLogger('asr.utils')
15 | 
16 | 
17 | def load_data(file_path: str):
18 |     """ Load arbitrary python objects from the pickled file. """
19 |     with open(file_path, mode='rb') as file:
20 |         return pickle.load(file)
21 | 
22 | 
23 | def save_data(data: Any, file_path: str):
24 |     """ Save arbitrary python objects in the pickled file. """
25 |     with open(file_path, mode='wb') as file:
26 |         pickle.dump(data, file)
27 | 
28 | 
29 | # def download_from_bucket(bucket_name: str, remote_path: str, local_path: str):
30 | #     """ Download the file from the public bucket. """
31 | #     client = storage.Client.create_anonymous_client()
32 | #     bucket = client.bucket(bucket_name)
33 | #     blob = storage.Blob(remote_path, bucket)
34 | #     blob.download_to_filename(local_path, client=client)
35 | 
36 | 
37 | # def maybe_download_from_bucket(bucket_name: str, remote_path: str, local_path: str):
38 | #     """ Download file from the bucket if it does not exist. """
39 | #     if os.path.isfile(local_path):
40 | #         return
41 | #     directory = os.path.dirname(local_path)
42 | #     os.makedirs(directory, exist_ok=True)
43 | #     logger.info('Downloading file from the bucket...')
44 | #     download_from_bucket(bucket_name, remote_path, local_path)
45 | 
46 | 
47 | def read_audio(file_path: str, sample_rate: int, mono: bool) -> np.ndarray:
48 |     """ Read already prepared features from the store. """
49 |     # fs, audio = wavfile.read(file_path)
50 |     audio = librosa.core.load(file_path, sr=sample_rate, mono=mono)[0]
51 |     return audio
52 | 
53 | 
54 | def calculate_units(model: keras.Model) -> int:
55 |     """ Calculate number of the model parameters. """
56 |     units = 0
57 |     for parameters in model.get_weights():
58 |         units += reduce(lambda x, y: x * y, parameters.shape)
59 |     return units
60 | 
61 | 
62 | def create_logger(file_path=None, level=20, name='asr') -> Logger:
63 |     """ Create the logger and handlers both console and file. """
64 |     logger = logging.getLogger(name)
65 |     logger.setLevel(level)
66 |     formatter = logging.Formatter(
67 |         '%(asctime)s [%(levelname)-8s] [%(name)-20s] %(message)s',
68 |         datefmt='%Y-%m-%d %H:%M:%S'
69 |     )
70 |     console = logging.StreamHandler()
71 |     console.setFormatter(formatter)
72 |     logger.addHandler(console)  # handle all messages from logger
73 |     if file_path:
74 |         file_handler = logging.FileHandler(file_path, mode='w')
75 |         file_handler.setFormatter(formatter)
76 |         logger.addHandler(file_handler)
77 |     return logger
78 | 


--------------------------------------------------------------------------------
/deepasr/vocab/alphabet.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import List
 3 | import numpy as np
 4 | 
 5 | 
 6 | class Alphabet:
 7 |     """
 8 |     Read alphabet-pl.txt, which is the list of valid characters. Alphabet has
 9 |     two special characters:
10 |     - space on the beginning
11 |     - blank token default added as the last char
12 | 
13 |     This class is used to convert characters to labels and vice versa.
14 |     """
15 | 
16 |     def __init__(self, file_path: str = None, lang: str = None):
17 |         self.size = 0
18 |         self.blank_token = None
19 |         self._label_to_str = []
20 |         self._str_to_label = {}
21 |         if lang in ['en', 'pl']:
22 |             directory = os.path.dirname(os.path.abspath(__file__))
23 |             file_path = os.path.join(directory, f'alphabet-{lang}.txt')
24 |         elif not file_path:
25 |             raise ValueError
26 |         self.process_alphabet_file(file_path)
27 | 
28 |     def __contains__(self, char: str) -> bool:
29 |         """ Check if char is in the Alphabet. """
30 |         return char in self._str_to_label
31 | 
32 |     def string_from_label(self, label: int) -> str:
33 |         """ Convert label to string. """
34 |         return self._label_to_str[label]
35 | 
36 |     def label_from_string(self, string: str) -> int:
37 |         """ Convert string to label. """
38 |         return self._str_to_label[string]
39 | 
40 |     def process_alphabet_file(self, file_path: str):
41 |         """ Read alphabet-pl.txt file. """
42 |         with open(file_path) as file:
43 |             for line in file:
44 |                 if line.startswith('#'):
45 |                     continue
46 |                 # Char can contain more than one letter
47 |                 char = line[:-1]  # remove the line ending
48 |                 self._label_to_str.append(char)
49 |                 self._str_to_label[char] = self.size
50 |                 self.size += 1
51 |             # Blank token is added on the end
52 |             self.blank_token = self.size - 1
53 | 
54 |     def get_batch_labels(self, transcripts: List[str]) -> np.ndarray:
55 |         """ Convert batch transcripts to labels """
56 |         batch_labels = [[self.label_from_string(c) for c in transcript.lower().strip() if c in self]
57 |                         for transcript in transcripts]
58 |         max_len = max(map(len, batch_labels))
59 |         default_value = self.blank_token
60 |         for labels in batch_labels:
61 |             remainder = [default_value] * (max_len - len(labels))
62 |             labels.extend(remainder)
63 |         return np.array(batch_labels)
64 | 
65 |     def get_batch_transcripts(self, sequences: List[np.ndarray]) -> List[str]:
66 |         """ Convert label sequences to transcripts. The `-1` also means the
67 |         blank tag """
68 |         return [''.join(self.string_from_label(char_label)
69 |                         for char_label in sequence
70 |                         if char_label not in (-1, self.blank_token))
71 |                 for sequence in sequences]
72 | 
73 | 
74 | if __name__ == "__main__":  # manual check: print the label matrix of two sample sentences
75 |     al = Alphabet(lang='en')
76 |     labels = al.get_batch_labels(["Hi how are you", "i am vey well, what about you"])
77 |     print(labels)
78 | 


--------------------------------------------------------------------------------
/deepasr/evaluate/distance.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | from typing import Tuple, List
 3 | from collections import defaultdict
 4 | import numpy as np
 5 | 
 6 | 
 7 | def edit_distance(source: List[str],
 8 |                   destination: List[str]) -> Tuple[int, np.ndarray, np.ndarray]:
 9 |     """
10 |     Calculation of edit distance between two sequences.
11 | 
12 |     This is the Levenshtein distance with the substitution cost equals 1.
13 |     It is the iterative method with the full matrix support.
14 |     O(nm) time and space complexity.
15 | 
16 |     References:
17 |     https://web.stanford.edu/class/cs124/lec/med.pdf
18 |     https://www.python-course.eu/levenshtein_distance.php
19 |     """
20 |     size_x = len(source) + 1
21 |     size_y = len(destination) + 1
22 |     matrix = np.zeros([size_x, size_y])
23 |     matrix[:, 0] = np.arange(0, size_x)
24 |     matrix[0, :] = np.arange(0, size_y)
25 |     backtrace = np.zeros_like(matrix, dtype=[('del', bool),
26 |                                              ('sub', bool),
27 |                                              ('ins', bool),
28 |                                              ('cost', int)])
29 |     backtrace[:, 0] = (True, False, False, 0)
30 |     backtrace[0, :] = (False, False, True, 0)
31 |     for x, y in itertools.product(range(1, size_x),
32 |                                   range(1, size_y)):
33 |         if source[x - 1] == destination[y - 1]:
34 |             cost = 0
35 |         else:
36 |             cost = 1
37 |         delete = matrix[x - 1][y] + 1
38 |         insert = matrix[x][y - 1] + 1
39 |         substitute = matrix[x - 1][y - 1] + cost
40 |         min_dist = min(delete, insert, substitute)
41 |         matrix[x, y] = min_dist
42 |         backtrace[x, y] = (delete == min_dist,
43 |                            substitute == min_dist,
44 |                            insert == min_dist,
45 |                            cost)
46 |     return matrix[size_x - 1, size_y - 1], matrix, backtrace
47 | 
48 | 
49 | def simple_backtrace(backtrace: np.ndarray):
50 |     """ Calculate the editing path via the backtrace. """
51 |     rows, columns = backtrace.shape
52 |     i, j = rows - 1, columns - 1
53 |     backtrace_indices = [(i, j, 'sub', 0)]
54 |     while (i, j) != (0, 0):
55 |         delete, substitute, insert, cost = backtrace[i, j]
56 |         if insert:
57 |             operation = 'ins'
58 |             i, j = i, j - 1
59 |         elif substitute:
60 |             operation = 'sub'
61 |             i, j = i - 1, j - 1
62 |         elif delete:
63 |             operation = 'del'
64 |             i, j = i - 1, j
65 |         else:
66 |             raise KeyError("Backtrace matrix wrong defined")
67 |         backtrace_indices.append((i, j, operation, cost))
68 |     return list(reversed(backtrace_indices))
69 | 
70 | 
71 | def decode_path(best_path: List[Tuple[int, int, str, int]],
72 |                 source: List[str],
73 |                 destination: List[str]):
74 |     """ Collect all transformations needed to go from `source` to
75 |     `destination`. """
76 |     to_delete, to_insert, to_substitute = [], [], defaultdict(list)
77 |     for index, (i, j, operation, cost) in enumerate(best_path):
78 |         if operation == 'del':
79 |             item = source[i]
80 |             to_delete.append(item)
81 |         elif operation == 'sub' and cost:
82 |             # without cost sub operation indicates correctness
83 |             wrong_item, target_item = source[i], destination[j]
84 |             to_substitute[target_item].append(wrong_item)
85 |         elif operation == 'ins':
86 |             item = destination[j]
87 |             to_insert.append(item)
88 |     return to_delete, to_insert, to_substitute
89 | 


--------------------------------------------------------------------------------
/deepasr/model/deepspeech2.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from tensorflow import keras
 4 | from tensorflow.keras.models import Model
 5 | from tensorflow.keras.layers import *
 6 | from tensorflow.keras.mixed_precision import experimental as mixed_precision
 7 | # from tensorflow.keras.activations import relu
 8 | 
 9 | 
10 | # def clipped_relu(x):
11 | #     return relu(x, max_value=20)
12 | 
13 | 
def get_deepspeech2(input_dim=None, output_dim=29,
                    is_mixed_precision=True, random_state=1) -> keras.Model:
    """Build a DeepSpeech2-style network: Conv1D stack, bidirectional GRU stack, softmax head.

    :param input_dim: number of features per time step; the original note asks for an int
        that is a multiple of 4.
    :param output_dim: number of letters in the alphabet, i.e. the width of the softmax output.
    :param is_mixed_precision: if true, a global Keras dtype policy is installed before the
        layers are created.
    :param random_state: seed handed to both NumPy and TensorFlow.
    :returns: an uncompiled keras Model mapping 'the_input' to the per-time-step softmax
        'the_output'. No loss is attached here.
    """
    if is_mixed_precision:
        # NOTE(review): the policy installed here is plain 'float32', so this flag does not
        # switch to reduced precision -- confirm whether 'mixed_float16' was intended.
        mixed_precision.set_policy(mixed_precision.Policy('float32'))

    # Seed the NumPy and TensorFlow random generators.
    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    # Variable-length sequence of feature vectors.
    features_in = Input(name='the_input', shape=(None, input_dim), dtype='float32')
    x = BatchNormalization(axis=-1, name='BN_1')(features_in)

    # Three 1D convolutions, each followed by batch normalization.
    x = Conv1D(512, 5, strides=1, activation='relu', name='Conv1D_1')(x)
    x = BatchNormalization(axis=-1, name='CBN_1')(x)
    x = Conv1D(512, 5, strides=1, activation='relu', name='Conv1D_2')(x)
    x = BatchNormalization(axis=-1, name='CBN_2')(x)
    x = Conv1D(512, 5, strides=1, activation='relu', name='Conv1D_3')(x)
    x = BatchNormalization(axis=-1, name='BN_2')(x)

    # Five bidirectional GRU layers; dropout sits only between consecutive layers.
    last_layer = 5
    for i in range(1, last_layer + 1):
        gru = GRU(units=800,
                  activation='tanh',
                  recurrent_activation='sigmoid',
                  use_bias=True,
                  return_sequences=True,
                  reset_after=True,
                  name=f'gru_{i}')
        x = Bidirectional(gru,
                          name=f'bidirectional_{i}',
                          merge_mode='concat')(x)
        if i < last_layer:
            x = Dropout(rate=0.5)(x)

    x = BatchNormalization(axis=-1, name='BN_3')(x)

    # Per-time-step fully connected layer followed by the softmax over the alphabet.
    hidden = TimeDistributed(Dense(1024, activation='relu', name='FC1'))(x)
    y_pred = TimeDistributed(Dense(output_dim, activation='softmax', name='y_pred'),
                             name='the_output')(hidden)

    return Model(inputs=features_in, outputs=y_pred)
85 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # DeepAsr
  2 | DeepAsr is an open-source & Keras (Tensorflow) implementation of end-to-end Automatic Speech Recognition (ASR) engine and it supports multiple Speech Recognition architectures.
  3 | 
  4 | Supported Asr Architectures:
  5 | - Baidu's Deep Speech 2
  6 | - DeepAsrNetwork1
  7 | 
  8 | **Using DeepAsr you can**:
  9 | - perform speech-to-text using pre-trained models
 10 | - tune pre-trained models to your needs
 11 | - create new models on your own 
 12 | 
 13 | **DeepAsr key features**:
 14 | - **Multi GPU support**: You can do much more like distribute the training using the [Strategy](https://www.tensorflow.org/guide/distributed_training), or experiment with [mixed precision](https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/experimental/Policy) policy.
 15 | - **CuDNN support**: Models use the [CuDNNLSTM](https://keras.io/layers/recurrent/) implementation by NVIDIA Developers. CPU devices are also supported.
 16 | - **DataGenerator**: Feature extraction is performed during model training, for large datasets.
 17 | 
 18 | ## Installation
 19 | You can use pip:
 20 | ```bash
 21 | pip install deepasr
 22 | ```
 23 | 
 24 | ## Getting started
 25 | Speech recognition is a tough task. You don't need to know all the details to use one of the pretrained models.
 26 | However, it is worth understanding the crucial conceptual components:
 27 | - **Input**: Audio files (WAV or FLAC) with mono 16-bit 16 kHz (up to 5 seconds)
 28 | - **FeaturesExtractor**: Convert audio files using MFCC Features or Spectrogram
 29 | - **Model**: CTC model defined in [**Keras**](https://keras.io/) (references: [[1]](https://arxiv.org/abs/1412.5567), [[2]](https://arxiv.org/abs/1512.02595))
 30 | - **Decoder**: Greedy or BeamSearch algorithms (with language model support) decode a sequence of probabilities using the Alphabet
 31 | - **DataGenerator**: Stream data to the model via generator
 32 | - **Callbacks**: Set of functions monitoring the training
 33 | 
 34 | ```python
 35 | import numpy as np
 36 | import pandas as pd
 37 | import tensorflow as tf
 38 | import deepasr as asr
 39 | 
 40 | # get CTCPipeline
 41 | def get_config(feature_type: str = 'spectrogram', multi_gpu: bool = False):
 42 |     # audio feature extractor
 43 |     features_extractor = asr.features.preprocess(feature_type=feature_type, features_num=161,
 44 |                                                  samplerate=16000,
 45 |                                                  winlen=0.02,
 46 |                                                  winstep=0.025,
 47 |                                                  winfunc=np.hanning)
 48 | 
 49 |     # input label encoder
 50 |     alphabet_en = asr.vocab.Alphabet(lang='en')
 51 |     # training model
 52 |     model = asr.model.get_deepspeech2(
 53 |         input_dim=161,
 54 |         output_dim=29,
 55 |         is_mixed_precision=True
 56 |     )
 57 |     # model optimizer
 58 |     optimizer = tf.keras.optimizers.Adam(
 59 |         lr=1e-4,
 60 |         beta_1=0.9,
 61 |         beta_2=0.999,
 62 |         epsilon=1e-8
 63 |     )
 64 |     # output label decoder
 65 |     decoder = asr.decoder.GreedyDecoder()
 66 |     # decoder = asr.decoder.BeamSearchDecoder(beam_width=100, top_paths=1)
 67 |     # CTCPipeline
 68 |     pipeline = asr.pipeline.ctc_pipeline.CTCPipeline(
 69 |         alphabet=alphabet_en, features_extractor=features_extractor, model=model, optimizer=optimizer, decoder=decoder,
 70 |         sample_rate=16000, mono=True, multi_gpu=multi_gpu
 71 |     )
 72 |     return pipeline
 73 | 
 74 | 
 75 | train_data = pd.read_csv('train_data.csv')
 76 | 
 77 | pipeline = get_config(feature_type = 'fbank', multi_gpu=False)
 78 | 
 79 | # train asr model
 80 | history = pipeline.fit(train_dataset=train_data, batch_size=128, epochs=500)
 81 | # history = pipeline.fit_generator(train_dataset = train_data, batch_size=32, epochs=500)
 82 | 
 83 | pipeline.save('./checkpoint')
 84 | ```
 85 | 
 86 | A loaded pre-trained model has all components. Prediction can be invoked simply by calling pipeline.predict().
 87 | 
 88 | ```python
 89 | import pandas as pd
 90 | import deepasr as asr
 91 | import numpy as np
 92 | test_data = pd.read_csv('test_data.csv')
 93 | 
 94 | # get testing audio and transcript from dataset
 95 | index = np.random.randint(test_data.shape[0])
 96 | data = test_data.iloc[index]
 97 | test_file = data[0]
 98 | test_transcript = data[1]
 99 | # Test Audio file
100 | print("Audio File:",test_file)
101 | # Test Transcript
102 | print("Audio Transcript:", test_transcript)
103 | print("Transcript length:",len(test_transcript))
104 | 
105 | pipeline = asr.pipeline.load('./checkpoint')
106 | print("Prediction", pipeline.predict(test_file))
107 | ```
108 | 
109 | #### References
110 | 
111 | The fundamental repositories:
112 | - Baidu - [DeepSpeech2 - A PaddlePaddle implementation of DeepSpeech2 architecture for ASR](https://github.com/PaddlePaddle/DeepSpeech)
113 | - NVIDIA - [Toolkit for efficient experimentation with Speech Recognition, Text2Speech and NLP](https://nvidia.github.io/OpenSeq2Seq)
114 | - TensorFlow - [The implementation of DeepSpeech2 model](https://github.com/tensorflow/models/tree/master/research/deep_speech)
115 | - Mozilla - [DeepSpeech - A TensorFlow implementation of Baidu's DeepSpeech architecture](https://github.com/mozilla/DeepSpeech) 
116 | - Espnet - [End-to-End Speech Processing Toolkit](https://github.com/espnet/espnet)
117 | - Automatic Speech Recognition - [Distill the Automatic Speech Recognition research](https://github.com/rolczynski/Automatic-Speech-Recognition)
118 | - Python Speech Features - [Speech features for ASR including MFCCs and filterbank energies](https://github.com/jameslyons/python_speech_features)


--------------------------------------------------------------------------------
/deepasr/features/sigproc.py:
--------------------------------------------------------------------------------
  1 | # This file includes routines for basic signal processing including framing and computing power spectra.
  2 | # Author: James Lyons 2012
  3 | import decimal
  4 | 
  5 | import numpy
  6 | import math
  7 | import logging
  8 | 
  9 | 
 10 | def round_half_up(number):
 11 |     return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
 12 | 
 13 | 
 14 | def rolling_window(a, window, step=1):
 15 |     # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
 16 |     shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
 17 |     strides = a.strides + (a.strides[-1],)
 18 |     return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
 19 | 
 20 | 
 21 | def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
 22 |     """Frame a signal into overlapping frames.
 23 | 
 24 |     :param sig: the audio signal to frame.
 25 |     :param frame_len: length of each frame measured in samples.
 26 |     :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
 27 |     :param winfunc: the analysis window to apply to each frame. By default no window is applied.
 28 |     :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
 29 |     :returns: an array of frames. Size is NUMFRAMES by frame_len.
 30 |     """
 31 |     slen = len(sig)
 32 |     frame_len = int(round_half_up(frame_len))
 33 |     frame_step = int(round_half_up(frame_step))
 34 |     if slen <= frame_len:
 35 |         numframes = 1
 36 |     else:
 37 |         numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
 38 | 
 39 |     padlen = int((numframes - 1) * frame_step + frame_len)
 40 | 
 41 |     zeros = numpy.zeros((padlen - slen,))
 42 |     padsignal = numpy.concatenate((sig, zeros))
 43 |     if stride_trick:
 44 |         win = winfunc(frame_len)
 45 |         frames = rolling_window(padsignal, window=frame_len, step=frame_step)
 46 |     else:
 47 |         indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
 48 |             numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
 49 |         indices = numpy.array(indices, dtype=numpy.int32)
 50 |         frames = padsignal[indices]
 51 |         win = numpy.tile(winfunc(frame_len), (numframes, 1))
 52 | 
 53 |     return frames * win
 54 | 
 55 | 
 56 | def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
 57 |     """Does overlap-add procedure to undo the action of framesig.
 58 | 
 59 |     :param frames: the array of frames.
 60 |     :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
 61 |     :param frame_len: length of each frame measured in samples.
 62 |     :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
 63 |     :param winfunc: the analysis window to apply to each frame. By default no window is applied.
 64 |     :returns: a 1-D signal.
 65 |     """
 66 |     frame_len = round_half_up(frame_len)
 67 |     frame_step = round_half_up(frame_step)
 68 |     numframes = numpy.shape(frames)[0]
 69 |     assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
 70 | 
 71 |     indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
 72 |         numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
 73 |     indices = numpy.array(indices, dtype=numpy.int32)
 74 |     padlen = (numframes - 1) * frame_step + frame_len
 75 | 
 76 |     if siglen <= 0: siglen = padlen
 77 | 
 78 |     rec_signal = numpy.zeros((padlen,))
 79 |     window_correction = numpy.zeros((padlen,))
 80 |     win = winfunc(frame_len)
 81 | 
 82 |     for i in range(0, numframes):
 83 |         window_correction[indices[i, :]] = window_correction[
 84 |                                                indices[i, :]] + win + 1e-15  # add a little bit so it is never zero
 85 |         rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
 86 | 
 87 |     rec_signal = rec_signal / window_correction
 88 |     return rec_signal[0:siglen]
 89 | 
 90 | 
 91 | def magspec(frames, NFFT):
 92 |     """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
 93 | 
 94 |     :param frames: the array of frames. Each row is a frame.
 95 |     :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
 96 |     :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
 97 |     """
 98 |     if numpy.shape(frames)[1] > NFFT:
 99 |         logging.warn(
100 |             'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
101 |             numpy.shape(frames)[1], NFFT)
102 |     complex_spec = numpy.fft.rfft(frames, NFFT)
103 |     return numpy.absolute(complex_spec)
104 | 
105 | 
def powspec(frames, NFFT):
    """Compute the power spectrum of each frame: the squared magnitude spectrum divided by NFFT.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row is the power
        spectrum of the corresponding frame.
    """
    scale = 1.0 / NFFT
    magnitude = magspec(frames, NFFT)
    return scale * numpy.square(magnitude)
114 | 
115 | 
def logpowspec(frames, NFFT, norm=1):
    """Compute the log power spectrum (in dB, 10*log10) of each frame.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :param norm: If truthy, the result is shifted so that the max value (across all frames) is 0.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row is the log
        power spectrum of the corresponding frame.
    """
    floor = 1e-30
    power = powspec(frames, NFFT)
    # Clamp tiny values so log10 never sees zero.
    power[power <= floor] = floor
    log_power = 10 * numpy.log10(power)
    if not norm:
        return log_power
    return log_power - numpy.max(log_power)
131 | 
132 | 
def preemphasis(signal, coeff=0.95):
    """Apply a first-order preemphasis filter: y[0] = x[0], y[n] = x[n] - coeff * x[n-1].

    :param signal: The signal to filter.
    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
    :returns: the filtered signal.
    """
    first_sample = signal[0]
    differenced = signal[1:] - coeff * signal[:-1]
    return numpy.append(first_sample, differenced)
141 | 


--------------------------------------------------------------------------------
/deepasr/features/mfcc.py:
--------------------------------------------------------------------------------
  1 | # calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
  2 | # Author: James Lyons 2012
  3 | from __future__ import division
  4 | import numpy
  5 | from . import sigproc
  6 | from scipy.fftpack import dct
  7 | 
  8 | 
  9 | def calculate_nfft(samplerate, winlen):
 10 |     """Calculates the FFT size as a power of two greater than or equal to
 11 |     the number of samples in a single window length.
 12 | 
 13 |     Having an FFT less than the window length loses precision by dropping
 14 |     many of the samples; a longer FFT than the window allows zero-padding
 15 |     of the FFT buffer which is neutral in terms of frequency domain conversion.
 16 | 
 17 |     :param samplerate: The sample rate of the signal we are working with, in Hz.
 18 |     :param winlen: The length of the analysis window in seconds.
 19 |     """
 20 |     window_length_samples = winlen * samplerate
 21 |     nfft = 1
 22 |     while nfft < window_length_samples:
 23 |         nfft *= 2
 24 |     return nfft
 25 | 
 26 | 
 27 | def mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13,
 28 |          nfilt=26, nfft=None, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True,
 29 |          winfunc=lambda x: numpy.ones((x,))):
 30 |     """Compute MFCC features from an audio signal.
 31 | 
 32 |     :param signal: the audio signal from which to compute features. Should be an N*1 array
 33 |     :param samplerate: the sample rate of the signal we are working with, in Hz.
 34 |     :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
 35 |     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
 36 |     :param numcep: the number of cepstrum to return, default 13
 37 |     :param nfilt: the number of filters in the filterbank, default 26.
 38 |     :param nfft: the FFT size. Default is None, which uses the calculate_nfft function to choose the smallest size that does not drop sample data.
 39 |     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
 40 |     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
 41 |     :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
 42 |     :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
 43 |     :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
 44 |     :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
 45 |     :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
 46 |     """
 47 |     nfft = nfft or calculate_nfft(samplerate, winlen)
 48 |     feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq, preemph, winfunc)
 49 |     feat = numpy.log(feat)
 50 |     feat = dct(feat, type=2, axis=1, norm='ortho')[:, :numcep]
 51 |     feat = lifter(feat, ceplifter)
 52 |     if appendEnergy: feat[:, 0] = numpy.log(energy)  # replace first cepstral coefficient with log of frame energy
 53 |     return feat
 54 | 
 55 | 
 56 | def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
 57 |           nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
 58 |           winfunc=lambda x: numpy.ones((x,))):
 59 |     """Compute Mel-filterbank energy features from an audio signal.
 60 | 
 61 |     :param signal: the audio signal from which to compute features. Should be an N*1 array
 62 |     :param samplerate: the sample rate of the signal we are working with, in Hz.
 63 |     :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
 64 |     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
 65 |     :param nfilt: the number of filters in the filterbank, default 26.
 66 |     :param nfft: the FFT size. Default is 512.
 67 |     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
 68 |     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
 69 |     :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
 70 |     :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
 71 |     :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
 72 |         second return value is the energy in each frame (total energy, unwindowed)
 73 |     """
 74 |     highfreq = highfreq or samplerate / 2
 75 |     signal = sigproc.preemphasis(signal, preemph)
 76 |     frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate, winfunc)
 77 |     pspec = sigproc.powspec(frames, nfft)
 78 |     energy = numpy.sum(pspec, 1)  # this stores the total energy in each frame
 79 |     energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)  # if energy is zero, we get problems with log
 80 | 
 81 |     fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
 82 |     feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
 83 |     feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)  # if feat is zero, we get problems with log
 84 | 
 85 |     return feat, energy
 86 | 
 87 | 
 88 | def logfbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
 89 |              nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
 90 |              winfunc=lambda x: numpy.ones((x,))):
 91 |     """Compute log Mel-filterbank energy features from an audio signal.
 92 | 
 93 |     :param signal: the audio signal from which to compute features. Should be an N*1 array
 94 |     :param samplerate: the sample rate of the signal we are working with, in Hz.
 95 |     :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
 96 |     :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
 97 |     :param nfilt: the number of filters in the filterbank, default 26.
 98 |     :param nfft: the FFT size. Default is 512.
 99 |     :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
100 |     :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
101 |     :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
102 |     :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
103 |     :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
104 |     """
105 |     feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq, preemph, winfunc)
106 |     return numpy.log(feat)
107 | 
108 | 
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    For each filter, the result is the frequency-weighted filterbank energy divided by
    the plain filterbank energy.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, a falsy value means samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    if not highfreq:
        highfreq = samplerate / 2

    emphasized = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasized, winlen * samplerate, winstep * samplerate, winfunc)
    power = sigproc.powspec(frames, nfft)
    # Exact zeros would make the final division 0/0.
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    fb_energies = numpy.dot(power, filters.T)

    # One frequency value per FFT bin, from 1 Hz to samplerate/2, repeated for every frame.
    n_frames = numpy.size(power, 0)
    n_bins = numpy.size(power, 1)
    bin_freqs = numpy.tile(numpy.linspace(1, samplerate / 2, n_bins), (n_frames, 1))

    return numpy.dot(power * bin_freqs, filters.T) / fb_energies
137 | 
138 | 
def hz2mel(hz):
    """Convert Hertz to Mels using mel = 2595 * log10(1 + hz / 700).

    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
    """
    ratio = hz / 700.
    return 2595 * numpy.log10(1 + ratio)
146 | 
147 | 
def mel2hz(mel):
    """Convert Mels to Hertz using hz = 700 * (10 ** (mel / 2595) - 1).

    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
    """
    exponent = mel / 2595.0
    return 700 * (10 ** exponent - 1)
155 | 
156 | 
def get_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0, highfreq=None):
    """Compute a Mel-filterbank of triangular filters.

    The filters are stored in the rows, the columns correspond to fft bins.

    :param nfilt: the number of filters in the filterbank, default 20.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the sample rate of the signal we are working with, in Hz. Affects mel spacing.
    :param lowfreq: lowest band edge of mel filters, default 0 Hz
    :param highfreq: highest band edge of mel filters, a falsy value means samplerate/2
    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
    """
    if not highfreq:
        highfreq = samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    # nfilt + 2 band edges, evenly spaced on the mel scale.
    mel_edges = numpy.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt + 2)
    # Convert the edges from Hz to (floored) FFT bin numbers.
    bin_edges = numpy.floor((nfft + 1) * mel2hz(mel_edges) / samplerate)

    filters = numpy.zeros([nfilt, nfft // 2 + 1])
    for j in range(nfilt):
        left, centre, right = bin_edges[j], bin_edges[j + 1], bin_edges[j + 2]
        # Rising slope from the left edge up to the centre bin.
        for i in range(int(left), int(centre)):
            filters[j, i] = (i - left) / (centre - left)
        # Falling slope from the centre bin down to the right edge.
        for i in range(int(centre), int(right)):
            filters[j, i] = (right - i) / (right - centre)
    return filters
186 | 
187 | 
def lifter(cepstra, L=22):
    """Apply a sinusoidal cepstral lifter to the matrix of cepstra.

    Each coefficient n is scaled by 1 + (L / 2) * sin(pi * n / L), which increases the
    magnitude of the high frequency DCT coeffs.

    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
    :returns: the liftered cepstra, or ``cepstra`` itself when the lifter is disabled.
    """
    if not L > 0:
        # values of L <= 0, do nothing
        return cepstra

    _, ncoeff = numpy.shape(cepstra)
    coeff_index = numpy.arange(ncoeff)
    weights = 1 + (L / 2.) * numpy.sin(numpy.pi * coeff_index / L)
    return weights * cepstra
203 | 
204 | 
205 | def delta(feat, N):
206 |     """Compute delta features from a feature vector sequence.
207 | 
208 |     :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
209 |     :param N: For each frame, calculate delta features based on preceding and following N frames
210 |     :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
211 |     """
212 |     if N < 1:
213 |         raise ValueError('N must be an integer >= 1')
214 |     NUMFRAMES = len(feat)
215 |     denominator = 2 * sum([i ** 2 for i in range(1, N + 1)])
216 |     delta_feat = numpy.empty_like(feat)
217 |     padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')  # padded version of feat
218 |     for t in range(NUMFRAMES):
219 |         delta_feat[t] = numpy.dot(numpy.arange(-N, N + 1),
220 |                                   padded[t: t + 2 * N + 1]) / denominator  # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
221 |     return delta_feat
222 | 


--------------------------------------------------------------------------------
/deepasr/pipeline/ctc_pipeline.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import logging
  3 | from typing import List
  4 | import numpy as np
  5 | import random
  6 | import tensorflow as tf
  7 | from tensorflow import keras
  8 | import pandas as pd
  9 | from concurrent.futures import ThreadPoolExecutor, wait
 10 | # from tensorflow.keras.layers import *
 11 | from tensorflow.keras.models import Model
 12 | import sys
 13 | 
 14 | sys.path.append("..")
 15 | from deepasr.pipeline import Pipeline
 16 | from deepasr.augmentation import Augmentation
 17 | from deepasr.decoder import Decoder
 18 | from deepasr.features import FeaturesExtractor
 19 | from deepasr.vocab import Alphabet
 20 | from deepasr.utils import read_audio, save_data
 21 | from deepasr.model import compile_model
 22 | 
 23 | logger = logging.getLogger('asr.pipeline')
 24 | 
 25 | 
 26 | class CTCPipeline(Pipeline):
 27 |     """
 28 |     The pipeline is responsible for connecting a neural network model with
 29 |     all non-differential transformations (features extraction or decoding),
 30 |     and dependencies. Components are independent.
 31 |     """
 32 | 
 33 |     def __init__(self,
 34 |                  alphabet: Alphabet,
 35 |                  features_extractor: FeaturesExtractor,
 36 |                  model: keras.Model,
 37 |                  optimizer: keras.optimizers.Optimizer,
 38 |                  decoder: Decoder,
 39 |                  sample_rate: int,
 40 |                  mono: True,
 41 |                  label_len: int = 0,
 42 |                  multi_gpu: bool = True,
 43 |                  temp_model: keras.Model = None):
 44 |         self._alphabet = alphabet
 45 |         self._optimizer = optimizer
 46 |         self._decoder = decoder
 47 |         self._features_extractor = features_extractor
 48 |         self.sample_rate = sample_rate
 49 |         self.mono = mono
 50 |         self.label_len = label_len
 51 |         self.multi_gpu = multi_gpu
 52 |         self._model = self._compile_model(model, optimizer, multi_gpu)
 53 |         self.temp_model = temp_model if temp_model else self._model
 54 | 
 55 |     @property
 56 |     def alphabet(self) -> Alphabet:
 57 |         return self._alphabet
 58 | 
 59 |     @property
 60 |     def features_extractor(self) -> FeaturesExtractor:
 61 |         return self._features_extractor
 62 | 
 63 |     @property
 64 |     def model(self) -> keras.Model:
 65 |         return self.temp_model
 66 | 
 67 |     @property
 68 |     def decoder(self) -> Decoder:
 69 |         return self._decoder
 70 | 
 71 |     def preprocess(self,
 72 |                    data: List[np.ndarray],
 73 |                    is_extracted: bool,
 74 |                    augmentation: Augmentation) -> np.ndarray:
 75 |         """ Preprocess batch data to format understandable to a model. """
 76 | 
 77 |         if is_extracted:  # then just align features
 78 |             features = FeaturesExtractor.align(data)
 79 |         else:
 80 |             features = self._features_extractor(data)
 81 |         features = augmentation(features) if augmentation else features
 82 |         # labels = self._alphabet.get_batch_labels(transcripts)
 83 |         return features
 84 | 
 85 |     def fit_iter(self,
 86 |                  train_dataset: pd.DataFrame,
 87 |                  augmentation: Augmentation = None,
 88 |                  prepared_features: bool = False,
 89 |                  iter_num: int = 1000,
 90 |                  batch_size: int = 32,
 91 |                  epochs: int = 3,
 92 |                  checkpoint: str = None,
 93 |                  **kwargs) -> keras.callbacks.History:
 94 |         """ Get ready data and train a model. """
 95 | 
 96 |         history = keras.callbacks.History()
 97 | 
 98 |         audios = train_dataset['path'].to_list()
 99 | 
100 |         labels = self._alphabet.get_batch_labels(train_dataset['transcripts'].to_list())
101 | 
102 |         transcripts = train_dataset['transcripts'].to_list()
103 | 
104 |         train_len_ = len(transcripts)
105 | 
106 |         self.label_len = labels.shape[1]
107 | 
108 |         self._model.summary()
109 | 
110 |         for i in range(iter_num):
111 |             train_index = random.sample(range(train_len_ - 25), batch_size)
112 | 
113 |             x_train = [audios[i] for i in train_index]
114 | 
115 |             y_train = [labels[i] for i in train_index]
116 | 
117 |             y_trans = [transcripts[i] for i in train_index]
118 | 
119 |             train_inputs = self.wrap_preprocess(x_train,
120 |                                                 y_train,
121 |                                                 y_trans, augmentation, prepared_features)
122 | 
123 |             outputs = {'ctc': np.zeros([batch_size])}
124 | 
125 |             # print(train_inputs['the_input'].shape)
126 |             # print(train_inputs['the_labels'].shape)
127 |             # print(train_inputs['input_length'].shape)
128 |             # print(train_inputs['label_length'].shape)
129 |             # print(train_inputs['input_length'])
130 |             # print(train_inputs['label_length'])
131 | 
132 |             if i % 100 == 0:
133 |                 print("iter:", i)
134 |                 print("input features: ", train_inputs['the_input'].shape)
135 |                 print("input labels: ", train_inputs['the_labels'].shape)
136 |                 history = self._model.fit(train_inputs, outputs,
137 |                                           batch_size=batch_size,
138 |                                           epochs=epochs,
139 |                                           verbose=1, **kwargs)
140 |                 if checkpoint:
141 |                     self.save(checkpoint)
142 |                     print("Pipeline Saved at", checkpoint)
143 |             else:
144 |                 history = self._model.fit(train_inputs, outputs,
145 |                                           batch_size=batch_size,
146 |                                           epochs=epochs,
147 |                                           verbose=0, **kwargs)
148 | 
149 |         return history
150 | 
151 |     def fit(self,
152 |             train_dataset: pd.DataFrame,
153 |             augmentation: Augmentation = None,
154 |             prepared_features: bool = False,
155 |             batch_size: int = 32,
156 |             epochs: int = 3,
157 |             checkpoint: str = None,
158 |             **kwargs) -> keras.callbacks.History:
159 |         """ Get ready data and train a model. """
160 | 
161 |         audios = train_dataset['path'].to_list()
162 | 
163 |         labels = self._alphabet.get_batch_labels(train_dataset['transcripts'].to_list())
164 | 
165 |         transcripts = train_dataset['transcripts'].to_list()
166 | 
167 |         self.label_len = labels.shape[1]
168 | 
169 |         self._model.summary()
170 | 
171 |         print("Feature Extraction in progress...")
172 |         train_inputs = self.wrap_preprocess(audios,
173 |                                             list(labels),
174 |                                             transcripts, augmentation, prepared_features)
175 | 
176 |         outputs = {'ctc': np.zeros([len(audios)])}
177 | 
178 |         print("Feature Extraction completed.")
179 | 
180 |         print("input features: ", train_inputs['the_input'].shape)
181 |         print("input labels: ", train_inputs['the_labels'].shape)
182 | 
183 |         print("Model training initiated...")
184 | 
185 |         history = self._model.fit(train_inputs, outputs,
186 |                                   batch_size=batch_size,
187 |                                   epochs=epochs,
188 |                                   verbose=1, **kwargs)
189 | 
190 |         return history
191 | 
192 |     def fit_generator(self, train_dataset: pd.DataFrame,
193 |                       shuffle: bool = True,
194 |                       augmentation: Augmentation = None,
195 |                       prepared_features: bool = False,
196 |                       batch_size: int = 32,
197 |                       epochs: int = 3,
198 |                       verbose: int = 1,
199 |                       **kwargs) -> keras.callbacks.History:
200 | 
201 |         """ Get ready data and train a model. """
202 | 
203 |         audios = train_dataset['path'].to_list()
204 | 
205 |         labels = self._alphabet.get_batch_labels(train_dataset['transcripts'].to_list())
206 | 
207 |         transcripts = train_dataset['transcripts'].to_list()
208 | 
209 |         train_len_ = len(transcripts)
210 | 
211 |         self.label_len = labels.shape[1]
212 | 
213 |         self._model.summary()
214 | 
215 |         train_gen = self.get_generator(audios, labels, transcripts,
216 |                                        batch_size, shuffle, augmentation, prepared_features)
217 | 
218 |         return self._model.fit(train_gen, epochs=epochs,
219 |                                steps_per_epoch=train_len_ // batch_size, verbose=verbose, **kwargs)
220 | 
221 |     def get_generator(self, audio_paths: List[str], texts: np.array, transcripts: List[str], batch_size: int = 32,
222 |                       shuffle: bool = True, augmentation: Augmentation = None,
223 |                       prepared_features: bool = False):
224 |         """ Data Generator """
225 | 
226 |         def generator():
227 |             num_samples = len(audio_paths)
228 |             while True:
229 |                 x = list()
230 |                 y = list()
231 |                 if shuffle:
232 |                     temp = list(zip(audio_paths, texts))
233 |                     random.Random(123).shuffle(temp)
234 |                     x, y = list(zip(*temp))
235 | 
236 |                 pool = ThreadPoolExecutor(1)  # Run a single I/O thread in parallel
237 |                 future = pool.submit(self.wrap_preprocess,
238 |                                      x[:batch_size],
239 |                                      y[:batch_size], transcripts[:batch_size], augmentation, prepared_features)
240 |                 for offset in range(batch_size, num_samples, batch_size):
241 |                     wait([future])
242 |                     batch = future.result()
243 |                     future = pool.submit(self.wrap_preprocess,
244 |                                          x[offset: offset + batch_size],
245 |                                          y[offset: offset + batch_size], transcripts[offset:offset + batch_size],
246 |                                          augmentation, prepared_features)
247 |                     yield batch, {'ctc': np.zeros([batch_size])}
248 | 
249 |         return generator()
250 | 
251 |     def wrap_preprocess(self, audios: List[str], the_labels: List[np.array], transcripts: List[str],
252 |                         augmentation: Augmentation = None,
253 |                         prepared_features: bool = False):
254 |         """ Build training data """
255 |         # the_input = np.array(the_input) / 100
256 |         # the_input = x3/np.max(the_input)
257 | 
258 |         mid_features = [read_audio(audio, sample_rate=self.sample_rate, mono=self.mono) for audio in audios]
259 | 
260 |         the_input = self.preprocess(mid_features, prepared_features, augmentation)
261 | 
262 |         the_labels = np.array(the_labels)
263 | 
264 |         label_len = [len(trans) for trans in transcripts]  # length of each transcription
265 |         label_lengths = np.array(label_len).reshape(-1, 1)  # reshape to 1d
266 | 
267 |         input_lengths = np.ones((the_labels.shape[0], 1)) * the_labels.shape[1]
268 |         for i in range(the_input.shape[0]):
269 |             input_lengths[i] = the_labels.shape[1]  # num of features from labels
270 | 
271 |         return {
272 |             'the_input': the_input,
273 |             'the_labels': the_labels,
274 |             'input_length': np.asarray(input_lengths),
275 |             'label_length': np.asarray(label_lengths)
276 |         }
277 | 
278 |     def predict(self, audio: str, **kwargs) -> List[str]:
279 |         """ Get ready features, and make a prediction. """
280 |         # get audio features
281 |         features = self.features_extractor.make_features(
282 |             read_audio(audio, sample_rate=self.sample_rate, mono=self.mono))
283 |         in_features = self.features_extractor.align([features], self.features_extractor.features_shape)
284 | 
285 |         pred_model = Model(inputs=self._model.get_layer('the_input').output,
286 |                            outputs=self._model.get_layer('the_output').output)
287 |         batch_logits = pred_model.predict(in_features, **kwargs)
288 |         decoded_labels = self._decoder(batch_logits, self.label_len)
289 |         predictions = self._alphabet.get_batch_transcripts(decoded_labels)
290 |         return predictions
291 | 
292 |     def save(self, directory: str):
293 |         """ Save each component of the CTC pipeline. """
294 |         self.temp_model.save(os.path.join(directory, 'network.h5'))
295 |         self._model.save_weights(os.path.join(directory, 'model_weights.h5'))
296 |         save_data(self._optimizer, os.path.join(directory, 'optimizer.bin'))
297 |         save_data(self._alphabet, os.path.join(directory, 'alphabet.bin'))
298 |         save_data(self._decoder, os.path.join(directory, 'decoder.bin'))
299 |         save_data(self.multi_gpu, os.path.join(directory, 'multi_gpu_flag.bin'))
300 |         save_data(self.sample_rate, os.path.join(directory, 'sample_rate.bin'))
301 |         save_data(self.mono, os.path.join(directory, 'mono.bin'))
302 |         save_data(self.label_len, os.path.join(directory, 'label_len.bin'))
303 |         save_data(self._features_extractor,
304 |                   os.path.join(directory, 'feature_extractor.bin'))
305 | 
306 |     # def load(self, directory: str):
307 |     #     """ Load each component of the CTC pipeline. """
308 |     #     # model = keras.models.load_model(os.path.join(directory, 'model.h5'),
309 |     #     #                                 custom_objects={'clipped_relu': cls.clipped_relu})
310 |     #     self._model.load_weights(os.path.join(directory, 'model_weights.h5'))
311 |     #     self._alphabet = load_data(os.path.join(directory, 'alphabet.bin'))
312 |     #     self._decoder = load_data(os.path.join(directory, 'decoder.bin'))
313 |     #     self._features_extractor = load_data(
314 |     #         os.path.join(directory, 'feature_extractor.bin'))
315 | 
316 |     @staticmethod
317 |     def _compile_model(model: keras.Model, 
318 |                        optimizer: keras.optimizers.Optimizer, 
319 |                        multi_gpu: bool) -> keras.Model:
320 |         """ Replicates a model on different GPUs. """
321 |         if not multi_gpu:
322 |             dist_model = compile_model(model, optimizer)
323 |             logger.info("Training using single GPU or CPU")
324 |         else: 
325 |             try:
326 |                 strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
327 |                 with strategy.scope():
328 |                     dist_model = compile_model(model, optimizer)
329 |                 logger.info("Training using multiple GPUs")
330 |             except ValueError:
331 |                 dist_model = compile_model(model, optimizer)
332 |                 logger.info("Training using single GPU or CPU")
333 |         return dist_model
334 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  2 |                        Version 3, 19 November 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU Affero General Public License is a free, copyleft license for
 11 | software and other kinds of works, specifically designed to ensure
 12 | cooperation with the community in the case of network server software.
 13 | 
 14 |   The licenses for most software and other practical works are designed
 15 | to take away your freedom to share and change the works.  By contrast,
 16 | our General Public Licenses are intended to guarantee your freedom to
 17 | share and change all versions of a program--to make sure it remains free
 18 | software for all its users.
 19 | 
 20 |   When we speak of free software, we are referring to freedom, not
 21 | price.  Our General Public Licenses are designed to make sure that you
 22 | have the freedom to distribute copies of free software (and charge for
 23 | them if you wish), that you receive source code or can get it if you
 24 | want it, that you can change the software or use pieces of it in new
 25 | free programs, and that you know you can do these things.
 26 | 
 27 |   Developers that use our General Public Licenses protect your rights
 28 | with two steps: (1) assert copyright on the software, and (2) offer
 29 | you this License which gives you legal permission to copy, distribute
 30 | and/or modify the software.
 31 | 
 32 |   A secondary benefit of defending all users' freedom is that
 33 | improvements made in alternate versions of the program, if they
 34 | receive widespread use, become available for other developers to
 35 | incorporate.  Many developers of free software are heartened and
 36 | encouraged by the resulting cooperation.  However, in the case of
 37 | software used on network servers, this result may fail to come about.
 38 | The GNU General Public License permits making a modified version and
 39 | letting the public access it on a server without ever releasing its
 40 | source code to the public.
 41 | 
 42 |   The GNU Affero General Public License is designed specifically to
 43 | ensure that, in such cases, the modified source code becomes available
 44 | to the community.  It requires the operator of a network server to
 45 | provide the source code of the modified version running there to the
 46 | users of that server.  Therefore, public use of a modified version, on
 47 | a publicly accessible server, gives the public access to the source
 48 | code of the modified version.
 49 | 
 50 |   An older license, called the Affero General Public License and
 51 | published by Affero, was designed to accomplish similar goals.  This is
 52 | a different license, not a version of the Affero GPL, but Affero has
 53 | released a new version of the Affero GPL which permits relicensing under
 54 | this license.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                        TERMS AND CONDITIONS
 60 | 
 61 |   0. Definitions.
 62 | 
 63 |   "This License" refers to version 3 of the GNU Affero General Public License.
 64 | 
 65 |   "Copyright" also means copyright-like laws that apply to other kinds of
 66 | works, such as semiconductor masks.
 67 | 
 68 |   "The Program" refers to any copyrightable work licensed under this
 69 | License.  Each licensee is addressed as "you".  "Licensees" and
 70 | "recipients" may be individuals or organizations.
 71 | 
 72 |   To "modify" a work means to copy from or adapt all or part of the work
 73 | in a fashion requiring copyright permission, other than the making of an
 74 | exact copy.  The resulting work is called a "modified version" of the
 75 | earlier work or a work "based on" the earlier work.
 76 | 
 77 |   A "covered work" means either the unmodified Program or a work based
 78 | on the Program.
 79 | 
 80 |   To "propagate" a work means to do anything with it that, without
 81 | permission, would make you directly or secondarily liable for
 82 | infringement under applicable copyright law, except executing it on a
 83 | computer or modifying a private copy.  Propagation includes copying,
 84 | distribution (with or without modification), making available to the
 85 | public, and in some countries other activities as well.
 86 | 
 87 |   To "convey" a work means any kind of propagation that enables other
 88 | parties to make or receive copies.  Mere interaction with a user through
 89 | a computer network, with no transfer of a copy, is not conveying.
 90 | 
 91 |   An interactive user interface displays "Appropriate Legal Notices"
 92 | to the extent that it includes a convenient and prominently visible
 93 | feature that (1) displays an appropriate copyright notice, and (2)
 94 | tells the user that there is no warranty for the work (except to the
 95 | extent that warranties are provided), that licensees may convey the
 96 | work under this License, and how to view a copy of this License.  If
 97 | the interface presents a list of user commands or options, such as a
 98 | menu, a prominent item in the list meets this criterion.
 99 | 
100 |   1. Source Code.
101 | 
102 |   The "source code" for a work means the preferred form of the work
103 | for making modifications to it.  "Object code" means any non-source
104 | form of a work.
105 | 
106 |   A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 | 
111 |   The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form.  A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 | 
122 |   The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities.  However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work.  For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 | 
135 |   The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 | 
139 |   The Corresponding Source for a work in source code form is that
140 | same work.
141 | 
142 |   2. Basic Permissions.
143 | 
144 |   All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met.  This License explicitly affirms your unlimited
147 | permission to run the unmodified Program.  The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work.  This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 | 
152 |   You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force.  You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright.  Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 | 
163 |   Conveying under any other circumstances is permitted solely under
164 | the conditions stated below.  Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 | 
167 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 | 
169 |   No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 | 
175 |   When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 | 
183 |   4. Conveying Verbatim Copies.
184 | 
185 |   You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 | 
193 |   You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 | 
196 |   5. Conveying Modified Source Versions.
197 | 
198 |   You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 | 
202 |     a) The work must carry prominent notices stating that you modified
203 |     it, and giving a relevant date.
204 | 
205 |     b) The work must carry prominent notices stating that it is
206 |     released under this License and any conditions added under section
207 |     7.  This requirement modifies the requirement in section 4 to
208 |     "keep intact all notices".
209 | 
210 |     c) You must license the entire work, as a whole, under this
211 |     License to anyone who comes into possession of a copy.  This
212 |     License will therefore apply, along with any applicable section 7
213 |     additional terms, to the whole of the work, and all its parts,
214 |     regardless of how they are packaged.  This License gives no
215 |     permission to license the work in any other way, but it does not
216 |     invalidate such permission if you have separately received it.
217 | 
218 |     d) If the work has interactive user interfaces, each must display
219 |     Appropriate Legal Notices; however, if the Program has interactive
220 |     interfaces that do not display Appropriate Legal Notices, your
221 |     work need not make them do so.
222 | 
223 |   A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit.  Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 | 
233 |   6. Conveying Non-Source Forms.
234 | 
235 |   You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 | 
240 |     a) Convey the object code in, or embodied in, a physical product
241 |     (including a physical distribution medium), accompanied by the
242 |     Corresponding Source fixed on a durable physical medium
243 |     customarily used for software interchange.
244 | 
245 |     b) Convey the object code in, or embodied in, a physical product
246 |     (including a physical distribution medium), accompanied by a
247 |     written offer, valid for at least three years and valid for as
248 |     long as you offer spare parts or customer support for that product
249 |     model, to give anyone who possesses the object code either (1) a
250 |     copy of the Corresponding Source for all the software in the
251 |     product that is covered by this License, on a durable physical
252 |     medium customarily used for software interchange, for a price no
253 |     more than your reasonable cost of physically performing this
254 |     conveying of source, or (2) access to copy the
255 |     Corresponding Source from a network server at no charge.
256 | 
257 |     c) Convey individual copies of the object code with a copy of the
258 |     written offer to provide the Corresponding Source.  This
259 |     alternative is allowed only occasionally and noncommercially, and
260 |     only if you received the object code with such an offer, in accord
261 |     with subsection 6b.
262 | 
263 |     d) Convey the object code by offering access from a designated
264 |     place (gratis or for a charge), and offer equivalent access to the
265 |     Corresponding Source in the same way through the same place at no
266 |     further charge.  You need not require recipients to copy the
267 |     Corresponding Source along with the object code.  If the place to
268 |     copy the object code is a network server, the Corresponding Source
269 |     may be on a different server (operated by you or a third party)
270 |     that supports equivalent copying facilities, provided you maintain
271 |     clear directions next to the object code saying where to find the
272 |     Corresponding Source.  Regardless of what server hosts the
273 |     Corresponding Source, you remain obligated to ensure that it is
274 |     available for as long as needed to satisfy these requirements.
275 | 
276 |     e) Convey the object code using peer-to-peer transmission, provided
277 |     you inform other peers where the object code and Corresponding
278 |     Source of the work are being offered to the general public at no
279 |     charge under subsection 6d.
280 | 
281 |   A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 | 
285 |   A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling.  In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage.  For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product.  A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 | 
298 |   "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source.  The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 | 
306 |   If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information.  But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 | 
317 |   The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed.  Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 | 
325 |   Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 | 
331 |   7. Additional Terms.
332 | 
333 |   "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law.  If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 | 
342 |   When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it.  (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.)  You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 | 
349 |   Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 | 
353 |     a) Disclaiming warranty or limiting liability differently from the
354 |     terms of sections 15 and 16 of this License; or
355 | 
356 |     b) Requiring preservation of specified reasonable legal notices or
357 |     author attributions in that material or in the Appropriate Legal
358 |     Notices displayed by works containing it; or
359 | 
360 |     c) Prohibiting misrepresentation of the origin of that material, or
361 |     requiring that modified versions of such material be marked in
362 |     reasonable ways as different from the original version; or
363 | 
364 |     d) Limiting the use for publicity purposes of names of licensors or
365 |     authors of the material; or
366 | 
367 |     e) Declining to grant rights under trademark law for use of some
368 |     trade names, trademarks, or service marks; or
369 | 
370 |     f) Requiring indemnification of licensors and authors of that
371 |     material by anyone who conveys the material (or modified versions of
372 |     it) with contractual assumptions of liability to the recipient, for
373 |     any liability that these contractual assumptions directly impose on
374 |     those licensors and authors.
375 | 
376 |   All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10.  If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term.  If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 | 
386 |   If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 | 
391 |   Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 | 
395 |   8. Termination.
396 | 
397 |   You may not propagate or modify a covered work except as expressly
398 | provided under this License.  Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 | 
403 |   However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 | 
410 |   Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 | 
417 |   Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License.  If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 | 
423 |   9. Acceptance Not Required for Having Copies.
424 | 
425 |   You are not required to accept this License in order to receive or
426 | run a copy of the Program.  Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance.  However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work.  These actions infringe copyright if you do
431 | not accept this License.  Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 | 
434 |   10. Automatic Licensing of Downstream Recipients.
435 | 
436 |   Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License.  You are not responsible
439 | for enforcing compliance by third parties with this License.
440 | 
441 |   An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations.  If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 | 
451 |   You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License.  For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 | 
459 |   11. Patents.
460 | 
461 |   A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based.  The
463 | work thus licensed is called the contributor's "contributor version".
464 | 
465 |   A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version.  For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 | 
475 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 | 
480 |   In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement).  To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 | 
487 |   If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients.  "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 | 
501 |   If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 | 
509 |   A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License.  You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 | 
524 |   Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 | 
528 |   12. No Surrender of Others' Freedom.
529 | 
530 |   If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License.  If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all.  For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 | 
540 |   13. Remote Network Interaction; Use with the GNU General Public License.
541 | 
542 |   Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software.  This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 | 
553 |   Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work.  The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 | 
561 |   14. Revised Versions of this License.
562 | 
563 |   The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time.  Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 | 
568 |   Each version is given a distinguishing version number.  If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation.  If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 | 
577 |   If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 | 
582 |   Later license versions may give you additional or different
583 | permissions.  However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 | 
587 |   15. Disclaimer of Warranty.
588 | 
589 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 | 
598 |   16. Limitation of Liability.
599 | 
600 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 | 
610 |   17. Interpretation of Sections 15 and 16.
611 | 
612 |   If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 | 
619 |                      END OF TERMS AND CONDITIONS
620 | 
621 |             How to Apply These Terms to Your New Programs
622 | 
623 |   If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 | 
627 |   To do so, attach the following notices to the program.  It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 | 
632 |     DeepAsr is an open-source implementation of
633 |     end-to-end Automatic Speech Recognition (ASR) engine.
634 |     Copyright (C) 2020  Sai Kumar Yava
635 | 
636 |     This program is free software: you can redistribute it and/or modify
637 |     it under the terms of the GNU Affero General Public License as published
638 |     by the Free Software Foundation, either version 3 of the License, or
639 |     (at your option) any later version.
640 | 
641 |     This program is distributed in the hope that it will be useful,
642 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
643 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
644 |     GNU Affero General Public License for more details.
645 | 
646 |     You should have received a copy of the GNU Affero General Public License
647 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
648 | 
649 | Also add information on how to contact you by electronic and paper mail.
650 | 
651 |   If your software can interact with users remotely through a computer
652 | network, you should also make sure that it provides a way for users to
653 | get its source.  For example, if your program is a web application, its
654 | interface could display a "Source" link that leads users to an archive
655 | of the code.  There are many ways you could offer source, and different
656 | solutions will be better for different programs; see section 13 for the
657 | specific requirements.
658 | 
659 |   You should also get your employer (if you work as a programmer) or school,
660 | if any, to sign a "copyright disclaimer" for the program, if necessary.
661 | For more information on this, and how to apply and follow the GNU AGPL, see
662 | <https://www.gnu.org/licenses/>.
663 | 


--------------------------------------------------------------------------------
/DeepAsr_CTC_Pipeline.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# DeepAsr (DeepAsrNetwork1)"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "code",
  12 |    "execution_count": null,
  13 |    "metadata": {
  14 |     "colab": {},
  15 |     "colab_type": "code",
  16 |     "id": "S0FiiN9Y0FEs"
  17 |    },
  18 |    "outputs": [],
  19 |    "source": [
  20 |     "# !wget http://www.openslr.org/resources/12/train-clean-100.tar.gz"
  21 |    ]
  22 |   },
  23 |   {
  24 |    "cell_type": "code",
  25 |    "execution_count": null,
  26 |    "metadata": {
  27 |     "colab": {},
  28 |     "colab_type": "code",
  29 |     "id": "5JJMHx460FE4"
  30 |    },
  31 |    "outputs": [],
  32 |    "source": [
  33 |     "# !tar xzvf train-clean-100.tar.gz"
  34 |    ]
  35 |   },
  36 |   {
  37 |    "cell_type": "code",
  38 |    "execution_count": null,
  39 |    "metadata": {
  40 |     "colab": {},
  41 |     "colab_type": "code",
  42 |     "id": "SHLb6nDsUwkN"
  43 |    },
  44 |    "outputs": [],
  45 |    "source": [
  46 |     "# ! pip install tensorflow==2.1.0"
  47 |    ]
  48 |   },
  49 |   {
  50 |    "cell_type": "markdown",
  51 |    "metadata": {
  52 |     "colab_type": "text",
  53 |     "id": "Vx9UdVs5384B"
  54 |    },
  55 |    "source": [
  56 |     "# 1. Prepare DataSet"
  57 |    ]
  58 |   },
  59 |   {
  60 |    "cell_type": "code",
  61 |    "execution_count": null,
  62 |    "metadata": {
  63 |     "colab": {},
  64 |     "colab_type": "code",
  65 |     "id": "kw_18d180FFM"
  66 |    },
  67 |    "outputs": [],
  68 |    "source": [
  69 |     "import os\n",
  70 |     "import numpy as np\n",
  71 |     "import pandas as pd\n",
  72 |     "import tensorflow as tf\n",
  73 |     "import deepasr as asr\n",
  74 |     "import librosa"
  75 |    ]
  76 |   },
  77 |   {
  78 |    "cell_type": "code",
  79 |    "execution_count": 5,
  80 |    "metadata": {
  81 |     "colab": {
  82 |      "base_uri": "https://localhost:8080/",
  83 |      "height": 34
  84 |     },
  85 |     "colab_type": "code",
  86 |     "id": "d29QeHTJVNOF",
  87 |     "outputId": "b79a2fa5-e783-4543-d5b1-125fb3a1bd92"
  88 |    },
  89 |    "outputs": [
  90 |     {
  91 |      "data": {
  92 |       "text/plain": [
  93 |        "'2.1.0'"
  94 |       ]
  95 |      },
  96 |      "execution_count": 5,
  97 |      "metadata": {
  98 |       "tags": []
  99 |      },
 100 |      "output_type": "execute_result"
 101 |     }
 102 |    ],
 103 |    "source": [
 104 |     "tf.__version__"
 105 |    ]
 106 |   },
 107 |   {
 108 |    "cell_type": "code",
 109 |    "execution_count": 6,
 110 |    "metadata": {
 111 |     "colab": {
 112 |      "base_uri": "https://localhost:8080/",
 113 |      "height": 34
 114 |     },
 115 |     "colab_type": "code",
 116 |     "id": "zQEGr0HfC5OF",
 117 |     "outputId": "88d851cc-ec17-42d8-c8a2-26cfda9506cb"
 118 |    },
 119 |    "outputs": [
 120 |     {
 121 |      "data": {
 122 |       "text/plain": [
 123 |        "'0.0.9'"
 124 |       ]
 125 |      },
 126 |      "execution_count": 6,
 127 |      "metadata": {
 128 |       "tags": []
 129 |      },
 130 |      "output_type": "execute_result"
 131 |     }
 132 |    ],
 133 |    "source": [
 134 |     "asr.__version__"
 135 |    ]
 136 |   },
 137 |   {
 138 |    "cell_type": "code",
 139 |    "execution_count": null,
 140 |    "metadata": {
 141 |     "colab": {},
 142 |     "colab_type": "code",
 143 |     "id": "NmOf6DzG0FFS"
 144 |    },
 145 |    "outputs": [],
 146 |    "source": [
 147 |     "# get audios and transcripts\n",
 148 |     "org_path = './LibriSpeech/train-clean-100/'\n",
 149 |     "count = 0\n",
 150 |     "inp = []\n",
 151 |     "k=0\n",
 152 |     "audio_name = []\n",
 153 |     "audio_trans = []\n",
 154 |     "for dir1 in os.listdir(org_path):\n",
 155 |     "  dir2_path = org_path+dir1+'/'\n",
 156 |     "  #print(dir2_path)\n",
 157 |     "  for dir2 in os.listdir(dir2_path):\n",
 158 |     "    dir3_path = dir2_path+dir2+'/'\n",
 159 |     "    \n",
 160 |     "    for audio in os.listdir(dir3_path):\n",
 161 |     "      if audio.endswith('.txt'):\n",
 162 |     "        k+=1\n",
 163 |     "        file_path = dir3_path + audio\n",
 164 |     "        with open(file_path) as f:\n",
 165 |     "          line = f.readlines()\n",
 166 |     "          for lines in line:\n",
 167 |     "            flac_path = dir3_path+lines.split()[0]+'.flac'\n",
 168 |     "            \n",
 169 |     "            audio_name.append(flac_path)\n",
 170 |     "\n",
 171 |     "            # print(cmd)\n",
 172 |     "            words2 = lines.split()[1:]\n",
 173 |     "            words4=' '.join(words2)\n",
 174 |     "            audio_trans.append(words4)"
 175 |    ]
 176 |   },
 177 |   {
 178 |    "cell_type": "code",
 179 |    "execution_count": null,
 180 |    "metadata": {
 181 |     "colab": {},
 182 |     "colab_type": "code",
 183 |     "id": "5E9POoGc0FFb"
 184 |    },
 185 |    "outputs": [],
 186 |    "source": [
 187 |     "# create dataset\n",
 188 |     "df = pd.DataFrame({\"path\":audio_name,\"transcripts\":audio_trans})"
 189 |    ]
 190 |   },
 191 |   {
 192 |    "cell_type": "code",
 193 |    "execution_count": 9,
 194 |    "metadata": {
 195 |     "colab": {
 196 |      "base_uri": "https://localhost:8080/",
 197 |      "height": 34
 198 |     },
 199 |     "colab_type": "code",
 200 |     "id": "TRuPCDxZrhJu",
 201 |     "outputId": "64b3dc3f-ec85-4fc4-cf4e-60204b6f719e"
 202 |    },
 203 |    "outputs": [
 204 |     {
 205 |      "data": {
 206 |       "text/plain": [
 207 |        "(28539, 2)"
 208 |       ]
 209 |      },
 210 |      "execution_count": 9,
 211 |      "metadata": {
 212 |       "tags": []
 213 |      },
 214 |      "output_type": "execute_result"
 215 |     }
 216 |    ],
 217 |    "source": [
 218 |     "df.shape"
 219 |    ]
 220 |   },
 221 |   {
 222 |    "cell_type": "code",
 223 |    "execution_count": null,
 224 |    "metadata": {
 225 |     "colab": {},
 226 |     "colab_type": "code",
 227 |     "id": "g4bePqQvri5Q"
 228 |    },
 229 |    "outputs": [],
 230 |    "source": [
 231 |     "# keep only transcripts shorter than 100 characters\n",
 232 |     "train_data = df[df['transcripts'].str.len() < 100]\n",
 233 |     "# train_df = df.sample(n = 3000) "
 234 |    ]
 235 |   },
 236 |   {
 237 |    "cell_type": "code",
 238 |    "execution_count": 11,
 239 |    "metadata": {
 240 |     "colab": {
 241 |      "base_uri": "https://localhost:8080/",
 242 |      "height": 34
 243 |     },
 244 |     "colab_type": "code",
 245 |     "id": "fiM94FU3rkh7",
 246 |     "outputId": "9c60db2a-8b85-47e9-a294-5e46d7c2c41e"
 247 |    },
 248 |    "outputs": [
 249 |     {
 250 |      "data": {
 251 |       "text/plain": [
 252 |        "(3194, 2)"
 253 |       ]
 254 |      },
 255 |      "execution_count": 11,
 256 |      "metadata": {
 257 |       "tags": []
 258 |      },
 259 |      "output_type": "execute_result"
 260 |     }
 261 |    ],
 262 |    "source": [
 263 |     "train_data.shape"
 264 |    ]
 265 |   },
 266 |   {
 267 |    "cell_type": "markdown",
 268 |    "metadata": {
 269 |     "colab_type": "text",
 270 |     "id": "EMDC5MYk4AyL"
 271 |    },
 272 |    "source": [
 273 |     "# 2. Prepare DeepAsr CTC Pipeline"
 274 |    ]
 275 |   },
 276 |   {
 277 |    "cell_type": "code",
 278 |    "execution_count": null,
 279 |    "metadata": {
 280 |     "colab": {},
 281 |     "colab_type": "code",
 282 |     "id": "-youl7Mo0FFi"
 283 |    },
 284 |    "outputs": [],
 285 |    "source": [
 286 |     "# get CTCPipeline\n",
 287 |     "def get_config(feature_type: str = 'spectrogram', multi_gpu: bool = False):\n",
 288 |     "    # audio feature extractor\n",
 289 |     "    features_extractor = asr.features.preprocess(feature_type=feature_type, features_num=161,\n",
 290 |     "                                                 samplerate=16000,\n",
 291 |     "                                                 winlen=0.02,\n",
 292 |     "                                                 winstep=0.025,\n",
 293 |     "                                                 winfunc=np.hanning)\n",
 294 |     "    \n",
 295 |     "    # input label encoder\n",
 296 |     "    alphabet_en = asr.vocab.Alphabet(lang='en')\n",
 297 |     "    # training model\n",
 298 |     "    model = asr.model.get_deepasrnetwork1(\n",
 299 |     "        input_dim=161,\n",
 300 |     "        output_dim=29,\n",
 301 |     "        is_mixed_precision=True\n",
 302 |     "    )\n",
 303 |     "    # model optimizer\n",
 304 |     "    optimizer = tf.keras.optimizers.Adam(\n",
 305 |     "        lr=1e-4,\n",
 306 |     "        beta_1=0.9,\n",
 307 |     "        beta_2=0.999,\n",
 308 |     "        epsilon=1e-8\n",
 309 |     "    )\n",
 310 |     "    # output label decoder\n",
 311 |     "    decoder = asr.decoder.GreedyDecoder()\n",
 312 |     "    # CTCPipeline\n",
 313 |     "    pipeline = asr.pipeline.ctc_pipeline.CTCPipeline(\n",
 314 |     "        alphabet=alphabet_en, features_extractor=features_extractor, model=model, optimizer=optimizer, decoder=decoder,\n",
 315 |     "        sample_rate=16000, mono=True, multi_gpu=multi_gpu\n",
 316 |     "    )\n",
 317 |     "    return pipeline"
 318 |    ]
 319 |   },
 320 |   {
 321 |    "cell_type": "code",
 322 |    "execution_count": null,
 323 |    "metadata": {
 324 |     "colab": {},
 325 |     "colab_type": "code",
 326 |     "id": "MqdfySzuRtk5"
 327 |    },
 328 |    "outputs": [],
 329 |    "source": [
 330 |     "# CTCPipeline for ASR\n",
 331 |     "pipeline = get_config(feature_type = 'fbank', multi_gpu=False)"
 332 |    ]
 333 |   },
 334 |   {
 335 |    "cell_type": "markdown",
 336 |    "metadata": {
 337 |     "colab_type": "text",
 338 |     "id": "WTG8iEwS4NKU"
 339 |    },
 340 |    "source": [
 341 |     "# 3. Model training"
 342 |    ]
 343 |   },
 344 |   {
 345 |    "cell_type": "code",
 346 |    "execution_count": 21,
 347 |    "metadata": {
 348 |     "colab": {
 349 |      "base_uri": "https://localhost:8080/",
 350 |      "height": 1000
 351 |     },
 352 |     "colab_type": "code",
 353 |     "id": "6QMxCI8T0qMK",
 354 |     "outputId": "a598acfb-1cce-41db-db3b-b2d5cdc062d9"
 355 |    },
 356 |    "outputs": [
 357 |     {
 358 |      "name": "stdout",
 359 |      "output_type": "stream",
 360 |      "text": [
 361 |       "Model: \"DeepAsr\"\n",
 362 |       "__________________________________________________________________________________________________\n",
 363 |       "Layer (type)                    Output Shape         Param #     Connected to                     \n",
 364 |       "==================================================================================================\n",
 365 |       "the_input (InputLayer)          [(None, None, 161)]  0                                            \n",
 366 |       "__________________________________________________________________________________________________\n",
 367 |       "BN_1 (BatchNormalization)       (None, None, 161)    644         the_input[0][0]                  \n",
 368 |       "__________________________________________________________________________________________________\n",
 369 |       "Conv1D_1 (Conv1D)               (None, None, 220)    177320      BN_1[0][0]                       \n",
 370 |       "__________________________________________________________________________________________________\n",
 371 |       "CNBN_1 (BatchNormalization)     (None, None, 220)    880         Conv1D_1[0][0]                   \n",
 372 |       "__________________________________________________________________________________________________\n",
 373 |       "Conv1D_2 (Conv1D)               (None, None, 220)    242220      CNBN_1[0][0]                     \n",
 374 |       "__________________________________________________________________________________________________\n",
 375 |       "CNBN_2 (BatchNormalization)     (None, None, 220)    880         Conv1D_2[0][0]                   \n",
 376 |       "__________________________________________________________________________________________________\n",
 377 |       "gru_1 (GRU)                     (None, None, 512)    1127424     CNBN_2[0][0]                     \n",
 378 |       "__________________________________________________________________________________________________\n",
 379 |       "gru_2 (GRU)                     (None, None, 512)    1127424     CNBN_2[0][0]                     \n",
 380 |       "__________________________________________________________________________________________________\n",
 381 |       "concatenate (Concatenate)       (None, None, 1024)   0           gru_1[0][0]                      \n",
 382 |       "                                                                 gru_2[0][0]                      \n",
 383 |       "__________________________________________________________________________________________________\n",
 384 |       "BN_2 (BatchNormalization)       (None, None, 1024)   4096        concatenate[0][0]                \n",
 385 |       "__________________________________________________________________________________________________\n",
 386 |       "time_distributed (TimeDistribut (None, None, 30)     30750       BN_2[0][0]                       \n",
 387 |       "__________________________________________________________________________________________________\n",
 388 |       "the_output (TimeDistributed)    (None, None, 29)     899         time_distributed[0][0]           \n",
 389 |       "__________________________________________________________________________________________________\n",
 390 |       "the_labels (InputLayer)         [(None, None)]       0                                            \n",
 391 |       "__________________________________________________________________________________________________\n",
 392 |       "input_length (InputLayer)       [(None, 1)]          0                                            \n",
 393 |       "__________________________________________________________________________________________________\n",
 394 |       "label_length (InputLayer)       [(None, 1)]          0                                            \n",
 395 |       "__________________________________________________________________________________________________\n",
 396 |       "ctc (Lambda)                    (None, 1)            0           the_output[0][0]                 \n",
 397 |       "                                                                 the_labels[0][0]                 \n",
 398 |       "                                                                 input_length[0][0]               \n",
 399 |       "                                                                 label_length[0][0]               \n",
 400 |       "==================================================================================================\n",
 401 |       "Total params: 2,712,537\n",
 402 |       "Trainable params: 2,709,287\n",
 403 |       "Non-trainable params: 3,250\n",
 404 |       "__________________________________________________________________________________________________\n",
 405 |       "Feature Extraction in progress...\n",
 406 |       "Feature Extraction completed.\n",
 407 |       "input features:  (3194, 593, 161)\n",
 408 |       "input labels:  (3194, 99)\n",
 409 |       "Model training initiated...\n",
 410 |       "Train on 3194 samples\n",
 411 |       "Epoch 1/500\n",
 412 |       "3194/3194 [==============================] - 48s 15ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 413 |       "Epoch 2/500\n",
 414 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 415 |       "Epoch 3/500\n",
 416 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 417 |       "Epoch 4/500\n",
 418 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 419 |       "Epoch 5/500\n",
 420 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 421 |       "Epoch 6/500\n",
 422 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 423 |       "Epoch 7/500\n",
 424 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 425 |       "Epoch 8/500\n",
 426 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 427 |       "Epoch 9/500\n",
 428 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 429 |       "Epoch 10/500\n",
 430 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 431 |       "Epoch 11/500\n",
 432 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 433 |       "Epoch 12/500\n",
 434 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 435 |       "Epoch 13/500\n",
 436 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 437 |       "Epoch 14/500\n",
 438 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 439 |       "Epoch 15/500\n",
 440 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 441 |       "Epoch 16/500\n",
 442 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 443 |       "Epoch 17/500\n",
 444 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 445 |       "Epoch 18/500\n",
 446 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 447 |       "Epoch 19/500\n",
 448 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 449 |       "Epoch 20/500\n",
 450 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 451 |       "Epoch 21/500\n",
 452 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 453 |       "Epoch 22/500\n",
 454 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 455 |       "Epoch 23/500\n",
 456 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 457 |       "Epoch 24/500\n",
 458 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 459 |       "Epoch 25/500\n",
 460 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 461 |       "Epoch 26/500\n",
 462 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 463 |       "Epoch 27/500\n",
 464 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 465 |       "Epoch 28/500\n",
 466 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 467 |       "Epoch 29/500\n",
 468 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 469 |       "Epoch 30/500\n",
 470 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 471 |       "Epoch 31/500\n",
 472 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 473 |       "Epoch 32/500\n",
 474 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 475 |       "Epoch 33/500\n",
 476 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 477 |       "Epoch 34/500\n",
 478 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 479 |       "Epoch 35/500\n",
 480 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 481 |       "Epoch 36/500\n",
 482 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 483 |       "Epoch 37/500\n",
 484 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 485 |       "Epoch 38/500\n",
 486 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 487 |       "Epoch 39/500\n",
 488 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 489 |       "Epoch 40/500\n",
 490 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 491 |       "Epoch 41/500\n",
 492 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 493 |       "Epoch 42/500\n",
 494 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 495 |       "Epoch 43/500\n",
 496 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 497 |       "Epoch 44/500\n",
 498 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 499 |       "Epoch 45/500\n",
 500 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 501 |       "Epoch 46/500\n",
 502 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 503 |       "Epoch 47/500\n",
 504 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 505 |       "Epoch 48/500\n",
 506 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 507 |       "Epoch 49/500\n",
 508 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 509 |       "Epoch 50/500\n",
 510 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 511 |       "Epoch 51/500\n",
 512 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 513 |       "Epoch 52/500\n",
 514 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 515 |       "Epoch 53/500\n",
 516 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 517 |       "Epoch 54/500\n",
 518 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 519 |       "Epoch 55/500\n",
 520 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 521 |       "Epoch 56/500\n",
 522 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 523 |       "Epoch 57/500\n",
 524 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 525 |       "Epoch 58/500\n",
 526 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 527 |       "Epoch 59/500\n",
 528 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 529 |       "Epoch 60/500\n",
 530 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 531 |       "Epoch 61/500\n",
 532 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 533 |       "Epoch 62/500\n",
 534 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 535 |       "Epoch 63/500\n",
 536 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 537 |       "Epoch 64/500\n",
 538 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 539 |       "Epoch 65/500\n",
 540 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 541 |       "Epoch 66/500\n",
 542 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 543 |       "Epoch 67/500\n",
 544 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 545 |       "Epoch 68/500\n",
 546 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 547 |       "Epoch 69/500\n",
 548 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 549 |       "Epoch 70/500\n",
 550 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 551 |       "Epoch 71/500\n",
 552 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 553 |       "Epoch 72/500\n",
 554 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 555 |       "Epoch 73/500\n",
 556 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 557 |       "Epoch 74/500\n",
 558 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 559 |       "Epoch 75/500\n",
 560 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 561 |       "Epoch 76/500\n",
 562 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 563 |       "Epoch 77/500\n",
 564 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 565 |       "Epoch 78/500\n",
 566 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 567 |       "Epoch 79/500\n",
 568 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 569 |       "Epoch 80/500\n",
 570 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 571 |       "Epoch 81/500\n",
 572 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 573 |       "Epoch 82/500\n",
 574 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 575 |       "Epoch 83/500\n",
 576 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 577 |       "Epoch 84/500\n",
 578 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 579 |       "Epoch 85/500\n",
 580 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 581 |       "Epoch 86/500\n",
 582 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 583 |       "Epoch 87/500\n",
 584 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 585 |       "Epoch 88/500\n",
 586 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 587 |       "Epoch 89/500\n",
 588 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 589 |       "Epoch 90/500\n",
 590 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 591 |       "Epoch 91/500\n",
 592 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 593 |       "Epoch 92/500\n",
 594 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 595 |       "Epoch 93/500\n",
 596 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 597 |       "Epoch 94/500\n",
 598 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 599 |       "Epoch 95/500\n",
 600 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 601 |       "Epoch 96/500\n",
 602 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 603 |       "Epoch 97/500\n",
 604 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 605 |       "Epoch 98/500\n",
 606 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 607 |       "Epoch 99/500\n",
 608 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 609 |       "Epoch 100/500\n",
 610 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 611 |       "Epoch 101/500\n",
 612 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 613 |       "Epoch 102/500\n",
 614 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 615 |       "Epoch 103/500\n",
 616 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 617 |       "Epoch 104/500\n",
 618 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 619 |       "Epoch 105/500\n",
 620 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 621 |       "Epoch 106/500\n",
 622 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 623 |       "Epoch 107/500\n",
 624 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 625 |       "Epoch 108/500\n",
 626 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 627 |       "Epoch 109/500\n",
 628 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 629 |       "Epoch 110/500\n",
 630 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 631 |       "Epoch 111/500\n",
 632 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 633 |       "Epoch 112/500\n",
 634 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 635 |       "Epoch 113/500\n",
 636 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 637 |       "Epoch 114/500\n",
 638 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 639 |       "Epoch 115/500\n",
 640 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 641 |       "Epoch 116/500\n",
 642 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 643 |       "Epoch 117/500\n",
 644 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 645 |       "Epoch 118/500\n",
 646 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 647 |       "Epoch 119/500\n",
 648 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 649 |       "Epoch 120/500\n",
 650 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 651 |       "Epoch 121/500\n",
 652 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 653 |       "Epoch 122/500\n",
 654 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 655 |       "Epoch 123/500\n",
 656 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 657 |       "Epoch 124/500\n",
 658 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 659 |       "Epoch 125/500\n",
 660 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 661 |       "Epoch 126/500\n",
 662 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 663 |       "Epoch 127/500\n",
 664 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 665 |       "Epoch 128/500\n",
 666 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 667 |       "Epoch 129/500\n",
 668 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 669 |       "Epoch 130/500\n",
 670 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 671 |       "Epoch 131/500\n",
 672 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 673 |       "Epoch 132/500\n",
 674 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 675 |       "Epoch 133/500\n",
 676 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 677 |       "Epoch 134/500\n",
 678 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 679 |       "Epoch 135/500\n",
 680 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 681 |       "Epoch 136/500\n",
 682 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 683 |       "Epoch 137/500\n",
 684 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 685 |       "Epoch 138/500\n",
 686 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 687 |       "Epoch 139/500\n",
 688 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 689 |       "Epoch 140/500\n",
 690 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 691 |       "Epoch 141/500\n",
 692 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 693 |       "Epoch 142/500\n",
 694 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 695 |       "Epoch 143/500\n",
 696 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 697 |       "Epoch 144/500\n",
 698 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 699 |       "Epoch 145/500\n",
 700 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 701 |       "Epoch 146/500\n",
 702 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 703 |       "Epoch 147/500\n",
 704 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 705 |       "Epoch 148/500\n",
 706 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 707 |       "Epoch 149/500\n",
 708 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 709 |       "Epoch 150/500\n",
 710 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 711 |       "Epoch 151/500\n",
 712 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 713 |       "Epoch 152/500\n",
 714 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 715 |       "Epoch 153/500\n",
 716 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 717 |       "Epoch 154/500\n",
 718 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 719 |       "Epoch 155/500\n",
 720 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 721 |       "Epoch 156/500\n",
 722 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 723 |       "Epoch 157/500\n",
 724 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 725 |       "Epoch 158/500\n",
 726 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 727 |       "Epoch 159/500\n",
 728 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 729 |       "Epoch 160/500\n",
 730 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 731 |       "Epoch 161/500\n",
 732 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 733 |       "Epoch 162/500\n",
 734 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 735 |       "Epoch 163/500\n",
 736 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 737 |       "Epoch 164/500\n",
 738 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 739 |       "Epoch 165/500\n",
 740 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 741 |       "Epoch 166/500\n",
 742 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 743 |       "Epoch 167/500\n",
 744 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 745 |       "Epoch 168/500\n",
 746 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 747 |       "Epoch 169/500\n",
 748 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 749 |       "Epoch 170/500\n",
 750 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 751 |       "Epoch 171/500\n",
 752 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 753 |       "Epoch 172/500\n",
 754 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 755 |       "Epoch 173/500\n",
 756 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 757 |       "Epoch 174/500\n",
 758 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 759 |       "Epoch 175/500\n",
 760 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 761 |       "Epoch 176/500\n",
 762 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 763 |       "Epoch 177/500\n",
 764 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 765 |       "Epoch 178/500\n",
 766 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 767 |       "Epoch 179/500\n",
 768 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 769 |       "Epoch 180/500\n",
 770 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 771 |       "Epoch 181/500\n",
 772 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 773 |       "Epoch 182/500\n",
 774 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 775 |       "Epoch 183/500\n",
 776 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 777 |       "Epoch 184/500\n",
 778 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 779 |       "Epoch 185/500\n",
 780 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 781 |       "Epoch 186/500\n",
 782 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 783 |       "Epoch 187/500\n",
 784 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 785 |       "Epoch 188/500\n",
 786 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 787 |       "Epoch 189/500\n",
 788 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 789 |       "Epoch 190/500\n",
 790 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 791 |       "Epoch 191/500\n",
 792 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 793 |       "Epoch 192/500\n",
 794 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 795 |       "Epoch 193/500\n",
 796 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 797 |       "Epoch 194/500\n",
 798 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 799 |       "Epoch 195/500\n",
 800 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 801 |       "Epoch 196/500\n",
 802 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 803 |       "Epoch 197/500\n",
 804 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 805 |       "Epoch 198/500\n",
 806 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 807 |       "Epoch 199/500\n",
 808 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 809 |       "Epoch 200/500\n",
 810 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 811 |       "Epoch 201/500\n",
 812 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 813 |       "Epoch 202/500\n",
 814 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 815 |       "Epoch 203/500\n",
 816 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 817 |       "Epoch 204/500\n",
 818 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 819 |       "Epoch 205/500\n",
 820 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 821 |       "Epoch 206/500\n",
 822 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 823 |       "Epoch 207/500\n",
 824 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 825 |       "Epoch 208/500\n",
 826 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 827 |       "Epoch 209/500\n",
 828 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 829 |       "Epoch 210/500\n",
 830 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 831 |       "Epoch 211/500\n",
 832 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 833 |       "Epoch 212/500\n",
 834 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 835 |       "Epoch 213/500\n",
 836 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 837 |       "Epoch 214/500\n",
 838 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 839 |       "Epoch 215/500\n",
 840 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 841 |       "Epoch 216/500\n",
 842 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 843 |       "Epoch 217/500\n",
 844 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 845 |       "Epoch 218/500\n",
 846 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 847 |       "Epoch 219/500\n",
 848 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 849 |       "Epoch 220/500\n",
 850 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 851 |       "Epoch 221/500\n",
 852 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 853 |       "Epoch 222/500\n",
 854 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 855 |       "Epoch 223/500\n",
 856 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 857 |       "Epoch 224/500\n",
 858 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 859 |       "Epoch 225/500\n",
 860 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 861 |       "Epoch 226/500\n",
 862 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 863 |       "Epoch 227/500\n",
 864 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 865 |       "Epoch 228/500\n",
 866 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 867 |       "Epoch 229/500\n",
 868 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 869 |       "Epoch 230/500\n",
 870 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 871 |       "Epoch 231/500\n",
 872 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 873 |       "Epoch 232/500\n",
 874 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 875 |       "Epoch 233/500\n",
 876 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 877 |       "Epoch 234/500\n",
 878 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 879 |       "Epoch 235/500\n",
 880 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 881 |       "Epoch 236/500\n",
 882 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 883 |       "Epoch 237/500\n",
 884 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 885 |       "Epoch 238/500\n",
 886 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 887 |       "Epoch 239/500\n",
 888 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 889 |       "Epoch 240/500\n",
 890 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 891 |       "Epoch 241/500\n",
 892 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 893 |       "Epoch 242/500\n",
 894 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 895 |       "Epoch 243/500\n",
 896 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 897 |       "Epoch 244/500\n",
 898 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 899 |       "Epoch 245/500\n",
 900 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 901 |       "Epoch 246/500\n",
 902 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 903 |       "Epoch 247/500\n",
 904 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 905 |       "Epoch 248/500\n",
 906 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 907 |       "Epoch 249/500\n",
 908 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 909 |       "Epoch 250/500\n",
 910 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 911 |       "Epoch 251/500\n",
 912 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 913 |       "Epoch 252/500\n",
 914 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 915 |       "Epoch 253/500\n",
 916 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 917 |       "Epoch 254/500\n",
 918 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 919 |       "Epoch 255/500\n",
 920 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 921 |       "Epoch 256/500\n",
 922 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 923 |       "Epoch 257/500\n",
 924 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 925 |       "Epoch 258/500\n",
 926 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 927 |       "Epoch 259/500\n",
 928 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 929 |       "Epoch 260/500\n",
 930 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 931 |       "Epoch 261/500\n",
 932 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 933 |       "Epoch 262/500\n",
 934 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 935 |       "Epoch 263/500\n",
 936 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 937 |       "Epoch 264/500\n",
 938 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 939 |       "Epoch 265/500\n",
 940 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 941 |       "Epoch 266/500\n",
 942 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 943 |       "Epoch 267/500\n",
 944 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 945 |       "Epoch 268/500\n",
 946 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 947 |       "Epoch 269/500\n",
 948 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 949 |       "Epoch 270/500\n",
 950 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 951 |       "Epoch 271/500\n",
 952 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 953 |       "Epoch 272/500\n",
 954 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 955 |       "Epoch 273/500\n",
 956 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 957 |       "Epoch 274/500\n",
 958 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 959 |       "Epoch 275/500\n",
 960 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 961 |       "Epoch 276/500\n",
 962 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 963 |       "Epoch 277/500\n",
 964 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 965 |       "Epoch 278/500\n",
 966 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 967 |       "Epoch 279/500\n",
 968 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 969 |       "Epoch 280/500\n",
 970 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 971 |       "Epoch 281/500\n",
 972 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 973 |       "Epoch 282/500\n",
 974 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 975 |       "Epoch 283/500\n",
 976 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 977 |       "Epoch 284/500\n",
 978 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 979 |       "Epoch 285/500\n",
 980 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 981 |       "Epoch 286/500\n",
 982 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 983 |       "Epoch 287/500\n",
 984 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 985 |       "Epoch 288/500\n",
 986 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 987 |       "Epoch 289/500\n",
 988 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 989 |       "Epoch 290/500\n",
 990 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 991 |       "Epoch 291/500\n",
 992 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 993 |       "Epoch 292/500\n",
 994 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 995 |       "Epoch 293/500\n",
 996 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 997 |       "Epoch 294/500\n",
 998 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
 999 |       "Epoch 295/500\n",
1000 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1001 |       "Epoch 296/500\n",
1002 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1003 |       "Epoch 297/500\n",
1004 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1005 |       "Epoch 298/500\n",
1006 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1007 |       "Epoch 299/500\n",
1008 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1009 |       "Epoch 300/500\n",
1010 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1011 |       "Epoch 301/500\n",
1012 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1013 |       "Epoch 302/500\n",
1014 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1015 |       "Epoch 303/500\n",
1016 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1017 |       "Epoch 304/500\n",
1018 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1019 |       "Epoch 305/500\n",
1020 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1021 |       "Epoch 306/500\n",
1022 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1023 |       "Epoch 307/500\n",
1024 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1025 |       "Epoch 308/500\n",
1026 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1027 |       "Epoch 309/500\n",
1028 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1029 |       "Epoch 310/500\n",
1030 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1031 |       "Epoch 311/500\n",
1032 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1033 |       "Epoch 312/500\n",
1034 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1035 |       "Epoch 313/500\n",
1036 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1037 |       "Epoch 314/500\n",
1038 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1039 |       "Epoch 315/500\n",
1040 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1041 |       "Epoch 316/500\n",
1042 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1043 |       "Epoch 317/500\n",
1044 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1045 |       "Epoch 318/500\n",
1046 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1047 |       "Epoch 319/500\n",
1048 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1049 |       "Epoch 320/500\n",
1050 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1051 |       "Epoch 321/500\n",
1052 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1053 |       "Epoch 322/500\n",
1054 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1055 |       "Epoch 323/500\n",
1056 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1057 |       "Epoch 324/500\n",
1058 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1059 |       "Epoch 325/500\n",
1060 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1061 |       "Epoch 326/500\n",
1062 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1063 |       "Epoch 327/500\n",
1064 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1065 |       "Epoch 328/500\n",
1066 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1067 |       "Epoch 329/500\n",
1068 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1069 |       "Epoch 330/500\n",
1070 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1071 |       "Epoch 331/500\n",
1072 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1073 |       "Epoch 332/500\n",
1074 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1075 |       "Epoch 333/500\n",
1076 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1077 |       "Epoch 334/500\n",
1078 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1079 |       "Epoch 335/500\n",
1080 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1081 |       "Epoch 336/500\n",
1082 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1083 |       "Epoch 337/500\n",
1084 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1085 |       "Epoch 338/500\n",
1086 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1087 |       "Epoch 339/500\n",
1088 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1089 |       "Epoch 340/500\n",
1090 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1091 |       "Epoch 341/500\n",
1092 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1093 |       "Epoch 342/500\n",
1094 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1095 |       "Epoch 343/500\n",
1096 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1097 |       "Epoch 344/500\n",
1098 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1099 |       "Epoch 345/500\n",
1100 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1101 |       "Epoch 346/500\n",
1102 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1103 |       "Epoch 347/500\n",
1104 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1105 |       "Epoch 348/500\n",
1106 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1107 |       "Epoch 349/500\n",
1108 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1109 |       "Epoch 350/500\n",
1110 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1111 |       "Epoch 351/500\n",
1112 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1113 |       "Epoch 352/500\n",
1114 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1115 |       "Epoch 353/500\n",
1116 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1117 |       "Epoch 354/500\n",
1118 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1119 |       "Epoch 355/500\n",
1120 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1121 |       "Epoch 356/500\n",
1122 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1123 |       "Epoch 357/500\n",
1124 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1125 |       "Epoch 358/500\n",
1126 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1127 |       "Epoch 359/500\n",
1128 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1129 |       "Epoch 360/500\n",
1130 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1131 |       "Epoch 361/500\n",
1132 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1133 |       "Epoch 362/500\n",
1134 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1135 |       "Epoch 363/500\n",
1136 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1137 |       "Epoch 364/500\n",
1138 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1139 |       "Epoch 365/500\n",
1140 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1141 |       "Epoch 366/500\n",
1142 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1143 |       "Epoch 367/500\n",
1144 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1145 |       "Epoch 368/500\n",
1146 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1147 |       "Epoch 369/500\n",
1148 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1149 |       "Epoch 370/500\n",
1150 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1151 |       "Epoch 371/500\n",
1152 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1153 |       "Epoch 372/500\n",
1154 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1155 |       "Epoch 373/500\n",
1156 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1157 |       "Epoch 374/500\n",
1158 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1159 |       "Epoch 375/500\n",
1160 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1161 |       "Epoch 376/500\n",
1162 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1163 |       "Epoch 377/500\n",
1164 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1165 |       "Epoch 378/500\n",
1166 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1167 |       "Epoch 379/500\n",
1168 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1169 |       "Epoch 380/500\n",
1170 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1171 |       "Epoch 381/500\n",
1172 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1173 |       "Epoch 382/500\n",
1174 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1175 |       "Epoch 383/500\n",
1176 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1177 |       "Epoch 384/500\n",
1178 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1179 |       "Epoch 385/500\n",
1180 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1181 |       "Epoch 386/500\n",
1182 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1183 |       "Epoch 387/500\n",
1184 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1185 |       "Epoch 388/500\n",
1186 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1187 |       "Epoch 389/500\n",
1188 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1189 |       "Epoch 390/500\n",
1190 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1191 |       "Epoch 391/500\n",
1192 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1193 |       "Epoch 392/500\n",
1194 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1195 |       "Epoch 393/500\n",
1196 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1197 |       "Epoch 394/500\n",
1198 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1199 |       "Epoch 395/500\n",
1200 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1201 |       "Epoch 396/500\n",
1202 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1203 |       "Epoch 397/500\n",
1204 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1205 |       "Epoch 398/500\n",
1206 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1207 |       "Epoch 399/500\n",
1208 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1209 |       "Epoch 400/500\n",
1210 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1211 |       "Epoch 401/500\n",
1212 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1213 |       "Epoch 402/500\n",
1214 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1215 |       "Epoch 403/500\n",
1216 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1217 |       "Epoch 404/500\n",
1218 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1219 |       "Epoch 405/500\n",
1220 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1221 |       "Epoch 406/500\n",
1222 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1223 |       "Epoch 407/500\n",
1224 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1225 |       "Epoch 408/500\n",
1226 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1227 |       "Epoch 409/500\n",
1228 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1229 |       "Epoch 410/500\n",
1230 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1231 |       "Epoch 411/500\n",
1232 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1233 |       "Epoch 412/500\n",
1234 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1235 |       "Epoch 413/500\n",
1236 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1237 |       "Epoch 414/500\n",
1238 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1239 |       "Epoch 415/500\n",
1240 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1241 |       "Epoch 416/500\n",
1242 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1243 |       "Epoch 417/500\n",
1244 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1245 |       "Epoch 418/500\n",
1246 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1247 |       "Epoch 419/500\n",
1248 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1249 |       "Epoch 420/500\n",
1250 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 6.2617e-04\n",
1251 |       "Epoch 421/500\n",
1252 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1253 |       "Epoch 422/500\n",
1254 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1255 |       "Epoch 423/500\n",
1256 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1257 |       "Epoch 424/500\n",
1258 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1259 |       "Epoch 425/500\n",
1260 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1261 |       "Epoch 426/500\n",
1262 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1263 |       "Epoch 427/500\n",
1264 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1265 |       "Epoch 428/500\n",
1266 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1267 |       "Epoch 429/500\n",
1268 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1269 |       "Epoch 430/500\n",
1270 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1271 |       "Epoch 431/500\n",
1272 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0000e+00\n",
1273 |       "Epoch 432/500\n",
1274 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1275 |       "Epoch 433/500\n",
1276 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 6.2617e-04\n",
1277 |       "Epoch 434/500\n",
1278 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 6.2617e-04\n",
1279 |       "Epoch 435/500\n",
1280 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 6.2617e-04\n",
1281 |       "Epoch 436/500\n",
1282 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1283 |       "Epoch 437/500\n",
1284 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1285 |       "Epoch 438/500\n",
1286 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1287 |       "Epoch 439/500\n",
1288 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 6.2617e-04\n",
1289 |       "Epoch 440/500\n",
1290 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1291 |       "Epoch 441/500\n",
1292 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 3.1309e-04\n",
1293 |       "Epoch 442/500\n",
1294 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 6.2617e-04\n",
1295 |       "Epoch 443/500\n",
1296 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0016\n",
1297 |       "Epoch 444/500\n",
1298 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0013\n",
1299 |       "Epoch 445/500\n",
1300 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0013\n",
1301 |       "Epoch 446/500\n",
1302 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0013\n",
1303 |       "Epoch 447/500\n",
1304 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1305 |       "Epoch 448/500\n",
1306 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0016\n",
1307 |       "Epoch 449/500\n",
1308 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0013\n",
1309 |       "Epoch 450/500\n",
1310 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0019\n",
1311 |       "Epoch 451/500\n",
1312 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1313 |       "Epoch 452/500\n",
1314 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0013\n",
1315 |       "Epoch 453/500\n",
1316 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0041\n",
1317 |       "Epoch 454/500\n",
1318 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0028\n",
1319 |       "Epoch 455/500\n",
1320 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0034\n",
1321 |       "Epoch 456/500\n",
1322 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0019\n",
1323 |       "Epoch 457/500\n",
1324 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0025\n",
1325 |       "Epoch 458/500\n",
1326 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0013\n",
1327 |       "Epoch 459/500\n",
1328 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0028\n",
1329 |       "Epoch 460/500\n",
1330 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0025\n",
1331 |       "Epoch 461/500\n",
1332 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0022\n",
1333 |       "Epoch 462/500\n",
1334 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0031\n",
1335 |       "Epoch 463/500\n",
1336 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0044\n",
1337 |       "Epoch 464/500\n",
1338 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0028\n",
1339 |       "Epoch 465/500\n",
1340 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0031\n",
1341 |       "Epoch 466/500\n",
1342 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1343 |       "Epoch 467/500\n",
1344 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1345 |       "Epoch 468/500\n",
1346 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0013\n",
1347 |       "Epoch 469/500\n",
1348 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0047\n",
1349 |       "Epoch 470/500\n",
1350 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0044\n",
1351 |       "Epoch 471/500\n",
1352 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0085\n",
1353 |       "Epoch 472/500\n",
1354 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0100\n",
1355 |       "Epoch 473/500\n",
1356 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0163\n",
1357 |       "Epoch 474/500\n",
1358 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0157\n",
1359 |       "Epoch 475/500\n",
1360 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0059\n",
1361 |       "Epoch 476/500\n",
1362 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0053\n",
1363 |       "Epoch 477/500\n",
1364 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0066\n",
1365 |       "Epoch 478/500\n",
1366 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0078\n",
1367 |       "Epoch 479/500\n",
1368 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0091\n",
1369 |       "Epoch 480/500\n",
1370 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0138\n",
1371 |       "Epoch 481/500\n",
1372 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0066\n",
1373 |       "Epoch 482/500\n",
1374 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0081\n",
1375 |       "Epoch 483/500\n",
1376 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0119\n",
1377 |       "Epoch 484/500\n",
1378 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0185\n",
1379 |       "Epoch 485/500\n",
1380 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0100\n",
1381 |       "Epoch 486/500\n",
1382 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0138\n",
1383 |       "Epoch 487/500\n",
1384 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0238\n",
1385 |       "Epoch 488/500\n",
1386 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0122\n",
1387 |       "Epoch 489/500\n",
1388 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0110\n",
1389 |       "Epoch 490/500\n",
1390 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0150\n",
1391 |       "Epoch 491/500\n",
1392 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0041\n",
1393 |       "Epoch 492/500\n",
1394 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 9.3926e-04\n",
1395 |       "Epoch 493/500\n",
1396 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0022\n",
1397 |       "Epoch 494/500\n",
1398 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0050\n",
1399 |       "Epoch 495/500\n",
1400 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0053\n",
1401 |       "Epoch 496/500\n",
1402 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0028\n",
1403 |       "Epoch 497/500\n",
1404 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0047\n",
1405 |       "Epoch 498/500\n",
1406 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0128\n",
1407 |       "Epoch 499/500\n",
1408 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0185\n",
1409 |       "Epoch 500/500\n",
1410 |       "3194/3194 [==============================] - 39s 12ms/sample - loss: inf - accuracy: 0.0222\n"
1411 |      ]
1412 |     }
1413 |    ],
1414 |    "source": [
1415 |     "# train the ASR model on the full training set (alternative training calls are kept commented out below)\n",
1416 |     "history = pipeline.fit(train_dataset = train_data, batch_size=128, epochs=500)\n",
1417 |     "\n",
1418 |     "# history = pipeline.fit_iter(train_dataset = train_data, batch_size=32, epochs=3,iter_num=500,checkpoint=project_path+'checkpoints')\n",
1419 |     "# history = pipeline.fit_generator(train_dataset = train_data, batch_size=32, epochs=500)"
1420 |    ]
1421 |   },
1422 |   {
1423 |    "cell_type": "code",
1424 |    "execution_count": null,
1425 |    "metadata": {
1426 |     "colab": {},
1427 |     "colab_type": "code",
1428 |     "id": "5WbeF-OWwhZB"
1429 |    },
1430 |    "outputs": [],
1431 |    "source": [
1432 |     "# save the DeepAsr CTC pipeline\n",
1433 |     "pipeline.save(project_path+'checkpoints')"
1434 |    ]
1435 |   },
1436 |   {
1437 |    "cell_type": "markdown",
1438 |    "metadata": {
1439 |     "colab_type": "text",
1440 |     "id": "o_psolNH4XFl"
1441 |    },
1442 |    "source": [
1443 |     "# 4. Model testing"
1444 |    ]
1445 |   },
1446 |   {
1447 |    "cell_type": "code",
1448 |    "execution_count": 12,
1449 |    "metadata": {
1450 |     "colab": {
1451 |      "base_uri": "https://localhost:8080/",
1452 |      "height": 34
1453 |     },
1454 |     "colab_type": "code",
1455 |     "id": "EEgiUEkVc07E",
1456 |     "outputId": "fb6184a0-c0d9-4fe9-f445-37477d4661ff"
1457 |    },
1458 |    "outputs": [
1459 |     {
1460 |      "name": "stdout",
1461 |      "output_type": "stream",
1462 |      "text": [
1463 |       "WARNING:tensorflow:No training configuration found in save file: the model was *not* compiled. Compile it manually.\n"
1464 |      ]
1465 |     }
1466 |    ],
1467 |    "source": [
1468 |     "# load the saved CTC pipeline\n",
1469 |     "pipeline1 = asr.pipeline.load(project_path+'checkpoints')"
1470 |    ]
1471 |   },
1472 |   {
1473 |    "cell_type": "code",
1474 |    "execution_count": 13,
1475 |    "metadata": {
1476 |     "colab": {
1477 |      "base_uri": "https://localhost:8080/",
1478 |      "height": 67
1479 |     },
1480 |     "colab_type": "code",
1481 |     "id": "xkPo_3SMtzHp",
1482 |     "outputId": "bcdc7fa7-6852-4aa8-99e3-6db3d8508ada"
1483 |    },
1484 |    "outputs": [
1485 |     {
1486 |      "name": "stdout",
1487 |      "output_type": "stream",
1488 |      "text": [
1489 |       "Audio File: ./LibriSpeech/train-clean-100/27/124992/27-124992-0063.flac\n",
1490 |       "Audio Transcription: WENT THROUGH THE PLAINS BUT WHEN THEY CAME NEAR THE MOUNTAINS\n",
1491 |       "Transcript length: 61\n"
1492 |      ]
1493 |     }
1494 |    ],
1495 |    "source": [
1496 |     "# pick a random audio file and its transcript from the training data (note: not a held-out test set)\n",
1497 |     "index = np.random.randint(train_data.shape[0])\n",
1498 |     "data = train_data.iloc[index]\n",
1499 |     "test_file = data[0]\n",
1500 |     "test_transcript = data[1]\n",
1501 |     "# Audio file\n",
1502 |     "print(\"Audio File:\",test_file)\n",
1503 |     "# ground truth\n",
1504 |     "print(\"Audio Transcription:\", test_transcript)\n",
1505 |     "print(\"Transcript length:\",len(test_transcript))"
1506 |    ]
1507 |   },
1508 |   {
1509 |    "cell_type": "code",
1510 |    "execution_count": 14,
1511 |    "metadata": {
1512 |     "colab": {
1513 |      "base_uri": "https://localhost:8080/",
1514 |      "height": 87
1515 |     },
1516 |     "colab_type": "code",
1517 |     "id": "moqXWTQVvdxC",
1518 |     "outputId": "0645ce28-1da9-447e-cc9d-93a4f57096c8"
1519 |    },
1520 |    "outputs": [
1521 |     {
1522 |      "name": "stdout",
1523 |      "output_type": "stream",
1524 |      "text": [
1525 |       "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py:5811: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.\n",
1526 |       "Instructions for updating:\n",
1527 |       "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.\n"
1528 |      ]
1529 |     }
1530 |    ],
1531 |    "source": [
1532 |     "# predict the transcription for the selected audio file\n",
1533 |     "pred= pipeline1.predict(test_file)"
1534 |    ]
1535 |   },
1536 |   {
1537 |    "cell_type": "code",
1538 |    "execution_count": 15,
1539 |    "metadata": {
1540 |     "colab": {
1541 |      "base_uri": "https://localhost:8080/",
1542 |      "height": 34
1543 |     },
1544 |     "colab_type": "code",
1545 |     "id": "oNvRyWq8weZs",
1546 |     "outputId": "88732c3b-5412-4a84-bb25-d517d4251a8c"
1547 |    },
1548 |    "outputs": [
1549 |     {
1550 |      "data": {
1551 |       "text/plain": [
1552 |        "'WENT THROUGH THE PLAINS BUT WHEN THEY CAME NEAR THE MOUNTAINS'"
1553 |       ]
1554 |      },
1555 |      "execution_count": 15,
1556 |      "metadata": {
1557 |       "tags": []
1558 |      },
1559 |      "output_type": "execute_result"
1560 |     }
1561 |    ],
1562 |    "source": [
1563 |     "pred[0].upper()"
1564 |    ]
1565 |   },
1566 |   {
1567 |    "cell_type": "code",
1568 |    "execution_count": null,
1569 |    "metadata": {
1570 |     "colab": {},
1571 |     "colab_type": "code",
1572 |     "id": "AFMoK13mtR6V"
1573 |    },
1574 |    "outputs": [],
1575 |    "source": []
1576 |   }
1577 |  ],
1578 |  "metadata": {
1579 |   "accelerator": "GPU",
1580 |   "colab": {
1581 |    "collapsed_sections": [],
1582 |    "machine_shape": "hm",
1583 |    "name": "DeepAsr-CTC_Pipeline.ipynb",
1584 |    "provenance": []
1585 |   },
1586 |   "kernelspec": {
1587 |    "display_name": "Python 3",
1588 |    "language": "python",
1589 |    "name": "python3"
1590 |   },
1591 |   "language_info": {
1592 |    "codemirror_mode": {
1593 |     "name": "ipython",
1594 |     "version": 3
1595 |    },
1596 |    "file_extension": ".py",
1597 |    "mimetype": "text/x-python",
1598 |    "name": "python",
1599 |    "nbconvert_exporter": "python",
1600 |    "pygments_lexer": "ipython3",
1601 |    "version": "3.7.5"
1602 |   }
1603 |  },
1604 |  "nbformat": 4,
1605 |  "nbformat_minor": 4
1606 | }
1607 | 


--------------------------------------------------------------------------------