├── audio_classification_models
│   ├── layers
│   │   ├── __init__.py
│   │   ├── embedding.py
│   │   ├── positional_encoding.py
│   │   ├── subsampling.py
│   │   └── multihead_attention.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── contextnet.py
│   │   └── conformer.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── weights.py
│   │   ├── shape_util.py
│   │   └── math_util.py
│   ├── activations
│   │   ├── __init__.py
│   │   └── glu.py
│   ├── version.py
│   └── __init__.py
├── requirements.txt
├── .github
│   └── workflows
│       └── publish_to_pypi.yml
├── LICENSE.md
├── README.md
├── setup.py
└── .gitignore

/audio_classification_models/layers/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/activations/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.0.9"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.19.5
2 | # gdown>=4.4.0
3 | 
4 | 
5 | # # extra=tf2.6
6 | # tensorflow>=2.6.4
7 | # tensorflow_probability>=0.14.1
8 | # tensorflow_addons>=0.15.0
9 | # tensorflow-io>=0.21.0
10 | 
11 | six>=1.16.0
--------------------------------------------------------------------------------
/audio_classification_models/__init__.py:
--------------------------------------------------------------------------------
1 | from .models.conformer import ConformerEncoder, Conformer
2 | from .models.contextnet import ContextNetEncoder, ContextNet
3 | from .utils.weights import load_pretrain
4 | from .version import __version__
--------------------------------------------------------------------------------
/audio_classification_models/utils/weights.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | 
4 | def load_pretrain(model, url, fname=None):
5 |     """Download weights from the given URL and load them into the model."""
6 |     local_path = tf.keras.utils.get_file(fname, origin=url)
7 |     model.load_weights(local_path, by_name=True, skip_mismatch=True)
--------------------------------------------------------------------------------
/audio_classification_models/utils/shape_util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | def shape_list(x, out_type=tf.int32):
5 |     """Deal with dynamic shape in tensorflow cleanly."""
6 |     static = x.shape.as_list()
7 |     dynamic = tf.shape(x, out_type=out_type)
8 |     return [dynamic[i] if s is None else s for i, s in enumerate(static)]
9 | 
10 | 
11 | def get_shape_invariants(tensor):
12 |     shapes = shape_list(tensor)
13 |     return tf.TensorShape([i if isinstance(i, int) else None for i in shapes])
14 | 
15 | 
16 | def get_float_spec(tensor):
17 | 
shape = get_shape_invariants(tensor) 18 | return tf.TensorSpec(shape, dtype=tf.float32) -------------------------------------------------------------------------------- /audio_classification_models/activations/glu.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class GLU(tf.keras.layers.Layer): 5 | def __init__( 6 | self, 7 | axis=-1, 8 | name="glu_activation", 9 | **kwargs, 10 | ): 11 | super(GLU, self).__init__(name=name, **kwargs) 12 | self.axis = axis 13 | 14 | def call( 15 | self, 16 | inputs, 17 | **kwargs, 18 | ): 19 | a, b = tf.split(inputs, 2, axis=self.axis) 20 | b = tf.nn.sigmoid(b) 21 | return tf.multiply(a, b) 22 | 23 | def get_config(self): 24 | conf = super(GLU, self).get_config() 25 | conf.update({"axis": self.axis}) 26 | return conf -------------------------------------------------------------------------------- /.github/workflows/publish_to_pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish Python 🐍 distributions 📦 to PyPI 8 | runs-on: ubuntu-18.04 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.6 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.6 15 | - name: Install pypa/build 16 | run: >- 17 | python -m 18 | pip install 19 | build 20 | --user 21 | - name: Build a binary wheel and a source tarball 22 | run: >- 23 | python -m 24 | build 25 | --sdist 26 | --wheel 27 | --outdir dist/ 28 | . 29 | - name: Publish distribution 📦 to PyPI 30 | if: startsWith(github.ref, 'refs/tags') 31 | uses: pypa/gh-action-pypi-publish@master 32 | with: 33 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2022 Awsaf 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 | # Audio Classification Models in Tensorflow 2.0
10 | 
11 | 
12 | 
13 | This library utilizes automatic speech recognition architectures such as ContextNet and Conformer for audio classification.
14 | 
15 | 
16 | ## Kaggle Notebooks
17 | This library is used in the following notebook for **Fake Speech Detection**:
18 | * [Fake Speech Detection: Conformer [TF]](https://www.kaggle.com/code/awsaf49/fake-speech-detection-conformer-tf) (awarded the Google OSS Expert Award 2022)
19 | > **Note**: You can also access the notebook in the [`/notebooks`](/notebooks) folder.
20 | 
21 | ## Installation
22 | ```shell
23 | pip install -U audio_classification_models
24 | ```
25 | or
26 | ```shell
27 | pip install git+https://github.com/awsaf49/audio_classification_models
28 | ```
29 | 
30 | ## Usage
31 | ```py
32 | import audio_classification_models as acm
33 | model = acm.Conformer(pretrain=True)
34 | ```
35 | 
36 | ## Acknowledgement
37 | * [TensorflowASR](https://github.com/TensorSpeech/TensorFlowASR)
38 | 
--------------------------------------------------------------------------------
/audio_classification_models/layers/embedding.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | class Embedding(tf.keras.layers.Layer):
5 |     def __init__(
6 |         self,
7 |         vocab_size,
8 |         embed_dim,
9 |         constraint=None,
10 |         regularizer=None,
11 |         initializer=None,
12 |         **kwargs,
13 |     ):
14 |         super(Embedding, self).__init__(**kwargs)
15 |         self.vocab_size = vocab_size
16 |         self.embed_dim = embed_dim
17 |         self.constraint = tf.keras.constraints.get(constraint)
18 |         self.regularizer = tf.keras.regularizers.get(regularizer)
19 |         self.initializer = tf.keras.initializers.get(initializer)
20 | 
21 |     def build(self, input_shape):
22 |         self.embeddings = self.add_weight(
23 |             name="embeddings",
24 |             dtype=tf.float32,
25 |             shape=[self.vocab_size, self.embed_dim],
26 |             initializer=self.initializer,
27 |             trainable=True,
28 |             regularizer=self.regularizer,
29 |             constraint=self.constraint,
30 |         )
31 |         self.built = True
32 | 
33 |     def call(self, inputs):
34 |         outputs = tf.cast(inputs, dtype=tf.int32)
35 |         return tf.nn.embedding_lookup(self.embeddings, outputs)
36 | 
37 |     def recognize_tflite(self, inputs):
38 |         outputs = tf.cast(tf.expand_dims(inputs, axis=-1), dtype=tf.int32)
39 |         return tf.gather_nd(self.embeddings, outputs)  # https://github.com/tensorflow/tensorflow/issues/42410
40 | 
41 |     def get_config(self):
42 |         conf = super(Embedding, self).get_config()
43 |         conf.update(
44 |             {
45 |                 "vocab_size": self.vocab_size,
46 |                 "embed_dim": self.embed_dim,
47 |                 "constraint": self.constraint,
48 |                 "regularizer": self.regularizer,
49 |                 "initializer": self.initializer,
50 |             }
51 |         )
52 |         return conf
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from codecs import open
3 | from os import path
4 | 
5 | here = path.abspath(path.dirname(__file__))
6 | 
7 | # Get the long description from the README file
8 | with open(path.join(here, "README.md"), encoding="utf-8") as f:
9 |     long_description = f.read()
10 | 
11 | with open(path.join(here, 'requirements.txt')) as f:
12 |     install_requires = [x for x in f.read().splitlines() if len(x)]
13 | 
14 | exec(open("audio_classification_models/version.py").read())
15 | 
16 | setup(
17 |     name="audio_classification_models",
18 |     version=__version__,
19 |     description="Tensorflow Audio Classification Models. https://github.com/awsaf49/audio_classification_models",
20 |     long_description=long_description,
21 |     long_description_content_type="text/markdown",
22 |     url="https://github.com/awsaf49/audio_classification_models",
23 |     author="Awsaf",
24 |     author_email="awsaf49@gmail.com",
25 |     classifiers=[
26 |         # How mature is this project? Common values are
27 |         #   3 - Alpha
28 |         #   4 - Beta
29 |         #   5 - Production/Stable
30 |         "Development Status :: 3 - Alpha",
31 |         "Intended Audience :: Developers",
32 |         "Intended Audience :: Science/Research",
33 |         "License :: OSI Approved :: MIT License",
34 |         "Programming Language :: Python :: 3.6",
35 |         "Programming Language :: Python :: 3.7",
36 |         "Programming Language :: Python :: 3.8",
37 |         "Topic :: Scientific/Engineering",
38 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
39 |         "Topic :: Software Development",
40 |         "Topic :: Software Development :: Libraries",
41 |         "Topic :: Software Development :: Libraries :: Python Modules",
42 |     ],
43 |     # Note that this is a string of words separated by whitespace, not a list.
44 |     keywords="tensorflow audio speech classification",
45 |     packages=find_packages(exclude=["tests"]),
46 |     include_package_data=True,
47 |     install_requires=install_requires,
48 |     python_requires=">=3.6",
49 |     license="MIT",
50 | )
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | #  Usually these files are written by a python script from a template
32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 | 
53 | # Translations
54 | *.mo
55 | *.pot
56 | 
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 | 
83 | # pyenv
84 | .python-version
85 | 
86 | # pipenv
87 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | #   install all needed dependencies.
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | -------------------------------------------------------------------------------- /audio_classification_models/layers/positional_encoding.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from ..utils.shape_util import shape_list 4 | 5 | 6 | class PositionalEncoding(tf.keras.layers.Layer): 7 | def __init__( 8 | self, 9 | alpha: int = 1, 10 | beta: int = 0, 11 | name="positional_encoding", 12 | **kwargs, 13 | ): 14 | super().__init__(trainable=False, name=name, **kwargs) 15 | self.alpha = alpha 16 | self.beta = beta 17 | 18 | def build( 19 | self, 20 | input_shape, 21 | ): 22 | dmodel = input_shape[-1] 23 | assert dmodel % 2 == 0, f"Input last dim must be even: {dmodel}" 24 | 25 | @staticmethod 26 | def encode( 27 | max_len, 28 | dmodel, 29 | ): 30 | pos = tf.expand_dims(tf.range(max_len - 1, -1, -1.0, dtype=tf.float32), axis=1) 31 | index = tf.expand_dims(tf.range(0, dmodel, dtype=tf.float32), axis=0) 32 | 33 | pe = pos * (1 / tf.pow(10000.0, (2 * (index // 2)) / dmodel)) 34 | 35 | # Sin cos will be [max_len, size // 2] 36 | # we add 0 between numbers by using padding and reshape 37 | sin = tf.pad(tf.expand_dims(tf.sin(pe[:, 0::2]), -1), [[0, 0], [0, 0], [0, 1]], mode="CONSTANT", constant_values=0) 38 | sin = tf.reshape(sin, [max_len, dmodel]) 39 | cos = tf.pad(tf.expand_dims(tf.cos(pe[:, 1::2]), -1), [[0, 0], [0, 0], [1, 0]], mode="CONSTANT", constant_values=0) 40 | cos = tf.reshape(cos, [max_len, dmodel]) 41 | # Then add sin and cos, which results in [time, size] 42 | pe = tf.add(sin, cos) 43 | return tf.expand_dims(pe, axis=0) # [1, time, size] 44 | 45 | def call( 46 | self, 47 | inputs, 48 | **kwargs, 49 | ): 50 | # inputs shape [B, T, V] 51 | _, max_len, dmodel = shape_list(inputs) 52 | pe = self.encode(max_len * self.alpha + self.beta, dmodel) 53 | return tf.cast(pe, dtype=inputs.dtype) 54 | 55 | def get_config(self): 56 | conf = super().get_config() 57 | conf.update({"alpha": self.alpha, "beta": self.beta}) 58 | return conf 59 | 60 | 61 | class PositionalEncodingConcat(PositionalEncoding): 62 | def build( 63 | self, 64 | input_shape, 65 | ): 66 | dmodel = input_shape[-1] 67 | assert dmodel % 2 == 0, f"Input last dim must be even: {dmodel}" 68 | 69 | @staticmethod 70 | def encode( 71 | max_len, 72 | dmodel, 73 | ): 74 | pos = tf.range(max_len - 1, -1, -1.0, dtype=tf.float32) 75 | 76 | index = tf.range(0, dmodel, 2.0, dtype=tf.float32) 77 | index = 1 / tf.pow(10000.0, (index / dmodel)) 78 | 79 | sinusoid = tf.einsum("i,j->ij", pos, index) 80 | pos = tf.concat([tf.sin(sinusoid), tf.cos(sinusoid)], axis=-1) 81 | 82 | return tf.expand_dims(pos, axis=0) 83 | 84 | def call( 85 | self, 86 | inputs, 87 | **kwargs, 88 | ): 89 | # inputs shape [B, T, V] 90 | _, max_len, dmodel = shape_list(inputs) 91 | pe = self.encode(max_len * self.alpha + self.beta, dmodel) 92 | return tf.cast(pe, dtype=inputs.dtype) -------------------------------------------------------------------------------- 
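A minimal usage sketch for `PositionalEncoding` above (assuming the package is installed under the layout shown at the top): the layer returns only the sinusoidal table of shape `[1, time, dmodel]`, not `inputs + pe`; `ConformerEncoder` in conformer.py computes the table once and passes it alongside the features into each block.

```py
import tensorflow as tf
from audio_classification_models.layers.positional_encoding import PositionalEncoding

x = tf.zeros([1, 100, 144])   # [batch, time, dmodel]; dmodel must be even (asserted in build)
pe = PositionalEncoding()(x)  # sinusoidal table only, shape [1, 100, 144]
```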
/audio_classification_models/utils/math_util.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from ..utils import shape_util 6 | 7 | 8 | def log10(x): 9 | numerator = tf.math.log(x) 10 | denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype)) 11 | return numerator / denominator 12 | 13 | 14 | def get_num_batches( 15 | nsamples, 16 | batch_size, 17 | drop_remainders=True, 18 | ): 19 | if nsamples is None or batch_size is None: 20 | return None 21 | if drop_remainders: 22 | return math.floor(float(nsamples) / float(batch_size)) 23 | return math.ceil(float(nsamples) / float(batch_size)) 24 | 25 | 26 | def nan_to_zero( 27 | input_tensor: tf.Tensor, 28 | ): 29 | return tf.where(tf.math.is_nan(input_tensor), tf.zeros_like(input_tensor), input_tensor) 30 | 31 | 32 | def bytes_to_string( 33 | array: np.ndarray, 34 | encoding: str = "utf-8", 35 | ): 36 | if array is None: 37 | return None 38 | return [transcript.decode(encoding) for transcript in array] 39 | 40 | 41 | def get_reduced_length( 42 | length, 43 | reduction_factor, 44 | ): 45 | return tf.cast( 46 | tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), 47 | dtype=tf.int32, 48 | ) 49 | 50 | 51 | def count_non_blank( 52 | tensor: tf.Tensor, 53 | blank: int or tf.Tensor = 0, 54 | axis=None, 55 | ): 56 | return tf.reduce_sum( 57 | tf.where(tf.not_equal(tensor, blank), x=tf.ones_like(tensor), y=tf.zeros_like(tensor)), 58 | axis=axis, 59 | ) 60 | 61 | 62 | def merge_two_last_dims(x): 63 | b, _, f, c = shape_util.shape_list(x) 64 | return tf.reshape(x, shape=[b, -1, f * c]) 65 | 66 | 67 | def merge_repeated( 68 | yseqs, 69 | blank=0, 70 | ): 71 | result = tf.reshape(yseqs[0], [1]) 72 | 73 | U = shape_util.shape_list(yseqs)[0] 74 | i = tf.constant(1, dtype=tf.int32) 75 | 76 | def _cond(i, result, yseqs, U): 77 | return tf.less(i, U) 78 | 79 | def _body(i, result, yseqs, U): 80 | if yseqs[i] != result[-1]: 81 | result = tf.concat([result, [yseqs[i]]], axis=-1) 82 | return i + 1, result, yseqs, U 83 | 84 | _, result, _, _ = tf.while_loop( 85 | _cond, 86 | _body, 87 | loop_vars=[i, result, yseqs, U], 88 | shape_invariants=( 89 | tf.TensorShape([]), 90 | tf.TensorShape([None]), 91 | tf.TensorShape([None]), 92 | tf.TensorShape([]), 93 | ), 94 | ) 95 | 96 | return tf.pad(result, [[U - shape_util.shape_list(result)[0], 0]], constant_values=blank) 97 | 98 | 99 | def find_max_length_prediction_tfarray( 100 | tfarray: tf.TensorArray, 101 | ) -> tf.Tensor: 102 | with tf.name_scope("find_max_length_prediction_tfarray"): 103 | index = tf.constant(0, dtype=tf.int32) 104 | total = tfarray.size() 105 | max_length = tf.constant(0, dtype=tf.int32) 106 | 107 | def condition(index, _): 108 | return tf.less(index, total) 109 | 110 | def body(index, max_length): 111 | prediction = tfarray.read(index) 112 | length = tf.shape(prediction)[0] 113 | max_length = tf.where(tf.greater(length, max_length), length, max_length) 114 | return index + 1, max_length 115 | 116 | index, max_length = tf.while_loop(condition, body, loop_vars=[index, max_length], swap_memory=False) 117 | return max_length 118 | 119 | 120 | def pad_prediction_tfarray( 121 | tfarray: tf.TensorArray, 122 | blank: int or tf.Tensor, 123 | ) -> tf.TensorArray: 124 | with tf.name_scope("pad_prediction_tfarray"): 125 | index = tf.constant(0, dtype=tf.int32) 126 | total = tfarray.size() 127 | max_length = find_max_length_prediction_tfarray(tfarray) + 1 128 | 129 | def 
condition(index, _): 130 | return tf.less(index, total) 131 | 132 | def body(index, tfarray): 133 | prediction = tfarray.read(index) 134 | prediction = tf.pad( 135 | prediction, 136 | paddings=[[0, max_length - tf.shape(prediction)[0]]], 137 | mode="CONSTANT", 138 | constant_values=blank, 139 | ) 140 | tfarray = tfarray.write(index, prediction) 141 | return index + 1, tfarray 142 | 143 | index, tfarray = tf.while_loop(condition, body, loop_vars=[index, tfarray], swap_memory=False) 144 | return tfarray -------------------------------------------------------------------------------- /audio_classification_models/layers/subsampling.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | from ..utils import math_util, shape_util 5 | 6 | 7 | class TimeReduction(tf.keras.layers.Layer): 8 | def __init__( 9 | self, 10 | factor: int, 11 | name: str = "TimeReduction", 12 | **kwargs, 13 | ): 14 | super(TimeReduction, self).__init__(name=name, **kwargs) 15 | self.time_reduction_factor = factor 16 | 17 | def padding( 18 | self, 19 | time, 20 | ): 21 | new_time = tf.math.ceil(time / self.time_reduction_factor) * self.time_reduction_factor 22 | return tf.cast(new_time, dtype=tf.int32) - time 23 | 24 | def call( 25 | self, 26 | inputs, 27 | **kwargs, 28 | ): 29 | shape = shape_util.shape_list(inputs) 30 | outputs = tf.pad(inputs, [[0, 0], [0, self.padding(shape[1])], [0, 0]]) 31 | outputs = tf.reshape(outputs, [shape[0], -1, shape[-1] * self.time_reduction_factor]) 32 | return outputs 33 | 34 | def get_config(self): 35 | config = super(TimeReduction, self).get_config() 36 | config.update({"factor": self.time_reduction_factor}) 37 | return config 38 | 39 | 40 | class VggSubsampling(tf.keras.layers.Layer): 41 | def __init__( 42 | self, 43 | filters: tuple or list = (32, 64), 44 | kernel_size: int or list or tuple = 3, 45 | strides: int or list or tuple = 2, 46 | kernel_regularizer=None, 47 | bias_regularizer=None, 48 | name="VggSubsampling", 49 | **kwargs, 50 | ): 51 | super(VggSubsampling, self).__init__(name=name, **kwargs) 52 | self.conv1 = tf.keras.layers.Conv2D( 53 | filters=filters[0], 54 | kernel_size=kernel_size, 55 | strides=1, 56 | padding="same", 57 | name=f"{name}_conv_1", 58 | kernel_regularizer=kernel_regularizer, 59 | bias_regularizer=bias_regularizer, 60 | ) 61 | self.conv2 = tf.keras.layers.Conv2D( 62 | filters=filters[0], 63 | kernel_size=kernel_size, 64 | strides=1, 65 | padding="same", 66 | name=f"{name}_conv_2", 67 | kernel_regularizer=kernel_regularizer, 68 | bias_regularizer=bias_regularizer, 69 | ) 70 | self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=strides, padding="same", name=f"{name}_maxpool_1") 71 | self.conv3 = tf.keras.layers.Conv2D( 72 | filters=filters[1], 73 | kernel_size=kernel_size, 74 | strides=1, 75 | padding="same", 76 | name=f"{name}_conv_3", 77 | kernel_regularizer=kernel_regularizer, 78 | bias_regularizer=bias_regularizer, 79 | ) 80 | self.conv4 = tf.keras.layers.Conv2D( 81 | filters=filters[1], 82 | kernel_size=kernel_size, 83 | strides=1, 84 | padding="same", 85 | name=f"{name}_conv_4", 86 | kernel_regularizer=kernel_regularizer, 87 | bias_regularizer=bias_regularizer, 88 | ) 89 | self.maxpool2 = tf.keras.layers.MaxPool2D(pool_size=strides, padding="same", name=f"{name}_maxpool_2") 90 | self.time_reduction_factor = self.maxpool1.pool_size[0] * self.maxpool2.pool_size[0] 91 | 92 | def call( 93 | self, 94 | inputs, 95 | training=False, 96 | **kwargs, 97 | ): 98 | outputs = self.conv1(inputs, 
training=training) 99 | outputs = tf.nn.relu(outputs) 100 | outputs = self.conv2(outputs, training=training) 101 | outputs = tf.nn.relu(outputs) 102 | outputs = self.maxpool1(outputs, training=training) 103 | 104 | outputs = self.conv3(outputs, training=training) 105 | outputs = tf.nn.relu(outputs) 106 | outputs = self.conv4(outputs, training=training) 107 | outputs = tf.nn.relu(outputs) 108 | outputs = self.maxpool2(outputs, training=training) 109 | 110 | return math_util.merge_two_last_dims(outputs) 111 | 112 | def get_config( 113 | self, 114 | ): 115 | conf = super(VggSubsampling, self).get_config() 116 | conf.update(self.conv1.get_config()) 117 | conf.update(self.conv2.get_config()) 118 | conf.update(self.maxpool1.get_config()) 119 | conf.update(self.conv3.get_config()) 120 | conf.update(self.conv4.get_config()) 121 | conf.update(self.maxpool2.get_config()) 122 | return conf 123 | 124 | 125 | class Conv2dSubsampling(tf.keras.layers.Layer): 126 | def __init__( 127 | self, 128 | filters: int, 129 | strides: list or tuple or int = 2, 130 | kernel_size: int or list or tuple = 3, 131 | kernel_regularizer=None, 132 | bias_regularizer=None, 133 | name="Conv2dSubsampling", 134 | **kwargs, 135 | ): 136 | super(Conv2dSubsampling, self).__init__(name=name, **kwargs) 137 | self.conv1 = tf.keras.layers.Conv2D( 138 | filters=filters, 139 | kernel_size=kernel_size, 140 | strides=strides, 141 | padding="same", 142 | name=f"{name}_1", 143 | kernel_regularizer=kernel_regularizer, 144 | bias_regularizer=bias_regularizer, 145 | ) 146 | self.conv2 = tf.keras.layers.Conv2D( 147 | filters=filters, 148 | kernel_size=kernel_size, 149 | strides=strides, 150 | padding="same", 151 | name=f"{name}_2", 152 | kernel_regularizer=kernel_regularizer, 153 | bias_regularizer=bias_regularizer, 154 | ) 155 | self.time_reduction_factor = self.conv1.strides[0] * self.conv2.strides[0] 156 | 157 | def call( 158 | self, 159 | inputs, 160 | training=False, 161 | **kwargs, 162 | ): 163 | outputs = self.conv1(inputs, training=training) 164 | outputs = tf.nn.relu(outputs) 165 | outputs = self.conv2(outputs, training=training) 166 | outputs = tf.nn.relu(outputs) 167 | return math_util.merge_two_last_dims(outputs) 168 | 169 | def get_config(self): 170 | conf = super(Conv2dSubsampling, self).get_config() 171 | conf.update(self.conv1.get_config()) 172 | conf.update(self.conv2.get_config()) 173 | return conf -------------------------------------------------------------------------------- /audio_classification_models/models/contextnet.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tensorflow as tf 3 | 4 | from ..utils import math_util, weights 5 | 6 | L2 = tf.keras.regularizers.l2(1e-6) 7 | URL = "https://github.com/awsaf49/audio_classification_models/releases/download/v1.0.8/contextnet.h5" 8 | BLOCKS =[{'nlayers': 1, 'kernel_size': 5, 'filters': 256, 'strides': 1, 'residual': False, 'activation': 'silu'}, 9 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 10 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 11 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 2,'residual': True,'activation': 'silu'}, 12 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 13 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 14 | {'nlayers': 5,'kernel_size': 5,'filters': 
256,'strides': 1,'residual': True,'activation': 'silu'}, 15 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 2,'residual': True,'activation': 'silu'}, 16 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 17 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 18 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 19 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 20 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 21 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 22 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 2,'residual': True,'activation': 'silu'}, 23 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 24 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 25 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 26 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 27 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 28 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 29 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 30 | {'nlayers': 1,'kernel_size': 5,'filters': 640,'strides': 1,'residual': False,'activation': 'silu'}] 31 | 32 | def get_activation( 33 | activation: str = "silu", 34 | ): 35 | activation = activation.lower() 36 | if activation in ["silu", "swish"]: 37 | return tf.nn.swish 38 | elif activation == 'selu': 39 | return tf.nn.selu 40 | elif activation == "relu": 41 | return tf.nn.relu 42 | elif activation == "linear": 43 | return tf.keras.activations.linear 44 | else: 45 | raise ValueError("activation must be either 'silu', 'swish', 'selu', 'relu' or 'linear'") 46 | 47 | 48 | class Reshape(tf.keras.layers.Layer): 49 | def call(self, inputs): 50 | return math_util.merge_two_last_dims(inputs) 51 | 52 | 53 | class ConvModule(tf.keras.layers.Layer): 54 | def __init__( 55 | self, 56 | kernel_size: int = 3, 57 | strides: int = 1, 58 | filters: int = 256, 59 | activation: str = "silu", 60 | kernel_regularizer=None, 61 | bias_regularizer=None, 62 | **kwargs, 63 | ): 64 | super(ConvModule, self).__init__(**kwargs) 65 | self.strides = strides 66 | self.conv = tf.keras.layers.SeparableConv1D( 67 | filters=filters, 68 | kernel_size=kernel_size, 69 | strides=strides, 70 | padding="same", 71 | depthwise_regularizer=kernel_regularizer, 72 | pointwise_regularizer=kernel_regularizer, 73 | bias_regularizer=bias_regularizer, 74 | name=f"{self.name}_conv", 75 | ) 76 | self.bn = tf.keras.layers.BatchNormalization(name=f"{self.name}_bn") 77 | self.activation = get_activation(activation) 78 | 79 | def call( 80 | self, 81 | inputs, 82 | training=False, 83 | **kwargs, 84 | ): 85 | outputs = self.conv(inputs, training=training) 86 | outputs = self.bn(outputs, training=training) 87 | outputs = self.activation(outputs) 88 | return outputs 89 | 90 | 91 | class SEModule(tf.keras.layers.Layer): 92 | def __init__( 93 | self, 94 | kernel_size: int = 3, 95 | strides: int = 1, 96 | filters: int = 256, 97 | activation: str = "silu", 98 | 
kernel_regularizer=None, 99 | bias_regularizer=None, 100 | **kwargs, 101 | ): 102 | super(SEModule, self).__init__(**kwargs) 103 | self.conv = ConvModule( 104 | kernel_size=kernel_size, 105 | strides=strides, 106 | filters=filters, 107 | activation=activation, 108 | kernel_regularizer=kernel_regularizer, 109 | bias_regularizer=bias_regularizer, 110 | name=f"{self.name}_conv_module", 111 | ) 112 | self.activation = get_activation(activation) 113 | self.fc1 = tf.keras.layers.Dense(filters // 8, name=f"{self.name}_fc1") 114 | self.fc2 = tf.keras.layers.Dense(filters, name=f"{self.name}_fc2") 115 | 116 | def call( 117 | self, 118 | inputs, 119 | training=False, 120 | **kwargs, 121 | ): 122 | features, input_length = inputs 123 | outputs = self.conv(features, training=training) 124 | 125 | se = tf.divide(tf.reduce_sum(outputs, axis=1), tf.expand_dims(tf.cast(input_length, dtype=outputs.dtype), axis=1)) 126 | se = self.fc1(se, training=training) 127 | se = self.activation(se) 128 | se = self.fc2(se, training=training) 129 | se = self.activation(se) 130 | se = tf.nn.sigmoid(se) 131 | se = tf.expand_dims(se, axis=1) 132 | 133 | outputs = tf.multiply(outputs, se) 134 | return outputs 135 | 136 | 137 | class ConvBlock(tf.keras.layers.Layer): 138 | def __init__( 139 | self, 140 | nlayers: int = 3, 141 | kernel_size: int = 3, 142 | filters: int = 256, 143 | strides: int = 1, 144 | residual: bool = True, 145 | activation: str = "silu", 146 | alpha: float = 1.0, 147 | kernel_regularizer=None, 148 | bias_regularizer=None, 149 | **kwargs, 150 | ): 151 | super(ConvBlock, self).__init__(**kwargs) 152 | 153 | self.dmodel = filters 154 | self.time_reduction_factor = strides 155 | filters = int(filters * alpha) 156 | 157 | self.convs = [] 158 | for i in range(nlayers - 1): 159 | self.convs.append( 160 | ConvModule( 161 | kernel_size=kernel_size, 162 | strides=1, 163 | filters=filters, 164 | activation=activation, 165 | kernel_regularizer=kernel_regularizer, 166 | bias_regularizer=bias_regularizer, 167 | name=f"{self.name}_conv_module_{i}", 168 | ) 169 | ) 170 | 171 | self.last_conv = ConvModule( 172 | kernel_size=kernel_size, 173 | strides=strides, 174 | filters=filters, 175 | activation=activation, 176 | kernel_regularizer=kernel_regularizer, 177 | bias_regularizer=bias_regularizer, 178 | name=f"{self.name}_conv_module_{nlayers - 1}", 179 | ) 180 | 181 | self.se = SEModule( 182 | kernel_size=kernel_size, 183 | strides=1, 184 | filters=filters, 185 | activation=activation, 186 | kernel_regularizer=kernel_regularizer, 187 | bias_regularizer=bias_regularizer, 188 | name=f"{self.name}_se", 189 | ) 190 | 191 | self.residual = None 192 | if residual: 193 | self.residual = ConvModule( 194 | kernel_size=kernel_size, 195 | strides=strides, 196 | filters=filters, 197 | activation="linear", 198 | kernel_regularizer=kernel_regularizer, 199 | bias_regularizer=bias_regularizer, 200 | name=f"{self.name}_residual", 201 | ) 202 | 203 | self.activation = get_activation(activation) 204 | 205 | def call( 206 | self, 207 | inputs, 208 | training=False, 209 | **kwargs, 210 | ): 211 | features, input_length = inputs 212 | outputs = features 213 | for conv in self.convs: 214 | outputs = conv(outputs, training=training) 215 | outputs = self.last_conv(outputs, training=training) 216 | input_length = math_util.get_reduced_length(input_length, self.last_conv.strides) 217 | outputs = self.se([outputs, input_length], training=training) 218 | if self.residual is not None: 219 | res = self.residual(features, training=training) 220 | outputs 
= tf.add(outputs, res) 221 | outputs = self.activation(outputs) 222 | return outputs, input_length 223 | 224 | 225 | class ContextNetEncoder(tf.keras.Model): 226 | def __init__( 227 | self, 228 | blocks: List[dict] = BLOCKS, 229 | alpha: float = 0.5, 230 | kernel_regularizer=L2, 231 | bias_regularizer=L2, 232 | name='contextnet_encoder', 233 | **kwargs, 234 | ): 235 | super(ContextNetEncoder, self).__init__(name=name, **kwargs) 236 | 237 | self.reshape = Reshape(name=f"{self.name}_reshape") 238 | 239 | self.blocks = [] 240 | for i, config in enumerate(blocks): 241 | self.blocks.append( 242 | ConvBlock( 243 | **config, 244 | alpha=alpha, 245 | kernel_regularizer=kernel_regularizer, 246 | bias_regularizer=bias_regularizer, 247 | name=f"{self.name}_block_{i}", 248 | ) 249 | ) 250 | 251 | def call( 252 | self, 253 | inputs, 254 | training=False, 255 | **kwargs, 256 | ): 257 | outputs = inputs # shape: [B, T, F, C] 258 | input_length = tf.expand_dims(tf.shape(inputs)[1], axis=0) # spec time duration 259 | outputs = self.reshape(outputs) 260 | for block in self.blocks: 261 | outputs, input_length = block([outputs, input_length], training=training) 262 | return outputs 263 | 264 | def ContextNet(input_shape=(128, 80, 1), num_classes=1, final_activation='sigmoid', pretrain=True): 265 | inp = tf.keras.layers.Input(shape=input_shape) 266 | backbone = ContextNetEncoder() 267 | out = backbone(inp) 268 | if pretrain: 269 | weights.load_pretrain(backbone, url=URL) 270 | out = tf.keras.layers.GlobalAveragePooling1D()(out) 271 | out = tf.keras.layers.Dense(32, activation='selu')(out) 272 | out = tf.keras.layers.Dense(num_classes, activation=final_activation)(out) 273 | model = tf.keras.models.Model(inp, out) 274 | return model -------------------------------------------------------------------------------- /audio_classification_models/layers/multihead_attention.py: -------------------------------------------------------------------------------- 1 | 2 | import typing 3 | 4 | import tensorflow as tf 5 | 6 | 7 | class MultiHeadAttention(tf.keras.layers.Layer): 8 | def __init__( 9 | self, 10 | num_heads, 11 | head_size, 12 | output_size: int = None, 13 | dropout: float = 0.0, 14 | use_projection_bias: bool = True, 15 | return_attn_coef: bool = False, 16 | kernel_initializer: typing.Union[str, typing.Callable] = "glorot_uniform", 17 | kernel_regularizer: typing.Union[str, typing.Callable] = None, 18 | kernel_constraint: typing.Union[str, typing.Callable] = None, 19 | bias_initializer: typing.Union[str, typing.Callable] = "zeros", 20 | bias_regularizer: typing.Union[str, typing.Callable] = None, 21 | bias_constraint: typing.Union[str, typing.Callable] = None, 22 | **kwargs, 23 | ): 24 | super(MultiHeadAttention, self).__init__(**kwargs) 25 | 26 | if output_size is not None and output_size < 1: 27 | raise ValueError("output_size must be a positive number") 28 | 29 | self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) 30 | self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) 31 | self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) 32 | self.bias_initializer = tf.keras.initializers.get(bias_initializer) 33 | self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer) 34 | self.bias_constraint = tf.keras.constraints.get(bias_constraint) 35 | 36 | self.head_size = head_size 37 | self.num_heads = num_heads 38 | self.output_size = output_size 39 | self.use_projection_bias = use_projection_bias 40 | self.return_attn_coef = return_attn_coef 41 | 42 | 
self.dropout = tf.keras.layers.Dropout(dropout, name="dropout")
43 |         self._droput_rate = dropout
44 | 
45 |     def build(
46 |         self,
47 |         input_shape,
48 |     ):
49 |         num_query_features = input_shape[0][-1]
50 |         num_key_features = input_shape[1][-1]
51 |         num_value_features = input_shape[2][-1] if len(input_shape) > 2 else num_key_features
52 |         output_size = self.output_size if self.output_size is not None else num_value_features
53 |         self.query_kernel = self.add_weight(
54 |             name="query_kernel",
55 |             shape=[self.num_heads, num_query_features, self.head_size],
56 |             initializer=self.kernel_initializer,
57 |             regularizer=self.kernel_regularizer,
58 |             constraint=self.kernel_constraint,
59 |         )
60 |         self.key_kernel = self.add_weight(
61 |             name="key_kernel",
62 |             shape=[self.num_heads, num_key_features, self.head_size],
63 |             initializer=self.kernel_initializer,
64 |             regularizer=self.kernel_regularizer,
65 |             constraint=self.kernel_constraint,
66 |         )
67 |         self.value_kernel = self.add_weight(
68 |             name="value_kernel",
69 |             shape=[self.num_heads, num_value_features, self.head_size],
70 |             initializer=self.kernel_initializer,
71 |             regularizer=self.kernel_regularizer,
72 |             constraint=self.kernel_constraint,
73 |         )
74 |         self.projection_kernel = self.add_weight(
75 |             name="projection_kernel",
76 |             shape=[self.num_heads, self.head_size, output_size],
77 |             initializer=self.kernel_initializer,
78 |             regularizer=self.kernel_regularizer,
79 |             constraint=self.kernel_constraint,
80 |         )
81 |         if self.use_projection_bias:
82 |             self.projection_bias = self.add_weight(
83 |                 name="projection_bias",
84 |                 shape=[output_size],
85 |                 initializer=self.bias_initializer,
86 |                 regularizer=self.bias_regularizer,
87 |                 constraint=self.bias_constraint,
88 |             )
89 |         else:
90 |             self.projection_bias = None
91 | 
92 |     def call_qkv(
93 |         self,
94 |         query,
95 |         key,
96 |         value,
97 |         training=False,
98 |     ):
99 |         # verify shapes
100 |         if key.shape[-2] != value.shape[-2]:
101 |             raise ValueError(
102 |                 "the number of elements in 'key' must be equal to the number of elements in 'value'"
103 |             )
104 |         # Linear transformations
105 |         query = tf.einsum("...NI,HIO->...NHO", query, self.query_kernel)
106 |         key = tf.einsum("...MI,HIO->...MHO", key, self.key_kernel)
107 |         value = tf.einsum("...MI,HIO->...MHO", value, self.value_kernel)
108 | 
109 |         return query, key, value
110 | 
111 |     def call_attention(
112 |         self,
113 |         query,
114 |         key,
115 |         value,
116 |         logits,
117 |         training=False,
118 |         mask=None,
119 |     ):
120 |         # mask = attention mask with shape [B, Tquery, Tkey], where 1 marks positions to attend to and 0 marks masked positions
121 |         if mask is not None:
122 |             if len(mask.shape) < 2:
123 |                 raise ValueError("'mask' must have at least 2 dimensions")
124 |             if query.shape[-3] != mask.shape[-2]:
125 |                 raise ValueError("mask's second to last dimension must be equal to the number of elements in 'query'")
126 |             if key.shape[-3] != mask.shape[-1]:
127 |                 raise ValueError("mask's last dimension must be equal to the number of elements in 'key'")
128 |         # apply mask
129 |         if mask is not None:
130 |             mask = tf.cast(mask, tf.float32)
131 | 
132 |             # possibly expand on the head dimension so broadcasting works
133 |             if len(mask.shape) != len(logits.shape):
134 |                 mask = tf.expand_dims(mask, -3)
135 | 
136 |             logits += -10e9 * (1.0 - mask)
137 | 
138 |         attn_coef = tf.nn.softmax(logits)
139 | 
140 |         # attention dropout
141 |         attn_coef_dropout = self.dropout(attn_coef, training=training)
142 | 
143 |         # attention * value
144 |         multihead_output = 
tf.einsum("...HNM,...MHI->...NHI", attn_coef_dropout, value) 145 | 146 | # Run the outputs through another linear projection layer. Recombining heads 147 | # is automatically done. 148 | output = tf.einsum("...NHI,HIO->...NO", multihead_output, self.projection_kernel) 149 | 150 | if self.projection_bias is not None: 151 | output += self.projection_bias 152 | 153 | return output, attn_coef 154 | 155 | def call( 156 | self, 157 | inputs, 158 | training=False, 159 | mask=None, 160 | **kwargs, 161 | ): 162 | query, key, value = inputs 163 | 164 | query, key, value = self.call_qkv(query, key, value, training=training) 165 | 166 | # Scale dot-product, doing the division to either query or key 167 | # instead of their product saves some computation 168 | depth = tf.constant(self.head_size, dtype=tf.float32) 169 | query /= tf.sqrt(depth) 170 | 171 | # Calculate dot product attention 172 | logits = tf.einsum("...NHO,...MHO->...HNM", query, key) 173 | 174 | output, attn_coef = self.call_attention(query, key, value, logits, training=training, mask=mask) 175 | 176 | if self.return_attn_coef: 177 | return output, attn_coef 178 | else: 179 | return output 180 | 181 | def compute_output_shape( 182 | self, 183 | input_shape, 184 | ): 185 | num_value_features = input_shape[2][-1] if len(input_shape) > 2 else input_shape[1][-1] 186 | output_size = self.output_size if self.output_size is not None else num_value_features 187 | 188 | output_shape = input_shape[0][:-1] + (output_size,) 189 | 190 | if self.return_attn_coef: 191 | num_query_elements = input_shape[0][-2] 192 | num_key_elements = input_shape[1][-2] 193 | attn_coef_shape = input_shape[0][:-2] + ( 194 | self.num_heads, 195 | num_query_elements, 196 | num_key_elements, 197 | ) 198 | 199 | return output_shape, attn_coef_shape 200 | else: 201 | return output_shape 202 | 203 | def get_config(self): 204 | config = super().get_config() 205 | 206 | config.update( 207 | head_size=self.head_size, 208 | num_heads=self.num_heads, 209 | output_size=self.output_size, 210 | dropout=self._droput_rate, 211 | use_projection_bias=self.use_projection_bias, 212 | return_attn_coef=self.return_attn_coef, 213 | kernel_initializer=tf.keras.initializers.serialize(self.kernel_initializer), 214 | kernel_regularizer=tf.keras.regularizers.serialize(self.kernel_regularizer), 215 | kernel_constraint=tf.keras.constraints.serialize(self.kernel_constraint), 216 | bias_initializer=tf.keras.initializers.serialize(self.bias_initializer), 217 | bias_regularizer=tf.keras.regularizers.serialize(self.bias_regularizer), 218 | bias_constraint=tf.keras.constraints.serialize(self.bias_constraint), 219 | ) 220 | 221 | return config 222 | 223 | 224 | class RelPositionMultiHeadAttention(MultiHeadAttention): 225 | def build( 226 | self, 227 | input_shape, 228 | ): 229 | num_pos_features = input_shape[-1][-1] 230 | self.pos_kernel = self.add_weight( 231 | name="pos_kernel", 232 | shape=[self.num_heads, num_pos_features, self.head_size], 233 | initializer=self.kernel_initializer, 234 | regularizer=self.kernel_regularizer, 235 | constraint=self.kernel_constraint, 236 | ) 237 | self.pos_bias_u = self.add_weight( 238 | name="pos_bias_u", 239 | shape=[self.num_heads, self.head_size], 240 | regularizer=self.kernel_regularizer, 241 | initializer=self.kernel_initializer, 242 | constraint=self.kernel_constraint, 243 | ) 244 | self.pos_bias_v = self.add_weight( 245 | name="pos_bias_v", 246 | shape=[self.num_heads, self.head_size], 247 | regularizer=self.kernel_regularizer, 248 | 
initializer=self.kernel_initializer, 249 | constraint=self.kernel_constraint, 250 | ) 251 | super(RelPositionMultiHeadAttention, self).build(input_shape[:-1]) 252 | 253 | @staticmethod 254 | def relative_shift(x): 255 | x_shape = tf.shape(x) 256 | x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]]) 257 | x = tf.reshape(x, [x_shape[0], x_shape[1], x_shape[3] + 1, x_shape[2]]) 258 | x = tf.reshape(x[:, :, 1:, :], x_shape) 259 | return x 260 | 261 | def call( 262 | self, 263 | inputs, 264 | training=False, 265 | mask=None, 266 | **kwargs, 267 | ): 268 | query, key, value, pos = inputs 269 | 270 | query, key, value = self.call_qkv(query, key, value, training=training) 271 | 272 | pos = tf.einsum("...MI,HIO->...MHO", pos, self.pos_kernel) 273 | 274 | query_with_u = query + self.pos_bias_u 275 | query_with_v = query + self.pos_bias_v 276 | 277 | logits_with_u = tf.einsum("...NHO,...MHO->...HNM", query_with_u, key) 278 | logits_with_v = tf.einsum("...NHO,...MHO->...HNM", query_with_v, pos) 279 | logits_with_v = self.relative_shift(logits_with_v) 280 | 281 | logits = logits_with_u + logits_with_v[:, :, :, : tf.shape(logits_with_u)[3]] 282 | 283 | depth = tf.constant(self.head_size, dtype=tf.float32) 284 | logits /= tf.sqrt(depth) 285 | 286 | output, attn_coef = self.call_attention(query, key, value, logits, training=training, mask=mask) 287 | 288 | if self.return_attn_coef: 289 | return output, attn_coef 290 | else: 291 | return output -------------------------------------------------------------------------------- /audio_classification_models/models/conformer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from ..utils import shape_util, weights 4 | from ..activations.glu import GLU 5 | from ..layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention 6 | from ..layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat 7 | from ..layers.subsampling import Conv2dSubsampling, VggSubsampling 8 | 9 | L2 = tf.keras.regularizers.l2(1e-6) 10 | URL = "https://github.com/awsaf49/audio_classification_models/releases/download/v1.0.8/conformer-encoder.h5" 11 | 12 | class FFModule(tf.keras.layers.Layer): 13 | def __init__( 14 | self, 15 | input_dim, 16 | dropout=0.0, 17 | fc_factor=0.5, 18 | kernel_regularizer=L2, 19 | bias_regularizer=L2, 20 | name="ff_module", 21 | **kwargs, 22 | ): 23 | super(FFModule, self).__init__(name=name, **kwargs) 24 | self.fc_factor = fc_factor 25 | self.ln = tf.keras.layers.LayerNormalization( 26 | name=f"{name}_ln", 27 | gamma_regularizer=kernel_regularizer, 28 | beta_regularizer=bias_regularizer, 29 | ) 30 | self.ffn1 = tf.keras.layers.Dense( 31 | 4 * input_dim, 32 | name=f"{name}_dense_1", 33 | kernel_regularizer=kernel_regularizer, 34 | bias_regularizer=bias_regularizer, 35 | ) 36 | self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation") 37 | self.do1 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_1") 38 | self.ffn2 = tf.keras.layers.Dense( 39 | input_dim, 40 | name=f"{name}_dense_2", 41 | kernel_regularizer=kernel_regularizer, 42 | bias_regularizer=bias_regularizer, 43 | ) 44 | self.do2 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_2") 45 | self.res_add = tf.keras.layers.Add(name=f"{name}_add") 46 | 47 | def call( 48 | self, 49 | inputs, 50 | training=False, 51 | **kwargs, 52 | ): 53 | outputs = self.ln(inputs, training=training) 54 | outputs = self.ffn1(outputs, training=training) 55 | outputs = 
self.swish(outputs) 56 | outputs = self.do1(outputs, training=training) 57 | outputs = self.ffn2(outputs, training=training) 58 | outputs = self.do2(outputs, training=training) 59 | outputs = self.res_add([inputs, self.fc_factor * outputs]) 60 | return outputs 61 | 62 | def get_config(self): 63 | conf = super(FFModule, self).get_config() 64 | conf.update({"fc_factor": self.fc_factor}) 65 | conf.update(self.ln.get_config()) 66 | conf.update(self.ffn1.get_config()) 67 | conf.update(self.swish.get_config()) 68 | conf.update(self.do1.get_config()) 69 | conf.update(self.ffn2.get_config()) 70 | conf.update(self.do2.get_config()) 71 | conf.update(self.res_add.get_config()) 72 | return conf 73 | 74 | 75 | class MHSAModule(tf.keras.layers.Layer): 76 | def __init__( 77 | self, 78 | head_size, 79 | num_heads, 80 | dropout=0.0, 81 | mha_type="relmha", 82 | kernel_regularizer=L2, 83 | bias_regularizer=L2, 84 | name="mhsa_module", 85 | **kwargs, 86 | ): 87 | super(MHSAModule, self).__init__(name=name, **kwargs) 88 | self.ln = tf.keras.layers.LayerNormalization( 89 | name=f"{name}_ln", 90 | gamma_regularizer=kernel_regularizer, 91 | beta_regularizer=bias_regularizer, 92 | ) 93 | if mha_type == "relmha": 94 | self.mha = RelPositionMultiHeadAttention( 95 | name=f"{name}_mhsa", 96 | head_size=head_size, 97 | num_heads=num_heads, 98 | kernel_regularizer=kernel_regularizer, 99 | bias_regularizer=bias_regularizer, 100 | ) 101 | elif mha_type == "mha": 102 | self.mha = MultiHeadAttention( 103 | name=f"{name}_mhsa", 104 | head_size=head_size, 105 | num_heads=num_heads, 106 | kernel_regularizer=kernel_regularizer, 107 | bias_regularizer=bias_regularizer, 108 | ) 109 | else: 110 | raise ValueError("mha_type must be either 'mha' or 'relmha'") 111 | self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout") 112 | self.res_add = tf.keras.layers.Add(name=f"{name}_add") 113 | self.mha_type = mha_type 114 | 115 | def call( 116 | self, 117 | inputs, 118 | training=False, 119 | mask=None, 120 | **kwargs, 121 | ): 122 | inputs, pos = inputs # pos is positional encoding 123 | outputs = self.ln(inputs, training=training) 124 | if self.mha_type == "relmha": 125 | outputs = self.mha([outputs, outputs, outputs, pos], training=training, mask=mask) 126 | else: 127 | outputs = outputs + pos 128 | outputs = self.mha([outputs, outputs, outputs], training=training, mask=mask) 129 | outputs = self.do(outputs, training=training) 130 | outputs = self.res_add([inputs, outputs]) 131 | return outputs 132 | 133 | def get_config(self): 134 | conf = super(MHSAModule, self).get_config() 135 | conf.update({"mha_type": self.mha_type}) 136 | conf.update(self.ln.get_config()) 137 | conf.update(self.mha.get_config()) 138 | conf.update(self.do.get_config()) 139 | conf.update(self.res_add.get_config()) 140 | return conf 141 | 142 | 143 | class ConvModule(tf.keras.layers.Layer): 144 | def __init__( 145 | self, 146 | input_dim, 147 | kernel_size=32, 148 | dropout=0.0, 149 | depth_multiplier=1, 150 | kernel_regularizer=L2, 151 | bias_regularizer=L2, 152 | name="conv_module", 153 | **kwargs, 154 | ): 155 | super(ConvModule, self).__init__(name=name, **kwargs) 156 | self.ln = tf.keras.layers.LayerNormalization() 157 | self.pw_conv_1 = tf.keras.layers.Conv2D( 158 | filters=2 * input_dim, 159 | kernel_size=1, 160 | strides=1, 161 | padding="valid", 162 | name=f"{name}_pw_conv_1", 163 | kernel_regularizer=kernel_regularizer, 164 | bias_regularizer=bias_regularizer, 165 | ) 166 | self.glu = GLU(name=f"{name}_glu") 167 | self.dw_conv = 
tf.keras.layers.DepthwiseConv2D( 168 | kernel_size=(kernel_size, 1), 169 | strides=1, 170 | padding="same", 171 | name=f"{name}_dw_conv", 172 | depth_multiplier=depth_multiplier, 173 | depthwise_regularizer=kernel_regularizer, 174 | bias_regularizer=bias_regularizer, 175 | ) 176 | self.bn = tf.keras.layers.BatchNormalization( 177 | name=f"{name}_bn", 178 | gamma_regularizer=kernel_regularizer, 179 | beta_regularizer=bias_regularizer, 180 | ) 181 | self.swish = tf.keras.layers.Activation( 182 | tf.nn.swish, 183 | name=f"{name}_swish_activation", 184 | ) 185 | self.pw_conv_2 = tf.keras.layers.Conv2D( 186 | filters=input_dim, 187 | kernel_size=1, 188 | strides=1, 189 | padding="valid", 190 | name=f"{name}_pw_conv_2", 191 | kernel_regularizer=kernel_regularizer, 192 | bias_regularizer=bias_regularizer, 193 | ) 194 | self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout") 195 | self.res_add = tf.keras.layers.Add(name=f"{name}_add") 196 | 197 | def call( 198 | self, 199 | inputs, 200 | training=False, 201 | **kwargs, 202 | ): 203 | outputs = self.ln(inputs, training=training) 204 | B, T, E = shape_util.shape_list(outputs) 205 | outputs = tf.reshape(outputs, [B, T, 1, E]) 206 | outputs = self.pw_conv_1(outputs, training=training) 207 | outputs = self.glu(outputs) 208 | outputs = self.dw_conv(outputs, training=training) 209 | outputs = self.bn(outputs, training=training) 210 | outputs = self.swish(outputs) 211 | outputs = self.pw_conv_2(outputs, training=training) 212 | outputs = tf.reshape(outputs, [B, T, E]) 213 | outputs = self.do(outputs, training=training) 214 | outputs = self.res_add([inputs, outputs]) 215 | return outputs 216 | 217 | def get_config(self): 218 | conf = super(ConvModule, self).get_config() 219 | conf.update(self.ln.get_config()) 220 | conf.update(self.pw_conv_1.get_config()) 221 | conf.update(self.glu.get_config()) 222 | conf.update(self.dw_conv.get_config()) 223 | conf.update(self.bn.get_config()) 224 | conf.update(self.swish.get_config()) 225 | conf.update(self.pw_conv_2.get_config()) 226 | conf.update(self.do.get_config()) 227 | conf.update(self.res_add.get_config()) 228 | return conf 229 | 230 | 231 | class ConformerBlock(tf.keras.layers.Layer): 232 | def __init__( 233 | self, 234 | input_dim, 235 | dropout=0.0, 236 | fc_factor=0.5, 237 | head_size=36, 238 | num_heads=4, 239 | mha_type="relmha", 240 | kernel_size=32, 241 | depth_multiplier=1, 242 | kernel_regularizer=L2, 243 | bias_regularizer=L2, 244 | name="conformer_block", 245 | **kwargs, 246 | ): 247 | super(ConformerBlock, self).__init__(name=name, **kwargs) 248 | self.ffm1 = FFModule( 249 | input_dim=input_dim, 250 | dropout=dropout, 251 | fc_factor=fc_factor, 252 | name=f"{name}_ff_module_1", 253 | kernel_regularizer=kernel_regularizer, 254 | bias_regularizer=bias_regularizer, 255 | ) 256 | self.mhsam = MHSAModule( 257 | mha_type=mha_type, 258 | head_size=head_size, 259 | num_heads=num_heads, 260 | dropout=dropout, 261 | name=f"{name}_mhsa_module", 262 | kernel_regularizer=kernel_regularizer, 263 | bias_regularizer=bias_regularizer, 264 | ) 265 | self.convm = ConvModule( 266 | input_dim=input_dim, 267 | kernel_size=kernel_size, 268 | dropout=dropout, 269 | name=f"{name}_conv_module", 270 | depth_multiplier=depth_multiplier, 271 | kernel_regularizer=kernel_regularizer, 272 | bias_regularizer=bias_regularizer, 273 | ) 274 | self.ffm2 = FFModule( 275 | input_dim=input_dim, 276 | dropout=dropout, 277 | fc_factor=fc_factor, 278 | name=f"{name}_ff_module_2", 279 | kernel_regularizer=kernel_regularizer, 280 | 
bias_regularizer=bias_regularizer,
281 |         )
282 |         self.ln = tf.keras.layers.LayerNormalization(
283 |             name=f"{name}_ln",
284 |             gamma_regularizer=kernel_regularizer,
285 |             beta_regularizer=bias_regularizer,
286 |         )
287 | 
288 |     def call(
289 |         self,
290 |         inputs,
291 |         training=False,
292 |         mask=None,
293 |         **kwargs,
294 |     ):
295 |         inputs, pos = inputs  # pos is positional encoding
296 |         outputs = self.ffm1(inputs, training=training, **kwargs)
297 |         outputs = self.mhsam([outputs, pos], training=training, mask=mask, **kwargs)
298 |         outputs = self.convm(outputs, training=training, **kwargs)
299 |         outputs = self.ffm2(outputs, training=training, **kwargs)
300 |         outputs = self.ln(outputs, training=training)
301 |         return outputs
302 | 
303 |     def get_config(self):
304 |         conf = super(ConformerBlock, self).get_config()
305 |         conf.update(self.ffm1.get_config())
306 |         conf.update(self.mhsam.get_config())
307 |         conf.update(self.convm.get_config())
308 |         conf.update(self.ffm2.get_config())
309 |         conf.update(self.ln.get_config())
310 |         return conf
311 | 
312 | 
313 | class ConformerEncoder(tf.keras.Model):
314 |     def __init__(
315 |         self,
316 |         subsampling={'type': 'conv2d', 'filters': 144, 'kernel_size': 3, 'strides': 2},
317 |         positional_encoding="sinusoid",
318 |         dmodel=144,
319 |         num_blocks=16,
320 |         mha_type="relmha",
321 |         head_size=36,
322 |         num_heads=4,
323 |         kernel_size=32,
324 |         depth_multiplier=1,
325 |         fc_factor=0.5,
326 |         dropout=0.1,
327 |         kernel_regularizer=L2,
328 |         bias_regularizer=L2,
329 |         name="conformer_encoder",
330 |         **kwargs,
331 |     ):
332 |         super(ConformerEncoder, self).__init__(name=name, **kwargs)
333 | 
334 |         subsampling_name = subsampling.pop("type", "conv2d")
335 |         if subsampling_name == "vgg":
336 |             subsampling_class = VggSubsampling
337 |         elif subsampling_name == "conv2d":
338 |             subsampling_class = Conv2dSubsampling
339 |         else:
340 |             raise ValueError("subsampling must be either 'conv2d' or 'vgg'")
341 | 
342 |         self.conv_subsampling = subsampling_class(
343 |             **subsampling,
344 |             name=f"{name}_subsampling",
345 |             kernel_regularizer=kernel_regularizer,
346 |             bias_regularizer=bias_regularizer,
347 |         )
348 | 
349 |         if positional_encoding == "sinusoid":
350 |             self.pe = PositionalEncoding(name=f"{name}_pe")
351 |         elif positional_encoding == "sinusoid_v2":
352 |             self.pe = PositionalEncoding(alpha=2, beta=0, name=f"{name}_pe")
353 |         elif positional_encoding == "sinusoid_concat":
354 |             self.pe = PositionalEncodingConcat(name=f"{name}_pe")
355 |         elif positional_encoding == "sinusoid_concat_v2":
356 |             self.pe = PositionalEncodingConcat(alpha=2, beta=-1, name=f"{name}_pe")
357 |         elif positional_encoding == "subsampling":
358 |             self.pe = tf.keras.layers.Activation("linear", name=f"{name}_pe")
359 |         else:
360 |             raise ValueError(
361 |                 "positional_encoding must be either 'sinusoid', \
362 |                 'sinusoid_concat', 'sinusoid_v2', 'sinusoid_concat_v2' or 'subsampling'"
363 |             )
364 | 
365 |         self.linear = tf.keras.layers.Dense(
366 |             dmodel,
367 |             name=f"{name}_linear",
368 |             kernel_regularizer=kernel_regularizer,
369 |             bias_regularizer=bias_regularizer,
370 |         )
371 |         self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
372 | 
373 |         self.conformer_blocks = []
374 |         for i in range(num_blocks):
375 |             conformer_block = ConformerBlock(
376 |                 input_dim=dmodel,
377 |                 dropout=dropout,
378 |                 fc_factor=fc_factor,
379 |                 head_size=head_size,
380 |                 num_heads=num_heads,
381 |                 mha_type=mha_type,
382 |                 kernel_size=kernel_size,
383 |                 depth_multiplier=depth_multiplier,
384 |                 kernel_regularizer=kernel_regularizer,
385 | bias_regularizer=bias_regularizer, 386 | name=f"{name}_block_{i}", 387 | ) 388 | self.conformer_blocks.append(conformer_block) 389 | 390 | def call( 391 | self, 392 | inputs, 393 | training=False, 394 | mask=None, 395 | **kwargs, 396 | ): 397 | # input with shape [B, T, V1, V2] 398 | outputs = self.conv_subsampling(inputs, training=training) 399 | outputs = self.linear(outputs, training=training) 400 | pe = self.pe(outputs) 401 | outputs = self.do(outputs, training=training) 402 | for cblock in self.conformer_blocks: 403 | outputs = cblock([outputs, pe], training=training, mask=mask, **kwargs) 404 | return outputs 405 | 406 | def get_config(self): 407 | conf = super(ConformerEncoder, self).get_config() 408 | conf.update(self.conv_subsampling.get_config()) 409 | conf.update(self.linear.get_config()) 410 | conf.update(self.do.get_config()) 411 | conf.update(self.pe.get_config()) 412 | for cblock in self.conformer_blocks: 413 | conf.update(cblock.get_config()) 414 | return conf 415 | 416 | def Conformer(input_shape = (128, 80, 1),num_classes=1, final_activation='sigmoid', pretrain=True): 417 | inp = tf.keras.layers.Input(shape=input_shape) 418 | backbone = ConformerEncoder() 419 | out = backbone(inp) 420 | if pretrain: 421 | weights.load_pretrain(backbone, url=URL) 422 | out = tf.keras.layers.GlobalAveragePooling1D()(out) 423 | out = tf.keras.layers.Dense(32, activation='selu')(out) 424 | out = tf.keras.layers.Dense(num_classes, activation=final_activation)(out) 425 | model = tf.keras.models.Model(inp, out) 426 | return model 427 | 428 | --------------------------------------------------------------------------------
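Putting the pieces together, a minimal end-to-end sketch of the `Conformer` factory above (mirroring the README's Usage section; the dummy input matches the default `input_shape=(128, 80, 1)`, i.e. a batch of spectrogram patches):

```py
import tensorflow as tf
import audio_classification_models as acm

# pretrain=True downloads the encoder weights from the GitHub release URL
# baked into the module (see the URL constant at the top of conformer.py).
model = acm.Conformer(input_shape=(128, 80, 1), num_classes=1,
                      final_activation='sigmoid', pretrain=True)

x = tf.random.normal([2, 128, 80, 1])  # [batch, time, mel_bins, channels]
probs = model(x, training=False)       # shape [2, 1], sigmoid scores
```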