├── audio_classification_models
│   ├── layers
│   │   ├── __init__.py
│   │   ├── embedding.py
│   │   ├── positional_encoding.py
│   │   ├── subsampling.py
│   │   └── multihead_attention.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── contextnet.py
│   │   └── conformer.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── weights.py
│   │   ├── shape_util.py
│   │   └── math_util.py
│   ├── activations
│   │   ├── __init__.py
│   │   └── glu.py
│   ├── version.py
│   └── __init__.py
├── requirements.txt
├── .github
│   └── workflows
│       └── publish_to_pypi.yml
├── LICENSE.md
├── README.md
├── setup.py
└── .gitignore

/audio_classification_models/layers/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/activations/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/audio_classification_models/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.0.9"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.19.5
2 | # gdown>=4.4.0
3 | 
4 | 
5 | # # extra=tf2.6
6 | # tensorflow>=2.6.4
7 | # tensorflow_probability>=0.14.1
8 | # tensorflow_addons>=0.15.0
9 | # tensorflow-io>=0.21.0
10 | 
11 | six>=1.16.0
--------------------------------------------------------------------------------
/audio_classification_models/__init__.py:
--------------------------------------------------------------------------------
1 | from .models.conformer import ConformerEncoder, Conformer
2 | from .models.contextnet import ContextNetEncoder, ContextNet
3 | from .utils.weights import load_pretrain
4 | from .version import __version__
--------------------------------------------------------------------------------
/audio_classification_models/utils/weights.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | 
4 | def load_pretrain(model, url, fname=None):
5 |     """Download weights from the given URL and load them into the model."""
6 |     local_path = tf.keras.utils.get_file(fname, origin=url)
7 |     model.load_weights(local_path, by_name=True, skip_mismatch=True)
--------------------------------------------------------------------------------
/audio_classification_models/utils/shape_util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | def shape_list(x, out_type=tf.int32):
5 |     """Deal with dynamic shape in tensorflow cleanly."""
6 |     static = x.shape.as_list()
7 |     dynamic = tf.shape(x, out_type=out_type)
8 |     return [dynamic[i] if s is None else s for i, s in enumerate(static)]
9 | 
10 | 
11 | def get_shape_invariants(tensor):
12 |     shapes = shape_list(tensor)
13 |     return tf.TensorShape([i if isinstance(i, int) else None for i in shapes])
14 | 
15 | 
16 | def get_float_spec(tensor):
17 | 
shape = get_shape_invariants(tensor) 18 | return tf.TensorSpec(shape, dtype=tf.float32) -------------------------------------------------------------------------------- /audio_classification_models/activations/glu.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class GLU(tf.keras.layers.Layer): 5 | def __init__( 6 | self, 7 | axis=-1, 8 | name="glu_activation", 9 | **kwargs, 10 | ): 11 | super(GLU, self).__init__(name=name, **kwargs) 12 | self.axis = axis 13 | 14 | def call( 15 | self, 16 | inputs, 17 | **kwargs, 18 | ): 19 | a, b = tf.split(inputs, 2, axis=self.axis) 20 | b = tf.nn.sigmoid(b) 21 | return tf.multiply(a, b) 22 | 23 | def get_config(self): 24 | conf = super(GLU, self).get_config() 25 | conf.update({"axis": self.axis}) 26 | return conf -------------------------------------------------------------------------------- /.github/workflows/publish_to_pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish Python 🐍 distributions 📦 to PyPI 8 | runs-on: ubuntu-18.04 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.6 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.6 15 | - name: Install pypa/build 16 | run: >- 17 | python -m 18 | pip install 19 | build 20 | --user 21 | - name: Build a binary wheel and a source tarball 22 | run: >- 23 | python -m 24 | build 25 | --sdist 26 | --wheel 27 | --outdir dist/ 28 | . 29 | - name: Publish distribution 📦 to PyPI 30 | if: startsWith(github.ref, 'refs/tags') 31 | uses: pypa/gh-action-pypi-publish@master 32 | with: 33 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2022 Awsaf 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 | # Audio Classification Models in Tensorflow 2.0
10 | 
11 | 
12 | 
13 | This library utilizes automatic speech recognition architectures such as ContextNet and Conformer for audio classification.
14 | 
15 | 
16 | ## Kaggle Notebooks
17 | This library is used in the following notebook for **Fake Speech Detection**:
18 | * [Fake Speech Detection: Conformer [TF]](https://www.kaggle.com/code/awsaf49/fake-speech-detection-conformer-tf) (awarded the Google OSS Expert Award 2022)
19 | > **Note**: You can also access the notebook in the [`/notebooks`](/notebooks) folder.
20 | 
21 | ## Installation
22 | ```shell
23 | pip install -U audio_classification_models
24 | ```
25 | or
26 | ```shell
27 | pip install git+https://github.com/awsaf49/audio_classification_models
28 | ```
29 | 
30 | ## Usage
31 | ```py
32 | import audio_classification_models as acm
33 | model = acm.Conformer(pretrain=True)
34 | ```
35 | 
36 | ## Acknowledgement
37 | * [TensorflowASR](https://github.com/TensorSpeech/TensorFlowASR)
38 | 
--------------------------------------------------------------------------------
/audio_classification_models/layers/embedding.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | class Embedding(tf.keras.layers.Layer):
5 |     def __init__(
6 |         self,
7 |         vocab_size,
8 |         embed_dim,
9 |         constraint=None,
10 |         regularizer=None,
11 |         initializer=None,
12 |         **kwargs,
13 |     ):
14 |         super(Embedding, self).__init__(**kwargs)
15 |         self.vocab_size = vocab_size
16 |         self.embed_dim = embed_dim
17 |         self.constraint = tf.keras.constraints.get(constraint)
18 |         self.regularizer = tf.keras.regularizers.get(regularizer)
19 |         self.initializer = tf.keras.initializers.get(initializer)
20 | 
21 |     def build(self, input_shape):
22 |         self.embeddings = self.add_weight(
23 |             name="embeddings",
24 |             dtype=tf.float32,
25 |             shape=[self.vocab_size, self.embed_dim],
26 |             initializer=self.initializer,
27 |             trainable=True,
28 |             regularizer=self.regularizer,
29 |             constraint=self.constraint,
30 |         )
31 |         self.built = True
32 | 
33 |     def call(self, inputs):
34 |         outputs = tf.cast(inputs, dtype=tf.int32)
35 |         return tf.nn.embedding_lookup(self.embeddings, outputs)
36 | 
37 |     def recognize_tflite(self, inputs):
38 |         outputs = tf.cast(tf.expand_dims(inputs, axis=-1), dtype=tf.int32)
39 |         return tf.gather_nd(self.embeddings, outputs)  # https://github.com/tensorflow/tensorflow/issues/42410
40 | 
41 |     def get_config(self):
42 |         conf = super(Embedding, self).get_config()
43 |         conf.update(
44 |             {
45 |                 "vocab_size": self.vocab_size,
46 |                 "embed_dim": self.embed_dim,
47 |                 "constraint": self.constraint,
48 |                 "regularizer": self.regularizer,
49 |                 "initializer": self.initializer,
50 |             }
51 |         )
52 |         return conf
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from codecs import open
3 | from os import path
4 | 
5 | here = path.abspath(path.dirname(__file__))
6 | 
7 | # Get the long description from the README file
8 | with open(path.join(here, "README.md"), encoding="utf-8") as f:
9 |     long_description = f.read()
10 | 
11 | with open(path.join(here, 'requirements.txt')) as f:
12 |     install_requires = [x for x in f.read().splitlines() if len(x)]
13 | 
14 | exec(open("audio_classification_models/version.py").read())
15 | 
16 | setup(
17 |     name="audio_classification_models",
18 |     version=__version__,
19 |     description="Tensorflow Audio Classification Models. https://github.com/awsaf49/audio_classification_models",
20 |     long_description=long_description,
21 |     long_description_content_type="text/markdown",
22 |     url="https://github.com/awsaf49/audio_classification_models",
23 |     author="Awsaf",
24 |     author_email="awsaf49@gmail.com",
25 |     classifiers=[
26 |         # How mature is this project? Common values are
27 |         #   3 - Alpha
28 |         #   4 - Beta
29 |         #   5 - Production/Stable
30 |         "Development Status :: 3 - Alpha",
31 |         "Intended Audience :: Developers",
32 |         "Intended Audience :: Science/Research",
33 |         "License :: OSI Approved :: MIT License",
34 |         "Programming Language :: Python :: 3.6",
35 |         "Programming Language :: Python :: 3.7",
36 |         "Programming Language :: Python :: 3.8",
37 |         "Topic :: Scientific/Engineering",
38 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
39 |         "Topic :: Software Development",
40 |         "Topic :: Software Development :: Libraries",
41 |         "Topic :: Software Development :: Libraries :: Python Modules",
42 |     ],
43 |     # Note that this is a string of words separated by whitespace, not a list.
44 |     keywords="tensorflow audio speech classification",
45 |     packages=find_packages(exclude=["tests"]),
46 |     include_package_data=True,
47 |     install_requires=install_requires,
48 |     python_requires=">=3.6",
49 |     license="MIT",
50 | )
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | #  Usually these files are written by a python script from a template
32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 | 
53 | # Translations
54 | *.mo
55 | *.pot
56 | 
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 | 
83 | # pyenv
84 | .python-version
85 | 
86 | # pipenv
87 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | #   install all needed dependencies.
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | -------------------------------------------------------------------------------- /audio_classification_models/layers/positional_encoding.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from ..utils.shape_util import shape_list 4 | 5 | 6 | class PositionalEncoding(tf.keras.layers.Layer): 7 | def __init__( 8 | self, 9 | alpha: int = 1, 10 | beta: int = 0, 11 | name="positional_encoding", 12 | **kwargs, 13 | ): 14 | super().__init__(trainable=False, name=name, **kwargs) 15 | self.alpha = alpha 16 | self.beta = beta 17 | 18 | def build( 19 | self, 20 | input_shape, 21 | ): 22 | dmodel = input_shape[-1] 23 | assert dmodel % 2 == 0, f"Input last dim must be even: {dmodel}" 24 | 25 | @staticmethod 26 | def encode( 27 | max_len, 28 | dmodel, 29 | ): 30 | pos = tf.expand_dims(tf.range(max_len - 1, -1, -1.0, dtype=tf.float32), axis=1) 31 | index = tf.expand_dims(tf.range(0, dmodel, dtype=tf.float32), axis=0) 32 | 33 | pe = pos * (1 / tf.pow(10000.0, (2 * (index // 2)) / dmodel)) 34 | 35 | # Sin cos will be [max_len, size // 2] 36 | # we add 0 between numbers by using padding and reshape 37 | sin = tf.pad(tf.expand_dims(tf.sin(pe[:, 0::2]), -1), [[0, 0], [0, 0], [0, 1]], mode="CONSTANT", constant_values=0) 38 | sin = tf.reshape(sin, [max_len, dmodel]) 39 | cos = tf.pad(tf.expand_dims(tf.cos(pe[:, 1::2]), -1), [[0, 0], [0, 0], [1, 0]], mode="CONSTANT", constant_values=0) 40 | cos = tf.reshape(cos, [max_len, dmodel]) 41 | # Then add sin and cos, which results in [time, size] 42 | pe = tf.add(sin, cos) 43 | return tf.expand_dims(pe, axis=0) # [1, time, size] 44 | 45 | def call( 46 | self, 47 | inputs, 48 | **kwargs, 49 | ): 50 | # inputs shape [B, T, V] 51 | _, max_len, dmodel = shape_list(inputs) 52 | pe = self.encode(max_len * self.alpha + self.beta, dmodel) 53 | return tf.cast(pe, dtype=inputs.dtype) 54 | 55 | def get_config(self): 56 | conf = super().get_config() 57 | conf.update({"alpha": self.alpha, "beta": self.beta}) 58 | return conf 59 | 60 | 61 | class PositionalEncodingConcat(PositionalEncoding): 62 | def build( 63 | self, 64 | input_shape, 65 | ): 66 | dmodel = input_shape[-1] 67 | assert dmodel % 2 == 0, f"Input last dim must be even: {dmodel}" 68 | 69 | @staticmethod 70 | def encode( 71 | max_len, 72 | dmodel, 73 | ): 74 | pos = tf.range(max_len - 1, -1, -1.0, dtype=tf.float32) 75 | 76 | index = tf.range(0, dmodel, 2.0, dtype=tf.float32) 77 | index = 1 / tf.pow(10000.0, (index / dmodel)) 78 | 79 | sinusoid = tf.einsum("i,j->ij", pos, index) 80 | pos = tf.concat([tf.sin(sinusoid), tf.cos(sinusoid)], axis=-1) 81 | 82 | return tf.expand_dims(pos, axis=0) 83 | 84 | def call( 85 | self, 86 | inputs, 87 | **kwargs, 88 | ): 89 | # inputs shape [B, T, V] 90 | _, max_len, dmodel = shape_list(inputs) 91 | pe = self.encode(max_len * self.alpha + self.beta, dmodel) 92 | return tf.cast(pe, dtype=inputs.dtype) -------------------------------------------------------------------------------- 
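A minimal usage sketch for `PositionalEncoding` above (assuming the package is installed under the layout shown at the top): the layer returns only the sinusoidal table of shape `[1, time, dmodel]`, not `inputs + pe`; `ConformerEncoder` in conformer.py computes the table once and passes it alongside the features into each block.

```py
import tensorflow as tf
from audio_classification_models.layers.positional_encoding import PositionalEncoding

x = tf.zeros([1, 100, 144])   # [batch, time, dmodel]; dmodel must be even (asserted in build)
pe = PositionalEncoding()(x)  # sinusoidal table only, shape [1, 100, 144]
```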
/audio_classification_models/utils/math_util.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from ..utils import shape_util 6 | 7 | 8 | def log10(x): 9 | numerator = tf.math.log(x) 10 | denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype)) 11 | return numerator / denominator 12 | 13 | 14 | def get_num_batches( 15 | nsamples, 16 | batch_size, 17 | drop_remainders=True, 18 | ): 19 | if nsamples is None or batch_size is None: 20 | return None 21 | if drop_remainders: 22 | return math.floor(float(nsamples) / float(batch_size)) 23 | return math.ceil(float(nsamples) / float(batch_size)) 24 | 25 | 26 | def nan_to_zero( 27 | input_tensor: tf.Tensor, 28 | ): 29 | return tf.where(tf.math.is_nan(input_tensor), tf.zeros_like(input_tensor), input_tensor) 30 | 31 | 32 | def bytes_to_string( 33 | array: np.ndarray, 34 | encoding: str = "utf-8", 35 | ): 36 | if array is None: 37 | return None 38 | return [transcript.decode(encoding) for transcript in array] 39 | 40 | 41 | def get_reduced_length( 42 | length, 43 | reduction_factor, 44 | ): 45 | return tf.cast( 46 | tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), 47 | dtype=tf.int32, 48 | ) 49 | 50 | 51 | def count_non_blank( 52 | tensor: tf.Tensor, 53 | blank: int or tf.Tensor = 0, 54 | axis=None, 55 | ): 56 | return tf.reduce_sum( 57 | tf.where(tf.not_equal(tensor, blank), x=tf.ones_like(tensor), y=tf.zeros_like(tensor)), 58 | axis=axis, 59 | ) 60 | 61 | 62 | def merge_two_last_dims(x): 63 | b, _, f, c = shape_util.shape_list(x) 64 | return tf.reshape(x, shape=[b, -1, f * c]) 65 | 66 | 67 | def merge_repeated( 68 | yseqs, 69 | blank=0, 70 | ): 71 | result = tf.reshape(yseqs[0], [1]) 72 | 73 | U = shape_util.shape_list(yseqs)[0] 74 | i = tf.constant(1, dtype=tf.int32) 75 | 76 | def _cond(i, result, yseqs, U): 77 | return tf.less(i, U) 78 | 79 | def _body(i, result, yseqs, U): 80 | if yseqs[i] != result[-1]: 81 | result = tf.concat([result, [yseqs[i]]], axis=-1) 82 | return i + 1, result, yseqs, U 83 | 84 | _, result, _, _ = tf.while_loop( 85 | _cond, 86 | _body, 87 | loop_vars=[i, result, yseqs, U], 88 | shape_invariants=( 89 | tf.TensorShape([]), 90 | tf.TensorShape([None]), 91 | tf.TensorShape([None]), 92 | tf.TensorShape([]), 93 | ), 94 | ) 95 | 96 | return tf.pad(result, [[U - shape_util.shape_list(result)[0], 0]], constant_values=blank) 97 | 98 | 99 | def find_max_length_prediction_tfarray( 100 | tfarray: tf.TensorArray, 101 | ) -> tf.Tensor: 102 | with tf.name_scope("find_max_length_prediction_tfarray"): 103 | index = tf.constant(0, dtype=tf.int32) 104 | total = tfarray.size() 105 | max_length = tf.constant(0, dtype=tf.int32) 106 | 107 | def condition(index, _): 108 | return tf.less(index, total) 109 | 110 | def body(index, max_length): 111 | prediction = tfarray.read(index) 112 | length = tf.shape(prediction)[0] 113 | max_length = tf.where(tf.greater(length, max_length), length, max_length) 114 | return index + 1, max_length 115 | 116 | index, max_length = tf.while_loop(condition, body, loop_vars=[index, max_length], swap_memory=False) 117 | return max_length 118 | 119 | 120 | def pad_prediction_tfarray( 121 | tfarray: tf.TensorArray, 122 | blank: int or tf.Tensor, 123 | ) -> tf.TensorArray: 124 | with tf.name_scope("pad_prediction_tfarray"): 125 | index = tf.constant(0, dtype=tf.int32) 126 | total = tfarray.size() 127 | max_length = find_max_length_prediction_tfarray(tfarray) + 1 128 | 129 | def 
condition(index, _): 130 | return tf.less(index, total) 131 | 132 | def body(index, tfarray): 133 | prediction = tfarray.read(index) 134 | prediction = tf.pad( 135 | prediction, 136 | paddings=[[0, max_length - tf.shape(prediction)[0]]], 137 | mode="CONSTANT", 138 | constant_values=blank, 139 | ) 140 | tfarray = tfarray.write(index, prediction) 141 | return index + 1, tfarray 142 | 143 | index, tfarray = tf.while_loop(condition, body, loop_vars=[index, tfarray], swap_memory=False) 144 | return tfarray -------------------------------------------------------------------------------- /audio_classification_models/layers/subsampling.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | from ..utils import math_util, shape_util 5 | 6 | 7 | class TimeReduction(tf.keras.layers.Layer): 8 | def __init__( 9 | self, 10 | factor: int, 11 | name: str = "TimeReduction", 12 | **kwargs, 13 | ): 14 | super(TimeReduction, self).__init__(name=name, **kwargs) 15 | self.time_reduction_factor = factor 16 | 17 | def padding( 18 | self, 19 | time, 20 | ): 21 | new_time = tf.math.ceil(time / self.time_reduction_factor) * self.time_reduction_factor 22 | return tf.cast(new_time, dtype=tf.int32) - time 23 | 24 | def call( 25 | self, 26 | inputs, 27 | **kwargs, 28 | ): 29 | shape = shape_util.shape_list(inputs) 30 | outputs = tf.pad(inputs, [[0, 0], [0, self.padding(shape[1])], [0, 0]]) 31 | outputs = tf.reshape(outputs, [shape[0], -1, shape[-1] * self.time_reduction_factor]) 32 | return outputs 33 | 34 | def get_config(self): 35 | config = super(TimeReduction, self).get_config() 36 | config.update({"factor": self.time_reduction_factor}) 37 | return config 38 | 39 | 40 | class VggSubsampling(tf.keras.layers.Layer): 41 | def __init__( 42 | self, 43 | filters: tuple or list = (32, 64), 44 | kernel_size: int or list or tuple = 3, 45 | strides: int or list or tuple = 2, 46 | kernel_regularizer=None, 47 | bias_regularizer=None, 48 | name="VggSubsampling", 49 | **kwargs, 50 | ): 51 | super(VggSubsampling, self).__init__(name=name, **kwargs) 52 | self.conv1 = tf.keras.layers.Conv2D( 53 | filters=filters[0], 54 | kernel_size=kernel_size, 55 | strides=1, 56 | padding="same", 57 | name=f"{name}_conv_1", 58 | kernel_regularizer=kernel_regularizer, 59 | bias_regularizer=bias_regularizer, 60 | ) 61 | self.conv2 = tf.keras.layers.Conv2D( 62 | filters=filters[0], 63 | kernel_size=kernel_size, 64 | strides=1, 65 | padding="same", 66 | name=f"{name}_conv_2", 67 | kernel_regularizer=kernel_regularizer, 68 | bias_regularizer=bias_regularizer, 69 | ) 70 | self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=strides, padding="same", name=f"{name}_maxpool_1") 71 | self.conv3 = tf.keras.layers.Conv2D( 72 | filters=filters[1], 73 | kernel_size=kernel_size, 74 | strides=1, 75 | padding="same", 76 | name=f"{name}_conv_3", 77 | kernel_regularizer=kernel_regularizer, 78 | bias_regularizer=bias_regularizer, 79 | ) 80 | self.conv4 = tf.keras.layers.Conv2D( 81 | filters=filters[1], 82 | kernel_size=kernel_size, 83 | strides=1, 84 | padding="same", 85 | name=f"{name}_conv_4", 86 | kernel_regularizer=kernel_regularizer, 87 | bias_regularizer=bias_regularizer, 88 | ) 89 | self.maxpool2 = tf.keras.layers.MaxPool2D(pool_size=strides, padding="same", name=f"{name}_maxpool_2") 90 | self.time_reduction_factor = self.maxpool1.pool_size[0] * self.maxpool2.pool_size[0] 91 | 92 | def call( 93 | self, 94 | inputs, 95 | training=False, 96 | **kwargs, 97 | ): 98 | outputs = self.conv1(inputs, 
training=training) 99 | outputs = tf.nn.relu(outputs) 100 | outputs = self.conv2(outputs, training=training) 101 | outputs = tf.nn.relu(outputs) 102 | outputs = self.maxpool1(outputs, training=training) 103 | 104 | outputs = self.conv3(outputs, training=training) 105 | outputs = tf.nn.relu(outputs) 106 | outputs = self.conv4(outputs, training=training) 107 | outputs = tf.nn.relu(outputs) 108 | outputs = self.maxpool2(outputs, training=training) 109 | 110 | return math_util.merge_two_last_dims(outputs) 111 | 112 | def get_config( 113 | self, 114 | ): 115 | conf = super(VggSubsampling, self).get_config() 116 | conf.update(self.conv1.get_config()) 117 | conf.update(self.conv2.get_config()) 118 | conf.update(self.maxpool1.get_config()) 119 | conf.update(self.conv3.get_config()) 120 | conf.update(self.conv4.get_config()) 121 | conf.update(self.maxpool2.get_config()) 122 | return conf 123 | 124 | 125 | class Conv2dSubsampling(tf.keras.layers.Layer): 126 | def __init__( 127 | self, 128 | filters: int, 129 | strides: list or tuple or int = 2, 130 | kernel_size: int or list or tuple = 3, 131 | kernel_regularizer=None, 132 | bias_regularizer=None, 133 | name="Conv2dSubsampling", 134 | **kwargs, 135 | ): 136 | super(Conv2dSubsampling, self).__init__(name=name, **kwargs) 137 | self.conv1 = tf.keras.layers.Conv2D( 138 | filters=filters, 139 | kernel_size=kernel_size, 140 | strides=strides, 141 | padding="same", 142 | name=f"{name}_1", 143 | kernel_regularizer=kernel_regularizer, 144 | bias_regularizer=bias_regularizer, 145 | ) 146 | self.conv2 = tf.keras.layers.Conv2D( 147 | filters=filters, 148 | kernel_size=kernel_size, 149 | strides=strides, 150 | padding="same", 151 | name=f"{name}_2", 152 | kernel_regularizer=kernel_regularizer, 153 | bias_regularizer=bias_regularizer, 154 | ) 155 | self.time_reduction_factor = self.conv1.strides[0] * self.conv2.strides[0] 156 | 157 | def call( 158 | self, 159 | inputs, 160 | training=False, 161 | **kwargs, 162 | ): 163 | outputs = self.conv1(inputs, training=training) 164 | outputs = tf.nn.relu(outputs) 165 | outputs = self.conv2(outputs, training=training) 166 | outputs = tf.nn.relu(outputs) 167 | return math_util.merge_two_last_dims(outputs) 168 | 169 | def get_config(self): 170 | conf = super(Conv2dSubsampling, self).get_config() 171 | conf.update(self.conv1.get_config()) 172 | conf.update(self.conv2.get_config()) 173 | return conf -------------------------------------------------------------------------------- /audio_classification_models/models/contextnet.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tensorflow as tf 3 | 4 | from ..utils import math_util, weights 5 | 6 | L2 = tf.keras.regularizers.l2(1e-6) 7 | URL = "https://github.com/awsaf49/audio_classification_models/releases/download/v1.0.8/contextnet.h5" 8 | BLOCKS =[{'nlayers': 1, 'kernel_size': 5, 'filters': 256, 'strides': 1, 'residual': False, 'activation': 'silu'}, 9 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 10 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 11 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 2,'residual': True,'activation': 'silu'}, 12 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 13 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 14 | {'nlayers': 5,'kernel_size': 5,'filters': 
256,'strides': 1,'residual': True,'activation': 'silu'}, 15 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 2,'residual': True,'activation': 'silu'}, 16 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 17 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 18 | {'nlayers': 5,'kernel_size': 5,'filters': 256,'strides': 1,'residual': True,'activation': 'silu'}, 19 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 20 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 21 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 22 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 2,'residual': True,'activation': 'silu'}, 23 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 24 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 25 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 26 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 27 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 28 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 29 | {'nlayers': 5,'kernel_size': 5,'filters': 512,'strides': 1,'residual': True,'activation': 'silu'}, 30 | {'nlayers': 1,'kernel_size': 5,'filters': 640,'strides': 1,'residual': False,'activation': 'silu'}] 31 | 32 | def get_activation( 33 | activation: str = "silu", 34 | ): 35 | activation = activation.lower() 36 | if activation in ["silu", "swish"]: 37 | return tf.nn.swish 38 | elif activation == 'selu': 39 | return tf.nn.selu 40 | elif activation == "relu": 41 | return tf.nn.relu 42 | elif activation == "linear": 43 | return tf.keras.activations.linear 44 | else: 45 | raise ValueError("activation must be either 'silu', 'swish', 'selu', 'relu' or 'linear'") 46 | 47 | 48 | class Reshape(tf.keras.layers.Layer): 49 | def call(self, inputs): 50 | return math_util.merge_two_last_dims(inputs) 51 | 52 | 53 | class ConvModule(tf.keras.layers.Layer): 54 | def __init__( 55 | self, 56 | kernel_size: int = 3, 57 | strides: int = 1, 58 | filters: int = 256, 59 | activation: str = "silu", 60 | kernel_regularizer=None, 61 | bias_regularizer=None, 62 | **kwargs, 63 | ): 64 | super(ConvModule, self).__init__(**kwargs) 65 | self.strides = strides 66 | self.conv = tf.keras.layers.SeparableConv1D( 67 | filters=filters, 68 | kernel_size=kernel_size, 69 | strides=strides, 70 | padding="same", 71 | depthwise_regularizer=kernel_regularizer, 72 | pointwise_regularizer=kernel_regularizer, 73 | bias_regularizer=bias_regularizer, 74 | name=f"{self.name}_conv", 75 | ) 76 | self.bn = tf.keras.layers.BatchNormalization(name=f"{self.name}_bn") 77 | self.activation = get_activation(activation) 78 | 79 | def call( 80 | self, 81 | inputs, 82 | training=False, 83 | **kwargs, 84 | ): 85 | outputs = self.conv(inputs, training=training) 86 | outputs = self.bn(outputs, training=training) 87 | outputs = self.activation(outputs) 88 | return outputs 89 | 90 | 91 | class SEModule(tf.keras.layers.Layer): 92 | def __init__( 93 | self, 94 | kernel_size: int = 3, 95 | strides: int = 1, 96 | filters: int = 256, 97 | activation: str = "silu", 98 | 
kernel_regularizer=None, 99 | bias_regularizer=None, 100 | **kwargs, 101 | ): 102 | super(SEModule, self).__init__(**kwargs) 103 | self.conv = ConvModule( 104 | kernel_size=kernel_size, 105 | strides=strides, 106 | filters=filters, 107 | activation=activation, 108 | kernel_regularizer=kernel_regularizer, 109 | bias_regularizer=bias_regularizer, 110 | name=f"{self.name}_conv_module", 111 | ) 112 | self.activation = get_activation(activation) 113 | self.fc1 = tf.keras.layers.Dense(filters // 8, name=f"{self.name}_fc1") 114 | self.fc2 = tf.keras.layers.Dense(filters, name=f"{self.name}_fc2") 115 | 116 | def call( 117 | self, 118 | inputs, 119 | training=False, 120 | **kwargs, 121 | ): 122 | features, input_length = inputs 123 | outputs = self.conv(features, training=training) 124 | 125 | se = tf.divide(tf.reduce_sum(outputs, axis=1), tf.expand_dims(tf.cast(input_length, dtype=outputs.dtype), axis=1)) 126 | se = self.fc1(se, training=training) 127 | se = self.activation(se) 128 | se = self.fc2(se, training=training) 129 | se = self.activation(se) 130 | se = tf.nn.sigmoid(se) 131 | se = tf.expand_dims(se, axis=1) 132 | 133 | outputs = tf.multiply(outputs, se) 134 | return outputs 135 | 136 | 137 | class ConvBlock(tf.keras.layers.Layer): 138 | def __init__( 139 | self, 140 | nlayers: int = 3, 141 | kernel_size: int = 3, 142 | filters: int = 256, 143 | strides: int = 1, 144 | residual: bool = True, 145 | activation: str = "silu", 146 | alpha: float = 1.0, 147 | kernel_regularizer=None, 148 | bias_regularizer=None, 149 | **kwargs, 150 | ): 151 | super(ConvBlock, self).__init__(**kwargs) 152 | 153 | self.dmodel = filters 154 | self.time_reduction_factor = strides 155 | filters = int(filters * alpha) 156 | 157 | self.convs = [] 158 | for i in range(nlayers - 1): 159 | self.convs.append( 160 | ConvModule( 161 | kernel_size=kernel_size, 162 | strides=1, 163 | filters=filters, 164 | activation=activation, 165 | kernel_regularizer=kernel_regularizer, 166 | bias_regularizer=bias_regularizer, 167 | name=f"{self.name}_conv_module_{i}", 168 | ) 169 | ) 170 | 171 | self.last_conv = ConvModule( 172 | kernel_size=kernel_size, 173 | strides=strides, 174 | filters=filters, 175 | activation=activation, 176 | kernel_regularizer=kernel_regularizer, 177 | bias_regularizer=bias_regularizer, 178 | name=f"{self.name}_conv_module_{nlayers - 1}", 179 | ) 180 | 181 | self.se = SEModule( 182 | kernel_size=kernel_size, 183 | strides=1, 184 | filters=filters, 185 | activation=activation, 186 | kernel_regularizer=kernel_regularizer, 187 | bias_regularizer=bias_regularizer, 188 | name=f"{self.name}_se", 189 | ) 190 | 191 | self.residual = None 192 | if residual: 193 | self.residual = ConvModule( 194 | kernel_size=kernel_size, 195 | strides=strides, 196 | filters=filters, 197 | activation="linear", 198 | kernel_regularizer=kernel_regularizer, 199 | bias_regularizer=bias_regularizer, 200 | name=f"{self.name}_residual", 201 | ) 202 | 203 | self.activation = get_activation(activation) 204 | 205 | def call( 206 | self, 207 | inputs, 208 | training=False, 209 | **kwargs, 210 | ): 211 | features, input_length = inputs 212 | outputs = features 213 | for conv in self.convs: 214 | outputs = conv(outputs, training=training) 215 | outputs = self.last_conv(outputs, training=training) 216 | input_length = math_util.get_reduced_length(input_length, self.last_conv.strides) 217 | outputs = self.se([outputs, input_length], training=training) 218 | if self.residual is not None: 219 | res = self.residual(features, training=training) 220 | outputs 
= tf.add(outputs, res) 221 | outputs = self.activation(outputs) 222 | return outputs, input_length 223 | 224 | 225 | class ContextNetEncoder(tf.keras.Model): 226 | def __init__( 227 | self, 228 | blocks: List[dict] = BLOCKS, 229 | alpha: float = 0.5, 230 | kernel_regularizer=L2, 231 | bias_regularizer=L2, 232 | name='contextnet_encoder', 233 | **kwargs, 234 | ): 235 | super(ContextNetEncoder, self).__init__(name=name, **kwargs) 236 | 237 | self.reshape = Reshape(name=f"{self.name}_reshape") 238 | 239 | self.blocks = [] 240 | for i, config in enumerate(blocks): 241 | self.blocks.append( 242 | ConvBlock( 243 | **config, 244 | alpha=alpha, 245 | kernel_regularizer=kernel_regularizer, 246 | bias_regularizer=bias_regularizer, 247 | name=f"{self.name}_block_{i}", 248 | ) 249 | ) 250 | 251 | def call( 252 | self, 253 | inputs, 254 | training=False, 255 | **kwargs, 256 | ): 257 | outputs = inputs # shape: [B, T, F, C] 258 | input_length = tf.expand_dims(tf.shape(inputs)[1], axis=0) # spec time duration 259 | outputs = self.reshape(outputs) 260 | for block in self.blocks: 261 | outputs, input_length = block([outputs, input_length], training=training) 262 | return outputs 263 | 264 | def ContextNet(input_shape=(128, 80, 1), num_classes=1, final_activation='sigmoid', pretrain=True): 265 | inp = tf.keras.layers.Input(shape=input_shape) 266 | backbone = ContextNetEncoder() 267 | out = backbone(inp) 268 | if pretrain: 269 | weights.load_pretrain(backbone, url=URL) 270 | out = tf.keras.layers.GlobalAveragePooling1D()(out) 271 | out = tf.keras.layers.Dense(32, activation='selu')(out) 272 | out = tf.keras.layers.Dense(num_classes, activation=final_activation)(out) 273 | model = tf.keras.models.Model(inp, out) 274 | return model -------------------------------------------------------------------------------- /audio_classification_models/layers/multihead_attention.py: -------------------------------------------------------------------------------- 1 | 2 | import typing 3 | 4 | import tensorflow as tf 5 | 6 | 7 | class MultiHeadAttention(tf.keras.layers.Layer): 8 | def __init__( 9 | self, 10 | num_heads, 11 | head_size, 12 | output_size: int = None, 13 | dropout: float = 0.0, 14 | use_projection_bias: bool = True, 15 | return_attn_coef: bool = False, 16 | kernel_initializer: typing.Union[str, typing.Callable] = "glorot_uniform", 17 | kernel_regularizer: typing.Union[str, typing.Callable] = None, 18 | kernel_constraint: typing.Union[str, typing.Callable] = None, 19 | bias_initializer: typing.Union[str, typing.Callable] = "zeros", 20 | bias_regularizer: typing.Union[str, typing.Callable] = None, 21 | bias_constraint: typing.Union[str, typing.Callable] = None, 22 | **kwargs, 23 | ): 24 | super(MultiHeadAttention, self).__init__(**kwargs) 25 | 26 | if output_size is not None and output_size < 1: 27 | raise ValueError("output_size must be a positive number") 28 | 29 | self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) 30 | self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) 31 | self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) 32 | self.bias_initializer = tf.keras.initializers.get(bias_initializer) 33 | self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer) 34 | self.bias_constraint = tf.keras.constraints.get(bias_constraint) 35 | 36 | self.head_size = head_size 37 | self.num_heads = num_heads 38 | self.output_size = output_size 39 | self.use_projection_bias = use_projection_bias 40 | self.return_attn_coef = return_attn_coef 41 | 42 | 
self.dropout = tf.keras.layers.Dropout(dropout, name="dropout")
43 |         self._droput_rate = dropout
44 | 
45 |     def build(
46 |         self,
47 |         input_shape,
48 |     ):
49 |         num_query_features = input_shape[0][-1]
50 |         num_key_features = input_shape[1][-1]
51 |         num_value_features = input_shape[2][-1] if len(input_shape) > 2 else num_key_features
52 |         output_size = self.output_size if self.output_size is not None else num_value_features
53 |         self.query_kernel = self.add_weight(
54 |             name="query_kernel",
55 |             shape=[self.num_heads, num_query_features, self.head_size],
56 |             initializer=self.kernel_initializer,
57 |             regularizer=self.kernel_regularizer,
58 |             constraint=self.kernel_constraint,
59 |         )
60 |         self.key_kernel = self.add_weight(
61 |             name="key_kernel",
62 |             shape=[self.num_heads, num_key_features, self.head_size],
63 |             initializer=self.kernel_initializer,
64 |             regularizer=self.kernel_regularizer,
65 |             constraint=self.kernel_constraint,
66 |         )
67 |         self.value_kernel = self.add_weight(
68 |             name="value_kernel",
69 |             shape=[self.num_heads, num_value_features, self.head_size],
70 |             initializer=self.kernel_initializer,
71 |             regularizer=self.kernel_regularizer,
72 |             constraint=self.kernel_constraint,
73 |         )
74 |         self.projection_kernel = self.add_weight(
75 |             name="projection_kernel",
76 |             shape=[self.num_heads, self.head_size, output_size],
77 |             initializer=self.kernel_initializer,
78 |             regularizer=self.kernel_regularizer,
79 |             constraint=self.kernel_constraint,
80 |         )
81 |         if self.use_projection_bias:
82 |             self.projection_bias = self.add_weight(
83 |                 name="projection_bias",
84 |                 shape=[output_size],
85 |                 initializer=self.bias_initializer,
86 |                 regularizer=self.bias_regularizer,
87 |                 constraint=self.bias_constraint,
88 |             )
89 |         else:
90 |             self.projection_bias = None
91 | 
92 |     def call_qkv(
93 |         self,
94 |         query,
95 |         key,
96 |         value,
97 |         training=False,
98 |     ):
99 |         # verify shapes
100 |         if key.shape[-2] != value.shape[-2]:
101 |             raise ValueError(
102 |                 "the number of elements in 'key' must be equal to the number of elements in 'value'"
103 |             )
104 |         # Linear transformations
105 |         query = tf.einsum("...NI,HIO->...NHO", query, self.query_kernel)
106 |         key = tf.einsum("...MI,HIO->...MHO", key, self.key_kernel)
107 |         value = tf.einsum("...MI,HIO->...MHO", value, self.value_kernel)
108 | 
109 |         return query, key, value
110 | 
111 |     def call_attention(
112 |         self,
113 |         query,
114 |         key,
115 |         value,
116 |         logits,
117 |         training=False,
118 |         mask=None,
119 |     ):
120 |         # mask = attention mask with shape [B, Tquery, Tkey], where 1 marks positions to attend to and 0 marks masked positions
121 |         if mask is not None:
122 |             if len(mask.shape) < 2:
123 |                 raise ValueError("'mask' must have at least 2 dimensions")
124 |             if query.shape[-3] != mask.shape[-2]:
125 |                 raise ValueError("mask's second to last dimension must be equal to the number of elements in 'query'")
126 |             if key.shape[-3] != mask.shape[-1]:
127 |                 raise ValueError("mask's last dimension must be equal to the number of elements in 'key'")
128 |         # apply mask
129 |         if mask is not None:
130 |             mask = tf.cast(mask, tf.float32)
131 | 
132 |             # possibly expand on the head dimension so broadcasting works
133 |             if len(mask.shape) != len(logits.shape):
134 |                 mask = tf.expand_dims(mask, -3)
135 | 
136 |             logits += -10e9 * (1.0 - mask)
137 | 
138 |         attn_coef = tf.nn.softmax(logits)
139 | 
140 |         # attention dropout
141 |         attn_coef_dropout = self.dropout(attn_coef, training=training)
142 | 
143 |         # attention * value
144 |         multihead_output = 
tf.einsum("...HNM,...MHI->...NHI", attn_coef_dropout, value) 145 | 146 | # Run the outputs through another linear projection layer. Recombining heads 147 | # is automatically done. 148 | output = tf.einsum("...NHI,HIO->...NO", multihead_output, self.projection_kernel) 149 | 150 | if self.projection_bias is not None: 151 | output += self.projection_bias 152 | 153 | return output, attn_coef 154 | 155 | def call( 156 | self, 157 | inputs, 158 | training=False, 159 | mask=None, 160 | **kwargs, 161 | ): 162 | query, key, value = inputs 163 | 164 | query, key, value = self.call_qkv(query, key, value, training=training) 165 | 166 | # Scale dot-product, doing the division to either query or key 167 | # instead of their product saves some computation 168 | depth = tf.constant(self.head_size, dtype=tf.float32) 169 | query /= tf.sqrt(depth) 170 | 171 | # Calculate dot product attention 172 | logits = tf.einsum("...NHO,...MHO->...HNM", query, key) 173 | 174 | output, attn_coef = self.call_attention(query, key, value, logits, training=training, mask=mask) 175 | 176 | if self.return_attn_coef: 177 | return output, attn_coef 178 | else: 179 | return output 180 | 181 | def compute_output_shape( 182 | self, 183 | input_shape, 184 | ): 185 | num_value_features = input_shape[2][-1] if len(input_shape) > 2 else input_shape[1][-1] 186 | output_size = self.output_size if self.output_size is not None else num_value_features 187 | 188 | output_shape = input_shape[0][:-1] + (output_size,) 189 | 190 | if self.return_attn_coef: 191 | num_query_elements = input_shape[0][-2] 192 | num_key_elements = input_shape[1][-2] 193 | attn_coef_shape = input_shape[0][:-2] + ( 194 | self.num_heads, 195 | num_query_elements, 196 | num_key_elements, 197 | ) 198 | 199 | return output_shape, attn_coef_shape 200 | else: 201 | return output_shape 202 | 203 | def get_config(self): 204 | config = super().get_config() 205 | 206 | config.update( 207 | head_size=self.head_size, 208 | num_heads=self.num_heads, 209 | output_size=self.output_size, 210 | dropout=self._droput_rate, 211 | use_projection_bias=self.use_projection_bias, 212 | return_attn_coef=self.return_attn_coef, 213 | kernel_initializer=tf.keras.initializers.serialize(self.kernel_initializer), 214 | kernel_regularizer=tf.keras.regularizers.serialize(self.kernel_regularizer), 215 | kernel_constraint=tf.keras.constraints.serialize(self.kernel_constraint), 216 | bias_initializer=tf.keras.initializers.serialize(self.bias_initializer), 217 | bias_regularizer=tf.keras.regularizers.serialize(self.bias_regularizer), 218 | bias_constraint=tf.keras.constraints.serialize(self.bias_constraint), 219 | ) 220 | 221 | return config 222 | 223 | 224 | class RelPositionMultiHeadAttention(MultiHeadAttention): 225 | def build( 226 | self, 227 | input_shape, 228 | ): 229 | num_pos_features = input_shape[-1][-1] 230 | self.pos_kernel = self.add_weight( 231 | name="pos_kernel", 232 | shape=[self.num_heads, num_pos_features, self.head_size], 233 | initializer=self.kernel_initializer, 234 | regularizer=self.kernel_regularizer, 235 | constraint=self.kernel_constraint, 236 | ) 237 | self.pos_bias_u = self.add_weight( 238 | name="pos_bias_u", 239 | shape=[self.num_heads, self.head_size], 240 | regularizer=self.kernel_regularizer, 241 | initializer=self.kernel_initializer, 242 | constraint=self.kernel_constraint, 243 | ) 244 | self.pos_bias_v = self.add_weight( 245 | name="pos_bias_v", 246 | shape=[self.num_heads, self.head_size], 247 | regularizer=self.kernel_regularizer, 248 | 
initializer=self.kernel_initializer, 249 | constraint=self.kernel_constraint, 250 | ) 251 | super(RelPositionMultiHeadAttention, self).build(input_shape[:-1]) 252 | 253 | @staticmethod 254 | def relative_shift(x): 255 | x_shape = tf.shape(x) 256 | x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]]) 257 | x = tf.reshape(x, [x_shape[0], x_shape[1], x_shape[3] + 1, x_shape[2]]) 258 | x = tf.reshape(x[:, :, 1:, :], x_shape) 259 | return x 260 | 261 | def call( 262 | self, 263 | inputs, 264 | training=False, 265 | mask=None, 266 | **kwargs, 267 | ): 268 | query, key, value, pos = inputs 269 | 270 | query, key, value = self.call_qkv(query, key, value, training=training) 271 | 272 | pos = tf.einsum("...MI,HIO->...MHO", pos, self.pos_kernel) 273 | 274 | query_with_u = query + self.pos_bias_u 275 | query_with_v = query + self.pos_bias_v 276 | 277 | logits_with_u = tf.einsum("...NHO,...MHO->...HNM", query_with_u, key) 278 | logits_with_v = tf.einsum("...NHO,...MHO->...HNM", query_with_v, pos) 279 | logits_with_v = self.relative_shift(logits_with_v) 280 | 281 | logits = logits_with_u + logits_with_v[:, :, :, : tf.shape(logits_with_u)[3]] 282 | 283 | depth = tf.constant(self.head_size, dtype=tf.float32) 284 | logits /= tf.sqrt(depth) 285 | 286 | output, attn_coef = self.call_attention(query, key, value, logits, training=training, mask=mask) 287 | 288 | if self.return_attn_coef: 289 | return output, attn_coef 290 | else: 291 | return output -------------------------------------------------------------------------------- /audio_classification_models/models/conformer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from ..utils import shape_util, weights 4 | from ..activations.glu import GLU 5 | from ..layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention 6 | from ..layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat 7 | from ..layers.subsampling import Conv2dSubsampling, VggSubsampling 8 | 9 | L2 = tf.keras.regularizers.l2(1e-6) 10 | URL = "https://github.com/awsaf49/audio_classification_models/releases/download/v1.0.8/conformer-encoder.h5" 11 | 12 | class FFModule(tf.keras.layers.Layer): 13 | def __init__( 14 | self, 15 | input_dim, 16 | dropout=0.0, 17 | fc_factor=0.5, 18 | kernel_regularizer=L2, 19 | bias_regularizer=L2, 20 | name="ff_module", 21 | **kwargs, 22 | ): 23 | super(FFModule, self).__init__(name=name, **kwargs) 24 | self.fc_factor = fc_factor 25 | self.ln = tf.keras.layers.LayerNormalization( 26 | name=f"{name}_ln", 27 | gamma_regularizer=kernel_regularizer, 28 | beta_regularizer=bias_regularizer, 29 | ) 30 | self.ffn1 = tf.keras.layers.Dense( 31 | 4 * input_dim, 32 | name=f"{name}_dense_1", 33 | kernel_regularizer=kernel_regularizer, 34 | bias_regularizer=bias_regularizer, 35 | ) 36 | self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation") 37 | self.do1 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_1") 38 | self.ffn2 = tf.keras.layers.Dense( 39 | input_dim, 40 | name=f"{name}_dense_2", 41 | kernel_regularizer=kernel_regularizer, 42 | bias_regularizer=bias_regularizer, 43 | ) 44 | self.do2 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_2") 45 | self.res_add = tf.keras.layers.Add(name=f"{name}_add") 46 | 47 | def call( 48 | self, 49 | inputs, 50 | training=False, 51 | **kwargs, 52 | ): 53 | outputs = self.ln(inputs, training=training) 54 | outputs = self.ffn1(outputs, training=training) 55 | outputs = 
self.swish(outputs) 56 | outputs = self.do1(outputs, training=training) 57 | outputs = self.ffn2(outputs, training=training) 58 | outputs = self.do2(outputs, training=training) 59 | outputs = self.res_add([inputs, self.fc_factor * outputs]) 60 | return outputs 61 | 62 | def get_config(self): 63 | conf = super(FFModule, self).get_config() 64 | conf.update({"fc_factor": self.fc_factor}) 65 | conf.update(self.ln.get_config()) 66 | conf.update(self.ffn1.get_config()) 67 | conf.update(self.swish.get_config()) 68 | conf.update(self.do1.get_config()) 69 | conf.update(self.ffn2.get_config()) 70 | conf.update(self.do2.get_config()) 71 | conf.update(self.res_add.get_config()) 72 | return conf 73 | 74 | 75 | class MHSAModule(tf.keras.layers.Layer): 76 | def __init__( 77 | self, 78 | head_size, 79 | num_heads, 80 | dropout=0.0, 81 | mha_type="relmha", 82 | kernel_regularizer=L2, 83 | bias_regularizer=L2, 84 | name="mhsa_module", 85 | **kwargs, 86 | ): 87 | super(MHSAModule, self).__init__(name=name, **kwargs) 88 | self.ln = tf.keras.layers.LayerNormalization( 89 | name=f"{name}_ln", 90 | gamma_regularizer=kernel_regularizer, 91 | beta_regularizer=bias_regularizer, 92 | ) 93 | if mha_type == "relmha": 94 | self.mha = RelPositionMultiHeadAttention( 95 | name=f"{name}_mhsa", 96 | head_size=head_size, 97 | num_heads=num_heads, 98 | kernel_regularizer=kernel_regularizer, 99 | bias_regularizer=bias_regularizer, 100 | ) 101 | elif mha_type == "mha": 102 | self.mha = MultiHeadAttention( 103 | name=f"{name}_mhsa", 104 | head_size=head_size, 105 | num_heads=num_heads, 106 | kernel_regularizer=kernel_regularizer, 107 | bias_regularizer=bias_regularizer, 108 | ) 109 | else: 110 | raise ValueError("mha_type must be either 'mha' or 'relmha'") 111 | self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout") 112 | self.res_add = tf.keras.layers.Add(name=f"{name}_add") 113 | self.mha_type = mha_type 114 | 115 | def call( 116 | self, 117 | inputs, 118 | training=False, 119 | mask=None, 120 | **kwargs, 121 | ): 122 | inputs, pos = inputs # pos is positional encoding 123 | outputs = self.ln(inputs, training=training) 124 | if self.mha_type == "relmha": 125 | outputs = self.mha([outputs, outputs, outputs, pos], training=training, mask=mask) 126 | else: 127 | outputs = outputs + pos 128 | outputs = self.mha([outputs, outputs, outputs], training=training, mask=mask) 129 | outputs = self.do(outputs, training=training) 130 | outputs = self.res_add([inputs, outputs]) 131 | return outputs 132 | 133 | def get_config(self): 134 | conf = super(MHSAModule, self).get_config() 135 | conf.update({"mha_type": self.mha_type}) 136 | conf.update(self.ln.get_config()) 137 | conf.update(self.mha.get_config()) 138 | conf.update(self.do.get_config()) 139 | conf.update(self.res_add.get_config()) 140 | return conf 141 | 142 | 143 | class ConvModule(tf.keras.layers.Layer): 144 | def __init__( 145 | self, 146 | input_dim, 147 | kernel_size=32, 148 | dropout=0.0, 149 | depth_multiplier=1, 150 | kernel_regularizer=L2, 151 | bias_regularizer=L2, 152 | name="conv_module", 153 | **kwargs, 154 | ): 155 | super(ConvModule, self).__init__(name=name, **kwargs) 156 | self.ln = tf.keras.layers.LayerNormalization() 157 | self.pw_conv_1 = tf.keras.layers.Conv2D( 158 | filters=2 * input_dim, 159 | kernel_size=1, 160 | strides=1, 161 | padding="valid", 162 | name=f"{name}_pw_conv_1", 163 | kernel_regularizer=kernel_regularizer, 164 | bias_regularizer=bias_regularizer, 165 | ) 166 | self.glu = GLU(name=f"{name}_glu") 167 | self.dw_conv = 
tf.keras.layers.DepthwiseConv2D( 168 | kernel_size=(kernel_size, 1), 169 | strides=1, 170 | padding="same", 171 | name=f"{name}_dw_conv", 172 | depth_multiplier=depth_multiplier, 173 | depthwise_regularizer=kernel_regularizer, 174 | bias_regularizer=bias_regularizer, 175 | ) 176 | self.bn = tf.keras.layers.BatchNormalization( 177 | name=f"{name}_bn", 178 | gamma_regularizer=kernel_regularizer, 179 | beta_regularizer=bias_regularizer, 180 | ) 181 | self.swish = tf.keras.layers.Activation( 182 | tf.nn.swish, 183 | name=f"{name}_swish_activation", 184 | ) 185 | self.pw_conv_2 = tf.keras.layers.Conv2D( 186 | filters=input_dim, 187 | kernel_size=1, 188 | strides=1, 189 | padding="valid", 190 | name=f"{name}_pw_conv_2", 191 | kernel_regularizer=kernel_regularizer, 192 | bias_regularizer=bias_regularizer, 193 | ) 194 | self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout") 195 | self.res_add = tf.keras.layers.Add(name=f"{name}_add") 196 | 197 | def call( 198 | self, 199 | inputs, 200 | training=False, 201 | **kwargs, 202 | ): 203 | outputs = self.ln(inputs, training=training) 204 | B, T, E = shape_util.shape_list(outputs) 205 | outputs = tf.reshape(outputs, [B, T, 1, E]) 206 | outputs = self.pw_conv_1(outputs, training=training) 207 | outputs = self.glu(outputs) 208 | outputs = self.dw_conv(outputs, training=training) 209 | outputs = self.bn(outputs, training=training) 210 | outputs = self.swish(outputs) 211 | outputs = self.pw_conv_2(outputs, training=training) 212 | outputs = tf.reshape(outputs, [B, T, E]) 213 | outputs = self.do(outputs, training=training) 214 | outputs = self.res_add([inputs, outputs]) 215 | return outputs 216 | 217 | def get_config(self): 218 | conf = super(ConvModule, self).get_config() 219 | conf.update(self.ln.get_config()) 220 | conf.update(self.pw_conv_1.get_config()) 221 | conf.update(self.glu.get_config()) 222 | conf.update(self.dw_conv.get_config()) 223 | conf.update(self.bn.get_config()) 224 | conf.update(self.swish.get_config()) 225 | conf.update(self.pw_conv_2.get_config()) 226 | conf.update(self.do.get_config()) 227 | conf.update(self.res_add.get_config()) 228 | return conf 229 | 230 | 231 | class ConformerBlock(tf.keras.layers.Layer): 232 | def __init__( 233 | self, 234 | input_dim, 235 | dropout=0.0, 236 | fc_factor=0.5, 237 | head_size=36, 238 | num_heads=4, 239 | mha_type="relmha", 240 | kernel_size=32, 241 | depth_multiplier=1, 242 | kernel_regularizer=L2, 243 | bias_regularizer=L2, 244 | name="conformer_block", 245 | **kwargs, 246 | ): 247 | super(ConformerBlock, self).__init__(name=name, **kwargs) 248 | self.ffm1 = FFModule( 249 | input_dim=input_dim, 250 | dropout=dropout, 251 | fc_factor=fc_factor, 252 | name=f"{name}_ff_module_1", 253 | kernel_regularizer=kernel_regularizer, 254 | bias_regularizer=bias_regularizer, 255 | ) 256 | self.mhsam = MHSAModule( 257 | mha_type=mha_type, 258 | head_size=head_size, 259 | num_heads=num_heads, 260 | dropout=dropout, 261 | name=f"{name}_mhsa_module", 262 | kernel_regularizer=kernel_regularizer, 263 | bias_regularizer=bias_regularizer, 264 | ) 265 | self.convm = ConvModule( 266 | input_dim=input_dim, 267 | kernel_size=kernel_size, 268 | dropout=dropout, 269 | name=f"{name}_conv_module", 270 | depth_multiplier=depth_multiplier, 271 | kernel_regularizer=kernel_regularizer, 272 | bias_regularizer=bias_regularizer, 273 | ) 274 | self.ffm2 = FFModule( 275 | input_dim=input_dim, 276 | dropout=dropout, 277 | fc_factor=fc_factor, 278 | name=f"{name}_ff_module_2", 279 | kernel_regularizer=kernel_regularizer, 280 | 
bias_regularizer=bias_regularizer,
281 |         )
282 |         self.ln = tf.keras.layers.LayerNormalization(
283 |             name=f"{name}_ln",
284 |             gamma_regularizer=kernel_regularizer,
285 |             beta_regularizer=bias_regularizer,
286 |         )
287 | 
288 |     def call(
289 |         self,
290 |         inputs,
291 |         training=False,
292 |         mask=None,
293 |         **kwargs,
294 |     ):
295 |         inputs, pos = inputs  # pos is positional encoding
296 |         outputs = self.ffm1(inputs, training=training, **kwargs)
297 |         outputs = self.mhsam([outputs, pos], training=training, mask=mask, **kwargs)
298 |         outputs = self.convm(outputs, training=training, **kwargs)
299 |         outputs = self.ffm2(outputs, training=training, **kwargs)
300 |         outputs = self.ln(outputs, training=training)
301 |         return outputs
302 | 
303 |     def get_config(self):
304 |         conf = super(ConformerBlock, self).get_config()
305 |         conf.update(self.ffm1.get_config())
306 |         conf.update(self.mhsam.get_config())
307 |         conf.update(self.convm.get_config())
308 |         conf.update(self.ffm2.get_config())
309 |         conf.update(self.ln.get_config())
310 |         return conf
311 | 
312 | 
313 | class ConformerEncoder(tf.keras.Model):
314 |     def __init__(
315 |         self,
316 |         subsampling={'type': 'conv2d', 'filters': 144, 'kernel_size': 3, 'strides': 2},
317 |         positional_encoding="sinusoid",
318 |         dmodel=144,
319 |         num_blocks=16,
320 |         mha_type="relmha",
321 |         head_size=36,
322 |         num_heads=4,
323 |         kernel_size=32,
324 |         depth_multiplier=1,
325 |         fc_factor=0.5,
326 |         dropout=0.1,
327 |         kernel_regularizer=L2,
328 |         bias_regularizer=L2,
329 |         name="conformer_encoder",
330 |         **kwargs,
331 |     ):
332 |         super(ConformerEncoder, self).__init__(name=name, **kwargs)
333 | 
334 |         subsampling_name = subsampling.pop("type", "conv2d")
335 |         if subsampling_name == "vgg":
336 |             subsampling_class = VggSubsampling
337 |         elif subsampling_name == "conv2d":
338 |             subsampling_class = Conv2dSubsampling
339 |         else:
340 |             raise ValueError("subsampling must be either 'conv2d' or 'vgg'")
341 | 
342 |         self.conv_subsampling = subsampling_class(
343 |             **subsampling,
344 |             name=f"{name}_subsampling",
345 |             kernel_regularizer=kernel_regularizer,
346 |             bias_regularizer=bias_regularizer,
347 |         )
348 | 
349 |         if positional_encoding == "sinusoid":
350 |             self.pe = PositionalEncoding(name=f"{name}_pe")
351 |         elif positional_encoding == "sinusoid_v2":
352 |             self.pe = PositionalEncoding(alpha=2, beta=0, name=f"{name}_pe")
353 |         elif positional_encoding == "sinusoid_concat":
354 |             self.pe = PositionalEncodingConcat(name=f"{name}_pe")
355 |         elif positional_encoding == "sinusoid_concat_v2":
356 |             self.pe = PositionalEncodingConcat(alpha=2, beta=-1, name=f"{name}_pe")
357 |         elif positional_encoding == "subsampling":
358 |             self.pe = tf.keras.layers.Activation("linear", name=f"{name}_pe")
359 |         else:
360 |             raise ValueError(
361 |                 "positional_encoding must be either 'sinusoid', \
362 |                 'sinusoid_concat', 'sinusoid_v2', 'sinusoid_concat_v2' or 'subsampling'"
363 |             )
364 | 
365 |         self.linear = tf.keras.layers.Dense(
366 |             dmodel,
367 |             name=f"{name}_linear",
368 |             kernel_regularizer=kernel_regularizer,
369 |             bias_regularizer=bias_regularizer,
370 |         )
371 |         self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
372 | 
373 |         self.conformer_blocks = []
374 |         for i in range(num_blocks):
375 |             conformer_block = ConformerBlock(
376 |                 input_dim=dmodel,
377 |                 dropout=dropout,
378 |                 fc_factor=fc_factor,
379 |                 head_size=head_size,
380 |                 num_heads=num_heads,
381 |                 mha_type=mha_type,
382 |                 kernel_size=kernel_size,
383 |                 depth_multiplier=depth_multiplier,
384 |                 kernel_regularizer=kernel_regularizer,
385 | bias_regularizer=bias_regularizer, 386 | name=f"{name}_block_{i}", 387 | ) 388 | self.conformer_blocks.append(conformer_block) 389 | 390 | def call( 391 | self, 392 | inputs, 393 | training=False, 394 | mask=None, 395 | **kwargs, 396 | ): 397 | # input with shape [B, T, V1, V2] 398 | outputs = self.conv_subsampling(inputs, training=training) 399 | outputs = self.linear(outputs, training=training) 400 | pe = self.pe(outputs) 401 | outputs = self.do(outputs, training=training) 402 | for cblock in self.conformer_blocks: 403 | outputs = cblock([outputs, pe], training=training, mask=mask, **kwargs) 404 | return outputs 405 | 406 | def get_config(self): 407 | conf = super(ConformerEncoder, self).get_config() 408 | conf.update(self.conv_subsampling.get_config()) 409 | conf.update(self.linear.get_config()) 410 | conf.update(self.do.get_config()) 411 | conf.update(self.pe.get_config()) 412 | for cblock in self.conformer_blocks: 413 | conf.update(cblock.get_config()) 414 | return conf 415 | 416 | def Conformer(input_shape = (128, 80, 1),num_classes=1, final_activation='sigmoid', pretrain=True): 417 | inp = tf.keras.layers.Input(shape=input_shape) 418 | backbone = ConformerEncoder() 419 | out = backbone(inp) 420 | if pretrain: 421 | weights.load_pretrain(backbone, url=URL) 422 | out = tf.keras.layers.GlobalAveragePooling1D()(out) 423 | out = tf.keras.layers.Dense(32, activation='selu')(out) 424 | out = tf.keras.layers.Dense(num_classes, activation=final_activation)(out) 425 | model = tf.keras.models.Model(inp, out) 426 | return model 427 | 428 | --------------------------------------------------------------------------------
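Putting the pieces together, a minimal end-to-end sketch of the `Conformer` factory above (mirroring the README's Usage section; the dummy input matches the default `input_shape=(128, 80, 1)`, i.e. a batch of spectrogram patches):

```py
import tensorflow as tf
import audio_classification_models as acm

# pretrain=True downloads the encoder weights from the GitHub release URL
# baked into the module (see the URL constant at the top of conformer.py).
model = acm.Conformer(input_shape=(128, 80, 1), num_classes=1,
                      final_activation='sigmoid', pretrain=True)

x = tf.random.normal([2, 128, 80, 1])  # [batch, time, mel_bins, channels]
probs = model(x, training=False)       # shape [2, 1], sigmoid scores
```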