├── keras_cv_attention_models ├── version.py ├── pytorch_backend │ ├── optimizers.py │ ├── callbacks.py │ ├── metrics.py │ ├── utils.py │ └── losses.py ├── imagenet │ ├── metrics.py │ └── losses.py ├── tf_functional.py ├── clip │ ├── __init__.py │ ├── tf_data.py │ └── torch_data.py ├── coco │ ├── info.py │ └── __init__.py ├── beit │ ├── eva.py │ ├── meta_transformer.py │ ├── flexivit.py │ ├── dinov2.py │ ├── eva02.py │ └── vit.py ├── model_surgery │ ├── __init__.py │ └── README.md ├── ghostnet │ ├── ghostnet.py │ ├── README.md │ └── __init__.py ├── mobilenetv3_family │ ├── fbnetv3.py │ ├── lcnet.py │ └── tinynet.py ├── resnet_family │ ├── resnet_deep.py │ └── resnext.py ├── cspnext │ ├── __init__.py │ └── README.md ├── gpt2 │ ├── __init__.py │ └── README.md ├── gpvit │ ├── README.md │ └── __init__.py ├── llama2 │ ├── __init__.py │ └── README.md ├── inceptionnext │ ├── __init__.py │ └── README.md ├── moganet │ ├── __init__.py │ └── README.md ├── hiera │ └── __init__.py ├── iformer │ ├── __init__.py │ └── README.md ├── efficientnet │ └── efficientnet_edgetpu.py ├── fasternet │ ├── __init__.py │ └── README.md ├── resnest │ ├── README.md │ └── __init__.py ├── nat │ └── dinat.py ├── mobilevit │ └── mobilevit_v2.py ├── pvt │ ├── __init__.py │ └── README.md ├── repvit │ └── __init__.py ├── gcvit │ └── __init__.py ├── halonet │ └── README.md ├── convnext │ └── convnext_v2.py ├── keras_core_functional.py ├── stable_diffusion │ └── __init__.py ├── davit │ └── README.md ├── segment_anything │ └── __init__.py ├── fastervit │ └── __init__.py ├── tinyvit │ └── __init__.py ├── models.py ├── nfnets │ └── README.md ├── fastvit │ └── __init__.py ├── aotnet │ └── README.md ├── edgenext │ └── README.md └── __init__.py ├── .gitignore ├── kecam └── __init__.py ├── LICENSE ├── .github └── workflows │ └── publish-to-test-pypi.yml ├── tests ├── test_models_tf.py └── test_switch_to_deploy_tf.py ├── setup_kecam.py └── setup.py /keras_cv_attention_models/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.4.3" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.h5 3 | *.pth 4 | *.pth.tar 5 | *.pt 6 | *.ckpt 7 | *.npy 8 | *.npz 9 | *.onnx 10 | *.json 11 | *.tflite 12 | *.ipynb 13 | *.swp 14 | *.log 15 | *.tar 16 | *.tar.gz 17 | *.bin 18 | *.keras 19 | .ipynb_checkpoints 20 | checkpoints 21 | datasets 22 | logs 23 | -------------------------------------------------------------------------------- /kecam/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import * 2 | _sub_modules = {__name__ + "." 
+ kk: vv for kk, vv in locals().items() if not kk.startswith("_")} 3 | 4 | import sys as _sys 5 | _sys.modules.update(_sub_modules) 6 | 7 | from keras_cv_attention_models.version import __version__ 8 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pytorch_backend/optimizers.py: -------------------------------------------------------------------------------- 1 | class Optimizer: 2 | def __init__( 3 | self, 4 | name, 5 | weight_decay=0, 6 | clipnorm=None, 7 | clipvalue=None, 8 | global_clipnorm=None, 9 | use_ema=False, 10 | ema_momentum=0.99, 11 | ema_overwrite_frequency=None, 12 | ): 13 | pass 14 | -------------------------------------------------------------------------------- /keras_cv_attention_models/imagenet/metrics.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.backend import metrics 2 | 3 | 4 | class LossMeanMetricWrapper(metrics.Metric): 5 | def __init__(self, loss_func, loss_attr_name): 6 | self.loss_func, self.loss_attr_name = loss_func, loss_attr_name 7 | super().__init__(name=loss_attr_name) 8 | 9 | def reset_state(self): 10 | self.value, self.passed_steps = 0.0, 0 11 | 12 | def update_state(self, y_true, y_pred, sample_weight=None): 13 | self.value += getattr(self.loss_func, self.loss_attr_name) 14 | self.passed_steps += 1 15 | 16 | def result(self): 17 | return self.value / self.passed_steps 18 | -------------------------------------------------------------------------------- /keras_cv_attention_models/tf_functional.py: -------------------------------------------------------------------------------- 1 | from tensorflow.nn import * 2 | from tensorflow.math import * 3 | 4 | from tensorflow import ( 5 | abs, 6 | cast, 7 | clip_by_value, 8 | complex, 9 | concat, 10 | convert_to_tensor, 11 | expand_dims, 12 | gather, 13 | gather_nd, 14 | linspace, 15 | map_fn, 16 | matmul, 17 | norm, 18 | pad, 19 | print, 20 | range, 21 | repeat, 22 | reshape, 23 | shape, 24 | sign, 25 | split, 26 | squeeze, 27 | stack, 28 | tensor_scatter_nd_update, 29 | tile, 30 | transpose, 31 | unstack, 32 | where, 33 | zeros, 34 | ) 35 | from tensorflow.image import resize, extract_patches, non_max_suppression_with_scores 36 | from tensorflow.signal import irfft2d, rfft2d 37 | 38 | 39 | def assign(parameter, data): 40 | parameter.assign(data) 41 | -------------------------------------------------------------------------------- /keras_cv_attention_models/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend as __backend__ 2 | 3 | from keras_cv_attention_models.clip.tokenizer import SimpleTokenizer, GPT2Tokenizer, TikToken, SentencePieceTokenizer 4 | from keras_cv_attention_models.clip.models import ( 5 | add_text_model_index_header, 6 | build_text_model_from_image_model, 7 | convert_to_clip_model, 8 | split_to_image_text_model, 9 | RunPrediction, 10 | ) 11 | from keras_cv_attention_models.plot_func import plot_hists, show_batch_sample 12 | 13 | if __backend__.is_tensorflow_backend: 14 | from keras_cv_attention_models.clip import tf_data as data 15 | from keras_cv_attention_models.clip.tf_data import init_dataset 16 | else: 17 | from keras_cv_attention_models.clip import torch_data as data 18 | from keras_cv_attention_models.clip.torch_data import init_dataset 19 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 leondgarse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /keras_cv_attention_models/coco/info.py: -------------------------------------------------------------------------------- 1 | COCO_LABELS = """person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign, 2 | parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, 3 | suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket, 4 | bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot dog, pizza, donut, 5 | cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone, microwave, oven, 6 | toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear, hair drier, toothbrush""" 7 | COCO_80_LABEL_DICT = {id: ii.strip() for id, ii in enumerate(COCO_LABELS.split(","))} 8 | INVALID_ID_90 = [11, 25, 28, 29, 44, 65, 67, 68, 70, 82] 9 | COCO_90_LABEL_DICT = {id: ii for id, ii in zip(set(range(90)) - set(INVALID_ID_90), COCO_80_LABEL_DICT.values())} 10 | COCO_90_LABEL_DICT.update({ii: "Unknown" for ii in INVALID_ID_90}) 11 | COCO_80_to_90_LABEL_DICT = {id_80: id_90 for id_80, id_90 in enumerate(set(range(90)) - set(INVALID_ID_90))} 12 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/eva.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def EVA(layer_scale=0, use_abs_pos_emb=True, model_name="eva", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | patch_size = kwargs.pop("patch_size", 14) 8 | force_reload_mismatch = patch_size != 14 # If patch_size not 14, force reload pos_emb and stem_conv weights 9 | return Beit(**locals(), **kwargs) 10 | 11 | 12 | @register_model 13 | def EvaLargePatch14(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 14 | embed_dim = 1024 15 | 
depth = 24 16 | num_heads = 16 17 | attn_qkv_bias = True 18 | return EVA(**locals(), model_name="eva_large_patch14", **kwargs) 19 | 20 | 21 | @register_model 22 | def EvaGiantPatch14(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 23 | mlp_ratio = 6144 / 1408 24 | embed_dim = 1408 25 | depth = 40 26 | num_heads = 16 27 | return EVA(**locals(), model_name="eva_giant_patch14", **kwargs) 28 | -------------------------------------------------------------------------------- /keras_cv_attention_models/model_surgery/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.model_surgery.model_surgery import ( 2 | SAMModel, 3 | DropConnect, 4 | add_l2_regularizer_2_model, 5 | align_pyramide_feature_output_by_image_data_format, 6 | change_model_input_shape, 7 | convert_to_dynamic_input_shape, 8 | convert_dense_to_conv, 9 | convert_extract_patches_to_conv, 10 | convert_gelu_to_approximate, 11 | convert_gelu_and_extract_patches_for_tflite, # [Deprecated], use convert_gelu_to_approximate -> convert_extract_patches_to_conv instead 12 | convert_groups_conv2d_2_split_conv2d, 13 | convert_to_mixed_float16, 14 | convert_mixed_float16_to_float32, 15 | convert_to_fixed_batch_size, 16 | convert_to_fused_conv_bn_model, 17 | convert_to_token_label_model, 18 | convert_layers_to_deploy_inplace, 19 | count_params, 20 | export_onnx, 21 | fuse_sequential_conv_strict, 22 | fuse_channel_affine_to_conv_dense, 23 | fuse_reparam_blocks, 24 | fuse_distill_head, 25 | get_actual_survival_probabilities, 26 | get_actual_drop_connect_rates, 27 | get_flops, 28 | get_global_avg_pool_layer_id, 29 | get_pyramide_feature_layers, 30 | prepare_for_tflite, 31 | remove_layer_single_input, 32 | replace_ReLU, 33 | replace_add_with_drop_connect, 34 | replace_add_with_stochastic_depth, 35 | replace_stochastic_depth_with_add, 36 | split_model_to_head_body_tail_by_blocks, 37 | swin_convert_pos_emb_mlp_to_MlpPairwisePositionalEmbedding_weights, 38 | ) 39 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/meta_transformer.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def MetaTransformer( 6 | use_patch_bias=False, 7 | use_pre_norm=True, 8 | use_abs_pos_emb=True, 9 | attn_qv_bias=False, 10 | attn_qkv_bias=True, 11 | use_mean_pooling_head=False, 12 | layer_scale=0, 13 | model_name="meta_transformer", 14 | **kwargs, 15 | ): 16 | kwargs.pop("kwargs", None) 17 | return Beit(**locals(), **kwargs) 18 | 19 | 20 | @register_model 21 | def MetaTransformerBasePatch16( 22 | input_shape=(384, 384, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="laion_2b", **kwargs 23 | ): 24 | depth = 12 25 | embed_dim = 768 26 | num_heads = 12 27 | patch_size = kwargs.pop("patch_size", 16) 28 | force_reload_mismatch = patch_size != 16 # If patch_size not match, force reload pos_emb and stem_conv weights 29 | return MetaTransformer(**locals(), model_name="meta_transformer_base_patch16", **kwargs) 30 | 31 | 32 | @register_model 33 | def MetaTransformerLargePatch14( 34 | input_shape=(336, 336, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="laion_2b", **kwargs 35 | 
): 36 | depth = 24 37 | embed_dim = 1024 38 | num_heads = 16 39 | patch_size = kwargs.pop("patch_size", 14) 40 | force_reload_mismatch = patch_size != 14 # If patch_size not match, force reload pos_emb and stem_conv weights 41 | return MetaTransformer(**locals(), model_name="meta_transformer_large_patch14", **kwargs) 42 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/flexivit.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def FlexiViT( 6 | attn_qv_bias=False, 7 | attn_qkv_bias=True, 8 | use_abs_pos_emb=True, 9 | use_abs_pos_emb_on_cls_token=False, # no_embed_class in timm 10 | layer_scale=0, 11 | use_mean_pooling_head=False, 12 | model_name="flexivit", 13 | **kwargs, 14 | ): 15 | kwargs.pop("kwargs", None) 16 | patch_size = kwargs.pop("patch_size", 16) 17 | force_reload_mismatch = patch_size != 16 # If patch_size not 16, force reload pos_emb and stem_conv weights 18 | return Beit(**locals(), **kwargs) 19 | 20 | 21 | @register_model 22 | def FlexiViTSmall(input_shape=(240, 240, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 23 | embed_dim = 384 24 | depth = 12 25 | num_heads = 6 26 | return FlexiViT(**locals(), model_name="flexivit_small", **kwargs) 27 | 28 | 29 | @register_model 30 | def FlexiViTBase(input_shape=(240, 240, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 31 | embed_dim = 768 32 | depth = 12 33 | num_heads = 12 34 | return FlexiViT(**locals(), model_name="flexivit_base", **kwargs) 35 | 36 | 37 | @register_model 38 | def FlexiViTLarge(input_shape=(240, 240, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 39 | embed_dim = 1024 40 | depth = 24 41 | num_heads = 16 42 | return FlexiViT(**locals(), model_name="flexivit_large", **kwargs) 43 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-test-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: build and publish python 🐍 distributions 📦 to pypi and testpypi 8 | runs-on: ubuntu-24.04 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.9 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.9 15 | - name: Install pypa/build 16 | run: >- 17 | CUDA_VISIBLE_DEVICES='-1' python -m 18 | pip install 19 | build setuptools wheel pytest pytest-timeout pillow ftfy regex tqdm tensorflow tf-keras torch torchvision sentencepiece 20 | --extra-index-url https://download.pytorch.org/whl/cpu 21 | --user 22 | - name: Build a binary wheel and a source tarball 23 | run: >- 24 | python -m 25 | build 26 | --sdist 27 | --wheel 28 | --outdir dist/ 29 | . 
30 | - name: Build a kecam binary wheel and a source tarball 31 | run: >- 32 | python setup_kecam.py sdist bdist_wheel 33 | - name: Run tests 34 | run: >- 35 | CUDA_VISIBLE_DEVICES='-1' pytest -vv --durations=0 ./tests 36 | - name: Run PyTorch backend tests 37 | run: >- 38 | CUDA_VISIBLE_DEVICES='-1' KECAM_BACKEND='torch' pytest -vv --durations=0 ./tests/test_models.py 39 | - name: Publish distribution 📦 to Test PyPI 40 | if: startsWith(github.ref, 'refs/tags') 41 | uses: pypa/gh-action-pypi-publish@master 42 | with: 43 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 44 | repository_url: https://test.pypi.org/legacy/ 45 | - name: Publish distribution 📦 to PyPI 46 | if: startsWith(github.ref, 'refs/tags') 47 | uses: pypa/gh-action-pypi-publish@master 48 | with: 49 | password: ${{ secrets.PYPI_API_TOKEN }} 50 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/dinov2.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def DINOv2(layer_scale=1.0, use_abs_pos_emb=True, use_cat_head=True, attn_qkv_bias=True, model_name="dinov2", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | patch_size = kwargs.pop("patch_size", 14) 8 | force_reload_mismatch = patch_size != 14 # If patch_size not 14, force reload pos_emb and stem_conv weights 9 | return Beit(**locals(), **kwargs) 10 | 11 | 12 | @register_model 13 | def DINOv2_ViT_Small14(input_shape=(518, 518, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 14 | embed_dim = 384 15 | depth = 12 16 | num_heads = 6 17 | return DINOv2(**locals(), model_name="dinov2_vit_small14", **kwargs) 18 | 19 | 20 | @register_model 21 | def DINOv2_ViT_Base14(input_shape=(518, 518, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 22 | embed_dim = 768 23 | depth = 12 24 | num_heads = 12 25 | return DINOv2(**locals(), model_name="dinov2_vit_base14", **kwargs) 26 | 27 | 28 | @register_model 29 | def DINOv2_ViT_Large14(input_shape=(518, 518, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 30 | embed_dim = 1024 31 | depth = 24 32 | num_heads = 16 33 | return DINOv2(**locals(), model_name="dinov2_vit_large14", **kwargs) 34 | 35 | 36 | @register_model 37 | def DINOv2_ViT_Giant14(input_shape=(518, 518, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 38 | embed_dim = 1536 39 | depth = 40 40 | num_heads = 24 41 | use_gated_mlp = True 42 | mlp_ratio = 4096 / 1536 43 | return DINOv2(**locals(), model_name="dinov2_vit_giant14", **kwargs) 44 | -------------------------------------------------------------------------------- /tests/test_models_tf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append(".") 4 | import keras_cv_attention_models # Needs to set TF_USE_LEGACY_KERAS=1 env firstly 5 | 6 | import pytest 7 | from keras_cv_attention_models.backend import models 8 | from keras_cv_attention_models.test_images import cat 9 | 10 | """ Recognition models HorNet*GF / NFNet / VOLO defination """ 11 | 12 | 13 | def test_NFNet_defination(): 14 | mm = keras_cv_attention_models.nfnets.NFNetF0(pretrained=None) 15 | assert 
isinstance(mm, models.Model) 16 | 17 | mm = keras_cv_attention_models.nfnets.ECA_NFNetL1(pretrained=None, num_classes=0) 18 | assert isinstance(mm, models.Model) 19 | 20 | 21 | def test_VOLO_defination(): 22 | mm = keras_cv_attention_models.volo.VOLO_d3(pretrained=None) 23 | assert isinstance(mm, models.Model) 24 | 25 | mm = keras_cv_attention_models.volo.VOLO_d4(pretrained=None, num_classes=0) 26 | assert isinstance(mm, models.Model) 27 | 28 | 29 | """ Recognition models EfficientNetV2B1_preprocessing / HorNet / VOLO prediction """ 30 | 31 | 32 | def test_EfficientNetV2B1_preprocessing_predict(): 33 | mm = keras_cv_attention_models.efficientnet.EfficientNetV2B1(pretrained="imagenet", include_preprocessing=True) 34 | pred = mm(mm.preprocess_input(cat())) 35 | out = mm.decode_predictions(pred)[0][0] 36 | 37 | assert out[1] == "Egyptian_cat" 38 | 39 | 40 | def test_HorNetTinyGF_new_shape_predict(): 41 | mm = keras_cv_attention_models.hornet.HorNetTinyGF(input_shape=(174, 255, 3), pretrained="imagenet") 42 | pred = mm(mm.preprocess_input(cat())) 43 | out = mm.decode_predictions(pred)[0][0] 44 | 45 | assert out[1] == "Egyptian_cat" 46 | 47 | 48 | def test_VOLO_d1_new_shape_predict(): 49 | mm = keras_cv_attention_models.volo.VOLO_d1(input_shape=(512, 512, 3), pretrained="imagenet") 50 | pred = mm(mm.preprocess_input(cat())) 51 | out = mm.decode_predictions(pred)[0][0] 52 | 53 | assert out[1] == "Egyptian_cat" 54 | -------------------------------------------------------------------------------- /keras_cv_attention_models/ghostnet/ghostnet.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.ghostnet.ghostnet_v2 import GhostNetV2 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def GhostNet( 6 | kernel_sizes=[3, 3, 3, 5, 5, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5], 7 | first_ghost_channels=[16, 48, 72, 72, 120, 240, 200, 184, 184, 480, 672, 672, 960, 960, 960, 960], 8 | out_channels=[16, 24, 24, 40, 40, 80, 80, 80, 80, 112, 112, 160, 160, 160, 160, 160], 9 | se_ratios=[0, 0, 0, 0.25, 0.25, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0, 0.25, 0, 0.25], 10 | strides=[1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], 11 | stem_width=16, 12 | stem_strides=2, 13 | width_mul=1.0, 14 | num_ghost_module_v1_stacks=-1, # num of `ghost_module` stcks on the head, others are `ghost_module_multiply`, set `-1` for all using `ghost_module` 15 | output_conv_filter=-1, # -1 for first_ghost_channels[-1] * width_mul 16 | input_shape=(224, 224, 3), 17 | num_classes=1000, 18 | activation="relu", 19 | classifier_activation="softmax", 20 | dropout=0, 21 | pretrained=None, 22 | model_name="ghostnet", 23 | kwargs=None, 24 | ): 25 | return GhostNetV2(**locals()) 26 | 27 | 28 | @register_model 29 | def GhostNet_050(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 30 | return GhostNet(**locals(), width_mul=0.5, model_name="ghostnet_050", **kwargs) 31 | 32 | 33 | @register_model 34 | def GhostNet_100(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 35 | return GhostNet(**locals(), model_name="ghostnet_100", **kwargs) 36 | 37 | 38 | @register_model 39 | def GhostNet_130(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 40 | return GhostNet(**locals(), width_mul=1.3, model_name="ghostnet_130", **kwargs) 41 | 
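A minimal usage sketch for the `GhostNet` variants defined above, mirroring the prediction pattern shown in the GPViT README elsewhere in this repository; the `pretrained="imagenet"` weights and the `preprocess_input` / `decode_predictions` helpers are assumed to be attached to GhostNet models in the same way, so treat this as an illustration rather than a verified snippet.
```py
from keras_cv_attention_models import ghostnet
from skimage.data import chelsea  # sample cat image used in the repo's other examples

# Assumption: GhostNet_100 ships "imagenet" weights and exposes the kecam-style helpers.
mm = ghostnet.GhostNet_100(pretrained="imagenet")
preds = mm(mm.preprocess_input(chelsea()))  # preprocess a single HWC image and run inference
print(mm.decode_predictions(preds))  # top-1 should be a cat class if the weights loaded correctly
```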
-------------------------------------------------------------------------------- /keras_cv_attention_models/beit/eva02.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def EVA02(mlp_ratio=4 * 2 / 3, layer_scale=0, use_abs_pos_emb=True, use_rot_pos_emb=True, use_gated_mlp=True, activation="swish", model_name="eva02", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | patch_size = kwargs.pop("patch_size", 14) 8 | force_reload_mismatch = patch_size != 14 # If patch_size not 14, force reload pos_emb and stem_conv weights 9 | return Beit(**locals(), **kwargs) 10 | 11 | 12 | @register_model 13 | def EVA02TinyPatch14(input_shape=(336, 336, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_in22k_ft1k", **kwargs): 14 | embed_dim = 192 15 | depth = 12 16 | num_heads = 3 17 | return EVA02(**locals(), model_name="eva02_tiny_patch14", **kwargs) 18 | 19 | 20 | @register_model 21 | def EVA02SmallPatch14(input_shape=(336, 336, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_in22k_ft1k", **kwargs): 22 | embed_dim = 384 23 | depth = 12 24 | num_heads = 6 25 | return EVA02(**locals(), model_name="eva02_small_patch14", **kwargs) 26 | 27 | 28 | @register_model 29 | def EVA02BasePatch14( 30 | input_shape=(448, 448, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_in22k_ft22k_ft1k", **kwargs 31 | ): 32 | embed_dim = 768 33 | depth = 12 34 | num_heads = 12 35 | use_norm_mlp = True # scale_mlp = True 36 | return EVA02(**locals(), model_name="eva02_base_patch14", **kwargs) 37 | 38 | 39 | @register_model 40 | def EVA02LargePatch14( 41 | input_shape=(448, 448, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_m38m_ft22k_ft1k", **kwargs 42 | ): 43 | embed_dim = 1024 44 | depth = 24 45 | num_heads = 16 46 | use_norm_mlp = True # scale_mlp = True 47 | return EVA02(**locals(), model_name="eva02_large_patch14", **kwargs) 48 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilenetv3_family/fbnetv3.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilenetv3_family.mobilenetv3 import MobileNetV3 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def FBNetV3( 6 | num_blocks=[2, 4, 1, 4, 1, 4, 1, 5, 1, 5, 1], 7 | out_channels=[16, 24, 40, 40, 72, 72, 120, 120, 184, 184, 224], 8 | expands=[1, [4, 2, 2, 2], 5, 3, 5, 3, 5, 3, 6, 4, 6], 9 | kernel_sizes=[3, 5, 5, 5, 5, 3, 3, 5, 3, 5, 5], 10 | strides=[1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1], 11 | activations="hard_swish", 12 | se_ratios=[0, 0, 0.25, 0.25, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.25], 13 | se_activation=("hard_swish", "hard_sigmoid_torch"), 14 | se_limit_round_down=0.95, 15 | use_expanded_se_ratio=False, 16 | output_num_features=1984, 17 | use_output_feature_bias=False, 18 | model_name="fbnetv3", 19 | **kwargs, 20 | ): 21 | kwargs.pop("kwargs", None) 22 | return MobileNetV3(**locals(), **kwargs) 23 | 24 | 25 | @register_model 26 | def FBNetV3B(input_shape=(256, 256, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 27 | return FBNetV3(**locals(), model_name="fbnetv3_b", **kwargs) 28 | 29 | 30 | @register_model 31 | 
def FBNetV3D(input_shape=(256, 256, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 32 | num_blocks = [2, 6, 1, 4, 1, 4, 1, 6, 1, 5, 1] 33 | out_channels = [16, 24, 40, 40, 72, 72, 128, 128, 208, 208, 240] 34 | expands = [1, [5, 2, 2, 2, 2, 2], 4, 3, 5, 3, 5, 3, 6, 5, 6] 35 | kernel_sizes = [3, 3, 5, 3, 3, 3, 3, 5, 3, 5, 5] 36 | stem_width = 24 37 | return FBNetV3(**locals(), model_name="fbnetv3_d", **kwargs) 38 | 39 | 40 | @register_model 41 | def FBNetV3G(input_shape=(256, 256, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 42 | num_blocks = [3, 5, 1, 4, 1, 4, 1, 8, 1, 6, 2] 43 | out_channels = [24, 40, 56, 56, 104, 104, 160, 160, 264, 264, 288] 44 | expands = [1, [4, 2, 2, 2, 2], 4, 3, 5, 3, 5, 3, 6, 5, 6] 45 | kernel_sizes = [3, 5, 5, 5, 5, 3, 3, 5, 3, 5, 5] 46 | stem_width = 32 47 | return FBNetV3(**locals(), model_name="fbnetv3_g", **kwargs) 48 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnet_family/resnet_deep.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.aotnet import AotNet 2 | from keras_cv_attention_models.models import register_model 3 | from keras_cv_attention_models.download_and_load import reload_model_weights 4 | 5 | PRETRAINED_DICT = { 6 | "resnet50d": {"imagenet": "1b71933a82b058ba1e605ee5c01f64b2"}, 7 | "resnet101d": {"imagenet": "79b075be5cf222cff2bced7a5a117623"}, 8 | "resnet152d": {"imagenet": "0a15299b9abe1fee3ae06d9a59d13a3f"}, 9 | "resnet200d": {"imagenet": "b5961494e0072c342b838c77ef52ddc5"}, 10 | } 11 | 12 | 13 | def ResNetD(num_blocks, input_shape=(224, 224, 3), pretrained="imagenet", stem_type="deep", strides=2, shortcut_type="avg", **kwargs): 14 | strides = strides if isinstance(strides, (list, tuple)) else [1, 2, 2, strides] 15 | model = AotNet(num_blocks, input_shape=input_shape, stem_type=stem_type, strides=strides, shortcut_type=shortcut_type, **kwargs) 16 | reload_model_weights(model, pretrained_dict=PRETRAINED_DICT, sub_release="resnet_family", pretrained=pretrained) 17 | return model 18 | 19 | 20 | @register_model 21 | def ResNet50D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 22 | num_blocks = [3, 4, 6, 3] 23 | return ResNetD(**locals(), model_name="resnet50d", **kwargs) 24 | 25 | 26 | @register_model 27 | def ResNet101D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 28 | num_blocks = [3, 4, 23, 3] 29 | return ResNetD(**locals(), model_name="resnet101d", **kwargs) 30 | 31 | 32 | @register_model 33 | def ResNet152D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 34 | num_blocks = [3, 8, 36, 3] 35 | return ResNetD(**locals(), model_name="resnet152d", **kwargs) 36 | 37 | 38 | @register_model 39 | def ResNet200D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 40 | num_blocks = [3, 24, 36, 3] 41 | return ResNetD(**locals(), model_name="resnet200d", **kwargs) 42 | -------------------------------------------------------------------------------- /keras_cv_attention_models/clip/tf_data.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from 
keras_cv_attention_models.imagenet.tf_data import init_mean_std_by_rescale_mode, tf_imread, random_crop_and_resize_image, build_custom_dataset 3 | 4 | 5 | def image_process(image, image_size=(224, 224), is_train=True): 6 | image = tf_imread(image) 7 | if is_train: 8 | image = random_crop_and_resize_image(image, image_size, scale=(0.9, 1.0), method="bicubic", antialias=True)[0] 9 | else: 10 | image = tf.image.resize(image, image_size, method="bicubic", antialias=True) 11 | image = tf.cast(image, tf.float32) 12 | image.set_shape([*image_size, 3]) 13 | return image 14 | 15 | 16 | def init_dataset(data_path, caption_tokenizer, batch_size=64, image_size=224, rescale_mode="torch"): 17 | dataset, total_images, num_classes, num_channels = build_custom_dataset(data_path, with_info=True, caption_tokenizer=caption_tokenizer) 18 | 19 | mean, std = init_mean_std_by_rescale_mode(rescale_mode) 20 | image_size = image_size if isinstance(image_size, (list, tuple)) else [image_size, image_size] 21 | 22 | AUTOTUNE, buffer_size, seed = tf.data.AUTOTUNE, batch_size * 100, None 23 | train_pre_batch = lambda data_point: (image_process(data_point["image"], image_size, is_train=True), data_point["caption"]) 24 | y_true = tf.range(batch_size) 25 | train_post_batch = lambda xx, caption: (((xx - mean) / std, caption), y_true) 26 | 27 | train_dataset = dataset["train"] 28 | train_dataset = train_dataset.shuffle(buffer_size, seed=seed).map(train_pre_batch, num_parallel_calls=AUTOTUNE) 29 | train_dataset = train_dataset.batch(batch_size, drop_remainder=True).map(train_post_batch, num_parallel_calls=AUTOTUNE) 30 | train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE) 31 | 32 | test_dataset = dataset.get("validation", dataset.get("test", None)) 33 | if test_dataset is not None: 34 | test_pre_batch = lambda data_point: (image_process(data_point["image"], image_size, is_train=False), data_point["caption"]) 35 | test_dataset = test_dataset.map(test_pre_batch, num_parallel_calls=AUTOTUNE) 36 | test_dataset = test_dataset.batch(batch_size, drop_remainder=True).map(train_post_batch) 37 | 38 | return train_dataset, test_dataset 39 | -------------------------------------------------------------------------------- /keras_cv_attention_models/cspnext/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.cspnext.cspnext import ( 2 | CSPNeXt, 3 | CSPNeXtTiny, 4 | CSPNeXtSmall, 5 | CSPNeXtMedium, 6 | CSPNeXtLarge, 7 | CSPNeXtXLarge, 8 | ) 9 | 10 | __head_doc__ = """ 11 | Keras implementation of [Github open-mmlab/mmdetection/rtmdet](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet#classification). 12 | CSPNeXt is the backbone from Paper [PDF 2212.07784 RTMDet: An Empirical Study of Designing Real-Time Object Detectors](https://arxiv.org/abs/2212.07784). 13 | """ 14 | 15 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 16 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 17 | activation: activation used in whole model, default `gelu`. 18 | dropout: dropout rate if top layers is included. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | pretrained: one of None or "imagenet". 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 
26 | """ 27 | 28 | CSPNeXt.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | out_channels: output channels for each stack. 32 | stem_width: hidden dimension stem blocks. 33 | model_name: string, model name. 34 | """ + __tail_doc__ + """ 35 | Model architectures: 36 | | Model | Params | FLOPs | Input | Top1 Acc | 37 | | ------------- | ------ | ----- | ----- | -------- | 38 | | CSPNeXtTiny | 2.73M | 0.34G | 224 | 69.44 | 39 | | CSPNeXtSmall | 4.89M | 0.66G | 224 | 74.41 | 40 | | CSPNeXtMedium | 13.05M | 1.92G | 224 | 79.27 | 41 | | CSPNeXtLarge | 27.16M | 4.19G | 224 | 81.30 | 42 | | CSPNeXtXLarge | 48.85M | 7.75G | 224 | 82.10 | 43 | """ 44 | 45 | CSPNeXtTiny.__doc__ = __head_doc__ + """ 46 | Args: 47 | """ + __tail_doc__ 48 | 49 | CSPNeXtSmall.__doc__ = CSPNeXtTiny.__doc__ 50 | CSPNeXtMedium.__doc__ = CSPNeXtTiny.__doc__ 51 | CSPNeXtLarge.__doc__ = CSPNeXtTiny.__doc__ 52 | CSPNeXtXLarge.__doc__ = CSPNeXtTiny.__doc__ 53 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilenetv3_family/lcnet.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilenetv3_family.mobilenetv3 import MobileNetV3 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def LCNet( 6 | num_blocks=[1, 2, 2, 1, 5, 2], 7 | out_channels=[32, 64, 128, 256, 256, 512], 8 | expands=1, 9 | kernel_sizes=[3, 3, 3, 3, 5, 5], 10 | strides=[1, 2, 2, 2, 1, 2], 11 | activations="hard_swish", 12 | disable_shortcut=True, 13 | use_blocks_output_activation=True, 14 | se_ratios=[0, 0, 0, 0, 0, 0.25], 15 | output_num_features=1280, 16 | use_additional_output_conv=False, 17 | model_name="lcnet", 18 | **kwargs, 19 | ): 20 | kwargs.pop("kwargs", None) 21 | return MobileNetV3(**locals(), **kwargs) 22 | 23 | 24 | @register_model 25 | def LCNet050(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 26 | return LCNet(**locals(), width_ratio=0.5, model_name="lcnet_050", **kwargs) 27 | 28 | 29 | @register_model 30 | def LCNet075(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 31 | return LCNet(**locals(), width_ratio=0.75, model_name="lcnet_075", **kwargs) 32 | 33 | 34 | @register_model 35 | def LCNet100(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 36 | return LCNet(**locals(), model_name="lcnet_100", **kwargs) 37 | 38 | 39 | @register_model 40 | def LCNet150(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 41 | use_output_feature_bias = False 42 | return LCNet(**locals(), width_ratio=1.5, model_name="lcnet_150", **kwargs) 43 | 44 | 45 | @register_model 46 | def LCNet200(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | use_output_feature_bias = False 48 | return LCNet(**locals(), width_ratio=2.0, model_name="lcnet_200", **kwargs) 49 | 50 | 51 | @register_model 52 | def LCNet250(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 53 | use_output_feature_bias = False 54 | return LCNet(**locals(), width_ratio=2.5, model_name="lcnet_250", **kwargs) 55 | -------------------------------------------------------------------------------- 
/keras_cv_attention_models/pytorch_backend/callbacks.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | 5 | class Callback: 6 | def __init__(self): 7 | self.validation_data = None 8 | self.model = None 9 | 10 | def set_params(self, params): 11 | self.params = params 12 | 13 | def set_model(self, model): 14 | self.model = model 15 | 16 | def on_train_batch_begin(self, batch, logs=None): 17 | pass 18 | 19 | def on_train_batch_end(self, batch, logs=None): 20 | pass 21 | 22 | def on_epoch_begin(self, cur_epoch, logs=None): 23 | pass 24 | 25 | def on_epoch_end(self, cur_epoch, logs=None): 26 | pass 27 | 28 | def on_test_batch_begin(self, batch, logs=None): 29 | pass 30 | 31 | def on_test_batch_end(self, batch, logs=None): 32 | pass 33 | 34 | def on_test_begin(self, logs=None): 35 | pass 36 | 37 | def on_test_end(self, logs=None): 38 | pass 39 | 40 | 41 | class TerminateOnNaN(Callback): 42 | def on_train_batch_end(self, batch, logs=None): 43 | logs = logs or {} 44 | loss = logs.get("loss") 45 | if loss is not None: 46 | if not np.isfinite(loss): 47 | print("\nError: Invalid loss, terminating training") 48 | # self.model.stop_training = True 49 | sys.exit() 50 | 51 | 52 | class TensorBoard(Callback): 53 | def __init__(self, log_dir="logs", histogram_freq=1, **kwargs): 54 | super().__init__() 55 | self.log_dir, self.histogram_freq = log_dir, histogram_freq 56 | try: 57 | from torch.utils.tensorboard import SummaryWriter 58 | 59 | self.tensorboard_writer = SummaryWriter(self.log_dir) 60 | print(">>>> Tensorboard writer created, summary will be written to '{}', view with 'tensorboard --logdir {}'".format(log_dir, log_dir)) 61 | except: 62 | self.tensorboard_writer = None 63 | print("[Error] tensorboard not installed, try `pip install tensorboard`") 64 | 65 | def on_epoch_end(self, cur_epoch, logs=None): 66 | if self.tensorboard_writer is None: 67 | return 68 | logs = logs or {} 69 | for kk, vv in logs.items(): 70 | self.tensorboard_writer.add_scalar(kk, vv, cur_epoch) 71 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpt2/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.gpt2.gpt2 import ( 2 | GPT2, 3 | GPT2_Base, 4 | GPT2_Medium, 5 | GPT2_Large, 6 | GPT2_XLarge, 7 | RunPrediction, 8 | PositionalIndex, 9 | CausalMask, 10 | load_weights_from_huggingface, 11 | ) 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github openai/gpt-2](https://github.com/openai/gpt-2). 15 | Paper [Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ vocab_size: model vocab size. 19 | max_block_size: number of tokens generated in each sample. 20 | include_top: boolean value if including output Dense head layer. Set false to exclude the head layer. 21 | dropout: float value for dropout rate for Embedding layer and attention blocks. 22 | activation: activation used in whole model, default `gelu/app`. 23 | pretrained: None or one of ["webtext", "huggingface"]. 24 | - if "webtext", will try to download and load ported weights if available. 25 | - if "huggingface", will try converting and loading weights from huggingface `transformers` package. 26 | - if None, will initialize model with random weights. 27 | 28 | Returns: 29 | A `keras.Model` instance.
30 | """ 31 | 32 | GPT2.__doc__ = __head_doc__ + """ 33 | Args: 34 | num_blocks: num of `attention_mlp_block`s. 35 | embedding_size: `attention_mlp_block` block embedding size. 36 | num_heads: num of heads. 37 | block_use_bias: boolean value if using bias for `attention_mlp_block` Dense layers. 38 | model_name: string, model name. 39 | """ + __tail_doc__ + """ 40 | Model architectures: 41 | | Model | Params | FLOPs | vocab_size | LAMBADA PPL | 42 | | ------------| ------- | ------- | ---------- | ----------- | 43 | | GPT2_Base | 163.04M | 146.42G | 50257 | 35.13 | 44 | | GPT2_Medium | 406.29M | 415.07G | 50257 | 15.60 | 45 | | GPT2_Large | 838.36M | 890.28G | 50257 | 10.87 | 46 | | GPT2_XLarge | 1.638B | 1758.3G | 50257 | 8.63 | 47 | """ 48 | 49 | GPT2_Base.__doc__ = __head_doc__ + """ 50 | Args: 51 | """ + __tail_doc__ 52 | 53 | GPT2_Medium.__doc__ = GPT2_Base.__doc__ 54 | GPT2_Large.__doc__ = GPT2_Base.__doc__ 55 | GPT2_XLarge.__doc__ = GPT2_Base.__doc__ 56 | -------------------------------------------------------------------------------- /setup_kecam.py: -------------------------------------------------------------------------------- 1 | """Setup""" 2 | 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 11 | long_description = f.read() 12 | long_description = long_description.replace( 13 | "](keras_cv_attention_models", "](https://github.com/leondgarse/keras_cv_attention_models/tree/main/keras_cv_attention_models" 14 | ) 15 | 16 | exec(open("keras_cv_attention_models/version.py").read()) 17 | setup( 18 | name="kecam", 19 | version=__version__, 20 | description="Tensorflow keras computer vision attention models. Alias kecam. https://github.com/leondgarse/keras_cv_attention_models", 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/leondgarse/keras_cv_attention_models", 24 | author="Leondgarse", 25 | author_email="leondgarse@gmail.com", 26 | classifiers=[ 27 | # How mature is this project? Common values are 28 | # 3 - Alpha 29 | # 4 - Beta 30 | # 5 - Production/Stable 31 | "Development Status :: 3 - Alpha", 32 | "Intended Audience :: Developers", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Programming Language :: Python :: 3.6", 36 | "Programming Language :: Python :: 3.7", 37 | "Programming Language :: Python :: 3.8", 38 | "Topic :: Scientific/Engineering", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Topic :: Software Development", 41 | "Topic :: Software Development :: Libraries", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | ], 44 | # Note that this is a string of words separated by whitespace, not a list. 
45 | keywords="tensorflow keras cv attention pretrained models kecam", 46 | packages=find_packages(exclude=["tests"]) + ["keras_cv_attention_models.pytorch_backend"], 47 | include_package_data=True, 48 | install_requires=["h5py", "pillow", "tqdm", "ftfy", "regex"], # ftfy and regex required for language models 49 | python_requires=">=3.6", 50 | license="Apache 2.0", 51 | ) 52 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpvit/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras GPViT___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github ChenhongyiYang/GPViT](https://github.com/ChenhongyiYang/GPViT). Paper [PDF 2212.06795 GPVIT: A HIGH RESOLUTION NON-HIERARCHICAL VISION TRANSFORMER WITH GROUP PROPAGATION](https://arxiv.org/pdf/2212.06795.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | -------- | ------ | ------ | ----- | -------- | -------- | 12 | | GPViT_L1 | 9.59M | 6.15G | 224 | 80.5 | [gpvit_l1_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l1_224_imagenet.h5) | 13 | | GPViT_L2 | 24.2M | 15.74G | 224 | 83.4 | [gpvit_l2_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l2_224_imagenet.h5) | 14 | | GPViT_L3 | 36.7M | 23.54G | 224 | 84.1 | [gpvit_l3_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l3_224_imagenet.h5) | 15 | | GPViT_L4 | 75.5M | 48.29G | 224 | 84.3 | [gpvit_l4_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l4_224_imagenet.h5) | 16 | ## Usage 17 | ```py 18 | from keras_cv_attention_models import gpvit 19 | 20 | # Will download and load pretrained imagenet weights. 21 | mm = gpvit.GPViT_L1(pretrained="imagenet") 22 | 23 | # Run prediction 24 | import tensorflow as tf 25 | from tensorflow import keras 26 | from skimage.data import chelsea 27 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 28 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 29 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 30 | # [('n02124075', 'Egyptian_cat', 0.7434748), ('n02123045', 'tabby', 0.089776225), ...] 31 | ``` 32 | **Change input resolution**. 33 | ```py 34 | from keras_cv_attention_models import gpvit 35 | mm = gpvit.GPViT_L1(input_shape=(128, 192, 3), pretrained="imagenet") 36 | # >>>> Load pretrained from: ~/.keras/models/gp_vit_l1_224_imagenet.h5 37 | # >>>> Reload mismatched weights: 224 -> (128, 192) 38 | # >>>> Reload layer: positional_embedding 39 | 40 | # Run prediction 41 | from skimage.data import chelsea 42 | preds = mm(mm.preprocess_input(chelsea())) 43 | print(mm.decode_predictions(preds)) 44 | # [('n02124075', 'Egyptian_cat', 0.8140152), ('n02123045', 'tabby', 0.05595901), ...] 
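# A couple of follow-up checks, given here as an illustrative sketch.
# The rebuilt model reports the new static input shape:
print(mm.input_shape)
# (None, 128, 192, 3)
# A headless feature extractor can be built the same way by passing `num_classes=0`,
# matching the `num_classes=0` usage in tests/test_models_tf.py; kept commented as an untested sketch:
# features_model = gpvit.GPViT_L1(input_shape=(128, 192, 3), num_classes=0, pretrained="imagenet")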
45 | ``` 46 | *** 47 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/vit.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def ViT(attn_qv_bias=False, attn_qkv_bias=True, use_abs_pos_emb=True, layer_scale=0, use_mean_pooling_head=False, model_name="vit", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | return Beit(**locals(), **kwargs) 8 | 9 | 10 | def ViTText( 11 | vocab_size=49408, 12 | max_block_size=77, 13 | text_positional_dropout=0, 14 | text_use_positional_embedding=True, 15 | include_top=True, 16 | layer_norm_epsilon=1e-5, 17 | activation="gelu/quick", 18 | model_name="vit_text", 19 | **kwargs, 20 | ): 21 | attn_qv_bias = kwargs.pop("attn_qv_bias", False) 22 | attn_qkv_bias = kwargs.pop("attn_qkv_bias", True) 23 | use_abs_pos_emb = kwargs.pop("use_abs_pos_emb", True) 24 | layer_scale = kwargs.pop("layer_scale", 0) 25 | use_mean_pooling_head = kwargs.pop("use_mean_pooling_head", False) 26 | kwargs.pop("kwargs", None) 27 | return Beit(**locals(), **kwargs) 28 | 29 | 30 | @register_model 31 | def ViTTinyPatch16(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 32 | embed_dim = 192 33 | depth = 12 34 | num_heads = 3 35 | patch_size = kwargs.pop("patch_size", 16) 36 | return ViT(**locals(), model_name="vit_tiny_patch16", **kwargs) 37 | 38 | 39 | @register_model 40 | def ViTBasePatch16(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 41 | embed_dim = 768 42 | depth = 12 43 | num_heads = 12 44 | patch_size = kwargs.pop("patch_size", 16) 45 | return ViT(**locals(), model_name="vit_base_patch16", **kwargs) 46 | 47 | 48 | @register_model 49 | def ViTLargePatch14(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 50 | embed_dim = 1024 51 | depth = 24 52 | num_heads = 16 53 | patch_size = kwargs.pop("patch_size", 14) 54 | return ViT(**locals(), model_name="vit_large_patch14", **kwargs) 55 | 56 | 57 | @register_model 58 | def ViTTextLargePatch14(vocab_size=49408, max_block_size=77, activation="gelu/quick", include_top=True, pretrained="clip", **kwargs): 59 | embed_dim = 768 60 | depth = 12 61 | num_heads = 12 62 | patch_size = kwargs.pop("patch_size", 14) 63 | return ViTText(**locals(), model_name="vit_text_large_patch14", **kwargs) 64 | -------------------------------------------------------------------------------- /keras_cv_attention_models/coco/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend 2 | 3 | from keras_cv_attention_models.coco import eval_func, anchors_func 4 | from keras_cv_attention_models.coco.eval_func import DecodePredictions, COCOEvalCallback 5 | from keras_cv_attention_models.coco.anchors_func import ( 6 | get_anchors_mode_parameters, 7 | get_anchors, 8 | get_anchor_free_anchors, 9 | get_yolor_anchors, 10 | get_anchors_mode_by_anchors, 11 | get_pyramid_levels_by_anchors, 12 | decode_bboxes, 13 | ) 14 | from keras_cv_attention_models.coco.info import COCO_80_LABEL_DICT, COCO_90_LABEL_DICT, COCO_80_to_90_LABEL_DICT 15 | from 
keras_cv_attention_models.plot_func import draw_bboxes, show_image_with_bboxes 16 | from keras_cv_attention_models.plot_func import show_detection_batch_sample as show_batch_sample 17 | 18 | if backend.is_tensorflow_backend: 19 | from keras_cv_attention_models.coco import tf_data as data 20 | from keras_cv_attention_models.coco import tf_losses as losses 21 | from keras_cv_attention_models.coco.tf_data import aspect_aware_resize_and_crop_image, init_mean_std_by_rescale_mode, init_dataset 22 | 23 | data.init_dataset.__doc__ = """ Init dataset by name. 24 | Args: 25 | data_name: the registered dataset name from `tensorflow_datasets`. 26 | input_shape: input shape. 27 | batch_size: batch size. 28 | buffer_size: dataset shuffle buffer size. 29 | info_only: boolean value if returns dataset info only. 30 | max_labels_per_image: . 31 | anchors_mode: . 32 | anchor_pyramid_levels: . 33 | anchor_aspect_ratios: . 34 | anchor_num_scales: . 35 | anchor_scale: . 36 | anchor_scale: . 37 | cutmix_alpha: cutmix applying probability. 38 | rescale_mode: one of ["tf", "torch", "raw01", "raw"]. Detail in `data.init_mean_std_by_rescale_mode`. Or specific `(mean, std)` like `(128.0, 128.0)`. 39 | random_crop_mode: . 40 | mosaic_mix_prob: . 41 | resize_method: one of ["nearest", "bilinear", "bicubic"]. Resize method for `tf.image.resize`. 42 | resize_antialias: boolean value if using antialias for `tf.image.resize`. 43 | magnitude: randaug magnitude. 44 | num_layers: randaug num_layers. 45 | augment_kwargs: randaug kwargs. Too many to list them all. 46 | 47 | Returns: train_dataset, test_dataset, total_images, num_classes, steps_per_epoch 48 | """ 49 | else: 50 | from keras_cv_attention_models.coco import torch_data as data 51 | from keras_cv_attention_models.coco import torch_losses as losses 52 | from keras_cv_attention_models.coco.torch_data import aspect_aware_resize_and_crop_image, init_mean_std_by_rescale_mode, init_dataset 53 | -------------------------------------------------------------------------------- /keras_cv_attention_models/llama2/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.llama2.llama2 import ( 2 | LLaMA2, 3 | LLaMA2_15M, 4 | LLaMA2_42M, 5 | LLaMA2_110M, 6 | LLaMA2_1B, 7 | LLaMA2_7B, 8 | RunPrediction, 9 | PositionalEncodingFourierRot1D, 10 | RMSNorm, 11 | convert_huggingface_weights_to_h5, 12 | ) 13 | 14 | __head_doc__ = """ 15 | Keras implementation of [Github facebookresearch/llama](https://github.com/facebookresearch/llama). 16 | Paper [PDF 2307.09288 Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/pdf/2307.09288.pdf). 17 | """ 18 | 19 | __tail_doc__ = """ vocab_size: model vocab size. 20 | max_block_size: number of tokens generated in each sample. 21 | include_top: boolean value if including output Dense head layer. Set false to exclude the head layer. 22 | dropout: float value for dropout rate for Embedding layer and attention blocks. 23 | activation: activation used in whole model, default `swish`. 24 | pretrained: None or "tiny_stories", or specific ".pt" or ".h5" file. 25 | - if "tiny_stories" or "tiny_llama_1.1B_chat_v0.4", will try to download and load ported weights if available. 26 | - if "xxx.pt", will try converting and loading weights from .pt file. 27 | - if "xxx.h5", will just load weights. 28 | - if None, will initialize model with random weights. 29 | 30 | Returns: 31 | A `keras.Model` instance.
32 | """ 33 | 34 | LLaMA2.__doc__ = __head_doc__ + """ 35 | Args: 36 | num_blocks: num of `attention_fft_block`s. 37 | embedding_size: `attention_fft_block` block embedding size. 38 | hidden_divisible: int value making fft block hidden layer size multiple of large power of 2. 39 | num_heads: num of heads. 40 | num_kv_heads: int value specific key value heads, num_heads should be divisible by num_kv_heads. Default -1 for equal with num_heads. 41 | block_use_bias: boolean value if using bias for `attention_fft_block` Dense layers. 42 | model_name: string, model name. 43 | """ + __tail_doc__ + """ 44 | Model architectures: 45 | | Model | Params | FLOPs | vocab_size | Val loss | 46 | | ----------- | ------ | ------ | ---------- | -------- | 47 | | LLaMA2_15M | 24.41M | 4.06G | 32000 | 1.072 | 48 | | LLaMA2_42M | 58.17M | 50.7G | 32000 | 0.847 | 49 | | LLaMA2_110M | 134.1M | 130.2G | 32000 | 0.760 | 50 | | LLaMA2_1B | 1.10B | 2.50T | 32003 | | 51 | | LLaMA2_7B | 6.74B | 14.54T | 32000 | | 52 | """ 53 | 54 | LLaMA2_15M.__doc__ = __head_doc__ + """ 55 | Args: 56 | """ + __tail_doc__ 57 | 58 | LLaMA2_42M.__doc__ = LLaMA2_15M.__doc__ 59 | LLaMA2_110M.__doc__ = LLaMA2_15M.__doc__ 60 | LLaMA2_1B.__doc__ = LLaMA2_15M.__doc__ 61 | LLaMA2_7B.__doc__ = LLaMA2_15M.__doc__ 62 | -------------------------------------------------------------------------------- /keras_cv_attention_models/inceptionnext/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.inceptionnext.inceptionnext import InceptionNeXt, InceptionNeXtTiny, InceptionNeXtSmall, InceptionNeXtBase 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github sail-sg/inceptionnext](https://github.com/sail-sg/inceptionnext). 5 | Paper [PDF 2303.16900 InceptionNeXt: When Inception Meets ConvNeXt](https://arxiv.org/pdf/2303.16900.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: int value indicates layer scale init value for each stack. Default `[0, 0, 1e-6, 1e-6]`, 0 for not using. 17 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | InceptionNeXt.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dims: output channels for each stack. 32 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 33 | model_name: string, model name. 
34 | """ + __tail_doc__ + """ 35 | Model architectures: 36 | | Model | Params | FLOP s | Input | Top1 Acc | 37 | | ------------------ | ------ | ------ | ----- | -------- | 38 | | InceptionNeXtTiny | 28.05M | 4.21G | 224 | 82.3 | 39 | | InceptionNeXtSmall | 49.37M | 8.39G | 224 | 83.5 | 40 | | InceptionNeXtBase | 86.67M | 14.88G | 224 | 84.0 | 41 | | | 86.67M | 43.73G | 384 | 85.2 | 42 | """ 43 | 44 | InceptionNeXtTiny.__doc__ = __head_doc__ + """ 45 | Args: 46 | """ + __tail_doc__ 47 | 48 | InceptionNeXtSmall.__doc__ = InceptionNeXtTiny.__doc__ 49 | InceptionNeXtBase.__doc__ = InceptionNeXtTiny.__doc__ 50 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pytorch_backend/metrics.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.pytorch_backend import functional 2 | 3 | BUILDIN_METRICS = {} 4 | 5 | 6 | def register_metrics(name=None): 7 | def decorator(arg): 8 | registered_names = name or [arg.__name__] 9 | registered_names = registered_names if isinstance(registered_names, (list, tuple)) else [registered_names] 10 | for registered_name in registered_names: 11 | if registered_name in BUILDIN_METRICS: 12 | raise ValueError(f"{registered_name} has already been registered to " f"{BUILDIN_METRICS[registered_name]}") 13 | BUILDIN_METRICS[registered_name] = arg 14 | return arg 15 | 16 | return decorator 17 | 18 | 19 | class Metric: 20 | def __init__(self, name=None, **kwargs): 21 | super().__init__() 22 | self.name = name 23 | self.eval_only = False 24 | self.reset_state() 25 | 26 | def reset_state(self): 27 | pass 28 | 29 | def update_state(self, y_true, y_pred, sample_weight=None): 30 | pass 31 | 32 | def result(self): 33 | pass 34 | 35 | 36 | @register_metrics(name=["acc", "accuracy"]) 37 | class Accuracy(Metric): 38 | def __init__(self, name="acc"): 39 | super().__init__(name=name) 40 | 41 | def reset_state(self): 42 | self.sum_value, self.passed_steps = 0.0, 0 43 | 44 | def update_state(self, y_true, y_pred, sample_weight=None): 45 | y_pred = functional.argmax(y_pred, axis=-1) 46 | if len(y_true.shape) > len(y_pred.shape): 47 | y_true = functional.argmax(y_true, axis=-1) 48 | cur_acc = functional.reduce_mean(functional.cast(y_true == y_pred, "float32")) 49 | self.sum_value = self.sum_value + cur_acc 50 | self.passed_steps += 1 51 | 52 | def result(self): 53 | return self.sum_value / self.passed_steps 54 | 55 | 56 | @register_metrics(name=["acc5", "accuracy5"]) 57 | class Accuracy5(Metric): 58 | def __init__(self, name="acc5"): 59 | super().__init__(name=name) 60 | self.eval_only = True 61 | 62 | def reset_state(self): 63 | self.sum_value, self.passed_steps = 0.0, 0 64 | 65 | def update_state(self, y_true, y_pred, sample_weight=None): 66 | y_pred = functional.argsort(y_pred, direction="DESCENDING", axis=-1)[:, :5] 67 | if len(y_true.shape) >= len(y_pred.shape): 68 | y_true = functional.argmax(y_true, axis=-1) 69 | cur_acc = functional.reduce_mean(functional.convert_to_tensor([y_true[id] in y_pred[id] for id in range(y_true.shape[0])], "float32")) 70 | self.sum_value = self.sum_value + cur_acc 71 | self.passed_steps += 1 72 | 73 | def result(self): 74 | return self.sum_value / self.passed_steps 75 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup""" 2 | 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 
| from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 11 | long_description = f.read() 12 | long_description = long_description.replace( 13 | "](keras_cv_attention_models", "](https://github.com/leondgarse/keras_cv_attention_models/tree/main/keras_cv_attention_models" 14 | ) 15 | 16 | exec(open("keras_cv_attention_models/version.py").read()) 17 | setup( 18 | name="keras-cv-attention-models", 19 | version=__version__, 20 | description="Tensorflow keras computer vision attention models. Alias kecam. https://github.com/leondgarse/keras_cv_attention_models", 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/leondgarse/keras_cv_attention_models", 24 | author="Leondgarse", 25 | author_email="leondgarse@gmail.com", 26 | classifiers=[ 27 | # How mature is this project? Common values are 28 | # 3 - Alpha 29 | # 4 - Beta 30 | # 5 - Production/Stable 31 | "Development Status :: 3 - Alpha", 32 | "Intended Audience :: Developers", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Programming Language :: Python :: 3.6", 36 | "Programming Language :: Python :: 3.7", 37 | "Programming Language :: Python :: 3.8", 38 | "Topic :: Scientific/Engineering", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Topic :: Software Development", 41 | "Topic :: Software Development :: Libraries", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | ], 44 | # Note that this is a string of words separated by whitespace, not a list. 45 | keywords="tensorflow keras cv attention pretrained models kecam", 46 | packages=find_packages(exclude=["tests"]) + ["keras_cv_attention_models.pytorch_backend"], 47 | include_package_data=True, 48 | install_requires=[ 49 | "pillow", 50 | "tqdm", 51 | "ftfy", # required for language models 52 | "regex", # required for language models 53 | # "tensorflow-macos;platform_system=='Darwin'", # [???] 
54 | "tensorflow;platform_system!='Darwin'", 55 | # "tensorflow-addons;platform_machine!='aarch64' and platform_machine!='aarch32'", # [deprecated] 56 | # "tensorflow-datasets;platform_machine!='aarch64' and platform_machine!='aarch32'", # >4.7.0 needs dm-tree, failed on arm, just skip 57 | ], 58 | python_requires=">=3.6", 59 | license="Apache 2.0", 60 | ) 61 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilenetv3_family/tinynet.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilenetv3_family.mobilenetv3 import MobileNetV3 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def get_expanded_width_depth(width, depth): 6 | out_channels = [ii * width for ii in [16, 24, 40, 80, 112, 192, 320]] 7 | num_blocks = [int(round(ii * depth)) for ii in [1, 2, 2, 3, 3, 4, 1]] 8 | return out_channels, num_blocks 9 | 10 | 11 | def TinyNet( 12 | num_blocks=[1, 2, 2, 3, 3, 4, 1], 13 | out_channels=[16, 24, 40, 80, 112, 192, 320], 14 | expands=[1, 6, 6, 6, 6, 6, 6], 15 | kernel_sizes=[3, 3, 5, 3, 5, 5, 3], 16 | strides=[1, 2, 2, 2, 1, 2, 1], 17 | activations="swish", 18 | stem_width=32, 19 | fix_stem=True, 20 | se_ratios=0.25, 21 | se_activation=None, # None for same with activations 22 | use_expanded_se_ratio=False, 23 | se_divisor=1, 24 | output_num_features=1280, 25 | use_additional_output_conv=False, 26 | use_output_feature_bias=False, 27 | use_avg_pool_conv_output=False, 28 | model_name="tinynet", 29 | **kwargs, 30 | ): 31 | stem_feature_activation = activations 32 | kwargs.pop("kwargs", None) 33 | return MobileNetV3(**locals(), **kwargs) 34 | 35 | 36 | @register_model 37 | def TinyNetA(input_shape=(192, 192, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 38 | out_channels, num_blocks = get_expanded_width_depth(1.0, 1.2) 39 | return TinyNet(**locals(), model_name="tinynet_a", **kwargs) 40 | 41 | 42 | @register_model 43 | def TinyNetB(input_shape=(188, 188, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 44 | out_channels, num_blocks = get_expanded_width_depth(0.75, 1.1) 45 | return TinyNet(**locals(), model_name="tinynet_b", **kwargs) 46 | 47 | 48 | @register_model 49 | def TinyNetC(input_shape=(184, 184, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 50 | out_channels, num_blocks = get_expanded_width_depth(0.54, 0.85) 51 | return TinyNet(**locals(), model_name="tinynet_c", **kwargs) 52 | 53 | 54 | @register_model 55 | def TinyNetD(input_shape=(152, 152, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 56 | out_channels, num_blocks = get_expanded_width_depth(0.54, 0.695) 57 | return TinyNet(**locals(), model_name="tinynet_d", **kwargs) 58 | 59 | 60 | @register_model 61 | def TinyNetE(input_shape=(106, 106, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 62 | out_channels, num_blocks = get_expanded_width_depth(0.51, 0.6) 63 | return TinyNet(**locals(), model_name="tinynet_e", **kwargs) 64 | -------------------------------------------------------------------------------- /keras_cv_attention_models/moganet/__init__.py: -------------------------------------------------------------------------------- 1 | from 
keras_cv_attention_models.moganet.moganet import MogaNet, MogaNetXtiny, MogaNetTiny, MogaNetSmall, MogaNetBase, MogaNetLarge 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github Westlake-AI/MogaNet](https://github.com/Westlake-AI/MogaNet). 5 | Paper [PDF 2211.03295 Efficient Multi-order Gated Aggregation Network](https://arxiv.org/pdf/2211.03295.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation for non-attention blocks, default `gelu`. 11 | attn_activation: activation for attention blocks, default `swish`. `None` for same with `activation`. 12 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 13 | Can be a constant value like `0.2`, 14 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 15 | A higher value means a higher probability will drop the deep branch. 16 | or `0` to disable (default). 17 | layer_scale: int value indicates layer scale init value for each stack. Default `1e-5`, 0 for not using. 18 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | Default is `None`. 22 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 23 | Will try to download and load pre-trained model weights if not None. 24 | 25 | Returns: 26 | A `keras.Model` instance. 27 | """ 28 | 29 | MogaNet.__doc__ = __head_doc__ + """ 30 | Args: 31 | num_blocks: number of blocks in each stack. 32 | out_channels: output channels for each stack. 33 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 34 | model_name: string, model name. 35 | """ + __tail_doc__ + """ 36 | Model architectures: 37 | | Model | Params | FLOPs | Input | Top1 Acc | 38 | | ------------ | ------ | ------ | ----- | -------- | 39 | | MogaNetXtiny | 2.96M | 806M | 224 | 76.5 | 40 | | MogaNetTiny | 5.20M | 1.11G | 224 | 79.0 | 41 | | | 5.20M | 1.45G | 256 | 79.6 | 42 | | MogaNetSmall | 25.3M | 4.98G | 224 | 83.4 | 43 | | MogaNetBase | 43.7M | 9.96G | 224 | 84.2 | 44 | | MogaNetLarge | 82.5M | 15.96G | 224 | 84.6 | 45 | """ 46 | 47 | MogaNetXtiny.__doc__ = __head_doc__ + """ 48 | Args: 49 | """ + __tail_doc__ 50 | 51 | MogaNetTiny.__doc__ = MogaNetXtiny.__doc__ 52 | MogaNetSmall.__doc__ = MogaNetXtiny.__doc__ 53 | MogaNetBase.__doc__ = MogaNetXtiny.__doc__ 54 | MogaNetLarge.__doc__ = MogaNetXtiny.__doc__ 55 | -------------------------------------------------------------------------------- /keras_cv_attention_models/hiera/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.hiera.hiera import Hiera, HieraTiny, HieraSmall, HieraBase, HieraBasePlus, HieraLarge, HieraHuge 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github facebookresearch/hiera](https://github.com/facebookresearch/hiera). 5 | Paper [PDF 2306.00989 Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/pdf/2306.00989.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ strides: list of int indicates strides for each stack. Default `[1, 2, 2, 2]`. 
9 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 10 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 11 | activation: activation used in whole model, default `gelu`. 12 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 13 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | dropout: dropout rate if top layers is included. 17 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 18 | Set `classifier_activation=None` to return the logits of the "top" layer. 19 | pretrained: one of None or "mae_in1k_ft1k". 20 | Will try to download and load pre-trained model weights if not None. 21 | 22 | Returns: 23 | A `keras.Model` instance. 24 | """ 25 | 26 | Hiera.__doc__ = __head_doc__ + """ 27 | Args: 28 | num_blocks: number of blocks in each stack. 29 | embed_dim: basic hidden dims, expand * 2 for each stack. 30 | num_heads: int or list value for num heads in each stack. 31 | use_window_attentions: boolean or list value, each value in the list can also be a list of boolean. 32 | Indicates if use window attention in each stack. 33 | Element value like `[True, False]` means first one is True, others are False. 34 | mlp_ratio: expand ratio for mlp blocks hidden channel. 35 | model_name: string, model name. 36 | """ + __tail_doc__ + """ 37 | Model architectures: 38 | | Model | Params | FLOPs | Input | Top1 Acc | 39 | | ------------- | ------- | ------- | ----- | -------- | 40 | | HieraTiny | 27.91M | 4.93G | 224 | 82.8 | 41 | | HieraSmall | 35.01M | 6.44G | 224 | 83.8 | 42 | | HieraBase | 51.52M | 9.43G | 224 | 84.5 | 43 | | HieraBasePlus | 69.90M | 12.71G | 224 | 85.2 | 44 | | HieraLarge | 213.74M | 40.43G | 224 | 86.1 | 45 | | HieraHuge | 672.78M | 125.03G | 224 | 86.9 | 46 | """ 47 | 48 | HieraTiny.__doc__ = __head_doc__ + """ 49 | Args: 50 | """ + __tail_doc__ 51 | 52 | HieraSmall.__doc__ = HieraTiny.__doc__ 53 | HieraBase.__doc__ = HieraTiny.__doc__ 54 | HieraBasePlus.__doc__ = HieraTiny.__doc__ 55 | HieraLarge.__doc__ = HieraTiny.__doc__ 56 | HieraHuge.__doc__ = HieraTiny.__doc__ 57 | -------------------------------------------------------------------------------- /keras_cv_attention_models/iformer/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.iformer.iformer import InceptionTransformer, IFormerSmall, IFormerBase, IFormerLarge 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github sail-sg/iFormer](https://github.com/sail-sg/iFormer). 5 | Paper [PDF 2205.12956 Inception Transformer](https://arxiv.org/pdf/2205.12956.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 
15 | or `0` to disable (default). 16 | layer_scales: int or list of int, indicates layer scale init value for each stack. Default `[0, 0, 1e-6, 1e-6]`, 0 for not using. 17 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | InceptionTransformer.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dims: output channels for each stack. 32 | num_heads: int or list value indicates heads number for `conv_attention_mixer` blocks in each stack. 33 | num_attn_low_heads: int or list value indicates attention heads number for `attention_low_frequency_mixer` blocks in each stack. 34 | pool_sizes: int or list value indicates attention blocks key_value downsample rate in each stack. 35 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 36 | model_name: string, model name. 37 | """ + __tail_doc__ + """ 38 | Model architectures: 39 | | Model | Params | FLOPs | Input | Top1 Acc | 40 | | ------------ | ------ | ------ | ----- | -------- | 41 | | IFormerSmall | 19.9M | 4.88G | 224 | 83.4 | 42 | | | 20.9M | 16.29G | 384 | 84.6 | 43 | | IFormerBase | 47.9M | 9.44G | 224 | 84.6 | 44 | | | 48.9M | 30.86G | 384 | 85.7 | 45 | | IFormerLarge | 86.6M | 14.12G | 224 | 84.6 | 46 | | | 87.7M | 45.74G | 384 | 85.8 | 47 | """ 48 | 49 | IFormerSmall.__doc__ = __head_doc__ + """ 50 | Args: 51 | """ + __tail_doc__ 52 | 53 | IFormerBase.__doc__ = IFormerSmall.__doc__ 54 | IFormerLarge.__doc__ = IFormerSmall.__doc__ 55 | -------------------------------------------------------------------------------- /keras_cv_attention_models/efficientnet/efficientnet_edgetpu.py: -------------------------------------------------------------------------------- 1 | """Creates an EfficientNet-EdgeTPU model 2 | Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu 3 | """ 4 | 5 | import math 6 | from keras_cv_attention_models.efficientnet.efficientnet_v2 import EfficientNetV2 7 | from keras_cv_attention_models.attention_layers import make_divisible 8 | from keras_cv_attention_models.models import register_model 9 | 10 | 11 | def get_expanded_width_depth(width, depth, fix_head_stem=False): 12 | out_channels = [ii * width for ii in [24, 32, 48, 96, 144, 192]] 13 | depthes = [int(math.ceil(ii * depth)) for ii in [1, 2, 4, 5, 4, 2]] 14 | first_conv_filter = 32 * width 15 | output_conv_filter = 1280 * width 16 | 17 | out_channels = [out_channels[0], out_channels[0], *out_channels[1:]] 18 | depthes = [1, depthes[0] - 1, *depthes[1:]] 19 | return out_channels, depthes, first_conv_filter, output_conv_filter 20 | 21 | 22 | def EfficientNetEdgeTPU( 23 | expands=[-1, 4, 8, 8, 8, 8, 8], # expands[0] = expands[1] * out_channels[0] / first_conv_filter, as timm using expand on out_channel 24 | out_channels=[24, 24, 32, 48, 96, 144, 192], 25 | depthes=[1, 0, 2, 4, 5, 4, 2], # Add an additional block, as timm using expand on out_channel 26 | strides=[1, 1, 2, 2, 2, 1, 2], 27 | se_ratios=[0, 0, 0, 0, 0, 0, 0, 0], 28 
| first_conv_filter=32, 29 | output_conv_filter=1280, 30 | kernel_sizes=[3, 3, 3, 3, 5, 5, 5], 31 | use_shortcuts=[False, False, True, True, True, True, True], 32 | is_fused=[True, True, True, True, False, False, False], 33 | is_torch_mode=True, 34 | drop_connect_rate=0.2, 35 | pretrained="imagenet", 36 | activation="relu", 37 | model_name="EfficientNetEdgeTPU", 38 | **kwargs, 39 | ): 40 | kwargs.pop("kwargs", None) 41 | expands[0] = make_divisible(out_channels[0], 8) * expands[1] / make_divisible(first_conv_filter, 8) 42 | return EfficientNetV2(**locals(), **kwargs) 43 | 44 | 45 | @register_model 46 | def EfficientNetEdgeTPUSmall(input_shape=(224, 224, 3), num_classes=1000, dropout=0.2, classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | out_channels, depthes, first_conv_filter, output_conv_filter = get_expanded_width_depth(1.0, 1.0) 48 | return EfficientNetEdgeTPU(**locals(), model_name="efficientnet_edgetpu-small", **kwargs) 49 | 50 | 51 | @register_model 52 | def EfficientNetEdgeTPUMedium(input_shape=(240, 240, 3), num_classes=1000, dropout=0.2, classifier_activation="softmax", pretrained="imagenet", **kwargs): 53 | out_channels, depthes, first_conv_filter, output_conv_filter = get_expanded_width_depth(1.0, 1.1) 54 | return EfficientNetEdgeTPU(**locals(), model_name="efficientnet_edgetpu-medium", **kwargs) 55 | 56 | 57 | @register_model 58 | def EfficientNetEdgeTPULarge(input_shape=(300, 300, 3), num_classes=1000, dropout=0.3, classifier_activation="softmax", pretrained="imagenet", **kwargs): 59 | out_channels, depthes, first_conv_filter, output_conv_filter = get_expanded_width_depth(1.2, 1.4) 60 | return EfficientNetEdgeTPU(**locals(), model_name="efficientnet_edgetpu-large", **kwargs) 61 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fasternet/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.fasternet.fasternet import FasterNet, FasterNetT0, FasterNetT1, FasterNetT2, FasterNetS, FasterNetM, FasterNetL 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github JierunChen/FasterNet](https://github.com/JierunChen/FasterNet). 5 | Paper [PDF 2303.03667 Run, Don’t Walk: Chasing Higher FLOPS for Faster Neural Networks ](https://arxiv.org/pdf/2303.03667.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ window_ratios: window split ratio. Each stack will calculate `window_size = (height // window_ratio, width // window_ratio)` . 9 | layer_scale: layer scale init value. Default `-1` means not applying, any value `>=0` will add a scale value for each block output. 10 | [Going deeper with Image Transformers](https://arxiv.org/pdf/2103.17239.pdf). 11 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 12 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 13 | activation: activation used in whole model, default `gelu`. 14 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 15 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | dropout: dropout rate if top layers is included. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 
20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | pretrained: one of None or "imagenet". 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | FasterNet.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dim: basic hidden dims, expand * 2 for each stack. 32 | patch_size: int value for stem kernel size and strides. 33 | mlp_ratio: expand ratio for mlp blocks hidden channel. 34 | partial_conv_ratio: float value for partial channels applying `Conv2D` in each block. 35 | output_conv_filter: int value for filters of `Conv2D` block before output block. 36 | model_name: string, model name. 37 | """ + __tail_doc__ + """ 38 | Model architectures: 39 | | Model | Params | FLOPs | Input | Top1 Acc | 40 | | ----------- | ------ | ------ | ----- | -------- | 41 | | FasterNetT0 | 3.9M | 0.34G | 224 | 71.9 | 42 | | FasterNetT1 | 7.6M | 0.85G | 224 | 76.2 | 43 | | FasterNetT2 | 15.0M | 1.90G | 224 | 78.9 | 44 | | FasterNetS | 31.1M | 4.55G | 224 | 81.3 | 45 | | FasterNetM | 53.5M | 8.72G | 224 | 83.0 | 46 | | FasterNetL | 93.4M | 15.49G | 224 | 83.5 | 47 | """ 48 | 49 | FasterNetT0.__doc__ = __head_doc__ + """ 50 | Args: 51 | """ + __tail_doc__ 52 | 53 | FasterNetT1.__doc__ = FasterNetT0.__doc__ 54 | FasterNetT2.__doc__ = FasterNetT0.__doc__ 55 | FasterNetS.__doc__ = FasterNetT0.__doc__ 56 | FasterNetM.__doc__ = FasterNetT0.__doc__ 57 | FasterNetL.__doc__ = FasterNetT0.__doc__ 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.gpvit.gpvit import GPViT, GPViT_L1, GPViT_L2, GPViT_L3, GPViT_L4, PureWeigths 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github ChenhongyiYang/GPViT](https://github.com/ChenhongyiYang/GPViT). 5 | Paper [PDF 2212.06795 GPVIT: A HIGH RESOLUTION NON-HIERARCHICAL VISION TRANSFORMER WITH GROUP PROPAGATION](https://arxiv.org/pdf/2212.06795.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: layer scale init value, [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 17 | Default 0 for not using. 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | GPViT.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_layers: number of transformer blocks.
31 | embed_dims: output channels for each stack. 32 | stem_depth: number of stem conv blocks. 33 | num_window_heads: number of heads for `window_lepe_attention_mlp_block` blocks. 34 | num_group_heads: number of heads for `group_attention` blocks. 35 | mlp_ratios: int value indicates expand ratio for mlp blocks hidden channel in each stack. 36 | window_size: number of `window_size` for `window_lepe_attention_mlp_block` blocks. 37 | group_attention_layer_ids: list of layer id for using `group_attention`, others will be `window_lepe_attention_mlp_block`. 38 | group_attention_layer_group_tokens: list of `num_group_token` for each block using `group_attention`. 39 | use_neck_attention_output: boolean value whether using `light_group_attention` before output block. 40 | model_name: string, model name. 41 | """ + __tail_doc__ + """ 42 | Model architectures: 43 | | Model | Params | FLOPs | Input | Top1 Acc | 44 | | -------- | ------ | ------ | ----- | -------- | 45 | | GPViT_L1 | 9.59M | 6.15G | 224 | 80.5 | 46 | | GPViT_L2 | 24.2M | 15.74G | 224 | 83.4 | 47 | | GPViT_L3 | 36.7M | 23.54G | 224 | 84.1 | 48 | | GPViT_L4 | 75.5M | 48.29G | 224 | 84.3 | 49 | """ 50 | 51 | GPViT_L1.__doc__ = __head_doc__ + """ 52 | Args: 53 | """ + __tail_doc__ 54 | 55 | GPViT_L2.__doc__ = GPViT_L1.__doc__ 56 | GPViT_L3.__doc__ = GPViT_L1.__doc__ 57 | GPViT_L4.__doc__ = GPViT_L1.__doc__ 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/moganet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras MogaNet___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github Westlake-AI/MogaNet](https://github.com/Westlake-AI/MogaNet). Paper [PDF 2211.03295 Efficient Multi-order Gated Aggregation Network](https://arxiv.org/pdf/2211.03295.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------ | ------ | ------ | ----- | -------- | -------- | 12 | | MogaNetXtiny | 2.96M | 806M | 224 | 76.5 | [moganet_xtiny_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_xtiny_imagenet.h5) | 13 | | MogaNetTiny | 5.20M | 1.11G | 224 | 79.0 | [moganet_tiny_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_tiny_224_imagenet.h5) | 14 | | | 5.20M | 1.45G | 256 | 79.6 | [moganet_tiny_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_tiny_256_imagenet.h5) | 15 | | MogaNetSmall | 25.3M | 4.98G | 224 | 83.4 | [moganet_small_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_small_imagenet.h5) | 16 | | MogaNetBase | 43.7M | 9.96G | 224 | 84.2 | [moganet_base_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_base_imagenet.h5) | 17 | | MogaNetLarge | 82.5M | 15.96G | 224 | 84.6 | [moganet_large_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_large_imagenet.h5) | 18 | ## Usage 19 | ```py 20 | from keras_cv_attention_models import moganet 21 | 22 | # Will download and load pretrained imagenet weights. 
23 | mm = moganet.MogaNetXtiny(pretrained="imagenet") 24 | 25 | # Run prediction 26 | import tensorflow as tf 27 | from tensorflow import keras 28 | from skimage.data import chelsea 29 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 30 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 31 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 32 | # [('n02124075', 'Egyptian_cat', 0.6138564), ('n02123045', 'tabby', 0.16214457), ...] 33 | ``` 34 | **Change input resolution**. 35 | ```py 36 | from keras_cv_attention_models import moganet 37 | mm = moganet.MogaNetXtiny(input_shape=(112, 193, 3), pretrained="imagenet") 38 | # >>>> Load pretrained from: ~/.keras/models/moganet_xtiny_imagenet.h5 39 | 40 | # Run prediction 41 | from skimage.data import chelsea 42 | preds = mm(mm.preprocess_input(chelsea())) 43 | print(mm.decode_predictions(preds)) 44 | # [('n02124075', 'Egyptian_cat', 0.5223805), ('n02123045', 'tabby', 0.27944055), ...] 45 | ``` 46 | **Use dynamic input resolution** by setting `input_shape=(None, None, 3)`. 47 | ```py 48 | from keras_cv_attention_models import moganet 49 | model = moganet.MogaNetTiny(input_shape=(None, None, 3), num_classes=0) 50 | 51 | print(model(np.ones([1, 223, 123, 3])).shape) 52 | # (1, 7, 4, 256) 53 | print(model(np.ones([1, 32, 526, 3])).shape) 54 | # (1, 1, 17, 256) 55 | ``` 56 | *** 57 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnest/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras ResNeSt___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [ResNeSt](https://github.com/zhanghang1989/ResNeSt). Paper [PDF 2004.08955 ResNeSt: Split-Attention Networks](https://arxiv.org/pdf/2004.08955.pdf). 6 | - Model weights reloaded from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | -------------- | ------ | ------ | ----- | -------- | -------- | 12 | | resnest50 | 28M | 5.38G | 224 | 81.03 | [resnest50.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest50_imagenet.h5) | 13 | | resnest101 | 49M | 13.33G | 256 | 82.83 | [resnest101.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest101_imagenet.h5) | 14 | | resnest200 | 71M | 35.55G | 320 | 83.84 | [resnest200.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest200_imagenet.h5) | 15 | | resnest269 | 111M | 77.42G | 416 | 84.54 | [resnest269.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest269_imagenet.h5) | 16 | ## Usage 17 | ```py 18 | from keras_cv_attention_models import resnest 19 | 20 | # Will download and load pretrained imagenet weights.
21 | mm = resnest.ResNest50(pretrained="imagenet") 22 | 23 | # Run prediction 24 | import tensorflow as tf 25 | from tensorflow import keras 26 | from skimage.data import chelsea 27 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 28 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 29 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 30 | # [('n02124075', 'Egyptian_cat', 0.7793046), 31 | # ('n02123159', 'tiger_cat', 0.028313603), 32 | # ('n04209239', 'tabby', 0.02322878), 33 | # ('n02127052', 'lynx', 0.0036637571), 34 | # ('n03085013', 'computer_keyboard', 0.0008628946)] 35 | ``` 36 | **Use dynamic input resolution** 37 | ```py 38 | from keras_cv_attention_models import resnest 39 | mm = resnest.ResNest50(input_shape=(None, None, 3), num_classes=0) 40 | 41 | print(mm(np.ones([1, 224, 224, 3])).shape) 42 | # (1, 7, 7, 2048) 43 | print(mm(np.ones([1, 512, 512, 3])).shape) 44 | # (1, 16, 16, 2048) 45 | 46 | mm.save("../models/resnest50_dynamic_notop.h5") 47 | ``` 48 | ## Verification with PyTorch version 49 | ```py 50 | """ PyTorch resnest50 """ 51 | import torch 52 | sys.path.append("../") 53 | from ResNeSt.resnest.torch import resnest as torch_resnest 54 | 55 | torch_model = torch_resnest.resnest50(pretrained=True) 56 | torch_model.eval() 57 | 58 | """ Keras ResNest50 """ 59 | from keras_cv_attention_models import resnest 60 | mm = resnest.ResNest50(pretrained="imagenet", classifier_activation=None) 61 | 62 | """ Verification """ 63 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 64 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 65 | keras_out = mm(inputs).numpy() 66 | print(f"{np.allclose(torch_out, keras_out, atol=1e-4) = }") 67 | # np.allclose(torch_out, keras_out, atol=1e-4) = True 68 | ``` 69 | *** 70 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnet_family/resnext.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.aotnet import AotNet 2 | from keras_cv_attention_models.models import register_model 3 | from keras_cv_attention_models.download_and_load import reload_model_weights 4 | 5 | 6 | PRETRAINED_DICT = { 7 | "resnext50": {"imagenet": "cf65d988c38ba0335c97a046288b91f4", "swsl": "f1cf0cc3c49bb50e6949c50fcce3db8f"}, 8 | "resnext101": {"imagenet": "1e58c0ecc31184bd6bfe4d6b568f4325", "swsl": "c2fe8eefcf9a55e0254d2b13055a4cbc"}, 9 | "resnext101w": {"imagenet": "9a1b92145aeb922695c29a0f02b52188", "swsl": "58b7cf4a72b03171f50ed19789b20f3d"}, 10 | "resnext101w_64": {"imagenet": "51c81e014224bb731ebf64c3ed271a16"}, 11 | "resnext50d": {"imagenet": "a7b2433b7bee7029fce11ba3fabf3fb9"}, 12 | } 13 | 14 | 15 | def ResNeXt(num_blocks, input_shape=(224, 224, 3), pretrained="imagenet", strides=2, groups=32, **kwargs): 16 | strides = strides if isinstance(strides, (list, tuple)) else [1, 2, 2, strides] 17 | model = AotNet(num_blocks, input_shape=input_shape, strides=strides, groups=groups, **kwargs) 18 | reload_model_weights(model, pretrained_dict=PRETRAINED_DICT, sub_release="resnet_family", pretrained=pretrained) 19 | return model 20 | 21 | 22 | @register_model 23 | def ResNeXt50(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 24 | num_blocks = [3, 4, 6, 3] 25 | hidden_channel_ratio = 0.5 26 | return 
ResNeXt(**locals(), model_name="resnext50", **kwargs) 27 | 28 | 29 | @register_model 30 | def ResNeXt101(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 31 | num_blocks = [3, 4, 23, 3] 32 | hidden_channel_ratio = 0.5 33 | return ResNeXt(**locals(), model_name="resnext101", **kwargs) 34 | 35 | 36 | @register_model 37 | def ResNeXt50D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 38 | num_blocks = [3, 4, 6, 3] 39 | hidden_channel_ratio = 0.5 40 | stem_type = "deep" 41 | shortcut_type = "avg" 42 | return ResNeXt(**locals(), model_name="resnext50d", **kwargs) 43 | 44 | 45 | @register_model 46 | def ResNeXt101W(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | num_blocks = [3, 4, 23, 3] 48 | hidden_channel_ratio = 1 49 | return ResNeXt(**locals(), model_name="resnext101w", **kwargs) 50 | 51 | 52 | @register_model 53 | def ResNeXt101W_se(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained=None, **kwargs): 54 | # timm using an additional conv + bn before se_module 55 | num_blocks = [3, 4, 23, 3] 56 | hidden_channel_ratio = 1 57 | se_ratio = 0.25 / 4 58 | stem_type = "deep" 59 | return ResNeXt(**locals(), model_name="resnext101w", **kwargs) 60 | 61 | 62 | @register_model 63 | def ResNeXt101W_64(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 64 | num_blocks = [3, 4, 23, 3] 65 | hidden_channel_ratio = 1 66 | groups = 64 67 | return ResNeXt(**locals(), model_name="resnext101w_64", **kwargs) 68 | -------------------------------------------------------------------------------- /keras_cv_attention_models/nat/dinat.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.nat.nat import NAT 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | @register_model 6 | def DiNAT_Mini(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 7 | num_blocks = [3, 4, 6, 5] 8 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 3, [1, 1, 1, 1, 1]] 9 | use_every_other_dilations = True 10 | return NAT(**locals(), model_name="dinat_mini", **kwargs) 11 | 12 | 13 | @register_model 14 | def DiNAT_Tiny(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 15 | num_blocks = [3, 4, 18, 5] 16 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 17 | use_every_other_dilations = True 18 | return NAT(**locals(), model_name="dinat_tiny", **kwargs) 19 | 20 | 21 | @register_model 22 | def DiNAT_Small(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 23 | num_blocks = [3, 4, 18, 5] 24 | num_heads = [3, 6, 12, 24] 25 | out_channels = [96, 192, 384, 768] 26 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 27 | # layer_scale = kwargs.pop("layer_scale", 1e-5) 28 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 29 | use_every_other_dilations = True 30 | return NAT(**locals(), model_name="dinat_small", **kwargs) 31 | 32 | 33 | @register_model 34 | def DiNAT_Base(input_shape=(224, 224, 3), 
num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 35 | num_blocks = [3, 4, 18, 5] 36 | num_heads = [4, 8, 16, 32] 37 | out_channels = [128, 256, 512, 1024] 38 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 39 | layer_scale = kwargs.pop("layer_scale", 1e-5) 40 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 41 | use_every_other_dilations = True 42 | return NAT(**locals(), model_name="dinat_base", **kwargs) 43 | 44 | 45 | @register_model 46 | def DiNAT_Large(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 47 | num_blocks = [3, 4, 18, 5] 48 | num_heads = [6, 12, 24, 48] 49 | out_channels = [192, 384, 768, 1536] 50 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 51 | # layer_scale = kwargs.pop("layer_scale", 1e-5) 52 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 53 | use_every_other_dilations = True 54 | return NAT(**locals(), model_name="dinat_large", **kwargs) 55 | 56 | 57 | @register_model 58 | def DiNAT_Large_K11(input_shape=(384, 384, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 59 | num_blocks = [3, 4, 18, 5] 60 | num_heads = [6, 12, 24, 48] 61 | out_channels = [192, 384, 768, 1536] 62 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 63 | # layer_scale = kwargs.pop("layer_scale", 1e-5) 64 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 65 | use_every_other_dilations = True 66 | attn_kernel_size = 11 67 | return NAT(**locals(), model_name="dinat_large_k11", **kwargs) 68 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilevit/mobilevit_v2.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilevit.mobilevit import MobileViT 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def MobileViT_V2( 6 | num_blocks=[1, 2, 3, 5, 4], 7 | out_channels=[64, 128, 256, 384, 512], 8 | attn_channels=0.5, # Can be a list matching out_channels, or a float number for expansion ratio of out_channels 9 | expand_ratio=2, 10 | stem_width=32, 11 | resize_first=True, # False for V1, True for V2 12 | use_depthwise=True, # False for V1, True for V2 13 | use_fusion=False, # True for V1, False for V2 14 | num_norm_groups=1, # -1 or 0 for V1 using layer_norm, or 1 for V2 using group_norm 15 | use_linear_attention=True, # False for V1, True for V2 16 | output_num_features=0, 17 | model_name="mobilevit_v2", 18 | **kwargs, 19 | ): 20 | kwargs.pop("kwargs", None) 21 | return MobileViT(**locals(), **kwargs) 22 | 23 | 24 | def get_mobilevit_v2_width(multiplier=1.0): 25 | return int(32 * multiplier), [int(ii * multiplier) for ii in [64, 128, 256, 384, 512]] # stem_width, out_channels 26 | 27 | 28 | @register_model 29 | def MobileViT_V2_050(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 30 | stem_width, out_channels = get_mobilevit_v2_width(0.5) 31 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_050", **kwargs) 32 | 33 | 34 | @register_model 35 | def MobileViT_V2_075(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 36 | stem_width, out_channels = get_mobilevit_v2_width(0.75) 37 | return MobileViT_V2(**locals(), 
model_name="mobilevit_v2_075", **kwargs) 38 | 39 | 40 | @register_model 41 | def MobileViT_V2_100(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 42 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_100", **kwargs) 43 | 44 | 45 | @register_model 46 | def MobileViT_V2_125(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | stem_width, out_channels = get_mobilevit_v2_width(1.25) 48 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_125", **kwargs) 49 | 50 | 51 | @register_model 52 | def MobileViT_V2_150(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 53 | stem_width, out_channels = get_mobilevit_v2_width(1.5) 54 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_150", **kwargs) 55 | 56 | 57 | @register_model 58 | def MobileViT_V2_175(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 59 | stem_width, out_channels = get_mobilevit_v2_width(1.75) 60 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_175", **kwargs) 61 | 62 | 63 | @register_model 64 | def MobileViT_V2_200(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 65 | stem_width, out_channels = get_mobilevit_v2_width(2.0) 66 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_200", **kwargs) 67 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pvt/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.pvt.pvt import PyramidVisionTransformerV2, PVT_V2B0, PVT_V2B1, PVT_V2B2, PVT_V2B2_linear, PVT_V2B3, PVT_V2B4, PVT_V2B5 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github whai362/PVT](https://github.com/whai362/PVT/tree/v2/classification). 5 | Paper [PDF 2106.13797 PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/pdf/2106.13797.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: layer scale init value, [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 17 | Default 0 for not using. 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 
26 | """ 27 | 28 | PyramidVisionTransformerV2.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dims: output channels for each stack. 32 | num_heads: int or list value indicates heads number for transformer blocks in each stack. 33 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 34 | sr_ratios: int or list value indicates attention blocks key_value downsample rate in each stack. 35 | stem_patch_size: stem patch size. Default `7`. 36 | use_linear: boolean value if using linear complexity attention layer with `AvgPool2D`. True for `PVT_V2B2_linear`. 37 | model_name: string, model name. 38 | """ + __tail_doc__ + """ 39 | Model architectures: 40 | | Model | Params | FLOPs | Input | Top1 Acc | 41 | | --------------- | ------ | ------ | ----- | -------- | 42 | | PVT_V2B0 | 3.7M | 580.3M | 224 | 70.5 | 43 | | PVT_V2B1 | 14.0M | 2.14G | 224 | 78.7 | 44 | | PVT_V2B2 | 25.4M | 4.07G | 224 | 82.0 | 45 | | PVT_V2B2_linear | 22.6M | 3.94G | 224 | 82.1 | 46 | | PVT_V2B3 | 45.2M | 6.96G | 224 | 83.1 | 47 | | PVT_V2B4 | 62.6M | 10.19G | 224 | 83.6 | 48 | | PVT_V2B5 | 82.0M | 11.81G | 224 | 83.8 | 49 | """ 50 | 51 | PVT_V2B0.__doc__ = __head_doc__ + """ 52 | Args: 53 | """ + __tail_doc__ 54 | 55 | PVT_V2B1.__doc__ = PVT_V2B0.__doc__ 56 | PVT_V2B2.__doc__ = PVT_V2B0.__doc__ 57 | PVT_V2B2_linear.__doc__ = PVT_V2B0.__doc__ 58 | PVT_V2B3.__doc__ = PVT_V2B0.__doc__ 59 | PVT_V2B4.__doc__ = PVT_V2B0.__doc__ 60 | PVT_V2B5.__doc__ = PVT_V2B0.__doc__ 61 | -------------------------------------------------------------------------------- /keras_cv_attention_models/repvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.repvit.repvit import RepViT, RepViT_M09, RepViT_M10, RepViT_M11, RepViT_M15, RepViT_M23, switch_to_deploy 2 | 3 | 4 | __head_doc__ = """ 5 | Keras implementation of [Github THU-MIG/RepViT](https://github.com/THU-MIG/RepViT). 6 | Paper [PDF 2307.09283 RepViT: Revisiting Mobile CNN From ViT Perspective](https://arxiv.org/pdf/2307.09283.pdf). 7 | """ 8 | 9 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 10 | deploy: boolean value if build a fused model. **Evaluation only, not good for training**. 11 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 12 | activation: activation used in whole model, default `hard_swish`. 13 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 14 | Can be a constant value like `0.2`, 15 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | dropout: top dropout rate if top layers is included. Default 0. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | Default is `None`. 22 | use_distillation: Boolean value if output `distill_head`. Default `False`. 23 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 24 | Will try to download and load pre-trained model weights if not None. 25 | **kwargs: other parameters if available. 26 | 27 | Returns: 28 | A `keras.Model` instance. 
29 | """ 30 | 31 | RepViT.__doc__ = __head_doc__ + """ 32 | Args: 33 | num_blocks: number of block for each stack. 34 | out_channels: output channels for each stack. 35 | stem_width: channel dimension output for stem block, default -1 for using out_channels[0]. 36 | se_ratio: float value for se_ratio for each stack, will use `se_module` every other block in each stack if > 0. 37 | model_name: string, model name. 38 | """ + __tail_doc__ + """ 39 | Model architectures: 40 | | Model | Params | FLOPs | Input | Top1 Acc | 41 | | ------------------------ | ------ | ----- | ----- | -------- | 42 | | RepViT_M09, distillation | 5.10M | 0.82G | 224 | 79.1 | 43 | | - deploy=True | 5.07M | 0.82G | 224 | 79.1 | 44 | | RepViT_M10, distillation | 6.85M | 1.12G | 224 | 80.3 | 45 | | - deploy=True | 6.81M | 1.12G | 224 | 80.3 | 46 | | RepViT_M11, distillation | 8.29M | 1.35G | 224 | 81.2 | 47 | | - deploy=True | 8.24M | 1.35G | 224 | 81.2 | 48 | | RepViT_M15, distillation | 14.13M | 2.30G | 224 | 82.5 | 49 | | - deploy=True | 14.05M | 2.30G | 224 | 82.5 | 50 | | RepViT_M23, distillation | 23.01M | 4.55G | 224 | 83.7 | 51 | | - deploy=True | 22.93M | 4.55G | 224 | 83.7 | 52 | """ 53 | 54 | RepViT_M09.__doc__ = __head_doc__ + """ 55 | Args: 56 | """ + __tail_doc__ 57 | 58 | RepViT_M10.__doc__ = RepViT_M09.__doc__ 59 | RepViT_M11.__doc__ = RepViT_M09.__doc__ 60 | RepViT_M15.__doc__ = RepViT_M09.__doc__ 61 | RepViT_M23.__doc__ = RepViT_M09.__doc__ 62 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gcvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.gcvit.gcvit import GCViT, GCViT_XXTiny, GCViT_XTiny, GCViT_Tiny, GCViT_Tiny2, GCViT_Small, GCViT_Small2, GCViT_Base, GCViT_Large 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github NVlabs/GCVit](https://github.com/NVlabs/GCVit). 5 | Paper [PDF 2206.09959 Global Context Vision Transformers](https://arxiv.org/pdf/2206.09959.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ window_ratios: window split ratio. Each stack will calculate `window_size = (height // window_ratio, width // window_ratio)` . 9 | layer_scale: layer scale init value. Default `-1` means not applying, any value `>=0` will add a scale value for each block output. 10 | [Going deeper with Image Transformers](https://arxiv.org/pdf/2103.17239.pdf). 11 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 12 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 13 | activation: activation used in whole model, default `gelu`. 14 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 15 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | dropout: dropout rate if top layers is included. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | pretrained: None or one of ["imagenet", "imagenet21k-ft1k"]. 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 
26 | """ 27 | 28 | GCViT.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | num_heads: num heads for each stack. 32 | embed_dim: basic hidden dims, expand * 2 for each stack. 33 | mlp_ratio: expand ratio for mlp blocks hidden channel. 34 | model_name: string, model name. 35 | """ + __tail_doc__ + """ 36 | Model architectures: 37 | | Model | Params | FLOPs | Input | Top1 Acc | 38 | | --------------- | ------ | ------ | ----- | -------- | 39 | | GCViT_XXTiny | 12.0M | 2.15G | 224 | 79.9 | 40 | | GCViT_XTiny | 20.0M | 2.96G | 224 | 82.0 | 41 | | GCViT_Tiny | 28.2M | 4.83G | 224 | 83.5 | 42 | | GCViT_Tiny2 | 34.5M | 6.28G | 224 | 83.7 | 43 | | GCViT_Small | 51.1M | 8.63G | 224 | 84.3 | 44 | | GCViT_Small2 | 68.6M | 11.7G | 224 | 84.8 | 45 | | GCViT_Base | 90.3M | 14.9G | 224 | 85.0 | 46 | | GCViT_Large | 202.1M | 32.8G | 224 | 85.7 | 47 | | - 21k_ft1k | 202.1M | 32.8G | 224 | 86.6 | 48 | | - 21k_ft1k, 384 | 202.9M | 105.1G | 384 | 87.4 | 49 | | - 21k_ft1k, 512 | 203.8M | 205.1G | 512 | 87.6 | 50 | """ 51 | 52 | GCViT_XXTiny.__doc__ = __head_doc__ + """ 53 | Args: 54 | """ + __tail_doc__ 55 | 56 | GCViT_XTiny.__doc__ = GCViT_XXTiny.__doc__ 57 | GCViT_Tiny.__doc__ = GCViT_XXTiny.__doc__ 58 | GCViT_Tiny2.__doc__ = GCViT_XXTiny.__doc__ 59 | GCViT_Small.__doc__ = GCViT_XXTiny.__doc__ 60 | GCViT_Small2.__doc__ = GCViT_XXTiny.__doc__ 61 | GCViT_Base.__doc__ = GCViT_XXTiny.__doc__ 62 | GCViT_Large.__doc__ = GCViT_XXTiny.__doc__ 63 | -------------------------------------------------------------------------------- /keras_cv_attention_models/halonet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras HaloNet___ 2 | *** 3 | 4 | ## Summary 5 | - [Github lucidrains/halonet-pytorch](https://github.com/lucidrains/halonet-pytorch). 6 | - HaloAttention article: [PDF 2103.12731 Scaling Local Self-Attention for Parameter Efficient Visual Backbones](https://arxiv.org/pdf/2103.12731.pdf). 7 | - No pretrained available for `H` models. Architecture is guessed from article, so it's NOT certain. 8 | - `T` model weights are reloaded from timm [Github rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models). 
9 | 10 | ![](https://user-images.githubusercontent.com/5744524/151656806-005a80ba-3c35-4707-af29-2a781492a1d9.png) 11 | ## Models 12 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 13 | | -------------- | ------ | ------- | ----- | -------- | -------- | 14 | | HaloNextECA26T | 10.7M | 2.43G | 256 | 79.50 | [halonext_eca26t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonext_eca26t_256_imagenet.h5) | 15 | | HaloNet26T | 12.5M | 3.18G | 256 | 79.13 | [halonet26t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonet26t_256_imagenet.h5) | 16 | | HaloNetSE33T | 13.7M | 3.55G | 256 | 80.99 | [halonet_se33t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonet_se33t_256_imagenet.h5) | 17 | | HaloRegNetZB | 11.68M | 1.97G | 224 | 81.042 | [haloregnetz_b_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/haloregnetz_b_224_imagenet.h5) | 18 | | HaloNet50T | 22.7M | 5.29G | 256 | 81.70 | [halonet50t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonet50t_256_imagenet.h5) | 19 | | HaloBotNet50T | 22.6M | 5.02G | 256 | 82.0 | [halobotnet50t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halobotnet50t_256_imagenet.h5) | 20 | 21 | Comparing `HaloNetH7` accuracy by replacing Conv layers with Attention in each stage: 22 | 23 | | Conv Stages | Attention Stages | Top-1 Acc (%) | Norm. Train Time | 24 | |:-----------:|:----------------:|:-------------:|:----------------:| 25 | | - | 1, 2, 3, 4 | 84.9 | 1.9 | 26 | | 1 | 2, 3, 4 | 84.6 | 1.4 | 27 | | 1, 2 | 3, 4 | 84.7 | 1.0 | 28 | | 1, 2, 3 | 4 | 83.8 | 0.5 | 29 | ## Usage 30 | ```py 31 | from keras_cv_attention_models import halonet 32 | 33 | # Will download and load pretrained imagenet weights. 
34 | mm = halonet.HaloNet26T(pretrained="imagenet") 35 | 36 | # Run prediction 37 | import tensorflow as tf 38 | from tensorflow import keras 39 | from skimage.data import chelsea 40 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 41 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 42 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 43 | # [('n02124075', 'Egyptian_cat', 0.8999013), 44 | # ('n02123159', 'tiger_cat', 0.012704549), 45 | # ('n02123045', 'tabby', 0.009713952), 46 | # ('n07760859', 'custard_apple', 0.00056676986), 47 | # ('n02487347', 'macaque', 0.00050636294)] 48 | ``` 49 | *** 50 | -------------------------------------------------------------------------------- /keras_cv_attention_models/convnext/convnext_v2.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.convnext.convnext import ConvNeXt 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def ConvNeXtV2( 6 | num_blocks=[3, 3, 9, 3], 7 | out_channels=[96, 192, 384, 768], 8 | stem_width=-1, 9 | layer_scale_init_value=0, # 1e-6 for v1, 0 for v2 10 | use_grn=True, # False for v1, True for v2 11 | head_init_scale=1.0, 12 | layer_norm_epsilon=1e-6, # 1e-5 for ConvNeXtXXlarge, 1e-6 for others 13 | output_num_filters=-1, # If apply additional dense + activation before output dense, <0 for not using 14 | input_shape=(224, 224, 3), 15 | num_classes=1000, 16 | activation="gelu", 17 | drop_connect_rate=0.1, 18 | classifier_activation="softmax", 19 | dropout=0, 20 | pretrained=None, 21 | model_name="convnext_v2", 22 | kwargs=None, 23 | ): 24 | return ConvNeXt(**locals()) 25 | 26 | 27 | @register_model 28 | def ConvNeXtV2Atto(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 29 | num_blocks = [2, 2, 6, 2] 30 | out_channels = [40, 80, 160, 320] 31 | return ConvNeXtV2(**locals(), model_name="convnext_v2_atto", **kwargs) 32 | 33 | 34 | @register_model 35 | def ConvNeXtV2Femto(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 36 | num_blocks = [2, 2, 6, 2] 37 | out_channels = [48, 96, 192, 384] 38 | return ConvNeXtV2(**locals(), model_name="convnext_v2_femto", **kwargs) 39 | 40 | 41 | @register_model 42 | def ConvNeXtV2Pico(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 43 | num_blocks = [2, 2, 6, 2] 44 | out_channels = [64, 128, 256, 512] 45 | return ConvNeXtV2(**locals(), model_name="convnext_v2_pico", **kwargs) 46 | 47 | 48 | @register_model 49 | def ConvNeXtV2Nano(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 50 | num_blocks = [2, 2, 8, 2] 51 | out_channels = [80, 160, 320, 640] 52 | return ConvNeXtV2(**locals(), model_name="convnext_v2_nano", **kwargs) 53 | 54 | 55 | @register_model 56 | def ConvNeXtV2Tiny(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 57 | num_blocks = [3, 3, 9, 3] 58 | out_channels = [96, 192, 384, 768] 59 | return ConvNeXtV2(**locals(), model_name="convnext_v2_tiny", **kwargs) 60 | 61 | 62 | @register_model 63 | def ConvNeXtV2Base(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 64 | num_blocks = [3, 3, 27, 3] 65 | out_channels = [128, 256, 
512, 1024] 66 | return ConvNeXtV2(**locals(), model_name="convnext_v2_base", **kwargs) 67 | 68 | 69 | @register_model 70 | def ConvNeXtV2Large(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 71 | num_blocks = [3, 3, 27, 3] 72 | out_channels = [192, 384, 768, 1536] 73 | return ConvNeXtV2(**locals(), model_name="convnext_v2_large", **kwargs) 74 | 75 | 76 | @register_model 77 | def ConvNeXtV2Huge(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 78 | num_blocks = [3, 3, 27, 3] 79 | out_channels = [352, 704, 1408, 2816] 80 | return ConvNeXtV2(**locals(), model_name="convnext_v2_huge", **kwargs) 81 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpt2/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras GPT2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github openai/gpt-2](https://github.com/openai/gpt-2). Paper [Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf). 6 | - Model ported from [huggingface/gpt2](https://huggingface.co/gpt2). 7 | - References [Github karpathy/nanoGPT](https://github.com/karpathy/nanoGPT) and [Github jaymody/picoGPT](https://github.com/jaymody/picoGPT). 8 | ## Models 9 | - For `GPT2_XLarge`, needs to download 2 file parts `gpt2_xlarge_webtext.1.h5` and `gpt2_xlarge_webtext.2.h5`. 10 | 11 | | Model | Params | FLOPs | vocab_size | LAMBADA PPL | 12 | | ---------------- | ------- | ------- | ---------- | ----------- | 13 | | [GPT2_Base](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_base_webtext.h5) | 163.04M | 146.42G | 50257 | 35.13 | 14 | | [GPT2_Medium](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_medium_webtext.h5) | 406.29M | 415.07G | 50257 | 15.60 | 15 | | [GPT2_Large](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_large_webtext.h5) | 838.36M | 890.28G | 50257 | 10.87 | 16 | | [GPT2_XLarge](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_xlarge_webtext.1.h5), [+.2](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_xlarge_webtext.2.h5) | 1.638B | 1758.3G | 50257 | 8.63 | 17 | ## Usage 18 | ```py 19 | from keras_cv_attention_models import gpt2 20 | 21 | mm = gpt2.GPT2_Base() 22 | _ = mm.run_prediction("hello world", num_samples=1, max_new_tokens=100) 23 | # hello world. I mean, just because we call ourselves anorexic, with a very strong genetic, doesn't mean we are human. 24 | # 25 | # And so there we have it. And we've just got to get through going through the rest of our lives. 26 | # 27 | # 28 | # I mean, it's a real challenge right now. And we know, we've already talked about the ethical issues. And so, I think, you know, the human body is a very dangerous thing, and the ethical issues 29 | # --------------- 30 | ``` 31 | **Set `include_top=False`** to exclude model head layer. 
32 | ```py 33 | from keras_cv_attention_models import gpt2 34 | 35 | mm = gpt2.GPT2_Base(include_top=False) 36 | # >>>> Load pretrained from: ~/.keras/models/gpt2_base_webtext.h5 37 | print(f"{mm.output_shape = }") 38 | # mm.output_shape = (None, 1024, 768) 39 | ``` 40 | **Set `pretrained="huggingface"`** for converting and loading weights from huggingface `transformers` pacakge. 41 | ```py 42 | from keras_cv_attention_models import gpt2 43 | 44 | mm = gpt2.GPT2_Medium(pretrained="huggingface") 45 | # Load and convert weights from huggingface 46 | # >>>> Save to: ~/.keras/models/gpt2_medium_huggingface.h5 47 | _ = mm.run_prediction("hello world", num_samples=1, max_new_tokens=100) 48 | # hello world, and he'll meet you in the afternoon and ask you to think about your career, and then I'll return. I'll write something up, and after that I'll have you come over."<|endoftext|>BALTIMORE -- The Baltimore Sun has been the one to expose the violence and destruction of the Baltimore riots that led to the death of Freddie Gray, and it's not your typical public servant. 49 | # 50 | # The Sun, which is owned by the Baltimore-based News Corp, went public with 51 | # --------------- 52 | ``` 53 | *** 54 | -------------------------------------------------------------------------------- /keras_cv_attention_models/keras_core_functional.py: -------------------------------------------------------------------------------- 1 | import keras_core as keras 2 | from keras_core.ops import * 3 | from keras_core.ops import concatenate as concat 4 | from keras_core.ops import mean as reduce_mean 5 | from keras_core.ops import max as reduce_max 6 | from keras_core.ops import min as reduce_min 7 | from keras_core.ops import power as pow 8 | from keras_core.ops import clip as clip_by_value 9 | 10 | 11 | def extract_patches(images, sizes=1, strides=1, rates=1, padding="valid", name=None): 12 | return keras.ops.image.extract_patches( 13 | images, 14 | size=sizes[1:-1] if isinstance(sizes, int) or len(sizes) > 2 else sizes, 15 | strides=strides[1:-1] if isinstance(strides, int) or len(strides) > 2 else strides, 16 | dilation_rate=rates[1:-1] if isinstance(rates, int) or len(rates) > 2 else rates, 17 | padding=padding.lower(), 18 | data_format=keras.backend.image_data_format(), 19 | ) 20 | 21 | 22 | def gather(inputs, indices, axis=None, batch_dims=0, name=None): 23 | """Defaults axis=None means the first non-batch dimension""" 24 | axis = batch_dims if axis is None else (len(inputs.shape) + axis if axis < 0 else axis) 25 | return keras.ops.take(inputs, indices, axis=axis) 26 | 27 | 28 | def l2_normalize(inputs, axis=None, epsilon=1e-12, name=None): 29 | return inputs / keras.ops.sqrt(keras.ops.maximum(keras.ops.sum(inputs**2, axis=axis, keepdims=True), epsilon)) 30 | 31 | 32 | def norm(inputs, ord="euclidean", axis=1, keepdims=False, name=None): 33 | return keras.ops.sqrt(keras.ops.sum(inputs**2, axis=axis, keepdims=True)) 34 | 35 | 36 | def resize(images, size, method="bilinear", preserve_aspect_ratio=False, antialias=False, name=None): 37 | return keras.ops.image.resize(images, size, interpolation=method, antialias=antialias, data_format=keras.backend.image_data_format()) 38 | 39 | 40 | def reduce_sum(inputs, axis=None, keepdims=False, name=None): 41 | axis = () if axis is None else axis 42 | if isinstance(inputs, (list, tuple)) and axis == 0: 43 | rr = inputs[0] 44 | for ii in inputs[1:]: 45 | rr += ii 46 | return rr 47 | else: 48 | # return wrapper(lambda xx: xx.sum(dim=axis, keepdim=keepdims), inputs, name=name) 49 | 
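        # Regular tensor input (not a list summed over axis 0): defer to keras_core's own sum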
return keras.ops.sum(inputs, axis=axis, keepdims=keepdims) 50 | 51 | 52 | def rsqrt(inputs, name=None): 53 | return keras.ops.true_divide(1, keras.ops.sqrt(inputs)) 54 | 55 | 56 | def split(inputs, num_or_size_splits, axis=0, num=None, name="split"): 57 | from builtins import sum 58 | 59 | if isinstance(num_or_size_splits, int): 60 | return keras.ops.split(inputs, num_or_size_splits, axis=axis) 61 | 62 | axis = (len(inputs.shape) + axis) if axis < 0 else axis 63 | split_axis_shape = inputs.shape[axis] 64 | assert split_axis_shape is not None 65 | 66 | size_splits = num_or_size_splits 67 | size_splits = [0 if ii is None or ii == -1 else ii for ii in size_splits] 68 | num_unknown_dim = sum([ii == 0 for ii in size_splits]) 69 | assert num_unknown_dim < 2, "At most one unknown dimension in num_or_size_splits: {}".format(num_or_size_splits) 70 | 71 | if num_unknown_dim == 1: 72 | size_splits = [(split_axis_shape - sum(size_splits)) if ii == 0 else ii for ii in size_splits] 73 | 74 | cum_split = [sum(size_splits[: id + 1]) for id, _ in enumerate(size_splits)] 75 | # len(keras.ops.split(np.ones([2, 6]), [2, 2, 2], axis=-1)) == 4 76 | # len(keras.ops.split(keras.layers.Input([6]), [2, 2, 2], axis=-1)) == 3 77 | return keras.ops.split(inputs, cum_split, axis=axis)[: len(size_splits)] 78 | -------------------------------------------------------------------------------- /keras_cv_attention_models/stable_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend as __backend__ 2 | from keras_cv_attention_models.stable_diffusion.stable_diffusion import StableDiffusion 3 | from keras_cv_attention_models.stable_diffusion.unet import UNet, UNetTest 4 | from keras_cv_attention_models.stable_diffusion.encoder_decoder import Encoder, Decoder 5 | from keras_cv_attention_models.stable_diffusion.eval_func import RunPrediction 6 | 7 | if __backend__.is_tensorflow_backend: 8 | from keras_cv_attention_models.stable_diffusion.data import build_tf_dataset as build_dataset 9 | else: 10 | from keras_cv_attention_models.stable_diffusion.data import build_torch_dataset as build_dataset 11 | 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion). 15 | Paper [PDF 2112.10752 High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/pdf/2112.10752.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ image_shape: model image input shape and generated image shape. 19 | Should have exactly 3 inputs channels like `(224, 224, 3)`. 20 | Inner latents inpuit shape for UNet and Decode is `[image_shape[0] // 8, image_shape[1] // 8, 4]`. 21 | clip_model: str value like `beit.ViTTextLargePatch14` for models from this package under `keras_cv_attention_models`. 22 | Also can be a built model, or None for not using. 23 | unet_model: str value like `stable_diffusion.UNet` for models from this package under `keras_cv_attention_models`. 24 | Also can be a built model, or None for not using. 25 | decoder_model: str value like `stable_diffusion.Decoder` for models from this package under `keras_cv_attention_models`. 26 | Also can be a built model, or None for not using. 27 | encoder_model: str value like `stable_diffusion.Encoder` for models from this package under `keras_cv_attention_models`. 28 | Also can be a built model, or None for not using. 29 | clip_model_kwargs: dict value for kwargs used for building `clip_model`. 
30 | unet_model_kwargs: dict value for kwargs used for building `unet_model`. 31 | decoder_model_kwargs: dict value for kwargs used for building `decoder_model`. 32 | encoder_model_kwargs: dict value for kwargs used for building `encoder_model`. 33 | caption_tokenizer: str value in ['GPT2Tokenizer', 'SimpleTokenizer', 'SentencePieceTokenizer'], 34 | or tiktoken one ['gpt2', 'r50k_base', 'p50k_base', 'cl100k_base'], 35 | or specified built tokenizer. 36 | num_steps: int value for the number of DDIM sampling steps, also means total denoising steps. 37 | num_training_steps: int value for total denoising steps during training. 38 | ddim_discretize: one of ["uniform", "quad"] for time_steps sampling `num_steps` method from `num_training_steps`. 39 | linear_start: float value for `beta` start value. 40 | linear_end: float value for `beta` end value. 41 | ddim_eta: float value for calculating `ddim_sigma`. 0 makes the sampling process deterministic. 42 | 43 | Returns: 44 | A `StableDiffusion` instance. 45 | """ 46 | 47 | StableDiffusion.__doc__ = __head_doc__ + """ 48 | Args: 49 | """ + __tail_doc__ + """ 50 | Model architectures: 51 | | Model | Params | FLOPs | Input | 52 | | ------------------- | ------ | ------- | ------------------- | 53 | | ViTTextLargePatch14 | 123.1M | 6.67G | [None, 77] | 54 | | Encoder | 34.16M | 559.6G | [None, 512, 512, 3] | 55 | | UNet | 859.5M | 404.4G | [None, 64, 64, 4] | 56 | | Decoder | 49.49M | 1259.5G | [None, 64, 64, 4] | 57 | """ 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/davit/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras DaViT___ 2 | *** 3 | 4 | ## Summary 5 | - DaViT article: [PDF 2204.03645 DaViT: Dual Attention Vision Transformers](https://arxiv.org/pdf/2204.03645.pdf). 6 | - Model weights reloaded from [Github dingmyu/davit](https://github.com/dingmyu/davit). 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------- | ------ | ------ | ----- | -------- | -------- | 12 | | DaViT_T | 28.36M | 4.56G | 224 | 82.8 | [davit_t_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/davit/davit_t_imagenet.h5) | 13 | | DaViT_S | 49.75M | 8.83G | 224 | 84.2 | [davit_s_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/davit/davit_s_imagenet.h5) | 14 | | DaViT_B | 87.95M | 15.55G | 224 | 84.6 | [davit_b_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/davit/davit_b_imagenet.h5) | 15 | | DaViT_L, 21k | 196.8M | 103.2G | 384 | 87.5 | | 16 | | DaViT_H, 1.5B | 348.9M | 327.3G | 512 | 90.2 | | 17 | | DaViT_G, 1.5B | 1.406B | 1.022T | 512 | 90.4 | | 18 | 19 | **Self tested accuracy**. There may be some detail differences in model output layer or evaluating process. 20 | ```sh 21 | CUDA_VISIBLE_DEVICES='0' ./eval_script.py -m davit.DaViT_T 22 | # >>>> Accuracy top1: 0.82276 top5: 0.96152 23 | ``` 24 | | Model | Self tested Top1 Acc | 25 | | ------- | -------------------- | 26 | | DaViT_T | 82.276 | 27 | | DaViT_S | 83.810 | 28 | | DaViT_B | 84.142 | 29 | ## Usage 30 | ```py 31 | from keras_cv_attention_models import davit 32 | 33 | # Will download and load pretrained imagenet weights. 
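# Note: the manual keras.applications preprocessing below can also be replaced by the built-in
# mm.preprocess_input / mm.decode_predictions helpers, as used in the later examples of this README.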
34 | mm = davit.DaViT_T(pretrained="imagenet") 35 | 36 | # Run prediction 37 | import tensorflow as tf 38 | from tensorflow import keras 39 | from skimage.data import chelsea 40 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 41 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 42 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 43 | # [('n02124075', 'Egyptian_cat', 0.39985177), ('n02123159', 'tiger_cat', 0.036589254), ...] 44 | ``` 45 | **Change input resolution**. Note: if `input_shape` is not divisible by `window_ratio` (default `32`), inputs will be padded for `window_attention`. 46 | ```py 47 | from keras_cv_attention_models import davit 48 | mm = davit.DaViT_T(input_shape=(376, 227, 3), pretrained="imagenet") 49 | # >>>> Load pretrained from: ~/.keras/models/davit_t_imagenet.h5 50 | 51 | # Run prediction 52 | from skimage.data import chelsea 53 | preds = mm(mm.preprocess_input(chelsea())) 54 | print(mm.decode_predictions(preds)) 55 | # [('n02124075', 'Egyptian_cat', 0.17319576), ('n02123159', 'tiger_cat', 0.017631555), ...] 56 | ``` 57 | Reloading weights with a new `input_shape` not divisible by the default `window_ratio` works in some cases, e.g. when `input_shape` and `window_ratio` are both halved: 58 | ```py 59 | from keras_cv_attention_models import davit 60 | mm = davit.DaViT_T(input_shape=(112, 112, 3), window_ratio=16, pretrained="imagenet") 61 | # >>>> Load pretrained from: ~/.keras/models/davit_t_imagenet.h5 62 | 63 | # Run prediction 64 | from skimage.data import chelsea 65 | preds = mm(mm.preprocess_input(chelsea())) 66 | print(mm.decode_predictions(preds)) 67 | # [('n02124075', 'Egyptian_cat', 0.7279274), ('n02123045', 'tabby', 0.021591123), ...] 68 | ``` 69 | *** 70 | -------------------------------------------------------------------------------- /keras_cv_attention_models/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.segment_anything.sam import SAM, MobileSAM, TinySAM, EfficientViT_SAM_L0 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github facebookresearch/segment-anything](https://github.com/facebookresearch/segment-anything). 5 | Paper [PDF 2304.02643 Segment Anything](https://arxiv.org/abs/2304.02643). 6 | """ 7 | 8 | __call_doc__ = """ 9 | Call args: 10 | image: raw input image. np.array value in shape `[height, width, 3]`, value range in `[0, 255]`. 11 | points: combining with `labels`, specifies point coordinates as background or foreground. 12 | np.array value in shape `[None, 2]`, `2` means `[left, top]`. 13 | left / top value range in `[0, 1]` or `[0, width]` / `[0, height]`. 14 | labels: combining with `points`, marks each point as background or foreground. 15 | np.array value in shape `[None]`, value in `[0, 1]`, where 0 means the corresponding point is background, and 1 foreground. 16 | boxes: specific box area for performing segmentation. 17 | np.array value in shape `[1, 4]`, `4` means `[left, top, right, bottom]`. 18 | left and right / top and bottom value range in `[0, 1]` or `[0, width]` / `[0, height]`. 19 | Supports only a single box as input. 20 | masks: NOT tested. 21 | mask_threshold: float value for regarding model output where `masks > mask_threshold` as True. 22 | return_logits: boolean value if returning boolean mask or logits mask. Default False for boolean mask.
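    Example call, a minimal sketch built only from the Call args above and other kecam usage patterns; treating the
    documented outputs as a 3-value return and passing the prompts as keyword arguments are assumptions here:
    >>> import numpy as np
    >>> from keras_cv_attention_models import segment_anything, test_images
    >>> mm = segment_anything.MobileSAM()
    >>> points, labels = np.array([[0.5, 0.5]]), np.array([1])  # one foreground point at the image center
    >>> masks, iou_predictions, low_res_masks = mm(test_images.cat(), points=points, labels=labels)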
23 | 24 | Call returns: 25 | masks: is all masks output, and it's `4` masks by default, specified by `MaskDecoder` parameter `num_mask_tokens`. 26 | Default shape is `[4, image_height, image_width]`. 27 | `masks[0]` is the output of token 0, which is said better for using if segmenting **single object with multi prompts**. 28 | `masks[1:]` are intended for ambiguous input prompts, and `iou_predictions[1:]`** are the corresponding confidences, 29 | which can be used for picking the highest score one from `masks[1:]`. 30 | iou_predictions: is the corresponding masks confidences. Default shape is `[4]`. 31 | low_res_masks: is the raw output from `MaskDecoder`. Default shape is `[4, 256, 256]`. 32 | """ 33 | 34 | __tail_doc__ = """ image_shape: int or list of 2 int like [1024, 1024]. 35 | embed_dims: inner channels for prompt encoder. 36 | mask_hidden_dims: `MaskEncoder` hidden channels. 37 | pretrained: one of `None` (random initialization) or 'sam' (pre-training on SA-1B from Segment Anything paper). 38 | Will try to download and load pre-trained model weights if not None. 39 | 40 | Returns: 41 | A `keras.Model` instance. 42 | """ + __call_doc__ 43 | 44 | SAM.__doc__ = __head_doc__ + """ 45 | Init args: 46 | image_encoder: string or built image encoder model. Currently string can be one of ["TinyViT_5M", "EfficientViT_L0"]. 47 | mask_decoder: string or built mask decoder model. Currently string can be one of ["sam_mask_decoder", "tiny_sam_mask_decoder"]. 48 | name: string, model name. 49 | """ + __tail_doc__ + """ 50 | Model architectures: 51 | | Model | Params | FLOPs | Input | COCO val mask AP | 52 | | ------------------- | ------ | ----- | ----- | ---------------- | 53 | | MobileSAM | 5.74M | 39.4G | 1024 | 41.0 | 54 | | TinySAM | 5.74M | 39.4G | 1024 | 41.9 | 55 | | EfficientViT_SAM_L0 | 30.73M | 35.4G | 512 | 45.7 | 56 | """ 57 | 58 | SAM.__call__.__doc__ = __call_doc__ 59 | 60 | MobileSAM.__doc__ = __head_doc__ + """ 61 | Args: 62 | """ + __tail_doc__ 63 | 64 | EfficientViT_SAM_L0.__doc__ = MobileSAM.__doc__ 65 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fastervit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.fastervit.fastervit import ( 2 | FasterViT, 3 | FasterViT0, 4 | FasterViT1, 5 | FasterViT2, 6 | FasterViT3, 7 | FasterViT4, 8 | FasterViT5, 9 | FasterViT6, 10 | switch_to_deploy, 11 | ) 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github NVlabs/FasterViT](https://github.com/NVlabs/FasterViT). 15 | Paper [PDF 2306.06189 FasterViT: Fast Vision Transformers with Hierarchical Attention](https://arxiv.org/pdf/2306.06189.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ window_ratios: window split ratio. It's mainly for the 3rd stack, that `window_size = (height // window_ratio, width // window_ratio)`. 19 | `1` means not using window partition, while `window_size == (height, width)`. 20 | carrier_token_size: int value indicates carrier token size for the 3rd stack. 21 | pos_scale: If pretrained weights are from different input_shape or window_size, pos_scale is previous actually using window_size. 22 | use_propagation: boolean value if using `do_propagation` block at the end of the 3rd stack. 23 | layer_scale: layer scale init value. Default `-1` means not applying, any value `>=0` will add a scale value for each block output. 24 | [Going deeper with Image Transformers](https://arxiv.org/pdf/2103.17239.pdf). 
25 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 26 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 27 | activation: activation used in whole model, default `gelu`. 28 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 29 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 30 | A higher value means a higher probability will drop the deep branch. 31 | or `0` to disable (default). 32 | dropout: dropout rate if top layers is included. 33 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 34 | Set `classifier_activation=None` to return the logits of the "top" layer. 35 | pretrained: one of None or "imagenet". 36 | Will try to download and load pre-trained model weights if not None. 37 | 38 | Returns: 39 | A `keras.Model` instance. 40 | """ 41 | 42 | FasterViT.__doc__ = __head_doc__ + """ 43 | Args: 44 | num_blocks: number of blocks in each stack. 45 | num_heads: num heads for each stack. 46 | stem_hidden_dim: hidden dimension for the 1st stem `Conv2D`. 47 | embed_dim: basic hidden dims, expand * 2 for each stack. 48 | mlp_ratio: expand ratio for mlp blocks hidden channel. 49 | model_name: string, model name. 50 | """ + __tail_doc__ + """ 51 | Model architectures: 52 | | Model | Params | FLOPs | Input | Top1 Acc | 53 | | ---------- | -------- | ------- | ----- | -------- | 54 | | FasterViT0 | 31.40M | 3.51G | 224 | 82.1 | 55 | | FasterViT1 | 53.37M | 5.52G | 224 | 83.2 | 56 | | FasterViT2 | 75.92M | 9.00G | 224 | 84.2 | 57 | | FasterViT3 | 159.55M | 18.75G | 224 | 84.9 | 58 | | FasterViT4 | 351.12M | 41.57G | 224 | 85.4 | 59 | | FasterViT5 | 957.52M | 114.08G | 224 | 85.6 | 60 | | FasterViT6 | 1360.33M | 144.13G | 224 | 85.8 | 61 | """ 62 | 63 | FasterViT0.__doc__ = __head_doc__ + """ 64 | Args: 65 | """ + __tail_doc__ 66 | 67 | FasterViT1.__doc__ = FasterViT0.__doc__ 68 | FasterViT2.__doc__ = FasterViT0.__doc__ 69 | FasterViT3.__doc__ = FasterViT0.__doc__ 70 | FasterViT4.__doc__ = FasterViT0.__doc__ 71 | FasterViT5.__doc__ = FasterViT0.__doc__ 72 | FasterViT6.__doc__ = FasterViT0.__doc__ 73 | -------------------------------------------------------------------------------- /keras_cv_attention_models/imagenet/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | @tf.keras.utils.register_keras_serializable(package="kecamLoss") 5 | class BinaryCrossEntropyTimm(tf.keras.losses.BinaryCrossentropy): 6 | """ 7 | >>> import torch, timm.loss 8 | >>> from keras_cv_attention_models.imagenet import losses 9 | >>> tt = timm.loss.BinaryCrossEntropy(smoothing=0.0, target_threshold=0.2) 10 | >>> ss = losses.BinaryCrossEntropyTimm(target_threshold=0.2, from_logits=True) 11 | >>> y_true = tf.one_hot(np.random.permutation(20).reshape(2, 10), 10).numpy() 12 | >>> y_true = np.clip(y_true[0] + y_true[1], 0, 1) 13 | >>> y_pred = np.random.uniform(size=(10, 10)) 14 | >>> torch_out = tt(torch.from_numpy(y_pred), torch.from_numpy(y_true)).numpy() 15 | >>> keras_out = ss(y_true, y_pred).numpy() 16 | >>> print(f"{torch_out = }, {keras_out = }") 17 | # torch_out = array(0.9457581, dtype=float32), keras_out = 0.945758044719696 18 | """ 19 | 20 | def __init__(self, target_threshold=0.0, label_smoothing=0.0, **kwargs): 21 | super().__init__(label_smoothing=label_smoothing, **kwargs) 22 | 
self.target_threshold = target_threshold 23 | self.label_smoothing = label_smoothing 24 | 25 | def call(self, y_true, y_pred): 26 | target_threshold = tf.cast(self.target_threshold, y_true.dtype) 27 | y_true = tf.where(y_true > target_threshold, tf.ones_like(y_true), tf.zeros_like(y_true)) 28 | return super().call(y_true, y_pred) 29 | 30 | def get_config(self): 31 | config = super().get_config() 32 | config.update({"target_threshold": self.target_threshold, "label_smoothing": self.label_smoothing}) 33 | return config 34 | 35 | 36 | @tf.keras.utils.register_keras_serializable(package="kecamLoss") 37 | class DistillKLDivergenceLoss(tf.keras.losses.Loss): 38 | """[PDF 2106.05237 Knowledge distillation: A good teacher is patient and consistent](https://arxiv.org/pdf/2106.05237.pdf) 39 | Modified according [Knowledge distillation recipes](https://keras.io/examples/keras_recipes/better_knowledge_distillation/) 40 | 41 | Temperature affecting: 42 | >>> teacher_prob = np.array([0, 0.2, 0.4, 0.6, 0.8, 1.0]) 43 | >>> _ = [print("temperature:", temp, tf.nn.softmax(teacher_prob / temp).numpy()) for temp in [0.1, 1, 10, 20]] 44 | >>> # temperature: 0.1 [3.92559586e-05 2.90064480e-04 2.14330272e-03 1.58369840e-02 1.17020363e-01 8.64670029e-01] 45 | >>> # temperature: 1 [0.09542741 0.11655531 0.14236097 0.17388009 0.21237762 0.25939861] 46 | >>> # temperature: 10 [0.1584458 0.16164661 0.16491209 0.16824354 0.17164228 0.17510968] 47 | >>> # temperature: 20 [0.16252795 0.16416138 0.16581123 0.16747766 0.16916084 0.17086094] 48 | """ 49 | 50 | def __init__(self, temperature=10, **kwargs): 51 | super().__init__(**kwargs) 52 | self.temperature = temperature 53 | # self.kl_divergence = tf.keras.losses.KLDivergence() 54 | 55 | def call(self, teacher_prob, student_prob): 56 | return tf.losses.kl_divergence( 57 | tf.nn.softmax(teacher_prob / self.temperature, axis=-1), 58 | tf.nn.softmax(student_prob / self.temperature, axis=-1), 59 | ) 60 | 61 | 62 | # Not using, from VOLO with mix_token lambda 63 | def token_label_class_loss(y_true, y_pred): 64 | # tf.print(", y_true:", y_true.shape, "y_pred:", y_pred.shape, end="") 65 | if y_pred.shape[-1] != y_true.shape[-1]: 66 | y_pred, cls_lambda = y_pred[:, :-1], y_pred[:, -1:] 67 | y_true = tf.cast(y_true, y_pred.dtype) 68 | y_true = cls_lambda * y_true + (1 - cls_lambda) * y_true[::-1] 69 | return keras.losses.categorical_crossentropy(y_true, y_pred, from_logits=True) 70 | -------------------------------------------------------------------------------- /keras_cv_attention_models/tinyvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.tinyvit.tinyvit import TinyViT, TinyViT_5M, TinyViT_11M, TinyViT_21M 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github microsoft/TinyViT](https://github.com/microsoft/Cream/tree/main/TinyViT). 5 | Paper [PDF 2207.10666 TinyViT: Fast Pretraining Distillation for Small Vision Transformers](https://arxiv.org/pdf/2207.10666.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 
12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicating the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: int value indicating the layer scale init value for each stack. Default 0 for not using. 17 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet' or 'imagenet21k-ft1k' (pre-trained on ImageNet21k and fine-tuned on ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | TinyViT.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | out_channels: output channels for each stack. 32 | block_types: block types for each stack, 33 | - `conv` or any word starting with `c` / `C` means `mlp_block_with_depthwise_conv` block. 34 | - `transform` or any word starting with `t` / `T` means `multi_head_self_attention` block. 35 | value could be in a format like `"cctt"` or `"CCTT"` or `["conv", "conv", "transform", "transform"]`. 36 | num_heads: int or list of int value indicating the attention heads number for each transformer stack. 37 | window_ratios: int or list of int value indicating the attention heads window ratio number for each transformer stack. 38 | Actually uses `window_size = ceil(cur_input_shape / window_ratio)`. 39 | For `input_shape=(224, 224, 3)` window_sizes will be `[7, 7, 14, 7]`, for `(384, 384, 3)` they will be `[12, 12, 24, 12]`. 40 | mlp_ratio: int value indicating the expand ratio for mlp blocks hidden channel in each stack. 41 | model_name: string, model name.
42 | """ + __tail_doc__ + """ 43 | Model architectures: 44 | | Model | Params | FLOPs | Input | Top1 Acc | 45 | | -------------------- | ------ | ----- | ----- | -------- | 46 | | TinyViT_5M, distill | 5.4M | 1.3G | 224 | 79.1 | 47 | | - imagenet21k-ft1k | 5.4M | 1.3G | 224 | 80.7 | 48 | | TinyViT_11M, distill | 11M | 2.0G | 224 | 81.5 | 49 | | - imagenet21k-ft1k | 11M | 2.0G | 224 | 83.2 | 50 | | TinyViT_21M, distill | 21M | 4.3G | 224 | 83.1 | 51 | | - imagenet21k-ft1k | 21M | 4.3G | 224 | 84.8 | 52 | | | 21M | 13.8G | 384 | 86.2 | 53 | | | 21M | 27.0G | 512 | 86.5 | 54 | """ 55 | 56 | TinyViT_5M.__doc__ = __head_doc__ + """ 57 | Args: 58 | """ + __tail_doc__ 59 | 60 | TinyViT_11M.__doc__ = TinyViT_5M.__doc__ 61 | TinyViT_21M.__doc__ = TinyViT_5M.__doc__ 62 | -------------------------------------------------------------------------------- /keras_cv_attention_models/clip/torch_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from PIL import Image 5 | from torch.utils.data import Dataset, DataLoader 6 | 7 | 8 | def read_from_tsv(data_path): 9 | import csv 10 | 11 | delimiter = "\t" if data_path.endswith(".tsv") else "," 12 | train_images, train_captions, test_images, test_captions, base_path, is_train = [], [], [], [], "", True 13 | with open(data_path) as ff: 14 | for ii in csv.reader(ff, delimiter=delimiter): 15 | if ii[0] == "base_path": # special keys for info 16 | base_path = os.path.expanduser(ii[1]) 17 | elif ii[0] == "TEST": # Use this as indicator for start of test set 18 | is_train = False 19 | elif is_train: 20 | train_images.append(ii[0]) 21 | train_captions.append(ii[1]) 22 | else: 23 | test_images.append(ii[0]) 24 | test_captions.append(ii[1]) 25 | if len(base_path) > 0: 26 | train_images = [os.path.join(base_path, ii) for ii in train_images] 27 | test_images = [os.path.join(base_path, ii) for ii in test_images] 28 | return train_images, train_captions, test_images, test_captions 29 | 30 | 31 | class CaptionDataset(Dataset): 32 | def __init__(self, images, captions, tokenizer, is_train=True, image_size=224): 33 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, Resize, InterpolationMode, ToTensor 34 | 35 | self.images, self.captions, self.tokenizer = images, captions, tokenizer 36 | self.context_length = self.tokenizer.context_length 37 | 38 | # self.mean, self.std = (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) # value from openai/CLIP 39 | self.mean, self.std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225) 40 | interpolation = InterpolationMode.BICUBIC 41 | image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size) 42 | self.transforms = Compose( 43 | [ 44 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=interpolation) if is_train else Resize(image_size, interpolation=interpolation), 45 | lambda image: image.convert("RGB"), 46 | ToTensor(), 47 | Normalize(mean=self.mean, std=self.std), 48 | ] 49 | ) 50 | 51 | def __len__(self): 52 | return len(self.images) 53 | 54 | def __getitem__(self, idx): 55 | images = self.transforms(Image.open(str(self.images[idx]))) 56 | texts = torch.from_numpy(self.tokenizer(str(self.captions[idx]))) 57 | return images, texts 58 | 59 | 60 | def collate_wrapper(batch): 61 | images, texts = list(zip(*batch)) 62 | return (torch.stack(images), torch.stack(texts)), torch.arange(len(batch)) 63 | 64 | 65 | def init_dataset(data_path, caption_tokenizer, 
batch_size=64, image_size=224, num_workers=8): 66 | train_images, train_captions, test_images, test_captions = read_from_tsv(data_path) 67 | 68 | train_dataset = CaptionDataset(train_images, train_captions, tokenizer=caption_tokenizer, is_train=True, image_size=image_size) 69 | train_dataloader = DataLoader( 70 | train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, collate_fn=collate_wrapper, pin_memory=True, sampler=None, drop_last=True 71 | ) 72 | 73 | test_dataset = CaptionDataset(test_images, test_captions, tokenizer=caption_tokenizer, is_train=False, image_size=image_size) 74 | test_dataloader = DataLoader( 75 | test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_wrapper, pin_memory=True, sampler=None, drop_last=True 76 | ) 77 | 78 | return train_dataloader, test_dataloader 79 | -------------------------------------------------------------------------------- /keras_cv_attention_models/cspnext/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras CSPNeXt___ 2 | *** 3 | 4 | ## Summary 5 | - CSPNeXt is the backbone from article: [PDF 2212.07784 RTMDet: An Empirical Study of Designing Real-Time Object Detectors](https://arxiv.org/abs/2212.07784). 6 | - Model weights ported from [Github open-mmlab/mmdetection/rtmdet](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet#classification). 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------- | ------ | ----- | ----- | -------- | -------- | 12 | | CSPNeXtTiny | 2.73M | 0.34G | 224 | 69.44 | [cspnext_tiny_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_tiny_imagenet.h5) | 13 | | CSPNeXtSmall | 4.89M | 0.66G | 224 | 74.41 | [cspnext_small_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_small_imagenet.h5) | 14 | | CSPNeXtMedium | 13.05M | 1.92G | 224 | 79.27 | [cspnext_medium_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_medium_imagenet.h5) | 15 | | CSPNeXtLarge | 27.16M | 4.19G | 224 | 81.30 | [cspnext_large_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_large_imagenet.h5) | 16 | | CSPNeXtXLarge | 48.85M | 7.75G | 224 | 82.10 | [cspnext_xlarge_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_xlarge_imagenet.h5) | 17 | 18 | ## Usage 19 | ```py 20 | from keras_cv_attention_models import cspnext, test_images 21 | mm = cspnext.CSPNeXtTiny() 22 | 23 | # Run prediction 24 | preds = mm(mm.preprocess_input(test_images.cat())) 25 | print(mm.decode_predictions(preds)) 26 | # [('n02124075', 'Egyptian_cat', 0.46106383), ('n02123045', 'tabby', 0.19603978), ...] 27 | ``` 28 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 29 | ```py 30 | from keras_cv_attention_models import cspnext 31 | model = cspnext.CSPNeXtTiny(input_shape=(None, None, 3), num_classes=0) 32 | 33 | print(model(np.ones([1, 223, 123, 3])).shape) 34 | # (1, 7, 4, 384) 35 | print(model(np.ones([1, 32, 526, 3])).shape) 36 | # (1, 1, 17, 384) 37 | ``` 38 | **Using PyTorch backend** by set `KECAM_BACKEND='torch'` environment variable. 
39 | ```py 40 | os.environ['KECAM_BACKEND'] = 'torch' 41 | from keras_cv_attention_models import cspnext, test_images 42 | mm = cspnext.CSPNeXtSmall(input_shape=(219, 112, 3)) 43 | # >>>> Using PyTorch backend 44 | # >>>> Load pretrained from: ~/.keras/models/cspnext_small_imagenet.h5 45 | 46 | # Run prediction 47 | preds = mm(mm.preprocess_input(test_images.cat())) 48 | print(mm.decode_predictions(preds)) 49 | # [('n02124075', 'Egyptian_cat', 0.7909507), ('n02123045', 'tabby', 0.038315363), ...] 50 | ``` 51 | ## Verification with PyTorch version 52 | ```py 53 | inputs = np.random.uniform(size=(1, 224, 224, 3)).astype("float32") 54 | 55 | """ PyTorch CSPNeXt """ 56 | from mmdet import models 57 | torch_model = models.backbones.CSPNeXt() 58 | import torch 59 | ss = torch.load('cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth') 60 | ss = {kk.replace('backbone.', ''): vv for kk, vv in ss['state_dict'].items() if kk.startswith('backbone.')} 61 | torch_model.load_state_dict(ss) 62 | _ = torch_model.eval() 63 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2))[-1].permute([0, 2, 3, 1]).detach().numpy() 64 | 65 | """ Keras CSPNeXtLarge """ 66 | from keras_cv_attention_models import cspnext 67 | mm = cspnext.CSPNeXtLarge(pretrained="imagenet", num_classes=0) # Exclude header 68 | keras_out = mm(inputs).numpy() 69 | 70 | """ Verification """ 71 | print(f"{np.allclose(torch_out, keras_out, atol=1e-4) = }") 72 | # np.allclose(torch_out, keras_out, atol=1e-4) = True 73 | ``` 74 | *** 75 | -------------------------------------------------------------------------------- /keras_cv_attention_models/ghostnet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras GhostNetV2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github huawei-noah/ghostnetv2_pytorch](https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv2_pytorch). Paper [PDF GhostNetV2: Enhance Cheap Operation with Long-Range Attention](https://openreview.net/pdf/6db544c65bbd0fa7d7349508454a433c112470e2.pdf). 6 | - `GhostNet_100` model weights ported from official publication [Github huawei-noah/ghostnet_pytorch](https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnet_pytorch). Paper [PDF 1911.11907 GhostNet: More Features from Cheap Operations](https://arxiv.org/pdf/1911.11907.pdf). 7 | - `GhostNet_050` and `GhostNet_130` model weights ported from [Github PaddlePaddle/PaddleClas](https://github.com/PaddlePaddle/PaddleClas). 8 | 9 | ![ghostnetv2](https://user-images.githubusercontent.com/5744524/202699896-4c429db1-8038-4dc9-992b-d355d1cfee6e.PNG) 10 | *** 11 | 12 | ## Models 13 | - `GhostNetV2_100` should be same with `GhostNetV2 (1.0x)`. Weights are ported from official publication. Currently it's only weights with accuracy `74.41` provided. 
14 | 15 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 16 | | -------------- | ------ | ------ | ----- | -------- | -------- | 17 | | GhostNetV2_100 | 6.12M | 168.5M | 224 | 75.3 | [ghostnetv2_100_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnetv2_100_imagenet.h5) | 18 | | GhostNetV2_130 | 8.96M | 271.1M | 224 | 76.9 | [ghostnetv2_130_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnetv2_130_imagenet.h5) | 19 | | GhostNetV2_160 | 12.39M | 400.9M | 224 | 77.8 | [ghostnetv2_160_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnetv2_160_imagenet.h5) | 20 | 21 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 22 | | ------------ | ------ | ------ | ----- | -------- | -------- | 23 | | GhostNet_050 | 2.59M | 42.6M | 224 | 66.88 | [ghostnet_050_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_050_imagenet.h5) | 24 | | GhostNet_100 | 5.18M | 141.7M | 224 | 74.16 | [ghostnet_100_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_100_imagenet.h5) | 25 | | GhostNet_130 | 7.36M | 227.7M | 224 | 75.79 | [ghostnet_130_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_130_imagenet.h5) | 26 | | - ssld | 7.36M | 227.7M | 224 | 79.38 | [ghostnet_130_ssld.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_130_ssld.h5) | 27 | ## Usage 28 | ```py 29 | from keras_cv_attention_models import ghostnet 30 | 31 | # Will download and load pretrained imagenet weights. 32 | mm = ghostnet.GhostNetV2_100(pretrained="imagenet") 33 | 34 | # Run prediction 35 | import tensorflow as tf 36 | from tensorflow import keras 37 | from skimage.data import chelsea 38 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 39 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 40 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 41 | # [('n02124075', 'Egyptian_cat', 0.81426907), ('n02123045', 'tabby', 0.07202001), ...] 42 | ``` 43 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 
44 | ```py 45 | from keras_cv_attention_models import ghostnet 46 | model = ghostnet.GhostNetV2_100(input_shape=(None, None, 3), num_classes=0) 47 | 48 | print(model(np.ones([1, 224, 224, 3])).shape) 49 | # (1, 7, 7, 960) 50 | print(model(np.ones([1, 512, 384, 3])).shape) 51 | # (1, 16, 12, 960) 52 | ``` 53 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pytorch_backend/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import inspect 3 | import torch 4 | import hashlib 5 | 6 | _GLOBAL_CUSTOM_OBJECTS = {} 7 | _GLOBAL_CUSTOM_NAMES = {} 8 | 9 | 10 | def register_keras_serializable(package="Custom", name=None): 11 | def decorator(arg): 12 | """Registers a class with the Keras serialization framework.""" 13 | class_name = name if name is not None else arg.__name__ 14 | registered_name = package + ">" + class_name 15 | 16 | if inspect.isclass(arg) and not hasattr(arg, "get_config"): 17 | raise ValueError("Cannot register a class that does not have a " "get_config() method.") 18 | 19 | # if registered_name in _GLOBAL_CUSTOM_OBJECTS: 20 | # raise ValueError(f"{registered_name} has already been registered to " f"{_GLOBAL_CUSTOM_OBJECTS[registered_name]}") 21 | 22 | # if arg in _GLOBAL_CUSTOM_NAMES: 23 | # raise ValueError(f"{arg} has already been registered to " f"{_GLOBAL_CUSTOM_NAMES[arg]}") 24 | _GLOBAL_CUSTOM_OBJECTS[registered_name] = arg 25 | _GLOBAL_CUSTOM_NAMES[arg] = registered_name 26 | 27 | return arg 28 | 29 | return decorator 30 | 31 | 32 | def validate_file_md5(fpath, file_hash, chunk_size=65535): 33 | """Validates a file against a md5 hash. From keras/utils/data_utils.py""" 34 | hasher = hashlib.md5() 35 | with open(fpath, "rb") as fpath_file: 36 | for chunk in iter(lambda: fpath_file.read(chunk_size), b""): 37 | hasher.update(chunk) 38 | return str(hasher.hexdigest()) == str(file_hash) 39 | 40 | 41 | def _extract_archive(file_path, path=".", archive_format="auto"): 42 | if "zip" in archive_format or (archive_format == "auto" and file_path.endswith(".zip")): 43 | import zipfile 44 | 45 | assert zipfile.is_zipfile(file_path), "Not a zip file: {}".format(file_path) 46 | open_fn = zipfile.ZipFile 47 | elif "tar" in archive_format or (archive_format == "auto" and (file_path.endswith(".tar") or file_path.endswith(".tar.gz"))): 48 | import tarfile 49 | 50 | assert tarfile.is_tarfile(file_path), "Not a tar file: {}".format(file_path) 51 | open_fn = tarfile.open 52 | else: 53 | raise ValueError("Not a supported extract file format: {}".format(file_path)) 54 | 55 | print(">>>> Extract {} -> {}".format(file_path, path)) 56 | with open_fn(file_path) as ff: 57 | ff.extractall(path) 58 | return path 59 | 60 | 61 | def get_file(fname=None, origin=None, cache_subdir="datasets", file_hash=None, extract=False): 62 | # print(f">>>> {fname = }, {origin = }, {cache_subdir = }, {file_hash = }") 63 | save_dir = os.path.join(os.path.expanduser("~"), ".keras", cache_subdir) 64 | if not os.path.exists(save_dir): 65 | os.makedirs(save_dir, exist_ok=True) 66 | fname = os.path.basename(origin) if fname is None else fname 67 | file_path = os.path.join(save_dir, fname) 68 | if os.path.exists(file_path): 69 | if file_hash is not None and not validate_file_md5(file_path, file_hash): 70 | print( 71 | "A local file was found, but it seems to be incomplete or outdated because the md5 file hash does not match the original value of " 72 | f"{file_hash} so we will re-download the data." 
73 | ) 74 | else: 75 | return file_path 76 | 77 | print("Downloading data from {} to {}".format(origin, file_path)) 78 | torch.hub.download_url_to_file(origin, file_path) 79 | if os.path.exists(file_path) and file_hash is not None and not validate_file_md5(file_path, file_hash): 80 | raise ValueError("Incomplete or corrupted file detected. The md5 file hash does not match the provided value {}.".format(file_hash)) 81 | 82 | if extract: 83 | _extract_archive(file_path, path=save_dir) # return tar file path, just like keras one 84 | return file_path 85 | -------------------------------------------------------------------------------- /keras_cv_attention_models/ghostnet/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.ghostnet.ghostnet_v2 import GhostNetV2, GhostNetV2_100, GhostNetV2_130, GhostNetV2_160 2 | from keras_cv_attention_models.ghostnet.ghostnet import GhostNet, GhostNet_050, GhostNet_100, GhostNet_130 3 | 4 | __v2_head_doc__ = """ 5 | Keras implementation of [Gitee mindspore/models/ghostnetv2](https://gitee.com/mindspore/models/tree/master/research/cv/ghostnetv2). 6 | Paper [PDF GhostNetV2: Enhance Cheap Operation with Long-Range Attention](https://openreview.net/pdf/6db544c65bbd0fa7d7349508454a433c112470e2.pdf). 7 | """ 8 | 9 | __tail_doc__ = """ kernel_sizes: kernel_size for each stack. 10 | first_ghost_channels: num channels for first ghost module in each stack. 11 | out_channels: output channels for each stack. 12 | se_ratios: se_ratio for each stack. 13 | strides: stride for each stack. 14 | stem_width: output dimension for stem block. 15 | stem_strides: strides for stem `Conv2D`, default `2`. 16 | num_ghost_module_v1_stacks: num of `ghost_module` stcks on the head, others are `ghost_module_multiply`. 17 | - for `GhostNet` v1 way, default `-1` for all using `ghost_module`. 18 | - for `GhostNetV2` way, default `2` for only using `ghost_module` in the first 2 stacks. 19 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 20 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 21 | activation: activation used in whole model, default "relu". 22 | dropout: dropout rate if top layers is included. 23 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 24 | Set `classifier_activation=None` to return the logits of the "top" layer. 25 | pretrained: One of `[None, "imagenet", "ssld"]`. "ssld" if for `GhostNet_130`. 26 | **kwargs: other parameters if available. 27 | 28 | Returns: 29 | A `keras.Model` instance. 30 | """ 31 | 32 | GhostNetV2.__doc__ = __v2_head_doc__ + """ 33 | Args: 34 | width_mul: expansion ratio of `fist_ghost_channels` and `out_channels` in each block. 35 | model_name: string, model name. 
36 | """ + __tail_doc__ + """ 37 | Model architectures: 38 | | Model | Params | FLOPs | Input | Top1 Acc | 39 | | ----------------- | ------ | ------ | ----- | -------- | 40 | | GhostNetV2_100 | 6.12M | 168.5M | 224 | 74.41 | 41 | | GhostNetV2 (1.0x) | 6.12M | 168.5M | 224 | 75.3 | 42 | | GhostNetV2 (1.3x) | 8.96M | 271.1M | 224 | 76.9 | 43 | | GhostNetV2 (1.6x) | 12.39M | 400.9M | 224 | 77.8 | 44 | """ 45 | 46 | GhostNetV2_100.__doc__ = __v2_head_doc__ + """ 47 | Args: 48 | """ + __tail_doc__ 49 | 50 | GhostNetV2_130.__doc__ = GhostNetV2_100.__doc__ 51 | GhostNetV2_160.__doc__ = GhostNetV2_100.__doc__ 52 | 53 | __v1_head_doc__ = """ 54 | Keras implementation of [Github huawei-noah/ghostnet_pytorch](https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnet_pytorch). 55 | Paper [PDF 1911.11907 GhostNet: More Features from Cheap Operations](https://arxiv.org/pdf/1911.11907.pdf). 56 | """ 57 | 58 | GhostNet.__doc__ = __v1_head_doc__ + """ 59 | Args: 60 | width_mul: expansion ratio of `fist_ghost_channels` and `out_channels` in each block. 61 | stem_width: output dimension for stem block. 62 | model_name: string, model name. 63 | """ + __tail_doc__ + """ 64 | Model architectures: 65 | | Model | Params | FLOPs | Input | Top1 Acc | 66 | | ------------ | ------ | ------ | ----- | -------- | 67 | | GhostNet_050 | 2.59M | 42.6M | 224 | 66.88 | 68 | | GhostNet_100 | 5.18M | 141.7M | 224 | 74.16 | 69 | | GhostNet_130 | 7.36M | 227.7M | 224 | 75.79 | 70 | | - ssld | 7.36M | 227.7M | 224 | 79.38 | 71 | """ 72 | 73 | GhostNet_050.__doc__ = __v1_head_doc__ + """ 74 | Args: 75 | """ + __tail_doc__ 76 | 77 | GhostNet_100.__doc__ = GhostNet_050.__doc__ 78 | GhostNet_130.__doc__ = GhostNet_050.__doc__ 79 | -------------------------------------------------------------------------------- /keras_cv_attention_models/iformer/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras InceptionTransformer___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github sail-sg/iFormer](https://github.com/sail-sg/iFormer). Paper [PDF 2205.12956 Inception Transformer](https://arxiv.org/pdf/2205.12956.pdf). 6 | - Model weights ported from official publication. 
7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------ | ------ | ------ | ----- | -------- | -------- | 12 | | IFormerSmall | 19.9M | 4.88G | 224 | 83.4 | [iformer_small_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_small_224_imagenet.h5) | 13 | | | 20.9M | 16.29G | 384 | 84.6 | [iformer_small_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_small_384_imagenet.h5) | 14 | | IFormerBase | 47.9M | 9.44G | 224 | 84.6 | [iformer_base_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_base_224_imagenet.h5) | 15 | | | 48.9M | 30.86G | 384 | 85.7 | [iformer_base_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_base_384_imagenet.h5) | 16 | | IFormerLarge | 86.6M | 14.12G | 224 | 84.6 | [iformer_large_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_largel_224_imagenet.h5) | 17 | | | 87.7M | 45.74G | 384 | 85.8 | [iformer_large_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_largel_384_imagenet.h5) | 18 | ## Usage 19 | ```py 20 | from keras_cv_attention_models import iformer 21 | 22 | # Will download and load pretrained imagenet weights. 23 | mm = iformer.IFormerSmall(pretrained="imagenet") 24 | 25 | # Run prediction 26 | import tensorflow as tf 27 | from tensorflow import keras 28 | from skimage.data import chelsea 29 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 30 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 31 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 32 | # [('n02124075', 'Egyptian_cat', 0.7471715), ('n02123159', 'tiger_cat', 0.035306472), ...] 33 | ``` 34 | **Change input resolution**. 35 | ```py 36 | from keras_cv_attention_models import iformer 37 | mm = iformer.IFormerSmall(input_shape=(512, 393, 3), pretrained="imagenet") 38 | # >>>> Load pretrained from: ~/.keras/models/iformer_small_384_imagenet.h5 39 | # >>>> Reload mismatched weights: 384 -> (512, 393) 40 | # >>>> Reload layer: stack1_positional_embedding 41 | # ... 42 | 43 | # Run prediction 44 | from skimage.data import chelsea 45 | preds = mm(mm.preprocess_input(chelsea())) 46 | print(mm.decode_predictions(preds)) 47 | # [('n02124075', 'Egyptian_cat', 0.72780704), ('n02123159', 'tiger_cat', 0.11522171), ...] 
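# Note: the '>>>> Reload mismatched weights' log above indicates the positional embedding layers
# are the ones reloaded to fit the new input_shape.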
48 | ``` 49 | ## Verification with PyTorch version 50 | ```py 51 | """ PyTorch iformer_small """ 52 | sys.path.append('../iFormer/') 53 | sys.path.append('../pytorch-image-models/') # Needs timm 54 | import torch 55 | from models import inception_transformer 56 | 57 | torch_model = inception_transformer.iformer_small(pretrained=True) 58 | _ = torch_model.eval() 59 | 60 | """ Keras IFormerSmall """ 61 | from keras_cv_attention_models import iformer 62 | mm = iformer.IFormerSmall(pretrained="imagenet", classifier_activation=None) 63 | 64 | """ Verification """ 65 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 66 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 67 | keras_out = mm(inputs).numpy() 68 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }") 69 | # np.allclose(torch_out, keras_out, atol=1e-5) = True 70 | ``` 71 | -------------------------------------------------------------------------------- /keras_cv_attention_models/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import keras_cv_attention_models as __package__ # don't show `keras_cv_attention_models` under `keras_cv_attention_models.models.` 3 | 4 | 5 | def register_model(model_func): 6 | if not hasattr(__package__.models, model_func.__name__): 7 | setattr(__package__.models, model_func.__name__, model_func) 8 | return model_func 9 | 10 | 11 | def no_grad_if_torch(func): 12 | if __package__.backend.is_torch_backend: 13 | import torch 14 | 15 | def no_grad_call(*args, **kwargs): 16 | with torch.no_grad(): 17 | return func(*args, **kwargs) 18 | 19 | return no_grad_call 20 | else: 21 | return func 22 | 23 | 24 | class FakeModelWrapper: 25 | def __init__(self, models, name="model"): 26 | self.models = models if isinstance(models, (list, tuple)) else [models] 27 | self.name = name 28 | 29 | def cuda(self): 30 | """Torch function""" 31 | self.models = [model.cuda() for model in self.models] 32 | return self 33 | 34 | def cpu(self): 35 | """Torch function""" 36 | self.models = [model.cpu() for model in self.models] 37 | return self 38 | 39 | def float(self): 40 | """Torch function""" 41 | self.models = [model.float() for model in self.models] 42 | return self 43 | 44 | def half(self): 45 | """Torch function""" 46 | self.models = [model.half() for model in self.models] 47 | return self 48 | 49 | def to(self, *args): 50 | """Torch function""" 51 | self.models = [model.to(*args) for model in self.models] 52 | return self 53 | 54 | def _save_load_file_path_rule_(self, file_path=None): 55 | file_path = self.name if file_path is None else file_path 56 | suffix = os.path.splitext(file_path)[1] 57 | if suffix in [".h5", ".keras", ".pt", ".pth"]: 58 | file_path = os.path.splitext(file_path)[0] 59 | save_path_rule = lambda model_name: file_path + "_" + model_name + suffix 60 | else: # Regard as directory 61 | if not os.path.exists(file_path): 62 | os.makedirs(file_path, exist_ok=True) 63 | save_path_rule = lambda model_name: os.path.join(file_path, model_name + ".h5") 64 | return save_path_rule 65 | 66 | def save(self, file_path=None): 67 | """file_path: if suffix in [".h5", ".keras", ".pt", ".pth"], will save as {file_path}_{model_name}.{suffix}, 68 | or will regard as directory, and save to {file_path}/{model_name}.h5 69 | """ 70 | save_path_rule = self._save_load_file_path_rule_(file_path) 71 | for model in self.models: 72 | cur_save_path = save_path_rule(model.name) 73 | print(">>>> Saving {} to 
{}".format(model.name, cur_save_path)) 74 | model.save(cur_save_path) 75 | 76 | def save_weights(self, file_path=None): 77 | """file_path: if suffix in [".h5", ".keras", ".pt", ".pth"], will save as {file_path}_{model_name}.{suffix}, 78 | or will regard as directory, and save to {file_path}/{model_name}.h5 79 | """ 80 | save_path_rule = self._save_load_file_path_rule_(file_path) 81 | for model in self.models: 82 | cur_save_path = save_path_rule(model.name) 83 | print(">>>> Saving {} weights to {}".format(model.name, cur_save_path)) 84 | model.save_weights(cur_save_path) 85 | 86 | def load_weights(self, file_path=None): 87 | """file_path: if suffix in [".h5", ".keras", ".pt", ".pth"], will load from {file_path}_{model_name}.{suffix}, 88 | or will regard as directory, and load from {file_path}/{model_name}.h5 89 | """ 90 | save_path_rule = self._save_load_file_path_rule_(file_path) 91 | for model in self.models: 92 | cur_save_path = save_path_rule(model.name) 93 | print(">>>> Loading {} from {}".format(model.name, cur_save_path)) 94 | model.load_weights(cur_save_path) 95 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pvt/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras PyramidVisionTransformerV2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github whai362/PVT](https://github.com/whai362/PVT/tree/v2/classification). Paper [PDF 2106.13797 PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/pdf/2106.13797.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | --------------- | ------ | ------ | ----- | -------- | -------- | 12 | | PVT_V2B0 | 3.7M | 580.3M | 224 | 70.5 | [pvt_v2_b0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b0_imagenet.h5) | 13 | | PVT_V2B1 | 14.0M | 2.14G | 224 | 78.7 | [pvt_v2_b1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b1_imagenet.h5) | 14 | | PVT_V2B2 | 25.4M | 4.07G | 224 | 82.0 | [pvt_v2_b2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b2_imagenet.h5) | 15 | | PVT_V2B2_linear | 22.6M | 3.94G | 224 | 82.1 | [pvt_v2_b2_linear.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b2_linear_imagenet.h5) | 16 | | PVT_V2B3 | 45.2M | 6.96G | 224 | 83.1 | [pvt_v2_b3_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b3_imagenet.h5) | 17 | | PVT_V2B4 | 62.6M | 10.19G | 224 | 83.6 | [pvt_v2_b4_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b4_imagenet.h5) | 18 | | PVT_V2B5 | 82.0M | 11.81G | 224 | 83.8 | [pvt_v2_b5_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b5_imagenet.h5) | 19 | ## Usage 20 | ```py 21 | from keras_cv_attention_models import pvt 22 | 23 | # Will download and load pretrained imagenet weights. 
24 | mm = pvt.PVT_V2B2(pretrained="imagenet")
25 | 
26 | # Run prediction
27 | import tensorflow as tf
28 | from tensorflow import keras
29 | from skimage.data import chelsea
30 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat
31 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy()
32 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0])
33 | # [('n02124075', 'Egyptian_cat', 0.6658455), ('n02123159', 'tiger_cat', 0.08825972), ...]
34 | ```
35 | **Change input resolution**. Note: for `PVT_V2B2_linear` using `addaptive_pooling_2d` with `output_size=7`, input shape should be larger than `193`.
36 | ```py
37 | from keras_cv_attention_models import pvt
38 | mm = pvt.PVT_V2B1(input_shape=(128, 192, 3), pretrained="imagenet")
39 | # >>>> Load pretrained from: ~/.keras/models/pvt_v2_b1_imagenet.h5
40 | 
41 | # Run prediction
42 | from skimage.data import chelsea
43 | preds = mm(mm.preprocess_input(chelsea()))
44 | print(mm.decode_predictions(preds))
45 | # [('n02124075', 'Egyptian_cat', 0.8482509), ('n02123045', 'tabby', 0.07139703), ...]
46 | ```
47 | ## Verification with PyTorch version
48 | ```py
49 | """ PyTorch pvt_v2_b0 """
50 | sys.path.append('../PVT-2/')
51 | sys.path.append('../pytorch-image-models/') # Needs timm
52 | import torch
53 | from classification import pvt_v2
54 | 
55 | torch_model = pvt_v2.pvt_v2_b0()
56 | ss = torch.load('pvt_v2_b0.pth', map_location=torch.device('cpu'))
57 | torch_model.load_state_dict(ss)
58 | _ = torch_model.eval()
59 | 
60 | """ Keras PVT_V2B0 """
61 | from keras_cv_attention_models import pvt
62 | mm = pvt.PVT_V2B0(pretrained="imagenet", classifier_activation=None)
63 | 
64 | """ Verification """
65 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32")
66 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy()
67 | keras_out = mm(inputs).numpy()
68 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }")
69 | # np.allclose(torch_out, keras_out, atol=1e-5) = True
70 | ```
71 | 
--------------------------------------------------------------------------------
/keras_cv_attention_models/pytorch_backend/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | epsilon = 1e-7
4 | 
5 | 
6 | class Loss:
7 |     def __init__(self, reduction="AUTO", name=None):
8 |         self.reduction, self.name = reduction, name
9 | 
10 |     def __call__(self, y_true, y_pred, sample_weight=None):
11 |         pass
12 | 
13 | 
14 | def categorical_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1):
15 |     """
16 |     # from_logits=False
17 |     >>> import torch, tensorflow as tf
18 |     >>> from keras_cv_attention_models.pytorch_backend import losses
19 |     >>> xx, yy = tf.random.uniform([24, 10]), tf.one_hot(tf.random.uniform([24], 0, 10, dtype='int32'), 10)
20 |     >>> tf_out = tf.losses.categorical_crossentropy(yy, xx, from_logits=False).numpy().mean()
21 |     >>> torch_out = losses.categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=False)
22 |     >>> print(tf_out, torch_out, np.allclose(tf_out, torch_out))
23 |     >>> # 2.681877 tensor(2.6819) True
24 |     # from_logits=True
25 |     >>> tf_out = tf.losses.categorical_crossentropy(yy, xx, from_logits=True).numpy().mean()
26 |     >>> torch_out = losses.categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=True)
27 |     >>> print(tf_out, torch_out, 
np.allclose(tf_out, torch_out)) 28 | >>> # 2.3364408 tensor(2.3364) True 29 | """ 30 | if from_logits: 31 | return torch.nn.functional.cross_entropy(y_pred, y_true.argmax(-1), label_smoothing=label_smoothing) 32 | else: 33 | y_pred = y_pred / y_pred.sum(dim=axis, keepdim=True) 34 | y_pred = y_pred.clamp_(epsilon, 1.0 - epsilon) 35 | return -(y_true * y_pred.log()).sum(dim=axis).mean() 36 | 37 | 38 | def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1): 39 | """ 40 | # from_logits=False 41 | >>> import torch, tensorflow as tf 42 | >>> from keras_cv_attention_models.pytorch_backend import losses 43 | >>> xx, yy = tf.random.uniform([24, 10]), tf.random.uniform([24], 0, 10, dtype='int64') 44 | >>> tf_out = tf.losses.sparse_categorical_crossentropy(yy, xx, from_logits=False).numpy().mean() 45 | >>> torch_out = losses.sparse_categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=False) 46 | >>> print(tf_out, torch_out, np.allclose(tf_out, torch_out)) 47 | >>> # 2.677911 tensor(2.6779) True 48 | # from_logits=True 49 | >>> tf_out = tf.losses.sparse_categorical_crossentropy(yy, xx, from_logits=True).numpy().mean() 50 | >>> torch_out = losses.sparse_categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=True) 51 | >>> print(tf_out, torch_out, np.allclose(tf_out, torch_out)) 52 | >>> # 2.3503969 tensor(2.3504) True 53 | """ 54 | if from_logits: 55 | return torch.nn.functional.cross_entropy(y_pred, y_true, label_smoothing=label_smoothing) 56 | else: 57 | y_pred = y_pred / y_pred.sum(dim=axis, keepdim=True) 58 | y_pred = y_pred.clamp_(epsilon, 1.0 - epsilon) 59 | y_true = torch.nn.functional.one_hot(y_true, y_pred.shape[-1]) 60 | return -(y_true * y_pred.log()).sum(dim=axis).mean() 61 | 62 | 63 | class MeanSquaredError(Loss): 64 | """ 65 | >>> from keras_cv_attention_models.pytorch_backend import losses 66 | >>> aa = np.random.uniform(size=[4, 42, 42, 3]).astype("float32") 67 | >>> bb = np.random.uniform(size=[4, 42, 42, 3]).astype("float32") 68 | >>> print(f"{keras.losses.MeanSquaredError()(aa, bb).numpy() = }") 69 | # keras.losses.MeanSquaredError()(aa, bb).numpy() = 0.16724217 70 | >>> print(f"{losses.MeanSquaredError()(torch.from_numpy(aa), torch.from_numpy(bb)) = }") 71 | # losses.MeanSquaredError()(torch.from_numpy(aa), torch.from_numpy(bb)) = tensor(0.1672) 72 | """ 73 | 74 | def __init__(self, reduction="AUTO", name="mean_squared_error"): 75 | super().__init__(reduction=reduction, name=name) 76 | 77 | def __call__(self, y_true, y_pred, sample_weight=None): 78 | return torch.functional.F.mse_loss(y_pred, y_true) 79 | -------------------------------------------------------------------------------- /keras_cv_attention_models/llama2/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras LLaMA2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github facebookresearch/llama](https://github.com/facebookresearch/llama). Paper [PDF 2307.09288 Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/pdf/2307.09288.pdf). 6 | - `LLaMA2_15M` / `LLaMA2_42M`, `LLaMA2_110M` model weights ported from [Github karpathy/llama2.c](https://github.com/karpathy/llama2.c). 7 | - `LLaMA2_1B` model weights ported from [Github jzhang38/TinyLlama](https://githubfast.com/jzhang38/TinyLlama) `TinyLlama-1.1B-Chat-V0.4` one. 
8 | ## Models
9 | - `Params` is counted with `include_top=True`; the count will match the model name if `include_top=False` is set.
10 | 
11 | | Model | Params | FLOPs | vocab_size | Val loss |
12 | | ----------- | ------ | ------ | ---------- | -------- |
13 | | [LLaMA2_15M](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_15m_tiny_stories.h5) | 24.41M | 4.06G | 32000 | 1.072 |
14 | | [LLaMA2_42M](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_42m_tiny_stories.h5) | 58.17M | 50.7G | 32000 | 0.847 |
15 | | [LLaMA2_110M](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_110m_tiny_stories.h5) | 134.1M | 130.2G | 32000 | 0.760 |
16 | | [LLaMA2_1B](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_1b_tiny_llama_1.1B_chat_v0.4.h5) | 1.10B | 2.50T | 32003 | |
17 | | LLaMA2_7B | 6.74B | 14.54T | 32000 | |
18 | ## Usage
19 | ```py
20 | from keras_cv_attention_models import llama2
21 | 
22 | mm = llama2.LLaMA2_42M()
23 | # >>>> Load pretrained from: ~/.keras/models/llama2_42m_tiny_stories.h5
24 | _ = mm.run_prediction("As evening fell, a maiden stood at the edge of a wood. In her hands,")
25 | # >>>> Load tokenizer from file: ~/.keras/datasets/llama_tokenizer.model
26 | #
27 | # As evening fell, a maiden stood at the edge of a wood. In her hands, she held a beautiful diamond. Everyone was surprised to see it.
28 | # "What is it?" one of the kids asked.
29 | # "It's a diamond," the maiden said.
30 | # ...
31 | ```
32 | **Set `include_top=False`** to exclude the model head layer.
33 | ```py
34 | from keras_cv_attention_models import llama2
35 | 
36 | mm = llama2.LLaMA2_42M(include_top=False)
37 | # >>>> Load pretrained from: ~/.keras/models/llama2_42m_tiny_stories.h5
38 | print(f"{mm.output_shape = }")
39 | # mm.output_shape = (None, 1024, 512)
40 | ```
41 | ## Convert weights
42 | - Manually download weights from [Huggingface meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) or [Huggingface LinkSoul/Chinese-Llama-2-7b](https://huggingface.co/LinkSoul/Chinese-Llama-2-7b), and convert them to `h5` format. The benefit of saving `h5` is that, like `npz` or `tfrecord`, weights can be loaded layer by layer without reading the entire file into memory.
43 | ```py
44 | # Set to build model using pure float16 if using Tensorflow
45 | policy = keras.mixed_precision.Policy("float16")
46 | keras.mixed_precision.set_global_policy(policy)
47 | 
48 | from keras_cv_attention_models import llama2
49 | _ = llama2.convert_huggingface_weights_to_h5("pytorch_model-00001-of-00002.bin", to_fp16=True)
50 | # >>>> Save to: pytorch_model-00001-of-00002.h5
51 | _ = llama2.convert_huggingface_weights_to_h5("pytorch_model-00002-of-00002.bin", to_fp16=True)
52 | # >>>> Save to: pytorch_model-00002-of-00002.h5
53 | ```
54 | Then load them back into the model.
55 | ```py 56 | policy = keras.mixed_precision.Policy("float16") 57 | keras.mixed_precision.set_global_policy(policy) 58 | 59 | from keras_cv_attention_models import llama2 60 | mm = llama2.LLaMA2_7B(pretrained=["pytorch_model-00001-of-00002.h5", "pytorch_model-00002-of-00002.h5"]) 61 | # >>>> Load pretrained from: pytorch_model-00001-of-00002.h5 62 | # >>>> Load pretrained from: pytorch_model-00002-of-00002.h5 63 | mm.save(mm.name + ".h5") # mm.half().save(mm.name + ".h5") if using PyTorch backend 64 | 65 | _ = mm.run_prediction("Who's there?") 66 | ``` 67 | *** 68 | -------------------------------------------------------------------------------- /keras_cv_attention_models/nfnets/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras NFNets___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github deepmind/nfnets](https://github.com/deepmind/deepmind-research/tree/master/nfnets). Paper [PDF 2102.06171 High-Performance Large-Scale Image Recognition Without Normalization](https://arxiv.org/pdf/2102.06171.pdf). 6 | - Model weights reloaded from official publication. 7 | - `ECA` and `Light` NFNets weights reloaded from timm [Github rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models). 8 | *** 9 | 10 | ## Models 11 | - `L` types models are light versions of `NFNet-F` from `timm`. 12 | - `ECA` type models are using `attn_type="eca"` instead of `attn_type="se"` from `timm`. 13 | 14 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 15 | | ----------- | ------ | ------- | ----- | -------- | -------- | 16 | | NFNetL0 | 35.07M | 7.13G | 288 | 82.75 | [nfnetl0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetl0_imagenet.h5) | 17 | | NFNetF0 | 71.5M | 12.58G | 256 | 83.6 | [nfnetf0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf0_imagenet.h5) | 18 | | NFNetF1 | 132.6M | 35.95G | 320 | 84.7 | [nfnetf1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf1_imagenet.h5) | 19 | | NFNetF2 | 193.8M | 63.24G | 352 | 85.1 | [nfnetf2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf2_imagenet.h5) | 20 | | NFNetF3 | 254.9M | 115.75G | 416 | 85.7 | [nfnetf3_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf3_imagenet.h5) | 21 | | NFNetF4 | 316.1M | 216.78G | 512 | 85.9 | [nfnetf4_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf4_imagenet.h5) | 22 | | NFNetF5 | 377.2M | 291.73G | 544 | 86.0 | [nfnetf5_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf5_imagenet.h5) | 23 | | NFNetF6 SAM | 438.4M | 379.75G | 576 | 86.5 | [nfnetf6_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf6_imagenet.h5) | 24 | | NFNetF7 | 499.5M | 481.80G | 608 | | | 25 | | ECA_NFNetL0 | 24.14M | 7.12G | 288 | 82.58 | [eca_nfnetl0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/eca_nfnetl0_imagenet.h5) | 26 | | ECA_NFNetL1 | 41.41M | 14.93G | 320 | 84.01 | [eca_nfnetl1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/eca_nfnetl1_imagenet.h5) | 27 | | ECA_NFNetL2 | 56.72M | 30.12G | 384 | 84.70 | 
[eca_nfnetl2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/eca_nfnetl2_imagenet.h5) | 28 | | ECA_NFNetL3 | 72.04M | 52.73G | 448 | | | 29 | ## Usage 30 | ```py 31 | from keras_cv_attention_models import nfnets 32 | 33 | # Will download and load pretrained imagenet weights. 34 | mm = nfnets.NFNetF0(pretrained="imagenet") 35 | 36 | # Run prediction 37 | import tensorflow as tf 38 | from tensorflow import keras 39 | from skimage.data import chelsea 40 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 41 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 42 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 43 | # [('n02124075', 'Egyptian_cat', 0.9195376), ('n02123159', 'tiger_cat', 0.021603014), ...] 44 | ``` 45 | **Use dynamic input resolution** 46 | ```py 47 | from keras_cv_attention_models import nfnets 48 | mm = nfnets.NFNetF1(input_shape=(None, None, 3), num_classes=0, pretrained="imagenet") 49 | 50 | print(mm(np.ones([1, 320, 320, 3])).shape) 51 | # (1, 10, 10, 3072) 52 | print(mm(np.ones([1, 512, 512, 3])).shape) 53 | # (1, 16, 16, 3072) 54 | 55 | mm.save("nfnetf1_imagenet_dynamic_notop.h5") 56 | ``` 57 | *** 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/inceptionnext/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras InceptionNeXt___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github sail-sg/inceptionnext](https://github.com/sail-sg/inceptionnext). Paper [PDF 2303.16900 InceptionNeXt: When Inception Meets ConvNeXt](https://arxiv.org/pdf/2303.16900.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------------ | ------ | ------ | ----- | -------- | -------- | 12 | | InceptionNeXtTiny | 28.05M | 4.21G | 224 | 82.3 | [inceptionnext_tiny_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_tiny_imagenet.h5) | 13 | | InceptionNeXtSmall | 49.37M | 8.39G | 224 | 83.5 | [inceptionnext_small_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_small_imagenet.h5) | 14 | | InceptionNeXtBase | 86.67M | 14.88G | 224 | 84.0 | [inceptionnext_base_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_base_224_imagenet.h5) | 15 | | | 86.67M | 43.73G | 384 | 85.2 | [inceptionnext_base_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_base_384_imagenet.h5) | 16 | 17 | ## Usage 18 | ```py 19 | from keras_cv_attention_models import inceptionnext 20 | 21 | # Will download and load pretrained imagenet weights. 22 | model = inceptionnext.InceptionNeXtTiny(pretrained="imagenet") 23 | 24 | # Run prediction 25 | from skimage.data import chelsea # Chelsea the cat 26 | preds = model(model.preprocess_input(chelsea())) 27 | print(model.decode_predictions(preds)) 28 | # [('n02124075', 'Egyptian_cat', 0.8221698), ('n02123159', 'tiger_cat', 0.019049658), ...] 29 | ``` 30 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 
31 | ```py 32 | from keras_cv_attention_models import inceptionnext 33 | model = inceptionnext.InceptionNeXtTiny(input_shape=(None, None, 3), num_classes=0) 34 | # >>>> Load pretrained from: ~/.keras/models/inceptionnext_tiny_imagenet.h5 35 | print(model.output_shape) 36 | # (None, None, None, 768) 37 | 38 | print(model(np.ones([1, 223, 123, 3])).shape) 39 | # (1, 6, 3, 768) 40 | print(model(np.ones([1, 32, 526, 3])).shape) 41 | # (1, 1, 16, 768) 42 | ``` 43 | **Using PyTorch backend** by set `KECAM_BACKEND='torch'` environment variable. 44 | ```py 45 | os.environ['KECAM_BACKEND'] = 'torch' 46 | 47 | from keras_cv_attention_models import inceptionnext 48 | model = inceptionnext.InceptionNeXtTiny(input_shape=(None, None, 3), num_classes=0) 49 | # >>>> Using PyTorch backend 50 | # >>>> Aligned input_shape: [3, None, None] 51 | # >>>> Load pretrained from: ~/.keras/models/inceptionnext_tiny_imagenet.h5 52 | print(model.output_shape) 53 | # (None, 768, None, None) 54 | 55 | import torch 56 | print(model(torch.ones([1, 3, 223, 123])).shape) 57 | # (1, 768, 6, 3 ) 58 | print(model(torch.ones([1, 3, 32, 526])).shape) 59 | # (1, 768, 1, 16) 60 | ``` 61 | ## Verification with PyTorch version 62 | ```py 63 | """ PyTorch inceptionnext_tiny """ 64 | sys.path.append('../inceptionnext/') 65 | sys.path.append('../pytorch-image-models/') # Needs timm 66 | import torch 67 | from models import inceptionnext as inceptionnext_torch 68 | 69 | torch_model = inceptionnext_torch.inceptionnext_tiny(pretrained=True) 70 | _ = torch_model.eval() 71 | 72 | """ Keras InceptionNeXtTiny """ 73 | from keras_cv_attention_models import inceptionnext 74 | mm = inceptionnext.InceptionNeXtTiny(pretrained="imagenet", classifier_activation=None) 75 | 76 | """ Verification """ 77 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 78 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 79 | keras_out = mm(inputs).numpy() 80 | print(f"{np.allclose(torch_out, keras_out, atol=5e-5) = }") 81 | # np.allclose(torch_out, keras_out, atol=5e-5) = True 82 | ``` 83 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fastvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.fastvit.fastvit import ( 2 | FastViT, 3 | FastViT_T8, 4 | FastViT_T12, 5 | FastViT_S12, 6 | FastViT_SA12, 7 | FastViT_SA24, 8 | FastViT_SA36, 9 | FastViT_MA36, 10 | # switch_to_deploy, 11 | ) 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github NVlabs/FasterViT](https://github.com/NVlabs/FasterViT). 15 | Paper [PDF 2306.06189 FasterViT: Fast Vision Transformers with Hierarchical Attention](https://arxiv.org/pdf/2306.06189.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ block_types: block types for each stack, 19 | - `conv` or any `c` / `C` starts word, means `rep_conv_block` block. 20 | - `transfrom` or any not `c` / `C` starts word, means `multi_head_self_attention` block. 21 | value could be in format like `"cctt"` or `"CCTT"` or `["conv", "conv", "transfrom", "transform"]`. 22 | `["conv", "conv", "conv", "conv"]` for SA models, all conv for others. 23 | layer_scale: layer scale init value, [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 24 | Default 1e-6 for SA models, 1e-5 for others. 25 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 26 | deploy: boolean value if build a fused model. 
**Evaluation only, not good for training**. 27 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 28 | activation: activation used in whole model, default `hard_swish`. 29 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 30 | Can be a constant value like `0.2`, 31 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 32 | A higher value means a higher probability will drop the deep branch. 33 | or `0` to disable (default). 34 | dropout: top dropout rate if top layers is included. Default 0. 35 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 36 | Set `classifier_activation=None` to return the logits of the "top" layer. 37 | Default is `None`. 38 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 39 | Will try to download and load pre-trained model weights if not None. 40 | **kwargs: other parameters if available. 41 | 42 | Returns: 43 | A `keras.Model` instance. 44 | """ 45 | 46 | FastViT.__doc__ = __head_doc__ + """ 47 | Args: 48 | num_blocks: number of block for each stack. 49 | out_channels: output channels for each stack. 50 | stem_width: channel dimension output for stem block, default -1 for using out_channels[0]. 51 | mlp_ratio: int value for mlp_ratio for each stack. 52 | model_name: string, model name. 53 | """ + __tail_doc__ + """ 54 | Model architectures: 55 | | Model | Params | FLOPs | Input | Top1 Acc | 56 | | ------------ | ------ | ----- | ----- | -------- | 57 | | FastViT_T8 | 4.03M | 0.65G | 256 | 76.2 | 58 | | - distill | 4.03M | 0.65G | 256 | 77.2 | 59 | | FastViT_T12 | 7.55M | 1.34G | 256 | 79.3 | 60 | | - distill | 7.55M | 1.34G | 256 | 80.3 | 61 | | FastViT_S12 | 9.47M | 1.74G | 256 | 79.9 | 62 | | - distill | 9.47M | 1.74G | 256 | 81.1 | 63 | | FastViT_SA12 | 11.58M | 1.88G | 256 | 80.9 | 64 | | - distill | 11.58M | 1.88G | 256 | 81.9 | 65 | | FastViT_SA24 | 21.55M | 3.66G | 256 | 82.7 | 66 | | - distill | 21.55M | 3.66G | 256 | 83.4 | 67 | | FastViT_SA36 | 31.53M | 5.44G | 256 | 83.6 | 68 | | - distill | 31.53M | 5.44G | 256 | 84.2 | 69 | | FastViT_MA36 | 44.07M | 7.64G | 256 | 83.9 | 70 | | - distill | 44.07M | 7.64G | 256 | 84.6 | 71 | """ 72 | 73 | FastViT_T8.__doc__ = __head_doc__ + """ 74 | Args: 75 | """ + __tail_doc__ 76 | 77 | FastViT_T12.__doc__ = FastViT_T8.__doc__ 78 | FastViT_S12.__doc__ = FastViT_T8.__doc__ 79 | FastViT_SA12.__doc__ = FastViT_T8.__doc__ 80 | FastViT_SA24.__doc__ = FastViT_T8.__doc__ 81 | FastViT_SA36.__doc__ = FastViT_T8.__doc__ 82 | FastViT_MA36.__doc__ = FastViT_T8.__doc__ 83 | -------------------------------------------------------------------------------- /keras_cv_attention_models/aotnet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras AotNet___ 2 | *** 3 | 4 | ## Summary 5 | - `AotNet` is just a `ResNet` / `ResNetV2` like framework, that set parameters like `attn_types` and `attn_params` and others, which is used to apply different types attention layers. Works like `byoanet` / `byobnet` from `timm`. 6 | - Default parameters set is a typical `ResNet` architecture with `Conv2D use_bias=False` and `padding` like `PyTorch`. 7 | - `AotNet` means `Attention Over Template network`! Honestly, just a name after `BotNet` and `CotNet`... 
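Because the default parameter set is a typical `ResNet` architecture, the simplest usage is building a template model with no attention arguments at all. A minimal sketch (the `input_shape` / `num_classes` values below are only illustrative, and no pretrained weights are implied):

```py
from keras_cv_attention_models import aotnet

# No attn_types / attn_params given -> plain ResNet50-like backbone,
# with Conv2D use_bias=False and PyTorch-like padding as described above.
model = aotnet.AotNet50(input_shape=(224, 224, 3), num_classes=1000)
model.summary()
```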
8 | ***
9 | 
10 | ## Usage
11 | - **attn_types** is a `string` or `list` indicating the attention layer type for each stack. Each element can also be a `string` or `list` indicating the attention layer type for each block.
12 |   - `"bot"`: `mhsa_with_relative_position_embedding` from `botnet`.
13 |   - `"cot"`: `cot_attention` from `cotnet`.
14 |   - `"halo"`: `halo_attention` from `halonet`.
15 |   - `"outlook"`: `outlook_attention` from `volo`.
16 |   - `"sa"`: `split_attention_conv2d` from `resnest`.
17 |   - `None`: `Conv2D`. Can add `groups` like `ResNeXt` or add `se` and `eca` attention.
18 | - **attn_params**: like `attn_types`, a dict or list; each element in the list can also be a dict or list. Indicates the specific attention layer parameters for the corresponding `attn_types`.
19 | - **se_ratio**: value in `(0, 1)`, where `0` means not using `se_module`. Should be a `number` or `list` indicating `se_ratio` for each stack. Each element can also be a `number` or `list` indicating `se_ratio` for each block.
20 | - **use_eca**: boolean value indicating whether to use `eca` attention. Can also be a list like `se_ratio`.
21 | - **groups**: `groups` for the `Conv2D` layer if the relative `attn_types` is `None`. `ResNeXt` like architecture. Note it's NOT the `group_size`. Default value `1` means not using group; see the grouped-convolution sketch at the end of this section.
22 | - **Definition of `BotNet26T`**
23 |   ```py
24 |   from keras_cv_attention_models import aotnet
25 |   model = aotnet.AotNet(
26 |       num_blocks=[2, 2, 2, 2],
27 |       attn_types=[None, None, [None, "bot"], "bot"],
28 |       attn_params={"num_heads": 4, "out_weight": False},
29 |       stem_type="tiered",
30 |       input_shape=(256, 256, 3),
31 |       model_name="botnet26t",
32 |   )
33 |   model.summary()
34 |   ```
35 | - **Definition of `CotNet101`**
36 |   ```py
37 |   from keras_cv_attention_models import aotnet
38 |   model = aotnet.AotNet101(
39 |       attn_types="cot",
40 |       bn_after_attn=False,
41 |       shortcut_type="avg",
42 |       model_name="cotnet101",
43 |   )
44 |   model.summary()
45 |   ```
46 | - **Definition of `HaloNet50T`**
47 |   ```py
48 |   from keras_cv_attention_models import aotnet
49 |   attn_params = [
50 |       None,
51 |       [None, None, None, {"block_size": 8, "halo_size": 3, "num_heads": 4, "out_weight": False}],
52 |       [None, {"block_size": 8, "halo_size": 3, "num_heads": 8, "out_weight": False}] * 3,
53 |       [None, {"block_size": 8, "halo_size": 3, "num_heads": 8, "out_weight": False}, None],
54 |   ]
55 |   model = aotnet.AotNet50(
56 |       attn_types=[None, [None, None, None, "halo"], [None, "halo"] * 3, [None, "halo", None]],
57 |       attn_params=attn_params,
58 |       stem_type="tiered",
59 |       input_shape=(256, 256, 3),
60 |       model_name="halonet50t",
61 |   )
62 |   model.summary()
63 |   ```
64 | - **Definition of `ResNest50`**
65 |   ```py
66 |   from keras_cv_attention_models import aotnet
67 |   model = aotnet.AotNet50(
68 |       stem_type="deep",
69 |       shortcut_type="avg",
70 |       attn_types="sa",
71 |       bn_after_attn=False,
72 |       model_name="resnest50",
73 |   )
74 |   model.summary()
75 |   ```
76 | - **Mixing se and outlook and halo and bot and cot**, 21M parameters
77 |   ```py
78 |   # 50 is just a picked number that is larger than the relative `num_block`
79 |   model = aotnet.AotNet50V2(
80 |       attn_types=[None, "outlook", ["bot", "halo"] * 50, "cot"],
81 |       se_ratio=[0.25, 0, 0, 0],
82 |       stem_type="deep",
83 |       strides=1,
84 |   )
85 |   model.summary()
86 |   ```
87 | - `AotNet50V2` / `AotNet101V2` / `AotNet152V2` / `AotNet200V2` is the `ResNetV2` like template; see the sketch below.
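The plain-convolution switches described above (`groups` / `se_ratio` / `use_eca`) have no dedicated example yet, so here is a minimal hedged sketch on the `ResNetV2` like template. The exact values (`groups=32`, SE on the last two stacks) and the `model_name` are illustrative assumptions, not a published configuration:

```py
from keras_cv_attention_models import aotnet

# Grouped Conv2D blocks (ResNeXt-like) on the ResNetV2 template,
# with SE enabled for the last two stacks only.
model = aotnet.AotNet50V2(
    groups=32,
    se_ratio=[0, 0, 0.25, 0.25],
    model_name="aotnet50v2_g32_se",
)
model.summary()
```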
88 | *** 89 | -------------------------------------------------------------------------------- /keras_cv_attention_models/model_surgery/README.md: -------------------------------------------------------------------------------- 1 | ## Model Surgery 2 | *** 3 | 4 | ## Summary 5 | - Functions used to change model parameters after built. 6 | - `SAMModel`: SAMModel definition. 7 | - `add_l2_regularizer_2_model`: add `l2` weight decay to `Dense` / `Conv2D` / `DepthwiseConv2D` / `SeparableConv2D` layers. 8 | - `convert_to_mixed_float16`: convert `float32` model to `mixed_float16`. 9 | - `convert_mixed_float16_to_float32`: convert `mixed_float16` model to `float32`. 10 | - `convert_groups_conv2d_2_split_conv2d`: convert `Conv2D groups != 1` to `SplitConv2D` using `split -> conv -> concat`. 11 | - `convert_gelu_and_extract_patches_for_tflite`: convert model `gelu` activation to `gelu approximate=True`, and `tf.image.extract_patches` to a `Conv2D` version. 12 | - `convert_to_fused_conv_bn_model`: fuse convolution and batchnorm layers for inference. 13 | - `prepare_for_tflite`: a combination of `convert_groups_conv2d_2_split_conv2d` and `convert_gelu_and_extract_patches_for_tflite`. 14 | - `replace_ReLU`: replace all `ReLU` with other activations, default target is `PReLU`. 15 | - `replace_add_with_stochastic_depth`: replace all `Add` layers with `StochasticDepth`. 16 | - `replace_stochastic_depth_with_add`: replace all `StochasticDepth` layers with `add` + `multiply`. 17 | ## Usage 18 | - **Convert add layers to stochastic depth** 19 | ```py 20 | from keras_cv_attention_models import model_surgery 21 | mm = keras.applications.ResNet50() 22 | mm = model_surgery.replace_add_with_drop_connect(mm, drop_rate=(0, 0.2)) 23 | print(model_surgery.get_actual_drop_connect_rates(mm)) 24 | # [0.0, 0.0125, 0.025, 0.0375, 0.05, 0.0625, 0.075, 0.0875, 0.1, 0.1125, 0.125, 0.1375, 0.15, 0.1625, 0.175, 0.1875] 25 | ``` 26 | - **Convert model between float16 and float32** 27 | ```py 28 | from keras_cv_attention_models import model_surgery 29 | mm = keras.applications.ResNet50() 30 | print(mm.layers[-1].compute_dtype) 31 | # float32 32 | mm = model_surgery.convert_to_mixed_float16(mm) 33 | print(mm.layers[-1].compute_dtype) 34 | # float16 35 | mm = model_surgery.convert_mixed_float16_to_float32(mm) 36 | print(mm.layers[-1].compute_dtype) 37 | # float32 38 | ``` 39 | - **Convert groups conv2d to split conv2d for TFLite usage** 40 | ```py 41 | from keras_cv_attention_models import model_surgery, regnet 42 | mm = regnet.RegNetZD32() 43 | print([ii.groups for ii in mm.layers if isinstance(ii, keras.layers.Conv2D) and ii.groups != 1]) 44 | # [8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 48, 48] 45 | mm = model_surgery.convert_groups_conv2d_2_split_conv2d(mm) 46 | print([ii.groups for ii in mm.layers if isinstance(ii, model_surgery.model_surgery.SplitConv2D)]) 47 | # [8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 48, 48] 48 | 49 | converter = tf.lite.TFLiteConverter.from_keras_model(mm) 50 | open(mm.name + ".tflite", "wb").write(converter.convert()) 51 | ``` 52 | ![](https://user-images.githubusercontent.com/5744524/147234593-0323b99b-7dcd-4b75-b8ed-94060346aabb.png) 53 | - **Change model input_shape after built** 54 | ```py 55 | from keras_cv_attention_models import model_surgery 56 | mm = keras.applications.ResNet50() 57 | print(mm.input_shape) 58 | # (None, 224, 224, 3) 59 | mm = model_surgery.change_model_input_shape(mm, (320, 320)) 60 | 
print(mm.input_shape) 61 | # (None, 320, 320, 3) 62 | ``` 63 | - **Replace ReLU activation layers** 64 | ```py 65 | from keras_cv_attention_models import model_surgery 66 | mm = keras.applications.ResNet50() 67 | print(mm.layers[-3].activation.__name__) 68 | # relu 69 | mm = model_surgery.replace_ReLU(mm, "PReLU") 70 | print(mm.layers[-3].__class__.__name__) 71 | # PReLU 72 | ``` 73 | - **Fuse convolution and batchnorm layers for inference** 74 | ```py 75 | from keras_cv_attention_models import model_surgery 76 | mm = keras.applications.ResNet50() 77 | mm.summary() 78 | # Trainable params: 25,583,592 79 | mm = model_surgery.convert_to_fused_conv_bn_model(mm) 80 | mm.summary() 81 | # Trainable params: 25,530,472 82 | ``` 83 | *** 84 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnest/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.resnest.resnest import ResNest, ResNest50, ResNest101, ResNest200, ResNest269, rsoftmax, split_attention_conv2d 2 | 3 | 4 | __head_doc__ = """ 5 | Keras implementation of [ResNeSt](https://github.com/zhanghang1989/ResNeSt). 6 | Paper [PDF 2004.08955 ResNeSt: Split-Attention Networks](https://arxiv.org/pdf/2004.08955.pdf). 7 | """ 8 | 9 | __tail_doc__ = """ groups: controls number of split groups in `split_attention_conv2d`. 10 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 11 | Set `(None, None, 3)` for dynamic input resolution. 12 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 13 | activation: activation used in whole model, default `relu`. 14 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 15 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `softmax`. 21 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | **kwargs: other parameters from `AotNet` if not conflict. 24 | 25 | Returns: 26 | A `keras.Model` instance. 27 | """ 28 | 29 | ResNest.__doc__ = __head_doc__ + """ 30 | Args: 31 | num_blocks: number of blocks in each stack. 32 | model_name: string, model name. 33 | """ + __tail_doc__ + """ 34 | Model architectures: 35 | | Model | Params | FLOPs | Input | Top1 Acc | 36 | | -------------- | ------ | ------ | ----- | -------- | 37 | | resnest50 | 28M | 5.38G | 224 | 81.03 | 38 | | resnest101 | 49M | 13.33G | 256 | 82.83 | 39 | | resnest200 | 71M | 35.55G | 320 | 83.84 | 40 | | resnest269 | 111M | 77.42G | 416 | 84.54 | 41 | """ 42 | 43 | ResNest50.__doc__ = __head_doc__ + """ 44 | Args: 45 | """ + __tail_doc__ 46 | 47 | ResNest101.__doc__ = ResNest50.__doc__ 48 | ResNest200.__doc__ = ResNest50.__doc__ 49 | ResNest269.__doc__ = ResNest50.__doc__ 50 | 51 | split_attention_conv2d.__doc__ = __head_doc__ + """ 52 | Split-Attention. Callable function, NOT defined as a layer. 53 | Generating `attention_scores` using grouped `Conv2D`. 54 | 55 | Args: 56 | inputs: input tensor. 
57 | filters: output dimension. 58 | kernel_size: kernel size for grouped Conv2D. 59 | strides: strides for grouped Conv2D. 60 | groups: number of splitted groups. 61 | activation: activation used after `BatchNormalization`. 62 | 63 | Examples: 64 | 65 | >>> from keras_cv_attention_models import attention_layers 66 | >>> inputs = keras.layers.Input([28, 28, 192]) 67 | >>> nn = attention_layers.split_attention_conv2d(inputs, 384) 68 | >>> dd = keras.models.Model(inputs, nn) 69 | >>> dd.summary() 70 | >>> dd.output_shape 71 | (None, 28, 28, 384) 72 | 73 | >>> {ii.name: ii.shape for ii in dd.weights} 74 | {'1_g1_conv/kernel:0': TensorShape([3, 3, 96, 384]), 75 | '1_g2_conv/kernel:0': TensorShape([3, 3, 96, 384]), 76 | '1_bn/gamma:0': TensorShape([768]), 77 | '1_bn/beta:0': TensorShape([768]), 78 | '1_bn/moving_mean:0': TensorShape([768]), 79 | '1_bn/moving_variance:0': TensorShape([768]), 80 | '2_conv/kernel:0': TensorShape([1, 1, 384, 96]), 81 | '2_conv/bias:0': TensorShape([96]), 82 | '2_bn/gamma:0': TensorShape([96]), 83 | '2_bn/beta:0': TensorShape([96]), 84 | '2_bn/moving_mean:0': TensorShape([96]), 85 | '2_bn/moving_variance:0': TensorShape([96]), 86 | '3_conv/kernel:0': TensorShape([1, 1, 96, 768]), 87 | '3_conv/bias:0': TensorShape([768])} 88 | """ 89 | 90 | rsoftmax.__doc__ = __head_doc__ + """ 91 | Perform group split softmax 92 | 93 | input: `[batch, 1, 1, channel]`. 94 | output: `[batch, 1, 1, channel]`. 95 | 96 | Args: 97 | inputs: Input tensor. 98 | groups: groups to split on channel dimension. 99 | """ 100 | -------------------------------------------------------------------------------- /keras_cv_attention_models/edgenext/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras EdgeNeXt___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github mmaaz60/EdgeNeXt](https://github.com/mmaaz60/EdgeNeXt). Paper [PDF 2206.10589 EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications](https://arxiv.org/pdf/2206.10589.pdf). 6 | - Model weights reloaded from official publication. 7 | - Related usi distillation paper [PDF 2204.03475 Solving ImageNet: a Unified Scheme for Training any Backbone to Top Results](https://arxiv.org/pdf/2204.03475.pdf). 
8 | *** 9 | 10 | ## Models 11 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 12 | | ----------------- | ------ | ------ | ----- | -------- | -------- | 13 | | EdgeNeXt_XX_Small | 1.33M | 266M | 256 | 71.23 | [edgenext_xx_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_xx_small_256_imagenet.h5) | 14 | | EdgeNeXt_X_Small | 2.34M | 547M | 256 | 74.96 | [edgenext_x_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_x_small_256_imagenet.h5) | 15 | | EdgeNeXt_Small | 5.59M | 1.27G | 256 | 79.41 | [edgenext_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_small_256_imagenet.h5) | 16 | | - usi | 5.59M | 1.27G | 256 | 81.07 | [edgenext_small_256_usi.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_small_256_usi.h5) | 17 | | EdgeNeXt_Base | 18.5M | 3.86G | 256 | 82.47 | [edgenext_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_base_256_imagenet.h5) | 18 | | - usi | 18.5M | 3.86G | 256 | 83.31 | [edgenext_small_256_usi.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_base_256_usi.h5) | 19 | | - 21k_ft1k | 18.5M | 3.86G | 256 | 83.68 | [edgenext_small_256_usi.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_base_256_imagenet-ft1k.h5) | 20 | ## Usage 21 | ```py 22 | from keras_cv_attention_models import edgenext 23 | 24 | # Will download and load pretrained imagenet weights. 25 | mm = edgenext.EdgeNeXt_XX_Small(pretrained="imagenet") 26 | 27 | # Run prediction 28 | import tensorflow as tf 29 | from tensorflow import keras 30 | from skimage.data import chelsea 31 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 32 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 33 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 34 | # [('n02124075', 'Egyptian_cat', 0.60692847), ('n02123045', 'tabby', 0.21328166), ...] 35 | ``` 36 | **Change input resolution** 37 | ```py 38 | from keras_cv_attention_models import edgenext 39 | mm = edgenext.EdgeNeXt_Small(input_shape=(174, 269, 3), pretrained="usi") 40 | # >>>> Load pretrained from: ~/.keras/models/edgenext_small_256_usi.h5 41 | 42 | # Run prediction 43 | from skimage.data import chelsea 44 | preds = mm(mm.preprocess_input(chelsea())) 45 | print(mm.decode_predictions(preds)) 46 | # [[('n02124075', 'Egyptian_cat', 0.8444098), ('n02123159', 'tiger_cat', 0.061309356), ...] 
47 | ``` 48 | ## Verification with PyTorch version 49 | ```py 50 | """ PyTorch edgenext_small """ 51 | sys.path.append('../EdgeNeXt/') 52 | sys.path.append('../pytorch-image-models/') # Needs timm 53 | import torch 54 | from models import model 55 | torch_model = model.edgenext_small(classifier_dropout=0) 56 | _ = torch_model.eval() 57 | ss = torch.load('edgenext_small_usi.pth', map_location=torch.device('cpu')) 58 | torch_model.load_state_dict(ss.get('state_dict', ss.get('model', ss))) 59 | 60 | """ Keras EdgeNeXt_Small """ 61 | from keras_cv_attention_models import edgenext 62 | mm = edgenext.EdgeNeXt_Small(pretrained="usi", classifier_activation=None) 63 | 64 | """ Verification """ 65 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 66 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 67 | keras_out = mm(inputs).numpy() 68 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }") 69 | # np.allclose(torch_out, keras_out, atol=1e-5) = True 70 | ``` 71 | *** 72 | -------------------------------------------------------------------------------- /tests/test_switch_to_deploy_tf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append(".") 4 | import keras_cv_attention_models # Needs to set TF_USE_LEGACY_KERAS=1 env firstly 5 | 6 | import pytest 7 | import numpy as np 8 | 9 | from keras_cv_attention_models.test_images import cat 10 | 11 | 12 | def test_EfficientFormerL1_use_distillation_switch_to_deploy(): 13 | mm = keras_cv_attention_models.models.EfficientFormerL1(use_distillation=True, classifier_activation=None) 14 | preds = mm(mm.preprocess_input(cat())) 15 | 16 | bb = mm.switch_to_deploy() 17 | preds_deploy = bb(bb.preprocess_input(cat())) 18 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 19 | 20 | 21 | def test_EfficientFormerV2S0_use_distillation_switch_to_deploy(): 22 | mm = keras_cv_attention_models.models.EfficientFormerV2S0(use_distillation=True, classifier_activation=None) 23 | preds = mm(mm.preprocess_input(cat())) 24 | 25 | bb = mm.switch_to_deploy() 26 | preds_deploy = bb(bb.preprocess_input(cat())) 27 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 28 | 29 | 30 | def test_EfficientViT_M0_use_distillation_switch_to_deploy(): 31 | mm = keras_cv_attention_models.models.EfficientViT_M0(use_distillation=True, classifier_activation=None) 32 | preds = mm(mm.preprocess_input(cat())) 33 | 34 | bb = mm.switch_to_deploy() 35 | preds_deploy = bb(bb.preprocess_input(cat())) 36 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 37 | 38 | 39 | def test_FasterViT0_switch_to_deploy(): 40 | mm = keras_cv_attention_models.models.FasterViT0() 41 | preds = mm(mm.preprocess_input(cat())) 42 | 43 | bb = mm.switch_to_deploy() 44 | preds_deploy = bb(bb.preprocess_input(cat())) 45 | assert np.allclose(preds, preds_deploy, atol=1e-5) 46 | 47 | 48 | def test_FastViT_T8_switch_to_deploy(): 49 | mm = keras_cv_attention_models.models.FastViT_T8() 50 | preds = mm(mm.preprocess_input(cat())) 51 | 52 | bb = mm.switch_to_deploy() 53 | preds_deploy = bb(bb.preprocess_input(cat())) 54 | assert np.allclose(preds, preds_deploy, atol=1e-5) 55 | 56 | 57 | def test_FastViT_SA12_switch_to_deploy(): 58 | mm = keras_cv_attention_models.models.FastViT_SA12() 59 | preds = mm(mm.preprocess_input(cat())) 60 | 61 | bb = mm.switch_to_deploy() 62 | preds_deploy = bb(bb.preprocess_input(cat())) 63 | assert 
np.allclose(preds, preds_deploy, atol=1e-5) 64 | 65 | 66 | def test_LeViT128S_switch_to_deploy(): 67 | mm = keras_cv_attention_models.models.LeViT128S() 68 | preds = mm(mm.preprocess_input(cat())) 69 | 70 | bb = mm.switch_to_deploy() 71 | preds_deploy = bb(bb.preprocess_input(cat())) 72 | assert np.allclose(preds, preds_deploy, atol=1e-5) 73 | 74 | 75 | def test_RepViT_M09_use_distillation_switch_to_deploy(): 76 | mm = keras_cv_attention_models.models.RepViT_M09(use_distillation=True, classifier_activation=None) 77 | preds = mm(mm.preprocess_input(cat())) 78 | 79 | bb = mm.switch_to_deploy() 80 | preds_deploy = bb(bb.preprocess_input(cat())) 81 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 82 | 83 | 84 | def test_RepViT_M1_not_distillation_switch_to_deploy(): 85 | mm = keras_cv_attention_models.models.RepViT_M09(use_distillation=False) 86 | preds = mm(mm.preprocess_input(cat())) 87 | 88 | bb = mm.switch_to_deploy() 89 | preds_deploy = bb(bb.preprocess_input(cat())) 90 | assert np.allclose(preds, preds_deploy, atol=1e-5) 91 | 92 | 93 | def test_SwinTransformerV2Tiny_window8_switch_to_deploy(): 94 | mm = keras_cv_attention_models.models.SwinTransformerV2Tiny_window8() 95 | preds = mm(mm.preprocess_input(cat())) 96 | 97 | bb = mm.switch_to_deploy() 98 | preds_deploy = bb(bb.preprocess_input(cat())) 99 | assert np.allclose(preds, preds_deploy, atol=1e-5) 100 | 101 | 102 | def test_VanillaNet5_switch_to_deploy(): 103 | mm = keras_cv_attention_models.models.VanillaNet5() 104 | preds = mm(mm.preprocess_input(cat())) 105 | 106 | bb = mm.switch_to_deploy() 107 | preds_deploy = bb(bb.preprocess_input(cat())) 108 | assert np.allclose(preds, preds_deploy, atol=1e-5) 109 | 110 | 111 | def test_YOLO_NAS_S_switch_to_deploy(): 112 | mm = keras_cv_attention_models.models.YOLO_NAS_S(use_reparam_conv=True) 113 | preds = mm(mm.preprocess_input(cat())) 114 | 115 | bb = mm.switch_to_deploy() 116 | preds_deploy = bb(bb.preprocess_input(cat())) 117 | assert np.allclose(preds, preds_deploy, atol=1e-3) 118 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fasternet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras FasterNet___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github JierunChen/FasterNet](https://github.com/JierunChen/FasterNet). Paper [PDF 2303.03667 Run, Don’t Walk: Chasing Higher FLOPS for Faster Neural Networks ](https://arxiv.org/pdf/2303.03667.pdf). 6 | - Model weights ported from official publication. 
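The core operator in the referenced paper is a partial convolution (PConv), which runs a regular k×k convolution over only a fraction of the channels and passes the remaining channels through untouched, cutting FLOPs and memory access (see the architecture figure below). The snippet is an independent minimal sketch of that idea in plain `tf.keras`; the helper name `partial_conv`, the 1/4 split ratio and the layer names are illustrative assumptions, not this repository's implementation.

```py
from tensorflow import keras

def partial_conv(inputs, kernel_size=3, partial_ratio=0.25, name=""):
    # Apply a dense conv to the first `partial_ratio` of the channels, keep the rest untouched.
    conv_channels = int(inputs.shape[-1] * partial_ratio)
    nn = keras.layers.Conv2D(conv_channels, kernel_size, padding="same", use_bias=False, name=name + "pconv")(inputs[..., :conv_channels])
    return keras.layers.Concatenate(axis=-1, name=name + "concat")([nn, inputs[..., conv_channels:]])

inputs = keras.layers.Input([56, 56, 64])
print(partial_conv(inputs, name="block1_").shape)
# (None, 56, 56, 64)
```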
7 | 8 | ![fasternet](https://user-images.githubusercontent.com/5744524/227238562-5ee980ba-84c7-44d0-969d-c472f6e719a4.jpg) 9 | *** 10 | 11 | ## Models 12 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 13 | | ----------- | ------ | ------ | ----- | -------- | -------- | 14 | | FasterNetT0 | 3.9M | 0.34G | 224 | 71.9 | [fasternet_t0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_t0_imagenet.h5) | 15 | | FasterNetT1 | 7.6M | 0.85G | 224 | 76.2 | [fasternet_t1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_t1_imagenet.h5) | 16 | | FasterNetT2 | 15.0M | 1.90G | 224 | 78.9 | [fasternet_t2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_t2_imagenet.h5) | 17 | | FasterNetS | 31.1M | 4.55G | 224 | 81.3 | [fasternet_s_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_s_imagenet.h5) | 18 | | FasterNetM | 53.5M | 8.72G | 224 | 83.0 | [fasternet_m_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_m_imagenet.h5) | 19 | | FasterNetL | 93.4M | 15.49G | 224 | 83.5 | [fasternet_l_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_l_imagenet.h5) | 20 | 21 | ## Usage 22 | ```py 23 | from keras_cv_attention_models import fasternet 24 | 25 | # Will download and load pretrained imagenet weights. 26 | model = fasternet.FasterNetT2(pretrained="imagenet") 27 | 28 | # Run prediction 29 | from skimage.data import chelsea # Chelsea the cat 30 | preds = model(model.preprocess_input(chelsea())) 31 | print(model.decode_predictions(preds)) 32 | # [('n02124075', 'Egyptian_cat', 0.76938057), ('n02123159', 'tiger_cat', 0.0810011), ...] 33 | ``` 34 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 35 | ```py 36 | from keras_cv_attention_models import fasternet 37 | model = fasternet.FasterNetT2(input_shape=(None, None, 3), num_classes=0) 38 | # >>>> Load pretrained from: ~/.keras/models/fasternet_t2_imagenet.h5 39 | print(model.output_shape) 40 | # (None, None, None, 768) 41 | 42 | print(model(np.ones([1, 223, 123, 3])).shape) 43 | # (1, 6, 3, 768) 44 | print(model(np.ones([1, 32, 526, 3])).shape) 45 | # (1, 1, 16, 768) 46 | ``` 47 | **Using PyTorch backend** by set `KECAM_BACKEND='torch'` environment variable. 
48 | ```py 49 | os.environ['KECAM_BACKEND'] = 'torch' 50 | 51 | from keras_cv_attention_models import fasternet 52 | model = fasternet.FasterNetT2(input_shape=(None, None, 3), num_classes=0) 53 | # >>>> Using PyTorch backend 54 | # >>>> Aligned input_shape: [3, None, None] 55 | # >>>> Load pretrained from: ~/.keras/models/fasternet_t2_imagenet.h5 56 | print(model.output_shape) 57 | # (None, 768, None, None) 58 | 59 | import torch 60 | print(model(torch.ones([1, 3, 223, 123])).shape) 61 | # (1, 768, 6, 3 ) 62 | print(model(torch.ones([1, 3, 32, 526])).shape) 63 | # (1, 768, 1, 16) 64 | ``` 65 | ## Verification with PyTorch version 66 | ```py 67 | """ PyTorch fasternet_t2 """ 68 | sys.path.append('../FasterNet/') 69 | sys.path.append('../pytorch-image-models/') # Needs timm 70 | import torch 71 | from models import fasternet as fasternet_torch 72 | 73 | torch_model = fasternet_torch.FasterNet() # Default parameters is for T2 74 | ss = torch.load('fasternet_t2-epoch.289-val_acc1.78.8860.pth', map_location=torch.device('cpu')) 75 | torch_model.load_state_dict(ss) 76 | _ = torch_model.eval() 77 | 78 | """ Keras FasterNetT2 """ 79 | from keras_cv_attention_models import fasternet 80 | mm = fasternet.FasterNetT2(pretrained="imagenet", classifier_activation=None) 81 | 82 | """ Verification """ 83 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 84 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 85 | keras_out = mm(inputs).numpy() 86 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }") 87 | # np.allclose(torch_out, keras_out, atol=1e-5) = True 88 | ``` 89 | -------------------------------------------------------------------------------- /keras_cv_attention_models/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend 2 | 3 | from keras_cv_attention_models.version import __version__ 4 | from keras_cv_attention_models import plot_func 5 | from keras_cv_attention_models import attention_layers 6 | from keras_cv_attention_models import beit 7 | from keras_cv_attention_models.beit import flexivit 8 | from keras_cv_attention_models.beit import eva 9 | from keras_cv_attention_models.beit import eva02 10 | from keras_cv_attention_models.beit import dinov2 11 | from keras_cv_attention_models.beit import meta_transformer 12 | from keras_cv_attention_models.beit import vit 13 | from keras_cv_attention_models import botnet 14 | from keras_cv_attention_models import caformer 15 | from keras_cv_attention_models import coat 16 | from keras_cv_attention_models import coatnet 17 | from keras_cv_attention_models import convnext 18 | from keras_cv_attention_models import cotnet 19 | from keras_cv_attention_models import cmt 20 | from keras_cv_attention_models import cspnext 21 | from keras_cv_attention_models import davit 22 | from keras_cv_attention_models import efficientnet 23 | from keras_cv_attention_models import edgenext 24 | from keras_cv_attention_models import efficientformer 25 | from keras_cv_attention_models import fasternet 26 | from keras_cv_attention_models import gcvit 27 | from keras_cv_attention_models import ghostnet 28 | from keras_cv_attention_models import ghostnet as ghostnetv2 # alias name 29 | from keras_cv_attention_models import gpt2 30 | from keras_cv_attention_models import llama2 31 | from keras_cv_attention_models import halonet 32 | from keras_cv_attention_models import hiera 33 | from keras_cv_attention_models 
import hornet 34 | from keras_cv_attention_models import iformer 35 | from keras_cv_attention_models import levit 36 | from keras_cv_attention_models import mlp_family 37 | from keras_cv_attention_models.mlp_family import mlp_mixer 38 | from keras_cv_attention_models.mlp_family import res_mlp 39 | from keras_cv_attention_models.mlp_family import gated_mlp 40 | from keras_cv_attention_models.mlp_family import wave_mlp 41 | from keras_cv_attention_models.mobilenetv3_family import fbnetv3 42 | from keras_cv_attention_models.mobilenetv3_family import lcnet 43 | from keras_cv_attention_models.mobilenetv3_family import mobilenetv3 44 | from keras_cv_attention_models.mobilenetv3_family import tinynet 45 | from keras_cv_attention_models import efficientvit 46 | from keras_cv_attention_models.efficientvit import efficientvit_b 47 | from keras_cv_attention_models.efficientvit import efficientvit_m 48 | from keras_cv_attention_models import inceptionnext 49 | from keras_cv_attention_models import maxvit 50 | from keras_cv_attention_models import mobilevit 51 | from keras_cv_attention_models import moganet 52 | from keras_cv_attention_models import nat 53 | from keras_cv_attention_models.nat import dinat 54 | from keras_cv_attention_models import pvt 55 | from keras_cv_attention_models import repvit 56 | from keras_cv_attention_models import tinyvit 57 | from keras_cv_attention_models import resnest 58 | from keras_cv_attention_models import resnet_family 59 | from keras_cv_attention_models.resnet_family import resnext 60 | from keras_cv_attention_models.resnet_family import resnet_quad 61 | from keras_cv_attention_models.resnet_family import resnet_deep 62 | from keras_cv_attention_models.resnet_family import regnet 63 | from keras_cv_attention_models import gpvit 64 | from keras_cv_attention_models import swin_transformer_v2 65 | from keras_cv_attention_models import uniformer 66 | from keras_cv_attention_models import fastervit 67 | from keras_cv_attention_models import fastvit 68 | from keras_cv_attention_models import vanillanet 69 | from keras_cv_attention_models import download_and_load 70 | from keras_cv_attention_models import imagenet 71 | from keras_cv_attention_models import test_images 72 | from keras_cv_attention_models import model_surgery 73 | from keras_cv_attention_models import efficientdet 74 | from keras_cv_attention_models import yolox 75 | from keras_cv_attention_models import yolor 76 | from keras_cv_attention_models import yolov7 77 | from keras_cv_attention_models import yolov8 78 | from keras_cv_attention_models.yolov8 import yolo_nas 79 | from keras_cv_attention_models import coco 80 | from keras_cv_attention_models import clip 81 | from keras_cv_attention_models.clip import tokenizer 82 | from keras_cv_attention_models import stable_diffusion 83 | from keras_cv_attention_models import segment_anything 84 | from keras_cv_attention_models import segment_anything as sam # Alias name 85 | 86 | if backend.is_tensorflow_backend: 87 | from keras_cv_attention_models import nfnets 88 | from keras_cv_attention_models import volo 89 | from keras_cv_attention_models import visualizing 90 | --------------------------------------------------------------------------------
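The trailing `if backend.is_tensorflow_backend:` block above means `nfnets`, `volo` and `visualizing` are only attached when running on the TensorFlow backend. A minimal sketch of how the backend is selected, mirroring the `KECAM_BACKEND` usage shown in the READMEs above; the printed values are what the conditional import implies, not captured output:

```py
import os
os.environ["KECAM_BACKEND"] = "torch"  # must be set before the first package import

import keras_cv_attention_models as kecam
print(kecam.backend.is_torch_backend)  # True
print(hasattr(kecam, "nfnets"))  # False, TF-only sub-modules like nfnets / volo / visualizing are skipped
```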