├── keras_cv_attention_models ├── version.py ├── pytorch_backend │ ├── optimizers.py │ ├── callbacks.py │ ├── metrics.py │ ├── utils.py │ └── losses.py ├── imagenet │ ├── metrics.py │ └── losses.py ├── tf_functional.py ├── clip │ ├── __init__.py │ ├── tf_data.py │ └── torch_data.py ├── coco │ ├── info.py │ └── __init__.py ├── beit │ ├── eva.py │ ├── meta_transformer.py │ ├── flexivit.py │ ├── dinov2.py │ ├── eva02.py │ └── vit.py ├── model_surgery │ ├── __init__.py │ └── README.md ├── ghostnet │ ├── ghostnet.py │ ├── README.md │ └── __init__.py ├── mobilenetv3_family │ ├── fbnetv3.py │ ├── lcnet.py │ └── tinynet.py ├── resnet_family │ ├── resnet_deep.py │ └── resnext.py ├── cspnext │ ├── __init__.py │ └── README.md ├── gpt2 │ ├── __init__.py │ └── README.md ├── gpvit │ ├── README.md │ └── __init__.py ├── llama2 │ ├── __init__.py │ └── README.md ├── inceptionnext │ ├── __init__.py │ └── README.md ├── moganet │ ├── __init__.py │ └── README.md ├── hiera │ └── __init__.py ├── iformer │ ├── __init__.py │ └── README.md ├── efficientnet │ └── efficientnet_edgetpu.py ├── fasternet │ ├── __init__.py │ └── README.md ├── resnest │ ├── README.md │ └── __init__.py ├── nat │ └── dinat.py ├── mobilevit │ └── mobilevit_v2.py ├── pvt │ ├── __init__.py │ └── README.md ├── repvit │ └── __init__.py ├── gcvit │ └── __init__.py ├── halonet │ └── README.md ├── convnext │ └── convnext_v2.py ├── keras_core_functional.py ├── stable_diffusion │ └── __init__.py ├── davit │ └── README.md ├── segment_anything │ └── __init__.py ├── fastervit │ └── __init__.py ├── tinyvit │ └── __init__.py ├── models.py ├── nfnets │ └── README.md ├── fastvit │ └── __init__.py ├── aotnet │ └── README.md ├── edgenext │ └── README.md └── __init__.py ├── .gitignore ├── kecam └── __init__.py ├── LICENSE ├── .github └── workflows │ └── publish-to-test-pypi.yml ├── tests ├── test_models_tf.py └── test_switch_to_deploy_tf.py ├── setup_kecam.py └── setup.py /keras_cv_attention_models/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.4.3" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.h5 3 | *.pth 4 | *.pth.tar 5 | *.pt 6 | *.ckpt 7 | *.npy 8 | *.npz 9 | *.onnx 10 | *.json 11 | *.tflite 12 | *.ipynb 13 | *.swp 14 | *.log 15 | *.tar 16 | *.tar.gz 17 | *.bin 18 | *.keras 19 | .ipynb_checkpoints 20 | checkpoints 21 | datasets 22 | logs 23 | -------------------------------------------------------------------------------- /kecam/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import * 2 | _sub_modules = {__name__ + "." 
+ kk: vv for kk, vv in locals().items() if not kk.startswith("_")} 3 | 4 | import sys as _sys 5 | _sys.modules.update(_sub_modules) 6 | 7 | from keras_cv_attention_models.version import __version__ 8 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pytorch_backend/optimizers.py: -------------------------------------------------------------------------------- 1 | class Optimizer: 2 | def __init__( 3 | self, 4 | name, 5 | weight_decay=0, 6 | clipnorm=None, 7 | clipvalue=None, 8 | global_clipnorm=None, 9 | use_ema=False, 10 | ema_momentum=0.99, 11 | ema_overwrite_frequency=None, 12 | ): 13 | pass 14 | -------------------------------------------------------------------------------- /keras_cv_attention_models/imagenet/metrics.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.backend import metrics 2 | 3 | 4 | class LossMeanMetricWrapper(metrics.Metric): 5 | def __init__(self, loss_func, loss_attr_name): 6 | self.loss_func, self.loss_attr_name = loss_func, loss_attr_name 7 | super().__init__(name=loss_attr_name) 8 | 9 | def reset_state(self): 10 | self.value, self.passed_steps = 0.0, 0 11 | 12 | def update_state(self, y_true, y_pred, sample_weight=None): 13 | self.value += getattr(self.loss_func, self.loss_attr_name) 14 | self.passed_steps += 1 15 | 16 | def result(self): 17 | return self.value / self.passed_steps 18 | -------------------------------------------------------------------------------- /keras_cv_attention_models/tf_functional.py: -------------------------------------------------------------------------------- 1 | from tensorflow.nn import * 2 | from tensorflow.math import * 3 | 4 | from tensorflow import ( 5 | abs, 6 | cast, 7 | clip_by_value, 8 | complex, 9 | concat, 10 | convert_to_tensor, 11 | expand_dims, 12 | gather, 13 | gather_nd, 14 | linspace, 15 | map_fn, 16 | matmul, 17 | norm, 18 | pad, 19 | print, 20 | range, 21 | repeat, 22 | reshape, 23 | shape, 24 | sign, 25 | split, 26 | squeeze, 27 | stack, 28 | tensor_scatter_nd_update, 29 | tile, 30 | transpose, 31 | unstack, 32 | where, 33 | zeros, 34 | ) 35 | from tensorflow.image import resize, extract_patches, non_max_suppression_with_scores 36 | from tensorflow.signal import irfft2d, rfft2d 37 | 38 | 39 | def assign(parameter, data): 40 | parameter.assign(data) 41 | -------------------------------------------------------------------------------- /keras_cv_attention_models/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend as __backend__ 2 | 3 | from keras_cv_attention_models.clip.tokenizer import SimpleTokenizer, GPT2Tokenizer, TikToken, SentencePieceTokenizer 4 | from keras_cv_attention_models.clip.models import ( 5 | add_text_model_index_header, 6 | build_text_model_from_image_model, 7 | convert_to_clip_model, 8 | split_to_image_text_model, 9 | RunPrediction, 10 | ) 11 | from keras_cv_attention_models.plot_func import plot_hists, show_batch_sample 12 | 13 | if __backend__.is_tensorflow_backend: 14 | from keras_cv_attention_models.clip import tf_data as data 15 | from keras_cv_attention_models.clip.tf_data import init_dataset 16 | else: 17 | from keras_cv_attention_models.clip import torch_data as data 18 | from keras_cv_attention_models.clip.torch_data import init_dataset 19 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 leondgarse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /keras_cv_attention_models/coco/info.py: -------------------------------------------------------------------------------- 1 | COCO_LABELS = """person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign, 2 | parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, 3 | suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket, 4 | bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot dog, pizza, donut, 5 | cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone, microwave, oven, 6 | toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear, hair drier, toothbrush""" 7 | COCO_80_LABEL_DICT = {id: ii.strip() for id, ii in enumerate(COCO_LABELS.split(","))} 8 | INVALID_ID_90 = [11, 25, 28, 29, 44, 65, 67, 68, 70, 82] 9 | COCO_90_LABEL_DICT = {id: ii for id, ii in zip(set(range(90)) - set(INVALID_ID_90), COCO_80_LABEL_DICT.values())} 10 | COCO_90_LABEL_DICT.update({ii: "Unknown" for ii in INVALID_ID_90}) 11 | COCO_80_to_90_LABEL_DICT = {id_80: id_90 for id_80, id_90 in enumerate(set(range(90)) - set(INVALID_ID_90))} 12 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/eva.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def EVA(layer_scale=0, use_abs_pos_emb=True, model_name="eva", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | patch_size = kwargs.pop("patch_size", 14) 8 | force_reload_mismatch = patch_size != 14 # If patch_size not 14, force reload pos_emb and stem_conv weights 9 | return Beit(**locals(), **kwargs) 10 | 11 | 12 | @register_model 13 | def EvaLargePatch14(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 14 | embed_dim = 1024 15 | 
depth = 24 16 | num_heads = 16 17 | attn_qkv_bias = True 18 | return EVA(**locals(), model_name="eva_large_patch14", **kwargs) 19 | 20 | 21 | @register_model 22 | def EvaGiantPatch14(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 23 | mlp_ratio = 6144 / 1408 24 | embed_dim = 1408 25 | depth = 40 26 | num_heads = 16 27 | return EVA(**locals(), model_name="eva_giant_patch14", **kwargs) 28 | -------------------------------------------------------------------------------- /keras_cv_attention_models/model_surgery/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.model_surgery.model_surgery import ( 2 | SAMModel, 3 | DropConnect, 4 | add_l2_regularizer_2_model, 5 | align_pyramide_feature_output_by_image_data_format, 6 | change_model_input_shape, 7 | convert_to_dynamic_input_shape, 8 | convert_dense_to_conv, 9 | convert_extract_patches_to_conv, 10 | convert_gelu_to_approximate, 11 | convert_gelu_and_extract_patches_for_tflite, # [Deprecated], use convert_gelu_to_approximate -> convert_extract_patches_to_conv instead 12 | convert_groups_conv2d_2_split_conv2d, 13 | convert_to_mixed_float16, 14 | convert_mixed_float16_to_float32, 15 | convert_to_fixed_batch_size, 16 | convert_to_fused_conv_bn_model, 17 | convert_to_token_label_model, 18 | convert_layers_to_deploy_inplace, 19 | count_params, 20 | export_onnx, 21 | fuse_sequential_conv_strict, 22 | fuse_channel_affine_to_conv_dense, 23 | fuse_reparam_blocks, 24 | fuse_distill_head, 25 | get_actual_survival_probabilities, 26 | get_actual_drop_connect_rates, 27 | get_flops, 28 | get_global_avg_pool_layer_id, 29 | get_pyramide_feature_layers, 30 | prepare_for_tflite, 31 | remove_layer_single_input, 32 | replace_ReLU, 33 | replace_add_with_drop_connect, 34 | replace_add_with_stochastic_depth, 35 | replace_stochastic_depth_with_add, 36 | split_model_to_head_body_tail_by_blocks, 37 | swin_convert_pos_emb_mlp_to_MlpPairwisePositionalEmbedding_weights, 38 | ) 39 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/meta_transformer.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def MetaTransformer( 6 | use_patch_bias=False, 7 | use_pre_norm=True, 8 | use_abs_pos_emb=True, 9 | attn_qv_bias=False, 10 | attn_qkv_bias=True, 11 | use_mean_pooling_head=False, 12 | layer_scale=0, 13 | model_name="meta_transformer", 14 | **kwargs, 15 | ): 16 | kwargs.pop("kwargs", None) 17 | return Beit(**locals(), **kwargs) 18 | 19 | 20 | @register_model 21 | def MetaTransformerBasePatch16( 22 | input_shape=(384, 384, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="laion_2b", **kwargs 23 | ): 24 | depth = 12 25 | embed_dim = 768 26 | num_heads = 12 27 | patch_size = kwargs.pop("patch_size", 16) 28 | force_reload_mismatch = patch_size != 16 # If patch_size not match, force reload pos_emb and stem_conv weights 29 | return MetaTransformer(**locals(), model_name="meta_transformer_base_patch16", **kwargs) 30 | 31 | 32 | @register_model 33 | def MetaTransformerLargePatch14( 34 | input_shape=(336, 336, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="laion_2b", **kwargs 35 | 
): 36 | depth = 24 37 | embed_dim = 1024 38 | num_heads = 16 39 | patch_size = kwargs.pop("patch_size", 14) 40 | force_reload_mismatch = patch_size != 14 # If patch_size not match, force reload pos_emb and stem_conv weights 41 | return MetaTransformer(**locals(), model_name="meta_transformer_large_patch14", **kwargs) 42 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/flexivit.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def FlexiViT( 6 | attn_qv_bias=False, 7 | attn_qkv_bias=True, 8 | use_abs_pos_emb=True, 9 | use_abs_pos_emb_on_cls_token=False, # no_embed_class in timm 10 | layer_scale=0, 11 | use_mean_pooling_head=False, 12 | model_name="flexivit", 13 | **kwargs, 14 | ): 15 | kwargs.pop("kwargs", None) 16 | patch_size = kwargs.pop("patch_size", 16) 17 | force_reload_mismatch = patch_size != 16 # If patch_size not 16, force reload pos_emb and stem_conv weights 18 | return Beit(**locals(), **kwargs) 19 | 20 | 21 | @register_model 22 | def FlexiViTSmall(input_shape=(240, 240, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 23 | embed_dim = 384 24 | depth = 12 25 | num_heads = 6 26 | return FlexiViT(**locals(), model_name="flexivit_small", **kwargs) 27 | 28 | 29 | @register_model 30 | def FlexiViTBase(input_shape=(240, 240, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 31 | embed_dim = 768 32 | depth = 12 33 | num_heads = 12 34 | return FlexiViT(**locals(), model_name="flexivit_base", **kwargs) 35 | 36 | 37 | @register_model 38 | def FlexiViTLarge(input_shape=(240, 240, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 39 | embed_dim = 1024 40 | depth = 24 41 | num_heads = 16 42 | return FlexiViT(**locals(), model_name="flexivit_large", **kwargs) 43 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-test-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: build and publish python 🐍 distributions 📦 to pypi and testpypi 8 | runs-on: ubuntu-24.04 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.9 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.9 15 | - name: Install pypa/build 16 | run: >- 17 | CUDA_VISIBLE_DEVICES='-1' python -m 18 | pip install 19 | build setuptools wheel pytest pytest-timeout pillow ftfy regex tqdm tensorflow tf-keras torch torchvision sentencepiece 20 | --extra-index-url https://download.pytorch.org/whl/cpu 21 | --user 22 | - name: Build a binary wheel and a source tarball 23 | run: >- 24 | python -m 25 | build 26 | --sdist 27 | --wheel 28 | --outdir dist/ 29 | . 
30 | - name: Build a kecam binary wheel and a source tarball 31 | run: >- 32 | python setup_kecam.py sdist bdist_wheel 33 | - name: Run tests 34 | run: >- 35 | CUDA_VISIBLE_DEVICES='-1' pytest -vv --durations=0 ./tests 36 | - name: Run PyTorch backend tests 37 | run: >- 38 | CUDA_VISIBLE_DEVICES='-1' KECAM_BACKEND='torch' pytest -vv --durations=0 ./tests/test_models.py 39 | - name: Publish distribution 📦 to Test PyPI 40 | if: startsWith(github.ref, 'refs/tags') 41 | uses: pypa/gh-action-pypi-publish@master 42 | with: 43 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 44 | repository_url: https://test.pypi.org/legacy/ 45 | - name: Publish distribution 📦 to PyPI 46 | if: startsWith(github.ref, 'refs/tags') 47 | uses: pypa/gh-action-pypi-publish@master 48 | with: 49 | password: ${{ secrets.PYPI_API_TOKEN }} 50 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/dinov2.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def DINOv2(layer_scale=1.0, use_abs_pos_emb=True, use_cat_head=True, attn_qkv_bias=True, model_name="dinov2", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | patch_size = kwargs.pop("patch_size", 14) 8 | force_reload_mismatch = patch_size != 14 # If patch_size not 14, force reload pos_emb and stem_conv weights 9 | return Beit(**locals(), **kwargs) 10 | 11 | 12 | @register_model 13 | def DINOv2_ViT_Small14(input_shape=(518, 518, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 14 | embed_dim = 384 15 | depth = 12 16 | num_heads = 6 17 | return DINOv2(**locals(), model_name="dinov2_vit_small14", **kwargs) 18 | 19 | 20 | @register_model 21 | def DINOv2_ViT_Base14(input_shape=(518, 518, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 22 | embed_dim = 768 23 | depth = 12 24 | num_heads = 12 25 | return DINOv2(**locals(), model_name="dinov2_vit_base14", **kwargs) 26 | 27 | 28 | @register_model 29 | def DINOv2_ViT_Large14(input_shape=(518, 518, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 30 | embed_dim = 1024 31 | depth = 24 32 | num_heads = 16 33 | return DINOv2(**locals(), model_name="dinov2_vit_large14", **kwargs) 34 | 35 | 36 | @register_model 37 | def DINOv2_ViT_Giant14(input_shape=(518, 518, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 38 | embed_dim = 1536 39 | depth = 40 40 | num_heads = 24 41 | use_gated_mlp = True 42 | mlp_ratio = 4096 / 1536 43 | return DINOv2(**locals(), model_name="dinov2_vit_giant14", **kwargs) 44 | -------------------------------------------------------------------------------- /tests/test_models_tf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append(".") 4 | import keras_cv_attention_models # Needs to set TF_USE_LEGACY_KERAS=1 env firstly 5 | 6 | import pytest 7 | from keras_cv_attention_models.backend import models 8 | from keras_cv_attention_models.test_images import cat 9 | 10 | """ Recognition models HorNet*GF / NFNet / VOLO defination """ 11 | 12 | 13 | def test_NFNet_defination(): 14 | mm = keras_cv_attention_models.nfnets.NFNetF0(pretrained=None) 15 | assert 
isinstance(mm, models.Model) 16 | 17 | mm = keras_cv_attention_models.nfnets.ECA_NFNetL1(pretrained=None, num_classes=0) 18 | assert isinstance(mm, models.Model) 19 | 20 | 21 | def test_VOLO_defination(): 22 | mm = keras_cv_attention_models.volo.VOLO_d3(pretrained=None) 23 | assert isinstance(mm, models.Model) 24 | 25 | mm = keras_cv_attention_models.volo.VOLO_d4(pretrained=None, num_classes=0) 26 | assert isinstance(mm, models.Model) 27 | 28 | 29 | """ Recognition models EfficientNetV2B1_preprocessing / HorNet / VOLO prediction """ 30 | 31 | 32 | def test_EfficientNetV2B1_preprocessing_predict(): 33 | mm = keras_cv_attention_models.efficientnet.EfficientNetV2B1(pretrained="imagenet", include_preprocessing=True) 34 | pred = mm(mm.preprocess_input(cat())) 35 | out = mm.decode_predictions(pred)[0][0] 36 | 37 | assert out[1] == "Egyptian_cat" 38 | 39 | 40 | def test_HorNetTinyGF_new_shape_predict(): 41 | mm = keras_cv_attention_models.hornet.HorNetTinyGF(input_shape=(174, 255, 3), pretrained="imagenet") 42 | pred = mm(mm.preprocess_input(cat())) 43 | out = mm.decode_predictions(pred)[0][0] 44 | 45 | assert out[1] == "Egyptian_cat" 46 | 47 | 48 | def test_VOLO_d1_new_shape_predict(): 49 | mm = keras_cv_attention_models.volo.VOLO_d1(input_shape=(512, 512, 3), pretrained="imagenet") 50 | pred = mm(mm.preprocess_input(cat())) 51 | out = mm.decode_predictions(pred)[0][0] 52 | 53 | assert out[1] == "Egyptian_cat" 54 | -------------------------------------------------------------------------------- /keras_cv_attention_models/ghostnet/ghostnet.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.ghostnet.ghostnet_v2 import GhostNetV2 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def GhostNet( 6 | kernel_sizes=[3, 3, 3, 5, 5, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5], 7 | first_ghost_channels=[16, 48, 72, 72, 120, 240, 200, 184, 184, 480, 672, 672, 960, 960, 960, 960], 8 | out_channels=[16, 24, 24, 40, 40, 80, 80, 80, 80, 112, 112, 160, 160, 160, 160, 160], 9 | se_ratios=[0, 0, 0, 0.25, 0.25, 0, 0, 0, 0, 0.25, 0.25, 0.25, 0, 0.25, 0, 0.25], 10 | strides=[1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], 11 | stem_width=16, 12 | stem_strides=2, 13 | width_mul=1.0, 14 | num_ghost_module_v1_stacks=-1, # num of `ghost_module` stcks on the head, others are `ghost_module_multiply`, set `-1` for all using `ghost_module` 15 | output_conv_filter=-1, # -1 for first_ghost_channels[-1] * width_mul 16 | input_shape=(224, 224, 3), 17 | num_classes=1000, 18 | activation="relu", 19 | classifier_activation="softmax", 20 | dropout=0, 21 | pretrained=None, 22 | model_name="ghostnet", 23 | kwargs=None, 24 | ): 25 | return GhostNetV2(**locals()) 26 | 27 | 28 | @register_model 29 | def GhostNet_050(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 30 | return GhostNet(**locals(), width_mul=0.5, model_name="ghostnet_050", **kwargs) 31 | 32 | 33 | @register_model 34 | def GhostNet_100(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 35 | return GhostNet(**locals(), model_name="ghostnet_100", **kwargs) 36 | 37 | 38 | @register_model 39 | def GhostNet_130(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 40 | return GhostNet(**locals(), width_mul=1.3, model_name="ghostnet_130", **kwargs) 41 | 
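A minimal usage sketch for the `GhostNet` variants defined above, mirroring the prediction pattern shown in the GPViT README elsewhere in this repository; the `pretrained="imagenet"` weights and the `preprocess_input` / `decode_predictions` helpers are assumed to be attached to GhostNet models in the same way, so treat this as an illustration rather than a verified snippet.
```py
from keras_cv_attention_models import ghostnet
from skimage.data import chelsea  # sample cat image used in the repo's other examples

# Assumption: GhostNet_100 ships "imagenet" weights and exposes the kecam-style helpers.
mm = ghostnet.GhostNet_100(pretrained="imagenet")
preds = mm(mm.preprocess_input(chelsea()))  # preprocess a single HWC image and run inference
print(mm.decode_predictions(preds))  # top-1 should be a cat class if the weights loaded correctly
```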
-------------------------------------------------------------------------------- /keras_cv_attention_models/beit/eva02.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def EVA02(mlp_ratio=4 * 2 / 3, layer_scale=0, use_abs_pos_emb=True, use_rot_pos_emb=True, use_gated_mlp=True, activation="swish", model_name="eva02", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | patch_size = kwargs.pop("patch_size", 14) 8 | force_reload_mismatch = patch_size != 14 # If patch_size not 14, force reload pos_emb and stem_conv weights 9 | return Beit(**locals(), **kwargs) 10 | 11 | 12 | @register_model 13 | def EVA02TinyPatch14(input_shape=(336, 336, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_in22k_ft1k", **kwargs): 14 | embed_dim = 192 15 | depth = 12 16 | num_heads = 3 17 | return EVA02(**locals(), model_name="eva02_tiny_patch14", **kwargs) 18 | 19 | 20 | @register_model 21 | def EVA02SmallPatch14(input_shape=(336, 336, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_in22k_ft1k", **kwargs): 22 | embed_dim = 384 23 | depth = 12 24 | num_heads = 6 25 | return EVA02(**locals(), model_name="eva02_small_patch14", **kwargs) 26 | 27 | 28 | @register_model 29 | def EVA02BasePatch14( 30 | input_shape=(448, 448, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_in22k_ft22k_ft1k", **kwargs 31 | ): 32 | embed_dim = 768 33 | depth = 12 34 | num_heads = 12 35 | use_norm_mlp = True # scale_mlp = True 36 | return EVA02(**locals(), model_name="eva02_base_patch14", **kwargs) 37 | 38 | 39 | @register_model 40 | def EVA02LargePatch14( 41 | input_shape=(448, 448, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="mim_m38m_ft22k_ft1k", **kwargs 42 | ): 43 | embed_dim = 1024 44 | depth = 24 45 | num_heads = 16 46 | use_norm_mlp = True # scale_mlp = True 47 | return EVA02(**locals(), model_name="eva02_large_patch14", **kwargs) 48 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilenetv3_family/fbnetv3.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilenetv3_family.mobilenetv3 import MobileNetV3 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def FBNetV3( 6 | num_blocks=[2, 4, 1, 4, 1, 4, 1, 5, 1, 5, 1], 7 | out_channels=[16, 24, 40, 40, 72, 72, 120, 120, 184, 184, 224], 8 | expands=[1, [4, 2, 2, 2], 5, 3, 5, 3, 5, 3, 6, 4, 6], 9 | kernel_sizes=[3, 5, 5, 5, 5, 3, 3, 5, 3, 5, 5], 10 | strides=[1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1], 11 | activations="hard_swish", 12 | se_ratios=[0, 0, 0.25, 0.25, 0, 0, 0.25, 0.25, 0.25, 0.25, 0.25], 13 | se_activation=("hard_swish", "hard_sigmoid_torch"), 14 | se_limit_round_down=0.95, 15 | use_expanded_se_ratio=False, 16 | output_num_features=1984, 17 | use_output_feature_bias=False, 18 | model_name="fbnetv3", 19 | **kwargs, 20 | ): 21 | kwargs.pop("kwargs", None) 22 | return MobileNetV3(**locals(), **kwargs) 23 | 24 | 25 | @register_model 26 | def FBNetV3B(input_shape=(256, 256, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 27 | return FBNetV3(**locals(), model_name="fbnetv3_b", **kwargs) 28 | 29 | 30 | @register_model 31 | 
def FBNetV3D(input_shape=(256, 256, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 32 | num_blocks = [2, 6, 1, 4, 1, 4, 1, 6, 1, 5, 1] 33 | out_channels = [16, 24, 40, 40, 72, 72, 128, 128, 208, 208, 240] 34 | expands = [1, [5, 2, 2, 2, 2, 2], 4, 3, 5, 3, 5, 3, 6, 5, 6] 35 | kernel_sizes = [3, 3, 5, 3, 3, 3, 3, 5, 3, 5, 5] 36 | stem_width = 24 37 | return FBNetV3(**locals(), model_name="fbnetv3_d", **kwargs) 38 | 39 | 40 | @register_model 41 | def FBNetV3G(input_shape=(256, 256, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 42 | num_blocks = [3, 5, 1, 4, 1, 4, 1, 8, 1, 6, 2] 43 | out_channels = [24, 40, 56, 56, 104, 104, 160, 160, 264, 264, 288] 44 | expands = [1, [4, 2, 2, 2, 2], 4, 3, 5, 3, 5, 3, 6, 5, 6] 45 | kernel_sizes = [3, 5, 5, 5, 5, 3, 3, 5, 3, 5, 5] 46 | stem_width = 32 47 | return FBNetV3(**locals(), model_name="fbnetv3_g", **kwargs) 48 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnet_family/resnet_deep.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.aotnet import AotNet 2 | from keras_cv_attention_models.models import register_model 3 | from keras_cv_attention_models.download_and_load import reload_model_weights 4 | 5 | PRETRAINED_DICT = { 6 | "resnet50d": {"imagenet": "1b71933a82b058ba1e605ee5c01f64b2"}, 7 | "resnet101d": {"imagenet": "79b075be5cf222cff2bced7a5a117623"}, 8 | "resnet152d": {"imagenet": "0a15299b9abe1fee3ae06d9a59d13a3f"}, 9 | "resnet200d": {"imagenet": "b5961494e0072c342b838c77ef52ddc5"}, 10 | } 11 | 12 | 13 | def ResNetD(num_blocks, input_shape=(224, 224, 3), pretrained="imagenet", stem_type="deep", strides=2, shortcut_type="avg", **kwargs): 14 | strides = strides if isinstance(strides, (list, tuple)) else [1, 2, 2, strides] 15 | model = AotNet(num_blocks, input_shape=input_shape, stem_type=stem_type, strides=strides, shortcut_type=shortcut_type, **kwargs) 16 | reload_model_weights(model, pretrained_dict=PRETRAINED_DICT, sub_release="resnet_family", pretrained=pretrained) 17 | return model 18 | 19 | 20 | @register_model 21 | def ResNet50D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 22 | num_blocks = [3, 4, 6, 3] 23 | return ResNetD(**locals(), model_name="resnet50d", **kwargs) 24 | 25 | 26 | @register_model 27 | def ResNet101D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 28 | num_blocks = [3, 4, 23, 3] 29 | return ResNetD(**locals(), model_name="resnet101d", **kwargs) 30 | 31 | 32 | @register_model 33 | def ResNet152D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 34 | num_blocks = [3, 8, 36, 3] 35 | return ResNetD(**locals(), model_name="resnet152d", **kwargs) 36 | 37 | 38 | @register_model 39 | def ResNet200D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 40 | num_blocks = [3, 24, 36, 3] 41 | return ResNetD(**locals(), model_name="resnet200d", **kwargs) 42 | -------------------------------------------------------------------------------- /keras_cv_attention_models/clip/tf_data.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from 
keras_cv_attention_models.imagenet.tf_data import init_mean_std_by_rescale_mode, tf_imread, random_crop_and_resize_image, build_custom_dataset 3 | 4 | 5 | def image_process(image, image_size=(224, 224), is_train=True): 6 | image = tf_imread(image) 7 | if is_train: 8 | image = random_crop_and_resize_image(image, image_size, scale=(0.9, 1.0), method="bicubic", antialias=True)[0] 9 | else: 10 | image = tf.image.resize(image, image_size, method="bicubic", antialias=True) 11 | image = tf.cast(image, tf.float32) 12 | image.set_shape([*image_size, 3]) 13 | return image 14 | 15 | 16 | def init_dataset(data_path, caption_tokenizer, batch_size=64, image_size=224, rescale_mode="torch"): 17 | dataset, total_images, num_classes, num_channels = build_custom_dataset(data_path, with_info=True, caption_tokenizer=caption_tokenizer) 18 | 19 | mean, std = init_mean_std_by_rescale_mode(rescale_mode) 20 | image_size = image_size if isinstance(image_size, (list, tuple)) else [image_size, image_size] 21 | 22 | AUTOTUNE, buffer_size, seed = tf.data.AUTOTUNE, batch_size * 100, None 23 | train_pre_batch = lambda data_point: (image_process(data_point["image"], image_size, is_train=True), data_point["caption"]) 24 | y_true = tf.range(batch_size) 25 | train_post_batch = lambda xx, caption: (((xx - mean) / std, caption), y_true) 26 | 27 | train_dataset = dataset["train"] 28 | train_dataset = train_dataset.shuffle(buffer_size, seed=seed).map(train_pre_batch, num_parallel_calls=AUTOTUNE) 29 | train_dataset = train_dataset.batch(batch_size, drop_remainder=True).map(train_post_batch, num_parallel_calls=AUTOTUNE) 30 | train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE) 31 | 32 | test_dataset = dataset.get("validation", dataset.get("test", None)) 33 | if test_dataset is not None: 34 | test_pre_batch = lambda data_point: (image_process(data_point["image"], image_size, is_train=False), data_point["caption"]) 35 | test_dataset = test_dataset.map(test_pre_batch, num_parallel_calls=AUTOTUNE) 36 | test_dataset = test_dataset.batch(batch_size, drop_remainder=True).map(train_post_batch) 37 | 38 | return train_dataset, test_dataset 39 | -------------------------------------------------------------------------------- /keras_cv_attention_models/cspnext/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.cspnext.cspnext import ( 2 | CSPNeXt, 3 | CSPNeXtTiny, 4 | CSPNeXtSmall, 5 | CSPNeXtMedium, 6 | CSPNeXtLarge, 7 | CSPNeXtXLarge, 8 | ) 9 | 10 | __head_doc__ = """ 11 | Keras implementation of [Github open-mmlab/mmdetection/rtmdet](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet#classification). 12 | CSPNeXt is the backbone from Paper [PDF 2212.07784 RTMDet: An Empirical Study of Designing Real-Time Object Detectors](https://arxiv.org/abs/2212.07784). 13 | """ 14 | 15 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 16 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 17 | activation: activation used in whole model, default `gelu`. 18 | dropout: dropout rate if top layers is included. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | pretrained: one of None or "imagenet". 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 
26 | """ 27 | 28 | CSPNeXt.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | out_channels: output channels for each stack. 32 | stem_width: hidden dimension stem blocks. 33 | model_name: string, model name. 34 | """ + __tail_doc__ + """ 35 | Model architectures: 36 | | Model | Params | FLOPs | Input | Top1 Acc | 37 | | ------------- | ------ | ----- | ----- | -------- | 38 | | CSPNeXtTiny | 2.73M | 0.34G | 224 | 69.44 | 39 | | CSPNeXtSmall | 4.89M | 0.66G | 224 | 74.41 | 40 | | CSPNeXtMedium | 13.05M | 1.92G | 224 | 79.27 | 41 | | CSPNeXtLarge | 27.16M | 4.19G | 224 | 81.30 | 42 | | CSPNeXtXLarge | 48.85M | 7.75G | 224 | 82.10 | 43 | """ 44 | 45 | CSPNeXtTiny.__doc__ = __head_doc__ + """ 46 | Args: 47 | """ + __tail_doc__ 48 | 49 | CSPNeXtSmall.__doc__ = CSPNeXtTiny.__doc__ 50 | CSPNeXtMedium.__doc__ = CSPNeXtTiny.__doc__ 51 | CSPNeXtLarge.__doc__ = CSPNeXtTiny.__doc__ 52 | CSPNeXtXLarge.__doc__ = CSPNeXtTiny.__doc__ 53 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilenetv3_family/lcnet.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilenetv3_family.mobilenetv3 import MobileNetV3 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def LCNet( 6 | num_blocks=[1, 2, 2, 1, 5, 2], 7 | out_channels=[32, 64, 128, 256, 256, 512], 8 | expands=1, 9 | kernel_sizes=[3, 3, 3, 3, 5, 5], 10 | strides=[1, 2, 2, 2, 1, 2], 11 | activations="hard_swish", 12 | disable_shortcut=True, 13 | use_blocks_output_activation=True, 14 | se_ratios=[0, 0, 0, 0, 0, 0.25], 15 | output_num_features=1280, 16 | use_additional_output_conv=False, 17 | model_name="lcnet", 18 | **kwargs, 19 | ): 20 | kwargs.pop("kwargs", None) 21 | return MobileNetV3(**locals(), **kwargs) 22 | 23 | 24 | @register_model 25 | def LCNet050(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 26 | return LCNet(**locals(), width_ratio=0.5, model_name="lcnet_050", **kwargs) 27 | 28 | 29 | @register_model 30 | def LCNet075(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 31 | return LCNet(**locals(), width_ratio=0.75, model_name="lcnet_075", **kwargs) 32 | 33 | 34 | @register_model 35 | def LCNet100(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 36 | return LCNet(**locals(), model_name="lcnet_100", **kwargs) 37 | 38 | 39 | @register_model 40 | def LCNet150(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 41 | use_output_feature_bias = False 42 | return LCNet(**locals(), width_ratio=1.5, model_name="lcnet_150", **kwargs) 43 | 44 | 45 | @register_model 46 | def LCNet200(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | use_output_feature_bias = False 48 | return LCNet(**locals(), width_ratio=2.0, model_name="lcnet_200", **kwargs) 49 | 50 | 51 | @register_model 52 | def LCNet250(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 53 | use_output_feature_bias = False 54 | return LCNet(**locals(), width_ratio=2.5, model_name="lcnet_250", **kwargs) 55 | -------------------------------------------------------------------------------- 
/keras_cv_attention_models/pytorch_backend/callbacks.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | 5 | class Callback: 6 | def __init__(self): 7 | self.validation_data = None 8 | self.model = None 9 | 10 | def set_params(self, params): 11 | self.params = params 12 | 13 | def set_model(self, model): 14 | self.model = model 15 | 16 | def on_train_batch_begin(self, batch, logs=None): 17 | pass 18 | 19 | def on_train_batch_end(self, batch, logs=None): 20 | pass 21 | 22 | def on_epoch_begin(self, cur_epoch, logs=None): 23 | pass 24 | 25 | def on_epoch_end(self, cur_epoch, logs=None): 26 | pass 27 | 28 | def on_test_batch_begin(self, batch, logs=None): 29 | pass 30 | 31 | def on_test_batch_end(self, batch, logs=None): 32 | pass 33 | 34 | def on_test_begin(self, logs=None): 35 | pass 36 | 37 | def on_test_end(self, logs=None): 38 | pass 39 | 40 | 41 | class TerminateOnNaN(Callback): 42 | def on_train_batch_end(self, batch, logs=None): 43 | logs = logs or {} 44 | loss = logs.get("loss") 45 | if loss is not None: 46 | if not np.isfinite(loss): 47 | print("\nError: Invalid loss, terminating training") 48 | # self.model.stop_training = True 49 | sys.exit() 50 | 51 | 52 | class TensorBoard(Callback): 53 | def __init__(self, log_dir="logs", histogram_freq=1, **kwargs): 54 | super().__init__() 55 | self.log_dir, self.histogram_freq = log_dir, histogram_freq 56 | try: 57 | from torch.utils.tensorboard import SummaryWriter 58 | 59 | self.tensorboard_writer = SummaryWriter(self.log_dir) 60 | print(">>>> Tensorboard writer created, summary will be written to '{}', view with 'tensorboard --logdir {}'".format(log_dir, log_dir)) 61 | except: 62 | self.tensorboard_writer = None 63 | print("[Error] tensorboard not installed, try `pip install tensorboard`") 64 | 65 | def on_epoch_end(self, cur_epoch, logs=None): 66 | if self.tensorboard_writer is None: 67 | return 68 | logs = logs or {} 69 | for kk, vv in logs.items(): 70 | self.tensorboard_writer.add_scalar(kk, vv, cur_epoch) 71 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpt2/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.gpt2.gpt2 import ( 2 | GPT2, 3 | GPT2_Base, 4 | GPT2_Medium, 5 | GPT2_Large, 6 | GPT2_XLarge, 7 | RunPrediction, 8 | PositionalIndex, 9 | CausalMask, 10 | load_weights_from_huggingface, 11 | ) 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github openai/gpt-2](https://github.com/openai/gpt-2). 15 | Paper [Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ vocab_size: model vocab size. 19 | max_block_size: number of tokens generated in each sample. 20 | include_top: boolean value if including output Dense head layer. Set false to exclude the head layer. 21 | dropout: float value for dropout rate for Embedding layer and attention blocks. 22 | activation: activation used in whole model, default `gelu/app`. 23 | pretrained: None or one of ["webtext", "huggingface"]. 24 | - if "webtext", will try to download and load ported weights if available. 25 | - if "huggingface", will try converting and loading weights from huggingface `transformers` package. 26 | - if None, will initialize model with random weights. 27 | 28 | Returns: 29 | A `keras.Model` instance.
30 | """ 31 | 32 | GPT2.__doc__ = __head_doc__ + """ 33 | Args: 34 | num_blocks: num of `attention_mlp_block`s. 35 | embedding_size: `attention_mlp_block` block embedding size. 36 | num_heads: num of heads. 37 | block_use_bias: boolean value if using bias for `attention_mlp_block` Dense layers. 38 | model_name: string, model name. 39 | """ + __tail_doc__ + """ 40 | Model architectures: 41 | | Model | Params | FLOPs | vocab_size | LAMBADA PPL | 42 | | ------------| ------- | ------- | ---------- | ----------- | 43 | | GPT2_Base | 163.04M | 146.42G | 50257 | 35.13 | 44 | | GPT2_Medium | 406.29M | 415.07G | 50257 | 15.60 | 45 | | GPT2_Large | 838.36M | 890.28G | 50257 | 10.87 | 46 | | GPT2_XLarge | 1.638B | 1758.3G | 50257 | 8.63 | 47 | """ 48 | 49 | GPT2_Base.__doc__ = __head_doc__ + """ 50 | Args: 51 | """ + __tail_doc__ 52 | 53 | GPT2_Medium.__doc__ = GPT2_Base.__doc__ 54 | GPT2_Large.__doc__ = GPT2_Base.__doc__ 55 | GPT2_XLarge.__doc__ = GPT2_Base.__doc__ 56 | -------------------------------------------------------------------------------- /setup_kecam.py: -------------------------------------------------------------------------------- 1 | """Setup""" 2 | 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 11 | long_description = f.read() 12 | long_description = long_description.replace( 13 | "](keras_cv_attention_models", "](https://github.com/leondgarse/keras_cv_attention_models/tree/main/keras_cv_attention_models" 14 | ) 15 | 16 | exec(open("keras_cv_attention_models/version.py").read()) 17 | setup( 18 | name="kecam", 19 | version=__version__, 20 | description="Tensorflow keras computer vision attention models. Alias kecam. https://github.com/leondgarse/keras_cv_attention_models", 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/leondgarse/keras_cv_attention_models", 24 | author="Leondgarse", 25 | author_email="leondgarse@gmail.com", 26 | classifiers=[ 27 | # How mature is this project? Common values are 28 | # 3 - Alpha 29 | # 4 - Beta 30 | # 5 - Production/Stable 31 | "Development Status :: 3 - Alpha", 32 | "Intended Audience :: Developers", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Programming Language :: Python :: 3.6", 36 | "Programming Language :: Python :: 3.7", 37 | "Programming Language :: Python :: 3.8", 38 | "Topic :: Scientific/Engineering", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Topic :: Software Development", 41 | "Topic :: Software Development :: Libraries", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | ], 44 | # Note that this is a string of words separated by whitespace, not a list. 
45 | keywords="tensorflow keras cv attention pretrained models kecam", 46 | packages=find_packages(exclude=["tests"]) + ["keras_cv_attention_models.pytorch_backend"], 47 | include_package_data=True, 48 | install_requires=["h5py", "pillow", "tqdm", "ftfy", "regex"], # ftfy and regex required for language models 49 | python_requires=">=3.6", 50 | license="Apache 2.0", 51 | ) 52 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpvit/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras GPViT___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github ChenhongyiYang/GPViT](https://github.com/ChenhongyiYang/GPViT). Paper [PDF 2212.06795 GPVIT: A HIGH RESOLUTION NON-HIERARCHICAL VISION TRANSFORMER WITH GROUP PROPAGATION](https://arxiv.org/pdf/2212.06795.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | -------- | ------ | ------ | ----- | -------- | -------- | 12 | | GPViT_L1 | 9.59M | 6.15G | 224 | 80.5 | [gpvit_l1_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l1_224_imagenet.h5) | 13 | | GPViT_L2 | 24.2M | 15.74G | 224 | 83.4 | [gpvit_l2_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l2_224_imagenet.h5) | 14 | | GPViT_L3 | 36.7M | 23.54G | 224 | 84.1 | [gpvit_l3_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l3_224_imagenet.h5) | 15 | | GPViT_L4 | 75.5M | 48.29G | 224 | 84.3 | [gpvit_l4_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpvit/gpvit_l4_224_imagenet.h5) | 16 | ## Usage 17 | ```py 18 | from keras_cv_attention_models import gpvit 19 | 20 | # Will download and load pretrained imagenet weights. 21 | mm = gpvit.GPViT_L1(pretrained="imagenet") 22 | 23 | # Run prediction 24 | import tensorflow as tf 25 | from tensorflow import keras 26 | from skimage.data import chelsea 27 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 28 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 29 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 30 | # [('n02124075', 'Egyptian_cat', 0.7434748), ('n02123045', 'tabby', 0.089776225), ...] 31 | ``` 32 | **Change input resolution**. 33 | ```py 34 | from keras_cv_attention_models import gpvit 35 | mm = gpvit.GPViT_L1(input_shape=(128, 192, 3), pretrained="imagenet") 36 | # >>>> Load pretrained from: ~/.keras/models/gp_vit_l1_224_imagenet.h5 37 | # >>>> Reload mismatched weights: 224 -> (128, 192) 38 | # >>>> Reload layer: positional_embedding 39 | 40 | # Run prediction 41 | from skimage.data import chelsea 42 | preds = mm(mm.preprocess_input(chelsea())) 43 | print(mm.decode_predictions(preds)) 44 | # [('n02124075', 'Egyptian_cat', 0.8140152), ('n02123045', 'tabby', 0.05595901), ...] 
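# A couple of follow-up checks, given here as an illustrative sketch.
# The rebuilt model reports the new static input shape:
print(mm.input_shape)
# (None, 128, 192, 3)
# A headless feature extractor can be built the same way by passing `num_classes=0`,
# matching the `num_classes=0` usage in tests/test_models_tf.py; kept commented as an untested sketch:
# features_model = gpvit.GPViT_L1(input_shape=(128, 192, 3), num_classes=0, pretrained="imagenet")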
45 | ``` 46 | *** 47 | -------------------------------------------------------------------------------- /keras_cv_attention_models/beit/vit.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.beit.beit import Beit, keras_model_load_weights_from_pytorch_model 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def ViT(attn_qv_bias=False, attn_qkv_bias=True, use_abs_pos_emb=True, layer_scale=0, use_mean_pooling_head=False, model_name="vit", **kwargs): 6 | kwargs.pop("kwargs", None) 7 | return Beit(**locals(), **kwargs) 8 | 9 | 10 | def ViTText( 11 | vocab_size=49408, 12 | max_block_size=77, 13 | text_positional_dropout=0, 14 | text_use_positional_embedding=True, 15 | include_top=True, 16 | layer_norm_epsilon=1e-5, 17 | activation="gelu/quick", 18 | model_name="vit_text", 19 | **kwargs, 20 | ): 21 | attn_qv_bias = kwargs.pop("attn_qv_bias", False) 22 | attn_qkv_bias = kwargs.pop("attn_qkv_bias", True) 23 | use_abs_pos_emb = kwargs.pop("use_abs_pos_emb", True) 24 | layer_scale = kwargs.pop("layer_scale", 0) 25 | use_mean_pooling_head = kwargs.pop("use_mean_pooling_head", False) 26 | kwargs.pop("kwargs", None) 27 | return Beit(**locals(), **kwargs) 28 | 29 | 30 | @register_model 31 | def ViTTinyPatch16(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 32 | embed_dim = 192 33 | depth = 12 34 | num_heads = 3 35 | patch_size = kwargs.pop("patch_size", 16) 36 | return ViT(**locals(), model_name="vit_tiny_patch16", **kwargs) 37 | 38 | 39 | @register_model 40 | def ViTBasePatch16(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 41 | embed_dim = 768 42 | depth = 12 43 | num_heads = 12 44 | patch_size = kwargs.pop("patch_size", 16) 45 | return ViT(**locals(), model_name="vit_base_patch16", **kwargs) 46 | 47 | 48 | @register_model 49 | def ViTLargePatch14(input_shape=(196, 196, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 50 | embed_dim = 1024 51 | depth = 24 52 | num_heads = 16 53 | patch_size = kwargs.pop("patch_size", 14) 54 | return ViT(**locals(), model_name="vit_large_patch14", **kwargs) 55 | 56 | 57 | @register_model 58 | def ViTTextLargePatch14(vocab_size=49408, max_block_size=77, activation="gelu/quick", include_top=True, pretrained="clip", **kwargs): 59 | embed_dim = 768 60 | depth = 12 61 | num_heads = 12 62 | patch_size = kwargs.pop("patch_size", 14) 63 | return ViTText(**locals(), model_name="vit_text_large_patch14", **kwargs) 64 | -------------------------------------------------------------------------------- /keras_cv_attention_models/coco/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend 2 | 3 | from keras_cv_attention_models.coco import eval_func, anchors_func 4 | from keras_cv_attention_models.coco.eval_func import DecodePredictions, COCOEvalCallback 5 | from keras_cv_attention_models.coco.anchors_func import ( 6 | get_anchors_mode_parameters, 7 | get_anchors, 8 | get_anchor_free_anchors, 9 | get_yolor_anchors, 10 | get_anchors_mode_by_anchors, 11 | get_pyramid_levels_by_anchors, 12 | decode_bboxes, 13 | ) 14 | from keras_cv_attention_models.coco.info import COCO_80_LABEL_DICT, COCO_90_LABEL_DICT, COCO_80_to_90_LABEL_DICT 15 | from 
keras_cv_attention_models.plot_func import draw_bboxes, show_image_with_bboxes 16 | from keras_cv_attention_models.plot_func import show_detection_batch_sample as show_batch_sample 17 | 18 | if backend.is_tensorflow_backend: 19 | from keras_cv_attention_models.coco import tf_data as data 20 | from keras_cv_attention_models.coco import tf_losses as losses 21 | from keras_cv_attention_models.coco.tf_data import aspect_aware_resize_and_crop_image, init_mean_std_by_rescale_mode, init_dataset 22 | 23 | data.init_dataset.__doc__ = """ Init dataset by name. 24 | Args: 25 | data_name: the registered dataset name from `tensorflow_datasets`. 26 | input_shape: input shape. 27 | batch_size: batch size. 28 | buffer_size: dataset shuffle buffer size. 29 | info_only: boolean value if returns dataset info only. 30 | max_labels_per_image: . 31 | anchors_mode: . 32 | anchor_pyramid_levels: . 33 | anchor_aspect_ratios: . 34 | anchor_num_scales: . 35 | anchor_scale: . 36 | anchor_scale: . 37 | cutmix_alpha: cutmix applying probability. 38 | rescale_mode: one of ["tf", "torch", "raw01", "raw"]. Detail in `data.init_mean_std_by_rescale_mode`. Or specific `(mean, std)` like `(128.0, 128.0)`. 39 | random_crop_mode: . 40 | mosaic_mix_prob: . 41 | resize_method: one of ["nearest", "bilinear", "bicubic"]. Resize method for `tf.image.resize`. 42 | resize_antialias: boolean value if using antialias for `tf.image.resize`. 43 | magnitude: randaug magnitude. 44 | num_layers: randaug num_layers. 45 | augment_kwargs: randaug kwargs. Too many to list them all. 46 | 47 | Returns: train_dataset, test_dataset, total_images, num_classes, steps_per_epoch 48 | """ 49 | else: 50 | from keras_cv_attention_models.coco import torch_data as data 51 | from keras_cv_attention_models.coco import torch_losses as losses 52 | from keras_cv_attention_models.coco.torch_data import aspect_aware_resize_and_crop_image, init_mean_std_by_rescale_mode, init_dataset 53 | -------------------------------------------------------------------------------- /keras_cv_attention_models/llama2/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.llama2.llama2 import ( 2 | LLaMA2, 3 | LLaMA2_15M, 4 | LLaMA2_42M, 5 | LLaMA2_110M, 6 | LLaMA2_1B, 7 | LLaMA2_7B, 8 | RunPrediction, 9 | PositionalEncodingFourierRot1D, 10 | RMSNorm, 11 | convert_huggingface_weights_to_h5, 12 | ) 13 | 14 | __head_doc__ = """ 15 | Keras implementation of [Github facebookresearch/llama](https://github.com/facebookresearch/llama). 16 | Paper [PDF 2307.09288 Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/pdf/2307.09288.pdf). 17 | """ 18 | 19 | __tail_doc__ = """ vocab_size: model vocab size. 20 | max_block_size: number of tokens generated in each sample. 21 | include_top: boolean value if including output Dense head layer. Set false to exclude the head layer. 22 | dropout: float value for dropout rate for Embedding layer and attention blocks. 23 | activation: activation used in whole model, default `swish`. 24 | pretrained: None or "tiny_stories", or specific ".pt" or ".h5" file. 25 | - if "tiny_stories" or "tiny_llama_1.1B_chat_v0.4", will try to download and load ported weights if available. 26 | - if "xxx.pt", will try converting and loading weights from .pt file. 27 | - if "xxx.h5", will just load weights. 28 | - if None, will initialize model with random weights. 29 | 30 | Returns: 31 | A `keras.Model` instance.
32 | """ 33 | 34 | LLaMA2.__doc__ = __head_doc__ + """ 35 | Args: 36 | num_blocks: num of `attention_fft_block`s. 37 | embedding_size: `attention_fft_block` block embedding size. 38 | hidden_divisible: int value making fft block hidden layer size multiple of large power of 2. 39 | num_heads: num of heads. 40 | num_kv_heads: int value specific key value heads, num_heads should be divisible by num_kv_heads. Default -1 for equal with num_heads. 41 | block_use_bias: boolean value if using bias for `attention_fft_block` Dense layers. 42 | model_name: string, model name. 43 | """ + __tail_doc__ + """ 44 | Model architectures: 45 | | Model | Params | FLOPs | vocab_size | Val loss | 46 | | ----------- | ------ | ------ | ---------- | -------- | 47 | | LLaMA2_15M | 24.41M | 4.06G | 32000 | 1.072 | 48 | | LLaMA2_42M | 58.17M | 50.7G | 32000 | 0.847 | 49 | | LLaMA2_110M | 134.1M | 130.2G | 32000 | 0.760 | 50 | | LLaMA2_1B | 1.10B | 2.50T | 32003 | | 51 | | LLaMA2_7B | 6.74B | 14.54T | 32000 | | 52 | """ 53 | 54 | LLaMA2_15M.__doc__ = __head_doc__ + """ 55 | Args: 56 | """ + __tail_doc__ 57 | 58 | LLaMA2_42M.__doc__ = LLaMA2_15M.__doc__ 59 | LLaMA2_110M.__doc__ = LLaMA2_15M.__doc__ 60 | LLaMA2_1B.__doc__ = LLaMA2_15M.__doc__ 61 | LLaMA2_7B.__doc__ = LLaMA2_15M.__doc__ 62 | -------------------------------------------------------------------------------- /keras_cv_attention_models/inceptionnext/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.inceptionnext.inceptionnext import InceptionNeXt, InceptionNeXtTiny, InceptionNeXtSmall, InceptionNeXtBase 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github sail-sg/inceptionnext](https://github.com/sail-sg/inceptionnext). 5 | Paper [PDF 2303.16900 InceptionNeXt: When Inception Meets ConvNeXt](https://arxiv.org/pdf/2303.16900.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: int value indicates layer scale init value for each stack. Default `[0, 0, 1e-6, 1e-6]`, 0 for not using. 17 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | InceptionNeXt.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dims: output channels for each stack. 32 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 33 | model_name: string, model name. 
34 | """ + __tail_doc__ + """ 35 | Model architectures: 36 | | Model | Params | FLOP s | Input | Top1 Acc | 37 | | ------------------ | ------ | ------ | ----- | -------- | 38 | | InceptionNeXtTiny | 28.05M | 4.21G | 224 | 82.3 | 39 | | InceptionNeXtSmall | 49.37M | 8.39G | 224 | 83.5 | 40 | | InceptionNeXtBase | 86.67M | 14.88G | 224 | 84.0 | 41 | | | 86.67M | 43.73G | 384 | 85.2 | 42 | """ 43 | 44 | InceptionNeXtTiny.__doc__ = __head_doc__ + """ 45 | Args: 46 | """ + __tail_doc__ 47 | 48 | InceptionNeXtSmall.__doc__ = InceptionNeXtTiny.__doc__ 49 | InceptionNeXtBase.__doc__ = InceptionNeXtTiny.__doc__ 50 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pytorch_backend/metrics.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.pytorch_backend import functional 2 | 3 | BUILDIN_METRICS = {} 4 | 5 | 6 | def register_metrics(name=None): 7 | def decorator(arg): 8 | registered_names = name or [arg.__name__] 9 | registered_names = registered_names if isinstance(registered_names, (list, tuple)) else [registered_names] 10 | for registered_name in registered_names: 11 | if registered_name in BUILDIN_METRICS: 12 | raise ValueError(f"{registered_name} has already been registered to " f"{BUILDIN_METRICS[registered_name]}") 13 | BUILDIN_METRICS[registered_name] = arg 14 | return arg 15 | 16 | return decorator 17 | 18 | 19 | class Metric: 20 | def __init__(self, name=None, **kwargs): 21 | super().__init__() 22 | self.name = name 23 | self.eval_only = False 24 | self.reset_state() 25 | 26 | def reset_state(self): 27 | pass 28 | 29 | def update_state(self, y_true, y_pred, sample_weight=None): 30 | pass 31 | 32 | def result(self): 33 | pass 34 | 35 | 36 | @register_metrics(name=["acc", "accuracy"]) 37 | class Accuracy(Metric): 38 | def __init__(self, name="acc"): 39 | super().__init__(name=name) 40 | 41 | def reset_state(self): 42 | self.sum_value, self.passed_steps = 0.0, 0 43 | 44 | def update_state(self, y_true, y_pred, sample_weight=None): 45 | y_pred = functional.argmax(y_pred, axis=-1) 46 | if len(y_true.shape) > len(y_pred.shape): 47 | y_true = functional.argmax(y_true, axis=-1) 48 | cur_acc = functional.reduce_mean(functional.cast(y_true == y_pred, "float32")) 49 | self.sum_value = self.sum_value + cur_acc 50 | self.passed_steps += 1 51 | 52 | def result(self): 53 | return self.sum_value / self.passed_steps 54 | 55 | 56 | @register_metrics(name=["acc5", "accuracy5"]) 57 | class Accuracy5(Metric): 58 | def __init__(self, name="acc5"): 59 | super().__init__(name=name) 60 | self.eval_only = True 61 | 62 | def reset_state(self): 63 | self.sum_value, self.passed_steps = 0.0, 0 64 | 65 | def update_state(self, y_true, y_pred, sample_weight=None): 66 | y_pred = functional.argsort(y_pred, direction="DESCENDING", axis=-1)[:, :5] 67 | if len(y_true.shape) >= len(y_pred.shape): 68 | y_true = functional.argmax(y_true, axis=-1) 69 | cur_acc = functional.reduce_mean(functional.convert_to_tensor([y_true[id] in y_pred[id] for id in range(y_true.shape[0])], "float32")) 70 | self.sum_value = self.sum_value + cur_acc 71 | self.passed_steps += 1 72 | 73 | def result(self): 74 | return self.sum_value / self.passed_steps 75 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup""" 2 | 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 
| from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 11 | long_description = f.read() 12 | long_description = long_description.replace( 13 | "](keras_cv_attention_models", "](https://github.com/leondgarse/keras_cv_attention_models/tree/main/keras_cv_attention_models" 14 | ) 15 | 16 | exec(open("keras_cv_attention_models/version.py").read()) 17 | setup( 18 | name="keras-cv-attention-models", 19 | version=__version__, 20 | description="Tensorflow keras computer vision attention models. Alias kecam. https://github.com/leondgarse/keras_cv_attention_models", 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/leondgarse/keras_cv_attention_models", 24 | author="Leondgarse", 25 | author_email="leondgarse@gmail.com", 26 | classifiers=[ 27 | # How mature is this project? Common values are 28 | # 3 - Alpha 29 | # 4 - Beta 30 | # 5 - Production/Stable 31 | "Development Status :: 3 - Alpha", 32 | "Intended Audience :: Developers", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Programming Language :: Python :: 3.6", 36 | "Programming Language :: Python :: 3.7", 37 | "Programming Language :: Python :: 3.8", 38 | "Topic :: Scientific/Engineering", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Topic :: Software Development", 41 | "Topic :: Software Development :: Libraries", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | ], 44 | # Note that this is a string of words separated by whitespace, not a list. 45 | keywords="tensorflow keras cv attention pretrained models kecam", 46 | packages=find_packages(exclude=["tests"]) + ["keras_cv_attention_models.pytorch_backend"], 47 | include_package_data=True, 48 | install_requires=[ 49 | "pillow", 50 | "tqdm", 51 | "ftfy", # required for language models 52 | "regex", # required for language models 53 | # "tensorflow-macos;platform_system=='Darwin'", # [???] 
54 | "tensorflow;platform_system!='Darwin'", 55 | # "tensorflow-addons;platform_machine!='aarch64' and platform_machine!='aarch32'", # [deprecated] 56 | # "tensorflow-datasets;platform_machine!='aarch64' and platform_machine!='aarch32'", # >4.7.0 needs dm-tree, failed on arm, just skip 57 | ], 58 | python_requires=">=3.6", 59 | license="Apache 2.0", 60 | ) 61 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilenetv3_family/tinynet.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilenetv3_family.mobilenetv3 import MobileNetV3 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def get_expanded_width_depth(width, depth): 6 | out_channels = [ii * width for ii in [16, 24, 40, 80, 112, 192, 320]] 7 | num_blocks = [int(round(ii * depth)) for ii in [1, 2, 2, 3, 3, 4, 1]] 8 | return out_channels, num_blocks 9 | 10 | 11 | def TinyNet( 12 | num_blocks=[1, 2, 2, 3, 3, 4, 1], 13 | out_channels=[16, 24, 40, 80, 112, 192, 320], 14 | expands=[1, 6, 6, 6, 6, 6, 6], 15 | kernel_sizes=[3, 3, 5, 3, 5, 5, 3], 16 | strides=[1, 2, 2, 2, 1, 2, 1], 17 | activations="swish", 18 | stem_width=32, 19 | fix_stem=True, 20 | se_ratios=0.25, 21 | se_activation=None, # None for same with activations 22 | use_expanded_se_ratio=False, 23 | se_divisor=1, 24 | output_num_features=1280, 25 | use_additional_output_conv=False, 26 | use_output_feature_bias=False, 27 | use_avg_pool_conv_output=False, 28 | model_name="tinynet", 29 | **kwargs, 30 | ): 31 | stem_feature_activation = activations 32 | kwargs.pop("kwargs", None) 33 | return MobileNetV3(**locals(), **kwargs) 34 | 35 | 36 | @register_model 37 | def TinyNetA(input_shape=(192, 192, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 38 | out_channels, num_blocks = get_expanded_width_depth(1.0, 1.2) 39 | return TinyNet(**locals(), model_name="tinynet_a", **kwargs) 40 | 41 | 42 | @register_model 43 | def TinyNetB(input_shape=(188, 188, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 44 | out_channels, num_blocks = get_expanded_width_depth(0.75, 1.1) 45 | return TinyNet(**locals(), model_name="tinynet_b", **kwargs) 46 | 47 | 48 | @register_model 49 | def TinyNetC(input_shape=(184, 184, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 50 | out_channels, num_blocks = get_expanded_width_depth(0.54, 0.85) 51 | return TinyNet(**locals(), model_name="tinynet_c", **kwargs) 52 | 53 | 54 | @register_model 55 | def TinyNetD(input_shape=(152, 152, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 56 | out_channels, num_blocks = get_expanded_width_depth(0.54, 0.695) 57 | return TinyNet(**locals(), model_name="tinynet_d", **kwargs) 58 | 59 | 60 | @register_model 61 | def TinyNetE(input_shape=(106, 106, 3), num_classes=1000, activations="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 62 | out_channels, num_blocks = get_expanded_width_depth(0.51, 0.6) 63 | return TinyNet(**locals(), model_name="tinynet_e", **kwargs) 64 | -------------------------------------------------------------------------------- /keras_cv_attention_models/moganet/__init__.py: -------------------------------------------------------------------------------- 1 | from 
keras_cv_attention_models.moganet.moganet import MogaNet, MogaNetXtiny, MogaNetTiny, MogaNetSmall, MogaNetBase, MogaNetLarge 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github Westlake-AI/MogaNet](https://github.com/Westlake-AI/MogaNet). 5 | Paper [PDF 2211.03295 Efficient Multi-order Gated Aggregation Network](https://arxiv.org/pdf/2211.03295.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation for non-attention blocks, default `gelu`. 11 | attn_activation: activation for attention blocks, default `swish`. `None` for same with `activation`. 12 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 13 | Can be a constant value like `0.2`, 14 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 15 | A higher value means a higher probability will drop the deep branch. 16 | or `0` to disable (default). 17 | layer_scale: int value indicates layer scale init value for each stack. Default `1e-5`, 0 for not using. 18 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | Default is `None`. 22 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 23 | Will try to download and load pre-trained model weights if not None. 24 | 25 | Returns: 26 | A `keras.Model` instance. 27 | """ 28 | 29 | MogaNet.__doc__ = __head_doc__ + """ 30 | Args: 31 | num_blocks: number of blocks in each stack. 32 | out_channels: output channels for each stack. 33 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 34 | model_name: string, model name. 35 | """ + __tail_doc__ + """ 36 | Model architectures: 37 | | Model | Params | FLOPs | Input | Top1 Acc | 38 | | ------------ | ------ | ------ | ----- | -------- | 39 | | MogaNetXtiny | 2.96M | 806M | 224 | 76.5 | 40 | | MogaNetTiny | 5.20M | 1.11G | 224 | 79.0 | 41 | | | 5.20M | 1.45G | 256 | 79.6 | 42 | | MogaNetSmall | 25.3M | 4.98G | 224 | 83.4 | 43 | | MogaNetBase | 43.7M | 9.96G | 224 | 84.2 | 44 | | MogaNetLarge | 82.5M | 15.96G | 224 | 84.6 | 45 | """ 46 | 47 | MogaNetXtiny.__doc__ = __head_doc__ + """ 48 | Args: 49 | """ + __tail_doc__ 50 | 51 | MogaNetTiny.__doc__ = MogaNetXtiny.__doc__ 52 | MogaNetSmall.__doc__ = MogaNetXtiny.__doc__ 53 | MogaNetBase.__doc__ = MogaNetXtiny.__doc__ 54 | MogaNetLarge.__doc__ = MogaNetXtiny.__doc__ 55 | -------------------------------------------------------------------------------- /keras_cv_attention_models/hiera/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.hiera.hiera import Hiera, HieraTiny, HieraSmall, HieraBase, HieraBasePlus, HieraLarge, HieraHuge 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github facebookresearch/hiera](https://github.com/facebookresearch/hiera). 5 | Paper [PDF 2306.00989 Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/pdf/2306.00989.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ strides: list of int indicates strides for each stack. Default `[1, 2, 2, 2]`. 
9 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 10 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 11 | activation: activation used in whole model, default `gelu`. 12 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 13 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | dropout: dropout rate if top layers is included. 17 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 18 | Set `classifier_activation=None` to return the logits of the "top" layer. 19 | pretrained: one of None or "mae_in1k_ft1k". 20 | Will try to download and load pre-trained model weights if not None. 21 | 22 | Returns: 23 | A `keras.Model` instance. 24 | """ 25 | 26 | Hiera.__doc__ = __head_doc__ + """ 27 | Args: 28 | num_blocks: number of blocks in each stack. 29 | embed_dim: basic hidden dims, expand * 2 for each stack. 30 | num_heads: int or list value for num heads in each stack. 31 | use_window_attentions: boolean or list value, each value in the list can also be a list of boolean. 32 | Indicates if use window attention in each stack. 33 | Element value like `[True, False]` means first one is True, others are False. 34 | mlp_ratio: expand ratio for mlp blocks hidden channel. 35 | model_name: string, model name. 36 | """ + __tail_doc__ + """ 37 | Model architectures: 38 | | Model | Params | FLOPs | Input | Top1 Acc | 39 | | ------------- | ------- | ------- | ----- | -------- | 40 | | HieraTiny | 27.91M | 4.93G | 224 | 82.8 | 41 | | HieraSmall | 35.01M | 6.44G | 224 | 83.8 | 42 | | HieraBase | 51.52M | 9.43G | 224 | 84.5 | 43 | | HieraBasePlus | 69.90M | 12.71G | 224 | 85.2 | 44 | | HieraLarge | 213.74M | 40.43G | 224 | 86.1 | 45 | | HieraHuge | 672.78M | 125.03G | 224 | 86.9 | 46 | """ 47 | 48 | HieraTiny.__doc__ = __head_doc__ + """ 49 | Args: 50 | """ + __tail_doc__ 51 | 52 | HieraSmall.__doc__ = HieraTiny.__doc__ 53 | HieraBase.__doc__ = HieraTiny.__doc__ 54 | HieraBasePlus.__doc__ = HieraTiny.__doc__ 55 | HieraLarge.__doc__ = HieraTiny.__doc__ 56 | HieraHuge.__doc__ = HieraTiny.__doc__ 57 | -------------------------------------------------------------------------------- /keras_cv_attention_models/iformer/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.iformer.iformer import InceptionTransformer, IFormerSmall, IFormerBase, IFormerLarge 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github sail-sg/iFormer](https://github.com/sail-sg/iFormer). 5 | Paper [PDF 2205.12956 Inception Transformer](https://arxiv.org/pdf/2205.12956.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 
15 | or `0` to disable (default). 16 | layer_scales: int or list of int, indicates layer scale init value for each stack. Default `[0, 0, 1e-6, 1e-6]`, 0 for not using. 17 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | InceptionTransformer.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dims: output channels for each stack. 32 | num_heads: int or list value indicates heads number for `conv_attention_mixer` blocks in each stack. 33 | num_attn_low_heads: int or list value indicates attention heads number for `attention_low_frequency_mixer` blocks in each stack. 34 | pool_sizes: int or list value indicates attention blocks key_value downsample rate in each stack. 35 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 36 | model_name: string, model name. 37 | """ + __tail_doc__ + """ 38 | Model architectures: 39 | | Model | Params | FLOPs | Input | Top1 Acc | 40 | | ------------ | ------ | ------ | ----- | -------- | 41 | | IFormerSmall | 19.9M | 4.88G | 224 | 83.4 | 42 | | | 20.9M | 16.29G | 384 | 84.6 | 43 | | IFormerBase | 47.9M | 9.44G | 224 | 84.6 | 44 | | | 48.9M | 30.86G | 384 | 85.7 | 45 | | IFormerLarge | 86.6M | 14.12G | 224 | 84.6 | 46 | | | 87.7M | 45.74G | 384 | 85.8 | 47 | """ 48 | 49 | IFormerSmall.__doc__ = __head_doc__ + """ 50 | Args: 51 | """ + __tail_doc__ 52 | 53 | IFormerBase.__doc__ = IFormerSmall.__doc__ 54 | IFormerLarge.__doc__ = IFormerSmall.__doc__ 55 | -------------------------------------------------------------------------------- /keras_cv_attention_models/efficientnet/efficientnet_edgetpu.py: -------------------------------------------------------------------------------- 1 | """Creates an EfficientNet-EdgeTPU model 2 | Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu 3 | """ 4 | 5 | import math 6 | from keras_cv_attention_models.efficientnet.efficientnet_v2 import EfficientNetV2 7 | from keras_cv_attention_models.attention_layers import make_divisible 8 | from keras_cv_attention_models.models import register_model 9 | 10 | 11 | def get_expanded_width_depth(width, depth, fix_head_stem=False): 12 | out_channels = [ii * width for ii in [24, 32, 48, 96, 144, 192]] 13 | depthes = [int(math.ceil(ii * depth)) for ii in [1, 2, 4, 5, 4, 2]] 14 | first_conv_filter = 32 * width 15 | output_conv_filter = 1280 * width 16 | 17 | out_channels = [out_channels[0], out_channels[0], *out_channels[1:]] 18 | depthes = [1, depthes[0] - 1, *depthes[1:]] 19 | return out_channels, depthes, first_conv_filter, output_conv_filter 20 | 21 | 22 | def EfficientNetEdgeTPU( 23 | expands=[-1, 4, 8, 8, 8, 8, 8], # expands[0] = expands[1] * out_channels[0] / first_conv_filter, as timm using expand on out_channel 24 | out_channels=[24, 24, 32, 48, 96, 144, 192], 25 | depthes=[1, 0, 2, 4, 5, 4, 2], # Add an additional block, as timm using expand on out_channel 26 | strides=[1, 1, 2, 2, 2, 1, 2], 27 | se_ratios=[0, 0, 0, 0, 0, 0, 0, 0], 28 
| first_conv_filter=32, 29 | output_conv_filter=1280, 30 | kernel_sizes=[3, 3, 3, 3, 5, 5, 5], 31 | use_shortcuts=[False, False, True, True, True, True, True], 32 | is_fused=[True, True, True, True, False, False, False], 33 | is_torch_mode=True, 34 | drop_connect_rate=0.2, 35 | pretrained="imagenet", 36 | activation="relu", 37 | model_name="EfficientNetEdgeTPU", 38 | **kwargs, 39 | ): 40 | kwargs.pop("kwargs", None) 41 | expands[0] = make_divisible(out_channels[0], 8) * expands[1] / make_divisible(first_conv_filter, 8) 42 | return EfficientNetV2(**locals(), **kwargs) 43 | 44 | 45 | @register_model 46 | def EfficientNetEdgeTPUSmall(input_shape=(224, 224, 3), num_classes=1000, dropout=0.2, classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | out_channels, depthes, first_conv_filter, output_conv_filter = get_expanded_width_depth(1.0, 1.0) 48 | return EfficientNetEdgeTPU(**locals(), model_name="efficientnet_edgetpu-small", **kwargs) 49 | 50 | 51 | @register_model 52 | def EfficientNetEdgeTPUMedium(input_shape=(240, 240, 3), num_classes=1000, dropout=0.2, classifier_activation="softmax", pretrained="imagenet", **kwargs): 53 | out_channels, depthes, first_conv_filter, output_conv_filter = get_expanded_width_depth(1.0, 1.1) 54 | return EfficientNetEdgeTPU(**locals(), model_name="efficientnet_edgetpu-medium", **kwargs) 55 | 56 | 57 | @register_model 58 | def EfficientNetEdgeTPULarge(input_shape=(300, 300, 3), num_classes=1000, dropout=0.3, classifier_activation="softmax", pretrained="imagenet", **kwargs): 59 | out_channels, depthes, first_conv_filter, output_conv_filter = get_expanded_width_depth(1.2, 1.4) 60 | return EfficientNetEdgeTPU(**locals(), model_name="efficientnet_edgetpu-large", **kwargs) 61 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fasternet/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.fasternet.fasternet import FasterNet, FasterNetT0, FasterNetT1, FasterNetT2, FasterNetS, FasterNetM, FasterNetL 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github JierunChen/FasterNet](https://github.com/JierunChen/FasterNet). 5 | Paper [PDF 2303.03667 Run, Don’t Walk: Chasing Higher FLOPS for Faster Neural Networks ](https://arxiv.org/pdf/2303.03667.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ window_ratios: window split ratio. Each stack will calculate `window_size = (height // window_ratio, width // window_ratio)` . 9 | layer_scale: layer scale init value. Default `-1` means not applying, any value `>=0` will add a scale value for each block output. 10 | [Going deeper with Image Transformers](https://arxiv.org/pdf/2103.17239.pdf). 11 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 12 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 13 | activation: activation used in whole model, default `gelu`. 14 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 15 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | dropout: dropout rate if top layers is included. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 
20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | pretrained: one of None or "imagenet". 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | FasterNet.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dim: basic hidden dims, expand * 2 for each stack. 32 | patch_size: int value for stem kernel size and strides. 33 | mlp_ratio: expand ratio for mlp blocks hidden channel. 34 | partial_conv_ratio: float value for partial channels applying `Conv2D` in each block. 35 | output_conv_filter: int value for filters of `Conv2D` block before output block. 36 | model_name: string, model name. 37 | """ + __tail_doc__ + """ 38 | Model architectures: 39 | | Model | Params | FLOPs | Input | Top1 Acc | 40 | | ----------- | ------ | ------ | ----- | -------- | 41 | | FasterNetT0 | 3.9M | 0.34G | 224 | 71.9 | 42 | | FasterNetT1 | 7.6M | 0.85G | 224 | 76.2 | 43 | | FasterNetT2 | 15.0M | 1.90G | 224 | 78.9 | 44 | | FasterNetS | 31.1M | 4.55G | 224 | 81.3 | 45 | | FasterNetM | 53.5M | 8.72G | 224 | 83.0 | 46 | | FasterNetL | 93.4M | 15.49G | 224 | 83.5 | 47 | """ 48 | 49 | FasterNetT0.__doc__ = __head_doc__ + """ 50 | Args: 51 | """ + __tail_doc__ 52 | 53 | FasterNetT1.__doc__ = FasterNetT0.__doc__ 54 | FasterNetT2.__doc__ = FasterNetT0.__doc__ 55 | FasterNetS.__doc__ = FasterNetT0.__doc__ 56 | FasterNetM.__doc__ = FasterNetT0.__doc__ 57 | FasterNetL.__doc__ = FasterNetT0.__doc__ 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.gpvit.gpvit import GPViT, GPViT_L1, GPViT_L2, GPViT_L3, GPViT_L4, PureWeigths 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github ChenhongyiYang/GPViT](https://github.com/ChenhongyiYang/GPViT). 5 | Paper [PDF 2212.06795 GPVIT: A HIGH RESOLUTION NON-HIERARCHICAL VISION TRANSFORMER WITH GROUP PROPAGATION](https://arxiv.org/pdf/2212.06795.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: layer scale init value, [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 17 | Default 0 for not using. 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | GPViT.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_layers: number of transformer blocks.
31 | embed_dims: output channels for each stack. 32 | stem_depth: number of stem conv blocks. 33 | num_window_heads: number of heads for `window_lepe_attention_mlp_block` blocks. 34 | num_group_heads: number of heads for `group_attention` blocks. 35 | mlp_ratios: int value indicates expand ratio for mlp blocks hidden channel in each stack. 36 | window_size: number of `window_size` for `window_lepe_attention_mlp_block` blocks. 37 | group_attention_layer_ids: list of layer id for using `group_attention`, others will be `window_lepe_attention_mlp_block`. 38 | group_attention_layer_group_tokens: list of `num_group_token` for each block using `group_attention`. 39 | use_neck_attention_output: boolean value whether using `light_group_attention` before output block. 40 | model_name: string, model name. 41 | """ + __tail_doc__ + """ 42 | Model architectures: 43 | | Model | Params | FLOPs | Input | Top1 Acc | 44 | | -------- | ------ | ------ | ----- | -------- | 45 | | GPViT_L1 | 9.59M | 6.15G | 224 | 80.5 | 46 | | GPViT_L2 | 24.2M | 15.74G | 224 | 83.4 | 47 | | GPViT_L3 | 36.7M | 23.54G | 224 | 84.1 | 48 | | GPViT_L4 | 75.5M | 48.29G | 224 | 84.3 | 49 | """ 50 | 51 | GPViT_L1.__doc__ = __head_doc__ + """ 52 | Args: 53 | """ + __tail_doc__ 54 | 55 | GPViT_L2.__doc__ = GPViT_L1.__doc__ 56 | GPViT_L3.__doc__ = GPViT_L1.__doc__ 57 | GPViT_L4.__doc__ = GPViT_L1.__doc__ 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/moganet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras MogaNet___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github Westlake-AI/MogaNet](https://github.com/Westlake-AI/MogaNet). Paper [PDF 2211.03295 Efficient Multi-order Gated Aggregation Network](https://arxiv.org/pdf/2211.03295.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------ | ------ | ------ | ----- | -------- | -------- | 12 | | MogaNetXtiny | 2.96M | 806M | 224 | 76.5 | [moganet_xtiny_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_xtiny_imagenet.h5) | 13 | | MogaNetTiny | 5.20M | 1.11G | 224 | 79.0 | [moganet_tiny_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_tiny_224_imagenet.h5) | 14 | | | 5.20M | 1.45G | 256 | 79.6 | [moganet_tiny_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_tiny_256_imagenet.h5) | 15 | | MogaNetSmall | 25.3M | 4.98G | 224 | 83.4 | [moganet_small_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_small_imagenet.h5) | 16 | | MogaNetBase | 43.7M | 9.96G | 224 | 84.2 | [moganet_base_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_base_imagenet.h5) | 17 | | MogaNetLarge | 82.5M | 15.96G | 224 | 84.6 | [moganet_large_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/moganet/moganet_large_imagenet.h5) | 18 | ## Usage 19 | ```py 20 | from keras_cv_attention_models import moganet 21 | 22 | # Will download and load pretrained imagenet weights. 
23 | mm = moganet.MogaNetXtiny(pretrained="imagenet") 24 | 25 | # Run prediction 26 | import tensorflow as tf 27 | from tensorflow import keras 28 | from skimage.data import chelsea 29 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 30 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 31 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 32 | # [('n02124075', 'Egyptian_cat', 0.6138564), ('n02123045', 'tabby', 0.16214457), ...] 33 | ``` 34 | **Change input resolution**. 35 | ```py 36 | from keras_cv_attention_models import moganet 37 | mm = moganet.MogaNetXtiny(input_shape=(112, 193, 3), pretrained="imagenet") 38 | # >>>> Load pretrained from: ~/.keras/models/moganet_xtiny_imagenet.h5 39 | 40 | # Run prediction 41 | from skimage.data import chelsea 42 | preds = mm(mm.preprocess_input(chelsea())) 43 | print(mm.decode_predictions(preds)) 44 | # [('n02124075', 'Egyptian_cat', 0.5223805), ('n02123045', 'tabby', 0.27944055), ...] 45 | ``` 46 | **Use dynamic input resolution** by setting `input_shape=(None, None, 3)`. 47 | ```py 48 | from keras_cv_attention_models import moganet 49 | model = moganet.MogaNetTiny(input_shape=(None, None, 3), num_classes=0) 50 | 51 | print(model(np.ones([1, 223, 123, 3])).shape) 52 | # (1, 7, 4, 256) 53 | print(model(np.ones([1, 32, 526, 3])).shape) 54 | # (1, 1, 17, 256) 55 | ``` 56 | *** 57 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnest/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras ResNeSt___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [ResNeSt](https://github.com/zhanghang1989/ResNeSt). Paper [PDF 2004.08955 ResNeSt: Split-Attention Networks](https://arxiv.org/pdf/2004.08955.pdf). 6 | - Model weights reloaded from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | -------------- | ------ | ------ | ----- | -------- | -------- | 12 | | resnest50 | 28M | 5.38G | 224 | 81.03 | [resnest50.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest50_imagenet.h5) | 13 | | resnest101 | 49M | 13.33G | 256 | 82.83 | [resnest101.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest101_imagenet.h5) | 14 | | resnest200 | 71M | 35.55G | 320 | 83.84 | [resnest200.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest200_imagenet.h5) | 15 | | resnest269 | 111M | 77.42G | 416 | 84.54 | [resnest269.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/resnest/resnest269_imagenet.h5) | 16 | ## Usage 17 | ```py 18 | from keras_cv_attention_models import resnest 19 | 20 | # Will download and load pretrained imagenet weights.
21 | mm = resnest.ResNest50(pretrained="imagenet") 22 | 23 | # Run prediction 24 | import tensorflow as tf 25 | from tensorflow import keras 26 | from skimage.data import chelsea 27 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 28 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 29 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 30 | # [('n02124075', 'Egyptian_cat', 0.7793046), 31 | # ('n02123159', 'tiger_cat', 0.028313603), 32 | # ('n04209239', 'tabby', 0.02322878), 33 | # ('n02127052', 'lynx', 0.0036637571), 34 | # ('n03085013', 'computer_keyboard', 0.0008628946)] 35 | ``` 36 | **Use dynamic input resolution** 37 | ```py 38 | from keras_cv_attention_models import resnest 39 | mm = resnest.ResNest50(input_shape=(None, None, 3), num_classes=0) 40 | 41 | print(mm(np.ones([1, 224, 224, 3])).shape) 42 | # (1, 7, 7, 2048) 43 | print(mm(np.ones([1, 512, 512, 3])).shape) 44 | # (1, 16, 16, 2048) 45 | 46 | mm.save("../models/resnest50_dynamic_notop.h5") 47 | ``` 48 | ## Verification with PyTorch version 49 | ```py 50 | """ PyTorch resnest50 """ 51 | import torch 52 | sys.path.append("../") 53 | from ResNeSt.resnest.torch import resnest as torch_resnest 54 | 55 | torch_model = torch_resnest.resnest50(pretrained=True) 56 | torch_model.eval() 57 | 58 | """ Keras ResNest50 """ 59 | from keras_cv_attention_models import resnest 60 | mm = resnest.ResNest50(pretrained="imagenet", classifier_activation=None) 61 | 62 | """ Verification """ 63 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 64 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 65 | keras_out = mm(inputs).numpy() 66 | print(f"{np.allclose(torch_out, keras_out, atol=1e-4) = }") 67 | # np.allclose(torch_out, keras_out, atol=1e-4) = True 68 | ``` 69 | *** 70 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnet_family/resnext.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.aotnet import AotNet 2 | from keras_cv_attention_models.models import register_model 3 | from keras_cv_attention_models.download_and_load import reload_model_weights 4 | 5 | 6 | PRETRAINED_DICT = { 7 | "resnext50": {"imagenet": "cf65d988c38ba0335c97a046288b91f4", "swsl": "f1cf0cc3c49bb50e6949c50fcce3db8f"}, 8 | "resnext101": {"imagenet": "1e58c0ecc31184bd6bfe4d6b568f4325", "swsl": "c2fe8eefcf9a55e0254d2b13055a4cbc"}, 9 | "resnext101w": {"imagenet": "9a1b92145aeb922695c29a0f02b52188", "swsl": "58b7cf4a72b03171f50ed19789b20f3d"}, 10 | "resnext101w_64": {"imagenet": "51c81e014224bb731ebf64c3ed271a16"}, 11 | "resnext50d": {"imagenet": "a7b2433b7bee7029fce11ba3fabf3fb9"}, 12 | } 13 | 14 | 15 | def ResNeXt(num_blocks, input_shape=(224, 224, 3), pretrained="imagenet", strides=2, groups=32, **kwargs): 16 | strides = strides if isinstance(strides, (list, tuple)) else [1, 2, 2, strides] 17 | model = AotNet(num_blocks, input_shape=input_shape, strides=strides, groups=groups, **kwargs) 18 | reload_model_weights(model, pretrained_dict=PRETRAINED_DICT, sub_release="resnet_family", pretrained=pretrained) 19 | return model 20 | 21 | 22 | @register_model 23 | def ResNeXt50(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 24 | num_blocks = [3, 4, 6, 3] 25 | hidden_channel_ratio = 0.5 26 | return 
ResNeXt(**locals(), model_name="resnext50", **kwargs) 27 | 28 | 29 | @register_model 30 | def ResNeXt101(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 31 | num_blocks = [3, 4, 23, 3] 32 | hidden_channel_ratio = 0.5 33 | return ResNeXt(**locals(), model_name="resnext101", **kwargs) 34 | 35 | 36 | @register_model 37 | def ResNeXt50D(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 38 | num_blocks = [3, 4, 6, 3] 39 | hidden_channel_ratio = 0.5 40 | stem_type = "deep" 41 | shortcut_type = "avg" 42 | return ResNeXt(**locals(), model_name="resnext50d", **kwargs) 43 | 44 | 45 | @register_model 46 | def ResNeXt101W(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | num_blocks = [3, 4, 23, 3] 48 | hidden_channel_ratio = 1 49 | return ResNeXt(**locals(), model_name="resnext101w", **kwargs) 50 | 51 | 52 | @register_model 53 | def ResNeXt101W_se(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained=None, **kwargs): 54 | # timm using an additional conv + bn before se_module 55 | num_blocks = [3, 4, 23, 3] 56 | hidden_channel_ratio = 1 57 | se_ratio = 0.25 / 4 58 | stem_type = "deep" 59 | return ResNeXt(**locals(), model_name="resnext101w", **kwargs) 60 | 61 | 62 | @register_model 63 | def ResNeXt101W_64(input_shape=(224, 224, 3), num_classes=1000, activation="relu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 64 | num_blocks = [3, 4, 23, 3] 65 | hidden_channel_ratio = 1 66 | groups = 64 67 | return ResNeXt(**locals(), model_name="resnext101w_64", **kwargs) 68 | -------------------------------------------------------------------------------- /keras_cv_attention_models/nat/dinat.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.nat.nat import NAT 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | @register_model 6 | def DiNAT_Mini(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 7 | num_blocks = [3, 4, 6, 5] 8 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 3, [1, 1, 1, 1, 1]] 9 | use_every_other_dilations = True 10 | return NAT(**locals(), model_name="dinat_mini", **kwargs) 11 | 12 | 13 | @register_model 14 | def DiNAT_Tiny(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 15 | num_blocks = [3, 4, 18, 5] 16 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 17 | use_every_other_dilations = True 18 | return NAT(**locals(), model_name="dinat_tiny", **kwargs) 19 | 20 | 21 | @register_model 22 | def DiNAT_Small(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 23 | num_blocks = [3, 4, 18, 5] 24 | num_heads = [3, 6, 12, 24] 25 | out_channels = [96, 192, 384, 768] 26 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 27 | # layer_scale = kwargs.pop("layer_scale", 1e-5) 28 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 29 | use_every_other_dilations = True 30 | return NAT(**locals(), model_name="dinat_small", **kwargs) 31 | 32 | 33 | @register_model 34 | def DiNAT_Base(input_shape=(224, 224, 3), 
num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet", **kwargs): 35 | num_blocks = [3, 4, 18, 5] 36 | num_heads = [4, 8, 16, 32] 37 | out_channels = [128, 256, 512, 1024] 38 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 39 | layer_scale = kwargs.pop("layer_scale", 1e-5) 40 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 41 | use_every_other_dilations = True 42 | return NAT(**locals(), model_name="dinat_base", **kwargs) 43 | 44 | 45 | @register_model 46 | def DiNAT_Large(input_shape=(224, 224, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 47 | num_blocks = [3, 4, 18, 5] 48 | num_heads = [6, 12, 24, 48] 49 | out_channels = [192, 384, 768, 1536] 50 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 51 | # layer_scale = kwargs.pop("layer_scale", 1e-5) 52 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 53 | use_every_other_dilations = True 54 | return NAT(**locals(), model_name="dinat_large", **kwargs) 55 | 56 | 57 | @register_model 58 | def DiNAT_Large_K11(input_shape=(384, 384, 3), num_classes=1000, activation="gelu", classifier_activation="softmax", pretrained="imagenet21k-ft1k", **kwargs): 59 | num_blocks = [3, 4, 18, 5] 60 | num_heads = [6, 12, 24, 48] 61 | out_channels = [192, 384, 768, 1536] 62 | mlp_ratio = kwargs.pop("mlp_ratio", 2) 63 | # layer_scale = kwargs.pop("layer_scale", 1e-5) 64 | # dilation_rates = [[1, 8, 1], [1, 4, 1, 4], [1, 2] * 9, [1, 1, 1, 1, 1]] 65 | use_every_other_dilations = True 66 | attn_kernel_size = 11 67 | return NAT(**locals(), model_name="dinat_large_k11", **kwargs) 68 | -------------------------------------------------------------------------------- /keras_cv_attention_models/mobilevit/mobilevit_v2.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.mobilevit.mobilevit import MobileViT 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def MobileViT_V2( 6 | num_blocks=[1, 2, 3, 5, 4], 7 | out_channels=[64, 128, 256, 384, 512], 8 | attn_channels=0.5, # Can be a list matching out_channels, or a float number for expansion ratio of out_channels 9 | expand_ratio=2, 10 | stem_width=32, 11 | resize_first=True, # False for V1, True for V2 12 | use_depthwise=True, # False for V1, True for V2 13 | use_fusion=False, # True for V1, False for V2 14 | num_norm_groups=1, # -1 or 0 for V1 using layer_norm, or 1 for V2 using group_norm 15 | use_linear_attention=True, # False for V1, True for V2 16 | output_num_features=0, 17 | model_name="mobilevit_v2", 18 | **kwargs, 19 | ): 20 | kwargs.pop("kwargs", None) 21 | return MobileViT(**locals(), **kwargs) 22 | 23 | 24 | def get_mobilevit_v2_width(multiplier=1.0): 25 | return int(32 * multiplier), [int(ii * multiplier) for ii in [64, 128, 256, 384, 512]] # stem_width, out_channels 26 | 27 | 28 | @register_model 29 | def MobileViT_V2_050(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 30 | stem_width, out_channels = get_mobilevit_v2_width(0.5) 31 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_050", **kwargs) 32 | 33 | 34 | @register_model 35 | def MobileViT_V2_075(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 36 | stem_width, out_channels = get_mobilevit_v2_width(0.75) 37 | return MobileViT_V2(**locals(), 
model_name="mobilevit_v2_075", **kwargs) 38 | 39 | 40 | @register_model 41 | def MobileViT_V2_100(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 42 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_100", **kwargs) 43 | 44 | 45 | @register_model 46 | def MobileViT_V2_125(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 47 | stem_width, out_channels = get_mobilevit_v2_width(1.25) 48 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_125", **kwargs) 49 | 50 | 51 | @register_model 52 | def MobileViT_V2_150(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 53 | stem_width, out_channels = get_mobilevit_v2_width(1.5) 54 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_150", **kwargs) 55 | 56 | 57 | @register_model 58 | def MobileViT_V2_175(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 59 | stem_width, out_channels = get_mobilevit_v2_width(1.75) 60 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_175", **kwargs) 61 | 62 | 63 | @register_model 64 | def MobileViT_V2_200(input_shape=(256, 256, 3), num_classes=1000, activation="swish", classifier_activation="softmax", pretrained="imagenet", **kwargs): 65 | stem_width, out_channels = get_mobilevit_v2_width(2.0) 66 | return MobileViT_V2(**locals(), model_name="mobilevit_v2_200", **kwargs) 67 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pvt/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.pvt.pvt import PyramidVisionTransformerV2, PVT_V2B0, PVT_V2B1, PVT_V2B2, PVT_V2B2_linear, PVT_V2B3, PVT_V2B4, PVT_V2B5 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github whai362/PVT](https://github.com/whai362/PVT/tree/v2/classification). 5 | Paper [PDF 2106.13797 PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/pdf/2106.13797.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: layer scale init value, [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 17 | Default 0 for not using. 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet21k-ft1k' (pre-training on ImageNet21k and fine-tuned ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 
26 | """ 27 | 28 | PyramidVisionTransformerV2.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | embed_dims: output channels for each stack. 32 | num_heads: int or list value indicates heads number for transformer blocks in each stack. 33 | mlp_ratios: int or list value indicates expand ratio for mlp blocks hidden channel in each stack. 34 | sr_ratios: int or list value indicates attention blocks key_value downsample rate in each stack. 35 | stem_patch_size: stem patch size. Default `7`. 36 | use_linear: boolean value if using linear complexity attention layer with `AvgPool2D`. True for `PVT_V2B2_linear`. 37 | model_name: string, model name. 38 | """ + __tail_doc__ + """ 39 | Model architectures: 40 | | Model | Params | FLOPs | Input | Top1 Acc | 41 | | --------------- | ------ | ------ | ----- | -------- | 42 | | PVT_V2B0 | 3.7M | 580.3M | 224 | 70.5 | 43 | | PVT_V2B1 | 14.0M | 2.14G | 224 | 78.7 | 44 | | PVT_V2B2 | 25.4M | 4.07G | 224 | 82.0 | 45 | | PVT_V2B2_linear | 22.6M | 3.94G | 224 | 82.1 | 46 | | PVT_V2B3 | 45.2M | 6.96G | 224 | 83.1 | 47 | | PVT_V2B4 | 62.6M | 10.19G | 224 | 83.6 | 48 | | PVT_V2B5 | 82.0M | 11.81G | 224 | 83.8 | 49 | """ 50 | 51 | PVT_V2B0.__doc__ = __head_doc__ + """ 52 | Args: 53 | """ + __tail_doc__ 54 | 55 | PVT_V2B1.__doc__ = PVT_V2B0.__doc__ 56 | PVT_V2B2.__doc__ = PVT_V2B0.__doc__ 57 | PVT_V2B2_linear.__doc__ = PVT_V2B0.__doc__ 58 | PVT_V2B3.__doc__ = PVT_V2B0.__doc__ 59 | PVT_V2B4.__doc__ = PVT_V2B0.__doc__ 60 | PVT_V2B5.__doc__ = PVT_V2B0.__doc__ 61 | -------------------------------------------------------------------------------- /keras_cv_attention_models/repvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.repvit.repvit import RepViT, RepViT_M09, RepViT_M10, RepViT_M11, RepViT_M15, RepViT_M23, switch_to_deploy 2 | 3 | 4 | __head_doc__ = """ 5 | Keras implementation of [Github THU-MIG/RepViT](https://github.com/THU-MIG/RepViT). 6 | Paper [PDF 2307.09283 RepViT: Revisiting Mobile CNN From ViT Perspective](https://arxiv.org/pdf/2307.09283.pdf). 7 | """ 8 | 9 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 10 | deploy: boolean value if build a fused model. **Evaluation only, not good for training**. 11 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 12 | activation: activation used in whole model, default `hard_swish`. 13 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 14 | Can be a constant value like `0.2`, 15 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | dropout: top dropout rate if top layers is included. Default 0. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | Default is `None`. 22 | use_distillation: Boolean value if output `distill_head`. Default `False`. 23 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 24 | Will try to download and load pre-trained model weights if not None. 25 | **kwargs: other parameters if available. 26 | 27 | Returns: 28 | A `keras.Model` instance. 
29 | """ 30 | 31 | RepViT.__doc__ = __head_doc__ + """ 32 | Args: 33 | num_blocks: number of block for each stack. 34 | out_channels: output channels for each stack. 35 | stem_width: channel dimension output for stem block, default -1 for using out_channels[0]. 36 | se_ratio: float value for se_ratio for each stack, will use `se_module` every other block in each stack if > 0. 37 | model_name: string, model name. 38 | """ + __tail_doc__ + """ 39 | Model architectures: 40 | | Model | Params | FLOPs | Input | Top1 Acc | 41 | | ------------------------ | ------ | ----- | ----- | -------- | 42 | | RepViT_M09, distillation | 5.10M | 0.82G | 224 | 79.1 | 43 | | - deploy=True | 5.07M | 0.82G | 224 | 79.1 | 44 | | RepViT_M10, distillation | 6.85M | 1.12G | 224 | 80.3 | 45 | | - deploy=True | 6.81M | 1.12G | 224 | 80.3 | 46 | | RepViT_M11, distillation | 8.29M | 1.35G | 224 | 81.2 | 47 | | - deploy=True | 8.24M | 1.35G | 224 | 81.2 | 48 | | RepViT_M15, distillation | 14.13M | 2.30G | 224 | 82.5 | 49 | | - deploy=True | 14.05M | 2.30G | 224 | 82.5 | 50 | | RepViT_M23, distillation | 23.01M | 4.55G | 224 | 83.7 | 51 | | - deploy=True | 22.93M | 4.55G | 224 | 83.7 | 52 | """ 53 | 54 | RepViT_M09.__doc__ = __head_doc__ + """ 55 | Args: 56 | """ + __tail_doc__ 57 | 58 | RepViT_M10.__doc__ = RepViT_M09.__doc__ 59 | RepViT_M11.__doc__ = RepViT_M09.__doc__ 60 | RepViT_M15.__doc__ = RepViT_M09.__doc__ 61 | RepViT_M23.__doc__ = RepViT_M09.__doc__ 62 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gcvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.gcvit.gcvit import GCViT, GCViT_XXTiny, GCViT_XTiny, GCViT_Tiny, GCViT_Tiny2, GCViT_Small, GCViT_Small2, GCViT_Base, GCViT_Large 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github NVlabs/GCVit](https://github.com/NVlabs/GCVit). 5 | Paper [PDF 2206.09959 Global Context Vision Transformers](https://arxiv.org/pdf/2206.09959.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ window_ratios: window split ratio. Each stack will calculate `window_size = (height // window_ratio, width // window_ratio)` . 9 | layer_scale: layer scale init value. Default `-1` means not applying, any value `>=0` will add a scale value for each block output. 10 | [Going deeper with Image Transformers](https://arxiv.org/pdf/2103.17239.pdf). 11 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 12 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 13 | activation: activation used in whole model, default `gelu`. 14 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 15 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | dropout: dropout rate if top layers is included. 19 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 20 | Set `classifier_activation=None` to return the logits of the "top" layer. 21 | pretrained: None or one of ["imagenet", "imagenet21k-ft1k"]. 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 
26 | """ 27 | 28 | GCViT.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | num_heads: num heads for each stack. 32 | embed_dim: basic hidden dims, expand * 2 for each stack. 33 | mlp_ratio: expand ratio for mlp blocks hidden channel. 34 | model_name: string, model name. 35 | """ + __tail_doc__ + """ 36 | Model architectures: 37 | | Model | Params | FLOPs | Input | Top1 Acc | 38 | | --------------- | ------ | ------ | ----- | -------- | 39 | | GCViT_XXTiny | 12.0M | 2.15G | 224 | 79.9 | 40 | | GCViT_XTiny | 20.0M | 2.96G | 224 | 82.0 | 41 | | GCViT_Tiny | 28.2M | 4.83G | 224 | 83.5 | 42 | | GCViT_Tiny2 | 34.5M | 6.28G | 224 | 83.7 | 43 | | GCViT_Small | 51.1M | 8.63G | 224 | 84.3 | 44 | | GCViT_Small2 | 68.6M | 11.7G | 224 | 84.8 | 45 | | GCViT_Base | 90.3M | 14.9G | 224 | 85.0 | 46 | | GCViT_Large | 202.1M | 32.8G | 224 | 85.7 | 47 | | - 21k_ft1k | 202.1M | 32.8G | 224 | 86.6 | 48 | | - 21k_ft1k, 384 | 202.9M | 105.1G | 384 | 87.4 | 49 | | - 21k_ft1k, 512 | 203.8M | 205.1G | 512 | 87.6 | 50 | """ 51 | 52 | GCViT_XXTiny.__doc__ = __head_doc__ + """ 53 | Args: 54 | """ + __tail_doc__ 55 | 56 | GCViT_XTiny.__doc__ = GCViT_XXTiny.__doc__ 57 | GCViT_Tiny.__doc__ = GCViT_XXTiny.__doc__ 58 | GCViT_Tiny2.__doc__ = GCViT_XXTiny.__doc__ 59 | GCViT_Small.__doc__ = GCViT_XXTiny.__doc__ 60 | GCViT_Small2.__doc__ = GCViT_XXTiny.__doc__ 61 | GCViT_Base.__doc__ = GCViT_XXTiny.__doc__ 62 | GCViT_Large.__doc__ = GCViT_XXTiny.__doc__ 63 | -------------------------------------------------------------------------------- /keras_cv_attention_models/halonet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras HaloNet___ 2 | *** 3 | 4 | ## Summary 5 | - [Github lucidrains/halonet-pytorch](https://github.com/lucidrains/halonet-pytorch). 6 | - HaloAttention article: [PDF 2103.12731 Scaling Local Self-Attention for Parameter Efficient Visual Backbones](https://arxiv.org/pdf/2103.12731.pdf). 7 | - No pretrained available for `H` models. Architecture is guessed from article, so it's NOT certain. 8 | - `T` model weights are reloaded from timm [Github rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models). 
9 | 10 | ![](https://user-images.githubusercontent.com/5744524/151656806-005a80ba-3c35-4707-af29-2a781492a1d9.png) 11 | ## Models 12 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 13 | | -------------- | ------ | ------- | ----- | -------- | -------- | 14 | | HaloNextECA26T | 10.7M | 2.43G | 256 | 79.50 | [halonext_eca26t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonext_eca26t_256_imagenet.h5) | 15 | | HaloNet26T | 12.5M | 3.18G | 256 | 79.13 | [halonet26t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonet26t_256_imagenet.h5) | 16 | | HaloNetSE33T | 13.7M | 3.55G | 256 | 80.99 | [halonet_se33t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonet_se33t_256_imagenet.h5) | 17 | | HaloRegNetZB | 11.68M | 1.97G | 224 | 81.042 | [haloregnetz_b_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/haloregnetz_b_224_imagenet.h5) | 18 | | HaloNet50T | 22.7M | 5.29G | 256 | 81.70 | [halonet50t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halonet50t_256_imagenet.h5) | 19 | | HaloBotNet50T | 22.6M | 5.02G | 256 | 82.0 | [halobotnet50t_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/halonet/halobotnet50t_256_imagenet.h5) | 20 | 21 | Comparing `HaloNetH7` accuracy by replacing Conv layers with Attention in each stage: 22 | 23 | | Conv Stages | Attention Stages | Top-1 Acc (%) | Norm. Train Time | 24 | |:-----------:|:----------------:|:-------------:|:----------------:| 25 | | - | 1, 2, 3, 4 | 84.9 | 1.9 | 26 | | 1 | 2, 3, 4 | 84.6 | 1.4 | 27 | | 1, 2 | 3, 4 | 84.7 | 1.0 | 28 | | 1, 2, 3 | 4 | 83.8 | 0.5 | 29 | ## Usage 30 | ```py 31 | from keras_cv_attention_models import halonet 32 | 33 | # Will download and load pretrained imagenet weights. 
34 | mm = halonet.HaloNet26T(pretrained="imagenet") 35 | 36 | # Run prediction 37 | import tensorflow as tf 38 | from tensorflow import keras 39 | from skimage.data import chelsea 40 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 41 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 42 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 43 | # [('n02124075', 'Egyptian_cat', 0.8999013), 44 | # ('n02123159', 'tiger_cat', 0.012704549), 45 | # ('n02123045', 'tabby', 0.009713952), 46 | # ('n07760859', 'custard_apple', 0.00056676986), 47 | # ('n02487347', 'macaque', 0.00050636294)] 48 | ``` 49 | *** 50 | -------------------------------------------------------------------------------- /keras_cv_attention_models/convnext/convnext_v2.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.convnext.convnext import ConvNeXt 2 | from keras_cv_attention_models.models import register_model 3 | 4 | 5 | def ConvNeXtV2( 6 | num_blocks=[3, 3, 9, 3], 7 | out_channels=[96, 192, 384, 768], 8 | stem_width=-1, 9 | layer_scale_init_value=0, # 1e-6 for v1, 0 for v2 10 | use_grn=True, # False for v1, True for v2 11 | head_init_scale=1.0, 12 | layer_norm_epsilon=1e-6, # 1e-5 for ConvNeXtXXlarge, 1e-6 for others 13 | output_num_filters=-1, # If apply additional dense + activation before output dense, <0 for not using 14 | input_shape=(224, 224, 3), 15 | num_classes=1000, 16 | activation="gelu", 17 | drop_connect_rate=0.1, 18 | classifier_activation="softmax", 19 | dropout=0, 20 | pretrained=None, 21 | model_name="convnext_v2", 22 | kwargs=None, 23 | ): 24 | return ConvNeXt(**locals()) 25 | 26 | 27 | @register_model 28 | def ConvNeXtV2Atto(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 29 | num_blocks = [2, 2, 6, 2] 30 | out_channels = [40, 80, 160, 320] 31 | return ConvNeXtV2(**locals(), model_name="convnext_v2_atto", **kwargs) 32 | 33 | 34 | @register_model 35 | def ConvNeXtV2Femto(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 36 | num_blocks = [2, 2, 6, 2] 37 | out_channels = [48, 96, 192, 384] 38 | return ConvNeXtV2(**locals(), model_name="convnext_v2_femto", **kwargs) 39 | 40 | 41 | @register_model 42 | def ConvNeXtV2Pico(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 43 | num_blocks = [2, 2, 6, 2] 44 | out_channels = [64, 128, 256, 512] 45 | return ConvNeXtV2(**locals(), model_name="convnext_v2_pico", **kwargs) 46 | 47 | 48 | @register_model 49 | def ConvNeXtV2Nano(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 50 | num_blocks = [2, 2, 8, 2] 51 | out_channels = [80, 160, 320, 640] 52 | return ConvNeXtV2(**locals(), model_name="convnext_v2_nano", **kwargs) 53 | 54 | 55 | @register_model 56 | def ConvNeXtV2Tiny(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 57 | num_blocks = [3, 3, 9, 3] 58 | out_channels = [96, 192, 384, 768] 59 | return ConvNeXtV2(**locals(), model_name="convnext_v2_tiny", **kwargs) 60 | 61 | 62 | @register_model 63 | def ConvNeXtV2Base(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 64 | num_blocks = [3, 3, 27, 3] 65 | out_channels = [128, 256, 
512, 1024] 66 | return ConvNeXtV2(**locals(), model_name="convnext_v2_base", **kwargs) 67 | 68 | 69 | @register_model 70 | def ConvNeXtV2Large(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 71 | num_blocks = [3, 3, 27, 3] 72 | out_channels = [192, 384, 768, 1536] 73 | return ConvNeXtV2(**locals(), model_name="convnext_v2_large", **kwargs) 74 | 75 | 76 | @register_model 77 | def ConvNeXtV2Huge(input_shape=(224, 224, 3), num_classes=1000, classifier_activation="softmax", pretrained="imagenet", **kwargs): 78 | num_blocks = [3, 3, 27, 3] 79 | out_channels = [352, 704, 1408, 2816] 80 | return ConvNeXtV2(**locals(), model_name="convnext_v2_huge", **kwargs) 81 | -------------------------------------------------------------------------------- /keras_cv_attention_models/gpt2/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras GPT2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github openai/gpt-2](https://github.com/openai/gpt-2). Paper [Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf). 6 | - Model ported from [huggingface/gpt2](https://huggingface.co/gpt2). 7 | - References [Github karpathy/nanoGPT](https://github.com/karpathy/nanoGPT) and [Github jaymody/picoGPT](https://github.com/jaymody/picoGPT). 8 | ## Models 9 | - For `GPT2_XLarge`, needs to download 2 file parts `gpt2_xlarge_webtext.1.h5` and `gpt2_xlarge_webtext.2.h5`. 10 | 11 | | Model | Params | FLOPs | vocab_size | LAMBADA PPL | 12 | | ---------------- | ------- | ------- | ---------- | ----------- | 13 | | [GPT2_Base](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_base_webtext.h5) | 163.04M | 146.42G | 50257 | 35.13 | 14 | | [GPT2_Medium](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_medium_webtext.h5) | 406.29M | 415.07G | 50257 | 15.60 | 15 | | [GPT2_Large](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_large_webtext.h5) | 838.36M | 890.28G | 50257 | 10.87 | 16 | | [GPT2_XLarge](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_xlarge_webtext.1.h5), [+.2](https://github.com/leondgarse/keras_cv_attention_models/releases/download/gpt2/gpt2_xlarge_webtext.2.h5) | 1.638B | 1758.3G | 50257 | 8.63 | 17 | ## Usage 18 | ```py 19 | from keras_cv_attention_models import gpt2 20 | 21 | mm = gpt2.GPT2_Base() 22 | _ = mm.run_prediction("hello world", num_samples=1, max_new_tokens=100) 23 | # hello world. I mean, just because we call ourselves anorexic, with a very strong genetic, doesn't mean we are human. 24 | # 25 | # And so there we have it. And we've just got to get through going through the rest of our lives. 26 | # 27 | # 28 | # I mean, it's a real challenge right now. And we know, we've already talked about the ethical issues. And so, I think, you know, the human body is a very dangerous thing, and the ethical issues 29 | # --------------- 30 | ``` 31 | **Set `include_top=False`** to exclude model head layer. 
32 | ```py 33 | from keras_cv_attention_models import gpt2 34 | 35 | mm = gpt2.GPT2_Base(include_top=False) 36 | # >>>> Load pretrained from: ~/.keras/models/gpt2_base_webtext.h5 37 | print(f"{mm.output_shape = }") 38 | # mm.output_shape = (None, 1024, 768) 39 | ``` 40 | **Set `pretrained="huggingface"`** for converting and loading weights from huggingface `transformers` pacakge. 41 | ```py 42 | from keras_cv_attention_models import gpt2 43 | 44 | mm = gpt2.GPT2_Medium(pretrained="huggingface") 45 | # Load and convert weights from huggingface 46 | # >>>> Save to: ~/.keras/models/gpt2_medium_huggingface.h5 47 | _ = mm.run_prediction("hello world", num_samples=1, max_new_tokens=100) 48 | # hello world, and he'll meet you in the afternoon and ask you to think about your career, and then I'll return. I'll write something up, and after that I'll have you come over."<|endoftext|>BALTIMORE -- The Baltimore Sun has been the one to expose the violence and destruction of the Baltimore riots that led to the death of Freddie Gray, and it's not your typical public servant. 49 | # 50 | # The Sun, which is owned by the Baltimore-based News Corp, went public with 51 | # --------------- 52 | ``` 53 | *** 54 | -------------------------------------------------------------------------------- /keras_cv_attention_models/keras_core_functional.py: -------------------------------------------------------------------------------- 1 | import keras_core as keras 2 | from keras_core.ops import * 3 | from keras_core.ops import concatenate as concat 4 | from keras_core.ops import mean as reduce_mean 5 | from keras_core.ops import max as reduce_max 6 | from keras_core.ops import min as reduce_min 7 | from keras_core.ops import power as pow 8 | from keras_core.ops import clip as clip_by_value 9 | 10 | 11 | def extract_patches(images, sizes=1, strides=1, rates=1, padding="valid", name=None): 12 | return keras.ops.image.extract_patches( 13 | images, 14 | size=sizes[1:-1] if isinstance(sizes, int) or len(sizes) > 2 else sizes, 15 | strides=strides[1:-1] if isinstance(strides, int) or len(strides) > 2 else strides, 16 | dilation_rate=rates[1:-1] if isinstance(rates, int) or len(rates) > 2 else rates, 17 | padding=padding.lower(), 18 | data_format=keras.backend.image_data_format(), 19 | ) 20 | 21 | 22 | def gather(inputs, indices, axis=None, batch_dims=0, name=None): 23 | """Defaults axis=None means the first non-batch dimension""" 24 | axis = batch_dims if axis is None else (len(inputs.shape) + axis if axis < 0 else axis) 25 | return keras.ops.take(inputs, indices, axis=axis) 26 | 27 | 28 | def l2_normalize(inputs, axis=None, epsilon=1e-12, name=None): 29 | return inputs / keras.ops.sqrt(keras.ops.maximum(keras.ops.sum(inputs**2, axis=axis, keepdims=True), epsilon)) 30 | 31 | 32 | def norm(inputs, ord="euclidean", axis=1, keepdims=False, name=None): 33 | return keras.ops.sqrt(keras.ops.sum(inputs**2, axis=axis, keepdims=True)) 34 | 35 | 36 | def resize(images, size, method="bilinear", preserve_aspect_ratio=False, antialias=False, name=None): 37 | return keras.ops.image.resize(images, size, interpolation=method, antialias=antialias, data_format=keras.backend.image_data_format()) 38 | 39 | 40 | def reduce_sum(inputs, axis=None, keepdims=False, name=None): 41 | axis = () if axis is None else axis 42 | if isinstance(inputs, (list, tuple)) and axis == 0: 43 | rr = inputs[0] 44 | for ii in inputs[1:]: 45 | rr += ii 46 | return rr 47 | else: 48 | # return wrapper(lambda xx: xx.sum(dim=axis, keepdim=keepdims), inputs, name=name) 49 | 
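        # Regular tensor input (not a list summed over axis 0): defer to keras_core's own sum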
return keras.ops.sum(inputs, axis=axis, keepdims=keepdims) 50 | 51 | 52 | def rsqrt(inputs, name=None): 53 | return keras.ops.true_divide(1, keras.ops.sqrt(inputs)) 54 | 55 | 56 | def split(inputs, num_or_size_splits, axis=0, num=None, name="split"): 57 | from builtins import sum 58 | 59 | if isinstance(num_or_size_splits, int): 60 | return keras.ops.split(inputs, num_or_size_splits, axis=axis) 61 | 62 | axis = (len(inputs.shape) + axis) if axis < 0 else axis 63 | split_axis_shape = inputs.shape[axis] 64 | assert split_axis_shape is not None 65 | 66 | size_splits = num_or_size_splits 67 | size_splits = [0 if ii is None or ii == -1 else ii for ii in size_splits] 68 | num_unknown_dim = sum([ii == 0 for ii in size_splits]) 69 | assert num_unknown_dim < 2, "At most one unknown dimension in num_or_size_splits: {}".format(num_or_size_splits) 70 | 71 | if num_unknown_dim == 1: 72 | size_splits = [(split_axis_shape - sum(size_splits)) if ii == 0 else ii for ii in size_splits] 73 | 74 | cum_split = [sum(size_splits[: id + 1]) for id, _ in enumerate(size_splits)] 75 | # len(keras.ops.split(np.ones([2, 6]), [2, 2, 2], axis=-1)) == 4 76 | # len(keras.ops.split(keras.layers.Input([6]), [2, 2, 2], axis=-1)) == 3 77 | return keras.ops.split(inputs, cum_split, axis=axis)[: len(size_splits)] 78 | -------------------------------------------------------------------------------- /keras_cv_attention_models/stable_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend as __backend__ 2 | from keras_cv_attention_models.stable_diffusion.stable_diffusion import StableDiffusion 3 | from keras_cv_attention_models.stable_diffusion.unet import UNet, UNetTest 4 | from keras_cv_attention_models.stable_diffusion.encoder_decoder import Encoder, Decoder 5 | from keras_cv_attention_models.stable_diffusion.eval_func import RunPrediction 6 | 7 | if __backend__.is_tensorflow_backend: 8 | from keras_cv_attention_models.stable_diffusion.data import build_tf_dataset as build_dataset 9 | else: 10 | from keras_cv_attention_models.stable_diffusion.data import build_torch_dataset as build_dataset 11 | 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion). 15 | Paper [PDF 2112.10752 High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/pdf/2112.10752.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ image_shape: model image input shape and generated image shape. 19 | Should have exactly 3 inputs channels like `(224, 224, 3)`. 20 | Inner latents inpuit shape for UNet and Decode is `[image_shape[0] // 8, image_shape[1] // 8, 4]`. 21 | clip_model: str value like `beit.ViTTextLargePatch14` for models from this package under `keras_cv_attention_models`. 22 | Also can be a built model, or None for not using. 23 | unet_model: str value like `stable_diffusion.UNet` for models from this package under `keras_cv_attention_models`. 24 | Also can be a built model, or None for not using. 25 | decoder_model: str value like `stable_diffusion.Decoder` for models from this package under `keras_cv_attention_models`. 26 | Also can be a built model, or None for not using. 27 | encoder_model: str value like `stable_diffusion.Encoder` for models from this package under `keras_cv_attention_models`. 28 | Also can be a built model, or None for not using. 29 | clip_model_kwargs: dict value for kwargs used for building `clip_model`. 
30 | unet_model_kwargs: dict value for kwargs used for building `unet_model`. 31 | decoder_model_kwargs: dict value for kwargs used for building `decoder_model`. 32 | encoder_model_kwargs: dict value for kwargs used for building `encoder_model`. 33 | caption_tokenizer: str value in ['GPT2Tokenizer', 'SimpleTokenizer', 'SentencePieceTokenizer'], 34 | or tiktoken one ['gpt2', 'r50k_base', 'p50k_base', 'cl100k_base'], 35 | or specified built tokenizer. 36 | num_steps: int value for the number of DDIM sampling steps, also means total denoising steps. 37 | num_training_steps: int value for total denoising steps during training. 38 | ddim_discretize: one of ["uniform", "quad"] for time_steps sampling `num_steps` method from `num_training_steps`. 39 | linear_start: float value for `beta` start value. 40 | linear_end: float value for `beta` end value. 41 | ddim_eta: float value for calculating `ddim_sigma`. 0 makes the sampling process deterministic. 42 | 43 | Returns: 44 | A `StableDiffusion` instance. 45 | """ 46 | 47 | StableDiffusion.__doc__ = __head_doc__ + """ 48 | Args: 49 | """ + __tail_doc__ + """ 50 | Model architectures: 51 | | Model | Params | FLOPs | Input | 52 | | ------------------- | ------ | ------- | ------------------- | 53 | | ViTTextLargePatch14 | 123.1M | 6.67G | [None, 77] | 54 | | Encoder | 34.16M | 559.6G | [None, 512, 512, 3] | 55 | | UNet | 859.5M | 404.4G | [None, 64, 64, 4] | 56 | | Decoder | 49.49M | 1259.5G | [None, 64, 64, 4] | 57 | """ 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/davit/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras DaViT___ 2 | *** 3 | 4 | ## Summary 5 | - DaViT article: [PDF 2204.03645 DaViT: Dual Attention Vision Transformers](https://arxiv.org/pdf/2204.03645.pdf). 6 | - Model weights reloaded from [Github dingmyu/davit](https://github.com/dingmyu/davit). 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------- | ------ | ------ | ----- | -------- | -------- | 12 | | DaViT_T | 28.36M | 4.56G | 224 | 82.8 | [davit_t_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/davit/davit_t_imagenet.h5) | 13 | | DaViT_S | 49.75M | 8.83G | 224 | 84.2 | [davit_s_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/davit/davit_s_imagenet.h5) | 14 | | DaViT_B | 87.95M | 15.55G | 224 | 84.6 | [davit_b_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/davit/davit_b_imagenet.h5) | 15 | | DaViT_L, 21k | 196.8M | 103.2G | 384 | 87.5 | | 16 | | DaViT_H, 1.5B | 348.9M | 327.3G | 512 | 90.2 | | 17 | | DaViT_G, 1.5B | 1.406B | 1.022T | 512 | 90.4 | | 18 | 19 | **Self tested accuracy**. There may be some detail differences in model output layer or evaluating process. 20 | ```sh 21 | CUDA_VISIBLE_DEVICES='0' ./eval_script.py -m davit.DaViT_T 22 | # >>>> Accuracy top1: 0.82276 top5: 0.96152 23 | ``` 24 | | Model | Self tested Top1 Acc | 25 | | ------- | -------------------- | 26 | | DaViT_T | 82.276 | 27 | | DaViT_S | 83.810 | 28 | | DaViT_B | 84.142 | 29 | ## Usage 30 | ```py 31 | from keras_cv_attention_models import davit 32 | 33 | # Will download and load pretrained imagenet weights. 
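# Note: the manual keras.applications preprocessing below can also be replaced by the built-in
# mm.preprocess_input / mm.decode_predictions helpers, as used in the later examples of this README.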
34 | mm = davit.DaViT_T(pretrained="imagenet") 35 | 36 | # Run prediction 37 | import tensorflow as tf 38 | from tensorflow import keras 39 | from skimage.data import chelsea 40 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 41 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 42 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 43 | # [('n02124075', 'Egyptian_cat', 0.39985177), ('n02123159', 'tiger_cat', 0.036589254), ...] 44 | ``` 45 | **Change input resolution**. Note: if `input_shape` is not divisible by `window_ratio` (default `32`), inputs will be padded for `window_attention`. 46 | ```py 47 | from keras_cv_attention_models import davit 48 | mm = davit.DaViT_T(input_shape=(376, 227, 3), pretrained="imagenet") 49 | # >>>> Load pretrained from: ~/.keras/models/davit_t_imagenet.h5 50 | 51 | # Run prediction 52 | from skimage.data import chelsea 53 | preds = mm(mm.preprocess_input(chelsea())) 54 | print(mm.decode_predictions(preds)) 55 | # [('n02124075', 'Egyptian_cat', 0.17319576), ('n02123159', 'tiger_cat', 0.017631555), ...] 56 | ``` 57 | Reloading weights with a new `input_shape` not divisible by the default `window_ratio` works in some cases, e.g. when `input_shape` and `window_ratio` are both halved: 58 | ```py 59 | from keras_cv_attention_models import davit 60 | mm = davit.DaViT_T(input_shape=(112, 112, 3), window_ratio=16, pretrained="imagenet") 61 | # >>>> Load pretrained from: ~/.keras/models/davit_t_imagenet.h5 62 | 63 | # Run prediction 64 | from skimage.data import chelsea 65 | preds = mm(mm.preprocess_input(chelsea())) 66 | print(mm.decode_predictions(preds)) 67 | # [('n02124075', 'Egyptian_cat', 0.7279274), ('n02123045', 'tabby', 0.021591123), ...] 68 | ``` 69 | *** 70 | -------------------------------------------------------------------------------- /keras_cv_attention_models/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.segment_anything.sam import SAM, MobileSAM, TinySAM, EfficientViT_SAM_L0 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github facebookresearch/segment-anything](https://github.com/facebookresearch/segment-anything). 5 | Paper [PDF 2304.02643 Segment Anything](https://arxiv.org/abs/2304.02643). 6 | """ 7 | 8 | __call_doc__ = """ 9 | Call args: 10 | image: raw input image. np.array value in shape `[height, width, 3]`, value range in `[0, 255]`. 11 | points: combining with `labels`, specifies point coordinates as background or foreground. 12 | np.array value in shape `[None, 2]`, `2` means `[left, top]`. 13 | left / top value range in `[0, 1]` or `[0, width]` / `[0, height]`. 14 | labels: combining with `points`, marks each point as background or foreground. 15 | np.array value in shape `[None]`, value in `[0, 1]`, where 0 means the corresponding point is background, and 1 foreground. 16 | boxes: specific box area for performing segmentation. 17 | np.array value in shape `[1, 4]`, `4` means `[left, top, right, bottom]`. 18 | left and right / top and bottom value range in `[0, 1]` or `[0, width]` / `[0, height]`. 19 | Supports only a single box as input. 20 | masks: NOT tested. 21 | mask_threshold: float value for regarding model output where `masks > mask_threshold` as True. 22 | return_logits: boolean value if returning boolean mask or logits mask. Default False for boolean mask.
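    Example call, a minimal sketch built only from the Call args above and other kecam usage patterns; treating the
    documented outputs as a 3-value return and passing the prompts as keyword arguments are assumptions here:
    >>> import numpy as np
    >>> from keras_cv_attention_models import segment_anything, test_images
    >>> mm = segment_anything.MobileSAM()
    >>> points, labels = np.array([[0.5, 0.5]]), np.array([1])  # one foreground point at the image center
    >>> masks, iou_predictions, low_res_masks = mm(test_images.cat(), points=points, labels=labels)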
23 | 24 | Call returns: 25 | masks: is all masks output, and it's `4` masks by default, specified by `MaskDecoder` parameter `num_mask_tokens`. 26 | Default shape is `[4, image_height, image_width]`. 27 | `masks[0]` is the output of token 0, which is said better for using if segmenting **single object with multi prompts**. 28 | `masks[1:]` are intended for ambiguous input prompts, and `iou_predictions[1:]`** are the corresponding confidences, 29 | which can be used for picking the highest score one from `masks[1:]`. 30 | iou_predictions: is the corresponding masks confidences. Default shape is `[4]`. 31 | low_res_masks: is the raw output from `MaskDecoder`. Default shape is `[4, 256, 256]`. 32 | """ 33 | 34 | __tail_doc__ = """ image_shape: int or list of 2 int like [1024, 1024]. 35 | embed_dims: inner channels for prompt encoder. 36 | mask_hidden_dims: `MaskEncoder` hidden channels. 37 | pretrained: one of `None` (random initialization) or 'sam' (pre-training on SA-1B from Segment Anything paper). 38 | Will try to download and load pre-trained model weights if not None. 39 | 40 | Returns: 41 | A `keras.Model` instance. 42 | """ + __call_doc__ 43 | 44 | SAM.__doc__ = __head_doc__ + """ 45 | Init args: 46 | image_encoder: string or built image encoder model. Currently string can be one of ["TinyViT_5M", "EfficientViT_L0"]. 47 | mask_decoder: string or built mask decoder model. Currently string can be one of ["sam_mask_decoder", "tiny_sam_mask_decoder"]. 48 | name: string, model name. 49 | """ + __tail_doc__ + """ 50 | Model architectures: 51 | | Model | Params | FLOPs | Input | COCO val mask AP | 52 | | ------------------- | ------ | ----- | ----- | ---------------- | 53 | | MobileSAM | 5.74M | 39.4G | 1024 | 41.0 | 54 | | TinySAM | 5.74M | 39.4G | 1024 | 41.9 | 55 | | EfficientViT_SAM_L0 | 30.73M | 35.4G | 512 | 45.7 | 56 | """ 57 | 58 | SAM.__call__.__doc__ = __call_doc__ 59 | 60 | MobileSAM.__doc__ = __head_doc__ + """ 61 | Args: 62 | """ + __tail_doc__ 63 | 64 | EfficientViT_SAM_L0.__doc__ = MobileSAM.__doc__ 65 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fastervit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.fastervit.fastervit import ( 2 | FasterViT, 3 | FasterViT0, 4 | FasterViT1, 5 | FasterViT2, 6 | FasterViT3, 7 | FasterViT4, 8 | FasterViT5, 9 | FasterViT6, 10 | switch_to_deploy, 11 | ) 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github NVlabs/FasterViT](https://github.com/NVlabs/FasterViT). 15 | Paper [PDF 2306.06189 FasterViT: Fast Vision Transformers with Hierarchical Attention](https://arxiv.org/pdf/2306.06189.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ window_ratios: window split ratio. It's mainly for the 3rd stack, that `window_size = (height // window_ratio, width // window_ratio)`. 19 | `1` means not using window partition, while `window_size == (height, width)`. 20 | carrier_token_size: int value indicates carrier token size for the 3rd stack. 21 | pos_scale: If pretrained weights are from different input_shape or window_size, pos_scale is previous actually using window_size. 22 | use_propagation: boolean value if using `do_propagation` block at the end of the 3rd stack. 23 | layer_scale: layer scale init value. Default `-1` means not applying, any value `>=0` will add a scale value for each block output. 24 | [Going deeper with Image Transformers](https://arxiv.org/pdf/2103.17239.pdf). 
25 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 26 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 27 | activation: activation used in whole model, default `gelu`. 28 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 29 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 30 | A higher value means a higher probability will drop the deep branch. 31 | or `0` to disable (default). 32 | dropout: dropout rate if top layers is included. 33 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 34 | Set `classifier_activation=None` to return the logits of the "top" layer. 35 | pretrained: one of None or "imagenet". 36 | Will try to download and load pre-trained model weights if not None. 37 | 38 | Returns: 39 | A `keras.Model` instance. 40 | """ 41 | 42 | FasterViT.__doc__ = __head_doc__ + """ 43 | Args: 44 | num_blocks: number of blocks in each stack. 45 | num_heads: num heads for each stack. 46 | stem_hidden_dim: hidden dimension for the 1st stem `Conv2D`. 47 | embed_dim: basic hidden dims, expand * 2 for each stack. 48 | mlp_ratio: expand ratio for mlp blocks hidden channel. 49 | model_name: string, model name. 50 | """ + __tail_doc__ + """ 51 | Model architectures: 52 | | Model | Params | FLOPs | Input | Top1 Acc | 53 | | ---------- | -------- | ------- | ----- | -------- | 54 | | FasterViT0 | 31.40M | 3.51G | 224 | 82.1 | 55 | | FasterViT1 | 53.37M | 5.52G | 224 | 83.2 | 56 | | FasterViT2 | 75.92M | 9.00G | 224 | 84.2 | 57 | | FasterViT3 | 159.55M | 18.75G | 224 | 84.9 | 58 | | FasterViT4 | 351.12M | 41.57G | 224 | 85.4 | 59 | | FasterViT5 | 957.52M | 114.08G | 224 | 85.6 | 60 | | FasterViT6 | 1360.33M | 144.13G | 224 | 85.8 | 61 | """ 62 | 63 | FasterViT0.__doc__ = __head_doc__ + """ 64 | Args: 65 | """ + __tail_doc__ 66 | 67 | FasterViT1.__doc__ = FasterViT0.__doc__ 68 | FasterViT2.__doc__ = FasterViT0.__doc__ 69 | FasterViT3.__doc__ = FasterViT0.__doc__ 70 | FasterViT4.__doc__ = FasterViT0.__doc__ 71 | FasterViT5.__doc__ = FasterViT0.__doc__ 72 | FasterViT6.__doc__ = FasterViT0.__doc__ 73 | -------------------------------------------------------------------------------- /keras_cv_attention_models/imagenet/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | @tf.keras.utils.register_keras_serializable(package="kecamLoss") 5 | class BinaryCrossEntropyTimm(tf.keras.losses.BinaryCrossentropy): 6 | """ 7 | >>> import torch, timm.loss 8 | >>> from keras_cv_attention_models.imagenet import losses 9 | >>> tt = timm.loss.BinaryCrossEntropy(smoothing=0.0, target_threshold=0.2) 10 | >>> ss = losses.BinaryCrossEntropyTimm(target_threshold=0.2, from_logits=True) 11 | >>> y_true = tf.one_hot(np.random.permutation(20).reshape(2, 10), 10).numpy() 12 | >>> y_true = np.clip(y_true[0] + y_true[1], 0, 1) 13 | >>> y_pred = np.random.uniform(size=(10, 10)) 14 | >>> torch_out = tt(torch.from_numpy(y_pred), torch.from_numpy(y_true)).numpy() 15 | >>> keras_out = ss(y_true, y_pred).numpy() 16 | >>> print(f"{torch_out = }, {keras_out = }") 17 | # torch_out = array(0.9457581, dtype=float32), keras_out = 0.945758044719696 18 | """ 19 | 20 | def __init__(self, target_threshold=0.0, label_smoothing=0.0, **kwargs): 21 | super().__init__(label_smoothing=label_smoothing, **kwargs) 22 | 
self.target_threshold = target_threshold 23 | self.label_smoothing = label_smoothing 24 | 25 | def call(self, y_true, y_pred): 26 | target_threshold = tf.cast(self.target_threshold, y_true.dtype) 27 | y_true = tf.where(y_true > target_threshold, tf.ones_like(y_true), tf.zeros_like(y_true)) 28 | return super().call(y_true, y_pred) 29 | 30 | def get_config(self): 31 | config = super().get_config() 32 | config.update({"target_threshold": self.target_threshold, "label_smoothing": self.label_smoothing}) 33 | return config 34 | 35 | 36 | @tf.keras.utils.register_keras_serializable(package="kecamLoss") 37 | class DistillKLDivergenceLoss(tf.keras.losses.Loss): 38 | """[PDF 2106.05237 Knowledge distillation: A good teacher is patient and consistent](https://arxiv.org/pdf/2106.05237.pdf) 39 | Modified according [Knowledge distillation recipes](https://keras.io/examples/keras_recipes/better_knowledge_distillation/) 40 | 41 | Temperature affecting: 42 | >>> teacher_prob = np.array([0, 0.2, 0.4, 0.6, 0.8, 1.0]) 43 | >>> _ = [print("temperature:", temp, tf.nn.softmax(teacher_prob / temp).numpy()) for temp in [0.1, 1, 10, 20]] 44 | >>> # temperature: 0.1 [3.92559586e-05 2.90064480e-04 2.14330272e-03 1.58369840e-02 1.17020363e-01 8.64670029e-01] 45 | >>> # temperature: 1 [0.09542741 0.11655531 0.14236097 0.17388009 0.21237762 0.25939861] 46 | >>> # temperature: 10 [0.1584458 0.16164661 0.16491209 0.16824354 0.17164228 0.17510968] 47 | >>> # temperature: 20 [0.16252795 0.16416138 0.16581123 0.16747766 0.16916084 0.17086094] 48 | """ 49 | 50 | def __init__(self, temperature=10, **kwargs): 51 | super().__init__(**kwargs) 52 | self.temperature = temperature 53 | # self.kl_divergence = tf.keras.losses.KLDivergence() 54 | 55 | def call(self, teacher_prob, student_prob): 56 | return tf.losses.kl_divergence( 57 | tf.nn.softmax(teacher_prob / self.temperature, axis=-1), 58 | tf.nn.softmax(student_prob / self.temperature, axis=-1), 59 | ) 60 | 61 | 62 | # Not using, from VOLO with mix_token lambda 63 | def token_label_class_loss(y_true, y_pred): 64 | # tf.print(", y_true:", y_true.shape, "y_pred:", y_pred.shape, end="") 65 | if y_pred.shape[-1] != y_true.shape[-1]: 66 | y_pred, cls_lambda = y_pred[:, :-1], y_pred[:, -1:] 67 | y_true = tf.cast(y_true, y_pred.dtype) 68 | y_true = cls_lambda * y_true + (1 - cls_lambda) * y_true[::-1] 69 | return keras.losses.categorical_crossentropy(y_true, y_pred, from_logits=True) 70 | -------------------------------------------------------------------------------- /keras_cv_attention_models/tinyvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.tinyvit.tinyvit import TinyViT, TinyViT_5M, TinyViT_11M, TinyViT_21M 2 | 3 | __head_doc__ = """ 4 | Keras implementation of [Github microsoft/TinyViT](https://github.com/microsoft/Cream/tree/main/TinyViT). 5 | Paper [PDF 2207.10666 TinyViT: Fast Pretraining Distillation for Small Vision Transformers](https://arxiv.org/pdf/2207.10666.pdf). 6 | """ 7 | 8 | __tail_doc__ = """ input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 9 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 10 | activation: activation used in whole model, default `gelu`. 11 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 
12 | Can be a constant value like `0.2`, 13 | or a tuple value like `(0, 0.2)` indicating the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 14 | A higher value means a higher probability will drop the deep branch. 15 | or `0` to disable (default). 16 | layer_scale: int value indicating the layer scale init value for each stack. Default 0 for not using. 17 | [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `None`. 21 | pretrained: one of `None` (random initialization) or 'imagenet' or 'imagenet21k-ft1k' (pre-trained on ImageNet21k and fine-tuned on ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | 24 | Returns: 25 | A `keras.Model` instance. 26 | """ 27 | 28 | TinyViT.__doc__ = __head_doc__ + """ 29 | Args: 30 | num_blocks: number of blocks in each stack. 31 | out_channels: output channels for each stack. 32 | block_types: block types for each stack, 33 | - `conv` or any word starting with `c` / `C` means `mlp_block_with_depthwise_conv` block. 34 | - `transform` or any word starting with `t` / `T` means `multi_head_self_attention` block. 35 | value could be in a format like `"cctt"` or `"CCTT"` or `["conv", "conv", "transform", "transform"]`. 36 | num_heads: int or list of int value indicating the attention heads number for each transformer stack. 37 | window_ratios: int or list of int value indicating the attention heads window ratio number for each transformer stack. 38 | Actually uses `window_size = ceil(cur_input_shape / window_ratio)`. 39 | For `input_shape=(224, 224, 3)` window_sizes will be `[7, 7, 14, 7]`, for `(384, 384, 3)` they will be `[12, 12, 24, 12]`. 40 | mlp_ratio: int value indicating the expand ratio for mlp blocks hidden channel in each stack. 41 | model_name: string, model name.
42 | """ + __tail_doc__ + """ 43 | Model architectures: 44 | | Model | Params | FLOPs | Input | Top1 Acc | 45 | | -------------------- | ------ | ----- | ----- | -------- | 46 | | TinyViT_5M, distill | 5.4M | 1.3G | 224 | 79.1 | 47 | | - imagenet21k-ft1k | 5.4M | 1.3G | 224 | 80.7 | 48 | | TinyViT_11M, distill | 11M | 2.0G | 224 | 81.5 | 49 | | - imagenet21k-ft1k | 11M | 2.0G | 224 | 83.2 | 50 | | TinyViT_21M, distill | 21M | 4.3G | 224 | 83.1 | 51 | | - imagenet21k-ft1k | 21M | 4.3G | 224 | 84.8 | 52 | | | 21M | 13.8G | 384 | 86.2 | 53 | | | 21M | 27.0G | 512 | 86.5 | 54 | """ 55 | 56 | TinyViT_5M.__doc__ = __head_doc__ + """ 57 | Args: 58 | """ + __tail_doc__ 59 | 60 | TinyViT_11M.__doc__ = TinyViT_5M.__doc__ 61 | TinyViT_21M.__doc__ = TinyViT_5M.__doc__ 62 | -------------------------------------------------------------------------------- /keras_cv_attention_models/clip/torch_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from PIL import Image 5 | from torch.utils.data import Dataset, DataLoader 6 | 7 | 8 | def read_from_tsv(data_path): 9 | import csv 10 | 11 | delimiter = "\t" if data_path.endswith(".tsv") else "," 12 | train_images, train_captions, test_images, test_captions, base_path, is_train = [], [], [], [], "", True 13 | with open(data_path) as ff: 14 | for ii in csv.reader(ff, delimiter=delimiter): 15 | if ii[0] == "base_path": # special keys for info 16 | base_path = os.path.expanduser(ii[1]) 17 | elif ii[0] == "TEST": # Use this as indicator for start of test set 18 | is_train = False 19 | elif is_train: 20 | train_images.append(ii[0]) 21 | train_captions.append(ii[1]) 22 | else: 23 | test_images.append(ii[0]) 24 | test_captions.append(ii[1]) 25 | if len(base_path) > 0: 26 | train_images = [os.path.join(base_path, ii) for ii in train_images] 27 | test_images = [os.path.join(base_path, ii) for ii in test_images] 28 | return train_images, train_captions, test_images, test_captions 29 | 30 | 31 | class CaptionDataset(Dataset): 32 | def __init__(self, images, captions, tokenizer, is_train=True, image_size=224): 33 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, Resize, InterpolationMode, ToTensor 34 | 35 | self.images, self.captions, self.tokenizer = images, captions, tokenizer 36 | self.context_length = self.tokenizer.context_length 37 | 38 | # self.mean, self.std = (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) # value from openai/CLIP 39 | self.mean, self.std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225) 40 | interpolation = InterpolationMode.BICUBIC 41 | image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size) 42 | self.transforms = Compose( 43 | [ 44 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=interpolation) if is_train else Resize(image_size, interpolation=interpolation), 45 | lambda image: image.convert("RGB"), 46 | ToTensor(), 47 | Normalize(mean=self.mean, std=self.std), 48 | ] 49 | ) 50 | 51 | def __len__(self): 52 | return len(self.images) 53 | 54 | def __getitem__(self, idx): 55 | images = self.transforms(Image.open(str(self.images[idx]))) 56 | texts = torch.from_numpy(self.tokenizer(str(self.captions[idx]))) 57 | return images, texts 58 | 59 | 60 | def collate_wrapper(batch): 61 | images, texts = list(zip(*batch)) 62 | return (torch.stack(images), torch.stack(texts)), torch.arange(len(batch)) 63 | 64 | 65 | def init_dataset(data_path, caption_tokenizer, 
batch_size=64, image_size=224, num_workers=8): 66 | train_images, train_captions, test_images, test_captions = read_from_tsv(data_path) 67 | 68 | train_dataset = CaptionDataset(train_images, train_captions, tokenizer=caption_tokenizer, is_train=True, image_size=image_size) 69 | train_dataloader = DataLoader( 70 | train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, collate_fn=collate_wrapper, pin_memory=True, sampler=None, drop_last=True 71 | ) 72 | 73 | test_dataset = CaptionDataset(test_images, test_captions, tokenizer=caption_tokenizer, is_train=False, image_size=image_size) 74 | test_dataloader = DataLoader( 75 | test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_wrapper, pin_memory=True, sampler=None, drop_last=True 76 | ) 77 | 78 | return train_dataloader, test_dataloader 79 | -------------------------------------------------------------------------------- /keras_cv_attention_models/cspnext/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras CSPNeXt___ 2 | *** 3 | 4 | ## Summary 5 | - CSPNeXt is the backbone from article: [PDF 2212.07784 RTMDet: An Empirical Study of Designing Real-Time Object Detectors](https://arxiv.org/abs/2212.07784). 6 | - Model weights ported from [Github open-mmlab/mmdetection/rtmdet](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet#classification). 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------- | ------ | ----- | ----- | -------- | -------- | 12 | | CSPNeXtTiny | 2.73M | 0.34G | 224 | 69.44 | [cspnext_tiny_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_tiny_imagenet.h5) | 13 | | CSPNeXtSmall | 4.89M | 0.66G | 224 | 74.41 | [cspnext_small_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_small_imagenet.h5) | 14 | | CSPNeXtMedium | 13.05M | 1.92G | 224 | 79.27 | [cspnext_medium_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_medium_imagenet.h5) | 15 | | CSPNeXtLarge | 27.16M | 4.19G | 224 | 81.30 | [cspnext_large_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_large_imagenet.h5) | 16 | | CSPNeXtXLarge | 48.85M | 7.75G | 224 | 82.10 | [cspnext_xlarge_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/cspnext/cspnext_xlarge_imagenet.h5) | 17 | 18 | ## Usage 19 | ```py 20 | from keras_cv_attention_models import cspnext, test_images 21 | mm = cspnext.CSPNeXtTiny() 22 | 23 | # Run prediction 24 | preds = mm(mm.preprocess_input(test_images.cat())) 25 | print(mm.decode_predictions(preds)) 26 | # [('n02124075', 'Egyptian_cat', 0.46106383), ('n02123045', 'tabby', 0.19603978), ...] 27 | ``` 28 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 29 | ```py 30 | from keras_cv_attention_models import cspnext 31 | model = cspnext.CSPNeXtTiny(input_shape=(None, None, 3), num_classes=0) 32 | 33 | print(model(np.ones([1, 223, 123, 3])).shape) 34 | # (1, 7, 4, 384) 35 | print(model(np.ones([1, 32, 526, 3])).shape) 36 | # (1, 1, 17, 384) 37 | ``` 38 | **Using PyTorch backend** by set `KECAM_BACKEND='torch'` environment variable. 
39 | ```py 40 | os.environ['KECAM_BACKEND'] = 'torch' 41 | from keras_cv_attention_models import cspnext, test_images 42 | mm = cspnext.CSPNeXtSmall(input_shape=(219, 112, 3)) 43 | # >>>> Using PyTorch backend 44 | # >>>> Load pretrained from: ~/.keras/models/cspnext_small_imagenet.h5 45 | 46 | # Run prediction 47 | preds = mm(mm.preprocess_input(test_images.cat())) 48 | print(mm.decode_predictions(preds)) 49 | # [('n02124075', 'Egyptian_cat', 0.7909507), ('n02123045', 'tabby', 0.038315363), ...] 50 | ``` 51 | ## Verification with PyTorch version 52 | ```py 53 | inputs = np.random.uniform(size=(1, 224, 224, 3)).astype("float32") 54 | 55 | """ PyTorch CSPNeXt """ 56 | from mmdet import models 57 | torch_model = models.backbones.CSPNeXt() 58 | import torch 59 | ss = torch.load('cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth') 60 | ss = {kk.replace('backbone.', ''): vv for kk, vv in ss['state_dict'].items() if kk.startswith('backbone.')} 61 | torch_model.load_state_dict(ss) 62 | _ = torch_model.eval() 63 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2))[-1].permute([0, 2, 3, 1]).detach().numpy() 64 | 65 | """ Keras CSPNeXtLarge """ 66 | from keras_cv_attention_models import cspnext 67 | mm = cspnext.CSPNeXtLarge(pretrained="imagenet", num_classes=0) # Exclude header 68 | keras_out = mm(inputs).numpy() 69 | 70 | """ Verification """ 71 | print(f"{np.allclose(torch_out, keras_out, atol=1e-4) = }") 72 | # np.allclose(torch_out, keras_out, atol=1e-4) = True 73 | ``` 74 | *** 75 | -------------------------------------------------------------------------------- /keras_cv_attention_models/ghostnet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras GhostNetV2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github huawei-noah/ghostnetv2_pytorch](https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnetv2_pytorch). Paper [PDF GhostNetV2: Enhance Cheap Operation with Long-Range Attention](https://openreview.net/pdf/6db544c65bbd0fa7d7349508454a433c112470e2.pdf). 6 | - `GhostNet_100` model weights ported from official publication [Github huawei-noah/ghostnet_pytorch](https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnet_pytorch). Paper [PDF 1911.11907 GhostNet: More Features from Cheap Operations](https://arxiv.org/pdf/1911.11907.pdf). 7 | - `GhostNet_050` and `GhostNet_130` model weights ported from [Github PaddlePaddle/PaddleClas](https://github.com/PaddlePaddle/PaddleClas). 8 | 9 | ![ghostnetv2](https://user-images.githubusercontent.com/5744524/202699896-4c429db1-8038-4dc9-992b-d355d1cfee6e.PNG) 10 | *** 11 | 12 | ## Models 13 | - `GhostNetV2_100` should be same with `GhostNetV2 (1.0x)`. Weights are ported from official publication. Currently it's only weights with accuracy `74.41` provided. 
14 | 15 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 16 | | -------------- | ------ | ------ | ----- | -------- | -------- | 17 | | GhostNetV2_100 | 6.12M | 168.5M | 224 | 75.3 | [ghostnetv2_100_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnetv2_100_imagenet.h5) | 18 | | GhostNetV2_130 | 8.96M | 271.1M | 224 | 76.9 | [ghostnetv2_130_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnetv2_130_imagenet.h5) | 19 | | GhostNetV2_160 | 12.39M | 400.9M | 224 | 77.8 | [ghostnetv2_160_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnetv2_160_imagenet.h5) | 20 | 21 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 22 | | ------------ | ------ | ------ | ----- | -------- | -------- | 23 | | GhostNet_050 | 2.59M | 42.6M | 224 | 66.88 | [ghostnet_050_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_050_imagenet.h5) | 24 | | GhostNet_100 | 5.18M | 141.7M | 224 | 74.16 | [ghostnet_100_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_100_imagenet.h5) | 25 | | GhostNet_130 | 7.36M | 227.7M | 224 | 75.79 | [ghostnet_130_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_130_imagenet.h5) | 26 | | - ssld | 7.36M | 227.7M | 224 | 79.38 | [ghostnet_130_ssld.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/ghostnetv2/ghostnet_130_ssld.h5) | 27 | ## Usage 28 | ```py 29 | from keras_cv_attention_models import ghostnet 30 | 31 | # Will download and load pretrained imagenet weights. 32 | mm = ghostnet.GhostNetV2_100(pretrained="imagenet") 33 | 34 | # Run prediction 35 | import tensorflow as tf 36 | from tensorflow import keras 37 | from skimage.data import chelsea 38 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 39 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 40 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 41 | # [('n02124075', 'Egyptian_cat', 0.81426907), ('n02123045', 'tabby', 0.07202001), ...] 42 | ``` 43 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 
44 | ```py 45 | from keras_cv_attention_models import ghostnet 46 | model = ghostnet.GhostNetV2_100(input_shape=(None, None, 3), num_classes=0) 47 | 48 | print(model(np.ones([1, 224, 224, 3])).shape) 49 | # (1, 7, 7, 960) 50 | print(model(np.ones([1, 512, 384, 3])).shape) 51 | # (1, 16, 12, 960) 52 | ``` 53 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pytorch_backend/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import inspect 3 | import torch 4 | import hashlib 5 | 6 | _GLOBAL_CUSTOM_OBJECTS = {} 7 | _GLOBAL_CUSTOM_NAMES = {} 8 | 9 | 10 | def register_keras_serializable(package="Custom", name=None): 11 | def decorator(arg): 12 | """Registers a class with the Keras serialization framework.""" 13 | class_name = name if name is not None else arg.__name__ 14 | registered_name = package + ">" + class_name 15 | 16 | if inspect.isclass(arg) and not hasattr(arg, "get_config"): 17 | raise ValueError("Cannot register a class that does not have a " "get_config() method.") 18 | 19 | # if registered_name in _GLOBAL_CUSTOM_OBJECTS: 20 | # raise ValueError(f"{registered_name} has already been registered to " f"{_GLOBAL_CUSTOM_OBJECTS[registered_name]}") 21 | 22 | # if arg in _GLOBAL_CUSTOM_NAMES: 23 | # raise ValueError(f"{arg} has already been registered to " f"{_GLOBAL_CUSTOM_NAMES[arg]}") 24 | _GLOBAL_CUSTOM_OBJECTS[registered_name] = arg 25 | _GLOBAL_CUSTOM_NAMES[arg] = registered_name 26 | 27 | return arg 28 | 29 | return decorator 30 | 31 | 32 | def validate_file_md5(fpath, file_hash, chunk_size=65535): 33 | """Validates a file against a md5 hash. From keras/utils/data_utils.py""" 34 | hasher = hashlib.md5() 35 | with open(fpath, "rb") as fpath_file: 36 | for chunk in iter(lambda: fpath_file.read(chunk_size), b""): 37 | hasher.update(chunk) 38 | return str(hasher.hexdigest()) == str(file_hash) 39 | 40 | 41 | def _extract_archive(file_path, path=".", archive_format="auto"): 42 | if "zip" in archive_format or (archive_format == "auto" and file_path.endswith(".zip")): 43 | import zipfile 44 | 45 | assert zipfile.is_zipfile(file_path), "Not a zip file: {}".format(file_path) 46 | open_fn = zipfile.ZipFile 47 | elif "tar" in archive_format or (archive_format == "auto" and (file_path.endswith(".tar") or file_path.endswith(".tar.gz"))): 48 | import tarfile 49 | 50 | assert tarfile.is_tarfile(file_path), "Not a tar file: {}".format(file_path) 51 | open_fn = tarfile.open 52 | else: 53 | raise ValueError("Not a supported extract file format: {}".format(file_path)) 54 | 55 | print(">>>> Extract {} -> {}".format(file_path, path)) 56 | with open_fn(file_path) as ff: 57 | ff.extractall(path) 58 | return path 59 | 60 | 61 | def get_file(fname=None, origin=None, cache_subdir="datasets", file_hash=None, extract=False): 62 | # print(f">>>> {fname = }, {origin = }, {cache_subdir = }, {file_hash = }") 63 | save_dir = os.path.join(os.path.expanduser("~"), ".keras", cache_subdir) 64 | if not os.path.exists(save_dir): 65 | os.makedirs(save_dir, exist_ok=True) 66 | fname = os.path.basename(origin) if fname is None else fname 67 | file_path = os.path.join(save_dir, fname) 68 | if os.path.exists(file_path): 69 | if file_hash is not None and not validate_file_md5(file_path, file_hash): 70 | print( 71 | "A local file was found, but it seems to be incomplete or outdated because the md5 file hash does not match the original value of " 72 | f"{file_hash} so we will re-download the data." 
73 | ) 74 | else: 75 | return file_path 76 | 77 | print("Downloading data from {} to {}".format(origin, file_path)) 78 | torch.hub.download_url_to_file(origin, file_path) 79 | if os.path.exists(file_path) and file_hash is not None and not validate_file_md5(file_path, file_hash): 80 | raise ValueError("Incomplete or corrupted file detected. The md5 file hash does not match the provided value {}.".format(file_hash)) 81 | 82 | if extract: 83 | _extract_archive(file_path, path=save_dir) # return tar file path, just like keras one 84 | return file_path 85 | -------------------------------------------------------------------------------- /keras_cv_attention_models/ghostnet/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.ghostnet.ghostnet_v2 import GhostNetV2, GhostNetV2_100, GhostNetV2_130, GhostNetV2_160 2 | from keras_cv_attention_models.ghostnet.ghostnet import GhostNet, GhostNet_050, GhostNet_100, GhostNet_130 3 | 4 | __v2_head_doc__ = """ 5 | Keras implementation of [Gitee mindspore/models/ghostnetv2](https://gitee.com/mindspore/models/tree/master/research/cv/ghostnetv2). 6 | Paper [PDF GhostNetV2: Enhance Cheap Operation with Long-Range Attention](https://openreview.net/pdf/6db544c65bbd0fa7d7349508454a433c112470e2.pdf). 7 | """ 8 | 9 | __tail_doc__ = """ kernel_sizes: kernel_size for each stack. 10 | first_ghost_channels: num channels for first ghost module in each stack. 11 | out_channels: output channels for each stack. 12 | se_ratios: se_ratio for each stack. 13 | strides: stride for each stack. 14 | stem_width: output dimension for stem block. 15 | stem_strides: strides for stem `Conv2D`, default `2`. 16 | num_ghost_module_v1_stacks: num of `ghost_module` stcks on the head, others are `ghost_module_multiply`. 17 | - for `GhostNet` v1 way, default `-1` for all using `ghost_module`. 18 | - for `GhostNetV2` way, default `2` for only using `ghost_module` in the first 2 stacks. 19 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 20 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 21 | activation: activation used in whole model, default "relu". 22 | dropout: dropout rate if top layers is included. 23 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 24 | Set `classifier_activation=None` to return the logits of the "top" layer. 25 | pretrained: One of `[None, "imagenet", "ssld"]`. "ssld" if for `GhostNet_130`. 26 | **kwargs: other parameters if available. 27 | 28 | Returns: 29 | A `keras.Model` instance. 30 | """ 31 | 32 | GhostNetV2.__doc__ = __v2_head_doc__ + """ 33 | Args: 34 | width_mul: expansion ratio of `fist_ghost_channels` and `out_channels` in each block. 35 | model_name: string, model name. 
36 | """ + __tail_doc__ + """ 37 | Model architectures: 38 | | Model | Params | FLOPs | Input | Top1 Acc | 39 | | ----------------- | ------ | ------ | ----- | -------- | 40 | | GhostNetV2_100 | 6.12M | 168.5M | 224 | 74.41 | 41 | | GhostNetV2 (1.0x) | 6.12M | 168.5M | 224 | 75.3 | 42 | | GhostNetV2 (1.3x) | 8.96M | 271.1M | 224 | 76.9 | 43 | | GhostNetV2 (1.6x) | 12.39M | 400.9M | 224 | 77.8 | 44 | """ 45 | 46 | GhostNetV2_100.__doc__ = __v2_head_doc__ + """ 47 | Args: 48 | """ + __tail_doc__ 49 | 50 | GhostNetV2_130.__doc__ = GhostNetV2_100.__doc__ 51 | GhostNetV2_160.__doc__ = GhostNetV2_100.__doc__ 52 | 53 | __v1_head_doc__ = """ 54 | Keras implementation of [Github huawei-noah/ghostnet_pytorch](https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/ghostnet_pytorch). 55 | Paper [PDF 1911.11907 GhostNet: More Features from Cheap Operations](https://arxiv.org/pdf/1911.11907.pdf). 56 | """ 57 | 58 | GhostNet.__doc__ = __v1_head_doc__ + """ 59 | Args: 60 | width_mul: expansion ratio of `fist_ghost_channels` and `out_channels` in each block. 61 | stem_width: output dimension for stem block. 62 | model_name: string, model name. 63 | """ + __tail_doc__ + """ 64 | Model architectures: 65 | | Model | Params | FLOPs | Input | Top1 Acc | 66 | | ------------ | ------ | ------ | ----- | -------- | 67 | | GhostNet_050 | 2.59M | 42.6M | 224 | 66.88 | 68 | | GhostNet_100 | 5.18M | 141.7M | 224 | 74.16 | 69 | | GhostNet_130 | 7.36M | 227.7M | 224 | 75.79 | 70 | | - ssld | 7.36M | 227.7M | 224 | 79.38 | 71 | """ 72 | 73 | GhostNet_050.__doc__ = __v1_head_doc__ + """ 74 | Args: 75 | """ + __tail_doc__ 76 | 77 | GhostNet_100.__doc__ = GhostNet_050.__doc__ 78 | GhostNet_130.__doc__ = GhostNet_050.__doc__ 79 | -------------------------------------------------------------------------------- /keras_cv_attention_models/iformer/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras InceptionTransformer___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github sail-sg/iFormer](https://github.com/sail-sg/iFormer). Paper [PDF 2205.12956 Inception Transformer](https://arxiv.org/pdf/2205.12956.pdf). 6 | - Model weights ported from official publication. 
7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------ | ------ | ------ | ----- | -------- | -------- | 12 | | IFormerSmall | 19.9M | 4.88G | 224 | 83.4 | [iformer_small_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_small_224_imagenet.h5) | 13 | | | 20.9M | 16.29G | 384 | 84.6 | [iformer_small_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_small_384_imagenet.h5) | 14 | | IFormerBase | 47.9M | 9.44G | 224 | 84.6 | [iformer_base_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_base_224_imagenet.h5) | 15 | | | 48.9M | 30.86G | 384 | 85.7 | [iformer_base_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_base_384_imagenet.h5) | 16 | | IFormerLarge | 86.6M | 14.12G | 224 | 84.6 | [iformer_large_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_largel_224_imagenet.h5) | 17 | | | 87.7M | 45.74G | 384 | 85.8 | [iformer_large_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/iformer/iformer_largel_384_imagenet.h5) | 18 | ## Usage 19 | ```py 20 | from keras_cv_attention_models import iformer 21 | 22 | # Will download and load pretrained imagenet weights. 23 | mm = iformer.IFormerSmall(pretrained="imagenet") 24 | 25 | # Run prediction 26 | import tensorflow as tf 27 | from tensorflow import keras 28 | from skimage.data import chelsea 29 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 30 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 31 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 32 | # [('n02124075', 'Egyptian_cat', 0.7471715), ('n02123159', 'tiger_cat', 0.035306472), ...] 33 | ``` 34 | **Change input resolution**. 35 | ```py 36 | from keras_cv_attention_models import iformer 37 | mm = iformer.IFormerSmall(input_shape=(512, 393, 3), pretrained="imagenet") 38 | # >>>> Load pretrained from: ~/.keras/models/iformer_small_384_imagenet.h5 39 | # >>>> Reload mismatched weights: 384 -> (512, 393) 40 | # >>>> Reload layer: stack1_positional_embedding 41 | # ... 42 | 43 | # Run prediction 44 | from skimage.data import chelsea 45 | preds = mm(mm.preprocess_input(chelsea())) 46 | print(mm.decode_predictions(preds)) 47 | # [('n02124075', 'Egyptian_cat', 0.72780704), ('n02123159', 'tiger_cat', 0.11522171), ...] 
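# Note: the '>>>> Reload mismatched weights' log above indicates the positional embedding layers
# are the ones reloaded to fit the new input_shape.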
48 | ``` 49 | ## Verification with PyTorch version 50 | ```py 51 | """ PyTorch iformer_small """ 52 | sys.path.append('../iFormer/') 53 | sys.path.append('../pytorch-image-models/') # Needs timm 54 | import torch 55 | from models import inception_transformer 56 | 57 | torch_model = inception_transformer.iformer_small(pretrained=True) 58 | _ = torch_model.eval() 59 | 60 | """ Keras IFormerSmall """ 61 | from keras_cv_attention_models import iformer 62 | mm = iformer.IFormerSmall(pretrained="imagenet", classifier_activation=None) 63 | 64 | """ Verification """ 65 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 66 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 67 | keras_out = mm(inputs).numpy() 68 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }") 69 | # np.allclose(torch_out, keras_out, atol=1e-5) = True 70 | ``` 71 | -------------------------------------------------------------------------------- /keras_cv_attention_models/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import keras_cv_attention_models as __package__ # don't show `keras_cv_attention_models` under `keras_cv_attention_models.models.` 3 | 4 | 5 | def register_model(model_func): 6 | if not hasattr(__package__.models, model_func.__name__): 7 | setattr(__package__.models, model_func.__name__, model_func) 8 | return model_func 9 | 10 | 11 | def no_grad_if_torch(func): 12 | if __package__.backend.is_torch_backend: 13 | import torch 14 | 15 | def no_grad_call(*args, **kwargs): 16 | with torch.no_grad(): 17 | return func(*args, **kwargs) 18 | 19 | return no_grad_call 20 | else: 21 | return func 22 | 23 | 24 | class FakeModelWrapper: 25 | def __init__(self, models, name="model"): 26 | self.models = models if isinstance(models, (list, tuple)) else [models] 27 | self.name = name 28 | 29 | def cuda(self): 30 | """Torch function""" 31 | self.models = [model.cuda() for model in self.models] 32 | return self 33 | 34 | def cpu(self): 35 | """Torch function""" 36 | self.models = [model.cpu() for model in self.models] 37 | return self 38 | 39 | def float(self): 40 | """Torch function""" 41 | self.models = [model.float() for model in self.models] 42 | return self 43 | 44 | def half(self): 45 | """Torch function""" 46 | self.models = [model.half() for model in self.models] 47 | return self 48 | 49 | def to(self, *args): 50 | """Torch function""" 51 | self.models = [model.to(*args) for model in self.models] 52 | return self 53 | 54 | def _save_load_file_path_rule_(self, file_path=None): 55 | file_path = self.name if file_path is None else file_path 56 | suffix = os.path.splitext(file_path)[1] 57 | if suffix in [".h5", ".keras", ".pt", ".pth"]: 58 | file_path = os.path.splitext(file_path)[0] 59 | save_path_rule = lambda model_name: file_path + "_" + model_name + suffix 60 | else: # Regard as directory 61 | if not os.path.exists(file_path): 62 | os.makedirs(file_path, exist_ok=True) 63 | save_path_rule = lambda model_name: os.path.join(file_path, model_name + ".h5") 64 | return save_path_rule 65 | 66 | def save(self, file_path=None): 67 | """file_path: if suffix in [".h5", ".keras", ".pt", ".pth"], will save as {file_path}_{model_name}.{suffix}, 68 | or will regard as directory, and save to {file_path}/{model_name}.h5 69 | """ 70 | save_path_rule = self._save_load_file_path_rule_(file_path) 71 | for model in self.models: 72 | cur_save_path = save_path_rule(model.name) 73 | print(">>>> Saving {} to 
{}".format(model.name, cur_save_path)) 74 | model.save(cur_save_path) 75 | 76 | def save_weights(self, file_path=None): 77 | """file_path: if suffix in [".h5", ".keras", ".pt", ".pth"], will save as {file_path}_{model_name}.{suffix}, 78 | or will regard as directory, and save to {file_path}/{model_name}.h5 79 | """ 80 | save_path_rule = self._save_load_file_path_rule_(file_path) 81 | for model in self.models: 82 | cur_save_path = save_path_rule(model.name) 83 | print(">>>> Saving {} weights to {}".format(model.name, cur_save_path)) 84 | model.save_weights(cur_save_path) 85 | 86 | def load_weights(self, file_path=None): 87 | """file_path: if suffix in [".h5", ".keras", ".pt", ".pth"], will load from {file_path}_{model_name}.{suffix}, 88 | or will regard as directory, and load from {file_path}/{model_name}.h5 89 | """ 90 | save_path_rule = self._save_load_file_path_rule_(file_path) 91 | for model in self.models: 92 | cur_save_path = save_path_rule(model.name) 93 | print(">>>> Loading {} from {}".format(model.name, cur_save_path)) 94 | model.load_weights(cur_save_path) 95 | -------------------------------------------------------------------------------- /keras_cv_attention_models/pvt/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras PyramidVisionTransformerV2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github whai362/PVT](https://github.com/whai362/PVT/tree/v2/classification). Paper [PDF 2106.13797 PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/pdf/2106.13797.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | --------------- | ------ | ------ | ----- | -------- | -------- | 12 | | PVT_V2B0 | 3.7M | 580.3M | 224 | 70.5 | [pvt_v2_b0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b0_imagenet.h5) | 13 | | PVT_V2B1 | 14.0M | 2.14G | 224 | 78.7 | [pvt_v2_b1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b1_imagenet.h5) | 14 | | PVT_V2B2 | 25.4M | 4.07G | 224 | 82.0 | [pvt_v2_b2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b2_imagenet.h5) | 15 | | PVT_V2B2_linear | 22.6M | 3.94G | 224 | 82.1 | [pvt_v2_b2_linear.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b2_linear_imagenet.h5) | 16 | | PVT_V2B3 | 45.2M | 6.96G | 224 | 83.1 | [pvt_v2_b3_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b3_imagenet.h5) | 17 | | PVT_V2B4 | 62.6M | 10.19G | 224 | 83.6 | [pvt_v2_b4_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b4_imagenet.h5) | 18 | | PVT_V2B5 | 82.0M | 11.81G | 224 | 83.8 | [pvt_v2_b5_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/pvt/pvt_v2_b5_imagenet.h5) | 19 | ## Usage 20 | ```py 21 | from keras_cv_attention_models import pvt 22 | 23 | # Will download and load pretrained imagenet weights. 
24 | mm = pvt.PVT_V2B2(pretrained="imagenet")
25 | 
26 | # Run prediction
27 | import tensorflow as tf
28 | from tensorflow import keras
29 | from skimage.data import chelsea
30 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat
31 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy()
32 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0])
33 | # [('n02124075', 'Egyptian_cat', 0.6658455), ('n02123159', 'tiger_cat', 0.08825972), ...]
34 | ```
35 | **Change input resolution**. Note: for `PVT_V2B2_linear` using `addaptive_pooling_2d` with `output_size=7`, input shape should be larger than `193`.
36 | ```py
37 | from keras_cv_attention_models import pvt
38 | mm = pvt.PVT_V2B1(input_shape=(128, 192, 3), pretrained="imagenet")
39 | # >>>> Load pretrained from: ~/.keras/models/pvt_v2_b1_imagenet.h5
40 | 
41 | # Run prediction
42 | from skimage.data import chelsea
43 | preds = mm(mm.preprocess_input(chelsea()))
44 | print(mm.decode_predictions(preds))
45 | # [('n02124075', 'Egyptian_cat', 0.8482509), ('n02123045', 'tabby', 0.07139703), ...]
46 | ```
47 | ## Verification with PyTorch version
48 | ```py
49 | """ PyTorch pvt_v2_b0 """
50 | sys.path.append('../PVT-2/')
51 | sys.path.append('../pytorch-image-models/') # Needs timm
52 | import torch
53 | from classification import pvt_v2
54 | 
55 | torch_model = pvt_v2.pvt_v2_b0()
56 | ss = torch.load('pvt_v2_b0.pth', map_location=torch.device('cpu'))
57 | torch_model.load_state_dict(ss)
58 | _ = torch_model.eval()
59 | 
60 | """ Keras PVT_V2B0 """
61 | from keras_cv_attention_models import pvt
62 | mm = pvt.PVT_V2B0(pretrained="imagenet", classifier_activation=None)
63 | 
64 | """ Verification """
65 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32")
66 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy()
67 | keras_out = mm(inputs).numpy()
68 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }")
69 | # np.allclose(torch_out, keras_out, atol=1e-5) = True
70 | ```
71 | 
--------------------------------------------------------------------------------
/keras_cv_attention_models/pytorch_backend/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | epsilon = 1e-7
4 | 
5 | 
6 | class Loss:
7 |     def __init__(self, reduction="AUTO", name=None):
8 |         self.reduction, self.name = reduction, name
9 | 
10 |     def __call__(self, y_true, y_pred, sample_weight=None):
11 |         pass
12 | 
13 | 
14 | def categorical_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1):
15 |     """
16 |     # from_logits=False
17 |     >>> import torch, tensorflow as tf
18 |     >>> from keras_cv_attention_models.pytorch_backend import losses
19 |     >>> xx, yy = tf.random.uniform([24, 10]), tf.one_hot(tf.random.uniform([24], 0, 10, dtype='int32'), 10)
20 |     >>> tf_out = tf.losses.categorical_crossentropy(yy, xx, from_logits=False).numpy().mean()
21 |     >>> torch_out = losses.categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=False)
22 |     >>> print(tf_out, torch_out, np.allclose(tf_out, torch_out))
23 |     >>> # 2.681877 tensor(2.6819) True
24 |     # from_logits=True
25 |     >>> tf_out = tf.losses.categorical_crossentropy(yy, xx, from_logits=True).numpy().mean()
26 |     >>> torch_out = losses.categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=True)
27 |     >>> print(tf_out, torch_out, 
np.allclose(tf_out, torch_out)) 28 | >>> # 2.3364408 tensor(2.3364) True 29 | """ 30 | if from_logits: 31 | return torch.nn.functional.cross_entropy(y_pred, y_true.argmax(-1), label_smoothing=label_smoothing) 32 | else: 33 | y_pred = y_pred / y_pred.sum(dim=axis, keepdim=True) 34 | y_pred = y_pred.clamp_(epsilon, 1.0 - epsilon) 35 | return -(y_true * y_pred.log()).sum(dim=axis).mean() 36 | 37 | 38 | def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0.0, axis=-1): 39 | """ 40 | # from_logits=False 41 | >>> import torch, tensorflow as tf 42 | >>> from keras_cv_attention_models.pytorch_backend import losses 43 | >>> xx, yy = tf.random.uniform([24, 10]), tf.random.uniform([24], 0, 10, dtype='int64') 44 | >>> tf_out = tf.losses.sparse_categorical_crossentropy(yy, xx, from_logits=False).numpy().mean() 45 | >>> torch_out = losses.sparse_categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=False) 46 | >>> print(tf_out, torch_out, np.allclose(tf_out, torch_out)) 47 | >>> # 2.677911 tensor(2.6779) True 48 | # from_logits=True 49 | >>> tf_out = tf.losses.sparse_categorical_crossentropy(yy, xx, from_logits=True).numpy().mean() 50 | >>> torch_out = losses.sparse_categorical_crossentropy(torch.from_numpy(yy.numpy()), torch.from_numpy(xx.numpy()), from_logits=True) 51 | >>> print(tf_out, torch_out, np.allclose(tf_out, torch_out)) 52 | >>> # 2.3503969 tensor(2.3504) True 53 | """ 54 | if from_logits: 55 | return torch.nn.functional.cross_entropy(y_pred, y_true, label_smoothing=label_smoothing) 56 | else: 57 | y_pred = y_pred / y_pred.sum(dim=axis, keepdim=True) 58 | y_pred = y_pred.clamp_(epsilon, 1.0 - epsilon) 59 | y_true = torch.nn.functional.one_hot(y_true, y_pred.shape[-1]) 60 | return -(y_true * y_pred.log()).sum(dim=axis).mean() 61 | 62 | 63 | class MeanSquaredError(Loss): 64 | """ 65 | >>> from keras_cv_attention_models.pytorch_backend import losses 66 | >>> aa = np.random.uniform(size=[4, 42, 42, 3]).astype("float32") 67 | >>> bb = np.random.uniform(size=[4, 42, 42, 3]).astype("float32") 68 | >>> print(f"{keras.losses.MeanSquaredError()(aa, bb).numpy() = }") 69 | # keras.losses.MeanSquaredError()(aa, bb).numpy() = 0.16724217 70 | >>> print(f"{losses.MeanSquaredError()(torch.from_numpy(aa), torch.from_numpy(bb)) = }") 71 | # losses.MeanSquaredError()(torch.from_numpy(aa), torch.from_numpy(bb)) = tensor(0.1672) 72 | """ 73 | 74 | def __init__(self, reduction="AUTO", name="mean_squared_error"): 75 | super().__init__(reduction=reduction, name=name) 76 | 77 | def __call__(self, y_true, y_pred, sample_weight=None): 78 | return torch.functional.F.mse_loss(y_pred, y_true) 79 | -------------------------------------------------------------------------------- /keras_cv_attention_models/llama2/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras LLaMA2___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github facebookresearch/llama](https://github.com/facebookresearch/llama). Paper [PDF 2307.09288 Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/pdf/2307.09288.pdf). 6 | - `LLaMA2_15M` / `LLaMA2_42M`, `LLaMA2_110M` model weights ported from [Github karpathy/llama2.c](https://github.com/karpathy/llama2.c). 7 | - `LLaMA2_1B` model weights ported from [Github jzhang38/TinyLlama](https://githubfast.com/jzhang38/TinyLlama) `TinyLlama-1.1B-Chat-V0.4` one. 
8 | ## Models
9 | - `Params` is counted with `include_top=True`; the count will match the model name if `include_top=False` is set.
10 | 
11 | | Model | Params | FLOPs | vocab_size | Val loss |
12 | | ----------- | ------ | ------ | ---------- | -------- |
13 | | [LLaMA2_15M](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_15m_tiny_stories.h5) | 24.41M | 4.06G | 32000 | 1.072 |
14 | | [LLaMA2_42M](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_42m_tiny_stories.h5) | 58.17M | 50.7G | 32000 | 0.847 |
15 | | [LLaMA2_110M](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_110m_tiny_stories.h5) | 134.1M | 130.2G | 32000 | 0.760 |
16 | | [LLaMA2_1B](https://github.com/leondgarse/keras_cv_attention_models/releases/download/llama2/llama2_1b_tiny_llama_1.1B_chat_v0.4.h5) | 1.10B | 2.50T | 32003 | |
17 | | LLaMA2_7B | 6.74B | 14.54T | 32000 | |
18 | ## Usage
19 | ```py
20 | from keras_cv_attention_models import llama2
21 | 
22 | mm = llama2.LLaMA2_42M()
23 | # >>>> Load pretrained from: ~/.keras/models/llama2_42m_tiny_stories.h5
24 | _ = mm.run_prediction("As evening fell, a maiden stood at the edge of a wood. In her hands,")
25 | # >>>> Load tokenizer from file: ~/.keras/datasets/llama_tokenizer.model
26 | #
27 | # As evening fell, a maiden stood at the edge of a wood. In her hands, she held a beautiful diamond. Everyone was surprised to see it.
28 | # "What is it?" one of the kids asked.
29 | # "It's a diamond," the maiden said.
30 | # ...
31 | ```
32 | **Set `include_top=False`** to exclude the model head layer.
33 | ```py
34 | from keras_cv_attention_models import llama2
35 | 
36 | mm = llama2.LLaMA2_42M(include_top=False)
37 | # >>>> Load pretrained from: ~/.keras/models/llama2_42m_tiny_stories.h5
38 | print(f"{mm.output_shape = }")
39 | # mm.output_shape = (None, 1024, 512)
40 | ```
41 | ## Convert weights
42 | - Manually download weights from [Huggingface meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) or [Huggingface LinkSoul/Chinese-Llama-2-7b](https://huggingface.co/LinkSoul/Chinese-Llama-2-7b), and convert them to `h5` format. The benefit of saving `h5` is that, like `npz` or `tfrecord`, weights can be loaded layer by layer without reading the entire file into memory.
43 | ```py
44 | # Set to build model using pure float16 if using Tensorflow
45 | policy = keras.mixed_precision.Policy("float16")
46 | keras.mixed_precision.set_global_policy(policy)
47 | 
48 | from keras_cv_attention_models import llama2
49 | _ = llama2.convert_huggingface_weights_to_h5("pytorch_model-00001-of-00002.bin", to_fp16=True)
50 | # >>>> Save to: pytorch_model-00001-of-00002.h5
51 | _ = llama2.convert_huggingface_weights_to_h5("pytorch_model-00002-of-00002.bin", to_fp16=True)
52 | # >>>> Save to: pytorch_model-00002-of-00002.h5
53 | ```
54 | Then load them back into the model.
55 | ```py 56 | policy = keras.mixed_precision.Policy("float16") 57 | keras.mixed_precision.set_global_policy(policy) 58 | 59 | from keras_cv_attention_models import llama2 60 | mm = llama2.LLaMA2_7B(pretrained=["pytorch_model-00001-of-00002.h5", "pytorch_model-00002-of-00002.h5"]) 61 | # >>>> Load pretrained from: pytorch_model-00001-of-00002.h5 62 | # >>>> Load pretrained from: pytorch_model-00002-of-00002.h5 63 | mm.save(mm.name + ".h5") # mm.half().save(mm.name + ".h5") if using PyTorch backend 64 | 65 | _ = mm.run_prediction("Who's there?") 66 | ``` 67 | *** 68 | -------------------------------------------------------------------------------- /keras_cv_attention_models/nfnets/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras NFNets___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github deepmind/nfnets](https://github.com/deepmind/deepmind-research/tree/master/nfnets). Paper [PDF 2102.06171 High-Performance Large-Scale Image Recognition Without Normalization](https://arxiv.org/pdf/2102.06171.pdf). 6 | - Model weights reloaded from official publication. 7 | - `ECA` and `Light` NFNets weights reloaded from timm [Github rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models). 8 | *** 9 | 10 | ## Models 11 | - `L` types models are light versions of `NFNet-F` from `timm`. 12 | - `ECA` type models are using `attn_type="eca"` instead of `attn_type="se"` from `timm`. 13 | 14 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 15 | | ----------- | ------ | ------- | ----- | -------- | -------- | 16 | | NFNetL0 | 35.07M | 7.13G | 288 | 82.75 | [nfnetl0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetl0_imagenet.h5) | 17 | | NFNetF0 | 71.5M | 12.58G | 256 | 83.6 | [nfnetf0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf0_imagenet.h5) | 18 | | NFNetF1 | 132.6M | 35.95G | 320 | 84.7 | [nfnetf1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf1_imagenet.h5) | 19 | | NFNetF2 | 193.8M | 63.24G | 352 | 85.1 | [nfnetf2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf2_imagenet.h5) | 20 | | NFNetF3 | 254.9M | 115.75G | 416 | 85.7 | [nfnetf3_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf3_imagenet.h5) | 21 | | NFNetF4 | 316.1M | 216.78G | 512 | 85.9 | [nfnetf4_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf4_imagenet.h5) | 22 | | NFNetF5 | 377.2M | 291.73G | 544 | 86.0 | [nfnetf5_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf5_imagenet.h5) | 23 | | NFNetF6 SAM | 438.4M | 379.75G | 576 | 86.5 | [nfnetf6_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/nfnetf6_imagenet.h5) | 24 | | NFNetF7 | 499.5M | 481.80G | 608 | | | 25 | | ECA_NFNetL0 | 24.14M | 7.12G | 288 | 82.58 | [eca_nfnetl0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/eca_nfnetl0_imagenet.h5) | 26 | | ECA_NFNetL1 | 41.41M | 14.93G | 320 | 84.01 | [eca_nfnetl1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/eca_nfnetl1_imagenet.h5) | 27 | | ECA_NFNetL2 | 56.72M | 30.12G | 384 | 84.70 | 
[eca_nfnetl2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/nfnets/eca_nfnetl2_imagenet.h5) | 28 | | ECA_NFNetL3 | 72.04M | 52.73G | 448 | | | 29 | ## Usage 30 | ```py 31 | from keras_cv_attention_models import nfnets 32 | 33 | # Will download and load pretrained imagenet weights. 34 | mm = nfnets.NFNetF0(pretrained="imagenet") 35 | 36 | # Run prediction 37 | import tensorflow as tf 38 | from tensorflow import keras 39 | from skimage.data import chelsea 40 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 41 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 42 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 43 | # [('n02124075', 'Egyptian_cat', 0.9195376), ('n02123159', 'tiger_cat', 0.021603014), ...] 44 | ``` 45 | **Use dynamic input resolution** 46 | ```py 47 | from keras_cv_attention_models import nfnets 48 | mm = nfnets.NFNetF1(input_shape=(None, None, 3), num_classes=0, pretrained="imagenet") 49 | 50 | print(mm(np.ones([1, 320, 320, 3])).shape) 51 | # (1, 10, 10, 3072) 52 | print(mm(np.ones([1, 512, 512, 3])).shape) 53 | # (1, 16, 16, 3072) 54 | 55 | mm.save("nfnetf1_imagenet_dynamic_notop.h5") 56 | ``` 57 | *** 58 | -------------------------------------------------------------------------------- /keras_cv_attention_models/inceptionnext/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras InceptionNeXt___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github sail-sg/inceptionnext](https://github.com/sail-sg/inceptionnext). Paper [PDF 2303.16900 InceptionNeXt: When Inception Meets ConvNeXt](https://arxiv.org/pdf/2303.16900.pdf). 6 | - Model weights ported from official publication. 7 | *** 8 | 9 | ## Models 10 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 11 | | ------------------ | ------ | ------ | ----- | -------- | -------- | 12 | | InceptionNeXtTiny | 28.05M | 4.21G | 224 | 82.3 | [inceptionnext_tiny_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_tiny_imagenet.h5) | 13 | | InceptionNeXtSmall | 49.37M | 8.39G | 224 | 83.5 | [inceptionnext_small_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_small_imagenet.h5) | 14 | | InceptionNeXtBase | 86.67M | 14.88G | 224 | 84.0 | [inceptionnext_base_224_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_base_224_imagenet.h5) | 15 | | | 86.67M | 43.73G | 384 | 85.2 | [inceptionnext_base_384_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/inceptionnext/inceptionnext_base_384_imagenet.h5) | 16 | 17 | ## Usage 18 | ```py 19 | from keras_cv_attention_models import inceptionnext 20 | 21 | # Will download and load pretrained imagenet weights. 22 | model = inceptionnext.InceptionNeXtTiny(pretrained="imagenet") 23 | 24 | # Run prediction 25 | from skimage.data import chelsea # Chelsea the cat 26 | preds = model(model.preprocess_input(chelsea())) 27 | print(model.decode_predictions(preds)) 28 | # [('n02124075', 'Egyptian_cat', 0.8221698), ('n02123159', 'tiger_cat', 0.019049658), ...] 29 | ``` 30 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 
31 | ```py 32 | from keras_cv_attention_models import inceptionnext 33 | model = inceptionnext.InceptionNeXtTiny(input_shape=(None, None, 3), num_classes=0) 34 | # >>>> Load pretrained from: ~/.keras/models/inceptionnext_tiny_imagenet.h5 35 | print(model.output_shape) 36 | # (None, None, None, 768) 37 | 38 | print(model(np.ones([1, 223, 123, 3])).shape) 39 | # (1, 6, 3, 768) 40 | print(model(np.ones([1, 32, 526, 3])).shape) 41 | # (1, 1, 16, 768) 42 | ``` 43 | **Using PyTorch backend** by set `KECAM_BACKEND='torch'` environment variable. 44 | ```py 45 | os.environ['KECAM_BACKEND'] = 'torch' 46 | 47 | from keras_cv_attention_models import inceptionnext 48 | model = inceptionnext.InceptionNeXtTiny(input_shape=(None, None, 3), num_classes=0) 49 | # >>>> Using PyTorch backend 50 | # >>>> Aligned input_shape: [3, None, None] 51 | # >>>> Load pretrained from: ~/.keras/models/inceptionnext_tiny_imagenet.h5 52 | print(model.output_shape) 53 | # (None, 768, None, None) 54 | 55 | import torch 56 | print(model(torch.ones([1, 3, 223, 123])).shape) 57 | # (1, 768, 6, 3 ) 58 | print(model(torch.ones([1, 3, 32, 526])).shape) 59 | # (1, 768, 1, 16) 60 | ``` 61 | ## Verification with PyTorch version 62 | ```py 63 | """ PyTorch inceptionnext_tiny """ 64 | sys.path.append('../inceptionnext/') 65 | sys.path.append('../pytorch-image-models/') # Needs timm 66 | import torch 67 | from models import inceptionnext as inceptionnext_torch 68 | 69 | torch_model = inceptionnext_torch.inceptionnext_tiny(pretrained=True) 70 | _ = torch_model.eval() 71 | 72 | """ Keras InceptionNeXtTiny """ 73 | from keras_cv_attention_models import inceptionnext 74 | mm = inceptionnext.InceptionNeXtTiny(pretrained="imagenet", classifier_activation=None) 75 | 76 | """ Verification """ 77 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 78 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 79 | keras_out = mm(inputs).numpy() 80 | print(f"{np.allclose(torch_out, keras_out, atol=5e-5) = }") 81 | # np.allclose(torch_out, keras_out, atol=5e-5) = True 82 | ``` 83 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fastvit/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.fastvit.fastvit import ( 2 | FastViT, 3 | FastViT_T8, 4 | FastViT_T12, 5 | FastViT_S12, 6 | FastViT_SA12, 7 | FastViT_SA24, 8 | FastViT_SA36, 9 | FastViT_MA36, 10 | # switch_to_deploy, 11 | ) 12 | 13 | __head_doc__ = """ 14 | Keras implementation of [Github NVlabs/FasterViT](https://github.com/NVlabs/FasterViT). 15 | Paper [PDF 2306.06189 FasterViT: Fast Vision Transformers with Hierarchical Attention](https://arxiv.org/pdf/2306.06189.pdf). 16 | """ 17 | 18 | __tail_doc__ = """ block_types: block types for each stack, 19 | - `conv` or any `c` / `C` starts word, means `rep_conv_block` block. 20 | - `transfrom` or any not `c` / `C` starts word, means `multi_head_self_attention` block. 21 | value could be in format like `"cctt"` or `"CCTT"` or `["conv", "conv", "transfrom", "transform"]`. 22 | `["conv", "conv", "conv", "conv"]` for SA models, all conv for others. 23 | layer_scale: layer scale init value, [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239). 24 | Default 1e-6 for SA models, 1e-5 for others. 25 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 26 | deploy: boolean value if build a fused model. 
**Evaluation only, not good for training**. 27 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 28 | activation: activation used in whole model, default `hard_swish`. 29 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 30 | Can be a constant value like `0.2`, 31 | or a tuple value like `(0, 0.2)` indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 32 | A higher value means a higher probability will drop the deep branch. 33 | or `0` to disable (default). 34 | dropout: top dropout rate if top layers is included. Default 0. 35 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 36 | Set `classifier_activation=None` to return the logits of the "top" layer. 37 | Default is `None`. 38 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 39 | Will try to download and load pre-trained model weights if not None. 40 | **kwargs: other parameters if available. 41 | 42 | Returns: 43 | A `keras.Model` instance. 44 | """ 45 | 46 | FastViT.__doc__ = __head_doc__ + """ 47 | Args: 48 | num_blocks: number of block for each stack. 49 | out_channels: output channels for each stack. 50 | stem_width: channel dimension output for stem block, default -1 for using out_channels[0]. 51 | mlp_ratio: int value for mlp_ratio for each stack. 52 | model_name: string, model name. 53 | """ + __tail_doc__ + """ 54 | Model architectures: 55 | | Model | Params | FLOPs | Input | Top1 Acc | 56 | | ------------ | ------ | ----- | ----- | -------- | 57 | | FastViT_T8 | 4.03M | 0.65G | 256 | 76.2 | 58 | | - distill | 4.03M | 0.65G | 256 | 77.2 | 59 | | FastViT_T12 | 7.55M | 1.34G | 256 | 79.3 | 60 | | - distill | 7.55M | 1.34G | 256 | 80.3 | 61 | | FastViT_S12 | 9.47M | 1.74G | 256 | 79.9 | 62 | | - distill | 9.47M | 1.74G | 256 | 81.1 | 63 | | FastViT_SA12 | 11.58M | 1.88G | 256 | 80.9 | 64 | | - distill | 11.58M | 1.88G | 256 | 81.9 | 65 | | FastViT_SA24 | 21.55M | 3.66G | 256 | 82.7 | 66 | | - distill | 21.55M | 3.66G | 256 | 83.4 | 67 | | FastViT_SA36 | 31.53M | 5.44G | 256 | 83.6 | 68 | | - distill | 31.53M | 5.44G | 256 | 84.2 | 69 | | FastViT_MA36 | 44.07M | 7.64G | 256 | 83.9 | 70 | | - distill | 44.07M | 7.64G | 256 | 84.6 | 71 | """ 72 | 73 | FastViT_T8.__doc__ = __head_doc__ + """ 74 | Args: 75 | """ + __tail_doc__ 76 | 77 | FastViT_T12.__doc__ = FastViT_T8.__doc__ 78 | FastViT_S12.__doc__ = FastViT_T8.__doc__ 79 | FastViT_SA12.__doc__ = FastViT_T8.__doc__ 80 | FastViT_SA24.__doc__ = FastViT_T8.__doc__ 81 | FastViT_SA36.__doc__ = FastViT_T8.__doc__ 82 | FastViT_MA36.__doc__ = FastViT_T8.__doc__ 83 | -------------------------------------------------------------------------------- /keras_cv_attention_models/aotnet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras AotNet___ 2 | *** 3 | 4 | ## Summary 5 | - `AotNet` is just a `ResNet` / `ResNetV2` like framework, that set parameters like `attn_types` and `attn_params` and others, which is used to apply different types attention layers. Works like `byoanet` / `byobnet` from `timm`. 6 | - Default parameters set is a typical `ResNet` architecture with `Conv2D use_bias=False` and `padding` like `PyTorch`. 7 | - `AotNet` means `Attention Over Template network`! Honestly, just a name after `BotNet` and `CotNet`... 
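Because the default parameter set is a typical `ResNet` architecture, the simplest usage is building a template model with no attention arguments at all. A minimal sketch (the `input_shape` / `num_classes` values below are only illustrative, and no pretrained weights are implied):

```py
from keras_cv_attention_models import aotnet

# No attn_types / attn_params given -> plain ResNet50-like backbone,
# with Conv2D use_bias=False and PyTorch-like padding as described above.
model = aotnet.AotNet50(input_shape=(224, 224, 3), num_classes=1000)
model.summary()
```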
8 | ***
9 | 
10 | ## Usage
11 | - **attn_types** is a `string` or `list` indicating the attention layer type for each stack. Each element can also be a `string` or `list` indicating the attention layer type for each block.
12 |   - `"bot"`: `mhsa_with_relative_position_embedding` from `botnet`.
13 |   - `"cot"`: `cot_attention` from `cotnet`.
14 |   - `"halo"`: `halo_attention` from `halonet`.
15 |   - `"outlook"`: `outlook_attention` from `volo`.
16 |   - `"sa"`: `split_attention_conv2d` from `resnest`.
17 |   - `None`: `Conv2D`. Can add `groups` like `ResNeXt` or add `se` and `eca` attention.
18 | - **attn_params**: like `attn_types`, a dict or list; each element in the list can also be a dict or list. Indicates the specific attention layer parameters for the corresponding `attn_types`.
19 | - **se_ratio**: value in `(0, 1)`, where `0` means not using `se_module`. Should be a `number` or `list` indicating `se_ratio` for each stack. Each element can also be a `number` or `list` indicating `se_ratio` for each block.
20 | - **use_eca**: boolean value indicating whether to use `eca` attention. Can also be a list like `se_ratio`.
21 | - **groups**: `groups` for the `Conv2D` layer if the relative `attn_types` is `None`. `ResNeXt` like architecture. Note it's NOT the `group_size`. Default value `1` means not using group; see the grouped-convolution sketch at the end of this section.
22 | - **Definition of `BotNet26T`**
23 |   ```py
24 |   from keras_cv_attention_models import aotnet
25 |   model = aotnet.AotNet(
26 |       num_blocks=[2, 2, 2, 2],
27 |       attn_types=[None, None, [None, "bot"], "bot"],
28 |       attn_params={"num_heads": 4, "out_weight": False},
29 |       stem_type="tiered",
30 |       input_shape=(256, 256, 3),
31 |       model_name="botnet26t",
32 |   )
33 |   model.summary()
34 |   ```
35 | - **Definition of `CotNet101`**
36 |   ```py
37 |   from keras_cv_attention_models import aotnet
38 |   model = aotnet.AotNet101(
39 |       attn_types="cot",
40 |       bn_after_attn=False,
41 |       shortcut_type="avg",
42 |       model_name="cotnet101",
43 |   )
44 |   model.summary()
45 |   ```
46 | - **Definition of `HaloNet50T`**
47 |   ```py
48 |   from keras_cv_attention_models import aotnet
49 |   attn_params = [
50 |       None,
51 |       [None, None, None, {"block_size": 8, "halo_size": 3, "num_heads": 4, "out_weight": False}],
52 |       [None, {"block_size": 8, "halo_size": 3, "num_heads": 8, "out_weight": False}] * 3,
53 |       [None, {"block_size": 8, "halo_size": 3, "num_heads": 8, "out_weight": False}, None],
54 |   ]
55 |   model = aotnet.AotNet50(
56 |       attn_types=[None, [None, None, None, "halo"], [None, "halo"] * 3, [None, "halo", None]],
57 |       attn_params=attn_params,
58 |       stem_type="tiered",
59 |       input_shape=(256, 256, 3),
60 |       model_name="halonet50t",
61 |   )
62 |   model.summary()
63 |   ```
64 | - **Definition of `ResNest50`**
65 |   ```py
66 |   from keras_cv_attention_models import aotnet
67 |   model = aotnet.AotNet50(
68 |       stem_type="deep",
69 |       shortcut_type="avg",
70 |       attn_types="sa",
71 |       bn_after_attn=False,
72 |       model_name="resnest50",
73 |   )
74 |   model.summary()
75 |   ```
76 | - **Mixing se and outlook and halo and bot and cot**, 21M parameters
77 |   ```py
78 |   # 50 is just a picked number that is larger than the relative `num_block`
79 |   model = aotnet.AotNet50V2(
80 |       attn_types=[None, "outlook", ["bot", "halo"] * 50, "cot"],
81 |       se_ratio=[0.25, 0, 0, 0],
82 |       stem_type="deep",
83 |       strides=1,
84 |   )
85 |   model.summary()
86 |   ```
87 | - `AotNet50V2` / `AotNet101V2` / `AotNet152V2` / `AotNet200V2` is the `ResNetV2` like template; see the sketch below.
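The plain-convolution switches described above (`groups` / `se_ratio` / `use_eca`) have no dedicated example yet, so here is a minimal hedged sketch on the `ResNetV2` like template. The exact values (`groups=32`, SE on the last two stacks) and the `model_name` are illustrative assumptions, not a published configuration:

```py
from keras_cv_attention_models import aotnet

# Grouped Conv2D blocks (ResNeXt-like) on the ResNetV2 template,
# with SE enabled for the last two stacks only.
model = aotnet.AotNet50V2(
    groups=32,
    se_ratio=[0, 0, 0.25, 0.25],
    model_name="aotnet50v2_g32_se",
)
model.summary()
```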
88 | *** 89 | -------------------------------------------------------------------------------- /keras_cv_attention_models/model_surgery/README.md: -------------------------------------------------------------------------------- 1 | ## Model Surgery 2 | *** 3 | 4 | ## Summary 5 | - Functions used to change model parameters after built. 6 | - `SAMModel`: SAMModel definition. 7 | - `add_l2_regularizer_2_model`: add `l2` weight decay to `Dense` / `Conv2D` / `DepthwiseConv2D` / `SeparableConv2D` layers. 8 | - `convert_to_mixed_float16`: convert `float32` model to `mixed_float16`. 9 | - `convert_mixed_float16_to_float32`: convert `mixed_float16` model to `float32`. 10 | - `convert_groups_conv2d_2_split_conv2d`: convert `Conv2D groups != 1` to `SplitConv2D` using `split -> conv -> concat`. 11 | - `convert_gelu_and_extract_patches_for_tflite`: convert model `gelu` activation to `gelu approximate=True`, and `tf.image.extract_patches` to a `Conv2D` version. 12 | - `convert_to_fused_conv_bn_model`: fuse convolution and batchnorm layers for inference. 13 | - `prepare_for_tflite`: a combination of `convert_groups_conv2d_2_split_conv2d` and `convert_gelu_and_extract_patches_for_tflite`. 14 | - `replace_ReLU`: replace all `ReLU` with other activations, default target is `PReLU`. 15 | - `replace_add_with_stochastic_depth`: replace all `Add` layers with `StochasticDepth`. 16 | - `replace_stochastic_depth_with_add`: replace all `StochasticDepth` layers with `add` + `multiply`. 17 | ## Usage 18 | - **Convert add layers to stochastic depth** 19 | ```py 20 | from keras_cv_attention_models import model_surgery 21 | mm = keras.applications.ResNet50() 22 | mm = model_surgery.replace_add_with_drop_connect(mm, drop_rate=(0, 0.2)) 23 | print(model_surgery.get_actual_drop_connect_rates(mm)) 24 | # [0.0, 0.0125, 0.025, 0.0375, 0.05, 0.0625, 0.075, 0.0875, 0.1, 0.1125, 0.125, 0.1375, 0.15, 0.1625, 0.175, 0.1875] 25 | ``` 26 | - **Convert model between float16 and float32** 27 | ```py 28 | from keras_cv_attention_models import model_surgery 29 | mm = keras.applications.ResNet50() 30 | print(mm.layers[-1].compute_dtype) 31 | # float32 32 | mm = model_surgery.convert_to_mixed_float16(mm) 33 | print(mm.layers[-1].compute_dtype) 34 | # float16 35 | mm = model_surgery.convert_mixed_float16_to_float32(mm) 36 | print(mm.layers[-1].compute_dtype) 37 | # float32 38 | ``` 39 | - **Convert groups conv2d to split conv2d for TFLite usage** 40 | ```py 41 | from keras_cv_attention_models import model_surgery, regnet 42 | mm = regnet.RegNetZD32() 43 | print([ii.groups for ii in mm.layers if isinstance(ii, keras.layers.Conv2D) and ii.groups != 1]) 44 | # [8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 48, 48] 45 | mm = model_surgery.convert_groups_conv2d_2_split_conv2d(mm) 46 | print([ii.groups for ii in mm.layers if isinstance(ii, model_surgery.model_surgery.SplitConv2D)]) 47 | # [8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 48, 48] 48 | 49 | converter = tf.lite.TFLiteConverter.from_keras_model(mm) 50 | open(mm.name + ".tflite", "wb").write(converter.convert()) 51 | ``` 52 | ![](https://user-images.githubusercontent.com/5744524/147234593-0323b99b-7dcd-4b75-b8ed-94060346aabb.png) 53 | - **Change model input_shape after built** 54 | ```py 55 | from keras_cv_attention_models import model_surgery 56 | mm = keras.applications.ResNet50() 57 | print(mm.input_shape) 58 | # (None, 224, 224, 3) 59 | mm = model_surgery.change_model_input_shape(mm, (320, 320)) 60 | 
print(mm.input_shape) 61 | # (None, 320, 320, 3) 62 | ``` 63 | - **Replace ReLU activation layers** 64 | ```py 65 | from keras_cv_attention_models import model_surgery 66 | mm = keras.applications.ResNet50() 67 | print(mm.layers[-3].activation.__name__) 68 | # relu 69 | mm = model_surgery.replace_ReLU(mm, "PReLU") 70 | print(mm.layers[-3].__class__.__name__) 71 | # PReLU 72 | ``` 73 | - **Fuse convolution and batchnorm layers for inference** 74 | ```py 75 | from keras_cv_attention_models import model_surgery 76 | mm = keras.applications.ResNet50() 77 | mm.summary() 78 | # Trainable params: 25,583,592 79 | mm = model_surgery.convert_to_fused_conv_bn_model(mm) 80 | mm.summary() 81 | # Trainable params: 25,530,472 82 | ``` 83 | *** 84 | -------------------------------------------------------------------------------- /keras_cv_attention_models/resnest/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models.resnest.resnest import ResNest, ResNest50, ResNest101, ResNest200, ResNest269, rsoftmax, split_attention_conv2d 2 | 3 | 4 | __head_doc__ = """ 5 | Keras implementation of [ResNeSt](https://github.com/zhanghang1989/ResNeSt). 6 | Paper [PDF 2004.08955 ResNeSt: Split-Attention Networks](https://arxiv.org/pdf/2004.08955.pdf). 7 | """ 8 | 9 | __tail_doc__ = """ groups: controls number of split groups in `split_attention_conv2d`. 10 | input_shape: it should have exactly 3 inputs channels, like `(224, 224, 3)`. 11 | Set `(None, None, 3)` for dynamic input resolution. 12 | num_classes: number of classes to classify images into. Set `0` to exclude top layers. 13 | activation: activation used in whole model, default `relu`. 14 | drop_connect_rate: is used for [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382). 15 | Can be value like `0.2`, indicates the drop probability linearly changes from `0 --> 0.2` for `top --> bottom` layers. 16 | A higher value means a higher probability will drop the deep branch. 17 | or `0` to disable (default). 18 | classifier_activation: A `str` or callable. The activation function to use on the "top" layer if `num_classes > 0`. 19 | Set `classifier_activation=None` to return the logits of the "top" layer. 20 | Default is `softmax`. 21 | pretrained: one of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). 22 | Will try to download and load pre-trained model weights if not None. 23 | **kwargs: other parameters from `AotNet` if not conflict. 24 | 25 | Returns: 26 | A `keras.Model` instance. 27 | """ 28 | 29 | ResNest.__doc__ = __head_doc__ + """ 30 | Args: 31 | num_blocks: number of blocks in each stack. 32 | model_name: string, model name. 33 | """ + __tail_doc__ + """ 34 | Model architectures: 35 | | Model | Params | FLOPs | Input | Top1 Acc | 36 | | -------------- | ------ | ------ | ----- | -------- | 37 | | resnest50 | 28M | 5.38G | 224 | 81.03 | 38 | | resnest101 | 49M | 13.33G | 256 | 82.83 | 39 | | resnest200 | 71M | 35.55G | 320 | 83.84 | 40 | | resnest269 | 111M | 77.42G | 416 | 84.54 | 41 | """ 42 | 43 | ResNest50.__doc__ = __head_doc__ + """ 44 | Args: 45 | """ + __tail_doc__ 46 | 47 | ResNest101.__doc__ = ResNest50.__doc__ 48 | ResNest200.__doc__ = ResNest50.__doc__ 49 | ResNest269.__doc__ = ResNest50.__doc__ 50 | 51 | split_attention_conv2d.__doc__ = __head_doc__ + """ 52 | Split-Attention. Callable function, NOT defined as a layer. 53 | Generating `attention_scores` using grouped `Conv2D`. 54 | 55 | Args: 56 | inputs: input tensor. 
57 | filters: output dimension. 58 | kernel_size: kernel size for grouped Conv2D. 59 | strides: strides for grouped Conv2D. 60 | groups: number of splitted groups. 61 | activation: activation used after `BatchNormalization`. 62 | 63 | Examples: 64 | 65 | >>> from keras_cv_attention_models import attention_layers 66 | >>> inputs = keras.layers.Input([28, 28, 192]) 67 | >>> nn = attention_layers.split_attention_conv2d(inputs, 384) 68 | >>> dd = keras.models.Model(inputs, nn) 69 | >>> dd.summary() 70 | >>> dd.output_shape 71 | (None, 28, 28, 384) 72 | 73 | >>> {ii.name: ii.shape for ii in dd.weights} 74 | {'1_g1_conv/kernel:0': TensorShape([3, 3, 96, 384]), 75 | '1_g2_conv/kernel:0': TensorShape([3, 3, 96, 384]), 76 | '1_bn/gamma:0': TensorShape([768]), 77 | '1_bn/beta:0': TensorShape([768]), 78 | '1_bn/moving_mean:0': TensorShape([768]), 79 | '1_bn/moving_variance:0': TensorShape([768]), 80 | '2_conv/kernel:0': TensorShape([1, 1, 384, 96]), 81 | '2_conv/bias:0': TensorShape([96]), 82 | '2_bn/gamma:0': TensorShape([96]), 83 | '2_bn/beta:0': TensorShape([96]), 84 | '2_bn/moving_mean:0': TensorShape([96]), 85 | '2_bn/moving_variance:0': TensorShape([96]), 86 | '3_conv/kernel:0': TensorShape([1, 1, 96, 768]), 87 | '3_conv/bias:0': TensorShape([768])} 88 | """ 89 | 90 | rsoftmax.__doc__ = __head_doc__ + """ 91 | Perform group split softmax 92 | 93 | input: `[batch, 1, 1, channel]`. 94 | output: `[batch, 1, 1, channel]`. 95 | 96 | Args: 97 | inputs: Input tensor. 98 | groups: groups to split on channel dimension. 99 | """ 100 | -------------------------------------------------------------------------------- /keras_cv_attention_models/edgenext/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras EdgeNeXt___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github mmaaz60/EdgeNeXt](https://github.com/mmaaz60/EdgeNeXt). Paper [PDF 2206.10589 EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications](https://arxiv.org/pdf/2206.10589.pdf). 6 | - Model weights reloaded from official publication. 7 | - Related usi distillation paper [PDF 2204.03475 Solving ImageNet: a Unified Scheme for Training any Backbone to Top Results](https://arxiv.org/pdf/2204.03475.pdf). 
8 | *** 9 | 10 | ## Models 11 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 12 | | ----------------- | ------ | ------ | ----- | -------- | -------- | 13 | | EdgeNeXt_XX_Small | 1.33M | 266M | 256 | 71.23 | [edgenext_xx_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_xx_small_256_imagenet.h5) | 14 | | EdgeNeXt_X_Small | 2.34M | 547M | 256 | 74.96 | [edgenext_x_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_x_small_256_imagenet.h5) | 15 | | EdgeNeXt_Small | 5.59M | 1.27G | 256 | 79.41 | [edgenext_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_small_256_imagenet.h5) | 16 | | - usi | 5.59M | 1.27G | 256 | 81.07 | [edgenext_small_256_usi.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_small_256_usi.h5) | 17 | | EdgeNeXt_Base | 18.5M | 3.86G | 256 | 82.47 | [edgenext_small_256_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_base_256_imagenet.h5) | 18 | | - usi | 18.5M | 3.86G | 256 | 83.31 | [edgenext_small_256_usi.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_base_256_usi.h5) | 19 | | - 21k_ft1k | 18.5M | 3.86G | 256 | 83.68 | [edgenext_small_256_usi.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/edgenext/edgenext_base_256_imagenet-ft1k.h5) | 20 | ## Usage 21 | ```py 22 | from keras_cv_attention_models import edgenext 23 | 24 | # Will download and load pretrained imagenet weights. 25 | mm = edgenext.EdgeNeXt_XX_Small(pretrained="imagenet") 26 | 27 | # Run prediction 28 | import tensorflow as tf 29 | from tensorflow import keras 30 | from skimage.data import chelsea 31 | imm = keras.applications.imagenet_utils.preprocess_input(chelsea(), mode='torch') # Chelsea the cat 32 | pred = mm(tf.expand_dims(tf.image.resize(imm, mm.input_shape[1:3]), 0)).numpy() 33 | print(keras.applications.imagenet_utils.decode_predictions(pred)[0]) 34 | # [('n02124075', 'Egyptian_cat', 0.60692847), ('n02123045', 'tabby', 0.21328166), ...] 35 | ``` 36 | **Change input resolution** 37 | ```py 38 | from keras_cv_attention_models import edgenext 39 | mm = edgenext.EdgeNeXt_Small(input_shape=(174, 269, 3), pretrained="usi") 40 | # >>>> Load pretrained from: ~/.keras/models/edgenext_small_256_usi.h5 41 | 42 | # Run prediction 43 | from skimage.data import chelsea 44 | preds = mm(mm.preprocess_input(chelsea())) 45 | print(mm.decode_predictions(preds)) 46 | # [[('n02124075', 'Egyptian_cat', 0.8444098), ('n02123159', 'tiger_cat', 0.061309356), ...] 
47 | ``` 48 | ## Verification with PyTorch version 49 | ```py 50 | """ PyTorch edgenext_small """ 51 | sys.path.append('../EdgeNeXt/') 52 | sys.path.append('../pytorch-image-models/') # Needs timm 53 | import torch 54 | from models import model 55 | torch_model = model.edgenext_small(classifier_dropout=0) 56 | _ = torch_model.eval() 57 | ss = torch.load('edgenext_small_usi.pth', map_location=torch.device('cpu')) 58 | torch_model.load_state_dict(ss.get('state_dict', ss.get('model', ss))) 59 | 60 | """ Keras EdgeNeXt_Small """ 61 | from keras_cv_attention_models import edgenext 62 | mm = edgenext.EdgeNeXt_Small(pretrained="usi", classifier_activation=None) 63 | 64 | """ Verification """ 65 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 66 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 67 | keras_out = mm(inputs).numpy() 68 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }") 69 | # np.allclose(torch_out, keras_out, atol=1e-5) = True 70 | ``` 71 | *** 72 | -------------------------------------------------------------------------------- /tests/test_switch_to_deploy_tf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append(".") 4 | import keras_cv_attention_models # Needs to set TF_USE_LEGACY_KERAS=1 env firstly 5 | 6 | import pytest 7 | import numpy as np 8 | 9 | from keras_cv_attention_models.test_images import cat 10 | 11 | 12 | def test_EfficientFormerL1_use_distillation_switch_to_deploy(): 13 | mm = keras_cv_attention_models.models.EfficientFormerL1(use_distillation=True, classifier_activation=None) 14 | preds = mm(mm.preprocess_input(cat())) 15 | 16 | bb = mm.switch_to_deploy() 17 | preds_deploy = bb(bb.preprocess_input(cat())) 18 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 19 | 20 | 21 | def test_EfficientFormerV2S0_use_distillation_switch_to_deploy(): 22 | mm = keras_cv_attention_models.models.EfficientFormerV2S0(use_distillation=True, classifier_activation=None) 23 | preds = mm(mm.preprocess_input(cat())) 24 | 25 | bb = mm.switch_to_deploy() 26 | preds_deploy = bb(bb.preprocess_input(cat())) 27 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 28 | 29 | 30 | def test_EfficientViT_M0_use_distillation_switch_to_deploy(): 31 | mm = keras_cv_attention_models.models.EfficientViT_M0(use_distillation=True, classifier_activation=None) 32 | preds = mm(mm.preprocess_input(cat())) 33 | 34 | bb = mm.switch_to_deploy() 35 | preds_deploy = bb(bb.preprocess_input(cat())) 36 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 37 | 38 | 39 | def test_FasterViT0_switch_to_deploy(): 40 | mm = keras_cv_attention_models.models.FasterViT0() 41 | preds = mm(mm.preprocess_input(cat())) 42 | 43 | bb = mm.switch_to_deploy() 44 | preds_deploy = bb(bb.preprocess_input(cat())) 45 | assert np.allclose(preds, preds_deploy, atol=1e-5) 46 | 47 | 48 | def test_FastViT_T8_switch_to_deploy(): 49 | mm = keras_cv_attention_models.models.FastViT_T8() 50 | preds = mm(mm.preprocess_input(cat())) 51 | 52 | bb = mm.switch_to_deploy() 53 | preds_deploy = bb(bb.preprocess_input(cat())) 54 | assert np.allclose(preds, preds_deploy, atol=1e-5) 55 | 56 | 57 | def test_FastViT_SA12_switch_to_deploy(): 58 | mm = keras_cv_attention_models.models.FastViT_SA12() 59 | preds = mm(mm.preprocess_input(cat())) 60 | 61 | bb = mm.switch_to_deploy() 62 | preds_deploy = bb(bb.preprocess_input(cat())) 63 | assert 
np.allclose(preds, preds_deploy, atol=1e-5) 64 | 65 | 66 | def test_LeViT128S_switch_to_deploy(): 67 | mm = keras_cv_attention_models.models.LeViT128S() 68 | preds = mm(mm.preprocess_input(cat())) 69 | 70 | bb = mm.switch_to_deploy() 71 | preds_deploy = bb(bb.preprocess_input(cat())) 72 | assert np.allclose(preds, preds_deploy, atol=1e-5) 73 | 74 | 75 | def test_RepViT_M09_use_distillation_switch_to_deploy(): 76 | mm = keras_cv_attention_models.models.RepViT_M09(use_distillation=True, classifier_activation=None) 77 | preds = mm(mm.preprocess_input(cat())) 78 | 79 | bb = mm.switch_to_deploy() 80 | preds_deploy = bb(bb.preprocess_input(cat())) 81 | assert np.allclose((preds[0] + preds[1]) / 2, preds_deploy, atol=1e-5) 82 | 83 | 84 | def test_RepViT_M1_not_distillation_switch_to_deploy(): 85 | mm = keras_cv_attention_models.models.RepViT_M09(use_distillation=False) 86 | preds = mm(mm.preprocess_input(cat())) 87 | 88 | bb = mm.switch_to_deploy() 89 | preds_deploy = bb(bb.preprocess_input(cat())) 90 | assert np.allclose(preds, preds_deploy, atol=1e-5) 91 | 92 | 93 | def test_SwinTransformerV2Tiny_window8_switch_to_deploy(): 94 | mm = keras_cv_attention_models.models.SwinTransformerV2Tiny_window8() 95 | preds = mm(mm.preprocess_input(cat())) 96 | 97 | bb = mm.switch_to_deploy() 98 | preds_deploy = bb(bb.preprocess_input(cat())) 99 | assert np.allclose(preds, preds_deploy, atol=1e-5) 100 | 101 | 102 | def test_VanillaNet5_switch_to_deploy(): 103 | mm = keras_cv_attention_models.models.VanillaNet5() 104 | preds = mm(mm.preprocess_input(cat())) 105 | 106 | bb = mm.switch_to_deploy() 107 | preds_deploy = bb(bb.preprocess_input(cat())) 108 | assert np.allclose(preds, preds_deploy, atol=1e-5) 109 | 110 | 111 | def test_YOLO_NAS_S_switch_to_deploy(): 112 | mm = keras_cv_attention_models.models.YOLO_NAS_S(use_reparam_conv=True) 113 | preds = mm(mm.preprocess_input(cat())) 114 | 115 | bb = mm.switch_to_deploy() 116 | preds_deploy = bb(bb.preprocess_input(cat())) 117 | assert np.allclose(preds, preds_deploy, atol=1e-3) 118 | -------------------------------------------------------------------------------- /keras_cv_attention_models/fasternet/README.md: -------------------------------------------------------------------------------- 1 | # ___Keras FasterNet___ 2 | *** 3 | 4 | ## Summary 5 | - Keras implementation of [Github JierunChen/FasterNet](https://github.com/JierunChen/FasterNet). Paper [PDF 2303.03667 Run, Don’t Walk: Chasing Higher FLOPS for Faster Neural Networks ](https://arxiv.org/pdf/2303.03667.pdf). 6 | - Model weights ported from official publication. 
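The core operator in the referenced paper is a partial convolution (PConv), which runs a regular k×k convolution over only a fraction of the channels and passes the remaining channels through untouched, cutting FLOPs and memory access (see the architecture figure below). The snippet is an independent minimal sketch of that idea in plain `tf.keras`; the helper name `partial_conv`, the 1/4 split ratio and the layer names are illustrative assumptions, not this repository's implementation.

```py
from tensorflow import keras

def partial_conv(inputs, kernel_size=3, partial_ratio=0.25, name=""):
    # Apply a dense conv to the first `partial_ratio` of the channels, keep the rest untouched.
    conv_channels = int(inputs.shape[-1] * partial_ratio)
    nn = keras.layers.Conv2D(conv_channels, kernel_size, padding="same", use_bias=False, name=name + "pconv")(inputs[..., :conv_channels])
    return keras.layers.Concatenate(axis=-1, name=name + "concat")([nn, inputs[..., conv_channels:]])

inputs = keras.layers.Input([56, 56, 64])
print(partial_conv(inputs, name="block1_").shape)
# (None, 56, 56, 64)
```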
7 | 8 | ![fasternet](https://user-images.githubusercontent.com/5744524/227238562-5ee980ba-84c7-44d0-969d-c472f6e719a4.jpg) 9 | *** 10 | 11 | ## Models 12 | | Model | Params | FLOPs | Input | Top1 Acc | Download | 13 | | ----------- | ------ | ------ | ----- | -------- | -------- | 14 | | FasterNetT0 | 3.9M | 0.34G | 224 | 71.9 | [fasternet_t0_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_t0_imagenet.h5) | 15 | | FasterNetT1 | 7.6M | 0.85G | 224 | 76.2 | [fasternet_t1_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_t1_imagenet.h5) | 16 | | FasterNetT2 | 15.0M | 1.90G | 224 | 78.9 | [fasternet_t2_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_t2_imagenet.h5) | 17 | | FasterNetS | 31.1M | 4.55G | 224 | 81.3 | [fasternet_s_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_s_imagenet.h5) | 18 | | FasterNetM | 53.5M | 8.72G | 224 | 83.0 | [fasternet_m_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_m_imagenet.h5) | 19 | | FasterNetL | 93.4M | 15.49G | 224 | 83.5 | [fasternet_l_imagenet.h5](https://github.com/leondgarse/keras_cv_attention_models/releases/download/fasternet/fasternet_l_imagenet.h5) | 20 | 21 | ## Usage 22 | ```py 23 | from keras_cv_attention_models import fasternet 24 | 25 | # Will download and load pretrained imagenet weights. 26 | model = fasternet.FasterNetT2(pretrained="imagenet") 27 | 28 | # Run prediction 29 | from skimage.data import chelsea # Chelsea the cat 30 | preds = model(model.preprocess_input(chelsea())) 31 | print(model.decode_predictions(preds)) 32 | # [('n02124075', 'Egyptian_cat', 0.76938057), ('n02123159', 'tiger_cat', 0.0810011), ...] 33 | ``` 34 | **Use dynamic input resolution** by set `input_shape=(None, None, 3)`. 35 | ```py 36 | from keras_cv_attention_models import fasternet 37 | model = fasternet.FasterNetT2(input_shape=(None, None, 3), num_classes=0) 38 | # >>>> Load pretrained from: ~/.keras/models/fasternet_t2_imagenet.h5 39 | print(model.output_shape) 40 | # (None, None, None, 768) 41 | 42 | print(model(np.ones([1, 223, 123, 3])).shape) 43 | # (1, 6, 3, 768) 44 | print(model(np.ones([1, 32, 526, 3])).shape) 45 | # (1, 1, 16, 768) 46 | ``` 47 | **Using PyTorch backend** by set `KECAM_BACKEND='torch'` environment variable. 
48 | ```py 49 | os.environ['KECAM_BACKEND'] = 'torch' 50 | 51 | from keras_cv_attention_models import fasternet 52 | model = fasternet.FasterNetT2(input_shape=(None, None, 3), num_classes=0) 53 | # >>>> Using PyTorch backend 54 | # >>>> Aligned input_shape: [3, None, None] 55 | # >>>> Load pretrained from: ~/.keras/models/fasternet_t2_imagenet.h5 56 | print(model.output_shape) 57 | # (None, 768, None, None) 58 | 59 | import torch 60 | print(model(torch.ones([1, 3, 223, 123])).shape) 61 | # (1, 768, 6, 3 ) 62 | print(model(torch.ones([1, 3, 32, 526])).shape) 63 | # (1, 768, 1, 16) 64 | ``` 65 | ## Verification with PyTorch version 66 | ```py 67 | """ PyTorch fasternet_t2 """ 68 | sys.path.append('../FasterNet/') 69 | sys.path.append('../pytorch-image-models/') # Needs timm 70 | import torch 71 | from models import fasternet as fasternet_torch 72 | 73 | torch_model = fasternet_torch.FasterNet() # Default parameters is for T2 74 | ss = torch.load('fasternet_t2-epoch.289-val_acc1.78.8860.pth', map_location=torch.device('cpu')) 75 | torch_model.load_state_dict(ss) 76 | _ = torch_model.eval() 77 | 78 | """ Keras FasterNetT2 """ 79 | from keras_cv_attention_models import fasternet 80 | mm = fasternet.FasterNetT2(pretrained="imagenet", classifier_activation=None) 81 | 82 | """ Verification """ 83 | inputs = np.random.uniform(size=(1, *mm.input_shape[1:3], 3)).astype("float32") 84 | torch_out = torch_model(torch.from_numpy(inputs).permute(0, 3, 1, 2)).detach().numpy() 85 | keras_out = mm(inputs).numpy() 86 | print(f"{np.allclose(torch_out, keras_out, atol=1e-5) = }") 87 | # np.allclose(torch_out, keras_out, atol=1e-5) = True 88 | ``` 89 | -------------------------------------------------------------------------------- /keras_cv_attention_models/__init__.py: -------------------------------------------------------------------------------- 1 | from keras_cv_attention_models import backend 2 | 3 | from keras_cv_attention_models.version import __version__ 4 | from keras_cv_attention_models import plot_func 5 | from keras_cv_attention_models import attention_layers 6 | from keras_cv_attention_models import beit 7 | from keras_cv_attention_models.beit import flexivit 8 | from keras_cv_attention_models.beit import eva 9 | from keras_cv_attention_models.beit import eva02 10 | from keras_cv_attention_models.beit import dinov2 11 | from keras_cv_attention_models.beit import meta_transformer 12 | from keras_cv_attention_models.beit import vit 13 | from keras_cv_attention_models import botnet 14 | from keras_cv_attention_models import caformer 15 | from keras_cv_attention_models import coat 16 | from keras_cv_attention_models import coatnet 17 | from keras_cv_attention_models import convnext 18 | from keras_cv_attention_models import cotnet 19 | from keras_cv_attention_models import cmt 20 | from keras_cv_attention_models import cspnext 21 | from keras_cv_attention_models import davit 22 | from keras_cv_attention_models import efficientnet 23 | from keras_cv_attention_models import edgenext 24 | from keras_cv_attention_models import efficientformer 25 | from keras_cv_attention_models import fasternet 26 | from keras_cv_attention_models import gcvit 27 | from keras_cv_attention_models import ghostnet 28 | from keras_cv_attention_models import ghostnet as ghostnetv2 # alias name 29 | from keras_cv_attention_models import gpt2 30 | from keras_cv_attention_models import llama2 31 | from keras_cv_attention_models import halonet 32 | from keras_cv_attention_models import hiera 33 | from keras_cv_attention_models 
import hornet 34 | from keras_cv_attention_models import iformer 35 | from keras_cv_attention_models import levit 36 | from keras_cv_attention_models import mlp_family 37 | from keras_cv_attention_models.mlp_family import mlp_mixer 38 | from keras_cv_attention_models.mlp_family import res_mlp 39 | from keras_cv_attention_models.mlp_family import gated_mlp 40 | from keras_cv_attention_models.mlp_family import wave_mlp 41 | from keras_cv_attention_models.mobilenetv3_family import fbnetv3 42 | from keras_cv_attention_models.mobilenetv3_family import lcnet 43 | from keras_cv_attention_models.mobilenetv3_family import mobilenetv3 44 | from keras_cv_attention_models.mobilenetv3_family import tinynet 45 | from keras_cv_attention_models import efficientvit 46 | from keras_cv_attention_models.efficientvit import efficientvit_b 47 | from keras_cv_attention_models.efficientvit import efficientvit_m 48 | from keras_cv_attention_models import inceptionnext 49 | from keras_cv_attention_models import maxvit 50 | from keras_cv_attention_models import mobilevit 51 | from keras_cv_attention_models import moganet 52 | from keras_cv_attention_models import nat 53 | from keras_cv_attention_models.nat import dinat 54 | from keras_cv_attention_models import pvt 55 | from keras_cv_attention_models import repvit 56 | from keras_cv_attention_models import tinyvit 57 | from keras_cv_attention_models import resnest 58 | from keras_cv_attention_models import resnet_family 59 | from keras_cv_attention_models.resnet_family import resnext 60 | from keras_cv_attention_models.resnet_family import resnet_quad 61 | from keras_cv_attention_models.resnet_family import resnet_deep 62 | from keras_cv_attention_models.resnet_family import regnet 63 | from keras_cv_attention_models import gpvit 64 | from keras_cv_attention_models import swin_transformer_v2 65 | from keras_cv_attention_models import uniformer 66 | from keras_cv_attention_models import fastervit 67 | from keras_cv_attention_models import fastvit 68 | from keras_cv_attention_models import vanillanet 69 | from keras_cv_attention_models import download_and_load 70 | from keras_cv_attention_models import imagenet 71 | from keras_cv_attention_models import test_images 72 | from keras_cv_attention_models import model_surgery 73 | from keras_cv_attention_models import efficientdet 74 | from keras_cv_attention_models import yolox 75 | from keras_cv_attention_models import yolor 76 | from keras_cv_attention_models import yolov7 77 | from keras_cv_attention_models import yolov8 78 | from keras_cv_attention_models.yolov8 import yolo_nas 79 | from keras_cv_attention_models import coco 80 | from keras_cv_attention_models import clip 81 | from keras_cv_attention_models.clip import tokenizer 82 | from keras_cv_attention_models import stable_diffusion 83 | from keras_cv_attention_models import segment_anything 84 | from keras_cv_attention_models import segment_anything as sam # Alias name 85 | 86 | if backend.is_tensorflow_backend: 87 | from keras_cv_attention_models import nfnets 88 | from keras_cv_attention_models import volo 89 | from keras_cv_attention_models import visualizing 90 | --------------------------------------------------------------------------------
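The trailing `if backend.is_tensorflow_backend:` block above means `nfnets`, `volo` and `visualizing` are only attached when running on the TensorFlow backend. A minimal sketch of how the backend is selected, mirroring the `KECAM_BACKEND` usage shown in the READMEs above; the printed values are what the conditional import implies, not captured output:

```py
import os
os.environ["KECAM_BACKEND"] = "torch"  # must be set before the first package import

import keras_cv_attention_models as kecam
print(kecam.backend.is_torch_backend)  # True
print(hasattr(kecam, "nfnets"))  # False, TF-only sub-modules like nfnets / volo / visualizing are skipped
```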