├── .vscode └── launch.json ├── README.md ├── attention ├── A2Attention.py ├── AFT.py ├── BAM.py ├── CBAM.py ├── CoAtNet.py ├── CoTAttention.py ├── CoordAttention.py ├── DANet.py ├── ECAAttention.py ├── EMSA.py ├── ExternalAttention.py ├── HaloAttention.py ├── MUSEAttention.py ├── MobileViTAttention.py ├── OutlookAttention.py ├── PSA.py ├── ParNetAttention.py ├── PolarizedSelfAttention.py ├── ResidualAttention.py ├── S2Attention.py ├── SEAttention.py ├── SGE.py ├── SKAttention.py ├── SelfAttention.py ├── ShuffleAttention.py ├── SimplifiedSelfAttention.py ├── TripletAttention.py ├── UFOAttention.py ├── ViP.py ├── gfnet.py └── img │ ├── A2.png │ ├── AFT.jpg │ ├── BAM.png │ ├── CBAM1.png │ ├── CBAM2.png │ ├── CoAtNet.png │ ├── CoT.png │ ├── CondConv.png │ ├── ConvMixer.png │ ├── CoordAttention.png │ ├── DepthwiseSeparableConv.png │ ├── DynamicConv.png │ ├── ECA.png │ ├── EMSA.jpg │ ├── EMSA.png │ ├── External_Attention.png │ ├── GFNet.jpg │ ├── HaloNet.png │ ├── Infine-attention.jpeg │ ├── Involution.png │ ├── MBConv.jpg │ ├── MUSE.png │ ├── MUSE2.jpg │ ├── MobileViTAttention.png │ ├── MobileViTv2.png │ ├── OutlookAttention.png │ ├── ParNet.png │ ├── PoSA.png │ ├── ResAtt.png │ ├── S2Attention.png │ ├── SA.png │ ├── SE.png │ ├── SGE.jpg │ ├── SGE.png │ ├── SK.png │ ├── SSA.png │ ├── ShuffleAttention.jpg │ ├── ShuffleAttention.png │ ├── UFO.png │ ├── ViP.png │ ├── acnet.png │ ├── danet.png │ ├── danet2.png │ ├── ddb.png │ ├── gMLP.jpg │ ├── mlpmixer.png │ ├── mobileViT.jpg │ ├── psa.jpg │ ├── psa.png │ ├── psa2.jpg │ ├── repmlp.png │ ├── repvgg.png │ ├── resmlp.png │ ├── resnet.png │ ├── resnet2.jpg │ ├── resnext.png │ ├── sMLP.jpg │ └── triplet.png ├── conv └── MBConv.py └── requirements.txt /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [] 7 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [External-Attention-tensorflow](https://github.com/ccfco-Ivan/External-Attention-tensorflow) 2 | 3 | [![OSCS Status](https://www.oscs1024.com/platform/badge/ccfco-Ivan/External-Attention-tensorflow.svg?size=small)](https://www.oscs1024.com/project/ccfco-Ivan/External-Attention-tensorflow?ref=badge_small) 4 | 5 | ## Contents 6 | 7 | - [External-Attention-tensorflow](#external-attention-tensorflow) 8 | - [Contents](#contents) 9 | - [Attention Series](#attention-series) 10 | - [1. Residual Attention Usage](#1-residual-attention-usage) 11 | - [1.1. Paper](#11-paper) 12 | - [1.2 Overview](#12-overview) 13 | - [1.3. UsageCode](#13-usagecode) 14 | - [2. External Attention Usage](#2-external-attention-usage) 15 | - [2.1. Paper](#21-paper) 16 | - [2.2. Overview](#22-overview) 17 | - [2.3. UsageCode](#23-usagecode) 18 | - [3. Self Attention Usage](#3-self-attention-usage) 19 | - [3.1. Paper](#31-paper) 20 | - [3.2. Overview](#32-overview) 21 | - [3.3. UsageCode](#33-usagecode) 22 | - [4. Simplified Self Attention Usage](#4-simplified-self-attention-usage) 23 | - [4.1. Paper](#41-paper) 24 | - [4.2. Overview](#42-overview) 25 | - [4.3. UsageCode](#43-usagecode) 26 | - [5. Squeeze-and-Excitation Attention Usage](#5-squeeze-and-excitation-attention-usage) 27 | - [5.1. Paper](#51-paper) 28 | - [5.2. Overview](#52-overview) 29 | - [5.3. 
UsageCode](#53-usagecode) 30 | - [6. SK Attention Usage](#6-sk-attention-usage) 31 | - [6.1. Paper](#61-paper) 32 | - [6.2. Overview](#62-overview) 33 | - [6.3. UsageCode](#63-usagecode) 34 | - [7. CBAM Attention Usage](#7-cbam-attention-usage) 35 | - [7.1. Paper](#71-paper) 36 | - [7.2. Overview](#72-overview) 37 | - [7.3. Usage Code](#73-usage-code) 38 | - [8. BAM Attention Usage](#8-bam-attention-usage) 39 | - [8.1. Paper](#81-paper) 40 | - [8.2. Overview](#82-overview) 41 | - [8.3. Usage Code](#83-usage-code) 42 | - [9. ECA Attention Usage](#9-eca-attention-usage) 43 | - [9.1. Paper](#91-paper) 44 | - [9.2. Overview](#92-overview) 45 | - [9.3. Usage Code](#93-usage-code) 46 | - [10. DANet Attention Usage](#10-danet-attention-usage) 47 | - [10.1. Paper](#101-paper) 48 | - [10.2. Overview](#102-overview) 49 | - [10.3. Usage Code](#103-usage-code) 50 | - [11. Pyramid Squeeze Attention Usage](#11-pyramid-squeeze-attention-usage) 51 | - [11.1. Paper](#111-paper) 52 | - [11.2. Overview](#112-overview) 53 | - [11.3. Usage Code](#113-usage-code) 54 | - [12. Efficient Multi-Head Self-Attention Usage](#12-efficient-multi-head-self-attention-usage) 55 | - [12.1. Paper](#121-paper) 56 | - [12.2. Overview](#122-overview) 57 | - [12.3. Usage Code](#123-usage-code) 58 | - [13. Shuffle Attention Usage](#13-shuffle-attention-usage) 59 | - [13.1. Paper](#131-paper) 60 | - [13.2. Overview](#132-overview) 61 | - [13.3. Usage Code](#133-usage-code) 62 | - [14. MUSE Attention Usage](#14-muse-attention-usage) 63 | - [14.1. Paper](#141-paper) 64 | - [14.2. Overview](#142-overview) 65 | - [14.3. Usage Code](#143-usage-code) 66 | - [15. SGE Attention Usage](#15-sge-attention-usage) 67 | - [15.1. Paper](#151-paper) 68 | - [15.2. Overview](#152-overview) 69 | - [15.3. Usage Code](#153-usage-code) 70 | - [16. A2 Attention Usage](#16-a2-attention-usage) 71 | - [16.1. Paper](#161-paper) 72 | - [16.2. Overview](#162-overview) 73 | - [16.3. Usage Code](#163-usage-code) 74 | - [17. AFT Attention Usage](#17-aft-attention-usage) 75 | - [17.1. Paper](#171-paper) 76 | - [17.2. Overview](#172-overview) 77 | - [17.3. Usage Code](#173-usage-code) 78 | - [18. Outlook Attention Usage](#18-outlook-attention-usage) 79 | - [18.1. Paper](#181-paper) 80 | - [18.2. Overview](#182-overview) 81 | - [18.3. Usage Code](#183-usage-code) 82 | - [19. ViP Attention Usage](#19-vip-attention-usage) 83 | - [19.1. Paper](#191-paper) 84 | - [19.2. Overview](#192-overview) 85 | - [19.3. Usage Code](#193-usage-code) 86 | - [20. CoAtNet Attention Usage](#20-coatnet-attention-usage) 87 | - [20.1. Paper](#201-paper) 88 | - [20.2. Overview](#202-overview) 89 | - [20.3. Usage Code](#203-usage-code) 90 | - [21. HaloNet Attention Usage](#21-halonet-attention-usage) 91 | - [21.1. Paper](#211-paper) 92 | - [21.2. Overview](#212-overview) 93 | - [21.3. Usage Code](#213-usage-code) 94 | - [22. Polarized Self-Attention Usage](#22-polarized-self-attention-usage) 95 | - [22.1. Paper](#221-paper) 96 | - [22.2. Overview](#222-overview) 97 | - [22.3. Usage Code](#223-usage-code) 98 | - [23. CoTAttention Usage](#23-cotattention-usage) 99 | - [23.1. Paper](#231-paper) 100 | - [23.2. Overview](#232-overview) 101 | - [23.3. Usage Code](#233-usage-code) 102 | - [24. S2 Attention Usage](#24-s2-attention-usage) 103 | - [24.1. Paper](#241-paper) 104 | - [24.2. Overview](#242-overview) 105 | - [24.3. Usage Code](#243-usage-code) 106 | - [25. GFNet Attention Usage](#25-gfnet-attention-usage) 107 | - [25.1. Paper](#251-paper) 108 | - [25.2. 
Overview](#252-overview) 109 | - [25.3. Usage Code - Implemented by Wenliang Zhao (Author)](#253-usage-code---implemented-by-wenliang-zhao-author) 110 | - [26. TripletAttention Usage](#26-tripletattention-usage) 111 | - [26.1. Paper](#261-paper) 112 | - [26.2. Overview](#262-overview) 113 | - [26.3. Usage Code - Implemented by digantamisra98](#263-usage-code---implemented-by-digantamisra98) 114 | - [27. Coordinate Attention Usage](#27-coordinate-attention-usage) 115 | - [27.1. Paper](#271-paper) 116 | - [27.2. Overview](#272-overview) 117 | - [27.3. Usage Code - Implemented by Andrew-Qibin](#273-usage-code---implemented-by-andrew-qibin) 118 | - [28. MobileViT Attention Usage](#28-mobilevit-attention-usage) 119 | - [28.1. Paper](#281-paper) 120 | - [28.2. Overview](#282-overview) 121 | - [28.3. Usage Code](#283-usage-code) 122 | - [29. ParNet Attention Usage](#29-parnet-attention-usage) 123 | - [29.1. Paper](#291-paper) 124 | - [29.2. Overview](#292-overview) 125 | - [29.3. Usage Code](#293-usage-code) 126 | - [30. UFO Attention Usage](#30-ufo-attention-usage) 127 | - [30.1. Paper](#301-paper) 128 | - [30.2. Overview](#302-overview) 129 | - [30.3. Usage Code](#303-usage-code) 130 | - [31. MobileViTv2 Attention Usage](#31-mobilevitv2-attention-usage) 131 | - [31.1. Paper](#311-paper) 132 | - [31.2. Overview](#312-overview) 133 | - [31.3. Usage Code](#313-usage-code) 134 | - [32. Infini-attention Usage](#32-infini-attention-usage) 135 | - [32.1. Paper](#321-paper) 136 | - [32.2. Overview](#322-overview) 137 | - [32.3. Usage Code](#323-usage-code) 138 | 139 | ## Attention Series 140 | 141 | ### 1. Residual Attention Usage 142 | 143 | #### 1.1. Paper 144 | 145 | [Residual Attention: A Simple but Effective Method for Multi-Label Recognition---ICCV2021](https://arxiv.org/abs/2108.02456) 146 | 147 | #### 1.2 Overview 148 | 149 | ![](attention/img/ResAtt.png) 150 | 151 | > Only 4 lines of code consistently leads to improvement of multi-label recognition, across many diverse pretrained attentions and datasets, even without any extra training. 152 | > (在许多不同的预训练模型和数据集上,即使没有任何额外的训练,只用4行代码也可以提高多标签识别的准确率) 153 | 154 | #### 1.3. UsageCode 155 | 156 | ```python 157 | from attention.ResidualAttention import ResidualAttention 158 | import tensorflow as tf 159 | 160 | input = tf.random.normal(shape=(50, 7, 7, 512)) 161 | resatt = ResidualAttention(num_class=1000, la=0.2) 162 | output = resatt(input) 163 | print(output.shape) 164 | ``` 165 | 166 | *** 167 | 168 | ### 2. External Attention Usage 169 | 170 | #### 2.1. Paper 171 | 172 | ["Beyond Self-attention: External Attention using Two Linear Layers for Visual Tasks"](https://arxiv.org/abs/2105.02358) 173 | 174 | #### 2.2. Overview 175 | 176 | ![](attention/img/External_Attention.png) 177 | 178 | > 主要解决的Self-Attention(SA)的两个痛点问题: 179 | > >(1)O(n^2)的计算复杂度;(2) SA是在同一个样本上根据不同位置计算Attention,忽略了不同样本之间的联系。 180 | > 181 | > 因此,本文采用了两个串联的MLP结构作为memory units,使得计算复杂度降低到了O(n);此外,这两个memory units是基于全部的训练数据学习的,因此也隐式的考虑了不同样本之间的联系。 182 | 183 | #### 2.3. UsageCode 184 | 185 | ```python 186 | from attention.ExternalAttention import ExternalAttention 187 | import tensorflow as tf 188 | 189 | input = tf.random.normal(shape=(50, 49, 512)) 190 | ea = ExternalAttention(d_attention=512, S=8) 191 | output = ea(input) 192 | print(output.shape) 193 | ``` 194 | 195 | *** 196 | 197 | ### 3. Self Attention Usage 198 | 199 | #### 3.1. Paper 200 | 201 | ["Attention Is All You Need"](https://arxiv.org/pdf/1706.03762.pdf) 202 | 203 | #### 3.2. 
Overview 204 | 205 | ![](attention/img/SA.png) 206 | 207 | > 这是Google在NeurIPS2017发表的一篇文章,在CV、NLP、多模态等各个领域都有很大的影响力,目前引用量已经4.5w+。Transformer中提出的 208 | > Self-Attention是Attention的一种,用于计算特征中不同位置之间的权重,从而达到更新特征的效果。首先将input feature通过FC映射成Q、K、V 209 | > 三个特征,然后将Q和K进行点乘的得到attention map,再将attention map与V做点乘得到加权后的特征。最后通过FC进行特征的映射,得到一个新的特征。 210 | 211 | #### 3.3. UsageCode 212 | 213 | ```python 214 | from attention.SelfAttention import ScaledDotProductAttention 215 | import tensorflow as tf 216 | 217 | input = tf.random.normal((50, 49, 512)) 218 | sa = ScaledDotProductAttention(d_attention=512, d_k=512, d_v=512, h=8) 219 | output = sa(input, input, input) 220 | print(output.shape) 221 | ``` 222 | 223 | *** 224 | 225 | ### 4. Simplified Self Attention Usage 226 | 227 | #### 4.1. Paper 228 | 229 | [None]() 230 | 231 | #### 4.2. Overview 232 | 233 | ![](attention/img/SSA.png) 234 | 235 | #### 4.3. UsageCode 236 | 237 | ```python 238 | from attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention 239 | import tensorflow as tf 240 | 241 | input = tf.random.normal((50, 49, 512)) 242 | ssa = SimplifiedScaledDotProductAttention(d_attention=512, h=8) 243 | output = ssa(input, input, input) 244 | print(output.shape) 245 | ``` 246 | 247 | *** 248 | 249 | ### 5. Squeeze-and-Excitation Attention Usage 250 | 251 | #### 5.1. Paper 252 | 253 | ["Squeeze-and-Excitation Networks"](https://arxiv.org/abs/1709.01507) 254 | 255 | #### 5.2. Overview 256 | 257 | ![](attention/img/SE.png) 258 | 259 | > 这是CVPR2018的一篇文章,是做通道注意力的,因其简单的结构和有效性,将通道注意力掀起了一波小高潮。大道至简,这篇文章的思想非常简单,首先将 260 | > spatial维度进行AdaptiveAvgPool,然后通过两个FC学习到通道注意力,并用Sigmoid进行归一化得到Channel Attention Map,最后将Channel 261 | > Attention Map与原特征相乘,就得到了加权后的特征。 262 | 263 | #### 5.3. UsageCode 264 | 265 | ```python 266 | from attention.SEAttention import SEAttention 267 | import tensorflow as tf 268 | 269 | input = tf.random.normal((50, 7, 7, 512)) 270 | se = SEAttention(channel=512, reduction=8) 271 | output = se(input) 272 | print(output.shape) 273 | ``` 274 | 275 | *** 276 | 277 | ### 6. SK Attention Usage 278 | 279 | #### 6.1. Paper 280 | 281 | ["Selective Kernel Networks"](https://arxiv.org/pdf/1903.06586.pdf) 282 | 283 | #### 6.2. Overview 284 | 285 | ![](attention/img/SK.png) 286 | 287 | > 这是CVPR2019的一篇文章,致敬了SENet的思想。在传统的CNN中每一个卷积层都是用相同大小的卷积核,限制了模型的表达能力;而Inception这种“更宽”的模型结构也验证了,用多个不同的卷积核进行学习确实可以提升模型的表达能力。作者借鉴了SENet的思想,通过动态计算每个卷积核得到通道的权重,动态的将各个卷积核的结果进行融合。 288 | 289 | >本文的方法分为三个部分:Split,Fuse,Select。Split就是一个multi-branch的操作,用不同的卷积核进行卷积得到不同的特征;Fuse部分就是用SE的结构获取通道注意力的矩阵( 290 | >N个卷积核就可以得到N个注意力矩阵,这步操作对所有的特征参数共享),这样就可以得到不同kernel经过SE之后的特征;Select操作就是将这几个特征进行相加。 291 | 292 | #### 6.3. UsageCode 293 | 294 | ```python 295 | from attention.SKAttention import SKAttention 296 | import tensorflow as tf 297 | 298 | input = tf.random.normal((50, 7, 7, 512)) 299 | se = SKAttention(channel=512, reduction=8) 300 | output = se(input) 301 | print(output.shape) 302 | ``` 303 | 304 | *** 305 | 306 | ### 7. CBAM Attention Usage 307 | 308 | #### 7.1. Paper 309 | 310 | ["CBAM: Convolutional Block Attention Module"](https://openaccess.thecvf.com/content_ECCV_2018/papers/Sanghyun_Woo_Convolutional_Block_Attention_ECCV_2018_paper.pdf) 311 | 312 | #### 7.2. 
Overview 313 | 314 | ![](attention/img/CBAM1.png) 315 | 316 | ![](attention/img/CBAM2.png) 317 | 318 | > 这是ECCV2018的一篇论文,这篇文章同时使用了Channel Attention和Spatial Attention,将两者进行了串联(文章也做了并联和两种串联方式的消融实验)。 319 | > 320 | >Channel 321 | > Attention方面,大致结构还是和SE相似,不过作者提出AvgPool和MaxPool有不同的表示效果,所以作者对原来的特征在Spatial维度分别进行了AvgPool和MaxPool,然后用SE的结构提取channel 322 | > attention,注意这里是参数共享的,然后将两个特征相加后做归一化,就得到了注意力矩阵。 323 | > 324 | >Spatial Attention和Channel Attention类似,先在channel维度进行两种pool后,将两个特征进行拼接,然后用7x7的卷积来提取Spatial 325 | > Attention(之所以用7x7是因为提取的是空间注意力,所以用的卷积核必须足够大)。然后做一次归一化,就得到了空间的注意力矩阵。 326 | 327 | #### 7.3. Usage Code 328 | 329 | ```python 330 | from attention.CBAM import CBAMBlock 331 | import tensorflow as tf 332 | 333 | input = tf.random.normal((50, 7, 7, 512)) 334 | kernel_size = input.get_shape()[1] 335 | cbam = CBAMBlock(channel=512, reduction=16, kernel_size=kernel_size) 336 | output = cbam(input) 337 | print(output.shape) 338 | ``` 339 | 340 | *** 341 | 342 | ### 8. BAM Attention Usage 343 | 344 | #### 8.1. Paper 345 | 346 | ["BAM: Bottleneck Attention Module"](https://arxiv.org/pdf/1807.06514.pdf) 347 | 348 | #### 8.2. Overview 349 | 350 | ![](attention/img/BAM.png) 351 | 352 | > 这是CBAM同作者同时期的工作,工作与CBAM非常相似,也是双重Attention,不同的是CBAM是将两个attention的结果串联;而BAM是直接将两个attention矩阵进行相加。 353 | > 354 | >Channel Attention方面,与SE的结构基本一样。Spatial Attention方面,还是在通道维度进行pool,然后用了两次3x3的空洞卷积,最后将用一次1x1的卷积得到Spatial Attention的矩阵。 355 | > 356 | >最后Channel Attention和Spatial Attention矩阵进行相加(这里用到了广播机制),并进行归一化,这样一来,就得到了空间和通道结合的attention矩阵。 357 | 358 | #### 8.3. Usage Code 359 | 360 | ```python 361 | from attention.BAM import BAMBlock 362 | import tensorflow as tf 363 | 364 | input = tf.random.normal((50, 7, 7, 512)) 365 | bam = BAMBlock(channel=512, reduction=16, dia_val=2) 366 | output = bam(input) 367 | print(output.shape) 368 | ``` 369 | 370 | *** 371 | 372 | ### 9. ECA Attention Usage 373 | 374 | #### 9.1. Paper 375 | 376 | ["ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks"](https://arxiv.org/pdf/1910.03151.pdf) 377 | 378 | #### 9.2. Overview 379 | 380 | ![](attention/img/ECA.png) 381 | 382 | > 这是CVPR2020的一篇文章。 如上图所示,SE实现通道注意力是使用两个全连接层,而ECA是需要一个的卷积。作者这么做的原因一方面是认为计算所有通道两两之间的注意力是没有必要的,另一方面是用两个全连接层确实引入了太多的参数和计算量。 383 | > 384 | >因此作者进行了AvgPool之后,只是使用了一个感受野为k的一维卷积(相当于只计算与相邻k个通道的注意力),这样做就大大的减少的参数和计算量。(i.e.相当于SE是一个global的注意力,而ECA是一个local的注意力)。 385 | 386 | #### 9.3. Usage Code 387 | 388 | ```python 389 | from attention.ECAAttention import ECAAttention 390 | import tensorflow as tf 391 | 392 | input = tf.random.normal((50, 7, 7, 512)) 393 | eca = ECAAttention(kernel_size=3) 394 | output = eca(input) 395 | print(output.shape) 396 | ``` 397 | 398 | *** 399 | 400 | ### 10. DANet Attention Usage 401 | 402 | #### 10.1. Paper 403 | 404 | ["Dual Attention Network for Scene Segmentation"](https://arxiv.org/pdf/1809.02983.pdf) 405 | 406 | #### 10.2. Overview 407 | 408 | ![](attention/img/danet.png)![](attention/img/danet2.png) 409 | 410 | >这是CVPR2019的文章,思想上就是将self-attention用到场景分割的任务中,不同的是self-attention是关注每个position之间的注意力,而本文将self-attention做了一个拓展,还做了一个通道注意力的分支,操作上和self-attention一样,不同的通道attention中把生成Q,K,V的三个Linear去掉了。最后将两个attention之后的特征进行element-wise sum。 411 | 412 | #### 10.3. Usage Code 413 | 414 | ```python 415 | from attention.DANet import DAModule 416 | import tensorflow as tf 417 | 418 | input = tf.random.normal((50, 7, 7, 512)) 419 | danet = DAModule(d_attention=512, kernel_size=3, H=7, W=7) 420 | print(danet(input).shape) 421 | ``` 422 | 423 | *** 424 | 425 | ### 11. 
Pyramid Squeeze Attention Usage 426 | 427 | #### 11.1. Paper 428 | 429 | ["EPSANet: An Efficient Pyramid Squeeze Attention Block on Convolutional Neural Network"](https://doi.org/10.48550/arXiv.2105.14447) 430 | 431 | #### 11.2. Overview 432 | 433 | ![Pyramid Squeeze Attention (PSA) module](attention/img/psa.jpg)![A detailed illustration of Squeeze and Concat(SPC) module](attention/img/psa2.jpg) 434 | 435 | >这是深大2021年5月30日在arXiv上上传的一篇文章,本文的目的是如何获取并探索不同尺度的空间信息来丰富特征空间。网络结构相对来说也比较简单,主要分成四步,第一步,将原来的feature根据通道分成n组然后对不同的组进行不同尺度的卷积,得到新的特征W1;第二步,通过使用SE权重模块提取不同尺度的特征图的注意力,得到channel-wise attention向量;第三步,对不同组进行softmax;第四步,将获得channel-wise attention与原来的特征W1相乘。 436 | 437 | #### 11.3. Usage Code 438 | 439 | ```python 440 | from attention.PSA import PSA 441 | import tensorflow as tf 442 | 443 | input = tf.random.normal((50, 7, 7, 512)) 444 | psa = PSA(channel=512, reduction=8) 445 | output = psa(input) 446 | print(output.shape) 447 | ``` 448 | 449 | *** 450 | 451 | ### 12. Efficient Multi-Head Self-Attention Usage 452 | 453 | #### 12.1. Paper 454 | 455 | ["ResT: An Efficient Transformer for Visual Recognition"](https://arxiv.org/abs/2105.13677) 456 | 457 | #### 12.2. Overview 458 | 459 | ![](attention/img/EMSA.jpg) 460 | 461 | >这是南大5月28日在arXiv上上传的一篇文章。本文解决的主要是SA的两个痛点问题:(1)Self-Attention的计算复杂度和n呈平方关系;(2)每个head只有q,k,v的部分信息,如果q,k,v的维度太小,那么就会导致获取不到连续的信息,从而导致性能损失。这篇文章给出的思路也非常简单,在SA中的FC之前,用了一个卷积来降低了空间的维度,从而得到空间维度上更小的K和V。 462 | 463 | #### 12.3. Usage Code 464 | 465 | ```python 466 | from attention.EMSA import EMSA 467 | import tensorflow as tf 468 | 469 | input = tf.random.normal((50, 64, 512)) 470 | emsa = EMSA(d_attention=512, d_k=512, d_v=512, h=8, H=8, W=8, ratio=2, apply_transform=True) 471 | output = emsa(input, input, input) 472 | print(output.shape) 473 | ``` 474 | 475 | *** 476 | 477 | ### 13. Shuffle Attention Usage 478 | 479 | #### 13.1. Paper 480 | 481 | ["SA-NET: SHUFFLE ATTENTION FOR DEEP CONVOLUTIONAL NEURAL NETWORKS"](https://arxiv.org/pdf/2102.00240.pdf) 482 | 483 | #### 13.2. Overview 484 | 485 | ![](attention/img/ShuffleAttention.png) 486 | 487 | > 采用Shuffle Units将两种注意力机制有效结合。具体来说,SA首先将通道维度分组为多个子特征,然后并行处理它们。其次,对于每个子特征,SA使用Shuffle 488 | > Unit来描述空间和通道维度上的特征依赖关系。最后,对所有子特征进行聚合,并采用“channel shuffle”算子来实现不同子特征之间的信息通信。 489 | 490 | #### 13.3. Usage Code 491 | 492 | ```python 493 | from attention.ShuffleAttention import ShuffleAttention 494 | import tensorflow as tf 495 | 496 | input = tf.random.normal((50, 7, 7, 512)) 497 | se = ShuffleAttention(channel=512, G=8) 498 | output = se(input) 499 | print(output.shape) 500 | ``` 501 | 502 | *** 503 | 504 | ### 14. MUSE Attention Usage 505 | 506 | #### 14.1. Paper 507 | 508 | ["MUSE: Parallel Multi-Scale Attention for Sequence to Sequence Learning"](https://arxiv.org/abs/1911.09483) 509 | 510 | #### 14.2. Overview 511 | 512 | ![](./attention/img/MUSE.png) 513 | 514 | > 这是北大团队2019年在arXiv上发布的一篇文章,主要解决的是Self-Attention(SA)只有全局捕获能力的缺点。如下图所示,当句子长度变长时, 515 | > SA的全局捕获能力变弱,导致最终模型性能变差。因此,作者在文中引入了多个不同感受野的一维卷积来捕获多尺度的局部Attention,以此来弥补SA在建模长句子能力的不足。 516 | > ![](attention/img/MUSE2.jpg) 517 | > 实现方式如模型结构所示的那样,将SA的结果和多个卷积的结果相加,不仅进行全局感知,还进行局部感知。最终通过引入多尺度的局部感知,使模型在翻译任务上的性能得到了提升。 518 | 519 | #### 14.3. Usage Code 520 | 521 | ```python 522 | from attention.MUSEAttention import MUSEAttention 523 | import tensorflow as tf 524 | 525 | input = tf.random.normal((50, 49, 512)) 526 | sa = MUSEAttention(d_attention=512, d_k=512, d_v=512, h=8) 527 | output = sa(input, input, input) 528 | print(output.shape) 529 | ``` 530 | 531 | *** 532 | 533 | ### 15. 
SGE Attention Usage 534 | 535 | #### 15.1. Paper 536 | 537 | [Spatial Group-wise Enhance: Improving Semantic Feature Learning in Convolutional Networks](https://arxiv.org/pdf/1905.09646.pdf) 538 | 539 | #### 15.2. Overview 540 | 541 | ![](attention/img/SGE.jpg) 542 | > 这篇文章是[SKNet](#6-sk-attention-usage) 543 | > 作者在19年的时候在arXiv上挂出的文章,是一个轻量级Attention的工作,从核心代码中可以看出,引入的参数真的非常少,self.weight和self.bias都是和groups呈一个数量级的(几乎就是常数级别)。 544 | > 545 | > 这篇文章的核心点是用局部信息和全局信息的相似性来指导语义特征的增强,总体的操作可以分为以下几步: 546 | >> 1. 将特征分组,每组feature在空间上与其global pooling后的feature做点积(相似性)得到初始的attention mask; 547 | >> 2. 对该attention mask进行减均值除标准差的normalize,并同时每个group学习两个缩放偏移参数使得normalize操作可被还原; 548 | >> 3. 最后经过sigmoid得到最终的attention mask并对原始feature group中的每个位置的feature进行scale。 549 | > 550 | > 实验部分,作者也是在分类任务(ImageNet)和检测任务(COCO)上做了实验,能够在比[SK](#6-sk-attention-usage)、[CBAM](#7-cbam-attention-usage) 551 | > 、[BAM](#8-bam-attention-usage)等网络参数和计算量更小的情况下,获得更好的性能,证明了本文方法的高效性。 552 | 553 | #### 15.3. Usage Code 554 | 555 | ```python 556 | from attention.SGE import SpatialGroupEnhance 557 | import tensorflow as tf 558 | 559 | input = tf.random.normal((50, 7, 7, 512)) 560 | sge = SpatialGroupEnhance(groups=8) 561 | output = sge(input) 562 | print(output.shape) 563 | ``` 564 | 565 | *** 566 | 567 | ### 16. A2 Attention Usage 568 | 569 | #### 16.1. Paper 570 | 571 | [A2-Nets: Double Attention Networks](https://arxiv.org/pdf/1810.11579.pdf) 572 | 573 | #### 16.2. Overview 574 | 575 | ![](./attention/img/A2.png) 576 | 577 | > 这是NeurIPS2018上的一篇文章,这篇论文主要是做空间注意力的。并且这篇文章的方法跟做法跟self-attention非常相似,但是包装上就比较“花里胡哨”。 578 | > 579 | > input用1x1的卷积变成A,B,V(类似self-attention的Q,K,V)。本文的方法主要分为两个步骤,第一步,feature 580 | > gathering,首先用A和B进行点乘,得到一个聚合全局信息的attention,标记为G。然后用G和V进行点乘,得到二阶的attention。 581 | > 582 | > 从实验结果上看,这个结构的效果还是非常不错的,作者在分类(ImageNet)和行为识别(Kinetics , UCF-101)任务上做了实验,都取得非常好的效果,相比于Non-Local[12]、SENet[13] 583 | > 等模型,都有不错的提升。 584 | 585 | #### 16.3. Usage Code 586 | 587 | ```python 588 | from attention.A2Attention import DoubleAttention 589 | import tensorflow as tf 590 | 591 | input = tf.random.normal((50, 7, 7, 512)) 592 | a2 = DoubleAttention(512, 128, 128, True) 593 | output = a2(input) 594 | print(output.shape) 595 | ``` 596 | 597 | ### 17. AFT Attention Usage 598 | 599 | #### 17.1. Paper 600 | 601 | [An Attention Free Transformer](https://arxiv.org/pdf/2105.14103v1.pdf) 602 | 603 | #### 17.2. Overview 604 | 605 | ![](./attention/img/AFT.jpg) 606 | 607 | > 这是苹果团队2021年6月16日在arXiv上发布的工作,主要工作是简化Self-Attention。 608 | > 609 | > Transformer近几年被用于各种任务中,但是由于Self-Attention的与输入数据大小呈平方关系的时间和空间复杂度,它不能被用于太大的数据中。 610 | > 近几年,基于简化SA的复杂度,很多工作也被提出:稀疏注意力、局部哈希、低质分解... 611 | > 612 | > 本文提出了一个Attention Free Transformer(AFT),AFT也是由QKV三部分组成,不同的是QK不是做点积。而是将KV直接融合了,从而来保证对应位置的交互,然后Q与融合后的特征进行了对应位置相乘,来减少计算量。 613 | > 614 | > 总体上原理跟Self-Attention相似,不同的是Self-Attention用的是点积,而这里用的是对应位置相乘,所以大大减少了计算量。 615 | 616 | #### 17.3. Usage Code 617 | 618 | ```python 619 | from attention.AFT import AFT_FULL 620 | import tensorflow as tf 621 | 622 | input = tf.random.normal((50, 49, 512)) 623 | aft_full = AFT_FULL(d_model=512, n=49) 624 | output = aft_full(input) 625 | print(output.shape) 626 | ``` 627 | 628 | ### 18. Outlook Attention Usage 629 | 630 | #### 18.1. Paper 631 | 632 | [VOLO: Vision Outlooker for Visual Recognition---arXiv 2021.06.24"](https://arxiv.org/abs/2106.13112) 633 | 634 | #### 18.2. 
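As the note in 18.3 below explains, TensorFlow has no direct counterpart of `torch.nn.functional.fold`, which is why this port of Outlook Attention is still incomplete. A hedged workaround (not part of this repo; the helper name `fold_patches` and its argument layout are assumptions) is to use the fact that fold is the adjoint of `tf.image.extract_patches` and recover it via `tf.GradientTape`:

```python
import tensorflow as tf

def fold_patches(patches, output_size, kernel_size, stride):
    # patches: (B, h, w, k*k*C), laid out exactly as tf.image.extract_patches
    # would produce them for an image of size output_size with 'SAME' padding.
    # Returns (B, H, W, C) with overlapping patch entries scatter-added (col2im / fold).
    H, W = output_size
    C = patches.shape[-1] // (kernel_size * kernel_size)
    B = tf.shape(patches)[0]
    x = tf.zeros(tf.stack([B, H, W, C]))
    with tf.GradientTape() as tape:
        tape.watch(x)
        p = tf.image.extract_patches(x,
                                     sizes=[1, kernel_size, kernel_size, 1],
                                     strides=[1, stride, stride, 1],
                                     rates=[1, 1, 1, 1],
                                     padding='SAME')
        # extract_patches is linear, so the gradient of <p, patches> w.r.t. x
        # is exactly the transposed operator, i.e. fold(patches).
        s = tf.reduce_sum(p * patches)
    return tape.gradient(s, x)
```

With such a helper, the weighted patches at the end of `OutlookAttention.call` could in principle be folded back to a `(B, H, W, C)` feature map.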
Overview 635 | 636 | ![](./attention/img/OutlookAttention.png) 637 | 638 | > Transformer-based模型在Visual Recognition领域,如果不借助额外的训练数据,比CNN-based模型要差一点。作者认为,这是因为token 639 | > embedding并没有进行细粒度特征表示,因此本文提出了一种新的Attention方式,通过局部信息的感知,能够获得更加细粒度的特征表示。 640 | > 641 | > > 整个框架分为两个分支,上面的分支用于生成attention map,下面的分支用于生成投影后的value。然后通过矩阵乘法得到outlook attention后的结果,最后通过Fold函数将feature map还原到输入大小。 642 | > > - 上面分支Linear是为了对特征进行embedding,之后对特征进行reshape,最后通过softmax得到每个位置和周围几个位置的注意力权重。 643 | > > - 下面分支同样进行embedding,之后通过unfold,也就是滑动窗口的形式将特征中的K*K区域取出来。 644 | > 645 | > 可以看出,在Outlook Attention中,每一个中心点的位置都与周围k*k个位置进行attention操作,这个步骤就有点类似卷积。 646 | 647 | #### 18.3. Usage Code 648 | 649 | [torch.nn.fold开发者说没有这个功能,未来没打算加。我以后看情况是否补充吧。点击看开发者回复](https://github.com/tensorflow/tensorflow/issues/52195#issuecomment-948915934) 650 | 651 | ```python 652 | from attention.OutlookAttention import OutlookAttention 653 | import tensorflow as tf 654 | ``` 655 | 656 | *** 657 | 658 | ### 19. ViP Attention Usage 659 | 660 | #### 19.1. Paper 661 | 662 | [Vision Permutator: A Permutable MLP-Like Architecture for Visual Recognition"](https://arxiv.org/abs/2106.12368) 663 | 664 | #### 19.2. Overview 665 | 666 | ![](./attention/img/ViP.png) 667 | 668 | #### 19.3. Usage Code 669 | 670 | ```python 671 | from attention.ViP import WeightedPermuteMLP 672 | import tensorflow as tf 673 | 674 | input = tf.random.normal((64, 8, 8, 512)) 675 | seg_dim = 8 676 | vip = WeightedPermuteMLP(512, seg_dim) 677 | output = vip(input) 678 | print(output.shape) 679 | ``` 680 | 681 | *** 682 | 683 | ### 20. CoAtNet Attention Usage 684 | 685 | #### 20.1. Paper 686 | 687 | [CoAtNet: Marrying Convolution and Attention for All Data Sizes"](https://arxiv.org/abs/2106.04803) 688 | 689 | #### 20.2. Overview 690 | 691 | ![](attention/img/CoAtNet.png) 692 | 693 | > 本文系统调研了CNN和Transformer的特性,并将两者结合提出新的家族式网络:CoAtNet,无额外数据时高达86%准确率,在JFT加持下,高达89.77%!性能优于CvT、BotNet和Swin等网络。 694 | > >Transformers 在计算机视觉方面吸引了越来越多的兴趣,但它们仍然落后于最先进的卷积网络。在这项工作中,我们表明虽然 Transformer 往往具有更大的模型容量,但由于缺乏正确的归纳偏差,它们的泛化可能比卷积网络更差。 695 | > >为了有效地结合两种架构的优势,我们提出了 CoAtNets(发音为“coat”nets),这是一个基于两个关键insight构建的混合模型系列: 696 | > > 697 | > >- 1. 深度卷积和自注意力可以通过简单的相对注意力自然地统一起来; 698 | > >- 2. 以有原则的方式垂直堆叠卷积层和注意力层在提高泛化、容量和效率方面非常有效。 699 | 700 | #### 20.3. Usage Code 701 | 702 | ```python 703 | 704 | from attention.CoAtNet import CoAtNet 705 | import tensorflow as tf 706 | 707 | input = tf.random.normal((1, 224, 224, 3)) 708 | coatnet = CoAtNet(in_ch=3) 709 | output = coatnet(input) 710 | print(output.shape) 711 | ``` 712 | 713 | *** 714 | 715 | ### 21. HaloNet Attention Usage 716 | 717 | #### 21.1. Paper 718 | 719 | [Scaling Local Self-Attention for Parameter Efficient Visual Backbones](https://arxiv.org/pdf/2103.12731.pdf) 720 | 721 | #### 21.2. Overview 722 | 723 | ![](./attention/img/HaloNet.png) 724 | 725 | #### 21.3. Usage Code 726 | 727 | ```python 728 | 729 | from attention.HaloAttention import HaloAttention 730 | import tensorflow as tf 731 | ``` 732 | 733 | *** 734 | 735 | ### 22. Polarized Self-Attention Usage 736 | 737 | #### 22.1. Paper 738 | 739 | [Polarized Self-Attention: Towards High-quality Pixel-wise Regression"](https://arxiv.org/abs/2107.00782) 740 | 741 | #### 22.2. Overview 742 | 743 | ![](./attention/img/PoSA.png) 744 | 745 | #### 22.3. 
Usage Code 746 | 747 | ```python 748 | from attention.PolarizedSelfAttention import SequentialPolarizedSelfAttention 749 | import tensorflow as tf 750 | 751 | if __name__ == '__main__': 752 | input_tensor = tf.random.normal([1, 7, 7, 512]) 753 | psa = SequentialPolarizedSelfAttention(channel=512) 754 | output_tensor = psa(input_tensor) 755 | print(output_tensor.shape) 756 | ``` 757 | 758 | *** 759 | 760 | ### 23. CoTAttention Usage 761 | 762 | #### 23.1. Paper 763 | 764 | [Contextual Transformer Networks for Visual Recognition---arXiv 2021.07.26](https://arxiv.org/abs/2107.12292) 765 | 766 | #### 23.2. Overview 767 | 768 | ![](./attention/img/CoT.png) 769 | 770 | #### 23.3. Usage Code 771 | 772 | ```python 773 | 774 | from attention.CoTAttention import CoTAttention 775 | import tensorflow as tf 776 | ``` 777 | 778 | ### 24. S2 Attention Usage 779 | 780 | #### 24.1. Paper 781 | 782 | [S²-MLPv2: Improved Spatial - Shift MLP Architecture for Vision ---arXiv 2021.08.02](https://arxiv.org/abs/2108.01072) 783 | 784 | #### 24.2. Overview 785 | 786 | ![](./attention/img/S2Attention.png) 787 | 788 | #### 24.3. Usage Code 789 | 790 | ```python 791 | from attention.S2Attention import S2Attention 792 | import tensorflow as tf 793 | ``` 794 | 795 | *** 796 | 797 | ### 25. GFNet Attention Usage 798 | 799 | #### 25.1. Paper 800 | 801 | [Global Filter Networks for Image Classification---arXiv 2021.07.01](https://arxiv.org/abs/2107.00645) 802 | 803 | #### 25.2. Overview 804 | 805 | ![](./attention/img/GFNet.jpg) 806 | 807 | #### 25.3. Usage Code - Implemented by [Wenliang Zhao (Author)](https://scholar.google.com/citations?user=lyPWvuEAAAAJ&hl=en) 808 | 809 | ```python 810 | from attention.gfnet import GFNet 811 | import tensorflow as tf 812 | ``` 813 | 814 | *** 815 | 816 | ### 26. TripletAttention Usage 817 | 818 | #### 26.1. Paper 819 | 820 | [Rotate to Attend: Convolutional Triplet Attention Module---CVPR 2021](https://arxiv.org/abs/2010.03045) 821 | 822 | #### 26.2. Overview 823 | 824 | ![](./attention/img/triplet.png) 825 | 826 | #### 26.3. Usage Code - Implemented by [digantamisra98](https://github.com/digantamisra98) 827 | 828 | ```python 829 | from attention.TripletAttention import TripletAttention 830 | import tensorflow as tf 831 | ``` 832 | 833 | *** 834 | 835 | ### 27. Coordinate Attention Usage 836 | 837 | #### 27.1. Paper 838 | 839 | [Coordinate Attention for Efficient Mobile Network Design---CVPR 2021](https://arxiv.org/abs/2103.02907) 840 | 841 | #### 27.2. Overview 842 | 843 | ![](./attention/img/CoordAttention.png) 844 | 845 | #### 27.3. Usage Code - Implemented by [Andrew-Qibin](https://github.com/Andrew-Qibin) 846 | 847 | ```python 848 | from attention.CoordAttention import CoordAtt 849 | import tensorflow as tf 850 | ``` 851 | 852 | *** 853 | 854 | ### 28. MobileViT Attention Usage 855 | 856 | #### 28.1. Paper 857 | 858 | [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer---ArXiv 2021.10.05](https://arxiv.org/abs/2103.02907) 859 | 860 | #### 28.2. Overview 861 | 862 | ![](./attention/img/MobileViTAttention.png) 863 | 864 | #### 28.3. Usage Code 865 | 866 | ```python 867 | from attention.MobileViTAttention import MobileViTAttention 868 | import tensorflow as tf 869 | 870 | ``` 871 | 872 | *** 873 | 874 | ### 29. ParNet Attention Usage 875 | 876 | #### 29.1. Paper 877 | 878 | [Non-deep Networks---ArXiv 2021.10.20](https://arxiv.org/abs/2110.07641) 879 | 880 | #### 29.2. Overview 881 | 882 | ![](./attention/img/ParNet.png) 883 | 884 | #### 29.3. 
Usage Code 885 | 886 | ```python 887 | from attention.ParNetAttention import * 888 | import tensorflow as tf 889 | 890 | ``` 891 | 892 | *** 893 | 894 | ### 30. UFO Attention Usage 895 | 896 | #### 30.1. Paper 897 | 898 | [UFO-ViT: High Performance Linear Vision Transformer without Softmax---ArXiv 2021.09.29](https://arxiv.org/abs/2110.07641) 899 | 900 | #### 30.2. Overview 901 | 902 | ![](./attention/img/UFO.png) 903 | 904 | #### 30.3. Usage Code 905 | 906 | ```python 907 | from attention.UFOAttention import * 908 | import tensorflow as tf 909 | ``` 910 | 911 | ### 31. MobileViTv2 Attention Usage 912 | 913 | #### 31.1. Paper 914 | 915 | [Separable Self-attention for Mobile Vision Transformers---ArXiv 2022.06.06](https://arxiv.org/abs/2206.02680) 916 | 917 | #### 31.2. Overview 918 | 919 | ![](./attention/img/MobileViTv2.png) 920 | 921 | #### 31.3. Usage Code 922 | 923 | ```python 924 | from attention.UFOAttention import * 925 | import tensorflow as tf 926 | 927 | ``` 928 | 929 | ### 32. Infini-attention Usage 930 | 931 | #### 32.1. Paper 932 | 933 | [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention---ArXiv 2024.04.10](https://arxiv.org/abs/2404.07143) 934 | 935 | #### 32.2. Overview 936 | 937 | ![](attention/img/Infine-attention.jpeg) 938 | 939 | #### 32.3. Usage Code 940 | 941 | ```python 942 | 943 | 944 | ``` 945 | 946 | *** 947 | 948 | 参考:小马[External-Attention-pytorch](https://github.com/xmu-xiaoma666/External-Attention-pytorch) -------------------------------------------------------------------------------- /attention/A2Attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | 5 | class DoubleAttention(layers.Layer): 6 | 7 | def __init__(self, in_channels, c_m, c_n, reconstruct=True): 8 | super(DoubleAttention, self).__init__() 9 | self.in_channels = in_channels 10 | self.reconstruct = reconstruct 11 | self.c_m = c_m 12 | self.c_n = c_n 13 | self.convA = layers.Conv2D(c_m, 1) 14 | self.convB = layers.Conv2D(c_n, 1) 15 | self.convV = layers.Conv2D(c_n, 1) 16 | if self.reconstruct: 17 | self.conv_reconstruct = layers.Conv2D(in_channels, kernel_size=1) 18 | 19 | def call(self, x): 20 | b, h, w, c = x.get_shape() 21 | assert c == self.in_channels 22 | A = self.convA(x) # b, h, w, c_m 23 | B = self.convB(x) # b, h, w, c_n 24 | V = self.convV(x) # b, h, w, c_n 25 | tmpA = tf.reshape(A, (b, self.c_m, -1)) 26 | attention_maps = tf.nn.softmax(tf.reshape(B, (b, -1, self.c_n))) 27 | attention_vectors = tf.nn.softmax(tf.reshape(V, (b, self.c_n, -1))) 28 | # step 1: feature gating 29 | global_descriptors = tf.matmul(tmpA, attention_maps) # b, c_m, c_n 30 | # step 2: feature distribution 31 | tmpZ = tf.matmul(global_descriptors, attention_vectors) # b, c_m, h*w 32 | tmpZ = tf.reshape(tmpZ, (b, h, w, self.c_m)) # b, h, w, c_m 33 | if self.reconstruct: 34 | tmpZ = self.conv_reconstruct(tmpZ) 35 | 36 | return tmpZ 37 | 38 | 39 | if __name__ == '__main__': 40 | input = tf.random.normal((50, 7, 7, 512)) 41 | a2 = DoubleAttention(512, 128, 128, True) 42 | output = a2(input) 43 | print(output.shape) 44 | -------------------------------------------------------------------------------- /attention/AFT.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | 5 | class AFT_FULL(layers.Layer): 6 | def __init__(self, d_model, n=49, simple=False): 7 | 8 | 
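        # AFT-full, as implemented in call() below: with learned pairwise position
        # biases w of shape (n, n), each output position t is
        #   Y_t = sigmoid(Q_t) * sum_j exp(K_j + w[t, j]) * V_j / sum_j exp(K_j + w[t, j])
        # so keys and values interact element-wise rather than through per-head
        # dot-product attention maps; simple=True fixes the biases at zero.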
super(AFT_FULL, self).__init__() 9 | self.fc_q = layers.Dense(d_model) 10 | self.fc_k = layers.Dense(d_model) 11 | self.fc_v = layers.Dense(d_model) 12 | if simple: 13 | self.position_biases = tf.zeros((n, n)) 14 | else: 15 | self.position_biases = tf.Variable(tf.ones((n, n)), trainable=True) 16 | self.d_model = d_model 17 | self.n = n 18 | self.sigmoid = tf.sigmoid 19 | 20 | def call(self, input): 21 | bs, n, dim = input.get_shape() 22 | 23 | q = self.fc_q(input) # bs, n, dim 24 | k = tf.expand_dims(self.fc_k(input), axis=0) # 1, bs, n, dim 25 | v = tf.expand_dims(self.fc_v(input), axis=0) # 1, bs, n, dim 26 | numerator = tf.reduce_sum(tf.exp(k + tf.reshape(self.position_biases, (n, 1, -1, 1))) * v, 2) # n, bs, dim 27 | denominator = tf.reduce_sum(tf.exp(k + tf.reshape(self.position_biases, (n, 1, -1, 1))), 2) # n, bs, dim 28 | 29 | out = (numerator / denominator) # n, bs, dim 30 | out = self.sigmoid(q) * (tf.transpose(out, (1, 0, 2))) 31 | 32 | return out 33 | 34 | 35 | if __name__ == '__main__': 36 | input = tf.random.normal((50, 49, 512)) 37 | aft_full = AFT_FULL(d_model=512, n=49) 38 | output = aft_full(input) 39 | print(output.shape) 40 | -------------------------------------------------------------------------------- /attention/BAM.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers, Sequential 3 | 4 | 5 | class ChannelAttention(layers.Layer): 6 | def __init__(self, channel, reduction=16, num_layers=3): 7 | super(ChannelAttention, self).__init__() 8 | self.avg_pool = layers.GlobalAvgPool2D(keepdims=True) 9 | gate_channels = [channel] 10 | gate_channels += [channel // reduction] * num_layers 11 | gate_channels += [channel] 12 | 13 | self.ca = Sequential() 14 | for i in range(len(gate_channels) - 2): 15 | self.ca.add(layers.Dense(gate_channels[i + 1])) 16 | self.ca.add(layers.BatchNormalization()) 17 | self.ca.add(layers.Activation('relu')) 18 | self.ca.add(layers.Dense(gate_channels[-1])) 19 | 20 | def call(self, x): 21 | res = self.avg_pool(x) 22 | res = self.ca(res) 23 | res = tf.broadcast_to(res, x.get_shape()) 24 | return res 25 | 26 | 27 | class SpatialAttention(layers.Layer): 28 | def __init__(self, channel, reduction=16, num_layers=3, dia_val=2): 29 | super(SpatialAttention, self).__init__() 30 | self.sa = Sequential() 31 | self.sa.add(layers.Conv2D(filters=channel // reduction, kernel_size=1)) 32 | self.sa.add(layers.BatchNormalization()) 33 | self.sa.add(layers.Activation('relu')) 34 | for i in range(num_layers): 35 | self.sa.add( 36 | layers.Conv2D(filters=channel // reduction, kernel_size=3, padding='same', dilation_rate=dia_val)) 37 | self.sa.add(layers.BatchNormalization()) 38 | self.sa.add(layers.Activation('relu')) 39 | self.sa.add(layers.Conv2D(1, kernel_size=1)) 40 | 41 | def call(self, x): 42 | res = self.sa(x) 43 | res = tf.broadcast_to(res, x.get_shape()) 44 | return res 45 | 46 | 47 | class BAMBlock(layers.Layer): 48 | def __init__(self, channel=512, reduction=16, dia_val=2): 49 | super(BAMBlock, self).__init__() 50 | self.ca = ChannelAttention(channel=channel, reduction=reduction) 51 | self.sa = SpatialAttention(channel=channel, reduction=reduction, dia_val=dia_val) 52 | self.sigmoid = tf.sigmoid 53 | 54 | def call(self, x): 55 | sa_out = self.sa(x) 56 | ca_out = self.ca(x) 57 | weight = self.sigmoid(sa_out + ca_out) 58 | out = (1 + weight) * x 59 | return out 60 | 61 | 62 | if __name__ == '__main__': 63 | input = tf.random.normal((50, 7, 7, 512)) 64 | bam = 
BAMBlock(channel=512, reduction=16, dia_val=2) 65 | output = bam(input) 66 | print(output.shape) 67 | -------------------------------------------------------------------------------- /attention/CBAM.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers, Sequential 3 | 4 | 5 | class ChannelAttention(layers.Layer): 6 | def __init__(self, channel, reduction=16): 7 | super(ChannelAttention, self).__init__() 8 | self.maxpool = layers.GlobalMaxPool2D(keepdims=True) 9 | self.avgpool = layers.GlobalAvgPool2D(keepdims=True) 10 | self.se = Sequential([ 11 | layers.Conv2D(channel // reduction, 1, use_bias=False), 12 | layers.Activation('relu'), 13 | layers.Conv2D(channel, 1, use_bias=False) 14 | ]) 15 | self.sigmoid = tf.sigmoid 16 | 17 | def call(self, x): 18 | max_result = self.maxpool(x) 19 | avg_result = self.avgpool(x) 20 | max_out = self.se(max_result) 21 | avg_out = self.se(avg_result) 22 | output = self.sigmoid(max_out + avg_out) 23 | return output 24 | 25 | 26 | class SpatialAttention(layers.Layer): 27 | def __init__(self, kernel_size=7): 28 | super(SpatialAttention, self).__init__() 29 | self.conv = layers.Conv2D(1, kernel_size=kernel_size, padding='same') 30 | self.sigmoid = tf.sigmoid 31 | 32 | def call(self, x): 33 | max_result = tf.reduce_max(x, axis=-1, keepdims=True) 34 | avg_result = tf.reduce_mean(x, axis=-1, keepdims=True) 35 | result = tf.concat([max_result, avg_result], -1) 36 | output = self.conv(result) 37 | output = self.sigmoid(output) 38 | return output 39 | 40 | 41 | class CBAMBlock(layers.Layer): 42 | def __init__(self, channel=512, reduction=16, kernel_size=49): 43 | super().__init__() 44 | self.ca = ChannelAttention(channel=channel, reduction=reduction) 45 | self.sa = SpatialAttention(kernel_size=kernel_size) 46 | 47 | def call(self, x): 48 | b, _, _, c = x.get_shape() 49 | residual = x 50 | out = x * self.ca(x) 51 | out = out * self.sa(out) 52 | return out + residual 53 | 54 | 55 | if __name__ == '__main__': 56 | input = tf.random.normal((50, 7, 7, 512)) 57 | kernel_size = input.get_shape()[1] 58 | cbam = CBAMBlock(channel=512, reduction=16, kernel_size=kernel_size) 59 | output = cbam(input) 60 | print(output.shape) 61 | -------------------------------------------------------------------------------- /attention/CoAtNet.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | 3 | import tensorflow as tf 4 | from tensorflow.keras import layers, Sequential 5 | from attention.SelfAttention import ScaledDotProductAttention 6 | from conv.MBConv import MBConvBlock 7 | 8 | 9 | class CoAtNet(layers.Layer): 10 | def __init__(self, in_ch, out_chs=[64, 96, 192, 384, 768]): 11 | super(CoAtNet, self).__init__() 12 | self.out_chs = out_chs 13 | self.maxpool2d = layers.MaxPool2D(pool_size=2, strides=2) 14 | self.maxpool1d = layers.MaxPool1D(pool_size=2, strides=2) 15 | 16 | self.s0 = Sequential([ 17 | layers.Conv2D(in_ch, kernel_size=3, padding='same', activation='relu'), 18 | layers.Conv2D(in_ch, kernel_size=3, padding='same') 19 | ]) 20 | 21 | self.mlp0 = Sequential([ 22 | layers.Conv2D(out_chs[0], kernel_size=1, padding='same', activation='relu'), 23 | layers.Conv2D(out_chs[0], kernel_size=1, padding='same') 24 | ]) 25 | 26 | self.s1 = MBConvBlock(ksize=3, input_filters=out_chs[0], output_filters=out_chs[0]) 27 | self.mlp1 = Sequential([ 28 | layers.Conv2D(out_chs[1], kernel_size=1, activation='relu'), 29 | 
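            # second 1x1 conv (no activation) completes the two-layer pointwise MLP
            # that projects this stage's features to out_chs[1] channels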
layers.Conv2D(out_chs[1], kernel_size=1, ) 30 | ]) 31 | 32 | self.s2 = MBConvBlock(ksize=3, input_filters=out_chs[1], output_filters=out_chs[1]) 33 | self.mlp2 = Sequential([ 34 | layers.Conv2D(out_chs[2], kernel_size=1, activation='relu'), 35 | layers.Conv2D(out_chs[2], kernel_size=1, ) 36 | ]) 37 | 38 | self.s3 = ScaledDotProductAttention(out_chs[2], out_chs[2] // 8, out_chs[2] // 8, 8) 39 | self.mlp3 = Sequential([ 40 | layers.Dense(out_chs[3], activation='relu'), 41 | layers.Dense(out_chs[3]) 42 | ]) 43 | 44 | self.s4 = ScaledDotProductAttention(out_chs[3], out_chs[3] // 8, out_chs[3] // 8, 8) 45 | self.mlp4 = Sequential([ 46 | layers.Dense(out_chs[4], activation='relu'), 47 | layers.Dense(out_chs[4]) 48 | ]) 49 | 50 | def call(self, x): 51 | B, H, W, C = x.get_shape() 52 | # stage0 53 | y = self.mlp0(self.s0(x)) 54 | y = self.maxpool2d(y) 55 | # stage1 56 | y = self.mlp1(self.s1(y)) 57 | y = self.maxpool2d(y) 58 | # stage2 59 | y = self.mlp2(self.s2(y)) 60 | y = self.maxpool2d(y) 61 | # stage3 62 | y = tf.reshape(y, (B, -1, self.out_chs[2])) # B, N, C 63 | y = self.mlp3(self.s3(y, y, y)) 64 | y = self.maxpool1d(y) 65 | # stage4 66 | y = self.mlp4(self.s4(y, y, y)) 67 | y = self.maxpool1d(y) 68 | N = y.get_shape()[-2] 69 | y = tf.reshape(y, (B, int(sqrt(N)), int(sqrt(N)), self.out_chs[4])) 70 | 71 | return y 72 | 73 | 74 | if __name__ == '__main__': 75 | input = tf.random.normal((1, 224, 224, 3)) 76 | coatnet = CoAtNet(3) 77 | output = coatnet(input) 78 | print(output.shape) 79 | -------------------------------------------------------------------------------- /attention/CoTAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class CoTAttention(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = CoTAttention(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/CoordAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class CoordAtt(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = CoordAtt(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/DANet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | from attention.SelfAttention import ScaledDotProductAttention 4 | from attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention 5 | 6 | 7 | class PositionAttentionModule(layers.Layer): 8 | def __init__(self, d_model=512, kernel_size=3, H=7, W=7): 9 | super(PositionAttentionModule, self).__init__() 10 | self.cnn = layers.Conv2D(d_model, kernel_size=kernel_size, padding='same') 11 | self.pa = ScaledDotProductAttention(d_model, d_k=d_model, d_v=d_model, h=1) 12 | 13 | def call(self, x): 14 | bs, h, w, c = x.get_shape() 15 | y = self.cnn(x) 16 | y = tf.reshape(y, shape=(bs, h * w, c)) 17 | y = self.pa(y, y, y) # bs, h*w, c 18 | return y 19 | 20 | 21 | class ChannelAttentionModule(layers.Layer): 22 | def __init__(self, d_model=512, kernel_size=3, H=7, W=7): 23 | super(ChannelAttentionModule, 
self).__init__() 24 | self.cnn = layers.Conv2D(d_model, kernel_size=kernel_size, padding='same') 25 | self.pa = SimplifiedScaledDotProductAttention(H * W, h=1) 26 | 27 | def call(self, x): 28 | bs, h, w, c = x.get_shape() 29 | y = self.cnn(x) 30 | y = tf.reshape(y, shape=(bs, c, -1)) # bs, c, h*w 31 | y = self.pa(y, y, y) # bs, c, h*w 32 | return y 33 | 34 | 35 | class DAModule(layers.Layer): 36 | def __init__(self, d_model=512, kernel_size=3, H=7, W=7): 37 | super(DAModule, self).__init__() 38 | self.position_attention_module = PositionAttentionModule(d_model=d_model, kernel_size=kernel_size, H=H, W=W) 39 | self.channel_attention_module = ChannelAttentionModule(d_model=d_model, kernel_size=kernel_size, H=H, W=W) 40 | 41 | def call(self, input): 42 | bs, h, w, c = input.get_shape() 43 | p_out = self.position_attention_module(input) 44 | c_out = self.channel_attention_module(input) 45 | p_out = tf.reshape(p_out, shape=(bs, h, w, c)) 46 | c_out = tf.reshape(tf.transpose(c_out, perm=[0, 2, 1]), shape=(bs, h, w, c)) 47 | return p_out + c_out 48 | 49 | 50 | if __name__ == '__main__': 51 | input = tf.random.normal((50, 7, 7, 512)) 52 | danet = DAModule(d_model=512, kernel_size=3, H=7, W=7) 53 | print(danet(input).shape) 54 | -------------------------------------------------------------------------------- /attention/ECAAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | 5 | class ECAAttention(layers.Layer): 6 | def __init__(self, kernel_size=3): 7 | super(ECAAttention, self).__init__() 8 | self.gap = layers.GlobalAvgPool2D() 9 | self.conv = layers.Conv1D(1, kernel_size=kernel_size, padding='same') 10 | self.sigmoid = tf.sigmoid 11 | 12 | def call(self, x): 13 | y = self.gap(x) # bs, 1, 1, c 14 | y = tf.expand_dims(y, -1) # bs, c, 1 15 | y = self.conv(y) # bs, c, 1 16 | y = self.sigmoid(y) # bs, c, 1 17 | y = tf.transpose(tf.expand_dims(y, -1), (0, 2, 3, 1)) # bs, 1, 1, c 18 | return x * tf.broadcast_to(y, x.get_shape()) 19 | 20 | 21 | if __name__ == '__main__': 22 | input = tf.random.normal((50, 7, 7, 512)) 23 | eca = ECAAttention(kernel_size=3) 24 | output = eca(input) 25 | print(output.shape) 26 | -------------------------------------------------------------------------------- /attention/EMSA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers, Sequential 4 | 5 | 6 | class EMSA(layers.Layer): 7 | def __init__(self, d_model, d_k, d_v, h, droupout=.1, H=7, W=7, ratio=3, apply_transform=True): 8 | super(EMSA, self).__init__() 9 | self.H = H 10 | self.W = W 11 | self.fc_q = layers.Dense(h * d_k) 12 | self.fc_k = layers.Dense(h * d_k) 13 | self.fc_v = layers.Dense(h * d_v) 14 | self.fc_o = layers.Dense(d_model) 15 | self.dropout = layers.Dropout(droupout) 16 | 17 | self.ratio = ratio 18 | if self.ratio > 1: 19 | self.sr = Sequential() 20 | self.sr_conv = layers.Conv2D(d_model, kernel_size=ratio + 1, strides=ratio, padding='same', groups=d_model) 21 | self.sr_ln = layers.LayerNormalization() 22 | 23 | self.apply_transform = apply_transform and h > 1 24 | if self.apply_transform: 25 | self.transform = Sequential() 26 | self.transform.add(layers.Conv2D(h, kernel_size=1, strides=1, data_format='channels_first')) 27 | self.transform.add(layers.Activation(tf.nn.softmax)) 28 | ''' 29 | Batch Normalisation(axis是沿着channel): 
就是强行将数据拉回到均值为0,方差为1的正太分布上,这样不仅数据分布一致,而且避免发生梯度消失。依赖于batch的大小和输入sequence的深度。 30 | Layer Normalisation(axis是沿着batch): LN不依赖于batch的大小和输入sequence的深度,因此可以用于batchsize为1和RNN中对边长的输入sequence的normalize操作。LN用于RNN效果比较明显,但是在CNN上,不如BN。 31 | Instance Normalisation(axis是沿着batch和channel): 同BN注重对每个batch进行归一化,保证数据分布一致,因为判别模型中结果取决于数据整体分布。但是图像风格化中,生成结果主要依赖于某个图像实例,所以对整个batch归一化不适合图像风格化中,因而对HW做归一化。可以加速模型收敛,并且保持每个图像实例之间的独立。 32 | Group Normalization: 主要是针对Batch Normalization对小batchsize效果差,GN将channel方向分group,然后每个group内做归一化,算(C//G)*H*W的均值,这样与batchsize无关,不受其约束。 33 | ''' 34 | self.transform.add(layers.BatchNormalization(axis=[0, 1])) # InstanceNormalisation,[0, 1] is bs and c. 35 | # self.transform.add(tfa.layers.InstanceNormalization()) 36 | 37 | self.d_model = d_model 38 | self.d_k = d_k 39 | self.d_v = d_v 40 | self.h = h 41 | 42 | def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int): 43 | # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] 44 | tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.h, self.d_model)) 45 | 46 | # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] 47 | return tf.transpose(tensor, perm=[0, 2, 1, 3]) 48 | 49 | def call(self, queries, keys, values, attention_mask=None, attention_weights=None): 50 | 51 | b_s, nq, c = queries.get_shape() 52 | 53 | q = self.transpose_for_scores(self.fc_q(queries), batch_size=b_s) # (b_s, h, nq, d_k) 54 | 55 | if self.ratio > 1: 56 | x = tf.reshape(queries, shape=[b_s, self.H, self.W, c]) # (b_s, H, W, c) 57 | x = self.sr_conv(x) # (b_s, h, w, c) 58 | x = tf.reshape(x, shape=[b_s, -1, c]) # (bs, n', c) 59 | x = self.sr_ln(x) 60 | k = self.transpose_for_scores(self.fc_k(x), batch_size=b_s) # (bs, h, n', d_k) 61 | v = self.transpose_for_scores(self.fc_v(x), batch_size=b_s) # (bs, h, n', d_v) 62 | else: 63 | k = self.transpose_for_scores(self.fc_k(keys), batch_size=b_s) # (bs, h, nk, d_k) 64 | v = self.transpose_for_scores(self.fc_v(values), batch_size=b_s) # (bs, h, nk, d_v) 65 | 66 | if self.apply_transform: 67 | att = tf.matmul(q, k, transpose_b=True) / np.sqrt(self.d_k) # (bs, h, nq, n') 68 | att = self.transform(att) 69 | else: 70 | att = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(self.d_k) # (bs, h, nq, n') 71 | att = tf.math.softmax(att, -1) # (bs, h, nq, n') 72 | 73 | if attention_weights is not None: 74 | att = att * attention_weights 75 | if attention_mask is not None: 76 | att = tf.multiply(att, attention_mask) 77 | 78 | att = self.dropout(att) 79 | 80 | out = tf.reshape(tf.transpose(tf.matmul(att, v), perm=[0, 2, 1, 3]), 81 | shape=(b_s, nq, self.h * self.d_v)) # (bs, nq, h*d_v) 82 | out = self.fc_o(out) 83 | return out 84 | 85 | 86 | if __name__ == '__main__': 87 | input = tf.random.normal((50, 64, 512)) 88 | emsa = EMSA(d_model=512, d_k=512, d_v=512, h=8, H=8, W=8, ratio=2, apply_transform=True) 89 | output = emsa(input, input, input) 90 | print(output.shape) 91 | -------------------------------------------------------------------------------- /attention/ExternalAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow.keras import layers 4 | 5 | 6 | class ExternalAttention(layers.Layer): 7 | 8 | def __init__(self, d_model, S=64): 9 | super(ExternalAttention, self).__init__(name='ExternalAttention') 10 | self.mk = layers.Dense(S, use_bias=False) 11 | 
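        # mk and mv are the two external "memory units": mk maps each token onto S
        # learned memory slots, and mv (defined just below) maps the doubly-normalised
        # attention over those slots back to d_model, so the cost is linear in
        # sequence length.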
self.mv = layers.Dense(d_model, use_bias=False) 12 | 13 | def call(self, queries): 14 | attn = self.mk(queries) # bs,n,S 15 | attn = tf.nn.softmax(attn, axis=1) # bs,n,S 16 | attn = attn / tf.reduce_sum(attn, axis=2, keepdims=True) # bs,n,S (l1_norm) 17 | out = self.mv(attn) # bs,n,d_model 18 | 19 | return out 20 | 21 | 22 | if __name__ == '__main__': 23 | input = tf.random.normal(shape=(50, 49, 512)) 24 | ea = ExternalAttention(d_model=512, S=8) 25 | output = ea(input) 26 | print(output.shape) 27 | -------------------------------------------------------------------------------- /attention/HaloAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class HaloAttention(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | halo = HaloAttention(512, 128, 128, True) 10 | output = halo(input) 11 | print(output.shape) 12 | # 参考https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/halonet/halonet.py -------------------------------------------------------------------------------- /attention/MUSEAttention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers 4 | 5 | 6 | class Depth_Pointwise_Conv1d(layers.Layer): 7 | def __init__(self, in_ch, out_ch, k): 8 | super(Depth_Pointwise_Conv1d, self).__init__() 9 | if k == 1: 10 | self.depth_conv = tf.identity 11 | else: 12 | self.depth_conv = layers.Conv1D( 13 | filters=in_ch, 14 | kernel_size=k, 15 | groups=in_ch, 16 | padding='same' 17 | ) 18 | self.pointwise_conv = layers.Conv1D( 19 | filters=out_ch, 20 | kernel_size=1, 21 | groups=1 22 | ) 23 | 24 | def call(self, x): 25 | depth_conv_out = self.depth_conv(x) 26 | out = self.pointwise_conv(depth_conv_out) 27 | return out 28 | 29 | 30 | class MUSEAttention(layers.Layer): 31 | def __init__(self, d_model, d_k, d_v, h, dropout=1): 32 | super(MUSEAttention, self).__init__() 33 | self.fc_q = layers.Dense(h * d_k) 34 | self.fc_k = layers.Dense(h * d_k) 35 | self.fc_v = layers.Dense(h * d_v) 36 | self.fc_o = layers.Dense(d_model) 37 | self.dropout = layers.Dropout(dropout) 38 | 39 | self.conv1 = Depth_Pointwise_Conv1d(h * d_v, d_model, 1) 40 | self.conv3 = Depth_Pointwise_Conv1d(h * d_v, d_model, 3) 41 | self.conv5 = Depth_Pointwise_Conv1d(h * d_v, d_model, 5) 42 | self.dy_paras = tf.Variable(tf.ones(3), trainable=True) 43 | self.softmax = tf.nn.softmax 44 | 45 | self.d_model = d_model 46 | self.d_k = d_k 47 | self.d_v = d_v 48 | self.h = h 49 | 50 | def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int): 51 | # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] 52 | tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.h, self.d_model)) 53 | 54 | # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] 55 | return tf.transpose(tensor, perm=[0, 2, 1, 3]) 56 | 57 | def call(self, queries, keys, values, attention_mask=None, attention_weights=None): 58 | 59 | # Self Attention 60 | b_s, nq = queries.shape[:2] 61 | nk = keys.shape[1] 62 | 63 | q = self.transpose_for_scores(self.fc_q(queries), batch_size=b_s) # (b_s, h, nq ,d_k) 64 | k = self.transpose_for_scores(self.fc_k(keys), 
batch_size=b_s) # (b_s, h, nk, d_k) 65 | v = self.transpose_for_scores(self.fc_v(values), batch_size=b_s) # (b_s, h, nk ,d_v) 66 | 67 | # Take the dot product between "query" and "key" to get the raw attention scores. 68 | # (batch size, num_heads, seq_len_q, seq_len_k) 69 | att = tf.matmul(q, k, transpose_b=True) / np.sqrt(self.d_k) 70 | 71 | if attention_weights is not None: 72 | att = att * attention_weights 73 | if attention_mask is not None: 74 | att = tf.multiply(att, attention_mask) 75 | 76 | # Normalize the attention scores to probabilities. 77 | att = self.softmax(att, -1) 78 | 79 | att = self.dropout(att) 80 | 81 | out = tf.reshape(tf.transpose(tf.matmul(att, v), (0, 2, 1, 3)), 82 | (b_s, nq, self.h * self.d_v)) # (b_s, nq, h*d_v) 83 | out = self.fc_o(out) # (b_s, nq, d_model) 84 | 85 | v2 = tf.reshape(tf.transpose(v, (0, 2, 1, 3)), (b_s, nk, -1)) # bs, dim, nk 86 | self.dy_paras = tf.Variable(self.softmax(self.dy_paras, -1)) 87 | 88 | out2 = self.dy_paras[0] * self.conv1(v2) + self.dy_paras[1] * self.conv3(v2) + self.dy_paras[2] * self.conv5(v2) 89 | # out2 = tf.transpose(out2, (0, 2, 1)) # bs, n, dim 90 | 91 | out = out + out2 92 | return out 93 | 94 | 95 | if __name__ == '__main__': 96 | input = tf.random.normal((50, 49, 512)) 97 | sa = MUSEAttention(d_model=512, d_k=512, d_v=512, h=8) 98 | output = sa(input, input, input) 99 | print(output.shape) 100 | -------------------------------------------------------------------------------- /attention/MobileViTAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class MobileViTAttention(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = MobileViTAttention(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/OutlookAttention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import tensorflow as tf 4 | from tensorflow.keras import layers 5 | 6 | 7 | class OutlookAttention(layers.Layer): 8 | def __init__(self, dim, num_heads=1, kernel_size=3, padding=1, stride=1, qkv_bias=False, attn_drop=0.1): 9 | super(OutlookAttention, self).__init__() 10 | self.dim = dim 11 | self.num_heads = num_heads 12 | self.head_dim = dim // num_heads 13 | self.kernel_size = kernel_size 14 | self.padding = padding 15 | self.stride = stride 16 | self.scale = self.head_dim ** (-0.5) 17 | 18 | self.v_pj = layers.Dense(dim, use_bias=qkv_bias) 19 | self.attn = layers.Dense(kernel_size ** 4 * num_heads) 20 | 21 | self.attn_drop = layers.Dropout(attn_drop) 22 | self.proj = layers.Dense(dim) 23 | self.proj_drop = layers.Dropout(attn_drop) 24 | 25 | self.unflod = tf.image.extract_patches(sizes=[1, kernel_size, kernel_size, 1], strides=[1, stride, stride, 1], 26 | padding='same') 27 | self.pool = layers.AvgPool2D(pool_size=stride, strides=stride, ceil_mode=True) 28 | 29 | def call(self, x): 30 | B, H, W, C = x.get_shape() 31 | 32 | # 映射到新的特征v 33 | v = self.v_pj(x) 34 | h, w = math.ceil(H / self.stride), math.ceil(W / self.stride) 35 | v = tf.reshape(self.unflod(v), (B, self.num_heads, h * w, self.kernel_size * self.kernel_size, self.head_dim)) 36 | 37 | # 生成Attention Map 38 | attn = self.pool(x) 39 | attn = tf.reshape(self.attn(attn), 40 | (B, self.num_heads, h * w, self.kernel_size * self.kernel_size, 41 | self.kernel_size * 
self.kernel_size)) 42 | 43 | attn = self.scale * attn 44 | attn = tf.nn.softmax(attn, axis=-1) 45 | attn = self.attn_drop(attn) 46 | 47 | # 获取weighted特征 48 | out = tf.reshape((attn @ v), (B, h*w, C * self.kernel_size * self.kernel_size)) 49 | out = tf.fold # torch.nn.fold开发者说没有这个功能,未来没打算加,以后再补充。见https://github.com/tensorflow/tensorflow/issues/52195#issuecomment-948915934 50 | 51 | 52 | if __name__ == '__main__': 53 | input = tf.random.normal((50, 7, 7, 512)) 54 | outlook = OutlookAttention(512, 128, 128, True) 55 | output = outlook(input) 56 | print(output.shape) 57 | -------------------------------------------------------------------------------- /attention/PSA.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers, Sequential 3 | 4 | 5 | class PSA(layers.Layer): 6 | def __init__(self, channel=512, reduction=4, S=4): 7 | super(PSA, self).__init__() 8 | self.S = S 9 | self.convs = [] 10 | for i in range(S): 11 | self.convs.append(layers.Conv2D(channel // S, kernel_size=2 * (i + 1) + 1, padding='same')) 12 | 13 | self.se_blocks = [] 14 | for i in range(S): 15 | self.se_blocks.append(Sequential([ 16 | layers.GlobalAvgPool2D(keepdims=True), 17 | layers.Conv2D(channel // (S * reduction), kernel_size=1, use_bias=False), 18 | layers.Activation('relu'), 19 | layers.Conv2D(channel // S, kernel_size=1, use_bias=False), 20 | layers.Activation('sigmoid') 21 | ])) 22 | 23 | self.softmax = tf.nn.softmax 24 | 25 | def call(self, x): 26 | b, h, w, c = x.get_shape() 27 | 28 | # Step1: SPC module 29 | SPC_out = tf.reshape(x, shape=(b, h, w, self.S, c // self.S)) # bs, h, w, s, ci 30 | SPC_out_list = [] 31 | for idx, conv in enumerate(self.convs): 32 | SPC_out_list.append(conv(SPC_out[:, :, :, idx, :])) 33 | 34 | SPC_out = tf.stack(SPC_out_list, axis=3) 35 | 36 | # Step2: SE weight 37 | se_out = [] 38 | for idx, se in enumerate(self.se_blocks): 39 | se_out.append((se(SPC_out[:, :, :, idx, :]))) 40 | SE_out = tf.stack(se_out, axis=3) 41 | SE_out = tf.broadcast_to(SE_out, SPC_out.get_shape()) 42 | 43 | # Step3: Softmax 44 | softmax_out = self.softmax(SE_out) 45 | 46 | # Step4: SPA 47 | PSA_out = SPC_out * softmax_out 48 | PSA_out = tf.reshape(PSA_out, shape=(b, h, w, -1)) 49 | 50 | return PSA_out 51 | 52 | 53 | if __name__ == '__main__': 54 | input = tf.random.normal((50, 7, 7, 512)) 55 | psa = PSA(channel=512, reduction=8) 56 | output = psa(input) 57 | print(output.shape) 58 | -------------------------------------------------------------------------------- /attention/ParNetAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class AFT_FULL(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = AFT_FULL(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/PolarizedSelfAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Layer, Conv2D, Softmax, LayerNormalization, Activation 3 | import tensorflow.keras.backend as K 4 | 5 | class SequentialPolarizedSelfAttention(Layer): 6 | def __init__(self, channel=512, **kwargs): 7 | super(SequentialPolarizedSelfAttention, self).__init__(**kwargs) 8 | self.channel = channel 9 | self.ch_wv = 
Conv2D(channel // 2, kernel_size=(1, 1), padding='same') 10 | self.ch_wq = Conv2D(1, kernel_size=(1, 1), padding='same') 11 | self.softmax_channel = Softmax(axis=1) 12 | self.softmax_spatial = Softmax(axis=-1) 13 | self.ch_wz = Conv2D(channel, kernel_size=(1, 1), padding='same') 14 | self.ln = LayerNormalization(axis=[1, 2, 3]) 15 | self.sigmoid = Activation('sigmoid') 16 | self.sp_wv = Conv2D(channel // 2, kernel_size=(1, 1), padding='same') 17 | self.sp_wq = Conv2D(channel // 2, kernel_size=(1, 1), padding='same') 18 | self.agp = tf.keras.layers.GlobalAveragePooling2D(keepdims=True) 19 | 20 | def call(self, x): 21 | # Channel-only Self-Attention 22 | channel_wv = self.ch_wv(x) # bs, h, w, c//2 23 | channel_wq = self.ch_wq(x) # bs, h, w, 1 24 | channel_wv = tf.reshape(channel_wv, [tf.shape(x)[0], -1, self.channel // 2]) # bs, h*w, c//2 25 | channel_wq = tf.reshape(channel_wq, [tf.shape(x)[0], -1, 1]) # bs, h, w, 1 26 | channel_wq = self.softmax_channel(channel_wq) # bs, h*w, 1 27 | channel_wz = tf.matmul(channel_wv, channel_wq, transpose_a=True) # bs, c//2, 1 28 | channel_wz = tf.reshape(channel_wz, [tf.shape(x)[0], 1, 1, self.channel // 2]) 29 | channel_wz = self.ch_wz(channel_wz) 30 | channel_wz = tf.reshape(channel_wz, [tf.shape(x)[0], 1, 1, self.channel]) 31 | channel_weight = self.sigmoid(self.ln(channel_wz)) # bs, 1, 1, c 32 | channel_out = channel_weight * x 33 | 34 | # Spatial-only Self-Attention 35 | spatial_wv = self.sp_wv(channel_out) # bs, h, w, c//2 36 | spatial_wq = self.sp_wq(channel_out) # bs, h, w, c//2 37 | spatial_wq = self.agp(spatial_wq) # bs, 1, 1, c//2 38 | spatial_wv = tf.reshape(spatial_wv, [tf.shape(x)[0], -1, self.channel // 2]) # bs, h*w, c//2 39 | spatial_wq = tf.reshape(spatial_wq, [tf.shape(x)[0], 1, self.channel // 2]) # bs, 1, c//2 40 | spatial_wq = self.softmax_spatial(spatial_wq) 41 | spatial_wz = tf.matmul(spatial_wq, spatial_wv, transpose_b=True) # bs, 1, h*w, 42 | spatial_weight = self.sigmoid(tf.reshape(spatial_wz, [tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], 1])) # bs, h, w, 1 43 | spatial_out = spatial_weight * channel_out 44 | 45 | return spatial_out 46 | 47 | # Test the SequentialPolarizedSelfAttention layer 48 | if __name__ == '__main__': 49 | input_tensor = tf.random.normal([1, 7, 7, 512]) 50 | psa = SequentialPolarizedSelfAttention(channel=512) 51 | output_tensor = psa(input_tensor) 52 | print(output_tensor.shape) -------------------------------------------------------------------------------- /attention/ResidualAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow.keras.layers import Conv2D 4 | from tensorflow.keras import Model 5 | 6 | 7 | class ResidualAttention(Model): 8 | def __init__(self, num_class=1000, name='ResidualAttention', la=0.2): 9 | super(ResidualAttention, self).__init__(name=name) 10 | self.la = la 11 | self.fc = Conv2D(filters=num_class, kernel_size=1, strides=1, use_bias=False) 12 | 13 | def call(self, x): 14 | x = self.fc(x) 15 | b, h, w, c = x.shape 16 | y_raw = tf.reshape(x, [-1, h * w, c]) # b, hxw, num_class 17 | y_avg = tf.reduce_mean(y_raw, axis=1) # b, num_class 18 | y_max = tf.reduce_max(y_raw, axis=1) # b, num_class 19 | score = y_avg + self.la * y_max 20 | return score 21 | 22 | 23 | if __name__ == '__main__': 24 | input = tf.random.normal(shape=(50, 7, 7, 512)) 25 | resatt = ResidualAttention(num_class=1000, la=0.2) 26 | output = resatt(input) 27 | print(output.shape) 28 | 
-------------------------------------------------------------------------------- /attention/S2Attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class S2Attention(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = S2Attention(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/SEAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers, Sequential 3 | 4 | 5 | class SEAttention(layers.Layer): 6 | def __init__(self, channel=512, reduction=16): 7 | super(SEAttention, self).__init__() 8 | self.avg_pool = layers.GlobalAvgPool2D( 9 | keepdims=True) # 同nn.AdaptiveAvgPool2d(1), 但是注意torch的输出是保持4维的,而tensorflow不保持维度. 10 | self.fc = Sequential([ 11 | layers.Dense(channel // reduction, use_bias=False), 12 | layers.Activation('relu'), 13 | layers.Dense(channel, use_bias=False), 14 | layers.Activation('sigmoid') 15 | ]) 16 | 17 | def call(self, x): 18 | b, h, w, c = x.get_shape() 19 | y = self.avg_pool(x) 20 | y = self.fc(y) 21 | return x * tf.tile(y, (1, h, w, 1)) # or use 'tf.broadcast_to(y, x.get_shape())' 22 | 23 | 24 | if __name__ == '__main__': 25 | input = tf.random.normal((50, 7, 7, 512)) 26 | se = SEAttention(channel=512, reduction=8) 27 | output = se(input) 28 | print(output.shape) 29 | -------------------------------------------------------------------------------- /attention/SGE.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | 5 | class SpatialGroupEnhance(layers.Layer): 6 | def __init__(self, groups): 7 | super(SpatialGroupEnhance, self).__init__() 8 | self.groups = groups 9 | self.avg_pool = layers.GlobalAvgPool2D(keepdims=True) 10 | self.sig = tf.sigmoid 11 | 12 | def build(self, input_shape): 13 | self.weight = self.add_weight(shape=(1, 1, 1, self.groups), initializer='zeros', trainable=True) 14 | self.bias = self.add_weight(shape=(1, 1, 1, self.groups), initializer='zeros', trainable=True) 15 | super(SpatialGroupEnhance, self).build(input_shape) 16 | 17 | def call(self, x): 18 | b, h, w, c = x.get_shape() 19 | x = tf.reshape(x, (b * self.groups, h, w, -1)) # bs*g, h, w, dim//g 20 | xn = x * self.avg_pool(x) # bs*g, h, w, dim//g 21 | xn = tf.reduce_sum(xn, axis=-1, keepdims=True) # bs*g, h, w, 1 22 | t = tf.reshape(xn, (b * self.groups, -1)) # bs*g, h*w 23 | 24 | t = t - tf.reduce_mean(t, axis=-1, keepdims=True) # bs*g, h*w 25 | std = tf.math.reduce_std(t, axis=-1, keepdims=True) + 1e-5 26 | t = t / std # bs*g, h*w 27 | t = tf.reshape(t, (b, h, w, self.groups)) # bs, h, w, g 28 | 29 | t = t * self.weight + self.bias # bs, h, w, g 30 | t = tf.reshape(t, (b * self.groups, h, w, 1)) # bs*g, h, w, 1 31 | x = x * self.sig(t) # bs*g, h, w, dim//g 32 | x = tf.reshape(x, (b, h, w, c)) 33 | 34 | return x 35 | 36 | 37 | if __name__ == '__main__': 38 | input = tf.random.normal((50, 7, 7, 512)) 39 | sge = SpatialGroupEnhance(groups=8) 40 | output = sge(input) 41 | print(output.shape) 42 | -------------------------------------------------------------------------------- /attention/SKAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as 
tf 2 | from tensorflow.keras import layers, Sequential 3 | 4 | 5 | class SKAttention(layers.Layer): 6 | def __init__(self, channel=512, kernels=[1, 3, 5, 7], reduction=16, group=1, L=32): 7 | super(SKAttention, self).__init__() 8 | self.d = max(L, channel // reduction) 9 | self.convs = [] 10 | # self.convs = Sequential([]) 11 | for k in kernels: 12 | self.convs.append( 13 | Sequential([ 14 | layers.Conv2D(channel, kernel_size=k, padding='same', groups=group, name='conv'), 15 | layers.BatchNormalization(name='bn'), 16 | layers.Activation('relu', name='relu'), 17 | ]) 18 | ) 19 | self.fc = layers.Dense(self.d) 20 | self.fcs = [] 21 | for i in range(len(kernels)): 22 | self.fcs.append(layers.Dense(channel)) 23 | 24 | def call(self, x): 25 | bs, _, _, c = x.get_shape() 26 | conv_outs = [] 27 | ### split 28 | for conv in self.convs: 29 | conv_outs.append(conv(x)) 30 | feats = tf.stack(conv_outs, 0) # k, bs, h, w, channel 31 | 32 | ### fuse 33 | U = sum(conv_outs) # bs, h, w, c 34 | 35 | ### reduction channel 36 | S = tf.reduce_mean(tf.reduce_mean(U, axis=-2), axis=-2) # bs, c 37 | Z = self.fc(S) # bs, d 38 | 39 | ### calculate attention weight 40 | weights = [] 41 | for fc in self.fcs: 42 | weight = fc(Z) 43 | weights.append(tf.reshape(weight, (bs, 1, 1, c))) # bs, channel 44 | attention_weughts = tf.stack(weights, 0) # k, bs, 1, 1, channel 45 | attention_weughts = tf.nn.softmax(attention_weughts, axis=0) # k, bs, 1, 1, channel 46 | 47 | ### fuse 48 | V = tf.reduce_sum(attention_weughts * feats, 0) 49 | return V 50 | 51 | 52 | if __name__ == '__main__': 53 | input = tf.random.normal((50, 7, 7, 512)) 54 | se = SKAttention(channel=512, reduction=8) 55 | output = se(input) 56 | print(output.shape) 57 | -------------------------------------------------------------------------------- /attention/SelfAttention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers 4 | 5 | 6 | class ScaledDotProductAttention(layers.Layer): 7 | """ 8 | Scaled dot-product attention 9 | """ 10 | 11 | def __init__(self, d_model, d_k, d_v, h, dropout=.1): 12 | """ 13 | :param d_model: Output dimensionality of the model 14 | :param d_k: Dimensionality of queries and keys 15 | :param d_v: Dimensionality of values 16 | :param h: Number of heads 17 | """ 18 | super(ScaledDotProductAttention, self).__init__() 19 | self.fc_q = layers.Dense(h * d_k) 20 | self.fc_k = layers.Dense(h * d_k) 21 | self.fc_v = layers.Dense(h * d_k) 22 | self.fc_o = layers.Dense(d_model) 23 | self.dropout = layers.Dropout(dropout) 24 | self.softmax = tf.nn.softmax 25 | 26 | self.d_model = d_model 27 | self.d_k = d_k 28 | self.d_v = d_v 29 | self.h = h 30 | 31 | def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int): 32 | # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] 33 | tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.h, self.d_model)) 34 | 35 | # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] 36 | return tf.transpose(tensor, perm=[0, 2, 1, 3]) 37 | 38 | def call(self, queries, keys, values, attention_mask=None, attention_weights=None): 39 | """ 40 | Computs 41 | :param queries: Queries (b_s, nq, d_model) 42 | :param keys: Keys (b_s, nk, d_model) 43 | :param values: Values (b_s, nk, d_model) 44 | :param 
attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking 45 | :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk). 46 | :return: 47 | """ 48 | b_s, nq = queries.shape[:2] 49 | 50 | q = self.transpose_for_scores(self.fc_q(queries), batch_size=b_s) # (b_s, h, nq ,d_k) 51 | k = self.transpose_for_scores(self.fc_k(keys), batch_size=b_s) # (b_s, h, nk, d_k) 52 | v = self.transpose_for_scores(self.fc_v(values), batch_size=b_s) # (b_s, h, nk ,d_v) 53 | 54 | # Take the dot product between "query" and "key" to get the raw attention scores. 55 | # (batch size, num_heads, seq_len_q, seq_len_k) 56 | att = tf.matmul(q, k, transpose_b=True) / np.sqrt(self.d_k) 57 | 58 | if attention_weights is not None: 59 | att = att * attention_weights 60 | if attention_mask is not None: 61 | att = tf.multiply(att, attention_mask) 62 | 63 | # Normalize the attention scores to probabilities. 64 | att = self.softmax(att, -1) 65 | 66 | att = self.dropout(att) 67 | 68 | out = tf.reshape(tf.transpose(tf.matmul(att, v), (0, 2, 1, 3)), 69 | (b_s, nq, self.h * self.d_v)) # (b_s, nq, h*d_v) 70 | out = self.fc_o(out) # (b_s, nq, d_model) 71 | return out 72 | 73 | 74 | if __name__ == '__main__': 75 | input = tf.random.normal((50, 49, 512)) 76 | sa = ScaledDotProductAttention(d_model=512, d_k=512, d_v=512, h=8) 77 | output = sa(input, input, input) 78 | print(output.shape) 79 | -------------------------------------------------------------------------------- /attention/ShuffleAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_addons as tfa 3 | from tensorflow.keras import layers 4 | 5 | 6 | class ShuffleAttention(layers.Layer): 7 | def __init__(self, channel=512, reduction=16, G=8): 8 | super(ShuffleAttention, self).__init__() 9 | self.G = G 10 | self.channel = channel 11 | self.avg_pool = layers.GlobalAvgPool2D(keepdims=True) 12 | self.gn = tfa.layers.GroupNormalization(channel // (2 * G), axis=-1) 13 | self.sigmoid = tf.nn.sigmoid 14 | 15 | def build(self, input_shape): 16 | self.cweight = self.add_weight( 17 | shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='zeros', trainable=True, 18 | ) 19 | self.cbias = self.add_weight( 20 | shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='ones', trainable=True, 21 | ) 22 | self.sweight = self.add_weight( 23 | shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='zeros', trainable=True, 24 | ) 25 | self.sbias = self.add_weight( 26 | shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='ones', trainable=True, 27 | ) 28 | super(ShuffleAttention, self).build(input_shape) 29 | 30 | @staticmethod 31 | def channel_shuffle(x, groups): 32 | b, h, w, c = x.get_shape() 33 | x = tf.reshape(x, shape=(b, h, w, groups, -1)) 34 | x = tf.transpose(x, perm=(0, 1, 2, 4, 3)) 35 | 36 | # flatten 37 | x = tf.reshape(x, shape=(b, h, w, -1)) 38 | return x 39 | 40 | def call(self, x): 41 | b, h, w, c = x.get_shape() 42 | # group into subfeatures 43 | x = tf.reshape(x, (b * self.G, h, w, -1)) # bs*G, h, w, c//G 44 | 45 | # channel_split 46 | x_0, x_1 = tf.split(x, num_or_size_splits=2, axis=3) # bs*G, h, w, c//(2*G) 47 | 48 | # channel attention 49 | x_channel = self.avg_pool(x_0) # bs*G, 1, 1, c//(2*G) 50 | x_channel = self.cweight * x_channel + self.cbias # bs*G, 1, 1, c//(2*G) 51 | x_channel = x_0 * self.sigmoid(x_channel) # bs*G, h, w, c//(2*G) 52 | 53 | # spatial attention 54 | x_spatial = self.gn(x_1) # bs*G, h, w, c//(2*G) 
55 | x_spatial = self.sweight * x_spatial + self.sbias # bs*G, h, w, c//(2*G) 56 | x_spatial = x_1 * self.sigmoid(x_spatial) # bs*G, h, w, c//(2*G) 57 | 58 | # concatenate along channel axis 59 | out = tf.concat([x_channel, x_spatial], axis=3) 60 | out = tf.reshape(out, (b, h, w, -1)) 61 | 62 | # channel shuffle 63 | out = self.channel_shuffle(out, 2) 64 | 65 | return out 66 | 67 | 68 | if __name__ == '__main__': 69 | input = tf.random.normal((50, 7, 7, 512)) 70 | se = ShuffleAttention(channel=512, G=8) 71 | output = se(input) 72 | print(output.shape) 73 | -------------------------------------------------------------------------------- /attention/SimplifiedSelfAttention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers 4 | 5 | 6 | class SimplifiedScaledDotProductAttention(layers.Layer): 7 | """ 8 | Scaled dot-product attention 9 | """ 10 | 11 | def __init__(self, d_model, h, dropout=.1): 12 | """ 13 | :param d_model: Output dimensionality of the model 14 | :param d_k: Dimensionality of queries and keys 15 | :param d_v: Dimensionality of values 16 | :param h: Number of heads 17 | """ 18 | super(SimplifiedScaledDotProductAttention, self).__init__() 19 | 20 | self.d_model = d_model 21 | self.d_k = d_model // h 22 | self.d_v = d_model // h 23 | self.h = h 24 | 25 | self.fc_o = layers.Dense(d_model) 26 | self.dropout = layers.Dropout(dropout) 27 | 28 | def call(self, queries, keys, values, attention_mask=None, attention_weights=None): 29 | ''' 30 | Computes 31 | :param queries: Queries (b_s, nq, d_model) 32 | :param keys: Keys (b_s, nk, d_model) 33 | :param values: Values (b_s, nk, d_model) 34 | :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking. 35 | :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk). 
36 | :return: 37 | ''' 38 | b_s, nq = queries.shape[:2] 39 | nk = keys.shape[1] 40 | 41 | q = tf.transpose(tf.reshape(queries, (b_s, nq, self.h, self.d_k)), (0, 2, 1, 3)) # (b_s, h, nq, d_k) 42 | k = tf.transpose(tf.reshape(keys, (b_s, nk, self.h, self.d_k)), (0, 2, 3, 1)) # (b_s, h, d_k, nk) 43 | v = tf.transpose(tf.reshape(values, (b_s, nk, self.h, self.d_v)), (0, 2, 1, 3)) # (b_s, h, nk, d_v) 44 | 45 | att = tf.matmul(q, k) / np.sqrt(self.d_k) # (b_s, h, nq, nk) 46 | if attention_weights is not None: 47 | att = att * attention_weights 48 | if attention_mask is not None: 49 | att = att.masked_fill(attention_mask, -np.inf) 50 | att = tf.nn.softmax(att, -1) 51 | att = self.dropout(att) 52 | 53 | out = tf.reshape(tf.transpose(tf.matmul(att, v), (0, 2, 1, 3)), 54 | (b_s, nq, self.h * self.d_v)) # (b_s, nq, h*d_v) 55 | out = self.fc_o(out) # (b_s, nq, d_model) 56 | return out 57 | 58 | 59 | if __name__ == '__main__': 60 | input = tf.random.normal((50, 49, 512)) 61 | ssa = SimplifiedScaledDotProductAttention(d_model=512, h=8) 62 | output = ssa(input, input, input) 63 | print(output.shape) 64 | -------------------------------------------------------------------------------- /attention/TripletAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class TripletAttention(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = TripletAttention(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/UFOAttention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class AFT_FULL(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = AFT_FULL(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/ViP.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | 5 | class MLP(layers.Layer): 6 | def __init__(self, hidden_features, out_features, drop=0.1): 7 | super(MLP, self).__init__() 8 | self.fc1 = layers.Dense(hidden_features, activation='gelu') 9 | self.fc2 = layers.Dense(out_features) 10 | self.drop = layers.Dropout(drop) 11 | 12 | def call(self, x): 13 | return self.drop(self.fc2(self.drop(self.fc1(x)))) 14 | 15 | 16 | class WeightedPermuteMLP(layers.Layer): 17 | def __init__(self, dim, seg_dim=8, qkv_bias=False, proj_drop=0.): 18 | super(WeightedPermuteMLP, self).__init__() 19 | self.seg_dim = seg_dim 20 | self.mlp_c = layers.Dense(dim, use_bias=qkv_bias) 21 | self.mlp_h = layers.Dense(dim, use_bias=qkv_bias) 22 | self.mlp_w = layers.Dense(dim, use_bias=qkv_bias) 23 | 24 | self.reweighting = MLP(dim // 4, dim * 3) 25 | 26 | self.proj = layers.Dense(dim) 27 | self.proj_drop = layers.Dropout(proj_drop) 28 | 29 | def call(self, x): 30 | B, H, W, C = x.get_shape() 31 | 32 | c_embed = self.mlp_c(x) 33 | 34 | S = C // self.seg_dim 35 | h_embed = tf.reshape(tf.transpose(tf.reshape(x, (B, H, W, self.seg_dim, S)), (0, 3, 2, 1, 4)), 36 | (B, self.seg_dim, W, H * S)) 37 | h_embed = tf.reshape(tf.transpose(tf.reshape(self.mlp_h(h_embed), (B, self.seg_dim, W, H, S)), 
(0, 3, 2, 1, 4)), 38 | (B, H, W, C)) 39 | 40 | w_embed = tf.reshape(tf.transpose(tf.reshape(x, (B, H, W, self.seg_dim, S)), (0, 3, 2, 1, 4)), 41 | (B, self.seg_dim, W, H * S)) 42 | w_embed = tf.reshape(tf.transpose(tf.reshape(self.mlp_w(w_embed), (B, self.seg_dim, W, H, S)), (0, 3, 2, 1, 4)), 43 | (B, H, W, C)) 44 | 45 | weight = tf.reduce_mean(tf.reshape(tf.transpose((c_embed + h_embed + w_embed), (0, 3, 1, 2)), (B, C, -1)), 46 | axis=2) 47 | weight = tf.expand_dims(tf.expand_dims( 48 | tf.nn.softmax(tf.transpose(tf.reshape(self.reweighting(weight), (B, C, 3)), (2, 0, 1)), axis=0), axis=2), 49 | axis=2) 50 | 51 | x = c_embed * weight[0] + w_embed * weight[1] + h_embed * weight[2] 52 | 53 | x = self.proj_drop(self.proj(x)) 54 | 55 | return x 56 | 57 | 58 | if __name__ == '__main__': 59 | input = tf.random.normal((64, 8, 8, 512)) 60 | seg_dim = 8 61 | vip = WeightedPermuteMLP(512, seg_dim) 62 | output = vip(input) 63 | print(output.shape) 64 | -------------------------------------------------------------------------------- /attention/gfnet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | class GFNet(layers.Layer): 5 | pass 6 | 7 | if __name__ == '__main__': 8 | input = tf.random.normal((50, 7, 7, 512)) 9 | a2 = GFNet(512, 128, 128, True) 10 | output = a2(input) 11 | print(output.shape) -------------------------------------------------------------------------------- /attention/img/A2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/A2.png -------------------------------------------------------------------------------- /attention/img/AFT.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/AFT.jpg -------------------------------------------------------------------------------- /attention/img/BAM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/BAM.png -------------------------------------------------------------------------------- /attention/img/CBAM1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CBAM1.png -------------------------------------------------------------------------------- /attention/img/CBAM2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CBAM2.png -------------------------------------------------------------------------------- /attention/img/CoAtNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CoAtNet.png -------------------------------------------------------------------------------- /attention/img/CoT.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CoT.png -------------------------------------------------------------------------------- /attention/img/CondConv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CondConv.png -------------------------------------------------------------------------------- /attention/img/ConvMixer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ConvMixer.png -------------------------------------------------------------------------------- /attention/img/CoordAttention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CoordAttention.png -------------------------------------------------------------------------------- /attention/img/DepthwiseSeparableConv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/DepthwiseSeparableConv.png -------------------------------------------------------------------------------- /attention/img/DynamicConv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/DynamicConv.png -------------------------------------------------------------------------------- /attention/img/ECA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ECA.png -------------------------------------------------------------------------------- /attention/img/EMSA.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/EMSA.jpg -------------------------------------------------------------------------------- /attention/img/EMSA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/EMSA.png -------------------------------------------------------------------------------- /attention/img/External_Attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/External_Attention.png -------------------------------------------------------------------------------- /attention/img/GFNet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/GFNet.jpg -------------------------------------------------------------------------------- 
/attention/img/HaloNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/HaloNet.png -------------------------------------------------------------------------------- /attention/img/Infine-attention.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/Infine-attention.jpeg -------------------------------------------------------------------------------- /attention/img/Involution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/Involution.png -------------------------------------------------------------------------------- /attention/img/MBConv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MBConv.jpg -------------------------------------------------------------------------------- /attention/img/MUSE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MUSE.png -------------------------------------------------------------------------------- /attention/img/MUSE2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MUSE2.jpg -------------------------------------------------------------------------------- /attention/img/MobileViTAttention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MobileViTAttention.png -------------------------------------------------------------------------------- /attention/img/MobileViTv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MobileViTv2.png -------------------------------------------------------------------------------- /attention/img/OutlookAttention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/OutlookAttention.png -------------------------------------------------------------------------------- /attention/img/ParNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ParNet.png -------------------------------------------------------------------------------- /attention/img/PoSA.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/PoSA.png -------------------------------------------------------------------------------- /attention/img/ResAtt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ResAtt.png -------------------------------------------------------------------------------- /attention/img/S2Attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/S2Attention.png -------------------------------------------------------------------------------- /attention/img/SA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SA.png -------------------------------------------------------------------------------- /attention/img/SE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SE.png -------------------------------------------------------------------------------- /attention/img/SGE.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SGE.jpg -------------------------------------------------------------------------------- /attention/img/SGE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SGE.png -------------------------------------------------------------------------------- /attention/img/SK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SK.png -------------------------------------------------------------------------------- /attention/img/SSA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SSA.png -------------------------------------------------------------------------------- /attention/img/ShuffleAttention.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ShuffleAttention.jpg -------------------------------------------------------------------------------- /attention/img/ShuffleAttention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ShuffleAttention.png -------------------------------------------------------------------------------- /attention/img/UFO.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/UFO.png -------------------------------------------------------------------------------- /attention/img/ViP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ViP.png -------------------------------------------------------------------------------- /attention/img/acnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/acnet.png -------------------------------------------------------------------------------- /attention/img/danet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/danet.png -------------------------------------------------------------------------------- /attention/img/danet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/danet2.png -------------------------------------------------------------------------------- /attention/img/ddb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ddb.png -------------------------------------------------------------------------------- /attention/img/gMLP.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/gMLP.jpg -------------------------------------------------------------------------------- /attention/img/mlpmixer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/mlpmixer.png -------------------------------------------------------------------------------- /attention/img/mobileViT.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/mobileViT.jpg -------------------------------------------------------------------------------- /attention/img/psa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/psa.jpg -------------------------------------------------------------------------------- /attention/img/psa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/psa.png -------------------------------------------------------------------------------- /attention/img/psa2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/psa2.jpg -------------------------------------------------------------------------------- /attention/img/repmlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/repmlp.png -------------------------------------------------------------------------------- /attention/img/repvgg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/repvgg.png -------------------------------------------------------------------------------- /attention/img/resmlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resmlp.png -------------------------------------------------------------------------------- /attention/img/resnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resnet.png -------------------------------------------------------------------------------- /attention/img/resnet2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resnet2.jpg -------------------------------------------------------------------------------- /attention/img/resnext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resnext.png -------------------------------------------------------------------------------- /attention/img/sMLP.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/sMLP.jpg -------------------------------------------------------------------------------- /attention/img/triplet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/triplet.png -------------------------------------------------------------------------------- /conv/MBConv.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | 4 | def drop_connect(inputs, p, training): 5 | """Drop the entire conv with given survival probability.""" 6 | # "Deep Networks with Stochastic Depth", https://arxiv.org/pdf/1603.09382.pdf 7 | if not training: return inputs 8 | 9 | # Compute tensor. 
10 | batch_size = tf.shape(inputs)[0] 11 | keep_prob = 1 - p 12 | random_tensor = keep_prob 13 | random_tensor += tf.random.uniform([batch_size, 1, 1, 1], dtype=inputs.dtype) 14 | binary_tensor = tf.floor(random_tensor) 15 | # Unlike conventional way that multiply survival_prob at test time, here we 16 | # divide survival_prob at training time, such that no addition compute is 17 | # needed at test time. 18 | output = inputs / keep_prob * binary_tensor 19 | return output 20 | 21 | class MBConvBlock(layers.Layer): 22 | """A class of MBVonv: Mobile Inverted Residual Bottleneck. 23 | Attributes: 24 | endpoints: dict. A list of internal tensors. 25 | 层:ksize=3*3 输入32 输出16 conv1 stride1 26 | """ 27 | 28 | def __init__(self, ksize, input_filters, output_filters, expand_ratio=1, stride=1, name=None): 29 | super().__init__(name=name) 30 | 31 | self._bn_mom = 0.1 # batch norm momentum 32 | self._bn_eps = 0.1 # batch norm epsilon 33 | self._se_ratio = 0.25 34 | self._input_filters = input_filters 35 | self._output_filters = output_filters 36 | self._expand_ratio = expand_ratio 37 | self.kernel_size = ksize 38 | self._stride = stride 39 | 40 | inp = self._input_filters 41 | oup = self._input_filters * self._expand_ratio 42 | if self._expand_ratio != 1: 43 | self._expand_conv = layers.Conv2D(filters=oup, kernel_size=1, padding='same', use_bias=False) 44 | self._bn0 = layers.BatchNormalization(momentum=self._bn_mom, epsilon=self._bn_eps) 45 | 46 | # Depthwise convolution 47 | k = self.kernel_size 48 | s = self._stride 49 | self._depthwise_conv = layers.Conv2D(filters=oup, groups=oup, kernel_size=k, strides=s, padding='same', 50 | use_bias=False) 51 | self._bn1 = layers.BatchNormalization(momentum=self._bn_mom, epsilon=self._bn_eps) 52 | 53 | # Squeeze and Excitation layer, if desired 54 | num_squeezed_channels = max(1, self._input_filters * self._se_ratio) # num reduced filters 55 | self._se_reduce = layers.Conv2D(filters=num_squeezed_channels, kernel_size=1, padding='same') 56 | self._se_expand = layers.Conv2D(filters=oup, kernel_size=1, padding='same') 57 | 58 | # Output phase 59 | final_oup = self._output_filters 60 | self._project_conv = layers.Conv2D(filters=final_oup, kernel_size=1, padding='same', use_bias=False) 61 | self._bn2 = layers.BatchNormalization(momentum=self._bn_mom, epsilon=self._bn_eps) 62 | self._swish = tf.nn.swish # Swish 是一种新型激活函数,公式为: f(x) = x · sigmoid(x) 63 | 64 | def call(self, inputs, drop_connect_rate=None): 65 | # Expansion and Depthwise Convolution 66 | x = inputs 67 | if self._expand_ratio != 1: 68 | expand = self._expand_conv(x) 69 | bn0 = self._bn0(expand) 70 | x = self._swish(bn0) 71 | depthwise = self._depthwise_conv(x) 72 | bn1 = self._bn1(depthwise) 73 | x = self._swish(bn1) 74 | 75 | # Squeeze and Excitation 76 | h_axis, w_axis = [1, 2] 77 | x_squeezed = tf.nn.avg_pool2d(x, ksize=[1, x.shape[h_axis], x.shape[w_axis], 1], strides=[1, 1, 1, 1], 78 | padding='VALID') 79 | x_squeezed = self._se_reduce(x_squeezed) 80 | x_squeezed = self._swish(x_squeezed) 81 | x_squeezed = self._se_expand(x_squeezed) 82 | x = tf.sigmoid(x_squeezed) * x 83 | 84 | x = self._bn2(self._project_conv(x)) 85 | 86 | # Skip connection and drop connect 87 | input_filters, output_filters = self._input_filters, self._output_filters 88 | if self._stride == 1 and input_filters == output_filters: 89 | if drop_connect_rate is not None: 90 | x = drop_connect(x, p=drop_connect_rate, training=True) 91 | x = x + inputs # skip connection 92 | return x 93 | 94 | if __name__ == '__main__': 95 | input = 
tf.random.normal((1, 112, 112, 3)) 96 | mbconv = MBConvBlock(ksize=3, input_filters=3, output_filters=3) 97 | out = mbconv(input) 98 | print(out.shape) 99 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.5 2 | tensorflow==2.10.0 3 | 4 | # 2024.03.20 SequentialPolarizedSelfAttention 5 | # numpy==1.26.4 6 | # tensorflow==2.16.1 --------------------------------------------------------------------------------