├── .vscode
    └── launch.json
├── README.md
├── attention
    ├── A2Attention.py
    ├── AFT.py
    ├── BAM.py
    ├── CBAM.py
    ├── CoAtNet.py
    ├── CoTAttention.py
    ├── CoordAttention.py
    ├── DANet.py
    ├── ECAAttention.py
    ├── EMSA.py
    ├── ExternalAttention.py
    ├── HaloAttention.py
    ├── MUSEAttention.py
    ├── MobileViTAttention.py
    ├── OutlookAttention.py
    ├── PSA.py
    ├── ParNetAttention.py
    ├── PolarizedSelfAttention.py
    ├── ResidualAttention.py
    ├── S2Attention.py
    ├── SEAttention.py
    ├── SGE.py
    ├── SKAttention.py
    ├── SelfAttention.py
    ├── ShuffleAttention.py
    ├── SimplifiedSelfAttention.py
    ├── TripletAttention.py
    ├── UFOAttention.py
    ├── ViP.py
    ├── gfnet.py
    └── img
    │   ├── A2.png
    │   ├── AFT.jpg
    │   ├── BAM.png
    │   ├── CBAM1.png
    │   ├── CBAM2.png
    │   ├── CoAtNet.png
    │   ├── CoT.png
    │   ├── CondConv.png
    │   ├── ConvMixer.png
    │   ├── CoordAttention.png
    │   ├── DepthwiseSeparableConv.png
    │   ├── DynamicConv.png
    │   ├── ECA.png
    │   ├── EMSA.jpg
    │   ├── EMSA.png
    │   ├── External_Attention.png
    │   ├── GFNet.jpg
    │   ├── HaloNet.png
    │   ├── Infine-attention.jpeg
    │   ├── Involution.png
    │   ├── MBConv.jpg
    │   ├── MUSE.png
    │   ├── MUSE2.jpg
    │   ├── MobileViTAttention.png
    │   ├── MobileViTv2.png
    │   ├── OutlookAttention.png
    │   ├── ParNet.png
    │   ├── PoSA.png
    │   ├── ResAtt.png
    │   ├── S2Attention.png
    │   ├── SA.png
    │   ├── SE.png
    │   ├── SGE.jpg
    │   ├── SGE.png
    │   ├── SK.png
    │   ├── SSA.png
    │   ├── ShuffleAttention.jpg
    │   ├── ShuffleAttention.png
    │   ├── UFO.png
    │   ├── ViP.png
    │   ├── acnet.png
    │   ├── danet.png
    │   ├── danet2.png
    │   ├── ddb.png
    │   ├── gMLP.jpg
    │   ├── mlpmixer.png
    │   ├── mobileViT.jpg
    │   ├── psa.jpg
    │   ├── psa.png
    │   ├── psa2.jpg
    │   ├── repmlp.png
    │   ├── repvgg.png
    │   ├── resmlp.png
    │   ├── resnet.png
    │   ├── resnet2.jpg
    │   ├── resnext.png
    │   ├── sMLP.jpg
    │   └── triplet.png
├── conv
    └── MBConv.py
└── requirements.txt


/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 |     // 使用 IntelliSense 了解相关属性。 
3 |     // 悬停以查看现有属性的描述。
4 |     // 欲了解更多信息，请访问: https://go.microsoft.com/fwlink/?linkid=830387
5 |     "version": "0.2.0",
6 |     "configurations": []
7 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # [External-Attention-tensorflow](https://github.com/ccfco-Ivan/External-Attention-tensorflow)
  2 | 
  3 | [![OSCS Status](https://www.oscs1024.com/platform/badge/ccfco-Ivan/External-Attention-tensorflow.svg?size=small)](https://www.oscs1024.com/project/ccfco-Ivan/External-Attention-tensorflow?ref=badge_small)
  4 | 
  5 | ## Contents
  6 | 
  7 | - [External-Attention-tensorflow](#external-attention-tensorflow)
  8 |   - [Contents](#contents)
  9 |   - [Attention Series](#attention-series)
 10 |     - [1. Residual Attention Usage](#1-residual-attention-usage)
 11 |       - [1.1. Paper](#11-paper)
 12 |       - [1.2 Overview](#12-overview)
 13 |       - [1.3. UsageCode](#13-usagecode)
 14 |     - [2. External Attention Usage](#2-external-attention-usage)
 15 |       - [2.1. Paper](#21-paper)
 16 |       - [2.2. Overview](#22-overview)
 17 |       - [2.3. UsageCode](#23-usagecode)
 18 |     - [3. Self Attention Usage](#3-self-attention-usage)
 19 |       - [3.1. Paper](#31-paper)
 20 |       - [3.2. Overview](#32-overview)
 21 |       - [3.3. UsageCode](#33-usagecode)
 22 |     - [4. Simplified Self Attention Usage](#4-simplified-self-attention-usage)
 23 |       - [4.1. Paper](#41-paper)
 24 |       - [4.2. Overview](#42-overview)
 25 |       - [4.3. UsageCode](#43-usagecode)
 26 |     - [5. Squeeze-and-Excitation Attention Usage](#5-squeeze-and-excitation-attention-usage)
 27 |       - [5.1. Paper](#51-paper)
 28 |       - [5.2. Overview](#52-overview)
 29 |       - [5.3. UsageCode](#53-usagecode)
 30 |     - [6. SK Attention Usage](#6-sk-attention-usage)
 31 |       - [6.1. Paper](#61-paper)
 32 |       - [6.2. Overview](#62-overview)
 33 |       - [6.3. UsageCode](#63-usagecode)
 34 |     - [7. CBAM Attention Usage](#7-cbam-attention-usage)
 35 |       - [7.1. Paper](#71-paper)
 36 |       - [7.2. Overview](#72-overview)
 37 |       - [7.3. Usage Code](#73-usage-code)
 38 |     - [8. BAM Attention Usage](#8-bam-attention-usage)
 39 |       - [8.1. Paper](#81-paper)
 40 |       - [8.2. Overview](#82-overview)
 41 |       - [8.3. Usage Code](#83-usage-code)
 42 |     - [9. ECA Attention Usage](#9-eca-attention-usage)
 43 |       - [9.1. Paper](#91-paper)
 44 |       - [9.2. Overview](#92-overview)
 45 |       - [9.3. Usage Code](#93-usage-code)
 46 |     - [10. DANet Attention Usage](#10-danet-attention-usage)
 47 |       - [10.1. Paper](#101-paper)
 48 |       - [10.2. Overview](#102-overview)
 49 |       - [10.3. Usage Code](#103-usage-code)
 50 |     - [11. Pyramid Squeeze Attention Usage](#11-pyramid-squeeze-attention-usage)
 51 |       - [11.1. Paper](#111-paper)
 52 |       - [11.2. Overview](#112-overview)
 53 |       - [11.3. Usage Code](#113-usage-code)
 54 |     - [12. Efficient Multi-Head Self-Attention Usage](#12-efficient-multi-head-self-attention-usage)
 55 |       - [12.1. Paper](#121-paper)
 56 |       - [12.2. Overview](#122-overview)
 57 |       - [12.3. Usage Code](#123-usage-code)
 58 |     - [13. Shuffle Attention Usage](#13-shuffle-attention-usage)
 59 |       - [13.1. Paper](#131-paper)
 60 |       - [13.2. Overview](#132-overview)
 61 |       - [13.3. Usage Code](#133-usage-code)
 62 |     - [14. MUSE Attention Usage](#14-muse-attention-usage)
 63 |       - [14.1. Paper](#141-paper)
 64 |       - [14.2. Overview](#142-overview)
 65 |       - [14.3. Usage Code](#143-usage-code)
 66 |     - [15. SGE Attention Usage](#15-sge-attention-usage)
 67 |       - [15.1. Paper](#151-paper)
 68 |       - [15.2. Overview](#152-overview)
 69 |       - [15.3. Usage Code](#153-usage-code)
 70 |     - [16. A2 Attention Usage](#16-a2-attention-usage)
 71 |       - [16.1. Paper](#161-paper)
 72 |       - [16.2. Overview](#162-overview)
 73 |       - [16.3. Usage Code](#163-usage-code)
 74 |     - [17. AFT Attention Usage](#17-aft-attention-usage)
 75 |       - [17.1. Paper](#171-paper)
 76 |       - [17.2. Overview](#172-overview)
 77 |       - [17.3. Usage Code](#173-usage-code)
 78 |     - [18. Outlook Attention Usage](#18-outlook-attention-usage)
 79 |       - [18.1. Paper](#181-paper)
 80 |       - [18.2. Overview](#182-overview)
 81 |       - [18.3. Usage Code](#183-usage-code)
 82 |     - [19. ViP Attention Usage](#19-vip-attention-usage)
 83 |       - [19.1. Paper](#191-paper)
 84 |       - [19.2. Overview](#192-overview)
 85 |       - [19.3. Usage Code](#193-usage-code)
 86 |     - [20. CoAtNet Attention Usage](#20-coatnet-attention-usage)
 87 |       - [20.1. Paper](#201-paper)
 88 |       - [20.2. Overview](#202-overview)
 89 |       - [20.3. Usage Code](#203-usage-code)
 90 |     - [21. HaloNet Attention Usage](#21-halonet-attention-usage)
 91 |       - [21.1. Paper](#211-paper)
 92 |       - [21.2. Overview](#212-overview)
 93 |       - [21.3. Usage Code](#213-usage-code)
 94 |     - [22. Polarized Self-Attention Usage](#22-polarized-self-attention-usage)
 95 |       - [22.1. Paper](#221-paper)
 96 |       - [22.2. Overview](#222-overview)
 97 |       - [22.3. Usage Code](#223-usage-code)
 98 |     - [23. CoTAttention Usage](#23-cotattention-usage)
 99 |       - [23.1. Paper](#231-paper)
100 |       - [23.2. Overview](#232-overview)
101 |       - [23.3. Usage Code](#233-usage-code)
102 |     - [24. S2 Attention Usage](#24-s2-attention-usage)
103 |       - [24.1. Paper](#241-paper)
104 |       - [24.2. Overview](#242-overview)
105 |       - [24.3. Usage Code](#243-usage-code)
106 |     - [25. GFNet Attention Usage](#25-gfnet-attention-usage)
107 |       - [25.1. Paper](#251-paper)
108 |       - [25.2. Overview](#252-overview)
109 |       - [25.3. Usage Code - Implemented by Wenliang Zhao (Author)](#253-usage-code---implemented-by-wenliang-zhao-author)
110 |     - [26. TripletAttention Usage](#26-tripletattention-usage)
111 |       - [26.1. Paper](#261-paper)
112 |       - [26.2. Overview](#262-overview)
113 |       - [26.3. Usage Code - Implemented by digantamisra98](#263-usage-code---implemented-by-digantamisra98)
114 |     - [27. Coordinate Attention Usage](#27-coordinate-attention-usage)
115 |       - [27.1. Paper](#271-paper)
116 |       - [27.2. Overview](#272-overview)
117 |       - [27.3. Usage Code - Implemented by Andrew-Qibin](#273-usage-code---implemented-by-andrew-qibin)
118 |     - [28. MobileViT Attention Usage](#28-mobilevit-attention-usage)
119 |       - [28.1. Paper](#281-paper)
120 |       - [28.2. Overview](#282-overview)
121 |       - [28.3. Usage Code](#283-usage-code)
122 |     - [29. ParNet Attention Usage](#29-parnet-attention-usage)
123 |       - [29.1. Paper](#291-paper)
124 |       - [29.2. Overview](#292-overview)
125 |       - [29.3. Usage Code](#293-usage-code)
126 |     - [30. UFO Attention Usage](#30-ufo-attention-usage)
127 |       - [30.1. Paper](#301-paper)
128 |       - [30.2. Overview](#302-overview)
129 |       - [30.3. Usage Code](#303-usage-code)
130 |     - [31. MobileViTv2 Attention Usage](#31-mobilevitv2-attention-usage)
131 |       - [31.1. Paper](#311-paper)
132 |       - [31.2. Overview](#312-overview)
133 |       - [31.3. Usage Code](#313-usage-code)
134 |     - [32. Infini-attention Usage](#32-infini-attention-usage)
135 |       - [32.1. Paper](#321-paper)
136 |       - [32.2. Overview](#322-overview)
137 |       - [32.3. Usage Code](#323-usage-code)
138 | 
139 | ## Attention Series
140 | 
141 | ### 1. Residual Attention Usage
142 | 
143 | #### 1.1. Paper
144 | 
145 | [Residual Attention: A Simple but Effective Method for Multi-Label Recognition---ICCV2021](https://arxiv.org/abs/2108.02456)
146 | 
147 | #### 1.2 Overview
148 | 
149 | ![](attention/img/ResAtt.png)
150 | 
151 | > Only 4 lines of code consistently leads to improvement of multi-label recognition, across many diverse pretrained attentions and datasets, even without any extra training.
152 | > （在许多不同的预训练模型和数据集上，即使没有任何额外的训练，只用4行代码也可以提高多标签识别的准确率）
153 | 
154 | #### 1.3. UsageCode
155 | 
156 | ```python
157 | from attention.ResidualAttention import ResidualAttention
158 | import tensorflow as tf
159 | 
160 | input = tf.random.normal(shape=(50, 7, 7, 512))
161 | resatt = ResidualAttention(num_class=1000, la=0.2)
162 | output = resatt(input)
163 | print(output.shape)
164 | ```
165 | 
166 | ***
167 | 
168 | ### 2. External Attention Usage
169 | 
170 | #### 2.1. Paper
171 | 
172 | ["Beyond Self-attention: External Attention using Two Linear Layers for Visual Tasks"](https://arxiv.org/abs/2105.02358)
173 | 
174 | #### 2.2. Overview
175 | 
176 | ![](attention/img/External_Attention.png)
177 | 
178 | > 主要解决的Self-Attention(SA)的两个痛点问题：
179 | > >（1）O(n^2)的计算复杂度；(2) SA是在同一个样本上根据不同位置计算Attention，忽略了不同样本之间的联系。
180 | > 
181 | > 因此，本文采用了两个串联的MLP结构作为memory units，使得计算复杂度降低到了O(n)；此外，这两个memory units是基于全部的训练数据学习的，因此也隐式的考虑了不同样本之间的联系。
182 | 
183 | #### 2.3. UsageCode
184 | 
185 | ```python
186 | from attention.ExternalAttention import ExternalAttention
187 | import tensorflow as tf
188 | 
189 | input = tf.random.normal(shape=(50, 49, 512))
190 | ea = ExternalAttention(d_attention=512, S=8)
191 | output = ea(input)
192 | print(output.shape)
193 | ```
194 | 
195 | ***
196 | 
197 | ### 3. Self Attention Usage
198 | 
199 | #### 3.1. Paper
200 | 
201 | ["Attention Is All You Need"](https://arxiv.org/pdf/1706.03762.pdf)
202 | 
203 | #### 3.2. Overview
204 | 
205 | ![](attention/img/SA.png)
206 | 
207 | > 这是Google在NeurIPS2017发表的一篇文章，在CV、NLP、多模态等各个领域都有很大的影响力，目前引用量已经4.5w+。Transformer中提出的
208 | > Self-Attention是Attention的一种，用于计算特征中不同位置之间的权重，从而达到更新特征的效果。首先将input feature通过FC映射成Q、K、V
209 | > 三个特征，然后将Q和K进行点乘的得到attention map，再将attention map与V做点乘得到加权后的特征。最后通过FC进行特征的映射，得到一个新的特征。
210 | 
211 | #### 3.3. UsageCode
212 | 
213 | ```python
214 | from attention.SelfAttention import ScaledDotProductAttention
215 | import tensorflow as tf
216 | 
217 | input = tf.random.normal((50, 49, 512))
218 | sa = ScaledDotProductAttention(d_attention=512, d_k=512, d_v=512, h=8)
219 | output = sa(input, input, input)
220 | print(output.shape)
221 | ```
222 | 
223 | ***
224 | 
225 | ### 4. Simplified Self Attention Usage
226 | 
227 | #### 4.1. Paper
228 | 
229 | [None]()
230 | 
231 | #### 4.2. Overview
232 | 
233 | ![](attention/img/SSA.png)
234 | 
235 | #### 4.3. UsageCode
236 | 
237 | ```python
238 | from attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention
239 | import tensorflow as tf
240 | 
241 | input = tf.random.normal((50, 49, 512))
242 | ssa = SimplifiedScaledDotProductAttention(d_attention=512, h=8)
243 | output = ssa(input, input, input)
244 | print(output.shape)
245 | ```
246 | 
247 | ***
248 | 
249 | ### 5. Squeeze-and-Excitation Attention Usage
250 | 
251 | #### 5.1. Paper
252 | 
253 | ["Squeeze-and-Excitation Networks"](https://arxiv.org/abs/1709.01507)
254 | 
255 | #### 5.2. Overview
256 | 
257 | ![](attention/img/SE.png)
258 | 
259 | > 这是CVPR2018的一篇文章，是做通道注意力的，因其简单的结构和有效性，将通道注意力掀起了一波小高潮。大道至简，这篇文章的思想非常简单，首先将
260 | > spatial维度进行AdaptiveAvgPool，然后通过两个FC学习到通道注意力，并用Sigmoid进行归一化得到Channel Attention Map,最后将Channel 
261 | > Attention Map与原特征相乘，就得到了加权后的特征。
262 | 
263 | #### 5.3. UsageCode
264 | 
265 | ```python
266 | from attention.SEAttention import SEAttention
267 | import tensorflow as tf
268 | 
269 | input = tf.random.normal((50, 7, 7, 512))
270 | se = SEAttention(channel=512, reduction=8)
271 | output = se(input)
272 | print(output.shape)
273 | ```
274 | 
275 | ***
276 | 
277 | ### 6. SK Attention Usage
278 | 
279 | #### 6.1. Paper
280 | 
281 | ["Selective Kernel Networks"](https://arxiv.org/pdf/1903.06586.pdf)
282 | 
283 | #### 6.2. Overview
284 | 
285 | ![](attention/img/SK.png)
286 | 
287 | > 这是CVPR2019的一篇文章，致敬了SENet的思想。在传统的CNN中每一个卷积层都是用相同大小的卷积核，限制了模型的表达能力；而Inception这种“更宽”的模型结构也验证了，用多个不同的卷积核进行学习确实可以提升模型的表达能力。作者借鉴了SENet的思想，通过动态计算每个卷积核得到通道的权重，动态的将各个卷积核的结果进行融合。
288 | 
289 | >本文的方法分为三个部分：Split,Fuse,Select。Split就是一个multi-branch的操作，用不同的卷积核进行卷积得到不同的特征；Fuse部分就是用SE的结构获取通道注意力的矩阵(
290 | >N个卷积核就可以得到N个注意力矩阵，这步操作对所有的特征参数共享)，这样就可以得到不同kernel经过SE之后的特征；Select操作就是将这几个特征进行相加。
291 | 
292 | #### 6.3. UsageCode
293 | 
294 | ```python
295 | from attention.SKAttention import SKAttention
296 | import tensorflow as tf
297 | 
298 | input = tf.random.normal((50, 7, 7, 512))
299 | se = SKAttention(channel=512, reduction=8)
300 | output = se(input)
301 | print(output.shape)
302 | ```
303 | 
304 | ***
305 | 
306 | ### 7. CBAM Attention Usage
307 | 
308 | #### 7.1. Paper
309 | 
310 | ["CBAM: Convolutional Block Attention Module"](https://openaccess.thecvf.com/content_ECCV_2018/papers/Sanghyun_Woo_Convolutional_Block_Attention_ECCV_2018_paper.pdf)
311 | 
312 | #### 7.2. Overview
313 | 
314 | ![](attention/img/CBAM1.png)
315 | 
316 | ![](attention/img/CBAM2.png)
317 | 
318 | > 这是ECCV2018的一篇论文，这篇文章同时使用了Channel Attention和Spatial Attention，将两者进行了串联（文章也做了并联和两种串联方式的消融实验）。
319 | >
320 | >Channel
321 | > Attention方面，大致结构还是和SE相似，不过作者提出AvgPool和MaxPool有不同的表示效果，所以作者对原来的特征在Spatial维度分别进行了AvgPool和MaxPool，然后用SE的结构提取channel
322 | > attention，注意这里是参数共享的，然后将两个特征相加后做归一化，就得到了注意力矩阵。
323 | >
324 | >Spatial Attention和Channel Attention类似，先在channel维度进行两种pool后，将两个特征进行拼接，然后用7x7的卷积来提取Spatial
325 | > Attention（之所以用7x7是因为提取的是空间注意力，所以用的卷积核必须足够大）。然后做一次归一化，就得到了空间的注意力矩阵。
326 | 
327 | #### 7.3. Usage Code
328 | 
329 | ```python
330 | from attention.CBAM import CBAMBlock
331 | import tensorflow as tf
332 | 
333 | input = tf.random.normal((50, 7, 7, 512))
334 | kernel_size = input.get_shape()[1]
335 | cbam = CBAMBlock(channel=512, reduction=16, kernel_size=kernel_size)
336 | output = cbam(input)
337 | print(output.shape)
338 | ```
339 | 
340 | ***
341 | 
342 | ### 8. BAM Attention Usage
343 | 
344 | #### 8.1. Paper
345 | 
346 | ["BAM: Bottleneck Attention Module"](https://arxiv.org/pdf/1807.06514.pdf)
347 | 
348 | #### 8.2. Overview
349 | 
350 | ![](attention/img/BAM.png)
351 | 
352 | > 这是CBAM同作者同时期的工作，工作与CBAM非常相似，也是双重Attention，不同的是CBAM是将两个attention的结果串联；而BAM是直接将两个attention矩阵进行相加。
353 | >
354 | >Channel Attention方面，与SE的结构基本一样。Spatial Attention方面，还是在通道维度进行pool，然后用了两次3x3的空洞卷积，最后将用一次1x1的卷积得到Spatial Attention的矩阵。
355 | >
356 | >最后Channel Attention和Spatial Attention矩阵进行相加（这里用到了广播机制），并进行归一化，这样一来，就得到了空间和通道结合的attention矩阵。
357 | 
358 | #### 8.3. Usage Code
359 | 
360 | ```python
361 | from attention.BAM import BAMBlock
362 | import tensorflow as tf
363 | 
364 | input = tf.random.normal((50, 7, 7, 512))
365 | bam = BAMBlock(channel=512, reduction=16, dia_val=2)
366 | output = bam(input)
367 | print(output.shape)
368 | ```
369 | 
370 | ***
371 | 
372 | ### 9. ECA Attention Usage
373 | 
374 | #### 9.1. Paper
375 | 
376 | ["ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks"](https://arxiv.org/pdf/1910.03151.pdf)
377 | 
378 | #### 9.2. Overview
379 | 
380 | ![](attention/img/ECA.png)
381 | 
382 | > 这是CVPR2020的一篇文章。 如上图所示，SE实现通道注意力是使用两个全连接层，而ECA是需要一个的卷积。作者这么做的原因一方面是认为计算所有通道两两之间的注意力是没有必要的，另一方面是用两个全连接层确实引入了太多的参数和计算量。
383 | >
384 | >因此作者进行了AvgPool之后，只是使用了一个感受野为k的一维卷积（相当于只计算与相邻k个通道的注意力），这样做就大大的减少的参数和计算量。(i.e.相当于SE是一个global的注意力，而ECA是一个local的注意力)。
385 | 
386 | #### 9.3. Usage Code
387 | 
388 | ```python
389 | from attention.ECAAttention import ECAAttention
390 | import tensorflow as tf
391 | 
392 | input = tf.random.normal((50, 7, 7, 512))
393 | eca = ECAAttention(kernel_size=3)
394 | output = eca(input)
395 | print(output.shape)
396 | ```
397 | 
398 | ***
399 | 
400 | ### 10. DANet Attention Usage
401 | 
402 | #### 10.1. Paper
403 | 
404 | ["Dual Attention Network for Scene Segmentation"](https://arxiv.org/pdf/1809.02983.pdf)
405 | 
406 | #### 10.2. Overview
407 | 
408 | ![](attention/img/danet.png)![](attention/img/danet2.png)
409 | 
410 | >这是CVPR2019的文章，思想上就是将self-attention用到场景分割的任务中，不同的是self-attention是关注每个position之间的注意力，而本文将self-attention做了一个拓展，还做了一个通道注意力的分支，操作上和self-attention一样，不同的通道attention中把生成Q，K，V的三个Linear去掉了。最后将两个attention之后的特征进行element-wise sum。
411 | 
412 | #### 10.3. Usage Code
413 | 
414 | ```python
415 | from attention.DANet import DAModule
416 | import tensorflow as tf
417 | 
418 | input = tf.random.normal((50, 7, 7, 512))
419 | danet = DAModule(d_attention=512, kernel_size=3, H=7, W=7)
420 | print(danet(input).shape)
421 | ```
422 | 
423 | ***
424 | 
425 | ### 11. Pyramid Squeeze Attention Usage
426 | 
427 | #### 11.1. Paper
428 | 
429 | ["EPSANet: An Efficient Pyramid Squeeze Attention Block on Convolutional Neural Network"](https://doi.org/10.48550/arXiv.2105.14447)
430 | 
431 | #### 11.2. Overview
432 | 
433 | ![Pyramid Squeeze Attention (PSA) module](attention/img/psa.jpg)![A detailed illustration of Squeeze and Concat(SPC) module](attention/img/psa2.jpg)
434 | 
435 | >这是深大2021年5月30日在arXiv上上传的一篇文章，本文的目的是如何获取并探索不同尺度的空间信息来丰富特征空间。网络结构相对来说也比较简单，主要分成四步，第一步，将原来的feature根据通道分成n组然后对不同的组进行不同尺度的卷积，得到新的特征W1；第二步，通过使用SE权重模块提取不同尺度的特征图的注意力，得到channel-wise attention向量；第三步，对不同组进行softmax；第四步，将获得channel-wise attention与原来的特征W1相乘。
436 | 
437 | #### 11.3. Usage Code
438 | 
439 | ```python
440 | from attention.PSA import PSA
441 | import tensorflow as tf
442 | 
443 | input = tf.random.normal((50, 7, 7, 512))
444 | psa = PSA(channel=512, reduction=8)
445 | output = psa(input)
446 | print(output.shape)
447 | ```
448 | 
449 | ***
450 | 
451 | ### 12. Efficient Multi-Head Self-Attention Usage
452 | 
453 | #### 12.1. Paper
454 | 
455 | ["ResT: An Efficient Transformer for Visual Recognition"](https://arxiv.org/abs/2105.13677)
456 | 
457 | #### 12.2. Overview
458 | 
459 | ![](attention/img/EMSA.jpg)
460 | 
461 | >这是南大5月28日在arXiv上上传的一篇文章。本文解决的主要是SA的两个痛点问题：（1）Self-Attention的计算复杂度和n呈平方关系；（2）每个head只有q,k,v的部分信息，如果q,k,v的维度太小，那么就会导致获取不到连续的信息，从而导致性能损失。这篇文章给出的思路也非常简单，在SA中的FC之前，用了一个卷积来降低了空间的维度，从而得到空间维度上更小的K和V。
462 | 
463 | #### 12.3. Usage Code
464 | 
465 | ```python
466 | from attention.EMSA import EMSA
467 | import tensorflow as tf
468 | 
469 | input = tf.random.normal((50, 64, 512))
470 | emsa = EMSA(d_attention=512, d_k=512, d_v=512, h=8, H=8, W=8, ratio=2, apply_transform=True)
471 | output = emsa(input, input, input)
472 | print(output.shape)
473 | ```
474 | 
475 | ***
476 | 
477 | ### 13. Shuffle Attention Usage
478 | 
479 | #### 13.1. Paper
480 | 
481 | ["SA-NET: SHUFFLE ATTENTION FOR DEEP CONVOLUTIONAL NEURAL NETWORKS"](https://arxiv.org/pdf/2102.00240.pdf)
482 | 
483 | #### 13.2. Overview
484 | 
485 | ![](attention/img/ShuffleAttention.png)
486 | 
487 | > 采用Shuffle Units将两种注意力机制有效结合。具体来说，SA首先将通道维度分组为多个子特征，然后并行处理它们。其次，对于每个子特征，SA使用Shuffle
488 | > Unit来描述空间和通道维度上的特征依赖关系。最后，对所有子特征进行聚合，并采用“channel shuffle”算子来实现不同子特征之间的信息通信。
489 | 
490 | #### 13.3. Usage Code
491 | 
492 | ```python
493 | from attention.ShuffleAttention import ShuffleAttention
494 | import tensorflow as tf
495 | 
496 | input = tf.random.normal((50, 7, 7, 512))
497 | se = ShuffleAttention(channel=512, G=8)
498 | output = se(input)
499 | print(output.shape)
500 | ```
501 | 
502 | ***
503 | 
504 | ### 14. MUSE Attention Usage
505 | 
506 | #### 14.1. Paper
507 | 
508 | ["MUSE: Parallel Multi-Scale Attention for Sequence to Sequence Learning"](https://arxiv.org/abs/1911.09483)
509 | 
510 | #### 14.2. Overview
511 | 
512 | ![](./attention/img/MUSE.png)
513 | 
514 | > 这是北大团队2019年在arXiv上发布的一篇文章，主要解决的是Self-Attention（SA）只有全局捕获能力的缺点。如下图所示，当句子长度变长时，
515 | > SA的全局捕获能力变弱，导致最终模型性能变差。因此，作者在文中引入了多个不同感受野的一维卷积来捕获多尺度的局部Attention，以此来弥补SA在建模长句子能力的不足。
516 | > ![](attention/img/MUSE2.jpg)
517 | > 实现方式如模型结构所示的那样，将SA的结果和多个卷积的结果相加，不仅进行全局感知，还进行局部感知。最终通过引入多尺度的局部感知，使模型在翻译任务上的性能得到了提升。
518 | 
519 | #### 14.3. Usage Code
520 | 
521 | ```python
522 | from attention.MUSEAttention import MUSEAttention
523 | import tensorflow as tf
524 | 
525 | input = tf.random.normal((50, 49, 512))
526 | sa = MUSEAttention(d_attention=512, d_k=512, d_v=512, h=8)
527 | output = sa(input, input, input)
528 | print(output.shape)
529 | ```
530 | 
531 | ***
532 | 
533 | ### 15. SGE Attention Usage
534 | 
535 | #### 15.1. Paper
536 | 
537 | [Spatial Group-wise Enhance: Improving Semantic Feature Learning in Convolutional Networks](https://arxiv.org/pdf/1905.09646.pdf)
538 | 
539 | #### 15.2. Overview
540 | 
541 | ![](attention/img/SGE.jpg)
542 | > 这篇文章是[SKNet](#6-sk-attention-usage)
543 | > 作者在19年的时候在arXiv上挂出的文章，是一个轻量级Attention的工作，从核心代码中可以看出，引入的参数真的非常少，self.weight和self.bias都是和groups呈一个数量级的（几乎就是常数级别）。
544 | >
545 | > 这篇文章的核心点是用局部信息和全局信息的相似性来指导语义特征的增强，总体的操作可以分为以下几步：
546 | >> 1. 将特征分组，每组feature在空间上与其global pooling后的feature做点积（相似性）得到初始的attention mask；
547 | >> 2. 对该attention mask进行减均值除标准差的normalize，并同时每个group学习两个缩放偏移参数使得normalize操作可被还原；
548 | >> 3. 最后经过sigmoid得到最终的attention mask并对原始feature group中的每个位置的feature进行scale。
549 | >
550 | > 实验部分，作者也是在分类任务（ImageNet）和检测任务（COCO）上做了实验，能够在比[SK](#6-sk-attention-usage)、[CBAM](#7-cbam-attention-usage)
551 | > 、[BAM](#8-bam-attention-usage)等网络参数和计算量更小的情况下，获得更好的性能，证明了本文方法的高效性。
552 | 
553 | #### 15.3. Usage Code
554 | 
555 | ```python
556 | from attention.SGE import SpatialGroupEnhance
557 | import tensorflow as tf
558 | 
559 | input = tf.random.normal((50, 7, 7, 512))
560 | sge = SpatialGroupEnhance(groups=8)
561 | output = sge(input)
562 | print(output.shape)
563 | ```
564 | 
565 | ***
566 | 
567 | ### 16. A2 Attention Usage
568 | 
569 | #### 16.1. Paper
570 | 
571 | [A2-Nets: Double Attention Networks](https://arxiv.org/pdf/1810.11579.pdf)
572 | 
573 | #### 16.2. Overview
574 | 
575 | ![](./attention/img/A2.png)
576 | 
577 | > 这是NeurIPS2018上的一篇文章，这篇论文主要是做空间注意力的。并且这篇文章的方法跟做法跟self-attention非常相似，但是包装上就比较“花里胡哨”。
578 | >
579 | > input用1x1的卷积变成A，B，V（类似self-attention的Q，K，V）。本文的方法主要分为两个步骤，第一步，feature
580 | > gathering，首先用A和B进行点乘，得到一个聚合全局信息的attention，标记为G。然后用G和V进行点乘，得到二阶的attention。
581 | >
582 | > 从实验结果上看，这个结构的效果还是非常不错的，作者在分类（ImageNet）和行为识别（Kinetics ， UCF-101）任务上做了实验，都取得非常好的效果，相比于Non-Local[12]、SENet[13]
583 | > 等模型，都有不错的提升。
584 | 
585 | #### 16.3. Usage Code
586 | 
587 | ```python
588 | from attention.A2Attention import DoubleAttention
589 | import tensorflow as tf
590 | 
591 | input = tf.random.normal((50, 7, 7, 512))
592 | a2 = DoubleAttention(512, 128, 128, True)
593 | output = a2(input)
594 | print(output.shape)
595 | ```
596 | 
597 | ### 17. AFT Attention Usage
598 | 
599 | #### 17.1. Paper
600 | 
601 | [An Attention Free Transformer](https://arxiv.org/pdf/2105.14103v1.pdf)
602 | 
603 | #### 17.2. Overview
604 | 
605 | ![](./attention/img/AFT.jpg)
606 | 
607 | > 这是苹果团队2021年6月16日在arXiv上发布的工作，主要工作是简化Self-Attention。
608 | >
609 | > Transformer近几年被用于各种任务中，但是由于Self-Attention的与输入数据大小呈平方关系的时间和空间复杂度，它不能被用于太大的数据中。
610 | > 近几年，基于简化SA的复杂度，很多工作也被提出：稀疏注意力、局部哈希、低质分解...
611 | >
612 | > 本文提出了一个Attention Free Transformer（AFT），AFT也是由QKV三部分组成，不同的是QK不是做点积。而是将KV直接融合了，从而来保证对应位置的交互，然后Q与融合后的特征进行了对应位置相乘，来减少计算量。
613 | >
614 | > 总体上原理跟Self-Attention相似，不同的是Self-Attention用的是点积，而这里用的是对应位置相乘，所以大大减少了计算量。
615 | 
616 | #### 17.3. Usage Code
617 | 
618 | ```python
619 | from attention.AFT import AFT_FULL
620 | import tensorflow as tf
621 | 
622 | input = tf.random.normal((50, 49, 512))
623 | aft_full = AFT_FULL(d_model=512, n=49)
624 | output = aft_full(input)
625 | print(output.shape)
626 | ```
627 | 
628 | ### 18. Outlook Attention Usage
629 | 
630 | #### 18.1. Paper
631 | 
632 | [VOLO: Vision Outlooker for Visual Recognition---arXiv 2021.06.24"](https://arxiv.org/abs/2106.13112)
633 | 
634 | #### 18.2. Overview
635 | 
636 | ![](./attention/img/OutlookAttention.png)
637 | 
638 | > Transformer-based模型在Visual Recognition领域，如果不借助额外的训练数据，比CNN-based模型要差一点。作者认为，这是因为token
639 | > embedding并没有进行细粒度特征表示，因此本文提出了一种新的Attention方式，通过局部信息的感知，能够获得更加细粒度的特征表示。
640 | >
641 | > > 整个框架分为两个分支，上面的分支用于生成attention map，下面的分支用于生成投影后的value。然后通过矩阵乘法得到outlook attention后的结果，最后通过Fold函数将feature map还原到输入大小。
642 | > > - 上面分支Linear是为了对特征进行embedding，之后对特征进行reshape，最后通过softmax得到每个位置和周围几个位置的注意力权重。
643 | > > - 下面分支同样进行embedding，之后通过unfold，也就是滑动窗口的形式将特征中的K*K区域取出来。
644 | >
645 | > 可以看出，在Outlook Attention中，每一个中心点的位置都与周围k*k个位置进行attention操作，这个步骤就有点类似卷积。
646 | 
647 | #### 18.3. Usage Code
648 | 
649 | [torch.nn.fold开发者说没有这个功能，未来没打算加。我以后看情况是否补充吧。点击看开发者回复](https://github.com/tensorflow/tensorflow/issues/52195#issuecomment-948915934)
650 | 
651 | ```python
652 | from attention.OutlookAttention import OutlookAttention
653 | import tensorflow as tf
654 | ```
655 | 
656 | ***
657 | 
658 | ### 19. ViP Attention Usage
659 | 
660 | #### 19.1. Paper
661 | 
662 | [Vision Permutator: A Permutable MLP-Like Architecture for Visual Recognition"](https://arxiv.org/abs/2106.12368)
663 | 
664 | #### 19.2. Overview
665 | 
666 | ![](./attention/img/ViP.png)
667 | 
668 | #### 19.3. Usage Code
669 | 
670 | ```python
671 | from attention.ViP import WeightedPermuteMLP
672 | import tensorflow as tf
673 | 
674 | input = tf.random.normal((64, 8, 8, 512))
675 | seg_dim = 8
676 | vip = WeightedPermuteMLP(512, seg_dim)
677 | output = vip(input)
678 | print(output.shape)
679 | ```
680 | 
681 | ***
682 | 
683 | ### 20. CoAtNet Attention Usage
684 | 
685 | #### 20.1. Paper
686 | 
687 | [CoAtNet: Marrying Convolution and Attention for All Data Sizes"](https://arxiv.org/abs/2106.04803)
688 | 
689 | #### 20.2. Overview
690 | 
691 | ![](attention/img/CoAtNet.png)
692 | 
693 | > 本文系统调研了CNN和Transformer的特性，并将两者结合提出新的家族式网络：CoAtNet，无额外数据时高达86%准确率，在JFT加持下，高达89.77%！性能优于CvT、BotNet和Swin等网络。
694 | > >Transformers 在计算机视觉方面吸引了越来越多的兴趣，但它们仍然落后于最先进的卷积网络。在这项工作中，我们表明虽然 Transformer 往往具有更大的模型容量，但由于缺乏正确的归纳偏差，它们的泛化可能比卷积网络更差。
695 | > >为了有效地结合两种架构的优势，我们提出了 CoAtNets（发音为“coat”nets），这是一个基于两个关键insight构建的混合模型系列：
696 | > >
697 | > >- 1. 深度卷积和自注意力可以通过简单的相对注意力自然地统一起来；
698 | > >- 2. 以有原则的方式垂直堆叠卷积层和注意力层在提高泛化、容量和效率方面非常有效。
699 | 
700 | #### 20.3. Usage Code
701 | 
702 | ```python
703 | 
704 | from attention.CoAtNet import CoAtNet
705 | import tensorflow as tf
706 | 
707 | input = tf.random.normal((1, 224, 224, 3))
708 | coatnet = CoAtNet(in_ch=3)
709 | output = coatnet(input)
710 | print(output.shape)
711 | ```
712 | 
713 | ***
714 | 
715 | ### 21. HaloNet Attention Usage
716 | 
717 | #### 21.1. Paper
718 | 
719 | [Scaling Local Self-Attention for Parameter Efficient Visual Backbones](https://arxiv.org/pdf/2103.12731.pdf)
720 | 
721 | #### 21.2. Overview
722 | 
723 | ![](./attention/img/HaloNet.png)
724 | 
725 | #### 21.3. Usage Code
726 | 
727 | ```python
728 | 
729 | from attention.HaloAttention import HaloAttention
730 | import tensorflow as tf
731 | ```
732 | 
733 | ***
734 | 
735 | ### 22. Polarized Self-Attention Usage
736 | 
737 | #### 22.1. Paper
738 | 
739 | [Polarized Self-Attention: Towards High-quality Pixel-wise Regression"](https://arxiv.org/abs/2107.00782)
740 | 
741 | #### 22.2. Overview
742 | 
743 | ![](./attention/img/PoSA.png)
744 | 
745 | #### 22.3. Usage Code
746 | 
747 | ```python
748 | from attention.PolarizedSelfAttention import SequentialPolarizedSelfAttention
749 | import tensorflow as tf
750 | 
751 | if __name__ == '__main__':
752 |     input_tensor = tf.random.normal([1, 7, 7, 512])
753 |     psa = SequentialPolarizedSelfAttention(channel=512)
754 |     output_tensor = psa(input_tensor)
755 |     print(output_tensor.shape)
756 | ```
757 | 
758 | ***
759 | 
760 | ### 23. CoTAttention Usage
761 | 
762 | #### 23.1. Paper
763 | 
764 | [Contextual Transformer Networks for Visual Recognition---arXiv 2021.07.26](https://arxiv.org/abs/2107.12292)
765 | 
766 | #### 23.2. Overview
767 | 
768 | ![](./attention/img/CoT.png)
769 | 
770 | #### 23.3. Usage Code
771 | 
772 | ```python
773 | 
774 | from attention.CoTAttention import CoTAttention
775 | import tensorflow as tf
776 | ```
777 | 
778 | ### 24. S2 Attention Usage
779 | 
780 | #### 24.1. Paper
781 | 
782 | [S²-MLPv2: Improved Spatial - Shift MLP Architecture for Vision ---arXiv 2021.08.02](https://arxiv.org/abs/2108.01072)
783 | 
784 | #### 24.2. Overview
785 | 
786 | ![](./attention/img/S2Attention.png)
787 | 
788 | #### 24.3. Usage Code
789 | 
790 | ```python
791 | from attention.S2Attention import S2Attention
792 | import tensorflow as tf
793 | ```
794 | 
795 | ***
796 | 
797 | ### 25. GFNet Attention Usage
798 | 
799 | #### 25.1. Paper
800 | 
801 | [Global Filter Networks for Image Classification---arXiv 2021.07.01](https://arxiv.org/abs/2107.00645)
802 | 
803 | #### 25.2. Overview
804 | 
805 | ![](./attention/img/GFNet.jpg)
806 | 
807 | #### 25.3. Usage Code - Implemented by [Wenliang Zhao (Author)](https://scholar.google.com/citations?user=lyPWvuEAAAAJ&hl=en)
808 | 
809 | ```python
810 | from attention.gfnet import GFNet
811 | import tensorflow as tf
812 | ```
813 | 
814 | ***
815 | 
816 | ### 26. TripletAttention Usage
817 | 
818 | #### 26.1. Paper
819 | 
820 | [Rotate to Attend: Convolutional Triplet Attention Module---CVPR 2021](https://arxiv.org/abs/2010.03045)
821 | 
822 | #### 26.2. Overview
823 | 
824 | ![](./attention/img/triplet.png)
825 | 
826 | #### 26.3. Usage Code - Implemented by [digantamisra98](https://github.com/digantamisra98)
827 | 
828 | ```python
829 | from attention.TripletAttention import TripletAttention
830 | import tensorflow as tf
831 | ```
832 | 
833 | ***
834 | 
835 | ### 27. Coordinate Attention Usage
836 | 
837 | #### 27.1. Paper
838 | 
839 | [Coordinate Attention for Efficient Mobile Network Design---CVPR 2021](https://arxiv.org/abs/2103.02907)
840 | 
841 | #### 27.2. Overview
842 | 
843 | ![](./attention/img/CoordAttention.png)
844 | 
845 | #### 27.3. Usage Code - Implemented by [Andrew-Qibin](https://github.com/Andrew-Qibin)
846 | 
847 | ```python
848 | from attention.CoordAttention import CoordAtt
849 | import tensorflow as tf
850 | ```
851 | 
852 | ***
853 | 
854 | ### 28. MobileViT Attention Usage
855 | 
856 | #### 28.1. Paper
857 | 
858 | [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer---ArXiv 2021.10.05](https://arxiv.org/abs/2103.02907)
859 | 
860 | #### 28.2. Overview
861 | 
862 | ![](./attention/img/MobileViTAttention.png)
863 | 
864 | #### 28.3. Usage Code
865 | 
866 | ```python
867 | from attention.MobileViTAttention import MobileViTAttention
868 | import tensorflow as tf
869 | 
870 | ```
871 | 
872 | ***
873 | 
874 | ### 29. ParNet Attention Usage
875 | 
876 | #### 29.1. Paper
877 | 
878 | [Non-deep Networks---ArXiv 2021.10.20](https://arxiv.org/abs/2110.07641)
879 | 
880 | #### 29.2. Overview
881 | 
882 | ![](./attention/img/ParNet.png)
883 | 
884 | #### 29.3. Usage Code
885 | 
886 | ```python
887 | from attention.ParNetAttention import *
888 | import tensorflow as tf
889 | 
890 | ```
891 | 
892 | ***
893 | 
894 | ### 30. UFO Attention Usage
895 | 
896 | #### 30.1. Paper
897 | 
898 | [UFO-ViT: High Performance Linear Vision Transformer without Softmax---ArXiv 2021.09.29](https://arxiv.org/abs/2110.07641)
899 | 
900 | #### 30.2. Overview
901 | 
902 | ![](./attention/img/UFO.png)
903 | 
904 | #### 30.3. Usage Code
905 | 
906 | ```python
907 | from attention.UFOAttention import *
908 | import tensorflow as tf
909 | ```
910 | 
911 | ### 31. MobileViTv2 Attention Usage
912 | 
913 | #### 31.1. Paper
914 | 
915 | [Separable Self-attention for Mobile Vision Transformers---ArXiv 2022.06.06](https://arxiv.org/abs/2206.02680)
916 | 
917 | #### 31.2. Overview
918 | 
919 | ![](./attention/img/MobileViTv2.png)
920 | 
921 | #### 31.3. Usage Code
922 | 
923 | ```python
924 | from attention.UFOAttention import *
925 | import tensorflow as tf
926 | 
927 | ```
928 | 
929 | ### 32. Infini-attention Usage
930 | 
931 | #### 32.1. Paper
932 | 
933 | [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention---ArXiv 2024.04.10](https://arxiv.org/abs/2404.07143)
934 | 
935 | #### 32.2. Overview
936 | 
937 | ![](attention/img/Infine-attention.jpeg)
938 | 
939 | #### 32.3. Usage Code
940 | 
941 | ```python
942 | 
943 | 
944 | ```
945 | 
946 | ***
947 | 
948 | 参考：小马[External-Attention-pytorch](https://github.com/xmu-xiaoma666/External-Attention-pytorch)


--------------------------------------------------------------------------------
/attention/A2Attention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | 
 5 | class DoubleAttention(layers.Layer):
 6 | 
 7 |     def __init__(self, in_channels, c_m, c_n, reconstruct=True):
 8 |         super(DoubleAttention, self).__init__()
 9 |         self.in_channels = in_channels
10 |         self.reconstruct = reconstruct
11 |         self.c_m = c_m
12 |         self.c_n = c_n
13 |         self.convA = layers.Conv2D(c_m, 1)
14 |         self.convB = layers.Conv2D(c_n, 1)
15 |         self.convV = layers.Conv2D(c_n, 1)
16 |         if self.reconstruct:
17 |             self.conv_reconstruct = layers.Conv2D(in_channels, kernel_size=1)
18 | 
19 |     def call(self, x):
20 |         b, h, w, c = x.get_shape()
21 |         assert c == self.in_channels
22 |         A = self.convA(x)  # b, h, w, c_m
23 |         B = self.convB(x)  # b, h, w, c_n
24 |         V = self.convV(x)  # b, h, w, c_n
25 |         tmpA = tf.reshape(A, (b, self.c_m, -1))
26 |         attention_maps = tf.nn.softmax(tf.reshape(B, (b, -1, self.c_n)))
27 |         attention_vectors = tf.nn.softmax(tf.reshape(V, (b, self.c_n, -1)))
28 |         # step 1: feature gating
29 |         global_descriptors = tf.matmul(tmpA, attention_maps)  # b, c_m, c_n
30 |         # step 2: feature distribution
31 |         tmpZ = tf.matmul(global_descriptors, attention_vectors)  # b, c_m, h*w
32 |         tmpZ = tf.reshape(tmpZ, (b, h, w, self.c_m))  # b, h, w, c_m
33 |         if self.reconstruct:
34 |             tmpZ = self.conv_reconstruct(tmpZ)
35 | 
36 |         return tmpZ
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     input = tf.random.normal((50, 7, 7, 512))
41 |     a2 = DoubleAttention(512, 128, 128, True)
42 |     output = a2(input)
43 |     print(output.shape)
44 | 


--------------------------------------------------------------------------------
/attention/AFT.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | 
 5 | class AFT_FULL(layers.Layer):
 6 |     def __init__(self, d_model, n=49, simple=False):
 7 | 
 8 |         super(AFT_FULL, self).__init__()
 9 |         self.fc_q = layers.Dense(d_model)
10 |         self.fc_k = layers.Dense(d_model)
11 |         self.fc_v = layers.Dense(d_model)
12 |         if simple:
13 |             self.position_biases = tf.zeros((n, n))
14 |         else:
15 |             self.position_biases = tf.Variable(tf.ones((n, n)), trainable=True)
16 |         self.d_model = d_model
17 |         self.n = n
18 |         self.sigmoid = tf.sigmoid
19 | 
20 |     def call(self, input):
21 |         bs, n, dim = input.get_shape()
22 | 
23 |         q = self.fc_q(input)  # bs, n, dim
24 |         k = tf.expand_dims(self.fc_k(input), axis=0)  # 1, bs, n, dim
25 |         v = tf.expand_dims(self.fc_v(input), axis=0)  # 1, bs, n, dim
26 |         numerator = tf.reduce_sum(tf.exp(k + tf.reshape(self.position_biases, (n, 1, -1, 1))) * v, 2)  # n, bs, dim
27 |         denominator = tf.reduce_sum(tf.exp(k + tf.reshape(self.position_biases, (n, 1, -1, 1))), 2)  # n, bs, dim
28 | 
29 |         out = (numerator / denominator)  # n, bs, dim
30 |         out = self.sigmoid(q) * (tf.transpose(out, (1, 0, 2)))
31 | 
32 |         return out
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     input = tf.random.normal((50, 49, 512))
37 |     aft_full = AFT_FULL(d_model=512, n=49)
38 |     output = aft_full(input)
39 |     print(output.shape)
40 | 


--------------------------------------------------------------------------------
/attention/BAM.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers, Sequential
 3 | 
 4 | 
 5 | class ChannelAttention(layers.Layer):
 6 |     def __init__(self, channel, reduction=16, num_layers=3):
 7 |         super(ChannelAttention, self).__init__()
 8 |         self.avg_pool = layers.GlobalAvgPool2D(keepdims=True)
 9 |         gate_channels = [channel]
10 |         gate_channels += [channel // reduction] * num_layers
11 |         gate_channels += [channel]
12 | 
13 |         self.ca = Sequential()
14 |         for i in range(len(gate_channels) - 2):
15 |             self.ca.add(layers.Dense(gate_channels[i + 1]))
16 |             self.ca.add(layers.BatchNormalization())
17 |             self.ca.add(layers.Activation('relu'))
18 |         self.ca.add(layers.Dense(gate_channels[-1]))
19 | 
20 |     def call(self, x):
21 |         res = self.avg_pool(x)
22 |         res = self.ca(res)
23 |         res = tf.broadcast_to(res, x.get_shape())
24 |         return res
25 | 
26 | 
27 | class SpatialAttention(layers.Layer):
28 |     def __init__(self, channel, reduction=16, num_layers=3, dia_val=2):
29 |         super(SpatialAttention, self).__init__()
30 |         self.sa = Sequential()
31 |         self.sa.add(layers.Conv2D(filters=channel // reduction, kernel_size=1))
32 |         self.sa.add(layers.BatchNormalization())
33 |         self.sa.add(layers.Activation('relu'))
34 |         for i in range(num_layers):
35 |             self.sa.add(
36 |                 layers.Conv2D(filters=channel // reduction, kernel_size=3, padding='same', dilation_rate=dia_val))
37 |             self.sa.add(layers.BatchNormalization())
38 |             self.sa.add(layers.Activation('relu'))
39 |         self.sa.add(layers.Conv2D(1, kernel_size=1))
40 | 
41 |     def call(self, x):
42 |         res = self.sa(x)
43 |         res = tf.broadcast_to(res, x.get_shape())
44 |         return res
45 | 
46 | 
47 | class BAMBlock(layers.Layer):
48 |     def __init__(self, channel=512, reduction=16, dia_val=2):
49 |         super(BAMBlock, self).__init__()
50 |         self.ca = ChannelAttention(channel=channel, reduction=reduction)
51 |         self.sa = SpatialAttention(channel=channel, reduction=reduction, dia_val=dia_val)
52 |         self.sigmoid = tf.sigmoid
53 | 
54 |     def call(self, x):
55 |         sa_out = self.sa(x)
56 |         ca_out = self.ca(x)
57 |         weight = self.sigmoid(sa_out + ca_out)
58 |         out = (1 + weight) * x
59 |         return out
60 | 
61 | 
62 | if __name__ == '__main__':
63 |     input = tf.random.normal((50, 7, 7, 512))
64 |     bam = BAMBlock(channel=512, reduction=16, dia_val=2)
65 |     output = bam(input)
66 |     print(output.shape)
67 | 


--------------------------------------------------------------------------------
/attention/CBAM.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers, Sequential
 3 | 
 4 | 
 5 | class ChannelAttention(layers.Layer):
 6 |     def __init__(self, channel, reduction=16):
 7 |         super(ChannelAttention, self).__init__()
 8 |         self.maxpool = layers.GlobalMaxPool2D(keepdims=True)
 9 |         self.avgpool = layers.GlobalAvgPool2D(keepdims=True)
10 |         self.se = Sequential([
11 |             layers.Conv2D(channel // reduction, 1, use_bias=False),
12 |             layers.Activation('relu'),
13 |             layers.Conv2D(channel, 1, use_bias=False)
14 |         ])
15 |         self.sigmoid = tf.sigmoid
16 | 
17 |     def call(self, x):
18 |         max_result = self.maxpool(x)
19 |         avg_result = self.avgpool(x)
20 |         max_out = self.se(max_result)
21 |         avg_out = self.se(avg_result)
22 |         output = self.sigmoid(max_out + avg_out)
23 |         return output
24 | 
25 | 
26 | class SpatialAttention(layers.Layer):
27 |     def __init__(self, kernel_size=7):
28 |         super(SpatialAttention, self).__init__()
29 |         self.conv = layers.Conv2D(1, kernel_size=kernel_size, padding='same')
30 |         self.sigmoid = tf.sigmoid
31 | 
32 |     def call(self, x):
33 |         max_result = tf.reduce_max(x, axis=-1, keepdims=True)
34 |         avg_result = tf.reduce_mean(x, axis=-1, keepdims=True)
35 |         result = tf.concat([max_result, avg_result], -1)
36 |         output = self.conv(result)
37 |         output = self.sigmoid(output)
38 |         return output
39 | 
40 | 
41 | class CBAMBlock(layers.Layer):
42 |     def __init__(self, channel=512, reduction=16, kernel_size=49):
43 |         super().__init__()
44 |         self.ca = ChannelAttention(channel=channel, reduction=reduction)
45 |         self.sa = SpatialAttention(kernel_size=kernel_size)
46 | 
47 |     def call(self, x):
48 |         b, _, _, c = x.get_shape()
49 |         residual = x
50 |         out = x * self.ca(x)
51 |         out = out * self.sa(out)
52 |         return out + residual
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     input = tf.random.normal((50, 7, 7, 512))
57 |     kernel_size = input.get_shape()[1]
58 |     cbam = CBAMBlock(channel=512, reduction=16, kernel_size=kernel_size)
59 |     output = cbam(input)
60 |     print(output.shape)
61 | 


--------------------------------------------------------------------------------
/attention/CoAtNet.py:
--------------------------------------------------------------------------------
 1 | from math import sqrt
 2 | 
 3 | import tensorflow as tf
 4 | from tensorflow.keras import layers, Sequential
 5 | from attention.SelfAttention import ScaledDotProductAttention
 6 | from conv.MBConv import MBConvBlock
 7 | 
 8 | 
 9 | class CoAtNet(layers.Layer):
10 |     def __init__(self, in_ch, out_chs=[64, 96, 192, 384, 768]):
11 |         super(CoAtNet, self).__init__()
12 |         self.out_chs = out_chs
13 |         self.maxpool2d = layers.MaxPool2D(pool_size=2, strides=2)
14 |         self.maxpool1d = layers.MaxPool1D(pool_size=2, strides=2)
15 | 
16 |         self.s0 = Sequential([
17 |             layers.Conv2D(in_ch, kernel_size=3, padding='same', activation='relu'),
18 |             layers.Conv2D(in_ch, kernel_size=3, padding='same')
19 |         ])
20 | 
21 |         self.mlp0 = Sequential([
22 |             layers.Conv2D(out_chs[0], kernel_size=1, padding='same', activation='relu'),
23 |             layers.Conv2D(out_chs[0], kernel_size=1, padding='same')
24 |         ])
25 | 
26 |         self.s1 = MBConvBlock(ksize=3, input_filters=out_chs[0], output_filters=out_chs[0])
27 |         self.mlp1 = Sequential([
28 |             layers.Conv2D(out_chs[1], kernel_size=1, activation='relu'),
29 |             layers.Conv2D(out_chs[1], kernel_size=1, )
30 |         ])
31 | 
32 |         self.s2 = MBConvBlock(ksize=3, input_filters=out_chs[1], output_filters=out_chs[1])
33 |         self.mlp2 = Sequential([
34 |             layers.Conv2D(out_chs[2], kernel_size=1, activation='relu'),
35 |             layers.Conv2D(out_chs[2], kernel_size=1, )
36 |         ])
37 | 
38 |         self.s3 = ScaledDotProductAttention(out_chs[2], out_chs[2] // 8, out_chs[2] // 8, 8)
39 |         self.mlp3 = Sequential([
40 |             layers.Dense(out_chs[3], activation='relu'),
41 |             layers.Dense(out_chs[3])
42 |         ])
43 | 
44 |         self.s4 = ScaledDotProductAttention(out_chs[3], out_chs[3] // 8, out_chs[3] // 8, 8)
45 |         self.mlp4 = Sequential([
46 |             layers.Dense(out_chs[4], activation='relu'),
47 |             layers.Dense(out_chs[4])
48 |         ])
49 | 
50 |     def call(self, x):
51 |         B, H, W, C = x.get_shape()
52 |         # stage0
53 |         y = self.mlp0(self.s0(x))
54 |         y = self.maxpool2d(y)
55 |         # stage1
56 |         y = self.mlp1(self.s1(y))
57 |         y = self.maxpool2d(y)
58 |         # stage2
59 |         y = self.mlp2(self.s2(y))
60 |         y = self.maxpool2d(y)
61 |         # stage3
62 |         y = tf.reshape(y, (B, -1, self.out_chs[2]))  # B, N, C
63 |         y = self.mlp3(self.s3(y, y, y))
64 |         y = self.maxpool1d(y)
65 |         # stage4
66 |         y = self.mlp4(self.s4(y, y, y))
67 |         y = self.maxpool1d(y)
68 |         N = y.get_shape()[-2]
69 |         y = tf.reshape(y, (B, int(sqrt(N)), int(sqrt(N)), self.out_chs[4]))
70 | 
71 |         return y
72 | 
73 | 
74 | if __name__ == '__main__':
75 |     input = tf.random.normal((1, 224, 224, 3))
76 |     coatnet = CoAtNet(3)
77 |     output = coatnet(input)
78 |     print(output.shape)
79 | 


--------------------------------------------------------------------------------
/attention/CoTAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class CoTAttention(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = CoTAttention(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/CoordAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class CoordAtt(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = CoordAtt(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/DANet.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | from attention.SelfAttention import ScaledDotProductAttention
 4 | from attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention
 5 | 
 6 | 
 7 | class PositionAttentionModule(layers.Layer):
 8 |     def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
 9 |         super(PositionAttentionModule, self).__init__()
10 |         self.cnn = layers.Conv2D(d_model, kernel_size=kernel_size, padding='same')
11 |         self.pa = ScaledDotProductAttention(d_model, d_k=d_model, d_v=d_model, h=1)
12 | 
13 |     def call(self, x):
14 |         bs, h, w, c = x.get_shape()
15 |         y = self.cnn(x)
16 |         y = tf.reshape(y, shape=(bs, h * w, c))
17 |         y = self.pa(y, y, y)  # bs, h*w, c
18 |         return y
19 | 
20 | 
21 | class ChannelAttentionModule(layers.Layer):
22 |     def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
23 |         super(ChannelAttentionModule, self).__init__()
24 |         self.cnn = layers.Conv2D(d_model, kernel_size=kernel_size, padding='same')
25 |         self.pa = SimplifiedScaledDotProductAttention(H * W, h=1)
26 | 
27 |     def call(self, x):
28 |         bs, h, w, c = x.get_shape()
29 |         y = self.cnn(x)
30 |         y = tf.reshape(y, shape=(bs, c, -1))  # bs, c, h*w
31 |         y = self.pa(y, y, y)  # bs, c, h*w
32 |         return y
33 | 
34 | 
35 | class DAModule(layers.Layer):
36 |     def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
37 |         super(DAModule, self).__init__()
38 |         self.position_attention_module = PositionAttentionModule(d_model=d_model, kernel_size=kernel_size, H=H, W=W)
39 |         self.channel_attention_module = ChannelAttentionModule(d_model=d_model, kernel_size=kernel_size, H=H, W=W)
40 | 
41 |     def call(self, input):
42 |         bs, h, w, c = input.get_shape()
43 |         p_out = self.position_attention_module(input)
44 |         c_out = self.channel_attention_module(input)
45 |         p_out = tf.reshape(p_out, shape=(bs, h, w, c))
46 |         c_out = tf.reshape(tf.transpose(c_out, perm=[0, 2, 1]), shape=(bs, h, w, c))
47 |         return p_out + c_out
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     input = tf.random.normal((50, 7, 7, 512))
52 |     danet = DAModule(d_model=512, kernel_size=3, H=7, W=7)
53 |     print(danet(input).shape)
54 | 


--------------------------------------------------------------------------------
/attention/ECAAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | 
 5 | class ECAAttention(layers.Layer):
 6 |     def __init__(self, kernel_size=3):
 7 |         super(ECAAttention, self).__init__()
 8 |         self.gap = layers.GlobalAvgPool2D()
 9 |         self.conv = layers.Conv1D(1, kernel_size=kernel_size, padding='same')
10 |         self.sigmoid = tf.sigmoid
11 | 
12 |     def call(self, x):
13 |         y = self.gap(x)  # bs, 1, 1, c
14 |         y = tf.expand_dims(y, -1)  # bs, c, 1
15 |         y = self.conv(y)  # bs, c, 1
16 |         y = self.sigmoid(y)  # bs, c, 1
17 |         y = tf.transpose(tf.expand_dims(y, -1), (0, 2, 3, 1))  # bs, 1, 1, c
18 |         return x * tf.broadcast_to(y, x.get_shape())
19 | 
20 | 
21 | if __name__ == '__main__':
22 |     input = tf.random.normal((50, 7, 7, 512))
23 |     eca = ECAAttention(kernel_size=3)
24 |     output = eca(input)
25 |     print(output.shape)
26 | 


--------------------------------------------------------------------------------
/attention/EMSA.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from tensorflow.keras import layers, Sequential
 4 | 
 5 | 
 6 | class EMSA(layers.Layer):
 7 |     def __init__(self, d_model, d_k, d_v, h, droupout=.1, H=7, W=7, ratio=3, apply_transform=True):
 8 |         super(EMSA, self).__init__()
 9 |         self.H = H
10 |         self.W = W
11 |         self.fc_q = layers.Dense(h * d_k)
12 |         self.fc_k = layers.Dense(h * d_k)
13 |         self.fc_v = layers.Dense(h * d_v)
14 |         self.fc_o = layers.Dense(d_model)
15 |         self.dropout = layers.Dropout(droupout)
16 | 
17 |         self.ratio = ratio
18 |         if self.ratio > 1:
19 |             self.sr = Sequential()
20 |             self.sr_conv = layers.Conv2D(d_model, kernel_size=ratio + 1, strides=ratio, padding='same', groups=d_model)
21 |             self.sr_ln = layers.LayerNormalization()
22 | 
23 |         self.apply_transform = apply_transform and h > 1
24 |         if self.apply_transform:
25 |             self.transform = Sequential()
26 |             self.transform.add(layers.Conv2D(h, kernel_size=1, strides=1, data_format='channels_first'))
27 |             self.transform.add(layers.Activation(tf.nn.softmax))
28 |             '''
29 |             Batch Normalisation(axis是沿着channel): 就是强行将数据拉回到均值为0，方差为1的正太分布上，这样不仅数据分布一致，而且避免发生梯度消失。依赖于batch的大小和输入sequence的深度。
30 |             Layer Normalisation(axis是沿着batch): LN不依赖于batch的大小和输入sequence的深度，因此可以用于batchsize为1和RNN中对边长的输入sequence的normalize操作。LN用于RNN效果比较明显，但是在CNN上，不如BN。
31 |             Instance Normalisation(axis是沿着batch和channel): 同BN注重对每个batch进行归一化，保证数据分布一致，因为判别模型中结果取决于数据整体分布。但是图像风格化中，生成结果主要依赖于某个图像实例，所以对整个batch归一化不适合图像风格化中，因而对HW做归一化。可以加速模型收敛，并且保持每个图像实例之间的独立。
32 |             Group Normalization: 主要是针对Batch Normalization对小batchsize效果差，GN将channel方向分group，然后每个group内做归一化，算(C//G)*H*W的均值，这样与batchsize无关，不受其约束。
33 |             '''
34 |             self.transform.add(layers.BatchNormalization(axis=[0, 1]))  # InstanceNormalisation，[0, 1] is bs and c.
35 |             # self.transform.add(tfa.layers.InstanceNormalization())
36 | 
37 |         self.d_model = d_model
38 |         self.d_k = d_k
39 |         self.d_v = d_v
40 |         self.h = h
41 | 
42 |     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int):
43 |         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
44 |         tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.h, self.d_model))
45 | 
46 |         # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
47 |         return tf.transpose(tensor, perm=[0, 2, 1, 3])
48 | 
49 |     def call(self, queries, keys, values, attention_mask=None, attention_weights=None):
50 | 
51 |         b_s, nq, c = queries.get_shape()
52 | 
53 |         q = self.transpose_for_scores(self.fc_q(queries), batch_size=b_s)  # (b_s, h, nq, d_k)
54 | 
55 |         if self.ratio > 1:
56 |             x = tf.reshape(queries, shape=[b_s, self.H, self.W, c])  # (b_s, H, W, c)
57 |             x = self.sr_conv(x)  # (b_s, h, w, c)
58 |             x = tf.reshape(x, shape=[b_s, -1, c])  # (bs, n', c)
59 |             x = self.sr_ln(x)
60 |             k = self.transpose_for_scores(self.fc_k(x), batch_size=b_s)  # (bs, h, n', d_k)
61 |             v = self.transpose_for_scores(self.fc_v(x), batch_size=b_s)  # (bs, h, n', d_v)
62 |         else:
63 |             k = self.transpose_for_scores(self.fc_k(keys), batch_size=b_s)  # (bs, h, nk, d_k)
64 |             v = self.transpose_for_scores(self.fc_v(values), batch_size=b_s)  # (bs, h, nk, d_v)
65 | 
66 |         if self.apply_transform:
67 |             att = tf.matmul(q, k, transpose_b=True) / np.sqrt(self.d_k)  # (bs, h, nq, n')
68 |             att = self.transform(att)
69 |         else:
70 |             att = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(self.d_k)  # (bs, h, nq, n')
71 |             att = tf.math.softmax(att, -1)  # (bs, h, nq, n')
72 | 
73 |         if attention_weights is not None:
74 |             att = att * attention_weights
75 |         if attention_mask is not None:
76 |             att = tf.multiply(att, attention_mask)
77 | 
78 |         att = self.dropout(att)
79 | 
80 |         out = tf.reshape(tf.transpose(tf.matmul(att, v), perm=[0, 2, 1, 3]),
81 |                          shape=(b_s, nq, self.h * self.d_v))  # (bs, nq, h*d_v)
82 |         out = self.fc_o(out)
83 |         return out
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     input = tf.random.normal((50, 64, 512))
88 |     emsa = EMSA(d_model=512, d_k=512, d_v=512, h=8, H=8, W=8, ratio=2, apply_transform=True)
89 |     output = emsa(input, input, input)
90 |     print(output.shape)
91 | 


--------------------------------------------------------------------------------
/attention/ExternalAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | from tensorflow.keras import layers
 4 | 
 5 | 
 6 | class ExternalAttention(layers.Layer):
 7 | 
 8 |     def __init__(self, d_model, S=64):
 9 |         super(ExternalAttention, self).__init__(name='ExternalAttention')
10 |         self.mk = layers.Dense(S, use_bias=False)
11 |         self.mv = layers.Dense(d_model, use_bias=False)
12 | 
13 |     def call(self, queries):
14 |         attn = self.mk(queries)  # bs,n,S
15 |         attn = tf.nn.softmax(attn, axis=1)  # bs,n,S
16 |         attn = attn / tf.reduce_sum(attn, axis=2, keepdims=True)  # bs,n,S (l1_norm)
17 |         out = self.mv(attn)  # bs,n,d_model
18 | 
19 |         return out
20 | 
21 | 
22 | if __name__ == '__main__':
23 |     input = tf.random.normal(shape=(50, 49, 512))
24 |     ea = ExternalAttention(d_model=512, S=8)
25 |     output = ea(input)
26 |     print(output.shape)
27 | 


--------------------------------------------------------------------------------
/attention/HaloAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class HaloAttention(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     halo = HaloAttention(512, 128, 128, True)
10 |     output = halo(input)
11 |     print(output.shape)
12 |     # 参考https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/halonet/halonet.py


--------------------------------------------------------------------------------
/attention/MUSEAttention.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import tensorflow as tf
  3 | from tensorflow.keras import layers
  4 | 
  5 | 
  6 | class Depth_Pointwise_Conv1d(layers.Layer):
  7 |     def __init__(self, in_ch, out_ch, k):
  8 |         super(Depth_Pointwise_Conv1d, self).__init__()
  9 |         if k == 1:
 10 |             self.depth_conv = tf.identity
 11 |         else:
 12 |             self.depth_conv = layers.Conv1D(
 13 |                 filters=in_ch,
 14 |                 kernel_size=k,
 15 |                 groups=in_ch,
 16 |                 padding='same'
 17 |             )
 18 |         self.pointwise_conv = layers.Conv1D(
 19 |             filters=out_ch,
 20 |             kernel_size=1,
 21 |             groups=1
 22 |         )
 23 | 
 24 |     def call(self, x):
 25 |         depth_conv_out = self.depth_conv(x)
 26 |         out = self.pointwise_conv(depth_conv_out)
 27 |         return out
 28 | 
 29 | 
 30 | class MUSEAttention(layers.Layer):
 31 |     def __init__(self, d_model, d_k, d_v, h, dropout=1):
 32 |         super(MUSEAttention, self).__init__()
 33 |         self.fc_q = layers.Dense(h * d_k)
 34 |         self.fc_k = layers.Dense(h * d_k)
 35 |         self.fc_v = layers.Dense(h * d_v)
 36 |         self.fc_o = layers.Dense(d_model)
 37 |         self.dropout = layers.Dropout(dropout)
 38 | 
 39 |         self.conv1 = Depth_Pointwise_Conv1d(h * d_v, d_model, 1)
 40 |         self.conv3 = Depth_Pointwise_Conv1d(h * d_v, d_model, 3)
 41 |         self.conv5 = Depth_Pointwise_Conv1d(h * d_v, d_model, 5)
 42 |         self.dy_paras = tf.Variable(tf.ones(3), trainable=True)
 43 |         self.softmax = tf.nn.softmax
 44 | 
 45 |         self.d_model = d_model
 46 |         self.d_k = d_k
 47 |         self.d_v = d_v
 48 |         self.h = h
 49 | 
 50 |     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int):
 51 |         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
 52 |         tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.h, self.d_model))
 53 | 
 54 |         # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
 55 |         return tf.transpose(tensor, perm=[0, 2, 1, 3])
 56 | 
 57 |     def call(self, queries, keys, values, attention_mask=None, attention_weights=None):
 58 | 
 59 |         # Self Attention
 60 |         b_s, nq = queries.shape[:2]
 61 |         nk = keys.shape[1]
 62 | 
 63 |         q = self.transpose_for_scores(self.fc_q(queries), batch_size=b_s)  # (b_s, h, nq ,d_k)
 64 |         k = self.transpose_for_scores(self.fc_k(keys), batch_size=b_s)  # (b_s, h, nk, d_k)
 65 |         v = self.transpose_for_scores(self.fc_v(values), batch_size=b_s)  # (b_s, h, nk ,d_v)
 66 | 
 67 |         # Take the dot product between "query" and "key" to get the raw attention scores.
 68 |         # (batch size, num_heads, seq_len_q, seq_len_k)
 69 |         att = tf.matmul(q, k, transpose_b=True) / np.sqrt(self.d_k)
 70 | 
 71 |         if attention_weights is not None:
 72 |             att = att * attention_weights
 73 |         if attention_mask is not None:
 74 |             att = tf.multiply(att, attention_mask)
 75 | 
 76 |         # Normalize the attention scores to probabilities.
 77 |         att = self.softmax(att, -1)
 78 | 
 79 |         att = self.dropout(att)
 80 | 
 81 |         out = tf.reshape(tf.transpose(tf.matmul(att, v), (0, 2, 1, 3)),
 82 |                          (b_s, nq, self.h * self.d_v))  # (b_s, nq, h*d_v)
 83 |         out = self.fc_o(out)  # (b_s, nq, d_model)
 84 | 
 85 |         v2 = tf.reshape(tf.transpose(v, (0, 2, 1, 3)), (b_s, nk, -1))  # bs, dim, nk
 86 |         self.dy_paras = tf.Variable(self.softmax(self.dy_paras, -1))
 87 | 
 88 |         out2 = self.dy_paras[0] * self.conv1(v2) + self.dy_paras[1] * self.conv3(v2) + self.dy_paras[2] * self.conv5(v2)
 89 |         # out2 = tf.transpose(out2, (0, 2, 1))  # bs, n, dim
 90 | 
 91 |         out = out + out2
 92 |         return out
 93 | 
 94 | 
 95 | if __name__ == '__main__':
 96 |     input = tf.random.normal((50, 49, 512))
 97 |     sa = MUSEAttention(d_model=512, d_k=512, d_v=512, h=8)
 98 |     output = sa(input, input, input)
 99 |     print(output.shape)
100 | 


--------------------------------------------------------------------------------
/attention/MobileViTAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class MobileViTAttention(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = MobileViTAttention(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/OutlookAttention.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | import tensorflow as tf
 4 | from tensorflow.keras import layers
 5 | 
 6 | 
 7 | class OutlookAttention(layers.Layer):
 8 |     def __init__(self, dim, num_heads=1, kernel_size=3, padding=1, stride=1, qkv_bias=False, attn_drop=0.1):
 9 |         super(OutlookAttention, self).__init__()
10 |         self.dim = dim
11 |         self.num_heads = num_heads
12 |         self.head_dim = dim // num_heads
13 |         self.kernel_size = kernel_size
14 |         self.padding = padding
15 |         self.stride = stride
16 |         self.scale = self.head_dim ** (-0.5)
17 | 
18 |         self.v_pj = layers.Dense(dim, use_bias=qkv_bias)
19 |         self.attn = layers.Dense(kernel_size ** 4 * num_heads)
20 | 
21 |         self.attn_drop = layers.Dropout(attn_drop)
22 |         self.proj = layers.Dense(dim)
23 |         self.proj_drop = layers.Dropout(attn_drop)
24 | 
25 |         self.unflod = tf.image.extract_patches(sizes=[1, kernel_size, kernel_size, 1], strides=[1, stride, stride, 1],
26 |                                                padding='same')
27 |         self.pool = layers.AvgPool2D(pool_size=stride, strides=stride, ceil_mode=True)
28 | 
29 |     def call(self, x):
30 |         B, H, W, C = x.get_shape()
31 | 
32 |         # 映射到新的特征v
33 |         v = self.v_pj(x)
34 |         h, w = math.ceil(H / self.stride), math.ceil(W / self.stride)
35 |         v = tf.reshape(self.unflod(v), (B, self.num_heads, h * w, self.kernel_size * self.kernel_size, self.head_dim))
36 | 
37 |         # 生成Attention Map
38 |         attn = self.pool(x)
39 |         attn = tf.reshape(self.attn(attn),
40 |                           (B, self.num_heads, h * w, self.kernel_size * self.kernel_size,
41 |                            self.kernel_size * self.kernel_size))
42 | 
43 |         attn = self.scale * attn
44 |         attn = tf.nn.softmax(attn, axis=-1)
45 |         attn = self.attn_drop(attn)
46 | 
47 |         # 获取weighted特征
48 |         out = tf.reshape((attn @ v), (B, h*w, C * self.kernel_size * self.kernel_size))
49 |         out = tf.fold  # torch.nn.fold开发者说没有这个功能，未来没打算加，以后再补充。见https://github.com/tensorflow/tensorflow/issues/52195#issuecomment-948915934
50 | 
51 | 
52 | if __name__ == '__main__':
53 |     input = tf.random.normal((50, 7, 7, 512))
54 |     outlook = OutlookAttention(512, 128, 128, True)
55 |     output = outlook(input)
56 |     print(output.shape)
57 | 


--------------------------------------------------------------------------------
/attention/PSA.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers, Sequential
 3 | 
 4 | 
 5 | class PSA(layers.Layer):
 6 |     def __init__(self, channel=512, reduction=4, S=4):
 7 |         super(PSA, self).__init__()
 8 |         self.S = S
 9 |         self.convs = []
10 |         for i in range(S):
11 |             self.convs.append(layers.Conv2D(channel // S, kernel_size=2 * (i + 1) + 1, padding='same'))
12 | 
13 |         self.se_blocks = []
14 |         for i in range(S):
15 |             self.se_blocks.append(Sequential([
16 |                 layers.GlobalAvgPool2D(keepdims=True),
17 |                 layers.Conv2D(channel // (S * reduction), kernel_size=1, use_bias=False),
18 |                 layers.Activation('relu'),
19 |                 layers.Conv2D(channel // S, kernel_size=1, use_bias=False),
20 |                 layers.Activation('sigmoid')
21 |             ]))
22 | 
23 |         self.softmax = tf.nn.softmax
24 | 
25 |     def call(self, x):
26 |         b, h, w, c = x.get_shape()
27 | 
28 |         # Step1: SPC module
29 |         SPC_out = tf.reshape(x, shape=(b, h, w, self.S, c // self.S))  # bs, h, w, s, ci
30 |         SPC_out_list = []
31 |         for idx, conv in enumerate(self.convs):
32 |             SPC_out_list.append(conv(SPC_out[:, :, :, idx, :]))
33 | 
34 |         SPC_out = tf.stack(SPC_out_list, axis=3)
35 | 
36 |         # Step2: SE weight
37 |         se_out = []
38 |         for idx, se in enumerate(self.se_blocks):
39 |             se_out.append((se(SPC_out[:, :, :, idx, :])))
40 |         SE_out = tf.stack(se_out, axis=3)
41 |         SE_out = tf.broadcast_to(SE_out, SPC_out.get_shape())
42 | 
43 |         # Step3: Softmax
44 |         softmax_out = self.softmax(SE_out)
45 | 
46 |         # Step4: SPA
47 |         PSA_out = SPC_out * softmax_out
48 |         PSA_out = tf.reshape(PSA_out, shape=(b, h, w, -1))
49 | 
50 |         return PSA_out
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     input = tf.random.normal((50, 7, 7, 512))
55 |     psa = PSA(channel=512, reduction=8)
56 |     output = psa(input)
57 |     print(output.shape)
58 | 


--------------------------------------------------------------------------------
/attention/ParNetAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class AFT_FULL(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = AFT_FULL(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/PolarizedSelfAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras.layers import Layer, Conv2D, Softmax, LayerNormalization, Activation
 3 | import tensorflow.keras.backend as K
 4 | 
 5 | class SequentialPolarizedSelfAttention(Layer):
 6 |     def __init__(self, channel=512, **kwargs):
 7 |         super(SequentialPolarizedSelfAttention, self).__init__(**kwargs)
 8 |         self.channel = channel
 9 |         self.ch_wv = Conv2D(channel // 2, kernel_size=(1, 1), padding='same')
10 |         self.ch_wq = Conv2D(1, kernel_size=(1, 1), padding='same')
11 |         self.softmax_channel = Softmax(axis=1)
12 |         self.softmax_spatial = Softmax(axis=-1)
13 |         self.ch_wz = Conv2D(channel, kernel_size=(1, 1), padding='same')
14 |         self.ln = LayerNormalization(axis=[1, 2, 3])
15 |         self.sigmoid = Activation('sigmoid')
16 |         self.sp_wv = Conv2D(channel // 2, kernel_size=(1, 1), padding='same')
17 |         self.sp_wq = Conv2D(channel // 2, kernel_size=(1, 1), padding='same')
18 |         self.agp = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)
19 | 
20 |     def call(self, x):
21 |         # Channel-only Self-Attention
22 |         channel_wv = self.ch_wv(x) # bs, h, w, c//2
23 |         channel_wq = self.ch_wq(x) # bs, h, w, 1
24 |         channel_wv = tf.reshape(channel_wv, [tf.shape(x)[0], -1, self.channel // 2]) # bs, h*w, c//2
25 |         channel_wq = tf.reshape(channel_wq, [tf.shape(x)[0], -1, 1]) # bs, h, w, 1
26 |         channel_wq = self.softmax_channel(channel_wq) # bs, h*w, 1
27 |         channel_wz = tf.matmul(channel_wv, channel_wq, transpose_a=True)  # bs, c//2, 1
28 |         channel_wz = tf.reshape(channel_wz, [tf.shape(x)[0], 1, 1, self.channel // 2])
29 |         channel_wz = self.ch_wz(channel_wz)
30 |         channel_wz = tf.reshape(channel_wz, [tf.shape(x)[0], 1, 1, self.channel])
31 |         channel_weight = self.sigmoid(self.ln(channel_wz)) # bs, 1, 1, c
32 |         channel_out = channel_weight * x
33 | 
34 |         # Spatial-only Self-Attention
35 |         spatial_wv = self.sp_wv(channel_out) # bs, h, w, c//2
36 |         spatial_wq = self.sp_wq(channel_out) # bs, h, w, c//2
37 |         spatial_wq = self.agp(spatial_wq) # bs, 1, 1, c//2
38 |         spatial_wv = tf.reshape(spatial_wv, [tf.shape(x)[0], -1, self.channel // 2]) # bs, h*w, c//2
39 |         spatial_wq = tf.reshape(spatial_wq, [tf.shape(x)[0], 1, self.channel // 2]) # bs, 1, c//2
40 |         spatial_wq = self.softmax_spatial(spatial_wq)
41 |         spatial_wz = tf.matmul(spatial_wq, spatial_wv, transpose_b=True) # bs, 1, h*w, 
42 |         spatial_weight = self.sigmoid(tf.reshape(spatial_wz, [tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], 1])) # bs, h, w, 1
43 |         spatial_out = spatial_weight * channel_out
44 |         
45 |         return spatial_out
46 | 
47 | # Test the SequentialPolarizedSelfAttention layer
48 | if __name__ == '__main__':
49 |     input_tensor = tf.random.normal([1, 7, 7, 512])
50 |     psa = SequentialPolarizedSelfAttention(channel=512)
51 |     output_tensor = psa(input_tensor)
52 |     print(output_tensor.shape)


--------------------------------------------------------------------------------
/attention/ResidualAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | from tensorflow.keras.layers import Conv2D
 4 | from tensorflow.keras import Model
 5 | 
 6 | 
 7 | class ResidualAttention(Model):
 8 |     def __init__(self, num_class=1000, name='ResidualAttention', la=0.2):
 9 |         super(ResidualAttention, self).__init__(name=name)
10 |         self.la = la
11 |         self.fc = Conv2D(filters=num_class, kernel_size=1, strides=1, use_bias=False)
12 | 
13 |     def call(self, x):
14 |         x = self.fc(x)
15 |         b, h, w, c = x.shape
16 |         y_raw = tf.reshape(x, [-1, h * w, c])  # b, hxw, num_class
17 |         y_avg = tf.reduce_mean(y_raw, axis=1)  # b, num_class
18 |         y_max = tf.reduce_max(y_raw, axis=1)  # b, num_class
19 |         score = y_avg + self.la * y_max
20 |         return score
21 | 
22 | 
23 | if __name__ == '__main__':
24 |     input = tf.random.normal(shape=(50, 7, 7, 512))
25 |     resatt = ResidualAttention(num_class=1000, la=0.2)
26 |     output = resatt(input)
27 |     print(output.shape)
28 | 


--------------------------------------------------------------------------------
/attention/S2Attention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class S2Attention(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = S2Attention(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/SEAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers, Sequential
 3 | 
 4 | 
 5 | class SEAttention(layers.Layer):
 6 |     def __init__(self, channel=512, reduction=16):
 7 |         super(SEAttention, self).__init__()
 8 |         self.avg_pool = layers.GlobalAvgPool2D(
 9 |             keepdims=True)  # 同nn.AdaptiveAvgPool2d(1)， 但是注意torch的输出是保持4维的,而tensorflow不保持维度.
10 |         self.fc = Sequential([
11 |             layers.Dense(channel // reduction, use_bias=False),
12 |             layers.Activation('relu'),
13 |             layers.Dense(channel, use_bias=False),
14 |             layers.Activation('sigmoid')
15 |         ])
16 | 
17 |     def call(self, x):
18 |         b, h, w, c = x.get_shape()
19 |         y = self.avg_pool(x)
20 |         y = self.fc(y)
21 |         return x * tf.tile(y, (1, h, w, 1))  # or use 'tf.broadcast_to(y, x.get_shape())'
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     input = tf.random.normal((50, 7, 7, 512))
26 |     se = SEAttention(channel=512, reduction=8)
27 |     output = se(input)
28 |     print(output.shape)
29 | 


--------------------------------------------------------------------------------
/attention/SGE.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | 
 5 | class SpatialGroupEnhance(layers.Layer):
 6 |     def __init__(self, groups):
 7 |         super(SpatialGroupEnhance, self).__init__()
 8 |         self.groups = groups
 9 |         self.avg_pool = layers.GlobalAvgPool2D(keepdims=True)
10 |         self.sig = tf.sigmoid
11 | 
12 |     def build(self, input_shape):
13 |         self.weight = self.add_weight(shape=(1, 1, 1, self.groups), initializer='zeros', trainable=True)
14 |         self.bias = self.add_weight(shape=(1, 1, 1, self.groups), initializer='zeros', trainable=True)
15 |         super(SpatialGroupEnhance, self).build(input_shape)
16 | 
17 |     def call(self, x):
18 |         b, h, w, c = x.get_shape()
19 |         x = tf.reshape(x, (b * self.groups, h, w, -1))  # bs*g, h, w, dim//g
20 |         xn = x * self.avg_pool(x)  # bs*g, h, w, dim//g
21 |         xn = tf.reduce_sum(xn, axis=-1, keepdims=True)  # bs*g, h, w, 1
22 |         t = tf.reshape(xn, (b * self.groups, -1))  # bs*g, h*w
23 | 
24 |         t = t - tf.reduce_mean(t, axis=-1, keepdims=True)  # bs*g, h*w
25 |         std = tf.math.reduce_std(t, axis=-1, keepdims=True) + 1e-5
26 |         t = t / std  # bs*g, h*w
27 |         t = tf.reshape(t, (b, h, w, self.groups))  # bs, h, w, g
28 | 
29 |         t = t * self.weight + self.bias  # bs, h, w, g
30 |         t = tf.reshape(t, (b * self.groups, h, w, 1))  # bs*g, h, w, 1
31 |         x = x * self.sig(t)  # bs*g, h, w, dim//g
32 |         x = tf.reshape(x, (b, h, w, c))
33 | 
34 |         return x
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     input = tf.random.normal((50, 7, 7, 512))
39 |     sge = SpatialGroupEnhance(groups=8)
40 |     output = sge(input)
41 |     print(output.shape)
42 | 


--------------------------------------------------------------------------------
/attention/SKAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers, Sequential
 3 | 
 4 | 
 5 | class SKAttention(layers.Layer):
 6 |     def __init__(self, channel=512, kernels=[1, 3, 5, 7], reduction=16, group=1, L=32):
 7 |         super(SKAttention, self).__init__()
 8 |         self.d = max(L, channel // reduction)
 9 |         self.convs = []
10 |         # self.convs = Sequential([])
11 |         for k in kernels:
12 |             self.convs.append(
13 |                 Sequential([
14 |                     layers.Conv2D(channel, kernel_size=k, padding='same', groups=group, name='conv'),
15 |                     layers.BatchNormalization(name='bn'),
16 |                     layers.Activation('relu', name='relu'),
17 |                 ])
18 |             )
19 |         self.fc = layers.Dense(self.d)
20 |         self.fcs = []
21 |         for i in range(len(kernels)):
22 |             self.fcs.append(layers.Dense(channel))
23 | 
24 |     def call(self, x):
25 |         bs, _, _, c = x.get_shape()
26 |         conv_outs = []
27 |         ### split
28 |         for conv in self.convs:
29 |             conv_outs.append(conv(x))
30 |         feats = tf.stack(conv_outs, 0)  # k, bs, h, w, channel
31 | 
32 |         ### fuse
33 |         U = sum(conv_outs)  # bs, h, w, c
34 | 
35 |         ### reduction channel
36 |         S = tf.reduce_mean(tf.reduce_mean(U, axis=-2), axis=-2)  # bs, c
37 |         Z = self.fc(S)  # bs, d
38 | 
39 |         ### calculate attention weight
40 |         weights = []
41 |         for fc in self.fcs:
42 |             weight = fc(Z)
43 |             weights.append(tf.reshape(weight, (bs, 1, 1, c)))  # bs, channel
44 |         attention_weughts = tf.stack(weights, 0)  # k, bs, 1, 1, channel
45 |         attention_weughts = tf.nn.softmax(attention_weughts, axis=0)  # k, bs, 1, 1, channel
46 | 
47 |         ### fuse
48 |         V = tf.reduce_sum(attention_weughts * feats, 0)
49 |         return V
50 | 
51 | 
52 | if __name__ == '__main__':
53 |     input = tf.random.normal((50, 7, 7, 512))
54 |     se = SKAttention(channel=512, reduction=8)
55 |     output = se(input)
56 |     print(output.shape)
57 | 


--------------------------------------------------------------------------------
/attention/SelfAttention.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from tensorflow.keras import layers
 4 | 
 5 | 
 6 | class ScaledDotProductAttention(layers.Layer):
 7 |     """
 8 |     Scaled dot-product attention
 9 |     """
10 | 
11 |     def __init__(self, d_model, d_k, d_v, h, dropout=.1):
12 |         """
13 |         :param d_model: Output dimensionality of the model
14 |         :param d_k: Dimensionality of queries and keys
15 |         :param d_v: Dimensionality of values
16 |         :param h: Number of heads
17 |         """
18 |         super(ScaledDotProductAttention, self).__init__()
19 |         self.fc_q = layers.Dense(h * d_k)
20 |         self.fc_k = layers.Dense(h * d_k)
21 |         self.fc_v = layers.Dense(h * d_k)
22 |         self.fc_o = layers.Dense(d_model)
23 |         self.dropout = layers.Dropout(dropout)
24 |         self.softmax = tf.nn.softmax
25 | 
26 |         self.d_model = d_model
27 |         self.d_k = d_k
28 |         self.d_v = d_v
29 |         self.h = h
30 | 
31 |     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int):
32 |         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
33 |         tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.h, self.d_model))
34 | 
35 |         # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
36 |         return tf.transpose(tensor, perm=[0, 2, 1, 3])
37 | 
38 |     def call(self, queries, keys, values, attention_mask=None, attention_weights=None):
39 |         """
40 |         Computs
41 |         :param queries: Queries (b_s, nq, d_model)
42 |         :param keys: Keys (b_s, nk, d_model)
43 |         :param values: Values (b_s, nk, d_model)
44 |         :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking
45 |         :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
46 |         :return:
47 |         """
48 |         b_s, nq = queries.shape[:2]
49 | 
50 |         q = self.transpose_for_scores(self.fc_q(queries), batch_size=b_s)  # (b_s, h, nq ,d_k)
51 |         k = self.transpose_for_scores(self.fc_k(keys), batch_size=b_s)  # (b_s, h, nk, d_k)
52 |         v = self.transpose_for_scores(self.fc_v(values), batch_size=b_s)  # (b_s, h, nk ,d_v)
53 | 
54 |         # Take the dot product between "query" and "key" to get the raw attention scores.
55 |         # (batch size, num_heads, seq_len_q, seq_len_k)
56 |         att = tf.matmul(q, k, transpose_b=True) / np.sqrt(self.d_k)
57 | 
58 |         if attention_weights is not None:
59 |             att = att * attention_weights
60 |         if attention_mask is not None:
61 |             att = tf.multiply(att, attention_mask)
62 | 
63 |         # Normalize the attention scores to probabilities.
64 |         att = self.softmax(att, -1)
65 | 
66 |         att = self.dropout(att)
67 | 
68 |         out = tf.reshape(tf.transpose(tf.matmul(att, v), (0, 2, 1, 3)),
69 |                          (b_s, nq, self.h * self.d_v))  # (b_s, nq, h*d_v)
70 |         out = self.fc_o(out)  # (b_s, nq, d_model)
71 |         return out
72 | 
73 | 
74 | if __name__ == '__main__':
75 |     input = tf.random.normal((50, 49, 512))
76 |     sa = ScaledDotProductAttention(d_model=512, d_k=512, d_v=512, h=8)
77 |     output = sa(input, input, input)
78 |     print(output.shape)
79 | 


--------------------------------------------------------------------------------
/attention/ShuffleAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import tensorflow_addons as tfa
 3 | from tensorflow.keras import layers
 4 | 
 5 | 
 6 | class ShuffleAttention(layers.Layer):
 7 |     def __init__(self, channel=512, reduction=16, G=8):
 8 |         super(ShuffleAttention, self).__init__()
 9 |         self.G = G
10 |         self.channel = channel
11 |         self.avg_pool = layers.GlobalAvgPool2D(keepdims=True)
12 |         self.gn = tfa.layers.GroupNormalization(channel // (2 * G), axis=-1)
13 |         self.sigmoid = tf.nn.sigmoid
14 | 
15 |     def build(self, input_shape):
16 |         self.cweight = self.add_weight(
17 |             shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='zeros', trainable=True,
18 |         )
19 |         self.cbias = self.add_weight(
20 |             shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='ones', trainable=True,
21 |         )
22 |         self.sweight = self.add_weight(
23 |             shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='zeros', trainable=True,
24 |         )
25 |         self.sbias = self.add_weight(
26 |             shape=(1, 1, 1, self.channel // (2 * self.G)), initializer='ones', trainable=True,
27 |         )
28 |         super(ShuffleAttention, self).build(input_shape)
29 | 
30 |     @staticmethod
31 |     def channel_shuffle(x, groups):
32 |         b, h, w, c = x.get_shape()
33 |         x = tf.reshape(x, shape=(b, h, w, groups, -1))
34 |         x = tf.transpose(x, perm=(0, 1, 2, 4, 3))
35 | 
36 |         # flatten
37 |         x = tf.reshape(x, shape=(b, h, w, -1))
38 |         return x
39 | 
40 |     def call(self, x):
41 |         b, h, w, c = x.get_shape()
42 |         # group into subfeatures
43 |         x = tf.reshape(x, (b * self.G, h, w, -1))  # bs*G, h, w, c//G
44 | 
45 |         # channel_split
46 |         x_0, x_1 = tf.split(x, num_or_size_splits=2, axis=3)  # bs*G, h, w, c//(2*G)
47 | 
48 |         # channel attention
49 |         x_channel = self.avg_pool(x_0)  # bs*G, 1, 1, c//(2*G)
50 |         x_channel = self.cweight * x_channel + self.cbias  # bs*G, 1, 1, c//(2*G)
51 |         x_channel = x_0 * self.sigmoid(x_channel)  # bs*G, h, w, c//(2*G)
52 | 
53 |         # spatial attention
54 |         x_spatial = self.gn(x_1)  # bs*G, h, w, c//(2*G)
55 |         x_spatial = self.sweight * x_spatial + self.sbias  # bs*G, h, w, c//(2*G)
56 |         x_spatial = x_1 * self.sigmoid(x_spatial)  # bs*G, h, w, c//(2*G)
57 | 
58 |         # concatenate along channel axis
59 |         out = tf.concat([x_channel, x_spatial], axis=3)
60 |         out = tf.reshape(out, (b, h, w, -1))
61 | 
62 |         # channel shuffle
63 |         out = self.channel_shuffle(out, 2)
64 | 
65 |         return out
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     input = tf.random.normal((50, 7, 7, 512))
70 |     se = ShuffleAttention(channel=512, G=8)
71 |     output = se(input)
72 |     print(output.shape)
73 | 


--------------------------------------------------------------------------------
/attention/SimplifiedSelfAttention.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from tensorflow.keras import layers
 4 | 
 5 | 
 6 | class SimplifiedScaledDotProductAttention(layers.Layer):
 7 |     """
 8 |     Scaled dot-product attention
 9 |     """
10 | 
11 |     def __init__(self, d_model, h, dropout=.1):
12 |         """
13 |         :param d_model: Output dimensionality of the model
14 |         :param d_k: Dimensionality of queries and keys
15 |         :param d_v: Dimensionality of values
16 |         :param h: Number of heads
17 |         """
18 |         super(SimplifiedScaledDotProductAttention, self).__init__()
19 | 
20 |         self.d_model = d_model
21 |         self.d_k = d_model // h
22 |         self.d_v = d_model // h
23 |         self.h = h
24 | 
25 |         self.fc_o = layers.Dense(d_model)
26 |         self.dropout = layers.Dropout(dropout)
27 | 
28 |     def call(self, queries, keys, values, attention_mask=None, attention_weights=None):
29 |         '''
30 |         Computes
31 |         :param queries: Queries (b_s, nq, d_model)
32 |         :param keys: Keys (b_s, nk, d_model)
33 |         :param values: Values (b_s, nk, d_model)
34 |         :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking.
35 |         :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
36 |         :return:
37 |         '''
38 |         b_s, nq = queries.shape[:2]
39 |         nk = keys.shape[1]
40 | 
41 |         q = tf.transpose(tf.reshape(queries, (b_s, nq, self.h, self.d_k)), (0, 2, 1, 3))  # (b_s, h, nq, d_k)
42 |         k = tf.transpose(tf.reshape(keys, (b_s, nk, self.h, self.d_k)), (0, 2, 3, 1))  # (b_s, h, d_k, nk)
43 |         v = tf.transpose(tf.reshape(values, (b_s, nk, self.h, self.d_v)), (0, 2, 1, 3))  # (b_s, h, nk, d_v)
44 | 
45 |         att = tf.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
46 |         if attention_weights is not None:
47 |             att = att * attention_weights
48 |         if attention_mask is not None:
49 |             att = att.masked_fill(attention_mask, -np.inf)
50 |         att = tf.nn.softmax(att, -1)
51 |         att = self.dropout(att)
52 | 
53 |         out = tf.reshape(tf.transpose(tf.matmul(att, v), (0, 2, 1, 3)),
54 |                          (b_s, nq, self.h * self.d_v))  # (b_s, nq, h*d_v)
55 |         out = self.fc_o(out)  # (b_s, nq, d_model)
56 |         return out
57 | 
58 | 
59 | if __name__ == '__main__':
60 |     input = tf.random.normal((50, 49, 512))
61 |     ssa = SimplifiedScaledDotProductAttention(d_model=512, h=8)
62 |     output = ssa(input, input, input)
63 |     print(output.shape)
64 | 


--------------------------------------------------------------------------------
/attention/TripletAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class TripletAttention(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = TripletAttention(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/UFOAttention.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class AFT_FULL(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = AFT_FULL(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/ViP.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | 
 5 | class MLP(layers.Layer):
 6 |     def __init__(self, hidden_features, out_features, drop=0.1):
 7 |         super(MLP, self).__init__()
 8 |         self.fc1 = layers.Dense(hidden_features, activation='gelu')
 9 |         self.fc2 = layers.Dense(out_features)
10 |         self.drop = layers.Dropout(drop)
11 | 
12 |     def call(self, x):
13 |         return self.drop(self.fc2(self.drop(self.fc1(x))))
14 | 
15 | 
16 | class WeightedPermuteMLP(layers.Layer):
17 |     def __init__(self, dim, seg_dim=8, qkv_bias=False, proj_drop=0.):
18 |         super(WeightedPermuteMLP, self).__init__()
19 |         self.seg_dim = seg_dim
20 |         self.mlp_c = layers.Dense(dim, use_bias=qkv_bias)
21 |         self.mlp_h = layers.Dense(dim, use_bias=qkv_bias)
22 |         self.mlp_w = layers.Dense(dim, use_bias=qkv_bias)
23 | 
24 |         self.reweighting = MLP(dim // 4, dim * 3)
25 | 
26 |         self.proj = layers.Dense(dim)
27 |         self.proj_drop = layers.Dropout(proj_drop)
28 | 
29 |     def call(self, x):
30 |         B, H, W, C = x.get_shape()
31 | 
32 |         c_embed = self.mlp_c(x)
33 | 
34 |         S = C // self.seg_dim
35 |         h_embed = tf.reshape(tf.transpose(tf.reshape(x, (B, H, W, self.seg_dim, S)), (0, 3, 2, 1, 4)),
36 |                              (B, self.seg_dim, W, H * S))
37 |         h_embed = tf.reshape(tf.transpose(tf.reshape(self.mlp_h(h_embed), (B, self.seg_dim, W, H, S)), (0, 3, 2, 1, 4)),
38 |                              (B, H, W, C))
39 | 
40 |         w_embed = tf.reshape(tf.transpose(tf.reshape(x, (B, H, W, self.seg_dim, S)), (0, 3, 2, 1, 4)),
41 |                              (B, self.seg_dim, W, H * S))
42 |         w_embed = tf.reshape(tf.transpose(tf.reshape(self.mlp_w(w_embed), (B, self.seg_dim, W, H, S)), (0, 3, 2, 1, 4)),
43 |                              (B, H, W, C))
44 | 
45 |         weight = tf.reduce_mean(tf.reshape(tf.transpose((c_embed + h_embed + w_embed), (0, 3, 1, 2)), (B, C, -1)),
46 |                                 axis=2)
47 |         weight = tf.expand_dims(tf.expand_dims(
48 |             tf.nn.softmax(tf.transpose(tf.reshape(self.reweighting(weight), (B, C, 3)), (2, 0, 1)), axis=0), axis=2),
49 |             axis=2)
50 | 
51 |         x = c_embed * weight[0] + w_embed * weight[1] + h_embed * weight[2]
52 | 
53 |         x = self.proj_drop(self.proj(x))
54 | 
55 |         return x
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     input = tf.random.normal((64, 8, 8, 512))
60 |     seg_dim = 8
61 |     vip = WeightedPermuteMLP(512, seg_dim)
62 |     output = vip(input)
63 |     print(output.shape)
64 | 


--------------------------------------------------------------------------------
/attention/gfnet.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | class GFNet(layers.Layer):
 5 |     pass
 6 | 
 7 | if __name__ == '__main__':
 8 |     input = tf.random.normal((50, 7, 7, 512))
 9 |     a2 = GFNet(512, 128, 128, True)
10 |     output = a2(input)
11 |     print(output.shape)


--------------------------------------------------------------------------------
/attention/img/A2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/A2.png


--------------------------------------------------------------------------------
/attention/img/AFT.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/AFT.jpg


--------------------------------------------------------------------------------
/attention/img/BAM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/BAM.png


--------------------------------------------------------------------------------
/attention/img/CBAM1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CBAM1.png


--------------------------------------------------------------------------------
/attention/img/CBAM2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CBAM2.png


--------------------------------------------------------------------------------
/attention/img/CoAtNet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CoAtNet.png


--------------------------------------------------------------------------------
/attention/img/CoT.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CoT.png


--------------------------------------------------------------------------------
/attention/img/CondConv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CondConv.png


--------------------------------------------------------------------------------
/attention/img/ConvMixer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ConvMixer.png


--------------------------------------------------------------------------------
/attention/img/CoordAttention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/CoordAttention.png


--------------------------------------------------------------------------------
/attention/img/DepthwiseSeparableConv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/DepthwiseSeparableConv.png


--------------------------------------------------------------------------------
/attention/img/DynamicConv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/DynamicConv.png


--------------------------------------------------------------------------------
/attention/img/ECA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ECA.png


--------------------------------------------------------------------------------
/attention/img/EMSA.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/EMSA.jpg


--------------------------------------------------------------------------------
/attention/img/EMSA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/EMSA.png


--------------------------------------------------------------------------------
/attention/img/External_Attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/External_Attention.png


--------------------------------------------------------------------------------
/attention/img/GFNet.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/GFNet.jpg


--------------------------------------------------------------------------------
/attention/img/HaloNet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/HaloNet.png


--------------------------------------------------------------------------------
/attention/img/Infine-attention.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/Infine-attention.jpeg


--------------------------------------------------------------------------------
/attention/img/Involution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/Involution.png


--------------------------------------------------------------------------------
/attention/img/MBConv.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MBConv.jpg


--------------------------------------------------------------------------------
/attention/img/MUSE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MUSE.png


--------------------------------------------------------------------------------
/attention/img/MUSE2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MUSE2.jpg


--------------------------------------------------------------------------------
/attention/img/MobileViTAttention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MobileViTAttention.png


--------------------------------------------------------------------------------
/attention/img/MobileViTv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/MobileViTv2.png


--------------------------------------------------------------------------------
/attention/img/OutlookAttention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/OutlookAttention.png


--------------------------------------------------------------------------------
/attention/img/ParNet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ParNet.png


--------------------------------------------------------------------------------
/attention/img/PoSA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/PoSA.png


--------------------------------------------------------------------------------
/attention/img/ResAtt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ResAtt.png


--------------------------------------------------------------------------------
/attention/img/S2Attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/S2Attention.png


--------------------------------------------------------------------------------
/attention/img/SA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SA.png


--------------------------------------------------------------------------------
/attention/img/SE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SE.png


--------------------------------------------------------------------------------
/attention/img/SGE.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SGE.jpg


--------------------------------------------------------------------------------
/attention/img/SGE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SGE.png


--------------------------------------------------------------------------------
/attention/img/SK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SK.png


--------------------------------------------------------------------------------
/attention/img/SSA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/SSA.png


--------------------------------------------------------------------------------
/attention/img/ShuffleAttention.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ShuffleAttention.jpg


--------------------------------------------------------------------------------
/attention/img/ShuffleAttention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ShuffleAttention.png


--------------------------------------------------------------------------------
/attention/img/UFO.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/UFO.png


--------------------------------------------------------------------------------
/attention/img/ViP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ViP.png


--------------------------------------------------------------------------------
/attention/img/acnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/acnet.png


--------------------------------------------------------------------------------
/attention/img/danet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/danet.png


--------------------------------------------------------------------------------
/attention/img/danet2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/danet2.png


--------------------------------------------------------------------------------
/attention/img/ddb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/ddb.png


--------------------------------------------------------------------------------
/attention/img/gMLP.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/gMLP.jpg


--------------------------------------------------------------------------------
/attention/img/mlpmixer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/mlpmixer.png


--------------------------------------------------------------------------------
/attention/img/mobileViT.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/mobileViT.jpg


--------------------------------------------------------------------------------
/attention/img/psa.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/psa.jpg


--------------------------------------------------------------------------------
/attention/img/psa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/psa.png


--------------------------------------------------------------------------------
/attention/img/psa2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/psa2.jpg


--------------------------------------------------------------------------------
/attention/img/repmlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/repmlp.png


--------------------------------------------------------------------------------
/attention/img/repvgg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/repvgg.png


--------------------------------------------------------------------------------
/attention/img/resmlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resmlp.png


--------------------------------------------------------------------------------
/attention/img/resnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resnet.png


--------------------------------------------------------------------------------
/attention/img/resnet2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resnet2.jpg


--------------------------------------------------------------------------------
/attention/img/resnext.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/resnext.png


--------------------------------------------------------------------------------
/attention/img/sMLP.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/sMLP.jpg


--------------------------------------------------------------------------------
/attention/img/triplet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccfco/External-Attention-tensorflow/4d5eeae4772900e0d2ac0df1e23a0fd86ceaf748/attention/img/triplet.png


--------------------------------------------------------------------------------
/conv/MBConv.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow.keras import layers
 3 | 
 4 | def drop_connect(inputs, p, training):
 5 |     """Drop the entire conv with given survival probability."""
 6 |     # "Deep Networks with Stochastic Depth", https://arxiv.org/pdf/1603.09382.pdf
 7 |     if not training: return inputs
 8 | 
 9 |     # Compute tensor.
10 |     batch_size = tf.shape(inputs)[0]
11 |     keep_prob = 1 - p
12 |     random_tensor = keep_prob
13 |     random_tensor += tf.random.uniform([batch_size, 1, 1, 1], dtype=inputs.dtype)
14 |     binary_tensor = tf.floor(random_tensor)
15 |     # Unlike conventional way that multiply survival_prob at test time, here we
16 |     # divide survival_prob at training time, such that no addition compute is
17 |     # needed at test time.
18 |     output = inputs / keep_prob * binary_tensor
19 |     return output
20 | 
21 | class MBConvBlock(layers.Layer):
22 |     """A class of MBVonv: Mobile Inverted Residual Bottleneck.
23 |     Attributes:
24 |         endpoints: dict. A list of internal tensors.
25 |         层：ksize=3*3 输入32 输出16 conv1 stride1
26 |     """
27 | 
28 |     def __init__(self, ksize, input_filters, output_filters, expand_ratio=1, stride=1, name=None):
29 |         super().__init__(name=name)
30 | 
31 |         self._bn_mom = 0.1  # batch norm momentum
32 |         self._bn_eps = 0.1  # batch norm epsilon
33 |         self._se_ratio = 0.25
34 |         self._input_filters = input_filters
35 |         self._output_filters = output_filters
36 |         self._expand_ratio = expand_ratio
37 |         self.kernel_size = ksize
38 |         self._stride = stride
39 | 
40 |         inp = self._input_filters
41 |         oup = self._input_filters * self._expand_ratio
42 |         if self._expand_ratio != 1:
43 |             self._expand_conv = layers.Conv2D(filters=oup, kernel_size=1, padding='same', use_bias=False)
44 |             self._bn0 = layers.BatchNormalization(momentum=self._bn_mom, epsilon=self._bn_eps)
45 | 
46 |         # Depthwise convolution
47 |         k = self.kernel_size
48 |         s = self._stride
49 |         self._depthwise_conv = layers.Conv2D(filters=oup, groups=oup, kernel_size=k, strides=s, padding='same',
50 |                                              use_bias=False)
51 |         self._bn1 = layers.BatchNormalization(momentum=self._bn_mom, epsilon=self._bn_eps)
52 | 
53 |         # Squeeze and Excitation layer, if desired
54 |         num_squeezed_channels = max(1, self._input_filters * self._se_ratio)  # num reduced filters
55 |         self._se_reduce = layers.Conv2D(filters=num_squeezed_channels, kernel_size=1, padding='same')
56 |         self._se_expand = layers.Conv2D(filters=oup, kernel_size=1, padding='same')
57 | 
58 |         # Output phase
59 |         final_oup = self._output_filters
60 |         self._project_conv = layers.Conv2D(filters=final_oup, kernel_size=1, padding='same', use_bias=False)
61 |         self._bn2 = layers.BatchNormalization(momentum=self._bn_mom, epsilon=self._bn_eps)
62 |         self._swish = tf.nn.swish  # Swish 是一种新型激活函数，公式为： f(x) = x · sigmoid(x)
63 | 
64 |     def call(self, inputs, drop_connect_rate=None):
65 |         # Expansion and Depthwise Convolution
66 |         x = inputs
67 |         if self._expand_ratio != 1:
68 |             expand = self._expand_conv(x)
69 |             bn0 = self._bn0(expand)
70 |             x = self._swish(bn0)
71 |         depthwise = self._depthwise_conv(x)
72 |         bn1 = self._bn1(depthwise)
73 |         x = self._swish(bn1)
74 | 
75 |         # Squeeze and Excitation
76 |         h_axis, w_axis = [1, 2]
77 |         x_squeezed = tf.nn.avg_pool2d(x, ksize=[1, x.shape[h_axis], x.shape[w_axis], 1], strides=[1, 1, 1, 1],
78 |                                       padding='VALID')
79 |         x_squeezed = self._se_reduce(x_squeezed)
80 |         x_squeezed = self._swish(x_squeezed)
81 |         x_squeezed = self._se_expand(x_squeezed)
82 |         x = tf.sigmoid(x_squeezed) * x
83 | 
84 |         x = self._bn2(self._project_conv(x))
85 | 
86 |         # Skip connection and drop connect
87 |         input_filters, output_filters = self._input_filters, self._output_filters
88 |         if self._stride == 1 and input_filters == output_filters:
89 |             if drop_connect_rate is not None:
90 |                 x = drop_connect(x, p=drop_connect_rate, training=True)
91 |             x = x + inputs  # skip connection
92 |         return x
93 | 
94 | if __name__ == '__main__':
95 |     input = tf.random.normal((1, 112, 112, 3))
96 |     mbconv = MBConvBlock(ksize=3, input_filters=3, output_filters=3)
97 |     out = mbconv(input)
98 |     print(out.shape)
99 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.23.5
2 | tensorflow==2.10.0
3 | 
4 | # 2024.03.20 SequentialPolarizedSelfAttention
5 | # numpy==1.26.4
6 | # tensorflow==2.16.1


--------------------------------------------------------------------------------