├── LICENSE
├── README.md
└── coatnet.py

/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2021 Chin-Hsuan Wu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# CoAtNet

## Overview

This is a PyTorch implementation of CoAtNet, as described in ["CoAtNet: Marrying Convolution and Attention for All Data Sizes"](https://arxiv.org/abs/2106.04803), arXiv 2021.

![img](https://user-images.githubusercontent.com/67839539/138133065-337bb5ac-3dca-4ce8-af51-990c5ff23316.png)

👉 Check out [MobileViT](https://github.com/chinhsuanwu/mobilevit-pytorch) if you are interested in other **Convolution + Transformer** models.

## Usage

```python
import torch
from coatnet import coatnet_0

img = torch.randn(1, 3, 224, 224)
net = coatnet_0()
out = net(img)
```

Try out the other block combinations mentioned in the paper:

```python
from coatnet import CoAtNet

num_blocks = [2, 2, 3, 5, 2]        # L, number of blocks per stage
channels = [64, 96, 192, 384, 768]  # D, channel width per stage
block_types = ['C', 'T', 'T', 'T']  # 'C' for MBConv, 'T' for Transformer

net = CoAtNet((224, 224), 3, num_blocks, channels, block_types=block_types)
out = net(img)
```

## Citation

```bibtex
@article{dai2021coatnet,
  title={CoAtNet: Marrying Convolution and Attention for All Data Sizes},
  author={Dai, Zihang and Liu, Hanxiao and Le, Quoc V and Tan, Mingxing},
  journal={arXiv preprint arXiv:2106.04803},
  year={2021}
}
```

## Credits

Code adapted from [MobileNetV2](https://github.com/tonylins/pytorch-mobilenet-v2) and [ViT](https://github.com/lucidrains/vit-pytorch).
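## Sanity check

For a quick check that the model runs end-to-end, you can print the output shape and the trainable parameter count, mirroring the `__main__` block at the bottom of `coatnet.py`. With the default 1000-class head, the output shape is `(1, 1000)`.

```python
import torch
from coatnet import coatnet_0, count_parameters

img = torch.randn(1, 3, 224, 224)
net = coatnet_0()
out = net(img)

print(out.shape)              # torch.Size([1, 1000])
print(count_parameters(net))  # number of trainable parameters
```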
--------------------------------------------------------------------------------

/coatnet.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn

from einops import rearrange
from einops.layers.torch import Rearrange


def conv_3x3_bn(inp, oup, image_size, downsample=False):
    stride = 2 if downsample else 1
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.GELU()
    )


class PreNorm(nn.Module):
    def __init__(self, dim, fn, norm):
        super().__init__()
        self.norm = norm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)


class SE(nn.Module):
    # Squeeze-and-excitation block (channel attention)
    def __init__(self, inp, oup, expansion=0.25):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(oup, int(inp * expansion), bias=False),
            nn.GELU(),
            nn.Linear(int(inp * expansion), oup, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


class MBConv(nn.Module):
    def __init__(self, inp, oup, image_size, downsample=False, expansion=4):
        super().__init__()
        self.downsample = downsample
        stride = 2 if self.downsample else 1
        hidden_dim = int(inp * expansion)

        if self.downsample:
            self.pool = nn.MaxPool2d(3, 2, 1)
            self.proj = nn.Conv2d(inp, oup, 1, 1, 0, bias=False)

        if expansion == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride,
                          1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.GELU(),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                # down-sample in the first conv
                nn.Conv2d(inp, hidden_dim, 1, stride, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.GELU(),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, 1, 1,
                          groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.GELU(),
                SE(inp, hidden_dim),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

        self.conv = PreNorm(inp, self.conv, nn.BatchNorm2d)

    def forward(self, x):
        if self.downsample:
            # shortcut: max-pool + 1x1 projection; the residual branch down-samples inside self.conv
            return self.proj(self.pool(x)) + self.conv(x)
        else:
            return x + self.conv(x)


class Attention(nn.Module):
    def __init__(self, inp, oup, image_size, heads=8, dim_head=32, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == inp)

        self.ih, self.iw = image_size

        self.heads = heads
        self.scale = dim_head ** -0.5

        # parameter table of relative position bias
        self.relative_bias_table = nn.Parameter(
            torch.zeros((2 * self.ih - 1) * (2 * self.iw - 1), heads))

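        # The next few lines precompute a flat index (one entry per query/key position
        # pair) into `relative_bias_table`, following the 2-D relative position bias
        # scheme popularised by Swin Transformer: per-axis offsets are shifted to be
        # non-negative, the row offset is scaled by (2 * iw - 1) so every (dy, dx)
        # pair maps to a unique table row, and the two offsets are then summed.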
        coords = torch.meshgrid((torch.arange(self.ih), torch.arange(self.iw)))
        coords = torch.flatten(torch.stack(coords), 1)
        relative_coords = coords[:, :, None] - coords[:, None, :]

        relative_coords[0] += self.ih - 1
        relative_coords[1] += self.iw - 1
        relative_coords[0] *= 2 * self.iw - 1
        relative_coords = rearrange(relative_coords, 'c h w -> h w c')
        relative_index = relative_coords.sum(-1).flatten().unsqueeze(1)
        self.register_buffer("relative_index", relative_index)

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(inp, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, oup),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(
            t, 'b n (h d) -> b h n d', h=self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        # Use "gather" for more efficiency on GPUs
        relative_bias = self.relative_bias_table.gather(
            0, self.relative_index.repeat(1, self.heads))
        relative_bias = rearrange(
            relative_bias, '(h w) c -> 1 c h w', h=self.ih * self.iw, w=self.ih * self.iw)
        dots = dots + relative_bias

        attn = self.attend(dots)
        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return out


class Transformer(nn.Module):
    def __init__(self, inp, oup, image_size, heads=8, dim_head=32, downsample=False, dropout=0.):
        super().__init__()
        hidden_dim = int(inp * 4)

        self.ih, self.iw = image_size
        self.downsample = downsample

        if self.downsample:
            self.pool1 = nn.MaxPool2d(3, 2, 1)
            self.pool2 = nn.MaxPool2d(3, 2, 1)
            self.proj = nn.Conv2d(inp, oup, 1, 1, 0, bias=False)

        self.attn = Attention(inp, oup, image_size, heads, dim_head, dropout)
        self.ff = FeedForward(oup, hidden_dim, dropout)

        self.attn = nn.Sequential(
            Rearrange('b c ih iw -> b (ih iw) c'),
            PreNorm(inp, self.attn, nn.LayerNorm),
            Rearrange('b (ih iw) c -> b c ih iw', ih=self.ih, iw=self.iw)
        )

        self.ff = nn.Sequential(
            Rearrange('b c ih iw -> b (ih iw) c'),
            PreNorm(oup, self.ff, nn.LayerNorm),
            Rearrange('b (ih iw) c -> b c ih iw', ih=self.ih, iw=self.iw)
        )

    def forward(self, x):
        if self.downsample:
            # shortcut: max-pool + 1x1 projection; the attention branch runs on a max-pooled input
            x = self.proj(self.pool1(x)) + self.attn(self.pool2(x))
        else:
            x = x + self.attn(x)
        x = x + self.ff(x)
        return x


class CoAtNet(nn.Module):
    def __init__(self, image_size, in_channels, num_blocks, channels, num_classes=1000, block_types=['C', 'C', 'T', 'T']):
        super().__init__()
        ih, iw = image_size
        block = {'C': MBConv, 'T': Transformer}

        self.s0 = self._make_layer(
            conv_3x3_bn, in_channels, channels[0], num_blocks[0], (ih // 2, iw // 2))
        self.s1 = self._make_layer(
            block[block_types[0]], channels[0], channels[1], num_blocks[1], (ih // 4, iw // 4))
        self.s2 = self._make_layer(
            block[block_types[1]], channels[1], channels[2], num_blocks[2], (ih // 8, iw // 8))
        self.s3 = self._make_layer(
            block[block_types[2]], channels[2], channels[3], num_blocks[3], (ih // 16, iw // 16))
        self.s4 = self._make_layer(
            block[block_types[3]], channels[3], channels[4], num_blocks[4], (ih // 32, iw // 32))
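
        # Each stage down-samples by 2x in its first block (downsample=True in
        # _make_layer), so a 224x224 input reaches s4 at 1/32 resolution (7x7),
        # which is what the pooling head below assumes.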
        self.pool = nn.AvgPool2d(ih // 32, 1)  # global average pooling over the final 1/32-scale feature map
        self.fc = nn.Linear(channels[-1], num_classes, bias=False)

    def forward(self, x):
        x = self.s0(x)
        x = self.s1(x)
        x = self.s2(x)
        x = self.s3(x)
        x = self.s4(x)

        x = self.pool(x).view(-1, x.shape[1])
        x = self.fc(x)
        return x

    def _make_layer(self, block, inp, oup, depth, image_size):
        layers = nn.ModuleList([])
        for i in range(depth):
            if i == 0:
                layers.append(block(inp, oup, image_size, downsample=True))
            else:
                layers.append(block(oup, oup, image_size))
        return nn.Sequential(*layers)


def coatnet_0():
    num_blocks = [2, 2, 3, 5, 2]  # L
    channels = [64, 96, 192, 384, 768]  # D
    return CoAtNet((224, 224), 3, num_blocks, channels, num_classes=1000)


def coatnet_1():
    num_blocks = [2, 2, 6, 14, 2]  # L
    channels = [64, 96, 192, 384, 768]  # D
    return CoAtNet((224, 224), 3, num_blocks, channels, num_classes=1000)


def coatnet_2():
    num_blocks = [2, 2, 6, 14, 2]  # L
    channels = [128, 128, 256, 512, 1024]  # D
    return CoAtNet((224, 224), 3, num_blocks, channels, num_classes=1000)


def coatnet_3():
    num_blocks = [2, 2, 6, 14, 2]  # L
    channels = [192, 192, 384, 768, 1536]  # D
    return CoAtNet((224, 224), 3, num_blocks, channels, num_classes=1000)


def coatnet_4():
    num_blocks = [2, 2, 12, 28, 2]  # L
    channels = [192, 192, 384, 768, 1536]  # D
    return CoAtNet((224, 224), 3, num_blocks, channels, num_classes=1000)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


if __name__ == '__main__':
    img = torch.randn(1, 3, 224, 224)

    net = coatnet_0()
    out = net(img)
    print(out.shape, count_parameters(net))

    net = coatnet_1()
    out = net(img)
    print(out.shape, count_parameters(net))

    net = coatnet_2()
    out = net(img)
    print(out.shape, count_parameters(net))

    net = coatnet_3()
    out = net(img)
    print(out.shape, count_parameters(net))

    net = coatnet_4()
    out = net(img)
    print(out.shape, count_parameters(net))
--------------------------------------------------------------------------------