├── gittransformer.png
├── LICENSE
├── README.md
├── GPT_1.py
├── VisionImageTransformer.py
├── GPT_2.py
├── BERT.py
├── TRANSFORMERS.py
└── PERFORMER.py
/gittransformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShivamRajSharma/Transformer-Architectures-From-Scratch/HEAD/gittransformer.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Shivam Raj
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Transformer Architectures From Scratch Using PyTorch
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | ## 1) TRANSFORMER -
10 | A Self-attention based Encoder-Decoder Architecture. It is mostly used for
11 | 1) Machine Translation
12 | 2) Document Summarization
13 | 3) Text Extraction
14 |
15 | Paper - https://arxiv.org/abs/1706.03762
16 |
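A minimal usage sketch mirroring the demo in `TRANSFORMERS.py`; vocabulary sizes and tensor shapes are illustrative placeholders:

```python
import torch
from TRANSFORMERS import Transformers

model = Transformers(
    input_vocab_size=100,    # depends on the source tokenizer
    output_vocab_size=200,   # depends on the target tokenizer
    pad_idx=0,
    embedding_out=512,
    num_layers=6,
    forward_expansion=4,
    head=8,
    dropout=0.1,
    max_len=512,
)

src = torch.randint(0, 100, (32, 200))   # (batch, source_len) token ids
tgt = torch.randint(0, 100, (32, 100))   # (batch, target_len) token ids
logits = model(src, tgt)                 # (32, 100, output_vocab_size)
```
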
17 | ## 2) BERT -
18 | A Self-attention based Encoder Architecture. It is mostly used for
19 | 1) Sentiment Classification
20 | 2) Named Entity Recognition
21 | 3) Question Answering
22 | 4) Sentence Embedding Extraction
23 | 5) Document Matching
24 |
25 | Paper - https://arxiv.org/abs/1810.04805
26 |
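A minimal usage sketch mirroring the demo in `BERT.py`; token ids, segment ids, and the attention mask below are random placeholders:

```python
import torch
from BERT import BERT

model = BERT(
    vocab_size=30522,
    max_len=512,
    mask_idx=0,            # id treated as padding when building the attention mask
    embedding_dims=768,
    heads=12,
    forward_expansion=4,
    num_layers=12,
    dropout=0.1,
    layer_norm_eps=1e-12,
)

tokens   = torch.randint(1, 100, (32, 100))  # (batch, seq_len) token ids
segments = torch.randint(0, 2, (32, 100))    # sentence A/B segment ids
mask     = torch.randint(0, 2, (32, 100))    # non-zero = attend, mask_idx = ignore
hidden   = model(tokens, segments, mask)     # (32, 100, embedding_dims) contextual states
```
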
27 | ## 3) GPT-1 -
28 | A Self-attention based, Decoder-only Autoregressive model. It is mostly used for
29 | 1) Sentence Completion
30 | 2) Generating Text
31 | 3) Sentiment Classification
32 |
33 | Paper - https://paperswithcode.com/method/gpt
34 |
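A minimal usage sketch mirroring the demo in `GPT_1.py`, using the default GPT-1 hyperparameters:

```python
import torch
from GPT_1 import GPT

model = GPT(
    vocab_size=40478,
    embedding_dims=768,
    dropout=0.1,
    heads=12,
    num_of_layers=12,
    forward_expansion=4,
    max_len=512,
)

tokens = torch.randint(1, 100, (1, 300))  # (batch, seq_len) token ids
logits = model(tokens)                    # (1, 300, vocab_size), causally masked
```
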
35 | ## 4) GPT-2 -
36 | A Self-attention based, Decoder-only Autoregressive model with slight architectural changes, trained on a much larger corpus of text than GPT-1. It is mostly used for
37 | 1) Sentence Completion
38 | 2) Generating Text
39 | 3) Sentiment Classification
40 |
41 | Paper - https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf
42 |
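A minimal usage sketch mirroring the demo in `GPT_2.py`; the second call reuses the per-layer key/value cache (`presents`) returned by the first call:

```python
import torch
from GPT_2 import GPT2

model = GPT2(
    vocab_size=50257,
    embedding_dims=768,
    dropout=0.1,
    heads=12,
    num_of_layers=12,
    forward_expansion=4,
    max_len=1024,
)

tokens = torch.randint(1, 100, (3, 300))    # (batch, seq_len) token ids
hidden, presents = model(tokens)            # first pass, no cache
hidden, presents = model(tokens, presents)  # second pass reuses the cached keys/values
```
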
43 | ## 5) ViT -
44 | A state-of-the-art Self-attention based Encoder Architecture for Computer Vision applications. It is mostly used for
45 | 1) Image Classification
46 | 2) Image Encoding
47 | 3) Backbone for Object Detection
48 |
49 | Paper - https://arxiv.org/abs/2006.03677
50 |
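A minimal usage sketch mirroring the demo in `VisionImageTransformer.py`; note that `embedding_dims` must equal `channels * patch_height * patch_width` (3 * 16 * 16 = 768 here) because the patch projection layer expects flattened patches of that size:

```python
import torch
from VisionImageTransformer import ViT

model = ViT(
    patch_height=16,
    patch_width=16,
    max_len=(32 * 32) // (16 * 16),  # number of patches per image
    embedding_dims=768,              # = 3 * 16 * 16 (flattened patch size)
    heads=4,
    forward_expansion=4,
    num_layers=4,
    dropout=0.1,
    layer_norm_eps=1e-5,
    num_classes=10,
)

images = torch.randn(32, 3, 32, 32)  # (batch, channels, height, width)
logits = model(images)               # (32, num_classes)
```
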
51 | ## 6) PERFORMER -
52 | A Self-attention based Encoder-Decoder Architecture whose attention scales linearly with sequence length, unlike the standard Transformer's quadratic complexity. It is mostly used for
53 | 1) Machine Translation
54 | 2) Document Summarization
55 | 3) Text Extraction
56 |
57 | Paper - https://arxiv.org/abs/2009.14794
58 |
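A minimal usage sketch mirroring the demo in `PERFORMER.py`; `n_features` sets the number of orthogonal random features used by the kernelised attention:

```python
import torch
from PERFORMER import Performers

model = Performers(
    input_vocab_size=100,    # depends on the source tokenizer
    output_vocab_size=200,   # depends on the target tokenizer
    pad_idx=0,
    embedding_out=512,
    num_layers=6,
    forward_expansion=4,
    head=8,
    n_features=256,          # random features per attention head
    dropout=0.1,
    max_len=512,
)

src = torch.randint(0, 100, (32, 200))  # (batch, source_len) token ids
tgt = torch.randint(0, 100, (32, 100))  # (batch, target_len) token ids
logits = model(src, tgt)                # (32, 100, output_vocab_size)
```
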
--------------------------------------------------------------------------------
/GPT_1.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import torch
3 | import torch.nn as nn
4 |
5 | class SelfAttention(nn.Module):
6 | def __init__(self, input_dims, heads):
7 | super(SelfAttention, self).__init__()
8 | self.heads = heads
9 | self.head_dims = int(input_dims/heads)
10 | self.input_dims = input_dims
11 |
12 | self.query = nn.Linear(self.head_dims, self.head_dims)
13 | self.key = nn.Linear(self.head_dims, self.head_dims)
14 | self.value = nn.Linear(self.head_dims, self.head_dims)
15 | self.fc = nn.Linear(self.head_dims*heads, self.input_dims)
16 |
17 | def forward(self, query, key, value, mask):
18 | Batch, Seq_len, embed = query.shape
19 | query_len, key_len, value_len = query.shape[1], key.shape[1], value.shape[1]
20 |
21 | query = query.reshape(Batch, query_len, self.heads, self.head_dims)
22 | key = key.reshape(Batch, key_len, self.heads, self.head_dims)
23 | value = value.reshape(Batch, value_len, self.heads, self.head_dims)
24 |
25 | query = self.query(query)
26 | key = self.key(key)
27 | value = self.value(value)
28 |
29 | score = torch.einsum('bqhd,bkhd->bhqk', [query, key])
30 | if mask is not None:
31 | score = score.masked_fill(mask == 0, float('-1e20'))
32 |
33 | attention_score = nn.Softmax(dim=-1)(score/((self.head_dims)**(1/2)))
34 | out = torch.einsum('bhqv,bvhd->bqhd', [attention_score, value]).reshape(Batch, query_len, self.head_dims*self.heads)
35 | out = self.fc(out)
36 |
37 | return out
38 |
39 |
40 | class GPTBlock(nn.Module):
41 | def __init__(
42 | self,
43 | heads,
44 | embedding_dims,
45 | dropout,
46 | forward_expansion,
47 | layer_norm_eps
48 | ):
49 | super(GPTBlock, self).__init__()
50 | self.embedding_dims = embedding_dims
51 | self.attention = SelfAttention(embedding_dims, heads)
52 | self.layer_norm1 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
53 | self.layer_norm2 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
54 | self.feed_forward = nn.Sequential(
55 | *[
56 | nn.Linear(embedding_dims, embedding_dims*forward_expansion),
57 | nn.GELU(),
58 | nn.Linear(embedding_dims*forward_expansion, embedding_dims)
59 | ]
60 | )
61 | self.dropout = nn.Dropout(dropout)
62 |
63 | def forward(self, x, mask):
64 | attention_block = self.attention(x, x, x, mask)
65 | add = self.dropout(self.layer_norm1(attention_block + x))
66 | feed_forward = self.feed_forward(add)
67 | out = self.dropout(self.layer_norm2(feed_forward + add))
68 | return out
69 |
70 |
71 | class GPT(nn.Module):
72 | def __init__(
73 | self,
74 | vocab_size,
75 | embedding_dims,
76 | dropout,
77 | heads,
78 | num_of_layers,
79 | forward_expansion,
80 | max_len,
81 | layer_norm_eps = 1e-5
82 | ):
83 | super(GPT, self).__init__()
84 | self.embedding_dims = embedding_dims
85 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dims)
86 | self.positional_embeddings = nn.Parameter(torch.zeros(1, max_len, embedding_dims))
87 | self.dropout = nn.Dropout(dropout)
88 | self.gpt_blocks = nn.ModuleList(
89 | [
90 | GPTBlock(
91 | heads,
92 | embedding_dims,
93 | dropout,
94 | forward_expansion,
95 | layer_norm_eps
96 |
97 | )
98 | for _ in range(num_of_layers)
99 | ]
100 | )
101 |
102 | self.layer_norm = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
103 | self.fc = nn.Linear(embedding_dims, vocab_size)
104 |
105 | self.apply(self._init_weights)
106 |
107 | #From @HuggingFace
108 | def _init_weights(self, module):
109 | if isinstance(module, (nn.Linear, nn.Embedding)):
110 | module.weight.data.normal_(mean=0.0, std=0.02)
111 |
112 | elif isinstance(module, nn.LayerNorm):
113 | module.bias.data.zero_()
114 | module.weight.data.fill_(1.0)
115 |
116 | if isinstance(module, nn.Linear) and module.bias is not None:
117 | module.bias.data.zero_()
118 |
119 |     def causal_mask(self, x):
120 |         mask = torch.tril(torch.ones((x.shape[0], x.shape[-1], x.shape[-1]))).unsqueeze(1)
121 |         return mask
122 |
123 | def forward(self, x):
124 |         causal_mask = self.causal_mask(x)
125 | seq_len = x.shape[-1]
126 | word_embeddings = self.word_embeddings(x)
127 | x = self.dropout(word_embeddings + self.positional_embeddings[:, :seq_len, :])
128 | for block in self.gpt_blocks:
129 |             x = block(x, causal_mask)
130 | x = self.layer_norm(x)
131 |         out = self.fc(x)
132 |         return out
133 |
134 |
135 | if __name__ == '__main__':
136 | #DEFAULT GPT PARAMETERS :-
137 | vocab_size = 40478
138 | embedding_dims = 768
139 | dropout = 0.1
140 | heads = 12
141 | num_of_layers = 12
142 | forward_expansion = 4
143 | max_len = 512
144 |
145 |
146 | a = torch.randint(1, 100, (1, 300))
147 | model = GPT(
148 | vocab_size,
149 | embedding_dims,
150 | dropout,
151 | heads,
152 | num_of_layers,
153 | forward_expansion,
154 | max_len,
155 | )
156 |
157 | start = time()
158 | y = model(a)
159 | print(f'INFERENCE TIME = {time() - start}sec')
160 | x = sum(p.numel() for p in model.parameters() if p.requires_grad)
161 | print(f'NUMBER OF PARAMETERS ARE = {x}')
162 |
--------------------------------------------------------------------------------
/VisionImageTransformer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class SelfAttention(nn.Module):
6 | def __init__(
7 | self,
8 | embedding_dims,
9 | heads,
10 | dropout
11 | ):
12 | super(SelfAttention, self).__init__()
13 | self.heads = heads
14 | self.embedding_dims = embedding_dims
15 | self.head_dims = int(embedding_dims/heads)
16 |
17 | self.key = nn.Linear(self.head_dims, self.head_dims)
18 | self.query = nn.Linear(self.head_dims, self.head_dims)
19 | self.value = nn.Linear(self.head_dims, self.head_dims)
20 |
21 | self.fc = nn.Linear(self.head_dims*self.heads, self.embedding_dims)
22 |
23 | self.dropout = nn.Dropout(dropout)
24 |
25 | def forward(self, query, key, value, mask):
26 | Batch = query.shape[0]
27 |
28 | query_len, key_len, value_len = query.shape[1], key.shape[1], value.shape[1]
29 |
30 | query = query.reshape(Batch, query_len, self.heads, self.head_dims)
31 | key = key.reshape(Batch, key_len, self.heads, self.head_dims)
32 | value = value.reshape(Batch, value_len, self.heads, self.head_dims)
33 |
34 | query = self.query(query)
35 | key = self.key(key)
36 | value = self.value(value)
37 |
38 | attention_score = torch.einsum('bqhd,bkhd->bhqk', [query, key])
39 |
40 | if mask is not None:
41 | attention_score = attention_score.masked_fill(mask==0, float('-1e20'))
42 |
43 | attention_score = attention_score/((self.head_dims)**(1/2))
44 | attention_score = torch.softmax(attention_score, dim=-1)
45 |
46 | out = torch.einsum('bhqv,bvhd->bqhd', [attention_score, value]).reshape(
47 | Batch, query_len, self.heads*self.head_dims
48 | )
49 |
50 | out = self.dropout(self.fc(out))
51 |
52 | return out
53 |
54 |
55 |
56 | class TransformerBlock(nn.Module):
57 | def __init__(
58 | self,
59 | embedding_dims,
60 | heads,
61 | dropout,
62 | forward_expansion,
63 | layer_norm_eps
64 | ):
65 | super(TransformerBlock, self).__init__()
66 | self.layer_norm1 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
67 | self.layer_norm2 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
68 | self.attention = SelfAttention(embedding_dims, heads, dropout)
69 | self.feed_forward = nn.Sequential(
70 | nn.Linear(embedding_dims, embedding_dims*forward_expansion),
71 | nn.GELU(),
72 | nn.Dropout(dropout),
73 | nn.Linear(embedding_dims*forward_expansion, embedding_dims),
74 | nn.Dropout(dropout)
75 | )
76 | self.dropout = nn.Dropout(dropout)
77 |
78 | def forward(self, x, mask):
79 | norm = self.layer_norm1(x)
80 | attention_block = self.attention(norm, norm, norm, mask)
81 | add = x + attention_block
82 | norm = self.layer_norm2(add)
83 | feed_forward = self.feed_forward(norm)
84 | out = feed_forward + add
85 | return out
86 |
87 |
88 | class ViT(nn.Module):
89 | def __init__(
90 | self,
91 | patch_height,
92 | patch_width,
93 | max_len,
94 | embedding_dims,
95 | heads,
96 | forward_expansion,
97 | num_layers,
98 | dropout,
99 | layer_norm_eps,
100 | num_classes
101 | ):
102 | super(ViT, self).__init__()
103 |
104 | self.vit_blocks = nn.Sequential(
105 | *[
106 | TransformerBlock(
107 | embedding_dims,
108 | heads,
109 | dropout,
110 | forward_expansion,
111 | layer_norm_eps
112 | )
113 | for _ in range(num_layers)
114 | ]
115 |
116 | )
117 | self.patch_height = patch_height
118 | self.patch_width = patch_width
119 | self.cls_embedding = nn.Parameter(torch.zeros(1, 1, embedding_dims))
120 |         self.patch_embeddings = nn.Linear(embedding_dims, embedding_dims)  # assumes channels*patch_height*patch_width == embedding_dims (3*16*16 = 768 here)
121 |         self.positional_embedding = nn.Parameter(torch.zeros(1, max_len+1, embedding_dims))
122 | self.to_cls_token = nn.Identity()
123 | self.classifier = nn.Sequential(
124 | nn.LayerNorm(embedding_dims),
125 | nn.Linear(embedding_dims, num_classes*4),
126 | nn.GELU(),
127 | nn.Dropout(dropout),
128 | nn.Linear(num_classes*4, num_classes)
129 | )
130 | self.dropout = nn.Dropout(dropout)
131 |
132 |
133 | def forward(self, images):
134 | patches = images.unfold(2, self.patch_height, self.patch_width).unfold(3, self.patch_height, self.patch_width)
135 | patches = patches.permute(0, 2, 3, 1, 4, 5)
136 | patches = patches.reshape(
137 | patches.shape[0],
138 | patches.shape[1],
139 | patches.shape[2],
140 | patches.shape[3]*patches.shape[4]*patches.shape[5]
141 | )
142 | patches = patches.view(patches.shape[0], -1, patches.shape[-1])
143 |
144 | x = self.cls_embedding.expand(patches.shape[0], -1, -1)
145 | patch_embeddings = self.patch_embeddings(patches)
146 |         x = torch.cat((x, patch_embeddings), dim=1) + self.positional_embedding
147 | x = self.dropout(x)
148 | mask = None
149 | for block in self.vit_blocks:
150 | x = block(x, mask)
151 | out = self.to_cls_token(x[:, 0])
152 | out = self.classifier(out)
153 | return out
154 |
155 |
156 |
157 | if __name__ == "__main__":
158 |
159 | model = ViT(
160 | patch_height = 16,
161 | patch_width = 16,
162 | embedding_dims = 768,
163 | dropout = 0.1,
164 | heads = 4,
165 | num_layers = 4,
166 | forward_expansion = 4,
167 | max_len = int((32*32)/(16*16)),
168 | layer_norm_eps = 1e-5,
169 | num_classes = 10,
170 | )
171 |
172 | a = torch.randn(32, 3, 32, 32)
173 | output = model(a)
174 | print(output.shape)
--------------------------------------------------------------------------------
/GPT_2.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import torch
3 | import torch.nn as nn
4 |
5 | class SelfAttention(nn.Module):
6 | def __init__(self, input_dims, heads):
7 | super(SelfAttention, self).__init__()
8 | self.heads = heads
9 | self.head_dims = int(input_dims/heads)
10 | self.input_dims = input_dims
11 |
12 | self.expand = nn.Linear(self.input_dims, self.input_dims*3)
13 | self.fc = nn.Linear(self.head_dims*heads, self.input_dims)
14 |
15 | def split_(self, x):
16 | query, key, value = x.split(self.input_dims, dim=-1)
17 | return query, key, value
18 |
19 | def forward(self, x, mask, past):
20 | Batch, seq_len, embed = x.shape
21 | expand = self.expand(x)
22 | query, key, value = self.split_(expand)
23 |
24 | query = query.reshape(Batch, seq_len, self.heads, self.head_dims)
25 | key = key.reshape(Batch, seq_len, self.heads, self.head_dims)
26 | value = value.reshape(Batch, seq_len, self.heads, self.head_dims)
27 |
28 | present = torch.cat((key.unsqueeze(0), value.unsqueeze(0)), dim=0)
29 |
30 | if past is not None:
31 | past_key, past_value = past
32 | key = torch.cat((past_key, key), dim=1)
33 | value = torch.cat((past_value, value), dim=1)
34 |
35 | score = torch.einsum('bqhd,bkhd->bhqk', [query, key])
36 | if mask is not None:
37 | score = score.masked_fill(mask == 0, float('-1e20'))
38 |
39 | attention_score = nn.Softmax(dim=-1)(score/((self.head_dims)**(1/2)))
40 | out = torch.einsum('bhqv,bvhd->bqhd', [attention_score, value]).reshape(Batch, seq_len, self.head_dims*self.heads)
41 | out = self.fc(out)
42 |
43 | return out, present
44 |
45 |
46 | class GPTBlock(nn.Module):
47 | def __init__(
48 | self,
49 | heads,
50 | embedding_dims,
51 | dropout,
52 | forward_expansion,
53 | layer_norm_eps
54 | ):
55 | super(GPTBlock, self).__init__()
56 | self.embedding_dims = embedding_dims
57 | self.attention = SelfAttention(embedding_dims, heads)
58 | self.layer_norm1 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
59 | self.layer_norm2 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
60 | self.feed_forward = nn.Sequential(
61 | *[
62 | nn.Linear(embedding_dims, embedding_dims*forward_expansion),
63 | nn.GELU(),
64 | nn.Linear(embedding_dims*forward_expansion, embedding_dims)
65 | ]
66 | )
67 | self.dropout = nn.Dropout(dropout)
68 |
69 | def forward(self, x, mask, past):
70 | attention_block, present = self.attention(self.layer_norm1(x), mask, past)
71 | add = self.dropout(self.layer_norm2(attention_block + x))
72 | feed_forward = self.feed_forward(add)
73 | out = self.dropout(feed_forward + add)
74 | return out, present
75 |
76 |
77 | class GPT2(nn.Module):
78 | def __init__(
79 | self,
80 | vocab_size,
81 | embedding_dims,
82 | dropout,
83 | heads,
84 | num_of_layers,
85 | forward_expansion,
86 | max_len,
87 | layer_norm_eps = 1e-5
88 | ):
89 | super(GPT2, self).__init__()
90 | self.embedding_dims = embedding_dims
91 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dims)
92 | self.positional_embeddings = nn.Parameter(torch.zeros(1, max_len, embedding_dims))
93 | self.dropout = nn.Dropout(dropout)
94 | self.gpt_blocks = nn.ModuleList(
95 | [
96 | GPTBlock(
97 | heads,
98 | embedding_dims,
99 | dropout,
100 | forward_expansion,
101 | layer_norm_eps
102 |
103 | )
104 | for _ in range(num_of_layers)
105 | ]
106 | )
107 |
108 | self.fc = nn.Linear(embedding_dims, vocab_size)
109 |
110 | self.apply(self._init_weights)
111 |
112 | #From @HuggingFace
113 | def _init_weights(self, module):
114 | if isinstance(module, (nn.Linear, nn.Embedding)):
115 | module.weight.data.normal_(mean=0.0, std=0.02)
116 |
117 | elif isinstance(module, nn.LayerNorm):
118 | module.bias.data.zero_()
119 | module.weight.data.fill_(1.0)
120 |
121 | if isinstance(module, nn.Linear) and module.bias is not None:
122 | module.bias.data.zero_()
123 |
124 |
125 |     def causal_mask(self, x, past):
126 |         ones_matrix = torch.ones((x.shape[-1], x.shape[-1]))
127 |         mask = torch.tril(ones_matrix)
128 |         if past is not None:
129 |             mask = torch.cat((ones_matrix, mask), dim=1)  # past positions stay fully visible (assumes past length == current length)
130 |         mask = mask.unsqueeze(0).unsqueeze(1)
131 |         return mask
132 |
133 |
134 | def forward(self, x, past=None):
135 |         causal_mask = self.causal_mask(x, past)
136 | seq_len = x.shape[-1]
137 | word_embeddings = self.word_embeddings(x)
138 | x = self.dropout(word_embeddings + self.positional_embeddings[:, :seq_len, :])
139 | presents = []
140 | past_layer = None
141 | for num, block in enumerate(self.gpt_blocks):
142 | if past is not None:
143 | past_layer = past[num]
144 |             x, present = block(x, causal_mask, past_layer)
145 | presents.append(present)
146 | return x, presents
147 |
148 |
149 | if __name__ == '__main__':
150 | #DEFAULT GPT-2 PARAMETERS :-
151 | vocab_size = 50257
152 | embedding_dims = 768
153 | dropout = 0.1
154 | heads = 12
155 | num_of_layers = 12
156 | forward_expansion = 4
157 | max_len = 1024
158 |
159 |
160 | a = torch.randint(1, 100, (3, 300))
161 | model = GPT2(
162 | vocab_size,
163 | embedding_dims,
164 | dropout,
165 | heads,
166 | num_of_layers,
167 | forward_expansion,
168 | max_len,
169 | )
170 |
171 | start = time()
172 | past_key_value = None
173 | for i in range(2):
174 | y, past_key_value = model(a, past_key_value)
175 | print(f'INFERENCE TIME = {time() - start}sec')
176 | x = sum(p.numel() for p in model.parameters() if p.requires_grad)
177 | print(f'NUMBER OF PARAMETERS ARE = {x}')
178 |
--------------------------------------------------------------------------------
/BERT.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class SelfAttention(nn.Module):
7 | def __init__(
8 | self,
9 | embedding_dims,
10 | heads
11 | ):
12 | super(SelfAttention, self).__init__()
13 | self.heads = heads
14 | self.embedding_dims = embedding_dims
15 | self.head_dims = int(embedding_dims/heads)
16 |
17 | self.key = nn.Linear(self.head_dims, self.head_dims)
18 | self.query = nn.Linear(self.head_dims, self.head_dims)
19 | self.value = nn.Linear(self.head_dims, self.head_dims)
20 |
21 | self.fc = nn.Linear(self.head_dims*self.heads, self.embedding_dims)
22 |
23 | def forward(self, query, key, value, mask):
24 | Batch = query.shape[0]
25 |
26 | query_len, key_len, value_len = query.shape[1], key.shape[1], value.shape[1]
27 |
28 | query = query.reshape(Batch, query_len, self.heads, self.head_dims)
29 | key = key.reshape(Batch, key_len, self.heads, self.head_dims)
30 | value = value.reshape(Batch, value_len, self.heads, self.head_dims)
31 |
32 | query = self.query(query)
33 | key = self.key(key)
34 | value = self.value(value)
35 |
36 | attention_score = torch.einsum('bqhd,bkhd->bhqk', [query, key])
37 |
38 | if mask is not None:
39 | attention_score = attention_score.masked_fill(mask==0, float('-1e20'))
40 |
41 | attention_score = attention_score/((self.head_dims)**(1/2))
42 | attention_score = torch.softmax(attention_score, dim=-1)
43 |
44 | out = torch.einsum('bhqv,bvhd->bqhd', [attention_score, value]).reshape(
45 | Batch, query_len, self.heads*self.head_dims
46 | )
47 |
48 | out = self.fc(out)
49 |
50 | return out
51 |
52 |
53 |
54 | class BertBlock(nn.Module):
55 | def __init__(
56 | self,
57 | embedding_dims,
58 | heads,
59 | dropout,
60 | forward_expansion,
61 | layer_norm_eps
62 | ):
63 | super(BertBlock, self).__init__()
64 | self.layer_norm1 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
65 | self.layer_norm2 = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
66 | self.attention = SelfAttention(embedding_dims, heads)
67 | self.feed_forward = nn.Sequential(
68 | nn.Linear(embedding_dims, embedding_dims*forward_expansion),
69 | nn.GELU(),
70 | nn.Linear(embedding_dims*forward_expansion, embedding_dims)
71 | )
72 | self.dropout = nn.Dropout(dropout)
73 |
74 | def forward(self, x, mask):
75 | attention_block = self.attention(x, x, x, mask)
76 | add = self.dropout(self.layer_norm1(x + attention_block))
77 | feed_forward = self.feed_forward(add)
78 | out = self.dropout(self.layer_norm2(feed_forward + add))
79 | return out
80 |
81 |
82 |
83 | class Embeddings(nn.Module):
84 | def __init__(
85 | self,
86 | vocab_size,
87 | max_len,
88 | embedding_dims
89 | ):
90 | super(Embeddings, self).__init__()
91 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dims)
92 | self.positional_embeddings = nn.Parameter(
93 | torch.zeros(1, max_len, embedding_dims)
94 | )
95 | self.segment_embeddings = nn.Embedding(3, embedding_dims)
96 |
97 | def forward(self, x, segment_x):
98 | sentence_len = x.shape[1]
99 | word_embeddings = self.word_embeddings(x)
100 | positional_embeddings = self.positional_embeddings[:, :sentence_len, :]
101 | segment_embeddings = self.segment_embeddings(segment_x)
102 | return word_embeddings + positional_embeddings + segment_embeddings
103 |
104 |
105 |
106 | class BERT(nn.Module):
107 | def __init__(
108 | self,
109 | vocab_size,
110 | max_len,
111 | mask_idx,
112 | embedding_dims,
113 | heads,
114 | forward_expansion,
115 | num_layers,
116 | dropout,
117 | layer_norm_eps
118 | ):
119 | super(BERT, self).__init__()
120 | self.embedding = Embeddings(
121 | vocab_size,
122 | max_len,
123 | embedding_dims
124 | )
125 |
126 | self.bert_blocks = nn.Sequential(
127 | *[
128 | BertBlock(
129 | embedding_dims,
130 | heads,
131 | dropout,
132 | forward_expansion,
133 | layer_norm_eps
134 | )
135 | for _ in range(num_layers)
136 | ]
137 |
138 | )
139 |
140 | self.layer_norm = nn.LayerNorm(embedding_dims, eps=layer_norm_eps)
141 | self.fc = nn.Linear(embedding_dims, vocab_size)
142 | self.dropout = nn.Dropout(dropout)
143 | self.mask_idx = mask_idx
144 |
145 | self.apply(self._init_weight)
146 |
147 | # @hugging_face
148 | def _init_weight(self, module):
149 | if isinstance(module, (nn.Linear, nn.Embedding)):
150 | module.weight.data.normal_(mean=0.0, std=0.02)
151 |
152 | elif isinstance(module, nn.LayerNorm):
153 | module.weight.data.fill_(1.0)
154 |
155 | if isinstance(module, nn.Linear) and module.bias is not None:
156 | module.bias.data.zero_()
157 |
158 | def create_mask(self, mask):
159 | mask = (mask != self.mask_idx).unsqueeze(1).unsqueeze(2)
160 | return mask
161 |
162 | def forward(self, x, segment_x, mask):
163 | mask = self.create_mask(mask)
164 | x = self.dropout(self.embedding(x, segment_x))
165 | for block in self.bert_blocks:
166 | x = block(x, mask)
167 | return x
168 |
169 |
170 | if __name__ == '__main__':
171 |     #DEFAULT BERT PARAMETERS :-
172 | vocab_size = 30522
173 | embedding_dims = 768
174 | dropout = 0.1
175 | heads = 12
176 | num_layers = 12
177 | forward_expansion = 4
178 | max_len = 512
179 | layer_norm_eps = 1e-12
180 | mask_idx = 0
181 |
182 | x = torch.randint(1, 100, (32, 100))
183 | x_segment = torch.randint(0, 2, (32, 100))
184 |
185 | model = BERT(
186 | vocab_size,
187 | max_len,
188 | mask_idx,
189 | embedding_dims,
190 | heads,
191 | forward_expansion,
192 | num_layers,
193 | dropout,
194 | layer_norm_eps
195 | )
196 |
197 | mask = torch.randint(0, 2, (32, 100))
198 | start = time()
199 | y = model(x, x_segment, mask)
200 | print(f'INFERENCE TIME = {time() - start}sec')
201 | x = sum(p.numel() for p in model.parameters() if p.requires_grad)
202 | print(f'NUMBER OF PARAMETERS ARE = {x}')
203 |
--------------------------------------------------------------------------------
/TRANSFORMERS.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import torch
3 | import torch.nn as nn
4 |
5 | class Attention(nn.Module):
6 | def __init__(self, input_shape, head):
7 | super(Attention, self).__init__()
8 | self.head = head
9 | self.input_shape = input_shape
10 | self.head_dims = int(input_shape // head)
11 |
12 | self.query = nn.Linear(self.head_dims, self.head_dims)
13 | self.key = nn.Linear(self.head_dims, self.head_dims)
14 | self.value = nn.Linear(self.head_dims, self.head_dims)
15 | self.fc = nn.Linear(self.head_dims*head, input_shape)
16 |
17 | def forward(self, query, key, value, mask=None):
18 | batch = query.shape[0]
19 | query_len, key_len, value_len = query.shape[1], key.shape[1], value.shape[1]
20 |
21 | query = query.reshape(batch, query_len, self.head, self.head_dims)
22 | key = key.reshape(batch, key_len, self.head, self.head_dims)
23 | value = value.reshape(batch, value_len, self.head, self.head_dims)
24 |
25 | query = self.query(query)
26 | key = self.key(key)
27 | value = self.value(value)
28 |
29 | score = torch.einsum("bqhd,bkhd->bhqk", [query, key])
30 |
31 | if mask is not None:
32 |             score = score.masked_fill(mask == 0, float("-1e20"))
33 | score = torch.softmax(score/((self.head_dims)**(1/2)), dim=-1)
34 |
35 | out = torch.einsum("bhqv,bvhd->bqhd", [score, value])
36 | out = out.reshape(batch, query_len, self.head*self.head_dims)
37 | out = self.fc(out)
38 |
39 | return out
40 |
41 |
42 |
43 | class TransformerBlock(nn.Module):
44 | def __init__(self, input_shape, head, dropout, forward_expansion):
45 | super(TransformerBlock, self).__init__()
46 | self.attention = Attention(input_shape, head)
47 | self.feed_forward = nn.Sequential(
48 | nn.Linear(input_shape, input_shape*forward_expansion),
49 | nn.GELU(),
50 | nn.Linear(input_shape*forward_expansion, input_shape)
51 | )
52 | self.layernorm1 = nn.LayerNorm(input_shape)
53 | self.layernorm2 = nn.LayerNorm(input_shape)
54 |
55 | self.dropout = nn.Dropout(dropout)
56 |
57 | def forward(self, query, key, value, mask):
58 | attention = self.attention(query, key, value, mask)
59 | add = attention + query
60 |         regularization = self.dropout(self.layernorm1(add))
61 |         forward = self.feed_forward(regularization)
62 |         out = self.dropout(self.layernorm2(forward + regularization))
63 | return out
64 |
65 |
66 |
67 | class Encoder(nn.Module):
68 | def __init__(
69 | self,
70 | vocab_size,
71 | embedding_out,
72 | num_layers,
73 | heads,
74 | forward_expansion,
75 | dropout,
76 | max_len
77 | ):
78 | super(Encoder, self).__init__()
79 | self.word_embedding = nn.Embedding(vocab_size, embedding_out)
80 |         self.positional_embedding = nn.Parameter(torch.zeros(1, max_len, embedding_out))
81 | self.dropout = nn.Dropout(dropout)
82 | self.layers = nn.Sequential(
83 | *[
84 | TransformerBlock(
85 | embedding_out,
86 | heads,
87 | dropout,
88 | forward_expansion
89 | )
90 | for _ in range(num_layers)
91 | ]
92 | )
93 |
94 | def forward(self, x, mask):
95 | word_embedding = self.word_embedding(x)
96 |         positional_embedding = self.positional_embedding[:, :x.shape[1], :]
97 |         out = self.dropout(word_embedding + positional_embedding)
98 | for layer in self.layers:
99 | out = layer(out, out, out, mask)
100 | return out
101 |
102 |
103 |
104 | class DecoderBlock(nn.Module):
105 | def __init__(
106 | self,
107 | embedding_out,
108 | head,
109 | forward_expansion,
110 | dropout
111 | ):
112 | super(DecoderBlock, self).__init__()
113 | self.attention = Attention(embedding_out, head)
114 | self.transformer_block = TransformerBlock(
115 | embedding_out,
116 | head,
117 | dropout,
118 | forward_expansion
119 | )
120 | self.dropout = nn.Dropout(dropout)
121 | self.norm = nn.LayerNorm(embedding_out)
122 |
123 | def forward(self, query, key, value, src_mask, causal_mask):
124 | attention = self.attention(query, query, query, causal_mask)
125 | query = self.dropout(self.norm(attention + query))
126 | out = self.transformer_block(query, key, value, src_mask)
127 | return out
128 |
129 |
130 |
131 | class Decoder(nn.Module):
132 | def __init__(
133 | self,
134 | vocab_size,
135 | embedding_out,
136 | num_layers,
137 | head,
138 | forward_expansion,
139 | dropout,
140 | max_len
141 | ):
142 | super(Decoder, self).__init__()
143 | self.word_embedding = nn.Embedding(vocab_size, embedding_out)
144 | self.positional_embedding = nn.Parameter(torch.zeros(1, max_len, embedding_out))
145 | self.layers = nn.Sequential(
146 | *[
147 | DecoderBlock(
148 | embedding_out,
149 | head,
150 | forward_expansion,
151 | dropout
152 | )
153 | for _ in range(num_layers)
154 | ]
155 | )
156 | self.fc = nn.Linear(embedding_out, vocab_size)
157 | self.dropout = nn.Dropout(dropout)
158 |
159 |     def forward(self, x, encoder_output, src_mask, causal_mask):
160 | x = self.dropout(self.word_embedding(x) + self.positional_embedding[:, :x.shape[1], :])
161 | for layer in self.layers:
162 | x = layer(
163 | x,
164 | encoder_output,
165 | encoder_output,
166 | src_mask,
167 |                 causal_mask
168 | )
169 | out = self.fc(x)
170 | return out
171 |
172 |
173 |
174 | class Transformers(nn.Module):
175 | def __init__(
176 | self,
177 | input_vocab_size,
178 | output_vocab_size,
179 | pad_idx,
180 | embedding_out,
181 | num_layers,
182 | forward_expansion,
183 | head,
184 | dropout,
185 | max_len
186 | ):
187 | super(Transformers, self).__init__()
188 | self.encoder = Encoder(
189 | input_vocab_size,
190 | embedding_out,
191 | num_layers,
192 | head,
193 | forward_expansion,
194 | dropout,
195 | max_len
196 | )
197 |
198 | self.decoder = Decoder(
199 | output_vocab_size,
200 | embedding_out,
201 | num_layers,
202 | head,
203 | forward_expansion,
204 | dropout,
205 | max_len
206 | )
207 |
208 | self.pad_idx = pad_idx
209 | self.apply(self._init_weights)
210 |
211 | #From @HuggingFace
212 | def _init_weights(self, module):
213 | if isinstance(module, (nn.Linear, nn.Embedding)):
214 | module.weight.data.normal_(mean=0.0, std=0.02)
215 |
216 | elif isinstance(module, nn.LayerNorm):
217 | module.weight.data.fill_(1.0)
218 |
219 | if isinstance(module, nn.Linear) and module.bias is not None:
220 | module.bias.data.zero_()
221 |
222 | def pad_mask(self, inputs):
223 | pad_mask = (inputs != self.pad_idx).unsqueeze(1).unsqueeze(2)
224 | return pad_mask
225 |
226 | def causal_mask(self, target):
227 | N, target_len = target.shape
228 | target_mask = torch.tril(torch.ones((N, target_len, target_len))).unsqueeze(1)
229 | return target_mask
230 |
231 | def forward(self, inputs, target):
232 | pad_mask = self.pad_mask(inputs)
233 | causal_mask = self.causal_mask(target)
234 | encoder_output = self.encoder(inputs, pad_mask)
235 | decoder_out = self.decoder(target, encoder_output, pad_mask, causal_mask)
236 | return decoder_out
237 |
238 |
239 |
240 | if __name__ == "__main__":
241 | #Depends on the Tokenizer
242 | input_vocab_size = 100
243 | output_vocab_size = 200
244 |
245 | #DEFAULT TRANSFORMERS PARAMETERS:-
246 | pad_idx = 0
247 | embedding_out = 512
248 | num_layers = 6
249 | forward_expansion = 4
250 | head = 8
251 | dropout = 0.1
252 | max_len = 512
253 |
254 | inputs = torch.randint(0, 100, (32, 200))
255 | targets = torch.randint(0, 100, (32,100))
256 |
257 | model = Transformers(
258 | input_vocab_size,
259 | output_vocab_size,
260 | pad_idx,
261 | embedding_out,
262 | num_layers,
263 | forward_expansion,
264 | head,
265 | dropout,
266 | max_len
267 | )
268 |
269 | start = time()
270 | y = model(inputs, targets)
271 | print(f'INFERENCE TIME = {time() - start}sec')
272 | x = sum(p.numel() for p in model.parameters() if p.requires_grad)
273 | print(f'NUMBER OF PARAMETERS ARE = {x}')
274 |
--------------------------------------------------------------------------------
/PERFORMER.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class FastAttention(nn.Module):
7 | def __init__(self, input_shape, head, n_features):
8 | super(FastAttention, self).__init__()
9 | self.head = head
10 | self.input_shape = input_shape
11 | self.depth = int(input_shape // head)
12 | self.n_features = n_features
13 | self.key_ORF = self.OrthogonalRandomFeature()
14 | self.query_ORF = self.OrthogonalRandomFeature()
15 |
16 | self.query = nn.Linear(self.depth, self.depth)
17 | self.key = nn.Linear(self.depth, self.depth)
18 | self.value = nn.Linear(self.depth, self.depth)
19 | self.fc = nn.Linear(self.depth*head, input_shape)
20 |
21 | def kernel_function(self, x, flag):
22 | ORF = self.query_ORF if flag == 'query' else self.key_ORF
23 | normalization_factor = 1/ORF.shape[-1]**0.25
24 | x *= normalization_factor
25 | out = torch.einsum('nhsd, fd -> nhsf', x, ORF)
26 | kernel_fn = nn.ReLU()(out) + 1e-3
27 | return kernel_fn
28 |
29 |     def OrthogonalRandomFeature(self):
30 |         n = self.n_features//self.depth
31 |         remainder = self.n_features%self.depth
32 |         orthogonal_features = []
33 |         for _ in range(n):
34 |             normal_feature = torch.randn(self.depth, self.depth)  # Gaussian block, orthogonalised via QR
35 |             orthogonal_feature, _ = torch.linalg.qr(normal_feature)
36 |             orthogonal_features.append(orthogonal_feature)
37 |
38 |         if remainder > 0:
39 |             normal_feature = torch.randn(self.depth, self.depth)
40 |             orthogonal_feature, _ = torch.linalg.qr(normal_feature)
41 |             orthogonal_features.append(orthogonal_feature[0: remainder])
42 |
43 |         orthogonal_features = torch.cat(orthogonal_features)
44 |         multiplier = torch.randn(self.n_features, self.depth).norm(dim=1)  # chi-distributed row norms
45 |         final_features = torch.matmul(torch.diag(multiplier), orthogonal_features)
46 |
47 |         return final_features
48 |
49 |     def causal_attention(self, q, k, v):
50 |         denominator = 1/torch.einsum('nhqf, nhqf -> nhq', q, k.cumsum(dim=-2))  # q_i . (sum_{j<=i} k_j); assumes query and key lengths match
51 |         x = torch.einsum('nhkf, nhkd -> nhkfd', k, v)
52 |         x = x.cumsum(dim=-3)  # prefix sums of k_j v_j^T enforce causality
53 |         out = torch.einsum('nhqfd, nhqf, nhq -> nhqd', x, q, denominator)
54 |         return out
55 |
56 |
57 | def bidirectional_attention(self, q, k, v):
58 | kt_i = torch.einsum('nhkf -> nhf', k)
59 | normalization_factor = 1/(torch.einsum('nhqf, nhf -> nhq', q, kt_i))
60 | k_v = torch.einsum('nhkf, nhkd -> nhfd', k, v)
61 | attention = torch.einsum('nhfd, nhqf, nhq-> nhqd', k_v, q, normalization_factor)
62 | return attention
63 |
64 |
65 |     def forward(self, query, key, value, mask=None, causal_mask=False):
66 | batch = query.shape[0]
67 | query_len, key_len, value_len = query.shape[1], key.shape[1], value.shape[1]
68 |
69 |
70 | query = query.reshape(batch, query_len, self.head, self.depth)
71 | key = key.reshape(batch, key_len, self.head, self.depth)
72 | value = value.reshape(batch, value_len, self.head, self.depth)
73 |
74 | query = query.permute(0, 2, 1, 3)
75 | key = key.permute(0, 2, 1, 3)
76 | value = value.permute(0, 2, 1, 3)
77 |
78 | query = self.query(query)
79 | key = self.key(key)
80 | value = self.value(value)
81 |
82 | if mask is not None:
83 |             key = key.masked_fill(mask == 0, float(0.0))  # zero out padded key positions before the feature map
84 |
85 | query = self.kernel_function(query, 'query')
86 | key = self.kernel_function(key, 'key')
87 |
88 |         if causal_mask:
89 | out = self.causal_attention(query, key, value)
90 | else:
91 | out = self.bidirectional_attention(query, key, value)
92 |
93 | out = out.permute(0, 2, 1, 3)
94 | out = out.reshape(batch, query_len, self.head*self.depth)
95 | out = self.fc(out)
96 |
97 | return out
98 |
99 |
100 |
101 | class PerformerBlock(nn.Module):
102 | def __init__(self, input_shape, head, n_features, dropout, forward_expansion):
103 | super(PerformerBlock, self).__init__()
104 | self.attention = FastAttention(input_shape, head, n_features)
105 | self.feed_forward = nn.Sequential(
106 | nn.Linear(input_shape, input_shape*forward_expansion),
107 | nn.GELU(),
108 | nn.Linear(input_shape*forward_expansion, input_shape)
109 | )
110 | self.layernorm1 = nn.LayerNorm(input_shape)
111 | self.layernorm2 = nn.LayerNorm(input_shape)
112 |
113 | self.dropout = nn.Dropout(dropout)
114 |
115 | def forward(self, query, key, value, mask):
116 | attention = self.attention(query, key, value, mask)
117 | add = attention + query
118 |         regularization = self.dropout(self.layernorm1(add))
119 |         forward = self.feed_forward(regularization)
120 |         out = self.dropout(self.layernorm2(forward + regularization))
121 | return out
122 |
123 |
124 |
125 | class Encoder(nn.Module):
126 | def __init__(
127 | self,
128 | vocab_size,
129 | embedding_out,
130 | num_layers,
131 | heads,
132 | n_features,
133 | forward_expansion,
134 | dropout,
135 | max_len
136 | ):
137 | super(Encoder, self).__init__()
138 | self.word_embedding = nn.Embedding(vocab_size, embedding_out)
139 |         self.positional_embedding = nn.Parameter(torch.zeros(1, max_len, embedding_out))
140 | self.dropout = nn.Dropout(dropout)
141 | self.layers = nn.Sequential(
142 | *[
143 | PerformerBlock(
144 | embedding_out,
145 | heads,
146 | n_features,
147 | dropout,
148 | forward_expansion
149 | )
150 | for _ in range(num_layers)
151 | ]
152 | )
153 |
154 | def forward(self, x, mask):
155 | word_embedding = self.word_embedding(x)
156 |         positional_embedding = self.positional_embedding[:, :x.shape[1], :]
157 |         out = self.dropout(word_embedding + positional_embedding)
158 | for layer in self.layers:
159 | out = layer(out, out, out, mask)
160 | return out
161 |
162 |
163 |
164 | class DecoderBlock(nn.Module):
165 | def __init__(
166 | self,
167 | embedding_out,
168 | head,
169 | n_features,
170 | forward_expansion,
171 | dropout
172 | ):
173 | super(DecoderBlock, self).__init__()
174 | self.attention = FastAttention(embedding_out, head, n_features)
175 | self.Performer_block = PerformerBlock(
176 | embedding_out,
177 | head,
178 | n_features,
179 | dropout,
180 | forward_expansion
181 | )
182 | self.dropout = nn.Dropout(dropout)
183 | self.norm = nn.LayerNorm(embedding_out)
184 |
185 |     def forward(self, query, key, value, src_mask, trg_mask):
186 |         attention = self.attention(query, query, query, trg_mask, True)  # causal self-attention over the target
187 |         query = self.dropout(self.norm(attention + query))
188 |         out = self.Performer_block(query, key, value, src_mask)  # cross-attention over the encoder output
189 |         return out
190 |
191 |
192 |
193 | class Decoder(nn.Module):
194 | def __init__(
195 | self,
196 | vocab_size,
197 | embedding_out,
198 | num_layers,
199 | head,
200 | n_features,
201 | forward_expansion,
202 | dropout,
203 | max_len
204 | ):
205 | super(Decoder, self).__init__()
206 | self.word_embedding = nn.Embedding(vocab_size, embedding_out)
207 | self.positional_embedding = nn.Parameter(torch.zeros(1, max_len, embedding_out))
208 | self.layers = nn.Sequential(
209 | *[
210 | DecoderBlock(
211 | embedding_out,
212 | head,
213 | n_features,
214 | forward_expansion,
215 | dropout
216 | )
217 | for _ in range(num_layers)
218 | ]
219 | )
220 | self.fc = nn.Linear(embedding_out, vocab_size)
221 | self.dropout = nn.Dropout(dropout)
222 |
223 |     def forward(self, x, encoder_output, src_mask, trg_mask):
224 | x = self.dropout(self.word_embedding(x) + self.positional_embedding[:, :x.shape[1], :])
225 | for layer in self.layers:
226 | x = layer(
227 | x,
228 | encoder_output,
229 | encoder_output,
230 |                 src_mask, trg_mask
231 | )
232 | out = self.fc(x)
233 | return out
234 |
235 |
236 |
237 | class Performers(nn.Module):
238 | def __init__(
239 | self,
240 | input_vocab_size,
241 | output_vocab_size,
242 | pad_idx,
243 | embedding_out,
244 | num_layers,
245 | forward_expansion,
246 | head,
247 | n_features,
248 | dropout,
249 | max_len
250 | ):
251 | super(Performers, self).__init__()
252 | self.encoder = Encoder(
253 | input_vocab_size,
254 | embedding_out,
255 | num_layers,
256 | head,
257 | n_features,
258 | forward_expansion,
259 | dropout,
260 | max_len
261 | )
262 |
263 | self.decoder = Decoder(
264 | output_vocab_size,
265 | embedding_out,
266 | num_layers,
267 | head,
268 | n_features,
269 | forward_expansion,
270 | dropout,
271 | max_len
272 | )
273 |
274 | self.pad_idx = pad_idx
275 | self.apply(self._init_weights)
276 |
277 | #From @HuggingFace
278 | def _init_weights(self, module):
279 | if isinstance(module, (nn.Linear, nn.Embedding)):
280 | module.weight.data.normal_(mean=0.0, std=0.02)
281 |
282 | elif isinstance(module, nn.LayerNorm):
283 | module.weight.data.fill_(1.0)
284 |
285 | if isinstance(module, nn.Linear) and module.bias is not None:
286 | module.bias.data.zero_()
287 |
288 | def input_pad_mask(self, inputs):
289 | pad_mask = (inputs != self.pad_idx).unsqueeze(1).unsqueeze(3)
290 | return pad_mask
291 |
292 |     def output_pad_mask(self, targets):
293 |         pad_mask = (targets != self.pad_idx).unsqueeze(1).unsqueeze(3)
294 |         return pad_mask
295 |
296 |     def forward(self, inputs, target):
297 |         input_pad_mask = self.input_pad_mask(inputs)
298 |         output_pad_mask = self.output_pad_mask(target)
299 |         encoder_output = self.encoder(inputs, input_pad_mask)
300 |         decoder_out = self.decoder(target, encoder_output, input_pad_mask, output_pad_mask)
301 |         return decoder_out
302 |
303 |
304 | if __name__ == "__main__":
305 | #Depends on the Tokenizer
306 | input_vocab_size = 100
307 | output_vocab_size = 200
308 |
309 |     #DEFAULT PERFORMER PARAMETERS:-
310 | pad_idx = 0
311 | embedding_out = 512
312 | num_layers = 6
313 | forward_expansion = 4
314 | head = 8
315 | n_features = 256
316 | dropout = 0.1
317 | max_len = 512
318 |
319 | inputs = torch.randint(0, 100, (32, 200))
320 | targets = torch.randint(0, 100, (32,100))
321 |
322 | model = Performers(
323 | input_vocab_size,
324 | output_vocab_size,
325 | pad_idx,
326 | embedding_out,
327 | num_layers,
328 | forward_expansion,
329 | head,
330 | n_features,
331 | dropout,
332 | max_len
333 | )
334 |
335 | start = time()
336 | y = model(inputs, targets)
337 | print(f'INFERENCE TIME = {time() - start}sec')
338 | x = sum(p.numel() for p in model.parameters() if p.requires_grad)
339 | print(f'NUMBER OF PARAMETERS ARE = {x}')
--------------------------------------------------------------------------------