├── src
│   ├── __init__.py
│   ├── trainer
│   │   ├── __init__.py
│   │   └── trainer.py
│   ├── transformer
│   │   ├── __init__.py
│   │   ├── encoder.py
│   │   ├── positional_encoding.py
│   │   └── transformer.py
│   ├── data_handling
│   │   ├── __init__.py
│   │   └── data_handler.py
│   ├── library.py
│   └── utils.py
├── requirements.txt
├── .gitignore
├── README.md
└── main.py

/src/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/trainer/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/transformer/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/data_handling/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
matplotlib==3.6.2
numpy==1.23.5
pandas==1.5.3
sktime==0.19.0
torch==2.0.0
tqdm==4.65.0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
.DS_Store
src/__pycache__/
src/data_handling/__pycache__/
src/trainer/__pycache__/
src/transformer/__pycache__/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Human Activity Recognition using Transformer Models

We implement and train a transformer model to solve a human activity recognition task.

The data is available from [here](https://www.timeseriesclassification.com/description.php?Dataset=MotionSenseHAR).

The code is adapted from [this](https://github.com/gzerveas/mvts_transformer) framework and applied to the task of human activity recognition.

Please see my [Medium](https://pub.towardsai.net/time-series-regression-using-transformer-models-a-plain-english-introduction-3215892e1cc) post for more information.
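
## Quick start

A minimal sketch of how the pieces fit together (it mirrors `main.py` and assumes the MotionSenseHAR `.ts` files have been downloaded into `data/MotionSenseHAR/`):

```python
from src.transformer.transformer import TSTransformerEncoderClassiregressor
from src.trainer.trainer import Trainer
from src.utils import get_data
import torch

# Load the train/test .ts files and wrap them in a DataHandler
dh = get_data(train_path='data/MotionSenseHAR/MotionSenseHAR_TRAIN.ts',
              test_path='data/MotionSenseHAR/MotionSenseHAR_TEST.ts')
dh.create_dataset()
dh.split_data(train_split=0.8)

model = TSTransformerEncoderClassiregressor(
    feat_dim=12, d_model=64, max_len=1000, n_heads=8, num_layers=6,
    dim_feedforward=512, num_classes=6, dropout=0.1,
    pos_encoding="learnable", activation="gelu", norm="BatchNorm", freeze=False,
)

trainer = Trainer(dh=dh, epochs=1)
optimiser = torch.optim.Adam(model.parameters(), lr=1e-4)
trainer.fit(dataloader=dh.create_dataloader(dh.train_data, batch_size=8),
            model=model, optimiser=optimiser)
print(trainer.evaluate(dataloader=dh.create_dataloader(dh.test_data, batch_size=8),
                       model=model))
```

Alternatively, run `python main.py` from the repository root.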
--------------------------------------------------------------------------------
/src/library.py:
--------------------------------------------------------------------------------
# Central import module: the rest of the package star-imports everything defined here.
import torch
import torch.nn.functional as F
from torch import nn, Tensor
import torch.optim as optim
import torch.utils.data as data
from typing import Optional, Any, Type, List, Tuple
import os
import numpy as np
import json
import pandas as pd
import random
import matplotlib.pyplot as plt
import pickle as pkl
from tqdm import tqdm
from sktime.datasets import load_from_tsfile
import math
from src.utils import *
from torch.nn.modules import (
    MultiheadAttention,
    Linear,
    Dropout,
    BatchNorm1d,
    TransformerEncoderLayer,
)

import copy
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
from src.library import *
from src.data_handling.data_handler import DataHandler


def get_data(train_path, test_path):
    """Load the train/test .ts files and wrap them in a DataHandler."""
    torch.cuda.empty_cache()
    train_x, train_y = load_from_tsfile(
        train_path, return_data_type='numpy3d')
    train_x = torch.tensor(train_x)
    train_y_orig, train_y = np.unique(train_y, return_inverse=True)
    n_values = np.max(train_y) + 1
    train_y = np.eye(n_values)[train_y]  # one-hot encode the integer labels

    test_x, test_y = load_from_tsfile(
        test_path, return_data_type='numpy3d')
    test_x = torch.tensor(test_x)
    test_y_orig, test_y = np.unique(test_y, return_inverse=True)
    n_values = np.max(test_y) + 1
    test_y = np.eye(n_values)[test_y]
    train_y = torch.tensor(train_y)
    test_y = torch.tensor(test_y)

    dh = DataHandler()
    # (n_samples, feat_dim, seq_length) -> (n_samples, seq_length, feat_dim)
    data_x = torch.concat((train_x, test_x), dim=0).permute(0, 2, 1)
    data_y = torch.concat((train_y, test_y), dim=0)
    dh.dataset_x = data_x
    dh.dataset_y = data_y
    return dh


def get_activation_fn(activation: str):
    if activation == 'relu':
        return F.relu
    elif activation == 'gelu':
        return F.gelu
    raise ValueError(f"Activation should be relu/gelu, not {activation}.")
--------------------------------------------------------------------------------
/src/data_handling/data_handler.py:
--------------------------------------------------------------------------------
import torch.utils.data

from src.library import *


class DataHandler:
    def __init__(self):
        """DataHandler class.

        Holds the full dataset and provides helpers to build a TensorDataset,
        split it into train/test subsets and wrap the subsets in dataloaders.

        Attributes:
            dataset_x (Tensor): Input series of shape (n_samples, seq_length, feat_dim).
            dataset_y (Tensor): One-hot labels of shape (n_samples, n_classes).
            torch_dataset (TensorDataset): Dataset built by create_dataset.
            train_data (Subset): Training split created by split_data.
            test_data (Subset): Test split created by split_data.
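
        Example (illustrative sketch using random tensors; in this project the
        handler is populated by src.utils.get_data):
            dh = DataHandler()
            dh.dataset_x = torch.randn(100, 50, 12)                   # (n_samples, seq_length, feat_dim)
            dh.dataset_y = torch.eye(6)[torch.randint(0, 6, (100,))]  # one-hot labels
            dh.create_dataset()
            dh.split_data(train_split=0.8)
            loader = dh.create_dataloader(dh.train_data, batch_size=8)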
        """

        self.dataset = None
        self.dataset_x = None
        self.dataset_y = None
        self.train_data = None
        self.test_data = None
        self.torch_dataset = None
        random.seed(1)

    def create_dataset(self):
        self.torch_dataset = torch.utils.data.TensorDataset(
            self.dataset_x, self.dataset_y
        )

    def split_data(self, train_split: float = 0.8):
        self.train_data, self.test_data = torch.utils.data.random_split(
            self.torch_dataset, [train_split, 1 - train_split]
        )

    @staticmethod
    def create_dataloader(dataset: torch.utils.data.Dataset,
                          batch_size: int) -> torch.utils.data.DataLoader:
        dataloader = torch.utils.data.DataLoader(
            dataset, shuffle=True, batch_size=batch_size
        )
        return dataloader
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from src.transformer.transformer import TSTransformerEncoderClassiregressor
from src.trainer.trainer import Trainer
from src.utils import get_data
from src.library import *


def run_all():
    dh = get_data(train_path='data/MotionSenseHAR/MotionSenseHAR_TRAIN.ts',
                  test_path='data/MotionSenseHAR/MotionSenseHAR_TEST.ts')

    for lr in [1e-4]:
        model = TSTransformerEncoderClassiregressor(
            feat_dim=12,
            d_model=64,
            max_len=1000,
            n_heads=8,
            num_layers=6,
            dim_feedforward=512,
            num_classes=6,
            dropout=0.1,
            pos_encoding="learnable",
            activation="gelu",
            norm="BatchNorm",
            freeze=False,
        )
        trainer = Trainer(dh=dh, epochs=1)
        dh.create_dataset()
        dh.split_data(train_split=0.8)
        dataloader_train = dh.create_dataloader(dh.train_data, batch_size=8)
        optimiser = torch.optim.Adam(model.parameters(), lr=lr)
        trainer.fit(dataloader=dataloader_train, model=model, optimiser=optimiser)
        dataloader_test = dh.create_dataloader(dh.test_data, batch_size=8)
        accuracy = trainer.evaluate(dataloader=dataloader_test, model=model)
        print(f"lr={lr}: test accuracy = {accuracy}")


if __name__ == "__main__":
    run_all()
--------------------------------------------------------------------------------
/src/transformer/encoder.py:
--------------------------------------------------------------------------------
from src.library import *


class TransformerBatchNormEncoderLayer(nn.modules.Module):
    """Transformer encoder layer that uses BatchNorm instead of LayerNorm.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multihead attention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of the intermediate layer, relu or gelu (default=relu).
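
    Example (illustrative; uses the (seq_len, batch_size, d_model) layout assumed
    throughout this repository):
        layer = TransformerBatchNormEncoderLayer(d_model=64, nhead=8, dim_feedforward=256)
        src = torch.randn(100, 8, 64)   # (seq_len, batch_size, d_model)
        out = layer(src)                # same shape: (100, 8, 64)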
    """

    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        activation: str = "relu",
    ):
        super(TransformerBatchNormEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of the feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = BatchNorm1d(
            d_model, eps=1e-5
        )  # normalizes each feature across batch samples and time steps
        self.norm2 = BatchNorm1d(d_model, eps=1e-5)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = get_activation_fn(activation)

    def __setstate__(self, state):
        if "activation" not in state:
            state["activation"] = F.relu
        super(TransformerBatchNormEncoderLayer, self).__setstate__(state)

    def forward(
        self,
        src: Tensor,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        is_causal: bool = False,
    ) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer.
            src_mask: the mask for the src sequence.
            src_key_padding_mask: the mask for the src keys per batch.
            is_causal: flag for causality; unused here, present for compatibility with nn.TransformerEncoderLayer.
        """
        src2 = self.self_attn(
            src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
        )[0]
        src = src + self.dropout1(src2)  # (seq_len, batch_size, d_model)
        src = src.permute(1, 2, 0)  # (batch_size, d_model, seq_len)
        src = self.norm1(src)
        src = src.permute(2, 0, 1)  # restore (seq_len, batch_size, d_model)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)  # (seq_len, batch_size, d_model)
        src = src.permute(1, 2, 0)  # (batch_size, d_model, seq_len)
        src = self.norm2(src)
        src = src.permute(2, 0, 1)  # restore (seq_len, batch_size, d_model)
        return src
--------------------------------------------------------------------------------
/src/transformer/positional_encoding.py:
--------------------------------------------------------------------------------
from src.library import *


class FixedPositionalEncoding(nn.Module):
    """Fixed (sinusoidal) positional encoding of the input.

    Args:
        d_model: the embed dim.
        dropout: the dropout value.
        max_len: the max. length of the incoming sequence.
        scale_factor: the scale factor for the positional encoding.
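
    The buffer built in __init__ implements the standard sinusoidal scheme from
    "Attention Is All You Need":
        PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    so each dimension corresponds to a sinusoid of a different wavelength.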
    """

    def __init__(
        self,
        d_model: int,
        dropout: float = 0.1,
        max_len: int = 1024,
        scale_factor: float = 1.0,
    ):
        super(FixedPositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)  # positional encoding
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = scale_factor * pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer(
            "pe", pe
        )  # this stores the variable in the state_dict (used for non-trainable variables)

    def forward(self, x: Tensor) -> Tensor:
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        """

        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)


class LearnablePositionalEncoding(nn.Module):
    """Positional encoding of input. This is learnable.

    Args:
        d_model: the embed dim.
        dropout: the dropout value.
        max_len: the max. length of the incoming sequence.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 1024):
        super(LearnablePositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Each position gets its own embedding
        # Since indices are always 0 ... max_len, we don't have to do a look-up
        self.pe = nn.Parameter(
            torch.empty(max_len, 1, d_model)
        )  # requires_grad automatically set to True
        nn.init.uniform_(self.pe, -0.02, 0.02)

    def forward(self, x: Tensor) -> Tensor:
        """Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model.
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        """

        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)


def get_pos_encoder(pos_encoding: str) -> Type[nn.Module]:
    if pos_encoding == "learnable":
        return LearnablePositionalEncoding
    elif pos_encoding == "fixed":
        return FixedPositionalEncoding

    raise NotImplementedError(
        "pos_encoding should be 'learnable'/'fixed', not '{}'".format(pos_encoding)
    )
--------------------------------------------------------------------------------
/src/trainer/trainer.py:
--------------------------------------------------------------------------------
import torch.optim
from src.library import *
from src.data_handling.data_handler import DataHandler


class Trainer:
    """Trainer class for the transformer model.

    The model, dataloader and optimiser are not stored on the trainer; they are
    passed directly to fit() and evaluate().

    Args:
        dh: The data handler object.
        epochs: The number of epochs to train for.
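
    Example (illustrative sketch; mirrors main.py and assumes dh, model and the
    dataloaders have been built as shown there):
        trainer = Trainer(dh=dh, epochs=10)
        optimiser = torch.optim.Adam(model.parameters(), lr=1e-4)
        trainer.fit(dataloader=train_loader, model=model, optimiser=optimiser)
        accuracy = trainer.evaluate(dataloader=test_loader, model=model)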
    """

    def __init__(
        self,
        dh: DataHandler,
        epochs: int = 10,
    ):
        self.criterion = nn.CrossEntropyLoss()

        self.dh = dh
        self.dataset = None
        self.train_data = None
        self.test_data = None
        self.n_epochs = epochs
        cuda_dev = "0"
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda:" + cuda_dev if use_cuda else "cpu")

    def fit(self, dataloader: torch.utils.data.DataLoader, model: nn.Module, optimiser: torch.optim.Optimizer):
        losses = []
        model = model.to(self.device)
        model.train()
        model.double()
        for epoch in range(self.n_epochs):
            losses = self.train_one_epoch(dataloader=dataloader, epoch_no=epoch, losses=losses, optimiser=optimiser, model=model)

    def train_one_epoch(self, dataloader, epoch_no, losses, optimiser, model, disable_tqdm=False):
        epoch_loss = 0
        i = 0
        with tqdm(dataloader, unit="batch", disable=disable_tqdm) as tepoch:
            for idx, data in enumerate(tepoch):
                i += 1
                loss, losses = self._train_one_loop(data=data, losses=losses, model=model, optimiser=optimiser)
                epoch_loss += loss.detach()
                tepoch.set_description(f"Epoch {epoch_no}")
                tepoch.set_postfix(loss=epoch_loss.item() / i)
        return losses

    def _train_one_loop(
        self, data: List[Tensor], losses: List[float], model: nn.Module, optimiser: torch.optim.Optimizer
    ) -> Tuple[Tensor, List[float]]:
        optimiser.zero_grad()
        data[0] = data[0].double()
        # no padding is used, so every position is marked as valid
        padding_mask = torch.ones((data[0].shape[0], data[0].shape[1])) > 0
        output = model(data[0].to(self.device), padding_mask.to(self.device))
        loss = self.criterion(output, data[1].type(torch.DoubleTensor).to(self.device))
        loss.backward()
        optimiser.step()
        losses.append(loss.detach())
        return loss.detach(), losses

    def evaluate(self, dataloader: torch.utils.data.DataLoader, model: nn.Module):
        """Run the model on the test set and return the accuracy."""
        model.eval()
        n_correct = 0
        n_incorrect = 0
        for idx, data in enumerate(dataloader):
            padding_mask = torch.ones((data[0].shape[0], data[0].shape[1])) > 0

            output = model(data[0].to(self.device), padding_mask.to(self.device))
            predictions = torch.argmax(output, dim=1)
            target = torch.argmax(data[1], dim=1).to(self.device)
            incorrect = torch.count_nonzero(predictions - target)
            n_incorrect += incorrect.detach()
            n_correct += (len(target) - incorrect).detach()
        accuracy = n_correct / (n_correct + n_incorrect)
        return accuracy
--------------------------------------------------------------------------------
/src/transformer/transformer.py:
--------------------------------------------------------------------------------
from src.library import *
from src.transformer.encoder import TransformerBatchNormEncoderLayer
from src.transformer.positional_encoding import get_pos_encoder


class TSTransformerEncoder(nn.Module):
    """Time series transformer encoder module.
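
    The input is projected from feat_dim to d_model, positional encodings are added,
    num_layers encoder layers are applied, and a final linear layer maps back to
    feat_dim, so output and input share the shape (batch_size, seq_length, feat_dim)
    (internally the sequence is handled as (seq_length, batch_size, d_model)).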

    Args:
        feat_dim: feature dimension
        max_len: maximum length of the input sequence
        d_model: the embed dim
        n_heads: the number of heads in the multihead attention models
        num_layers: the number of sub-encoder-layers in the encoder
        dim_feedforward: the dimension of the feedforward network model
        dropout: the dropout value
        pos_encoding: positional encoding method
        activation: the activation function of intermediate layer, relu or gelu
        norm: the normalization layer
        freeze: whether to freeze the positional encoding layer
    """

    def __init__(
        self,
        feat_dim: int,
        max_len: int,
        d_model: int,
        n_heads: int,
        num_layers: int,
        dim_feedforward: int,
        dropout: float = 0.1,
        pos_encoding: str = "fixed",
        activation: str = "gelu",
        norm: str = "BatchNorm",
        freeze: bool = False,
    ):
        super(TSTransformerEncoder, self).__init__()

        self.max_len = max_len
        self.d_model = d_model
        self.n_heads = n_heads

        self.project_inp = nn.Linear(feat_dim, d_model)
        self.pos_enc = get_pos_encoder(pos_encoding)(
            d_model, dropout=dropout * (1.0 - freeze), max_len=max_len
        )

        if norm == "LayerNorm":
            encoder_layer = TransformerEncoderLayer(
                d_model,
                self.n_heads,
                dim_feedforward,
                dropout * (1.0 - freeze),
                activation=activation,
            )
        else:
            encoder_layer = TransformerBatchNormEncoderLayer(
                d_model,
                self.n_heads,
                dim_feedforward,
                dropout * (1.0 - freeze),
                activation=activation,
            )

        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        self.output_layer = nn.Linear(d_model, feat_dim)

        self.act = get_activation_fn(activation)

        self.dropout1 = nn.Dropout(dropout)

        self.feat_dim = feat_dim

    def forward(self, X: Tensor, padding_masks: Tensor) -> Tensor:
        """
        Args:
            X: (batch_size, seq_length, feat_dim) torch tensor of masked features (input)
            padding_masks: (batch_size, seq_length) boolean tensor, 1 means keep vector at this position, 0 means padding
        Returns:
            output: (batch_size, seq_length, feat_dim)
        """

        # permute because the pytorch convention for transformers is [seq_length, batch_size, feat_dim]; padding_masks is [batch_size, seq_length]
        inp = X.permute(1, 0, 2)
        inp = self.project_inp(inp) * math.sqrt(
            self.d_model
        )  # [seq_length, batch_size, d_model] project input vectors to d_model dimensional space
        inp = self.pos_enc(inp)  # add positional encoding
        # NOTE: logic for padding masks is reversed to comply with definition in MultiheadAttention, TransformerEncoderLayer
        output = self.transformer_encoder(
            inp, src_key_padding_mask=~padding_masks
        )  # (seq_length, batch_size, d_model)
        output = self.act(
            output
        )  # the output transformer encoder/decoder embeddings don't include a non-linearity
        output = output.permute(1, 0, 2)  # (batch_size, seq_length, d_model)
        output = self.dropout1(output)
        # nn.Linear acts on the last dimension, so the d_model -> feat_dim map is applied at every (batch, time) position
        output = self.output_layer(output)  # (batch_size, seq_length, feat_dim)

        return output


class TSTransformerEncoderClassiregressor(nn.Module):
    """
    Transformer encoder with a classification/regression head. It can act as either a
    regressor or a classifier because the output does not include a softmax; the
    final-layer embeddings are concatenated, with padded positions zeroed out before
    the output layer.
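
    Output head (sketch of what forward and build_output_module do): the per-position
    encoder embeddings are zeroed at padded positions, flattened to
    (batch_size, seq_length * d_model) and passed through a single Linear layer to
    produce (batch_size, num_classes) logits.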

    Args:
        feat_dim: feature dimension
        max_len: maximum length of the input sequence
        d_model: the embed dim
        n_heads: the number of heads in the multihead attention models
        num_layers: the number of sub-encoder-layers in the encoder
        dim_feedforward: the dimension of the feedforward network model
        num_classes: the number of classes in the classification task
        dropout: the dropout value
        pos_encoding: positional encoding method
        activation: the activation function of intermediate layer, relu or gelu
        norm: the normalization layer
        freeze: whether to freeze the positional encoding layer
    """

    def __init__(
        self,
        feat_dim: int,
        max_len: int,
        d_model: int,
        n_heads: int,
        num_layers: int,
        dim_feedforward: int,
        num_classes: int,
        dropout: float = 0.1,
        pos_encoding: str = "fixed",
        activation: str = "gelu",
        norm: str = "BatchNorm",
        freeze: bool = False,
    ):
        super(TSTransformerEncoderClassiregressor, self).__init__()

        self.max_len = max_len
        self.d_model = d_model
        self.n_heads = n_heads

        self.project_inp = nn.Linear(feat_dim, d_model)
        self.pos_enc = get_pos_encoder(pos_encoding)(
            d_model, dropout=dropout * (1.0 - freeze), max_len=max_len
        )

        if norm == "LayerNorm":
            encoder_layer = TransformerEncoderLayer(
                d_model,
                self.n_heads,
                dim_feedforward,
                dropout * (1.0 - freeze),
                activation=activation,
            )
        else:
            encoder_layer = TransformerBatchNormEncoderLayer(
                d_model,
                self.n_heads,
                dim_feedforward,
                dropout * (1.0 - freeze),
                activation=activation,
            )

        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        self.act = get_activation_fn(activation)

        self.dropout1 = nn.Dropout(dropout)

        self.feat_dim = feat_dim
        self.num_classes = num_classes
        self.output_layer = self.build_output_module(d_model, max_len, num_classes)

    def build_output_module(
        self, d_model: int, max_len: int, num_classes: int
    ) -> nn.Module:
        """Build the linear layer that maps from d_model * max_len to num_classes.

        Softmax is not included here as it is computed in the loss function.

        Args:
            d_model: the embed dim
            max_len: maximum length of the input sequence
            num_classes: the number of classes in the classification task

        Returns:
            output_layer: nn.Linear module mapping (batch_size, d_model * max_len)
                inputs to (batch_size, num_classes) logits.
        """
        output_layer = nn.Linear(d_model * max_len, num_classes)
        # no softmax (or log softmax), because CrossEntropyLoss does this internally. If probabilities are needed,
        # add F.log_softmax and use NLLLoss
        return output_layer

    def forward(self, X: Tensor, padding_masks: Tensor) -> Tensor:
        """
        Args:
            X: (batch_size, seq_length, feat_dim) torch tensor of masked features (input)
            padding_masks: (batch_size, seq_length) boolean tensor, 1 means keep vector at this position, 0 means padding
        Returns:
            output: (batch_size, num_classes)
        """

        # permute because the pytorch convention for transformers is [seq_length, batch_size, feat_dim]; padding_masks is [batch_size, seq_length]
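        # Worked shape example (illustrative, using the hyperparameters from main.py:
        # batch_size=8, seq_length=1000, feat_dim=12, d_model=64, num_classes=6):
        #   X (8, 1000, 12) -> inp (1000, 8, 12) -> projected (1000, 8, 64)
        #   -> encoder output (1000, 8, 64) -> permuted back (8, 1000, 64)
        #   -> masked and flattened (8, 64000) -> logits (8, 6)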
        inp = X.permute(1, 0, 2)
        inp = self.project_inp(inp) * math.sqrt(
            self.d_model
        )  # [seq_length, batch_size, d_model] project input vectors to d_model dimensional space
        inp = self.pos_enc(inp)  # add positional encoding
        # NOTE: logic for padding masks is reversed to comply with definition in MultiheadAttention, TransformerEncoderLayer
        output = self.transformer_encoder(
            inp, src_key_padding_mask=~padding_masks
        )  # (seq_length, batch_size, d_model)
        output = self.act(
            output
        )  # the output transformer encoder/decoder embeddings don't include a non-linearity
        output = output.permute(1, 0, 2)  # (batch_size, seq_length, d_model)
        output = self.dropout1(output)

        # Output
        output = output * padding_masks.unsqueeze(-1)  # zero out padding embeddings
        output = output.reshape(
            output.shape[0], -1
        )  # (batch_size, seq_length * d_model)
        output = self.output_layer(output)  # (batch_size, num_classes)

        return output
--------------------------------------------------------------------------------