├── BertModules.py
├── Classifier.py
├── Constants.py
├── DataModules.py
├── README.md
└── Utils.py

/BertModules.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from pytorch_transformers import BertModel


class BertClassifier(nn.Module):

    def __init__(self, config):
        super(BertClassifier, self).__init__()
        # Binary classification problem (num_labels = 2)
        self.num_labels = config.num_labels
        # Pre-trained BERT encoder; the bert-base-uncased weights are downloaded on first use
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Dropout to avoid overfitting
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # A single-layer classifier added on top of BERT to fine-tune for binary classification
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Weight initialization
        torch.nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                position_ids=None, head_mask=None):
        # Forward pass through pre-trained BERT
        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask, head_mask=head_mask)

        # Pooled output of the [CLS] token (second element of the BertModel output tuple)
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)
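

if __name__ == '__main__':
    # Added smoke test -- a minimal sketch, not part of the original repository.
    # It builds the classifier with a default BERT config (num_labels=2, as in Classifier.py)
    # and checks the logits shape on a dummy batch. Note that constructing BertClassifier
    # downloads the bert-base-uncased weights.
    from pytorch_transformers import BertConfig

    config = BertConfig(num_labels=2)
    model = BertClassifier(config)
    model.eval()
    dummy_input_ids = torch.randint(0, config.vocab_size, (2, 16))
    dummy_attention_mask = torch.ones_like(dummy_input_ids)
    with torch.no_grad():
        logits = model(dummy_input_ids, attention_mask=dummy_attention_mask)
    print(logits.shape)  # expected: torch.Size([2, 2])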
--------------------------------------------------------------------------------
/Classifier.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import WarmupLinearSchedule
from torch.utils.data import DataLoader, SubsetRandomSampler
from tqdm import tqdm, trange

from BertModules import BertClassifier
from Constants import *
from DataModules import SequenceDataset
from Utils import seed_everything

seed_everything()

# Load the default BERT config object and adjust it as required
config = BertConfig(hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072,
                    num_labels=2)

# Create our custom BertClassifier model object
model = BertClassifier(config)
model.to(DEVICE)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the training data and split it into training and validation sets
train_dataset = SequenceDataset(TRAIN_FILE_PATH, tokenizer)

validation_split = 0.2
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
shuffle_dataset = True

if shuffle_dataset:
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_indices)
validation_sampler = SubsetRandomSampler(val_indices)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=validation_sampler)

print('Training Set Size {}, Validation Set Size {}'.format(len(train_indices), len(val_indices)))

# Loss function
criterion = nn.CrossEntropyLoss()

# Adam optimizer with a very small learning rate for the pre-trained BERT layers
# and a larger one for the newly added classifier head
optimizer = torch.optim.Adam([
    {'params': model.bert.parameters(), 'lr': 1e-5},
    {'params': model.classifier.parameters(), 'lr': 3e-4}
])

# Learning rate scheduler: linear warmup followed by linear decay
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS,
                                 t_total=len(train_loader) // GRADIENT_ACCUMULATION_STEPS * NUM_EPOCHS)

model.zero_grad()
epoch_iterator = trange(int(NUM_EPOCHS), desc="Epoch")
training_acc_list, validation_acc_list = [], []

for epoch in epoch_iterator:
    epoch_loss = 0.0
    train_correct_total = 0

    # Training loop
    train_iterator = tqdm(train_loader, desc="Train Iteration")
    for step, batch in enumerate(train_iterator):
        model.train(True)
        # Each element of the batch list is one of [input_ids, segment_ids, attention_mask, labels]
        inputs = {
            'input_ids': batch[0].to(DEVICE),
            'token_type_ids': batch[1].to(DEVICE),
            'attention_mask': batch[2].to(DEVICE)
        }

        labels = batch[3].to(DEVICE)
        logits = model(**inputs)

        # Scale the loss so that gradients accumulated over GRADIENT_ACCUMULATION_STEPS
        # batches are equivalent to one large batch
        loss = criterion(logits, labels) / GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        epoch_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        _, predicted = torch.max(logits.data, 1)
        correct_reviews_in_batch = (predicted == labels).sum().item()
        train_correct_total += correct_reviews_in_batch

    print('Epoch {} - Mean Batch Loss {:.4f}'.format(epoch + 1, epoch_loss / len(train_loader)))

    # Validation loop
    with torch.no_grad():
        val_correct_total = 0
        model.eval()
        val_iterator = tqdm(val_loader, desc="Validation Iteration")
        for step, batch in enumerate(val_iterator):
            inputs = {
                'input_ids': batch[0].to(DEVICE),
                'token_type_ids': batch[1].to(DEVICE),
                'attention_mask': batch[2].to(DEVICE)
            }

            labels = batch[3].to(DEVICE)
            logits = model(**inputs)

            _, predicted = torch.max(logits.data, 1)
            correct_reviews_in_batch = (predicted == labels).sum().item()
            val_correct_total += correct_reviews_in_batch

    training_acc_list.append(train_correct_total * 100 / len(train_indices))
    validation_acc_list.append(val_correct_total * 100 / len(val_indices))
    print('Training Accuracy {:.4f} - Validation Accuracy {:.4f}'.format(
        train_correct_total * 100 / len(train_indices), val_correct_total * 100 / len(val_indices)))


# Quick single-example sanity check (uncomment to run after training):
# model.eval()
# text = CLS_TOKEN + ' I am a big fan of cricket ' + SEP_TOKEN
# encoded_text = tokenizer.encode(text)
# encoded_text = encoded_text + [0] * (MAX_SEQ_LENGTH - len(encoded_text))
# tokens_tensor = torch.tensor([encoded_text], dtype=torch.long, device=DEVICE)
# with torch.no_grad():
#     logits = model(tokens_tensor)
#     print('Predicted label:', torch.argmax(logits, dim=1).item())
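
# Added suggestion -- not part of the original script: the fine-tuned weights are never
# persisted above, so a checkpoint could be written once training finishes. The file name
# below is only illustrative.
# torch.save(model.state_dict(), 'bert_sarcasm_classifier.pt')
#
# To reuse the model later, rebuild it with the same config and load the saved weights:
# model = BertClassifier(config)
# model.load_state_dict(torch.load('bert_sarcasm_classifier.pt', map_location=DEVICE))
# model.to(DEVICE)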
--------------------------------------------------------------------------------
/Constants.py:
--------------------------------------------------------------------------------
import torch

SEP_TOKEN = '[SEP]'
CLS_TOKEN = '[CLS]'
TRAIN_FILE_PATH = '/Users/muhammadabdullah/Downloads/sarcasmv2/Sarcasm_Headlines_Dataset_v2.json'
# Maximum BERT input length; headlines are far shorter, so most of each sequence is padding
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 4
NUM_EPOCHS = 2
# Gradients are accumulated over 8 batches, giving an effective batch size of 32
GRADIENT_ACCUMULATION_STEPS = 8
WARMUP_STEPS = 3
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)
--------------------------------------------------------------------------------
/DataModules.py:
--------------------------------------------------------------------------------
import re

import pandas as pd
from torch.utils.data import Dataset

from Constants import *


class SequenceDataset(Dataset):
    def __init__(self, dataset_file_path, tokenizer, regex_transformations={}):
        # Read the JSON-lines file; each record holds an is_sarcastic label and a headline
        df = pd.read_json(dataset_file_path, lines=True)
        df = df.drop(['article_link'], axis=1)
        self.headlines = df.values
        # Regex transformations can be used for data cleansing,
        # e.g. replace '\n' with ' ' or "wasn't" with "was not"
        self.regex_transformations = regex_transformations
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, index):
        is_sarcastic, headline = self.headlines[index]
        for regex, value_to_replace_with in self.regex_transformations.items():
            headline = re.sub(regex, value_to_replace_with, headline)

        # Convert the input string into tokens with the BERT tokenizer, which handles
        # out-of-vocabulary words by splitting them into WordPiece subwords,
        # e.g. headline = Here is the sentence I want embeddings for.
        #      tokens   = [here, is, the, sentence, i, want, em, ##bed, ##ding, ##s, for, .]
        tokens = self.tokenizer.tokenize(headline)

        # Truncate if necessary, leaving room for the two special tokens
        tokens = tokens[:MAX_SEQ_LENGTH - 2]

        # Add [CLS] at the beginning and [SEP] at the end of the token list for classification
        tokens = [CLS_TOKEN] + tokens + [SEP_TOKEN]
        # Convert tokens to their IDs in the vocabulary
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Segment IDs for a single-sequence classification task are all 0
        segment_ids = [0] * len(input_ids)

        # Input mask where each real token has mask = 1 and padding has mask = 0
        input_mask = [1] * len(input_ids)

        # Pad everything up to MAX_SEQ_LENGTH
        padding_length = MAX_SEQ_LENGTH - len(input_ids)
        input_ids = input_ids + [0] * padding_length
        input_mask = input_mask + [0] * padding_length
        segment_ids = segment_ids + [0] * padding_length

        assert len(input_ids) == MAX_SEQ_LENGTH
        assert len(input_mask) == MAX_SEQ_LENGTH
        assert len(segment_ids) == MAX_SEQ_LENGTH

        return torch.tensor(input_ids, dtype=torch.long, device=DEVICE), \
               torch.tensor(segment_ids, dtype=torch.long, device=DEVICE), \
               torch.tensor(input_mask, dtype=torch.long, device=DEVICE), \
               torch.tensor(is_sarcastic, dtype=torch.long, device=DEVICE)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text-Classification-with-BERT-PyTorch-Implementation
A text classifier built by fine-tuning pre-trained BERT for sarcasm detection in news headlines (PyTorch implementation)

### Kaggle Notebook
[Transfer Learning for Text Data In Pytorch (BERT)](https://www.kaggle.com/aaybeedee/transfer-learning-for-text-data-in-pytorch-bert)

### Dataset
[News Headlines Dataset For Sarcasm Detection](https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection/)


### Prerequisites
* python 3.7
* torch 1.1.0
* pytorch-transformers
* pandas
* numpy
* tqdm
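
### Usage
Install the prerequisites above, point `TRAIN_FILE_PATH` in `Constants.py` at the downloaded dataset JSON, and run `python Classifier.py` to fine-tune the model. The snippet below is a minimal sketch (not part of the training script) showing how the data pipeline can be inspected on its own; it assumes the dataset file is in place.

```python
from pytorch_transformers import BertTokenizer

from Constants import TRAIN_FILE_PATH
from DataModules import SequenceDataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = SequenceDataset(TRAIN_FILE_PATH, tokenizer)

# Each item is (input_ids, segment_ids, attention_mask, label), all padded to MAX_SEQ_LENGTH
input_ids, segment_ids, attention_mask, label = dataset[0]
print(input_ids.shape, attention_mask.sum().item(), label.item())
```
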
### Results
| Epoch | Train Accuracy (%) | Validation Accuracy (%) |
|-------|--------------------|-------------------------|
| 1     | 69.89              | 81.16                   |
| 2     | 83.52              | 83.64                   |
| 3     | 87.21              | 82.70                   |
| 4     | 89.38              | 85.23                   |
| 5     | 91.09              | 85.54                   |
| 6     | 92.23              | 85.86                   |
--------------------------------------------------------------------------------
/Utils.py:
--------------------------------------------------------------------------------
import os
import random

import numpy as np
import torch


def seed_everything(seed=42):
    # Seed every source of randomness to make runs reproducible
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
--------------------------------------------------------------------------------