├── BertModules.py
├── Classifier.py
├── Constants.py
├── DataModules.py
├── README.md
└── Utils.py

/BertModules.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from pytorch_transformers import BertModel


class BertClassifier(nn.Module):

    def __init__(self, config):
        super(BertClassifier, self).__init__()
        # Binary classification problem (num_labels = 2)
        self.num_labels = config.num_labels
        # Pre-trained BERT encoder; the bert-base-uncased weights are downloaded on first use
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Dropout to avoid overfitting
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # A single-layer classifier added on top of BERT to fine-tune for binary classification
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Weight initialization
        torch.nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                position_ids=None, head_mask=None):
        # Forward pass through pre-trained BERT
        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask, head_mask=head_mask)

        # Pooled output of the [CLS] token (second element of the BertModel output tuple)
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)
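

if __name__ == '__main__':
    # Added smoke test -- a minimal sketch, not part of the original repository.
    # It builds the classifier with a default BERT config (num_labels=2, as in Classifier.py)
    # and checks the logits shape on a dummy batch. Note that constructing BertClassifier
    # downloads the bert-base-uncased weights.
    from pytorch_transformers import BertConfig

    config = BertConfig(num_labels=2)
    model = BertClassifier(config)
    model.eval()
    dummy_input_ids = torch.randint(0, config.vocab_size, (2, 16))
    dummy_attention_mask = torch.ones_like(dummy_input_ids)
    with torch.no_grad():
        logits = model(dummy_input_ids, attention_mask=dummy_attention_mask)
    print(logits.shape)  # expected: torch.Size([2, 2])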
--------------------------------------------------------------------------------
/Classifier.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import WarmupLinearSchedule
from torch.utils.data import DataLoader, SubsetRandomSampler
from tqdm import tqdm, trange

from BertModules import BertClassifier
from Constants import *
from DataModules import SequenceDataset
from Utils import seed_everything

seed_everything()

# Load the default BERT config object and adjust it as required
config = BertConfig(hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072,
                    num_labels=2)

# Create our custom BertClassifier model object
model = BertClassifier(config)
model.to(DEVICE)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the training data and split it into training and validation sets
train_dataset = SequenceDataset(TRAIN_FILE_PATH, tokenizer)

validation_split = 0.2
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
shuffle_dataset = True

if shuffle_dataset:
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_indices)
validation_sampler = SubsetRandomSampler(val_indices)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=validation_sampler)

print('Training Set Size {}, Validation Set Size {}'.format(len(train_indices), len(val_indices)))

# Loss function
criterion = nn.CrossEntropyLoss()

# Adam optimizer with a very small learning rate for the pre-trained BERT layers
# and a larger one for the newly added classifier head
optimizer = torch.optim.Adam([
    {'params': model.bert.parameters(), 'lr': 1e-5},
    {'params': model.classifier.parameters(), 'lr': 3e-4}
])

# Learning rate scheduler: linear warmup followed by linear decay
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS,
                                 t_total=len(train_loader) // GRADIENT_ACCUMULATION_STEPS * NUM_EPOCHS)

model.zero_grad()
epoch_iterator = trange(int(NUM_EPOCHS), desc="Epoch")
training_acc_list, validation_acc_list = [], []

for epoch in epoch_iterator:
    epoch_loss = 0.0
    train_correct_total = 0

    # Training loop
    train_iterator = tqdm(train_loader, desc="Train Iteration")
    for step, batch in enumerate(train_iterator):
        model.train(True)
        # Each element of the batch list is one of [input_ids, segment_ids, attention_mask, labels]
        inputs = {
            'input_ids': batch[0].to(DEVICE),
            'token_type_ids': batch[1].to(DEVICE),
            'attention_mask': batch[2].to(DEVICE)
        }

        labels = batch[3].to(DEVICE)
        logits = model(**inputs)

        # Scale the loss so that gradients accumulated over GRADIENT_ACCUMULATION_STEPS
        # batches are equivalent to one large batch
        loss = criterion(logits, labels) / GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        epoch_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        _, predicted = torch.max(logits.data, 1)
        correct_reviews_in_batch = (predicted == labels).sum().item()
        train_correct_total += correct_reviews_in_batch

    print('Epoch {} - Mean Batch Loss {:.4f}'.format(epoch + 1, epoch_loss / len(train_loader)))

    # Validation loop
    with torch.no_grad():
        val_correct_total = 0
        model.eval()
        val_iterator = tqdm(val_loader, desc="Validation Iteration")
        for step, batch in enumerate(val_iterator):
            inputs = {
                'input_ids': batch[0].to(DEVICE),
                'token_type_ids': batch[1].to(DEVICE),
                'attention_mask': batch[2].to(DEVICE)
            }

            labels = batch[3].to(DEVICE)
            logits = model(**inputs)

            _, predicted = torch.max(logits.data, 1)
            correct_reviews_in_batch = (predicted == labels).sum().item()
            val_correct_total += correct_reviews_in_batch

    training_acc_list.append(train_correct_total * 100 / len(train_indices))
    validation_acc_list.append(val_correct_total * 100 / len(val_indices))
    print('Training Accuracy {:.4f} - Validation Accuracy {:.4f}'.format(
        train_correct_total * 100 / len(train_indices), val_correct_total * 100 / len(val_indices)))


# Quick single-example sanity check (uncomment to run after training):
# model.eval()
# text = CLS_TOKEN + ' I am a big fan of cricket ' + SEP_TOKEN
# encoded_text = tokenizer.encode(text)
# encoded_text = encoded_text + [0] * (MAX_SEQ_LENGTH - len(encoded_text))
# tokens_tensor = torch.tensor([encoded_text], dtype=torch.long, device=DEVICE)
# with torch.no_grad():
#     logits = model(tokens_tensor)
#     print('Predicted label:', torch.argmax(logits, dim=1).item())
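
# Added suggestion -- not part of the original script: the fine-tuned weights are never
# persisted above, so a checkpoint could be written once training finishes. The file name
# below is only illustrative.
# torch.save(model.state_dict(), 'bert_sarcasm_classifier.pt')
#
# To reuse the model later, rebuild it with the same config and load the saved weights:
# model = BertClassifier(config)
# model.load_state_dict(torch.load('bert_sarcasm_classifier.pt', map_location=DEVICE))
# model.to(DEVICE)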
--------------------------------------------------------------------------------
/Constants.py:
--------------------------------------------------------------------------------
import torch

SEP_TOKEN = '[SEP]'
CLS_TOKEN = '[CLS]'
TRAIN_FILE_PATH = '/Users/muhammadabdullah/Downloads/sarcasmv2/Sarcasm_Headlines_Dataset_v2.json'
# Maximum BERT input length; headlines are far shorter, so most of each sequence is padding
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 4
NUM_EPOCHS = 2
# Gradients are accumulated over 8 batches, giving an effective batch size of 32
GRADIENT_ACCUMULATION_STEPS = 8
WARMUP_STEPS = 3
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)
--------------------------------------------------------------------------------
/DataModules.py:
--------------------------------------------------------------------------------
import re

import pandas as pd
from torch.utils.data import Dataset

from Constants import *


class SequenceDataset(Dataset):
    def __init__(self, dataset_file_path, tokenizer, regex_transformations={}):
        # Read the JSON-lines file; each record holds an is_sarcastic label and a headline
        df = pd.read_json(dataset_file_path, lines=True)
        df = df.drop(['article_link'], axis=1)
        self.headlines = df.values
        # Regex transformations can be used for data cleansing,
        # e.g. replace '\n' with ' ' or "wasn't" with "was not"
        self.regex_transformations = regex_transformations
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, index):
        is_sarcastic, headline = self.headlines[index]
        for regex, value_to_replace_with in self.regex_transformations.items():
            headline = re.sub(regex, value_to_replace_with, headline)

        # Convert the input string into tokens with the BERT tokenizer, which handles
        # out-of-vocabulary words by splitting them into WordPiece subwords,
        # e.g. headline = Here is the sentence I want embeddings for.
        #      tokens   = [here, is, the, sentence, i, want, em, ##bed, ##ding, ##s, for, .]
        tokens = self.tokenizer.tokenize(headline)

        # Truncate if necessary, leaving room for the two special tokens
        tokens = tokens[:MAX_SEQ_LENGTH - 2]

        # Add [CLS] at the beginning and [SEP] at the end of the token list for classification
        tokens = [CLS_TOKEN] + tokens + [SEP_TOKEN]
        # Convert tokens to their IDs in the vocabulary
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Segment IDs for a single-sequence classification task are all 0
        segment_ids = [0] * len(input_ids)

        # Input mask where each real token has mask = 1 and padding has mask = 0
        input_mask = [1] * len(input_ids)

        # Pad everything up to MAX_SEQ_LENGTH
        padding_length = MAX_SEQ_LENGTH - len(input_ids)
        input_ids = input_ids + [0] * padding_length
        input_mask = input_mask + [0] * padding_length
        segment_ids = segment_ids + [0] * padding_length

        assert len(input_ids) == MAX_SEQ_LENGTH
        assert len(input_mask) == MAX_SEQ_LENGTH
        assert len(segment_ids) == MAX_SEQ_LENGTH

        return torch.tensor(input_ids, dtype=torch.long, device=DEVICE), \
               torch.tensor(segment_ids, dtype=torch.long, device=DEVICE), \
               torch.tensor(input_mask, dtype=torch.long, device=DEVICE), \
               torch.tensor(is_sarcastic, dtype=torch.long, device=DEVICE)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text-Classification-with-BERT-PyTorch-Implementation
A text classifier built by fine-tuning pre-trained BERT for sarcasm detection in news headlines (PyTorch implementation)

### Kaggle Notebook
[Transfer Learning for Text Data In Pytorch (BERT)](https://www.kaggle.com/aaybeedee/transfer-learning-for-text-data-in-pytorch-bert)

### Dataset
[News Headlines Dataset For Sarcasm Detection](https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection/)


### Prerequisites
* python 3.7
* torch 1.1.0
* pytorch-transformers
* pandas
* numpy
* tqdm
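
### Usage
Install the prerequisites above, point `TRAIN_FILE_PATH` in `Constants.py` at the downloaded dataset JSON, and run `python Classifier.py` to fine-tune the model. The snippet below is a minimal sketch (not part of the training script) showing how the data pipeline can be inspected on its own; it assumes the dataset file is in place.

```python
from pytorch_transformers import BertTokenizer

from Constants import TRAIN_FILE_PATH
from DataModules import SequenceDataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = SequenceDataset(TRAIN_FILE_PATH, tokenizer)

# Each item is (input_ids, segment_ids, attention_mask, label), all padded to MAX_SEQ_LENGTH
input_ids, segment_ids, attention_mask, label = dataset[0]
print(input_ids.shape, attention_mask.sum().item(), label.item())
```
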
### Results
| Epoch | Train Accuracy (%) | Validation Accuracy (%) |
|-------|--------------------|-------------------------|
| 1     | 69.89              | 81.16                   |
| 2     | 83.52              | 83.64                   |
| 3     | 87.21              | 82.70                   |
| 4     | 89.38              | 85.23                   |
| 5     | 91.09              | 85.54                   |
| 6     | 92.23              | 85.86                   |
--------------------------------------------------------------------------------
/Utils.py:
--------------------------------------------------------------------------------
import os
import random

import numpy as np
import torch


def seed_everything(seed=42):
    # Seed every source of randomness to make runs reproducible
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
--------------------------------------------------------------------------------