├── code_files ├── __pycache__ │ └── text_preprocessing.cpython-38.pyc ├── text_preprocessing.py └── BERT_Classifier.py ├── keyword_list.txt └── README.md /code_files/__pycache__/text_preprocessing.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohit3011/AbuseAnalyzer/HEAD/code_files/__pycache__/text_preprocessing.cpython-38.pyc -------------------------------------------------------------------------------- /keyword_list.txt: -------------------------------------------------------------------------------- 1 | '''Gathered from multiple sources 2 | https://en.wikipedia.org/wiki/List_of_ethnic_slurs 3 | https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/lexicons/refined_ngram_dict.csv 4 | https://hatebase.org/search_results/language_id%3Deng 5 | ''' 6 | 7 | 'hilari', 'funni', 'god', 'jesu', 'joke', 'troll', 'luck', 'bless', 'lol', 'lmao', 'rofl', 'christ', 'sake' 8 | 'stupid', 'retard', 'predat', 'thief', 'robber', 'ugli', 'deform', 'stink', 'cretin', 'slut', 'liar', 'cheap', 'flirt', 'foolish', 'dumb', 'bitter', 'cruel', 'inhuman', 'evil', 'harsh', 'bitch', 'hate', 'whore', 'steal', 'filth', 'dirt', 'beg', 'fuck', 'shit', 'hell', 'abus', 'asshol', 'motherfuck', 'dare', 'corrupt', 'fool' 9 | 'kike', 'dyke', 'nigger', 'black', 'muzzie', 'muslim', 'coon', 'shitskin', 'spic', 'sand', 'gay', 'lesbian', 'transgender', 'jew', 'homosexual', 'african', 'indian', 'yid', 'heeb', 'mussie', 'raghead', 'mick', 'haji', 'hajji', 'fenian', 'chink', 'chinky', 'ape', 'banana', 'cholo', 'cina', 'cushi', 'dago', 'dego', 'guiri', 'goy', 'jap', 'kebab', 'kolorad', 'kraut', 'kaffir', 'kafir', 'nigga', 'pak', 'paki', 'pocho', 'redneck', 'rastus', 'sambo', 'shkije', 'twinkie', 'wetback', 'wigger', 'wop', 'yank', 'yankee', 'islam' 10 | 'warn', 'risk', 'peril', 'scare', 'fear', 'dread', 'horror', 'caution', 'alert', 'menac' 11 | 'kill', 'rape', 'abort', 'genocid', 'bomb', 'execut', 'assassin', 'murder', 'poison', 'slaughter', 'wipe', 'hit', 'slai', 'molest', 'holocaust', 'carnag', 'annihil' -------------------------------------------------------------------------------- /code_files/text_preprocessing.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import pandas as pd 3 | import csv 4 | 5 | punctuation_str = '''!()-[]`{};:\,<>./?@#$%^&+|*_~''' 6 | punctuation_str_2 = '''\'\"''' 7 | table = str.maketrans(dict.fromkeys(punctuation_str, " ")) 8 | table_2 = str.maketrans(dict.fromkeys(punctuation_str_2, "")) 9 | 10 | def read_csv_file(file_name, data_col, label_col): 11 | data_list = [] 12 | label_list = [] 13 | with open(file_name, 'r') as csv_reader: 14 | reader = csv.reader(csv_reader, delimiter='\t') 15 | for line_number, row in enumerate(reader): 16 | if line_number!=0: 17 | data_list.append(row[data_col]) 18 | label_list.append(row[label_col]) 19 | 20 | return data_list, label_list 21 | 22 | def clean_text(input_string): 23 | 24 | input_string = input_string.split('\n') 25 | input_string = " ".join(input_string) 26 | input_string = re.sub(r'@\S+', " usermention ", input_string) 27 | input_string = re.sub(r'#\S+'," ", input_string) 28 | input_string = re.sub(r'\d\S+'," ", input_string) 29 | input_string = re.sub(r"http\S+", " ", input_string) 30 | input_string = re.sub(r'www\S+', " ", input_string) 31 | input_string = re.sub(r'\.|/|:|-', " ", input_string) 32 | input_string = re.sub(r'[^\w\s]','',input_string) 33 | input_string = " ".join(input_string.split()) 34 | 35 | return input_string 36 | 37 | 38 | def parse_input_character(data): 39 | train_data = [] 40 | train_label = [] 41 | for row in data: 42 | temp_row = row.split() 43 | char_list = [] 44 | for words in temp_row: 45 | for ch in words: 46 | char_list.append(ch) 47 | train_data.append(char_list) 48 | 49 | return train_data 50 | 51 | 52 | 53 | def prepare_dataset(filename, data_col, label_col, network_type): 54 | 55 | original_data, original_labels = read_csv_file(filename, data_col, label_col) 56 | for i in range(len(original_data)): 57 | original_data[i] = clean_text(original_data[i]) 58 | 59 | if network_type=="character-based": 60 | train_data = parse_input_character(original_data) 61 | return train_data, original_labels 62 | else: 63 | return original_data, original_labels -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AbuseAnalyzer 2 | Repository for our paper "AbuseAnalyzer: Abuse Detection, Severity and Target Prediction for Gab Posts" (Accepted at COLING 2020). This repository contains the code, data and other scripts used for the project. 3 | 4 | **For additional information/queries please contact me via email (mohit.chandra@research.iiit.ac.in)** 5 | 6 | ## Dataset Information 7 | 8 | ### Encodings for Hate/Non-Hate labels 9 | * **1** : Non-Hateful 10 | * **2** : Hateful 11 | 12 | ### Encodings for Target of Hate labels 13 | * **1-2** : Individual Second Person 14 | * **1-3** : Individual Third Person 15 | * **2** : Group 16 | 17 | ### Encodings for Class of Hate labels 18 | * **1** : Biased Attitude 19 | * **2** : Act of Bias and Discrimination 20 | * **3** : Violence and Genocide 21 | 22 | **Note** : The usermentions in the dataset have been changed to `@usermention` for privacy and ethical reasons. 23 | 24 | ## Instructions for the Script/Model 25 | 26 | `code_files/BERT_Classifier.py` contains the model file. To run the model you'll need the GPU support (CUDA). Additionally the following are the required libraries: 27 | * pytorch 28 | * tensorflow.keras 29 | * numpy 30 | * transformers 31 | * scikit-learn 32 | * argparse 33 | * pandas 34 | 35 | To run the file copy the following command **`python3 BERT_Classifier.py --datafile ../AbuseAnalyzer_Dataset.tsv --label_col 2`** . Aquick verification can be performed while looking at the number of examples that have been processed (for binary classification, number of examples=7,601) 36 | 37 | You can also play around the maximum_length of the input through editing the `max_len` variable in the file (our paper uses 100) and number of `epochs` (our paper uses 15). 38 | 39 | ### Instructions for using the severity and target of abuse data 40 | 41 | You'll first need to filter the rows which have been marked as `Abusive/Hateful` and create a separate tsv file. You can give this file as an argument to the model. For the `label_col` argument provide **3** for `Target of Abuse` experiment and provide **4** for `Severity of Abuse` experiment. For example: 42 | 43 | * Target of Abuse experiment: run `python3 BERT_Classifier.py --datafile "provide new filename" --label_col 3`. Please check that the number of examples=4120. 44 | * Severity of Abuse experiment: run `python3 BERT_Classifier.py --datafile "provide new filename" --label_col 4`. Please check that the number of examples=4120. 45 | 46 | For both of the above mentioned experiments, we have used `epochs=20` and have kept the other hyperparameters same. 47 | 48 | ## Citation 49 | 50 | * If you use/refer the dataset and/or code presented in this paper, then kindly cite our work using the following BibTeX: 51 | 52 | ``` 53 | @inproceedings{chandra-etal-2020-abuseanalyzer, 54 | title = "{A}buse{A}nalyzer: Abuse Detection, Severity and Target Prediction for Gab Posts", 55 | author = "Chandra, Mohit and 56 | Pathak, Ashwin and 57 | Dutta, Eesha and 58 | Jain, Paryul and 59 | Gupta, Manish and 60 | Shrivastava, Manish and 61 | Kumaraguru, Ponnurangam", 62 | booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", 63 | month = dec, 64 | year = "2020", 65 | address = "Barcelona, Spain (Online)", 66 | publisher = "International Committee on Computational Linguistics", 67 | url = "https://www.aclweb.org/anthology/2020.coling-main.552", 68 | doi = "10.18653/v1/2020.coling-main.552", 69 | pages = "6277--6283", 70 | abstract = "While extensive popularity of online social media platforms has made information dissemination faster, it has also resulted in widespread online abuse of different types like hate speech, offensive language, sexist and racist opinions, etc. Detection and curtailment of such abusive content is critical for avoiding its psychological impact on victim communities, and thereby preventing hate crimes. Previous works have focused on classifying user posts into various forms of abusive behavior. But there has hardly been any focus on estimating the severity of abuse and the target. In this paper, we present a first of the kind dataset with 7,601 posts from Gab which looks at online abuse from the perspective of presence of abuse, severity and target of abusive behavior. We also propose a system to address these tasks, obtaining an accuracy of ∼80{\%} for abuse presence, ∼82{\%} for abuse target prediction, and ∼65{\%} for abuse severity prediction.", 71 | } 72 | ``` 73 | 74 | 75 | -------------------------------------------------------------------------------- /code_files/BERT_Classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # BERT implementation in PyTorch 5 | 6 | # In[1]: 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from transformers import BertModel 13 | import torch.optim as optim 14 | import numpy as np 15 | 16 | import os,sys,re 17 | 18 | import numpy as np 19 | import pandas as pd 20 | from sklearn.model_selection import KFold 21 | from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 22 | from sklearn.preprocessing import LabelEncoder 23 | from sklearn.utils import class_weight 24 | from tensorflow.keras.utils import to_categorical 25 | from sklearn.model_selection import StratifiedKFold 26 | from text_preprocessing import * 27 | from transformers import BertConfig 28 | from transformers import BertTokenizer 29 | 30 | import statistics 31 | import time 32 | import copy 33 | from tqdm import tqdm 34 | from sklearn.model_selection import train_test_split 35 | 36 | import argparse 37 | from argparse import ArgumentParser 38 | 39 | 40 | # In[2]: 41 | 42 | torch.manual_seed(42) 43 | 44 | parser = ArgumentParser() 45 | 46 | parser.add_argument('--datafile', default="../AbuseAnalyzer_Dataset.tsv", 47 | help='Input file.') 48 | parser.add_argument('--label_col', default="2", 49 | help='Column Number for the Labels') 50 | args = parser.parse_args() 51 | 52 | 53 | 54 | def make_bert_input(data, max_len): 55 | # For every sentence... 56 | input_ids = [] 57 | attention_masks = [] 58 | token_ids = [] 59 | for sent in data: 60 | encoded_dict = tokenizer.encode_plus( 61 | sent, # Sentence to encode. 62 | add_special_tokens = True, # Add '[CLS]' and '[SEP]' 63 | truncation=True, 64 | max_length = max_len, # Pad & truncate all sentences. 65 | pad_to_max_length = True, 66 | return_attention_mask = True, # Construct attn. masks. 67 | return_token_type_ids = True, 68 | ) 69 | 70 | # Add the encoded sentence to the list. 71 | input_ids.append(encoded_dict['input_ids']) 72 | 73 | # And its attention mask (simply differentiates padding from non-padding). 74 | attention_masks.append(encoded_dict['attention_mask']) 75 | 76 | token_ids.append(encoded_dict['token_type_ids']) 77 | 78 | input_ids = np.asarray(input_ids, dtype='int32') 79 | attention_masks = np.asarray(attention_masks, dtype='int32') 80 | token_ids = np.asarray(token_ids, dtype='int32') 81 | 82 | new_data = np.concatenate((input_ids, attention_masks), axis = 1) 83 | new_data = np.concatenate((new_data, token_ids), axis=1) 84 | 85 | return new_data 86 | 87 | # Standard dataset class for pytorch dataloaders 88 | 89 | # In[3]: 90 | 91 | 92 | class Dataset(torch.utils.data.Dataset): 93 | # 'Characterizes a dataset for PyTorch' 94 | def __init__(self, text_input, text_mask, labels): 95 | 'Initialization' 96 | self.labels = labels 97 | self.text_input = text_input 98 | self.text_mask = text_mask 99 | 100 | def __len__(self): 101 | 'Denotes the total number of samples' 102 | return len(self.labels) 103 | 104 | def __getitem__(self, index): 105 | 'Generates one sample of data' 106 | # Select sample 107 | X = np.vstack((self.text_input[index], self.text_mask[index])) 108 | y = self.labels[index] 109 | 110 | return X, y 111 | 112 | # Defining BERT model using both OCR and Text inputs 113 | 114 | # In[4]: 115 | 116 | 117 | class Bert_Text_OCR(nn.Module): 118 | 119 | def __init__(self, num_labels, config=None, device=torch.device("cuda:0")): 120 | super(Bert_Text_OCR, self).__init__() 121 | self.bert_text = BertModel.from_pretrained('bert-base-uncased', config=config) 122 | self.bn = nn.BatchNorm1d(768, momentum=0.99) 123 | self.dense1 = nn.Linear(in_features=768, out_features=192) #Add ReLu in forward loop 124 | self.dropout = nn.Dropout(p=0.2) 125 | self.dense2 = nn.Linear(in_features=192, out_features=num_labels) #Add softmax in forward loop 126 | self.device = device 127 | 128 | def forward(self, inputs, attention_mask=None, labels=None): 129 | 130 | text_input_ids_in = inputs[:,0,:].long().to(self.device) 131 | text_input_masks_in = inputs[:,1,:].long().to(self.device) 132 | 133 | text_embedding_layer = self.bert_text(text_input_ids_in, attention_mask=text_input_masks_in)[0] 134 | text_cls_token = text_embedding_layer[:,0,:] 135 | X = self.bn(text_cls_token) 136 | X = F.relu(self.dense1(X)) 137 | X = self.dropout(X) 138 | X = F.log_softmax(self.dense2(X)) 139 | return X 140 | 141 | 142 | # In[5]: 143 | 144 | 145 | def save_models(epochs, model): 146 | torch.save(model.state_dict(), "bert_model_fold_{}.h5".format(epochs)) 147 | print("Checkpoint Saved") 148 | 149 | # In[6]: 150 | 151 | 152 | def train_loop(dataloaders, dataset_sizes, num_classes, config=None, epochs=1): 153 | model = Bert_Text_OCR(num_labels=num_classes, config=config) 154 | 155 | criterion = torch.nn.CrossEntropyLoss(weight=class_weights_labels) 156 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, eps=1e-08) # clipnorm=1.0, add later 157 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 158 | #device = "cpu" 159 | model.to(device) 160 | 161 | since = time.time() 162 | 163 | best_model_wts = copy.deepcopy(model.state_dict()) 164 | best_loss = float("inf") 165 | 166 | for epoch in range(epochs): 167 | print('Epoch {}/{}'.format(epoch, epochs - 1)) 168 | print('-' * 10) 169 | 170 | # Each epoch has a training and validation phase 171 | for phase in ['train', 'validation']: 172 | if phase == 'train': 173 | # scheduler.step() 174 | model.train() # Set model to training mode 175 | else: 176 | model.eval() # Set model to evaluate mode 177 | 178 | running_loss = 0.0 179 | running_corrects = 0 180 | 181 | # Iterate over data. 182 | for inputs, labels in tqdm(dataloaders[phase]): 183 | inputs = inputs.to(device) 184 | labels = labels.to(device) 185 | 186 | # zero the parameter gradients 187 | optimizer.zero_grad() 188 | 189 | # forward 190 | # track history if only in train 191 | with torch.set_grad_enabled(phase == 'train'): 192 | outputs = model(inputs) 193 | _, preds = torch.max(outputs, 1) 194 | actual_labels = torch.max(labels.long(), 1)[1] 195 | loss = criterion(outputs, actual_labels) 196 | 197 | # backward + optimize only if in training phase 198 | if phase == 'train': 199 | loss.backward() 200 | optimizer.step() 201 | 202 | # statistics 203 | # running_loss += loss.item() * inputs.size(0) 204 | running_loss += loss.item() 205 | running_corrects += torch.sum(preds == actual_labels) 206 | 207 | epoch_loss = running_loss / dataset_sizes[phase] 208 | epoch_acc = running_corrects.double() / dataset_sizes[phase] 209 | 210 | print('{} Loss: {:.4f} Acc: {:.4f}'.format( 211 | phase, epoch_loss, epoch_acc)) 212 | 213 | # deep copy the model 214 | if phase == 'validation' and epoch_loss < best_loss: 215 | # save_models(epoch,model) 216 | best_loss = epoch_loss 217 | best_model_wts = copy.deepcopy(model.state_dict()) 218 | 219 | print() 220 | 221 | time_elapsed = time.time() - since 222 | print('Training complete in {:.0f}m {:.0f}s'.format( 223 | time_elapsed // 60, time_elapsed % 60)) 224 | #print('Best val Acc: {:4f}'.format(best_acc)) 225 | 226 | # load best model weights 227 | model.load_state_dict(best_model_wts) 228 | return model 229 | 230 | 231 | # In[7]: 232 | 233 | if __name__ == '__main__': 234 | 235 | datafile = args.datafile 236 | data_col = 0 237 | label_col = int(args.label_col) 238 | max_len = 100 239 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 240 | acc_cum = 0 241 | rec_cum = 0 242 | pre_cum = 0 243 | f1_cum = 0 244 | f1_cum_mic = 0 245 | acc_arr = [] 246 | rec_arr = [] 247 | pre_arr = [] 248 | f1_arr = [] 249 | f1_arr_mic = [] 250 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True, add_special_tokens=True, max_length=max_len, pad_to_max_length=True) 251 | 252 | #------------------------------------------------------------------------------------------------ 253 | text_data, labels = prepare_dataset(datafile, data_col, label_col, "word-based") 254 | 255 | print("Number of Examples: ", len(text_data)) 256 | 257 | encoder = LabelEncoder() 258 | encoder.fit(labels) 259 | encoded_labels = encoder.transform(labels) 260 | class_weights_labels = class_weight.compute_class_weight('balanced', 261 | np.unique(encoded_labels), 262 | encoded_labels) 263 | 264 | num_classes = len(list(encoder.classes_)) 265 | print("num_classes: ", num_classes) 266 | print(encoder.classes_) 267 | config = BertConfig.from_pretrained('bert-base-uncased') 268 | config.output_hidden_states = False 269 | 270 | 271 | fold_number = 1 272 | 273 | new_data = make_bert_input(text_data, max_len) 274 | 275 | ## Add image input to new_data, flatten images then unflatten later 276 | 277 | encoded_labels = np.asarray(encoded_labels, dtype='int32') 278 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 279 | #device = "cpu" 280 | class_weights_labels = torch.tensor(class_weights_labels, dtype=torch.float, device=device) 281 | 282 | for train_index, test_index in skf.split(new_data, encoded_labels): 283 | print("Running fold #", fold_number) 284 | train_data, test_data = new_data[train_index], new_data[test_index] 285 | train_label, test_label = encoded_labels[train_index], encoded_labels[test_index] 286 | train_data, validation_data, train_label, validation_label = train_test_split(train_data, train_label, stratify=train_label, test_size=0.2, random_state=42) 287 | 288 | train_label = to_categorical(train_label) 289 | validation_label = to_categorical(validation_label) 290 | metric_test = np.copy(test_label) 291 | test_label = to_categorical(test_label) 292 | 293 | train_text_input_ids = np.copy(train_data[:,0:max_len]) 294 | validation_text_input_ids = np.copy(validation_data[:,0:max_len]) 295 | test_text_input_ids = np.copy(test_data[:,0:max_len]) 296 | train_text_attention_mask = np.copy(train_data[:,max_len:2*max_len]) 297 | validation_text_attention_mask = np.copy(validation_data[:,max_len:2*max_len]) 298 | test_text_attention_mask = np.copy(test_data[:,max_len:2*max_len]) 299 | 300 | training_set = Dataset(train_text_input_ids, train_text_attention_mask, train_label) 301 | validation_set = Dataset(validation_text_input_ids, validation_text_attention_mask, validation_label) 302 | test_set = Dataset(test_text_input_ids, test_text_attention_mask, test_label) 303 | 304 | dataloaders = { 305 | 'train' : torch.utils.data.DataLoader(training_set, batch_size=4, 306 | shuffle=True, num_workers=2, drop_last=True), 307 | 'validation' : torch.utils.data.DataLoader(validation_set, batch_size=4, 308 | shuffle=True, num_workers=2, drop_last=True) 309 | } 310 | 311 | dataset_sizes = { 312 | 'train': len(training_set), 313 | 'validation': len(validation_set), 314 | } 315 | 316 | model = train_loop(dataloaders, dataset_sizes, num_classes, config=config, epochs=15) 317 | 318 | 319 | y_pred = np.array([]) 320 | 321 | for i in tqdm(range(len(test_set))): 322 | inputs = torch.Tensor([test_set[i][0]]).to(device) 323 | model.eval() 324 | outputs = model(inputs) 325 | preds = torch.max(outputs, 1)[1] 326 | y_pred = np.append(y_pred, preds.cpu().numpy()) 327 | 328 | acc_arr.append(accuracy_score(metric_test, y_pred)) 329 | acc_cum += acc_arr[fold_number-1] 330 | rec_arr.append(recall_score(metric_test, y_pred, average='macro')) 331 | rec_cum += rec_arr[fold_number-1] 332 | pre_arr.append(precision_score(metric_test, y_pred, average='macro')) 333 | pre_cum += pre_arr[fold_number-1] 334 | f1_arr.append(f1_score(metric_test, y_pred, average='macro')) 335 | f1_cum += f1_arr[fold_number-1] 336 | f1_arr_mic.append(f1_score(metric_test, y_pred, average='micro')) 337 | f1_cum_mic += f1_arr_mic[fold_number-1] 338 | fold_number+=1 339 | 340 | print("Accuracy: ", acc_cum/5) 341 | print("Recall: ", rec_cum/5) 342 | print("Precision: ", pre_cum/5) 343 | print("F1 score: ", f1_cum/5) 344 | print("F1 score Micro: ", f1_cum_mic/5) 345 | 346 | print("------------------------------") 347 | print("Accuracy_stdev: ", statistics.stdev(acc_arr)) 348 | print("Recall_stdev: ", statistics.stdev(rec_arr)) 349 | print("Precision_stdev: ", statistics.stdev(pre_arr)) 350 | print("F1 score_stdev: ", statistics.stdev(f1_arr)) 351 | print("F1 score_stdev Micro: ", statistics.stdev(f1_arr_mic)) 352 | 353 | 354 | # In[ ]: 355 | --------------------------------------------------------------------------------