├── .github └── ISSUE_TEMPLATE │ └── notion-issue--correcting-paper-entry.md ├── .gitignore ├── Code ├── __init__.py ├── model.py ├── predictions.py └── utils.py ├── Demos ├── Multilingual_abuse_predictor.ipynb └── Rationale_predictor_demo.ipynb ├── LICENSE └── README.md /.github/ISSUE_TEMPLATE/notion-issue--correcting-paper-entry.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 'Notion issue: Correcting paper entry' 3 | about: 'Correcting the paper entries ' 4 | title: "[Notion page correction]" 5 | labels: documentation 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Name of the paper** 11 | Please write the name of the paper 12 | 13 | **Correction details ** 14 | Please mention the column name and the corrected entry 15 | 16 | **Important links** 17 | Add any important links 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hate-alert/Tutorial-Resources/b1d87ecb4e61e95964a781d29740f0ec0e5bfd7e/Code/__init__.py -------------------------------------------------------------------------------- /Code/model.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | from transformers import AutoTokenizer 3 | from transformers import BertForTokenClassification, BertForSequenceClassification,BertPreTrainedModel, BertModel 4 | import torch.nn as nn 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 7 | import torch.nn.functional as F 8 | import numpy as np 9 | 10 | class BertPooler(nn.Module): 11 | def __init__(self, config): 12 | super().__init__() 13 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 14 | self.activation = nn.Tanh() 15 | 16 | def forward(self, hidden_states): 17 | # We "pool" the model by simply taking the hidden state corresponding 18 | # to the first token. 19 | first_token_tensor = hidden_states[:, 0] 20 | pooled_output = self.dense(first_token_tensor) 21 | pooled_output = self.activation(pooled_output) 22 | return pooled_output 23 | 24 | 25 | 26 | 27 | class Model_Rational_Label(BertPreTrainedModel): 28 | def __init__(self,config): 29 | super().__init__(config) 30 | #### Keep this parameters fixed 31 | self.num_labels=2 32 | self.impact_factor=10 33 | #### 34 | self.bert = BertModel(config,add_pooling_layer=False) 35 | self.bert_pooler=BertPooler(config) 36 | self.token_dropout = nn.Dropout(0.1) 37 | self.token_classifier = nn.Linear(config.hidden_size, 2) 38 | self.dropout = nn.Dropout(0.1) 39 | self.classifier = nn.Linear(config.hidden_size, self.num_labels) 40 | self.init_weights() 41 | 42 | def forward(self, input_ids=None, mask=None, attn=None, labels=None): 43 | outputs = self.bert(input_ids, mask) 44 | out=outputs[0] 45 | logits = self.token_classifier(self.token_dropout(out)) 46 | embed=self.bert_pooler(outputs[0]) 47 | y_pred = self.classifier(self.dropout(embed)) 48 | loss_token = None 49 | loss_label = None 50 | loss_total = None 51 | 52 | if attn is not None: 53 | loss_fct = nn.CrossEntropyLoss() 54 | # Only keep active parts of the loss 55 | if mask is not None: 56 | active_loss = mask.view(-1) == 1 57 | active_logits = logits.view(-1, 2) 58 | active_labels = torch.where( 59 | active_loss, attn.view(-1), torch.tensor(loss_fct.ignore_index).type_as(attn) 60 | ) 61 | loss_token = loss_fct(active_logits, active_labels) 62 | else: 63 | loss_token = loss_fct(logits.view(-1, 2), attn.view(-1)) 64 | 65 | loss_total=self.impact_factor*loss_token 66 | 67 | 68 | if labels is not None: 69 | loss_funct = nn.CrossEntropyLoss() 70 | loss_logits = loss_funct(y_pred.view(-1, self.num_labels), labels.view(-1)) 71 | loss_label= loss_logits 72 | if(loss_total is not None): 73 | loss_total+=loss_label 74 | else: 75 | loss_total=loss_label 76 | if(loss_total is not None): 77 | return y_pred, logits, loss_total 78 | else: 79 | return y_pred, logits -------------------------------------------------------------------------------- /Code/predictions.py: -------------------------------------------------------------------------------- 1 | from ekphrasis.classes.preprocessor import TextPreProcessor 2 | from ekphrasis.classes.tokenizer import SocialTokenizer 3 | from ekphrasis.dicts.emoticons import emoticons 4 | import re 5 | from transformers import AutoTokenizer,AutoModelForSequenceClassification,AutoConfig 6 | import numpy as np 7 | import torch 8 | from .model import * 9 | from .utils import * 10 | 11 | text_processor = TextPreProcessor( 12 | normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 13 | 'time', 'date', 'number'], 14 | fix_html=True, # fix HTML tokens 15 | annotate={"hashtag", "allcaps", "elongated", "repeated", 16 | 'emphasis', 'censored'}, 17 | segmenter="twitter", 18 | unpack_hashtags=True, # perform word segmentation on hashtags 19 | unpack_contractions=True, # Unpack contractions (can't -> can not) 20 | spell_correct_elong=False, # spell correction for elongated words 21 | tokenizer=SocialTokenizer(lowercase=True).tokenize, 22 | dicts=[emoticons] 23 | ) 24 | 25 | 26 | 27 | class modelPredRationale(): 28 | def __init__(self, model_path = 'bert-base-uncased', device = None): 29 | self.device = device 30 | self.model_path=model_path 31 | self.model = Model_Rational_Label.from_pretrained(model_path,output_attentions = True,output_hidden_states = False).to(self.device) 32 | self.config = AutoConfig.from_pretrained(self.model_path) 33 | if torch.cuda.is_available(): 34 | self.model.cuda() 35 | self.model.eval() 36 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, use_fast = False) 37 | 38 | def preprocess_func(self, text): 39 | remove_words=['','','','','','','','\'','s'] 40 | word_list=text_processor.pre_process_doc(text) 41 | word_list=list(filter(lambda a: a not in remove_words, word_list)) 42 | sent=" ".join(word_list) 43 | sent = re.sub(r"[<\*>]", " ",sent) 44 | return sent 45 | 46 | def tokenize(self, sentences, padding = True, max_len = 128): 47 | input_ids, attention_masks, token_type_ids = [], [], [] 48 | for sent in sentences: 49 | encoded_dict = self.tokenizer.encode_plus(sent, 50 | add_special_tokens=True, 51 | max_length=max_len, 52 | padding='max_length', 53 | return_attention_mask = True, 54 | return_tensors = 'pt', 55 | truncation = True) 56 | input_ids.append(encoded_dict['input_ids']) 57 | attention_masks.append(encoded_dict['attention_mask']) 58 | 59 | input_ids = torch.cat(input_ids, dim=0) 60 | attention_masks = torch.cat(attention_masks, dim=0) 61 | 62 | return {'input_ids': input_ids, 'attention_masks': attention_masks} 63 | 64 | def process_data(self, sentences_list): 65 | sentences = [] 66 | sentence_lengths = [] 67 | for sentence in sentences_list: 68 | try: 69 | sentence = self.preprocess_func(sentence) 70 | except TypeError: 71 | sentence = self.preprocess_func("dummy text") 72 | sentences.append(sentence) 73 | sentence_lengths.append(len(self.tokenizer.encode(sentence))) 74 | inputs = self.tokenize(sentences) 75 | tokenized_sentences = [self.tokenizer.convert_ids_to_tokens(ele) for ele in inputs['input_ids']] 76 | 77 | return self.get_dataloader(inputs), sentence_lengths, tokenized_sentences 78 | 79 | def get_dataloader(self, inputs): 80 | data = TensorDataset(inputs['input_ids'], inputs['attention_masks']) 81 | sampler = SequentialSampler(data) 82 | return DataLoader(data, sampler=sampler, batch_size=32) 83 | 84 | 85 | def return_rationales(self, sentences_list): 86 | """Input: should be a list of sentences""" 87 | """Output: probablity values""" 88 | device = self.device 89 | 90 | test_dataloader,sentence_lengths, tokenized_sentences=self.process_data(sentences_list) 91 | 92 | print("Running eval on test data...") 93 | labels_list=[] 94 | rationale_list=[] 95 | rationale_logit_list = [] 96 | # Evaluate data 97 | for step,batch in enumerate(test_dataloader): 98 | 99 | b_input_ids = batch[0].to(device) 100 | b_input_mask = batch[1].to(device) 101 | 102 | label_logits, rationale_logits = self.model(b_input_ids, b_input_mask) 103 | 104 | label_logits = label_logits.detach().cpu().numpy() 105 | rationale_logits = rationale_logits.detach().cpu().numpy() 106 | 107 | final_logits=[] 108 | final_rationales=[] 109 | for i in range(label_logits.shape[0]): 110 | final_logits.append(softmax(label_logits[i])) 111 | final_rationales.append([ele[1] for ele in rationale_logits[i]]) 112 | labels_list+=final_logits 113 | rationale_list+=final_rationales 114 | 115 | attention_vectors = [] 116 | for idx, rationales in enumerate(rationale_list): 117 | attention_vector = softmax(rationales[:sentence_lengths[idx]]) 118 | attention_vector = list(attention_vector) + [0]*(128-len(list(attention_vector))) 119 | attention_vectors.append(attention_vector) 120 | 121 | tokens_sentence=[] 122 | for idx, tokenized in enumerate(tokenized_sentences): 123 | tokenized = tokenized[:sentence_lengths[idx]] 124 | tokens_sentence.append(tokenized) 125 | 126 | return np.array(labels_list), np.array(attention_vectors), tokens_sentence 127 | 128 | 129 | 130 | class modelPred(): 131 | def __init__(self, language='english', device=None): 132 | self.__modelDict ={ 133 | 'arabic':"Hate-speech-CNERG/dehatebert-mono-arabic", 134 | 'english': "Hate-speech-CNERG/dehatebert-mono-english", 135 | 'english_hatexplain':"Hate-speech-CNERG/bert-base-uncased-hatexplain", 136 | 'french': "Hate-speech-CNERG/dehatebert-mono-french", 137 | 'german': "Hate-speech-CNERG/dehatebert-mono-german", 138 | 'indonesian': "Hate-speech-CNERG/dehatebert-mono-indonesian", 139 | 'polish': "Hate-speech-CNERG/dehatebert-mono-polish", 140 | 'portugese': "Hate-speech-CNERG/dehatebert-mono-portugese", 141 | 'italian': "Hate-speech-CNERG/dehatebert-mono-italian", 142 | 'spanish': "Hate-speech-CNERG/dehatebert-mono-spanish", 143 | 'kannada': "Hate-speech-CNERG/deoffxlmr-mono-kannada", 144 | 'malyalam': "Hate-speech-CNERG/deoffxlmr-mono-malyalam", 145 | 'tamil': "Hate-speech-CNERG/deoffxlmr-mono-tamil", 146 | } 147 | self.device = device 148 | self.model_path=self.__modelDict[language] 149 | self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path) 150 | self.config = AutoConfig.from_pretrained(self.model_path) 151 | # if(model_name=='xlmr'): 152 | # self.model = XLMRobertaForSequenceClassification.from_pretrained(self.model_path,output_attentions = True,output_hidden_states = False).to(self.device) 153 | # elif(model_name=='bert'): 154 | # self.model = BertForSequenceClassification.from_pretrained(self.model_path,output_attentions = True,output_hidden_states = False).to(self.device) 155 | self.model.cuda() 156 | self.model.eval() 157 | 158 | def preprocess_func(self, text): 159 | new_text = re.sub('@\w+', '@user',text) 160 | new_text = new_text.replace("\r\n\'",' ').replace("\n",' ') 161 | new_text = re.sub(r"http\S+", "", new_text) 162 | new_text = new_text.replace('&', '&') 163 | return new_text 164 | 165 | def tokenize(self, sentences, padding = True, max_len = 128): 166 | input_ids, attention_masks, token_type_ids = [], [], [] 167 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) 168 | for sent in sentences: 169 | encoded_dict = self.tokenizer.encode_plus(sent, 170 | add_special_tokens=True, 171 | max_length=max_len, 172 | padding='max_length', 173 | return_attention_mask = True, 174 | return_tensors = 'pt', 175 | truncation = True) 176 | input_ids.append(encoded_dict['input_ids']) 177 | attention_masks.append(encoded_dict['attention_mask']) 178 | 179 | input_ids = torch.cat(input_ids, dim=0) 180 | attention_masks = torch.cat(attention_masks, dim=0) 181 | 182 | return {'input_ids': input_ids, 'attention_masks': attention_masks} 183 | 184 | def process_data(self, sentences_list): 185 | sentences = [] 186 | for sentence in sentences_list: 187 | try: 188 | sentence = self.preprocess_func(sentence) 189 | except TypeError: 190 | sentence = self.preprocess_func("dummy text") 191 | sentences.append(sentence) 192 | inputs = self.tokenize(sentences) 193 | return self.get_dataloader(inputs) 194 | 195 | def get_dataloader(self, inputs): 196 | data = TensorDataset(inputs['input_ids'], inputs['attention_masks']) 197 | sampler = SequentialSampler(data) 198 | return DataLoader(data, sampler=sampler, batch_size=32) 199 | 200 | def return_probab(self, sentences_list): 201 | """Input: should be a list of sentences""" 202 | """Output: probablity values""" 203 | device = self.device 204 | 205 | test_dataloader=self.process_data(sentences_list) 206 | 207 | print("Running eval on test data...") 208 | labels_list=[] 209 | sentence_lengths = [len(self.tokenizer.encode(sentence)) for sentence in sentences_list] 210 | # Evaluate data 211 | for step,batch in enumerate(test_dataloader): 212 | 213 | b_input_ids = batch[0].to(device) 214 | b_input_mask = batch[1].to(device) 215 | 216 | label_logits = self.model(b_input_ids, b_input_mask).logits 217 | label_logits = label_logits.detach().cpu().numpy() 218 | 219 | final_logits=[] 220 | for i in range(label_logits.shape[0]): 221 | final_logits.append(softmax(label_logits[i])) 222 | labels_list+=final_logits 223 | 224 | return np.array(labels_list) -------------------------------------------------------------------------------- /Code/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | def softmax(x): 4 | """Compute softmax values for each sets of scores in x.""" 5 | e_x = np.exp(x - np.max(x)) 6 | temp=e_x / e_x.sum(axis=0) # only difference 7 | 8 | if np.isnan(temp).any()==True: 9 | return [0.0,1.0,0.0] 10 | else: 11 | return temp 12 | 13 | 14 | 15 | class CharVal(object): 16 | def __init__(self, char, val): 17 | self.char = char 18 | self.val = val 19 | 20 | def __str__(self): 21 | return self.char 22 | 23 | def rgb_to_hex(rgb): 24 | return '#%02x%02x%02x' % rgb 25 | 26 | 27 | def color_charvals_lime(s): 28 | r = 255-int(s.val*255) 29 | color = rgb_to_hex((255, r, r)) 30 | return 'background-color: %s' % color 31 | def color_charvals_rationale(s): 32 | r = 255-int(s.val*255) 33 | color = rgb_to_hex((255, r, r)) 34 | return 'background-color: %s' % color 35 | 36 | 37 | class NumpyEncoder(json.JSONEncoder): 38 | """ Special json encoder for numpy types """ 39 | def default(self, obj): 40 | if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, 41 | np.int16, np.int32, np.int64, np.uint8, 42 | np.uint16, np.uint32, np.uint64)): 43 | return int(obj) 44 | elif isinstance(obj, (np.float_, np.float16, np.float32, 45 | np.float64)): 46 | return float(obj) 47 | elif isinstance(obj,(np.ndarray,)): #### This is the fix 48 | return obj.tolist() 49 | return json.JSONEncoder.default(self, obj) -------------------------------------------------------------------------------- /Demos/Multilingual_abuse_predictor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Multilingual_abuse_predictor.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU", 18 | "widgets": { 19 | "application/vnd.jupyter.widget-state+json": { 20 | "fd6617d95fea499dbeb25332b62b4602": { 21 | "model_module": "@jupyter-widgets/controls", 22 | "model_name": "HBoxModel", 23 | "state": { 24 | "_view_name": "HBoxView", 25 | "_dom_classes": [], 26 | "_model_name": "HBoxModel", 27 | "_view_module": "@jupyter-widgets/controls", 28 | "_model_module_version": "1.5.0", 29 | "_view_count": null, 30 | "_view_module_version": "1.5.0", 31 | "box_style": "", 32 | "layout": "IPY_MODEL_21a33a9cb69e43ed84338e353a742643", 33 | "_model_module": "@jupyter-widgets/controls", 34 | "children": [ 35 | "IPY_MODEL_5b4da2787cfd4b629a59b6ba80d88fc8", 36 | "IPY_MODEL_18238948093e43579ef0b78716800322" 37 | ] 38 | } 39 | }, 40 | "21a33a9cb69e43ed84338e353a742643": { 41 | "model_module": "@jupyter-widgets/base", 42 | "model_name": "LayoutModel", 43 | "state": { 44 | "_view_name": "LayoutView", 45 | "grid_template_rows": null, 46 | "right": null, 47 | "justify_content": null, 48 | "_view_module": "@jupyter-widgets/base", 49 | "overflow": null, 50 | "_model_module_version": "1.2.0", 51 | "_view_count": null, 52 | "flex_flow": null, 53 | "width": null, 54 | "min_width": null, 55 | "border": null, 56 | "align_items": null, 57 | "bottom": null, 58 | "_model_module": "@jupyter-widgets/base", 59 | "top": null, 60 | "grid_column": null, 61 | "overflow_y": null, 62 | "overflow_x": null, 63 | "grid_auto_flow": null, 64 | "grid_area": null, 65 | "grid_template_columns": null, 66 | "flex": null, 67 | "_model_name": "LayoutModel", 68 | "justify_items": null, 69 | "grid_row": null, 70 | "max_height": null, 71 | "align_content": null, 72 | "visibility": null, 73 | "align_self": null, 74 | "height": null, 75 | "min_height": null, 76 | "padding": null, 77 | "grid_auto_rows": null, 78 | "grid_gap": null, 79 | "max_width": null, 80 | "order": null, 81 | "_view_module_version": "1.2.0", 82 | "grid_template_areas": null, 83 | "object_position": null, 84 | "object_fit": null, 85 | "grid_auto_columns": null, 86 | "margin": null, 87 | "display": null, 88 | "left": null 89 | } 90 | }, 91 | "5b4da2787cfd4b629a59b6ba80d88fc8": { 92 | "model_module": "@jupyter-widgets/controls", 93 | "model_name": "FloatProgressModel", 94 | "state": { 95 | "_view_name": "ProgressView", 96 | "style": "IPY_MODEL_76bcd6d2964d47b885dac60582cd92ce", 97 | "_dom_classes": [], 98 | "description": "Downloading: 100%", 99 | "_model_name": "FloatProgressModel", 100 | "bar_style": "success", 101 | "max": 1225, 102 | "_view_module": "@jupyter-widgets/controls", 103 | "_model_module_version": "1.5.0", 104 | "value": 1225, 105 | "_view_count": null, 106 | "_view_module_version": "1.5.0", 107 | "orientation": "horizontal", 108 | "min": 0, 109 | "description_tooltip": null, 110 | "_model_module": "@jupyter-widgets/controls", 111 | "layout": "IPY_MODEL_ba11daa1d3d2433db25af1e2dbb04a93" 112 | } 113 | }, 114 | "18238948093e43579ef0b78716800322": { 115 | "model_module": "@jupyter-widgets/controls", 116 | "model_name": "HTMLModel", 117 | "state": { 118 | "_view_name": "HTMLView", 119 | "style": "IPY_MODEL_64689e87c0c94c6a92b99a484382939e", 120 | "_dom_classes": [], 121 | "description": "", 122 | "_model_name": "HTMLModel", 123 | "placeholder": "​", 124 | "_view_module": "@jupyter-widgets/controls", 125 | "_model_module_version": "1.5.0", 126 | "value": " 1.23k/1.23k [01:27<00:00, 14.0B/s]", 127 | "_view_count": null, 128 | "_view_module_version": "1.5.0", 129 | "description_tooltip": null, 130 | "_model_module": "@jupyter-widgets/controls", 131 | "layout": "IPY_MODEL_4bdaf2d131db4769bd77c278f1f0842f" 132 | } 133 | }, 134 | "76bcd6d2964d47b885dac60582cd92ce": { 135 | "model_module": "@jupyter-widgets/controls", 136 | "model_name": "ProgressStyleModel", 137 | "state": { 138 | "_view_name": "StyleView", 139 | "_model_name": "ProgressStyleModel", 140 | "description_width": "initial", 141 | "_view_module": "@jupyter-widgets/base", 142 | "_model_module_version": "1.5.0", 143 | "_view_count": null, 144 | "_view_module_version": "1.2.0", 145 | "bar_color": null, 146 | "_model_module": "@jupyter-widgets/controls" 147 | } 148 | }, 149 | "ba11daa1d3d2433db25af1e2dbb04a93": { 150 | "model_module": "@jupyter-widgets/base", 151 | "model_name": "LayoutModel", 152 | "state": { 153 | "_view_name": "LayoutView", 154 | "grid_template_rows": null, 155 | "right": null, 156 | "justify_content": null, 157 | "_view_module": "@jupyter-widgets/base", 158 | "overflow": null, 159 | "_model_module_version": "1.2.0", 160 | "_view_count": null, 161 | "flex_flow": null, 162 | "width": null, 163 | "min_width": null, 164 | "border": null, 165 | "align_items": null, 166 | "bottom": null, 167 | "_model_module": "@jupyter-widgets/base", 168 | "top": null, 169 | "grid_column": null, 170 | "overflow_y": null, 171 | "overflow_x": null, 172 | "grid_auto_flow": null, 173 | "grid_area": null, 174 | "grid_template_columns": null, 175 | "flex": null, 176 | "_model_name": "LayoutModel", 177 | "justify_items": null, 178 | "grid_row": null, 179 | "max_height": null, 180 | "align_content": null, 181 | "visibility": null, 182 | "align_self": null, 183 | "height": null, 184 | "min_height": null, 185 | "padding": null, 186 | "grid_auto_rows": null, 187 | "grid_gap": null, 188 | "max_width": null, 189 | "order": null, 190 | "_view_module_version": "1.2.0", 191 | "grid_template_areas": null, 192 | "object_position": null, 193 | "object_fit": null, 194 | "grid_auto_columns": null, 195 | "margin": null, 196 | "display": null, 197 | "left": null 198 | } 199 | }, 200 | "64689e87c0c94c6a92b99a484382939e": { 201 | "model_module": "@jupyter-widgets/controls", 202 | "model_name": "DescriptionStyleModel", 203 | "state": { 204 | "_view_name": "StyleView", 205 | "_model_name": "DescriptionStyleModel", 206 | "description_width": "", 207 | "_view_module": "@jupyter-widgets/base", 208 | "_model_module_version": "1.5.0", 209 | "_view_count": null, 210 | "_view_module_version": "1.2.0", 211 | "_model_module": "@jupyter-widgets/controls" 212 | } 213 | }, 214 | "4bdaf2d131db4769bd77c278f1f0842f": { 215 | "model_module": "@jupyter-widgets/base", 216 | "model_name": "LayoutModel", 217 | "state": { 218 | "_view_name": "LayoutView", 219 | "grid_template_rows": null, 220 | "right": null, 221 | "justify_content": null, 222 | "_view_module": "@jupyter-widgets/base", 223 | "overflow": null, 224 | "_model_module_version": "1.2.0", 225 | "_view_count": null, 226 | "flex_flow": null, 227 | "width": null, 228 | "min_width": null, 229 | "border": null, 230 | "align_items": null, 231 | "bottom": null, 232 | "_model_module": "@jupyter-widgets/base", 233 | "top": null, 234 | "grid_column": null, 235 | "overflow_y": null, 236 | "overflow_x": null, 237 | "grid_auto_flow": null, 238 | "grid_area": null, 239 | "grid_template_columns": null, 240 | "flex": null, 241 | "_model_name": "LayoutModel", 242 | "justify_items": null, 243 | "grid_row": null, 244 | "max_height": null, 245 | "align_content": null, 246 | "visibility": null, 247 | "align_self": null, 248 | "height": null, 249 | "min_height": null, 250 | "padding": null, 251 | "grid_auto_rows": null, 252 | "grid_gap": null, 253 | "max_width": null, 254 | "order": null, 255 | "_view_module_version": "1.2.0", 256 | "grid_template_areas": null, 257 | "object_position": null, 258 | "object_fit": null, 259 | "grid_auto_columns": null, 260 | "margin": null, 261 | "display": null, 262 | "left": null 263 | } 264 | }, 265 | "9892dc2a0b174fd5a87f092efe33db93": { 266 | "model_module": "@jupyter-widgets/controls", 267 | "model_name": "HBoxModel", 268 | "state": { 269 | "_view_name": "HBoxView", 270 | "_dom_classes": [], 271 | "_model_name": "HBoxModel", 272 | "_view_module": "@jupyter-widgets/controls", 273 | "_model_module_version": "1.5.0", 274 | "_view_count": null, 275 | "_view_module_version": "1.5.0", 276 | "box_style": "", 277 | "layout": "IPY_MODEL_65d060b22f134245affa3c3be8b747b9", 278 | "_model_module": "@jupyter-widgets/controls", 279 | "children": [ 280 | "IPY_MODEL_89c3f9cf413b4d27b81bb3f81fc3282d", 281 | "IPY_MODEL_25b23313da404eea8b547e89012822f6" 282 | ] 283 | } 284 | }, 285 | "65d060b22f134245affa3c3be8b747b9": { 286 | "model_module": "@jupyter-widgets/base", 287 | "model_name": "LayoutModel", 288 | "state": { 289 | "_view_name": "LayoutView", 290 | "grid_template_rows": null, 291 | "right": null, 292 | "justify_content": null, 293 | "_view_module": "@jupyter-widgets/base", 294 | "overflow": null, 295 | "_model_module_version": "1.2.0", 296 | "_view_count": null, 297 | "flex_flow": null, 298 | "width": null, 299 | "min_width": null, 300 | "border": null, 301 | "align_items": null, 302 | "bottom": null, 303 | "_model_module": "@jupyter-widgets/base", 304 | "top": null, 305 | "grid_column": null, 306 | "overflow_y": null, 307 | "overflow_x": null, 308 | "grid_auto_flow": null, 309 | "grid_area": null, 310 | "grid_template_columns": null, 311 | "flex": null, 312 | "_model_name": "LayoutModel", 313 | "justify_items": null, 314 | "grid_row": null, 315 | "max_height": null, 316 | "align_content": null, 317 | "visibility": null, 318 | "align_self": null, 319 | "height": null, 320 | "min_height": null, 321 | "padding": null, 322 | "grid_auto_rows": null, 323 | "grid_gap": null, 324 | "max_width": null, 325 | "order": null, 326 | "_view_module_version": "1.2.0", 327 | "grid_template_areas": null, 328 | "object_position": null, 329 | "object_fit": null, 330 | "grid_auto_columns": null, 331 | "margin": null, 332 | "display": null, 333 | "left": null 334 | } 335 | }, 336 | "89c3f9cf413b4d27b81bb3f81fc3282d": { 337 | "model_module": "@jupyter-widgets/controls", 338 | "model_name": "FloatProgressModel", 339 | "state": { 340 | "_view_name": "ProgressView", 341 | "style": "IPY_MODEL_45dbc7a3a5194be4bee51f6067a860b2", 342 | "_dom_classes": [], 343 | "description": "Downloading: 100%", 344 | "_model_name": "FloatProgressModel", 345 | "bar_style": "success", 346 | "max": 669482093, 347 | "_view_module": "@jupyter-widgets/controls", 348 | "_model_module_version": "1.5.0", 349 | "value": 669482093, 350 | "_view_count": null, 351 | "_view_module_version": "1.5.0", 352 | "orientation": "horizontal", 353 | "min": 0, 354 | "description_tooltip": null, 355 | "_model_module": "@jupyter-widgets/controls", 356 | "layout": "IPY_MODEL_a12eea961709413389734e80e6c0cb67" 357 | } 358 | }, 359 | "25b23313da404eea8b547e89012822f6": { 360 | "model_module": "@jupyter-widgets/controls", 361 | "model_name": "HTMLModel", 362 | "state": { 363 | "_view_name": "HTMLView", 364 | "style": "IPY_MODEL_8219eeb0125d4a1bae3d598a6680be76", 365 | "_dom_classes": [], 366 | "description": "", 367 | "_model_name": "HTMLModel", 368 | "placeholder": "​", 369 | "_view_module": "@jupyter-widgets/controls", 370 | "_model_module_version": "1.5.0", 371 | "value": " 669M/669M [00:14<00:00, 46.4MB/s]", 372 | "_view_count": null, 373 | "_view_module_version": "1.5.0", 374 | "description_tooltip": null, 375 | "_model_module": "@jupyter-widgets/controls", 376 | "layout": "IPY_MODEL_f324e3737cad414894c3ca5c6f2d4b41" 377 | } 378 | }, 379 | "45dbc7a3a5194be4bee51f6067a860b2": { 380 | "model_module": "@jupyter-widgets/controls", 381 | "model_name": "ProgressStyleModel", 382 | "state": { 383 | "_view_name": "StyleView", 384 | "_model_name": "ProgressStyleModel", 385 | "description_width": "initial", 386 | "_view_module": "@jupyter-widgets/base", 387 | "_model_module_version": "1.5.0", 388 | "_view_count": null, 389 | "_view_module_version": "1.2.0", 390 | "bar_color": null, 391 | "_model_module": "@jupyter-widgets/controls" 392 | } 393 | }, 394 | "a12eea961709413389734e80e6c0cb67": { 395 | "model_module": "@jupyter-widgets/base", 396 | "model_name": "LayoutModel", 397 | "state": { 398 | "_view_name": "LayoutView", 399 | "grid_template_rows": null, 400 | "right": null, 401 | "justify_content": null, 402 | "_view_module": "@jupyter-widgets/base", 403 | "overflow": null, 404 | "_model_module_version": "1.2.0", 405 | "_view_count": null, 406 | "flex_flow": null, 407 | "width": null, 408 | "min_width": null, 409 | "border": null, 410 | "align_items": null, 411 | "bottom": null, 412 | "_model_module": "@jupyter-widgets/base", 413 | "top": null, 414 | "grid_column": null, 415 | "overflow_y": null, 416 | "overflow_x": null, 417 | "grid_auto_flow": null, 418 | "grid_area": null, 419 | "grid_template_columns": null, 420 | "flex": null, 421 | "_model_name": "LayoutModel", 422 | "justify_items": null, 423 | "grid_row": null, 424 | "max_height": null, 425 | "align_content": null, 426 | "visibility": null, 427 | "align_self": null, 428 | "height": null, 429 | "min_height": null, 430 | "padding": null, 431 | "grid_auto_rows": null, 432 | "grid_gap": null, 433 | "max_width": null, 434 | "order": null, 435 | "_view_module_version": "1.2.0", 436 | "grid_template_areas": null, 437 | "object_position": null, 438 | "object_fit": null, 439 | "grid_auto_columns": null, 440 | "margin": null, 441 | "display": null, 442 | "left": null 443 | } 444 | }, 445 | "8219eeb0125d4a1bae3d598a6680be76": { 446 | "model_module": "@jupyter-widgets/controls", 447 | "model_name": "DescriptionStyleModel", 448 | "state": { 449 | "_view_name": "StyleView", 450 | "_model_name": "DescriptionStyleModel", 451 | "description_width": "", 452 | "_view_module": "@jupyter-widgets/base", 453 | "_model_module_version": "1.5.0", 454 | "_view_count": null, 455 | "_view_module_version": "1.2.0", 456 | "_model_module": "@jupyter-widgets/controls" 457 | } 458 | }, 459 | "f324e3737cad414894c3ca5c6f2d4b41": { 460 | "model_module": "@jupyter-widgets/base", 461 | "model_name": "LayoutModel", 462 | "state": { 463 | "_view_name": "LayoutView", 464 | "grid_template_rows": null, 465 | "right": null, 466 | "justify_content": null, 467 | "_view_module": "@jupyter-widgets/base", 468 | "overflow": null, 469 | "_model_module_version": "1.2.0", 470 | "_view_count": null, 471 | "flex_flow": null, 472 | "width": null, 473 | "min_width": null, 474 | "border": null, 475 | "align_items": null, 476 | "bottom": null, 477 | "_model_module": "@jupyter-widgets/base", 478 | "top": null, 479 | "grid_column": null, 480 | "overflow_y": null, 481 | "overflow_x": null, 482 | "grid_auto_flow": null, 483 | "grid_area": null, 484 | "grid_template_columns": null, 485 | "flex": null, 486 | "_model_name": "LayoutModel", 487 | "justify_items": null, 488 | "grid_row": null, 489 | "max_height": null, 490 | "align_content": null, 491 | "visibility": null, 492 | "align_self": null, 493 | "height": null, 494 | "min_height": null, 495 | "padding": null, 496 | "grid_auto_rows": null, 497 | "grid_gap": null, 498 | "max_width": null, 499 | "order": null, 500 | "_view_module_version": "1.2.0", 501 | "grid_template_areas": null, 502 | "object_position": null, 503 | "object_fit": null, 504 | "grid_auto_columns": null, 505 | "margin": null, 506 | "display": null, 507 | "left": null 508 | } 509 | }, 510 | "bbfd351359a2433cad1bc54e768f41b8": { 511 | "model_module": "@jupyter-widgets/controls", 512 | "model_name": "HBoxModel", 513 | "state": { 514 | "_view_name": "HBoxView", 515 | "_dom_classes": [], 516 | "_model_name": "HBoxModel", 517 | "_view_module": "@jupyter-widgets/controls", 518 | "_model_module_version": "1.5.0", 519 | "_view_count": null, 520 | "_view_module_version": "1.5.0", 521 | "box_style": "", 522 | "layout": "IPY_MODEL_4a19d08ac48b44bcb4ba4c172447fc27", 523 | "_model_module": "@jupyter-widgets/controls", 524 | "children": [ 525 | "IPY_MODEL_3d2d574bb15640a8a7e4e1a85998be67", 526 | "IPY_MODEL_d28f3afe51a041f2a9cf820f3c03e730" 527 | ] 528 | } 529 | }, 530 | "4a19d08ac48b44bcb4ba4c172447fc27": { 531 | "model_module": "@jupyter-widgets/base", 532 | "model_name": "LayoutModel", 533 | "state": { 534 | "_view_name": "LayoutView", 535 | "grid_template_rows": null, 536 | "right": null, 537 | "justify_content": null, 538 | "_view_module": "@jupyter-widgets/base", 539 | "overflow": null, 540 | "_model_module_version": "1.2.0", 541 | "_view_count": null, 542 | "flex_flow": null, 543 | "width": null, 544 | "min_width": null, 545 | "border": null, 546 | "align_items": null, 547 | "bottom": null, 548 | "_model_module": "@jupyter-widgets/base", 549 | "top": null, 550 | "grid_column": null, 551 | "overflow_y": null, 552 | "overflow_x": null, 553 | "grid_auto_flow": null, 554 | "grid_area": null, 555 | "grid_template_columns": null, 556 | "flex": null, 557 | "_model_name": "LayoutModel", 558 | "justify_items": null, 559 | "grid_row": null, 560 | "max_height": null, 561 | "align_content": null, 562 | "visibility": null, 563 | "align_self": null, 564 | "height": null, 565 | "min_height": null, 566 | "padding": null, 567 | "grid_auto_rows": null, 568 | "grid_gap": null, 569 | "max_width": null, 570 | "order": null, 571 | "_view_module_version": "1.2.0", 572 | "grid_template_areas": null, 573 | "object_position": null, 574 | "object_fit": null, 575 | "grid_auto_columns": null, 576 | "margin": null, 577 | "display": null, 578 | "left": null 579 | } 580 | }, 581 | "3d2d574bb15640a8a7e4e1a85998be67": { 582 | "model_module": "@jupyter-widgets/controls", 583 | "model_name": "FloatProgressModel", 584 | "state": { 585 | "_view_name": "ProgressView", 586 | "style": "IPY_MODEL_297aea76ea23441ea64ce49e83920a79", 587 | "_dom_classes": [], 588 | "description": "Downloading: 100%", 589 | "_model_name": "FloatProgressModel", 590 | "bar_style": "success", 591 | "max": 871891, 592 | "_view_module": "@jupyter-widgets/controls", 593 | "_model_module_version": "1.5.0", 594 | "value": 871891, 595 | "_view_count": null, 596 | "_view_module_version": "1.5.0", 597 | "orientation": "horizontal", 598 | "min": 0, 599 | "description_tooltip": null, 600 | "_model_module": "@jupyter-widgets/controls", 601 | "layout": "IPY_MODEL_c54e109435ec4cbf9f9341062f36d68c" 602 | } 603 | }, 604 | "d28f3afe51a041f2a9cf820f3c03e730": { 605 | "model_module": "@jupyter-widgets/controls", 606 | "model_name": "HTMLModel", 607 | "state": { 608 | "_view_name": "HTMLView", 609 | "style": "IPY_MODEL_16095eadbfdb4deb94b50592cd07e5b8", 610 | "_dom_classes": [], 611 | "description": "", 612 | "_model_name": "HTMLModel", 613 | "placeholder": "​", 614 | "_view_module": "@jupyter-widgets/controls", 615 | "_model_module_version": "1.5.0", 616 | "value": " 872k/872k [00:02<00:00, 422kB/s]", 617 | "_view_count": null, 618 | "_view_module_version": "1.5.0", 619 | "description_tooltip": null, 620 | "_model_module": "@jupyter-widgets/controls", 621 | "layout": "IPY_MODEL_8053a5b760284ceb8feb1325a23d467d" 622 | } 623 | }, 624 | "297aea76ea23441ea64ce49e83920a79": { 625 | "model_module": "@jupyter-widgets/controls", 626 | "model_name": "ProgressStyleModel", 627 | "state": { 628 | "_view_name": "StyleView", 629 | "_model_name": "ProgressStyleModel", 630 | "description_width": "initial", 631 | "_view_module": "@jupyter-widgets/base", 632 | "_model_module_version": "1.5.0", 633 | "_view_count": null, 634 | "_view_module_version": "1.2.0", 635 | "bar_color": null, 636 | "_model_module": "@jupyter-widgets/controls" 637 | } 638 | }, 639 | "c54e109435ec4cbf9f9341062f36d68c": { 640 | "model_module": "@jupyter-widgets/base", 641 | "model_name": "LayoutModel", 642 | "state": { 643 | "_view_name": "LayoutView", 644 | "grid_template_rows": null, 645 | "right": null, 646 | "justify_content": null, 647 | "_view_module": "@jupyter-widgets/base", 648 | "overflow": null, 649 | "_model_module_version": "1.2.0", 650 | "_view_count": null, 651 | "flex_flow": null, 652 | "width": null, 653 | "min_width": null, 654 | "border": null, 655 | "align_items": null, 656 | "bottom": null, 657 | "_model_module": "@jupyter-widgets/base", 658 | "top": null, 659 | "grid_column": null, 660 | "overflow_y": null, 661 | "overflow_x": null, 662 | "grid_auto_flow": null, 663 | "grid_area": null, 664 | "grid_template_columns": null, 665 | "flex": null, 666 | "_model_name": "LayoutModel", 667 | "justify_items": null, 668 | "grid_row": null, 669 | "max_height": null, 670 | "align_content": null, 671 | "visibility": null, 672 | "align_self": null, 673 | "height": null, 674 | "min_height": null, 675 | "padding": null, 676 | "grid_auto_rows": null, 677 | "grid_gap": null, 678 | "max_width": null, 679 | "order": null, 680 | "_view_module_version": "1.2.0", 681 | "grid_template_areas": null, 682 | "object_position": null, 683 | "object_fit": null, 684 | "grid_auto_columns": null, 685 | "margin": null, 686 | "display": null, 687 | "left": null 688 | } 689 | }, 690 | "16095eadbfdb4deb94b50592cd07e5b8": { 691 | "model_module": "@jupyter-widgets/controls", 692 | "model_name": "DescriptionStyleModel", 693 | "state": { 694 | "_view_name": "StyleView", 695 | "_model_name": "DescriptionStyleModel", 696 | "description_width": "", 697 | "_view_module": "@jupyter-widgets/base", 698 | "_model_module_version": "1.5.0", 699 | "_view_count": null, 700 | "_view_module_version": "1.2.0", 701 | "_model_module": "@jupyter-widgets/controls" 702 | } 703 | }, 704 | "8053a5b760284ceb8feb1325a23d467d": { 705 | "model_module": "@jupyter-widgets/base", 706 | "model_name": "LayoutModel", 707 | "state": { 708 | "_view_name": "LayoutView", 709 | "grid_template_rows": null, 710 | "right": null, 711 | "justify_content": null, 712 | "_view_module": "@jupyter-widgets/base", 713 | "overflow": null, 714 | "_model_module_version": "1.2.0", 715 | "_view_count": null, 716 | "flex_flow": null, 717 | "width": null, 718 | "min_width": null, 719 | "border": null, 720 | "align_items": null, 721 | "bottom": null, 722 | "_model_module": "@jupyter-widgets/base", 723 | "top": null, 724 | "grid_column": null, 725 | "overflow_y": null, 726 | "overflow_x": null, 727 | "grid_auto_flow": null, 728 | "grid_area": null, 729 | "grid_template_columns": null, 730 | "flex": null, 731 | "_model_name": "LayoutModel", 732 | "justify_items": null, 733 | "grid_row": null, 734 | "max_height": null, 735 | "align_content": null, 736 | "visibility": null, 737 | "align_self": null, 738 | "height": null, 739 | "min_height": null, 740 | "padding": null, 741 | "grid_auto_rows": null, 742 | "grid_gap": null, 743 | "max_width": null, 744 | "order": null, 745 | "_view_module_version": "1.2.0", 746 | "grid_template_areas": null, 747 | "object_position": null, 748 | "object_fit": null, 749 | "grid_auto_columns": null, 750 | "margin": null, 751 | "display": null, 752 | "left": null 753 | } 754 | }, 755 | "b7e60c4e1f41472fb09bcc834e8587be": { 756 | "model_module": "@jupyter-widgets/controls", 757 | "model_name": "HBoxModel", 758 | "state": { 759 | "_view_name": "HBoxView", 760 | "_dom_classes": [], 761 | "_model_name": "HBoxModel", 762 | "_view_module": "@jupyter-widgets/controls", 763 | "_model_module_version": "1.5.0", 764 | "_view_count": null, 765 | "_view_module_version": "1.5.0", 766 | "box_style": "", 767 | "layout": "IPY_MODEL_09c36ba512a2484bbb23032340b7c2cd", 768 | "_model_module": "@jupyter-widgets/controls", 769 | "children": [ 770 | "IPY_MODEL_1b309a6ab22849aaae3dad14622c79a3", 771 | "IPY_MODEL_90859146e926401597f78ad48c7d50eb" 772 | ] 773 | } 774 | }, 775 | "09c36ba512a2484bbb23032340b7c2cd": { 776 | "model_module": "@jupyter-widgets/base", 777 | "model_name": "LayoutModel", 778 | "state": { 779 | "_view_name": "LayoutView", 780 | "grid_template_rows": null, 781 | "right": null, 782 | "justify_content": null, 783 | "_view_module": "@jupyter-widgets/base", 784 | "overflow": null, 785 | "_model_module_version": "1.2.0", 786 | "_view_count": null, 787 | "flex_flow": null, 788 | "width": null, 789 | "min_width": null, 790 | "border": null, 791 | "align_items": null, 792 | "bottom": null, 793 | "_model_module": "@jupyter-widgets/base", 794 | "top": null, 795 | "grid_column": null, 796 | "overflow_y": null, 797 | "overflow_x": null, 798 | "grid_auto_flow": null, 799 | "grid_area": null, 800 | "grid_template_columns": null, 801 | "flex": null, 802 | "_model_name": "LayoutModel", 803 | "justify_items": null, 804 | "grid_row": null, 805 | "max_height": null, 806 | "align_content": null, 807 | "visibility": null, 808 | "align_self": null, 809 | "height": null, 810 | "min_height": null, 811 | "padding": null, 812 | "grid_auto_rows": null, 813 | "grid_gap": null, 814 | "max_width": null, 815 | "order": null, 816 | "_view_module_version": "1.2.0", 817 | "grid_template_areas": null, 818 | "object_position": null, 819 | "object_fit": null, 820 | "grid_auto_columns": null, 821 | "margin": null, 822 | "display": null, 823 | "left": null 824 | } 825 | }, 826 | "1b309a6ab22849aaae3dad14622c79a3": { 827 | "model_module": "@jupyter-widgets/controls", 828 | "model_name": "FloatProgressModel", 829 | "state": { 830 | "_view_name": "ProgressView", 831 | "style": "IPY_MODEL_d36910d5d2bc43f19f46c81e6bb20d13", 832 | "_dom_classes": [], 833 | "description": "Downloading: 100%", 834 | "_model_name": "FloatProgressModel", 835 | "bar_style": "success", 836 | "max": 112, 837 | "_view_module": "@jupyter-widgets/controls", 838 | "_model_module_version": "1.5.0", 839 | "value": 112, 840 | "_view_count": null, 841 | "_view_module_version": "1.5.0", 842 | "orientation": "horizontal", 843 | "min": 0, 844 | "description_tooltip": null, 845 | "_model_module": "@jupyter-widgets/controls", 846 | "layout": "IPY_MODEL_686f9e8ba5f84314bfac6893150e1738" 847 | } 848 | }, 849 | "90859146e926401597f78ad48c7d50eb": { 850 | "model_module": "@jupyter-widgets/controls", 851 | "model_name": "HTMLModel", 852 | "state": { 853 | "_view_name": "HTMLView", 854 | "style": "IPY_MODEL_b3e578abc71046cdaf46c259cda7b479", 855 | "_dom_classes": [], 856 | "description": "", 857 | "_model_name": "HTMLModel", 858 | "placeholder": "​", 859 | "_view_module": "@jupyter-widgets/controls", 860 | "_model_module_version": "1.5.0", 861 | "value": " 112/112 [00:00<00:00, 152B/s]", 862 | "_view_count": null, 863 | "_view_module_version": "1.5.0", 864 | "description_tooltip": null, 865 | "_model_module": "@jupyter-widgets/controls", 866 | "layout": "IPY_MODEL_f934500902b342079d91c55147f39ebd" 867 | } 868 | }, 869 | "d36910d5d2bc43f19f46c81e6bb20d13": { 870 | "model_module": "@jupyter-widgets/controls", 871 | "model_name": "ProgressStyleModel", 872 | "state": { 873 | "_view_name": "StyleView", 874 | "_model_name": "ProgressStyleModel", 875 | "description_width": "initial", 876 | "_view_module": "@jupyter-widgets/base", 877 | "_model_module_version": "1.5.0", 878 | "_view_count": null, 879 | "_view_module_version": "1.2.0", 880 | "bar_color": null, 881 | "_model_module": "@jupyter-widgets/controls" 882 | } 883 | }, 884 | "686f9e8ba5f84314bfac6893150e1738": { 885 | "model_module": "@jupyter-widgets/base", 886 | "model_name": "LayoutModel", 887 | "state": { 888 | "_view_name": "LayoutView", 889 | "grid_template_rows": null, 890 | "right": null, 891 | "justify_content": null, 892 | "_view_module": "@jupyter-widgets/base", 893 | "overflow": null, 894 | "_model_module_version": "1.2.0", 895 | "_view_count": null, 896 | "flex_flow": null, 897 | "width": null, 898 | "min_width": null, 899 | "border": null, 900 | "align_items": null, 901 | "bottom": null, 902 | "_model_module": "@jupyter-widgets/base", 903 | "top": null, 904 | "grid_column": null, 905 | "overflow_y": null, 906 | "overflow_x": null, 907 | "grid_auto_flow": null, 908 | "grid_area": null, 909 | "grid_template_columns": null, 910 | "flex": null, 911 | "_model_name": "LayoutModel", 912 | "justify_items": null, 913 | "grid_row": null, 914 | "max_height": null, 915 | "align_content": null, 916 | "visibility": null, 917 | "align_self": null, 918 | "height": null, 919 | "min_height": null, 920 | "padding": null, 921 | "grid_auto_rows": null, 922 | "grid_gap": null, 923 | "max_width": null, 924 | "order": null, 925 | "_view_module_version": "1.2.0", 926 | "grid_template_areas": null, 927 | "object_position": null, 928 | "object_fit": null, 929 | "grid_auto_columns": null, 930 | "margin": null, 931 | "display": null, 932 | "left": null 933 | } 934 | }, 935 | "b3e578abc71046cdaf46c259cda7b479": { 936 | "model_module": "@jupyter-widgets/controls", 937 | "model_name": "DescriptionStyleModel", 938 | "state": { 939 | "_view_name": "StyleView", 940 | "_model_name": "DescriptionStyleModel", 941 | "description_width": "", 942 | "_view_module": "@jupyter-widgets/base", 943 | "_model_module_version": "1.5.0", 944 | "_view_count": null, 945 | "_view_module_version": "1.2.0", 946 | "_model_module": "@jupyter-widgets/controls" 947 | } 948 | }, 949 | "f934500902b342079d91c55147f39ebd": { 950 | "model_module": "@jupyter-widgets/base", 951 | "model_name": "LayoutModel", 952 | "state": { 953 | "_view_name": "LayoutView", 954 | "grid_template_rows": null, 955 | "right": null, 956 | "justify_content": null, 957 | "_view_module": "@jupyter-widgets/base", 958 | "overflow": null, 959 | "_model_module_version": "1.2.0", 960 | "_view_count": null, 961 | "flex_flow": null, 962 | "width": null, 963 | "min_width": null, 964 | "border": null, 965 | "align_items": null, 966 | "bottom": null, 967 | "_model_module": "@jupyter-widgets/base", 968 | "top": null, 969 | "grid_column": null, 970 | "overflow_y": null, 971 | "overflow_x": null, 972 | "grid_auto_flow": null, 973 | "grid_area": null, 974 | "grid_template_columns": null, 975 | "flex": null, 976 | "_model_name": "LayoutModel", 977 | "justify_items": null, 978 | "grid_row": null, 979 | "max_height": null, 980 | "align_content": null, 981 | "visibility": null, 982 | "align_self": null, 983 | "height": null, 984 | "min_height": null, 985 | "padding": null, 986 | "grid_auto_rows": null, 987 | "grid_gap": null, 988 | "max_width": null, 989 | "order": null, 990 | "_view_module_version": "1.2.0", 991 | "grid_template_areas": null, 992 | "object_position": null, 993 | "object_fit": null, 994 | "grid_auto_columns": null, 995 | "margin": null, 996 | "display": null, 997 | "left": null 998 | } 999 | }, 1000 | "aa3e298c00fb414faab5ee7f6df30820": { 1001 | "model_module": "@jupyter-widgets/controls", 1002 | "model_name": "HBoxModel", 1003 | "state": { 1004 | "_view_name": "HBoxView", 1005 | "_dom_classes": [], 1006 | "_model_name": "HBoxModel", 1007 | "_view_module": "@jupyter-widgets/controls", 1008 | "_model_module_version": "1.5.0", 1009 | "_view_count": null, 1010 | "_view_module_version": "1.5.0", 1011 | "box_style": "", 1012 | "layout": "IPY_MODEL_cac2fca50a404c6e9350b05957786bbb", 1013 | "_model_module": "@jupyter-widgets/controls", 1014 | "children": [ 1015 | "IPY_MODEL_216bfdef980645829c34f96bfabdc6e0", 1016 | "IPY_MODEL_834ed4faf6e94318aee069f4af7bee82" 1017 | ] 1018 | } 1019 | }, 1020 | "cac2fca50a404c6e9350b05957786bbb": { 1021 | "model_module": "@jupyter-widgets/base", 1022 | "model_name": "LayoutModel", 1023 | "state": { 1024 | "_view_name": "LayoutView", 1025 | "grid_template_rows": null, 1026 | "right": null, 1027 | "justify_content": null, 1028 | "_view_module": "@jupyter-widgets/base", 1029 | "overflow": null, 1030 | "_model_module_version": "1.2.0", 1031 | "_view_count": null, 1032 | "flex_flow": null, 1033 | "width": null, 1034 | "min_width": null, 1035 | "border": null, 1036 | "align_items": null, 1037 | "bottom": null, 1038 | "_model_module": "@jupyter-widgets/base", 1039 | "top": null, 1040 | "grid_column": null, 1041 | "overflow_y": null, 1042 | "overflow_x": null, 1043 | "grid_auto_flow": null, 1044 | "grid_area": null, 1045 | "grid_template_columns": null, 1046 | "flex": null, 1047 | "_model_name": "LayoutModel", 1048 | "justify_items": null, 1049 | "grid_row": null, 1050 | "max_height": null, 1051 | "align_content": null, 1052 | "visibility": null, 1053 | "align_self": null, 1054 | "height": null, 1055 | "min_height": null, 1056 | "padding": null, 1057 | "grid_auto_rows": null, 1058 | "grid_gap": null, 1059 | "max_width": null, 1060 | "order": null, 1061 | "_view_module_version": "1.2.0", 1062 | "grid_template_areas": null, 1063 | "object_position": null, 1064 | "object_fit": null, 1065 | "grid_auto_columns": null, 1066 | "margin": null, 1067 | "display": null, 1068 | "left": null 1069 | } 1070 | }, 1071 | "216bfdef980645829c34f96bfabdc6e0": { 1072 | "model_module": "@jupyter-widgets/controls", 1073 | "model_name": "FloatProgressModel", 1074 | "state": { 1075 | "_view_name": "ProgressView", 1076 | "style": "IPY_MODEL_99f243bfd4d54926a506e895dfd227c7", 1077 | "_dom_classes": [], 1078 | "description": "Downloading: 100%", 1079 | "_model_name": "FloatProgressModel", 1080 | "bar_style": "success", 1081 | "max": 152, 1082 | "_view_module": "@jupyter-widgets/controls", 1083 | "_model_module_version": "1.5.0", 1084 | "value": 152, 1085 | "_view_count": null, 1086 | "_view_module_version": "1.5.0", 1087 | "orientation": "horizontal", 1088 | "min": 0, 1089 | "description_tooltip": null, 1090 | "_model_module": "@jupyter-widgets/controls", 1091 | "layout": "IPY_MODEL_aaac5ae2c4e044a488c334454cf365a9" 1092 | } 1093 | }, 1094 | "834ed4faf6e94318aee069f4af7bee82": { 1095 | "model_module": "@jupyter-widgets/controls", 1096 | "model_name": "HTMLModel", 1097 | "state": { 1098 | "_view_name": "HTMLView", 1099 | "style": "IPY_MODEL_d2361da32dbf422cb71b8ace01adf5ab", 1100 | "_dom_classes": [], 1101 | "description": "", 1102 | "_model_name": "HTMLModel", 1103 | "placeholder": "​", 1104 | "_view_module": "@jupyter-widgets/controls", 1105 | "_model_module_version": "1.5.0", 1106 | "value": " 152/152 [00:00<00:00, 1.17kB/s]", 1107 | "_view_count": null, 1108 | "_view_module_version": "1.5.0", 1109 | "description_tooltip": null, 1110 | "_model_module": "@jupyter-widgets/controls", 1111 | "layout": "IPY_MODEL_04be4a0136cd4f8bb3b77c2ca5cd8946" 1112 | } 1113 | }, 1114 | "99f243bfd4d54926a506e895dfd227c7": { 1115 | "model_module": "@jupyter-widgets/controls", 1116 | "model_name": "ProgressStyleModel", 1117 | "state": { 1118 | "_view_name": "StyleView", 1119 | "_model_name": "ProgressStyleModel", 1120 | "description_width": "initial", 1121 | "_view_module": "@jupyter-widgets/base", 1122 | "_model_module_version": "1.5.0", 1123 | "_view_count": null, 1124 | "_view_module_version": "1.2.0", 1125 | "bar_color": null, 1126 | "_model_module": "@jupyter-widgets/controls" 1127 | } 1128 | }, 1129 | "aaac5ae2c4e044a488c334454cf365a9": { 1130 | "model_module": "@jupyter-widgets/base", 1131 | "model_name": "LayoutModel", 1132 | "state": { 1133 | "_view_name": "LayoutView", 1134 | "grid_template_rows": null, 1135 | "right": null, 1136 | "justify_content": null, 1137 | "_view_module": "@jupyter-widgets/base", 1138 | "overflow": null, 1139 | "_model_module_version": "1.2.0", 1140 | "_view_count": null, 1141 | "flex_flow": null, 1142 | "width": null, 1143 | "min_width": null, 1144 | "border": null, 1145 | "align_items": null, 1146 | "bottom": null, 1147 | "_model_module": "@jupyter-widgets/base", 1148 | "top": null, 1149 | "grid_column": null, 1150 | "overflow_y": null, 1151 | "overflow_x": null, 1152 | "grid_auto_flow": null, 1153 | "grid_area": null, 1154 | "grid_template_columns": null, 1155 | "flex": null, 1156 | "_model_name": "LayoutModel", 1157 | "justify_items": null, 1158 | "grid_row": null, 1159 | "max_height": null, 1160 | "align_content": null, 1161 | "visibility": null, 1162 | "align_self": null, 1163 | "height": null, 1164 | "min_height": null, 1165 | "padding": null, 1166 | "grid_auto_rows": null, 1167 | "grid_gap": null, 1168 | "max_width": null, 1169 | "order": null, 1170 | "_view_module_version": "1.2.0", 1171 | "grid_template_areas": null, 1172 | "object_position": null, 1173 | "object_fit": null, 1174 | "grid_auto_columns": null, 1175 | "margin": null, 1176 | "display": null, 1177 | "left": null 1178 | } 1179 | }, 1180 | "d2361da32dbf422cb71b8ace01adf5ab": { 1181 | "model_module": "@jupyter-widgets/controls", 1182 | "model_name": "DescriptionStyleModel", 1183 | "state": { 1184 | "_view_name": "StyleView", 1185 | "_model_name": "DescriptionStyleModel", 1186 | "description_width": "", 1187 | "_view_module": "@jupyter-widgets/base", 1188 | "_model_module_version": "1.5.0", 1189 | "_view_count": null, 1190 | "_view_module_version": "1.2.0", 1191 | "_model_module": "@jupyter-widgets/controls" 1192 | } 1193 | }, 1194 | "04be4a0136cd4f8bb3b77c2ca5cd8946": { 1195 | "model_module": "@jupyter-widgets/base", 1196 | "model_name": "LayoutModel", 1197 | "state": { 1198 | "_view_name": "LayoutView", 1199 | "grid_template_rows": null, 1200 | "right": null, 1201 | "justify_content": null, 1202 | "_view_module": "@jupyter-widgets/base", 1203 | "overflow": null, 1204 | "_model_module_version": "1.2.0", 1205 | "_view_count": null, 1206 | "flex_flow": null, 1207 | "width": null, 1208 | "min_width": null, 1209 | "border": null, 1210 | "align_items": null, 1211 | "bottom": null, 1212 | "_model_module": "@jupyter-widgets/base", 1213 | "top": null, 1214 | "grid_column": null, 1215 | "overflow_y": null, 1216 | "overflow_x": null, 1217 | "grid_auto_flow": null, 1218 | "grid_area": null, 1219 | "grid_template_columns": null, 1220 | "flex": null, 1221 | "_model_name": "LayoutModel", 1222 | "justify_items": null, 1223 | "grid_row": null, 1224 | "max_height": null, 1225 | "align_content": null, 1226 | "visibility": null, 1227 | "align_self": null, 1228 | "height": null, 1229 | "min_height": null, 1230 | "padding": null, 1231 | "grid_auto_rows": null, 1232 | "grid_gap": null, 1233 | "max_width": null, 1234 | "order": null, 1235 | "_view_module_version": "1.2.0", 1236 | "grid_template_areas": null, 1237 | "object_position": null, 1238 | "object_fit": null, 1239 | "grid_auto_columns": null, 1240 | "margin": null, 1241 | "display": null, 1242 | "left": null 1243 | } 1244 | } 1245 | } 1246 | } 1247 | }, 1248 | "cells": [ 1249 | { 1250 | "cell_type": "markdown", 1251 | "metadata": { 1252 | "id": "k-Y5kDvq0olg" 1253 | }, 1254 | "source": [ 1255 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hate-alert/Tutorial-ICWSM-2021/blob/main/Demos/Multilingual_abuse_predictor.ipynb)" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "markdown", 1260 | "metadata": { 1261 | "id": "JuTVFcXfMZO9" 1262 | }, 1263 | "source": [ 1264 | "# **Multilingual abuse predictor**\n", 1265 | "> This tool provides a suite of classifiers for different abuse detection tasks in a multilingual setting. This tool is provided by [Hate-alert](https://github.com/hate-alert)\n", 1266 | "\n", 1267 | "\n", 1268 | "![Multilingual](https://cdn.eventplanner.net/imgs/xnr8886_how-to-run-an-efficient-multilingual-conference.jpg)\n", 1269 | "\n" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "markdown", 1274 | "metadata": { 1275 | "id": "prujIqc62_9c" 1276 | }, 1277 | "source": [ 1278 | "# **Install necessary modules**\n", 1279 | "#### this cell will install transformers and other necessary packages required for running the code\n" 1280 | ] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "metadata": { 1285 | "id": "CKUqwmKu3t_d" 1286 | }, 1287 | "source": [ 1288 | "%%capture\n", 1289 | "!pip install transformers\n", 1290 | "!pip install transformers[sentencepiece]\n", 1291 | "!pip install ekphrasis\n", 1292 | "!git clone https://github.com/hate-alert/Tutorial-ICWSM-2021.git" 1293 | ], 1294 | "execution_count": 6, 1295 | "outputs": [] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "metadata": { 1300 | "colab": { 1301 | "base_uri": "https://localhost:8080/" 1302 | }, 1303 | "id": "_CUqXC8OQW2e", 1304 | "outputId": "9534bcec-ef8f-41a7-898c-0a7c6b3be044" 1305 | }, 1306 | "source": [ 1307 | "cd Tutorial-ICWSM-2021" 1308 | ], 1309 | "execution_count": 7, 1310 | "outputs": [ 1311 | { 1312 | "output_type": "stream", 1313 | "text": [ 1314 | "/content/Tutorial-ICWSM-2021\n" 1315 | ], 1316 | "name": "stdout" 1317 | } 1318 | ] 1319 | }, 1320 | { 1321 | "cell_type": "code", 1322 | "metadata": { 1323 | "id": "WTB7UXsm3_mF" 1324 | }, 1325 | "source": [ 1326 | "%%capture\n", 1327 | "import transformers\n", 1328 | "import random\n", 1329 | "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", 1330 | "from transformers import BertForSequenceClassification\n", 1331 | "from transformers import XLMRobertaForSequenceClassification\n", 1332 | "import torch.nn as nn\n", 1333 | "import torch\n", 1334 | "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", 1335 | "import re\n", 1336 | "import torch.nn.functional as F\n", 1337 | "import numpy as np\n", 1338 | "from Code.utils import *\n", 1339 | "from Code.model import *\n", 1340 | "from Code.predictions import *" 1341 | ], 1342 | "execution_count": 8, 1343 | "outputs": [] 1344 | }, 1345 | { 1346 | "cell_type": "markdown", 1347 | "metadata": { 1348 | "id": "M7BOzLjVZIZK" 1349 | }, 1350 | "source": [ 1351 | "### **Set GPU** : \n", 1352 | "> This will select the device based on your current configuration. Select Runtime --> change runtimetype and select GPU as hardware accelerator to use GPU. " 1353 | ] 1354 | }, 1355 | { 1356 | "cell_type": "code", 1357 | "metadata": { 1358 | "id": "SZbRedQYZKZ1" 1359 | }, 1360 | "source": [ 1361 | "if torch.cuda.is_available():\n", 1362 | " device = torch.device(\"cuda\")\n", 1363 | "else:\n", 1364 | " device = torch.device(\"cpu\")" 1365 | ], 1366 | "execution_count": 9, 1367 | "outputs": [] 1368 | }, 1369 | { 1370 | "cell_type": "code", 1371 | "metadata": { 1372 | "id": "UWgKwG1oVUQF" 1373 | }, 1374 | "source": [ 1375 | "def getDatasetPrediction(dataset,config):\n", 1376 | " labels=model.return_probab(dataset['Sentences'])\n", 1377 | " predictions = {}\n", 1378 | " for index, row in dataset.iterrows():\n", 1379 | " \n", 1380 | " dict1={}\n", 1381 | " dict1['Sentence']=row['Sentences']\n", 1382 | " dict_labels={}\n", 1383 | " for ele in config:\n", 1384 | " dict_labels[config[ele]]=round(labels[index][ele],3)\n", 1385 | " dict1[\"Labels\"]=dict_labels\n", 1386 | " predictions[row['Index']] = dict1\n", 1387 | " return predictions\n", 1388 | "\n", 1389 | "def getRandomTextFromPred(pred = None):\n", 1390 | " return random.choice(list(prediction.items()))" 1391 | ], 1392 | "execution_count": 10, 1393 | "outputs": [] 1394 | }, 1395 | { 1396 | "cell_type": "markdown", 1397 | "metadata": { 1398 | "id": "LYPI3C6MPm38" 1399 | }, 1400 | "source": [ 1401 | "# **Models and their origins**\n", 1402 | "Here different models have different origins in terms of the dataset and prediction they user\n", 1403 | "* **Kannada, Malaylam, Telugu models are trained using the recent competition Offensive Language [shared task](https://competitions.codalab.org/competitions/27654) at DravidianLangTech workshop in EACL 2021** \n", 1404 | "\n", 1405 | "These are XLM-R models and has the following labels\n", 1406 | "> Not_in_intended_language, Not_offensive , Off_target_group (*offensive targetting group*),\n", 1407 | " Off_target_ind (*offensive targetting individual*), Profanity (*presence of slur*)\n", 1408 | "\n", 1409 | "#### **If used cite this** \n", 1410 | "```\n", 1411 | "@misc{saha2021hatealertdravidianlangtecheacl2021,\n", 1412 | " title={Hate-Alert@DravidianLangTech-EACL2021: Ensembling strategies for Transformer-based Offensive language Detection}, \n", 1413 | " author={Debjoy Saha and Naman Paharia and Debajit Chakraborty and Punyajoy Saha and Animesh Mukherjee},\n", 1414 | " year={2021},\n", 1415 | " eprint={2102.10084},\n", 1416 | " archivePrefix={arXiv},\n", 1417 | " primaryClass={cs.CL}\n", 1418 | "}\n", 1419 | "```\n", 1420 | "\n", 1421 | "\n", 1422 | "* **English_hatexplain is a model trained using the [hatexplain dataset](https://huggingface.co/datasets/hatexplain)** \n", 1423 | "\n", 1424 | "This is a BERT-BASE-UNCASED model trained with hateful, offensive and normal labels\n", 1425 | "\n", 1426 | "\n", 1427 | "#### **If used cite this** \n", 1428 | "```\n", 1429 | "@article{mathew2020hatexplain,\n", 1430 | " title={HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection},\n", 1431 | " author={Mathew, Binny and Saha, Punyajoy and Yimam, Seid Muhie and Biemann, Chris and Goyal, Pawan and Mukherjee, Animesh},\n", 1432 | " journal={arXiv preprint arXiv:2012.10289},\n", 1433 | " year={2020}\n", 1434 | "}\n", 1435 | "```\n", 1436 | "* **Rest of the models are trained using the datasets in the [DELIMIT repo](https://github.com/hate-alert/DE-LIMIT)** \n", 1437 | "This is a BERT-BASE-UNCASED model trained with hateful, non hateful labels\n", 1438 | "\n", 1439 | "#### **If used cite this** \n", 1440 | "```\n", 1441 | "@article{aluru2020deep,\n", 1442 | " title={Deep Learning Models for Multilingual Hate Speech Detection},\n", 1443 | " author={Aluru, Sai Saket and Mathew, Binny and Saha, Punyajoy and Mukherjee, Animesh},\n", 1444 | " journal={arXiv preprint arXiv:2004.06465},\n", 1445 | " year={2020}\n", 1446 | "}\n", 1447 | "```\n", 1448 | "\n" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "code", 1453 | "metadata": { 1454 | "id": "eBsVw6f7m0Vv" 1455 | }, 1456 | "source": [ 1457 | "#@title ### **Select a language**\n", 1458 | "Language = \"English\" #@param [\"Arabic\", \"English\", \"French\", \"German\", \"Indonesian\", \"Polish\", \"Portugese\", \"Italian\", \"Spanish\", \"Kannada\", \"Malyalam\", \"Tamil\", \"English_hatexplain\"]\n" 1459 | ], 1460 | "execution_count": 17, 1461 | "outputs": [] 1462 | }, 1463 | { 1464 | "cell_type": "code", 1465 | "metadata": { 1466 | "id": "5rFPd30qXYVi", 1467 | "colab": { 1468 | "base_uri": "https://localhost:8080/", 1469 | "height": 116, 1470 | "referenced_widgets": [ 1471 | "fd6617d95fea499dbeb25332b62b4602", 1472 | "21a33a9cb69e43ed84338e353a742643", 1473 | "5b4da2787cfd4b629a59b6ba80d88fc8", 1474 | "18238948093e43579ef0b78716800322", 1475 | "76bcd6d2964d47b885dac60582cd92ce", 1476 | "ba11daa1d3d2433db25af1e2dbb04a93", 1477 | "64689e87c0c94c6a92b99a484382939e", 1478 | "4bdaf2d131db4769bd77c278f1f0842f", 1479 | "9892dc2a0b174fd5a87f092efe33db93", 1480 | "65d060b22f134245affa3c3be8b747b9", 1481 | "89c3f9cf413b4d27b81bb3f81fc3282d", 1482 | "25b23313da404eea8b547e89012822f6", 1483 | "45dbc7a3a5194be4bee51f6067a860b2", 1484 | "a12eea961709413389734e80e6c0cb67", 1485 | "8219eeb0125d4a1bae3d598a6680be76", 1486 | "f324e3737cad414894c3ca5c6f2d4b41" 1487 | ] 1488 | }, 1489 | "outputId": "9f566f18-acad-4654-c242-0423ea1afce4" 1490 | }, 1491 | "source": [ 1492 | "model = modelPred(language=Language.lower(), device=device)" 1493 | ], 1494 | "execution_count": 18, 1495 | "outputs": [ 1496 | { 1497 | "output_type": "display_data", 1498 | "data": { 1499 | "application/vnd.jupyter.widget-view+json": { 1500 | "model_id": "fd6617d95fea499dbeb25332b62b4602", 1501 | "version_minor": 0, 1502 | "version_major": 2 1503 | }, 1504 | "text/plain": [ 1505 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1225.0, style=ProgressStyle(description…" 1506 | ] 1507 | }, 1508 | "metadata": { 1509 | "tags": [] 1510 | } 1511 | }, 1512 | { 1513 | "output_type": "stream", 1514 | "text": [ 1515 | "\n" 1516 | ], 1517 | "name": "stdout" 1518 | }, 1519 | { 1520 | "output_type": "display_data", 1521 | "data": { 1522 | "application/vnd.jupyter.widget-view+json": { 1523 | "model_id": "9892dc2a0b174fd5a87f092efe33db93", 1524 | "version_minor": 0, 1525 | "version_major": 2 1526 | }, 1527 | "text/plain": [ 1528 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=669482093.0, style=ProgressStyle(descri…" 1529 | ] 1530 | }, 1531 | "metadata": { 1532 | "tags": [] 1533 | } 1534 | }, 1535 | { 1536 | "output_type": "stream", 1537 | "text": [ 1538 | "\n" 1539 | ], 1540 | "name": "stdout" 1541 | } 1542 | ] 1543 | }, 1544 | { 1545 | "cell_type": "code", 1546 | "metadata": { 1547 | "cellView": "form", 1548 | "id": "oh217P4G1Ksc" 1549 | }, 1550 | "source": [ 1551 | "#@title **How do you want to enter text ?**\n", 1552 | "# @markdown You can either directly enter the text (text input) or uppload from a csv (file)\n", 1553 | "input_type = \"text input\" #@param [\"file\", \"text input\"]" 1554 | ], 1555 | "execution_count": 19, 1556 | "outputs": [] 1557 | }, 1558 | { 1559 | "cell_type": "code", 1560 | "metadata": { 1561 | "id": "jkFxfc1IZl4w", 1562 | "colab": { 1563 | "base_uri": "https://localhost:8080/" 1564 | }, 1565 | "outputId": "b0f74966-e2bb-41f1-e64d-9fccf6800af8" 1566 | }, 1567 | "source": [ 1568 | "import io\n", 1569 | "import pandas as pd\n", 1570 | "\n", 1571 | "if input_type == \"text input\":\n", 1572 | " text_input = input(\"Write the post: \")\n", 1573 | " dataset=[text_input]\n", 1574 | "else:\n", 1575 | " print(\"Please upload the csv file you want to get predictions\")\n", 1576 | " print(\"Please make sure the column name of the csv should be Index, Sentences\")\n", 1577 | " from google.colab import files\n", 1578 | " uploaded = files.upload()\n", 1579 | " dataset = pd.read_csv(io.BytesIO(uploaded[list(uploaded)[0]])).reset_index()" 1580 | ], 1581 | "execution_count": 20, 1582 | "outputs": [ 1583 | { 1584 | "output_type": "stream", 1585 | "text": [ 1586 | "Write the post: I hate nigger\n" 1587 | ], 1588 | "name": "stdout" 1589 | } 1590 | ] 1591 | }, 1592 | { 1593 | "cell_type": "code", 1594 | "metadata": { 1595 | "colab": { 1596 | "base_uri": "https://localhost:8080/", 1597 | "height": 200, 1598 | "referenced_widgets": [ 1599 | "bbfd351359a2433cad1bc54e768f41b8", 1600 | "4a19d08ac48b44bcb4ba4c172447fc27", 1601 | "3d2d574bb15640a8a7e4e1a85998be67", 1602 | "d28f3afe51a041f2a9cf820f3c03e730", 1603 | "297aea76ea23441ea64ce49e83920a79", 1604 | "c54e109435ec4cbf9f9341062f36d68c", 1605 | "16095eadbfdb4deb94b50592cd07e5b8", 1606 | "8053a5b760284ceb8feb1325a23d467d", 1607 | "b7e60c4e1f41472fb09bcc834e8587be", 1608 | "09c36ba512a2484bbb23032340b7c2cd", 1609 | "1b309a6ab22849aaae3dad14622c79a3", 1610 | "90859146e926401597f78ad48c7d50eb", 1611 | "d36910d5d2bc43f19f46c81e6bb20d13", 1612 | "686f9e8ba5f84314bfac6893150e1738", 1613 | "b3e578abc71046cdaf46c259cda7b479", 1614 | "f934500902b342079d91c55147f39ebd", 1615 | "aa3e298c00fb414faab5ee7f6df30820", 1616 | "cac2fca50a404c6e9350b05957786bbb", 1617 | "216bfdef980645829c34f96bfabdc6e0", 1618 | "834ed4faf6e94318aee069f4af7bee82", 1619 | "99f243bfd4d54926a506e895dfd227c7", 1620 | "aaac5ae2c4e044a488c334454cf365a9", 1621 | "d2361da32dbf422cb71b8ace01adf5ab", 1622 | "04be4a0136cd4f8bb3b77c2ca5cd8946" 1623 | ] 1624 | }, 1625 | "id": "W1rQS0GzT8nn", 1626 | "outputId": "8b18e1fe-4f78-4842-a201-4df64212c526" 1627 | }, 1628 | "source": [ 1629 | "if input_type == \"text input\":\n", 1630 | " labels=model.return_probab(dataset)\n", 1631 | " dict1={}\n", 1632 | " dict1['Sentence']=dataset[0]\n", 1633 | " dict_labels={}\n", 1634 | " config=model.config.id2label\n", 1635 | " for ele in config:\n", 1636 | " dict_labels[config[ele]]=round(labels[0][ele],3)\n", 1637 | " dict1[\"Labels\"]=dict_labels\n", 1638 | " print(dict1)\n", 1639 | "else:\n", 1640 | " prediction = getDatasetPrediction(dataset,model.config.id2label)\n", 1641 | " print(getRandomTextFromPred(prediction))" 1642 | ], 1643 | "execution_count": 21, 1644 | "outputs": [ 1645 | { 1646 | "output_type": "display_data", 1647 | "data": { 1648 | "application/vnd.jupyter.widget-view+json": { 1649 | "model_id": "bbfd351359a2433cad1bc54e768f41b8", 1650 | "version_minor": 0, 1651 | "version_major": 2 1652 | }, 1653 | "text/plain": [ 1654 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…" 1655 | ] 1656 | }, 1657 | "metadata": { 1658 | "tags": [] 1659 | } 1660 | }, 1661 | { 1662 | "output_type": "stream", 1663 | "text": [ 1664 | "\n" 1665 | ], 1666 | "name": "stdout" 1667 | }, 1668 | { 1669 | "output_type": "display_data", 1670 | "data": { 1671 | "application/vnd.jupyter.widget-view+json": { 1672 | "model_id": "b7e60c4e1f41472fb09bcc834e8587be", 1673 | "version_minor": 0, 1674 | "version_major": 2 1675 | }, 1676 | "text/plain": [ 1677 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…" 1678 | ] 1679 | }, 1680 | "metadata": { 1681 | "tags": [] 1682 | } 1683 | }, 1684 | { 1685 | "output_type": "stream", 1686 | "text": [ 1687 | "\n" 1688 | ], 1689 | "name": "stdout" 1690 | }, 1691 | { 1692 | "output_type": "display_data", 1693 | "data": { 1694 | "application/vnd.jupyter.widget-view+json": { 1695 | "model_id": "aa3e298c00fb414faab5ee7f6df30820", 1696 | "version_minor": 0, 1697 | "version_major": 2 1698 | }, 1699 | "text/plain": [ 1700 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=152.0, style=ProgressStyle(description_…" 1701 | ] 1702 | }, 1703 | "metadata": { 1704 | "tags": [] 1705 | } 1706 | }, 1707 | { 1708 | "output_type": "stream", 1709 | "text": [ 1710 | "\n", 1711 | "Running eval on test data...\n", 1712 | "{'Sentence': 'I hate nigger', 'Labels': {'NON_HATE': 0.091, 'HATE': 0.909}}\n" 1713 | ], 1714 | "name": "stdout" 1715 | } 1716 | ] 1717 | }, 1718 | { 1719 | "cell_type": "markdown", 1720 | "metadata": { 1721 | "id": "11B0F2cOXKbP" 1722 | }, 1723 | "source": [ 1724 | "### **Word of caution**\n", 1725 | "\n", 1726 | "> Model used here have any trained using a particular dataset and they may carry some bias or errors, they should be only used as a complementary labels in case of any analysis." 1727 | ] 1728 | }, 1729 | { 1730 | "cell_type": "markdown", 1731 | "metadata": { 1732 | "id": "SP6sUTQwzYhU" 1733 | }, 1734 | "source": [ 1735 | "#**Download the file generated**\n", 1736 | "#### Run the next cell and select the destination folder\n" 1737 | ] 1738 | }, 1739 | { 1740 | "cell_type": "code", 1741 | "metadata": { 1742 | "colab": { 1743 | "base_uri": "https://localhost:8080/", 1744 | "height": 17 1745 | }, 1746 | "id": "z8EjG7goWTc-", 1747 | "outputId": "131d2cdd-1561-42ce-92c7-0d40d9f1081d" 1748 | }, 1749 | "source": [ 1750 | "from google.colab import files\n", 1751 | "import json\n", 1752 | "if input_type != \"text input\":\n", 1753 | " with open('predictions.json', 'w') as f:\n", 1754 | " json_string = json.dumps(predictions, cls=NumpyEncoder, sort_keys=True, indent=4)\n", 1755 | " f.write(json_string)\n", 1756 | " files.download('predictions.json')\n", 1757 | "else:\n", 1758 | " print(\"No file input given\")" 1759 | ], 1760 | "execution_count": null, 1761 | "outputs": [ 1762 | { 1763 | "output_type": "display_data", 1764 | "data": { 1765 | "application/javascript": [ 1766 | "\n", 1767 | " async function download(id, filename, size) {\n", 1768 | " if (!google.colab.kernel.accessAllowed) {\n", 1769 | " return;\n", 1770 | " }\n", 1771 | " const div = document.createElement('div');\n", 1772 | " const label = document.createElement('label');\n", 1773 | " label.textContent = `Downloading \"${filename}\": `;\n", 1774 | " div.appendChild(label);\n", 1775 | " const progress = document.createElement('progress');\n", 1776 | " progress.max = size;\n", 1777 | " div.appendChild(progress);\n", 1778 | " document.body.appendChild(div);\n", 1779 | "\n", 1780 | " const buffers = [];\n", 1781 | " let downloaded = 0;\n", 1782 | "\n", 1783 | " const channel = await google.colab.kernel.comms.open(id);\n", 1784 | " // Send a message to notify the kernel that we're ready.\n", 1785 | " channel.send({})\n", 1786 | "\n", 1787 | " for await (const message of channel.messages) {\n", 1788 | " // Send a message to notify the kernel that we're ready.\n", 1789 | " channel.send({})\n", 1790 | " if (message.buffers) {\n", 1791 | " for (const buffer of message.buffers) {\n", 1792 | " buffers.push(buffer);\n", 1793 | " downloaded += buffer.byteLength;\n", 1794 | " progress.value = downloaded;\n", 1795 | " }\n", 1796 | " }\n", 1797 | " }\n", 1798 | " const blob = new Blob(buffers, {type: 'application/binary'});\n", 1799 | " const a = document.createElement('a');\n", 1800 | " a.href = window.URL.createObjectURL(blob);\n", 1801 | " a.download = filename;\n", 1802 | " div.appendChild(a);\n", 1803 | " a.click();\n", 1804 | " div.remove();\n", 1805 | " }\n", 1806 | " " 1807 | ], 1808 | "text/plain": [ 1809 | "" 1810 | ] 1811 | }, 1812 | "metadata": { 1813 | "tags": [] 1814 | } 1815 | }, 1816 | { 1817 | "output_type": "display_data", 1818 | "data": { 1819 | "application/javascript": [ 1820 | "download(\"download_fd4bbf57-8bf3-4fe7-b3a0-49b7a6ef5589\", \"predictions.json\", 354)" 1821 | ], 1822 | "text/plain": [ 1823 | "" 1824 | ] 1825 | }, 1826 | "metadata": { 1827 | "tags": [] 1828 | } 1829 | } 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "code", 1834 | "metadata": { 1835 | "id": "LiEMpkzGjSA9" 1836 | }, 1837 | "source": [ 1838 | "" 1839 | ], 1840 | "execution_count": null, 1841 | "outputs": [] 1842 | } 1843 | ] 1844 | } -------------------------------------------------------------------------------- /Demos/Rationale_predictor_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Rationale_predictor_demo.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "display_name": "Python 3", 12 | "name": "python3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "MBD0BwuKzhbK" 23 | }, 24 | "source": [ 25 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hate-alert/Tutorial-ICWSM-2021/blob/main/Demos/Rationale_predictor_demo.ipynb)\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "rU55lwj03fsJ" 32 | }, 33 | "source": [ 34 | "# **Rationale and Label predictor for Abusive speech**\n", 35 | "> Here, we present a tool which can predict both rationale and labels given a dataset having unknown label. \n", 36 | "This tool is provided by [Hate-alert](https://github.com/hate-alert)\n", 37 | "\n", 38 | "![hate speech](https://www.media-diversity.org/wp-content/uploads/2021/03/shutterstock_1523413514-780x520.jpg)\n", 39 | "\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "id": "b8c3xcqtwe6c" 46 | }, 47 | "source": [ 48 | "#**Install necessary modules**\n", 49 | "####this cell will install transformers and other necessary packages required for running the code\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "metadata": { 55 | "id": "CKUqwmKu3t_d" 56 | }, 57 | "source": [ 58 | "%%capture\n", 59 | "!pip install transformers\n", 60 | "!pip install torch\n", 61 | "!pip install ekphrasis\n", 62 | "!git clone https://github.com/hate-alert/Tutorial-ICWSM-2021.git\n" 63 | ], 64 | "execution_count": 1, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "colab": { 71 | "base_uri": "https://localhost:8080/" 72 | }, 73 | "id": "VkJAnGuN_gLB", 74 | "outputId": "cb5dbd80-ee5c-4256-ab2e-c938a36d93f3" 75 | }, 76 | "source": [ 77 | "cd Tutorial-ICWSM-2021" 78 | ], 79 | "execution_count": 2, 80 | "outputs": [ 81 | { 82 | "output_type": "stream", 83 | "text": [ 84 | "/content/Tutorial-ICWSM-2021\n" 85 | ], 86 | "name": "stdout" 87 | } 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "id": "I_9MmtLF785n" 94 | }, 95 | "source": [ 96 | "#### **Import necessary modules**\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "WTB7UXsm3_mF" 103 | }, 104 | "source": [ 105 | "%%capture\n", 106 | "import transformers\n", 107 | "from transformers import AutoTokenizer\n", 108 | "from transformers import BertForTokenClassification, BertForSequenceClassification,BertPreTrainedModel, BertModel\n", 109 | "import torch.nn as nn\n", 110 | "import torch\n", 111 | "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", 112 | "import re\n", 113 | "import random\n", 114 | "import torch.nn.functional as F\n", 115 | "import numpy as np\n", 116 | "from Code.utils import *\n", 117 | "from Code.model import *\n", 118 | "from Code.predictions import *" 119 | ], 120 | "execution_count": 3, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "qd26Eq_pXXP1" 127 | }, 128 | "source": [ 129 | "### **Set GPU** : \n", 130 | "> This will select the device based on your current configuration. Select Runtime --> change runtimetype and select GPU as hardware accelerator to use GPU. " 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "id": "YjbiE-E6Fl0v" 137 | }, 138 | "source": [ 139 | "if torch.cuda.is_available():\n", 140 | " device = torch.device(\"cuda\")\n", 141 | "else:\n", 142 | " device = torch.device(\"cpu\")" 143 | ], 144 | "execution_count": 4, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "X47xZEZ3UqwD" 151 | }, 152 | "source": [ 153 | "# **Model and its origin:-**\n", 154 | "\n", 155 | "* **The model here was trained using the [hatexplain dataset](https://huggingface.co/datasets/hatexplain)** \n", 156 | "\n", 157 | "This is a BERT-BASE-UNCASED model trained with label predictor and rationale predictor head\n", 158 | "\n", 159 | "> **Labels** -- abusive (hateful/offensive) and normal labels \n", 160 | "> **Rationales** -- Highlighted the words which can justify the label selected by the annotators (only for abusive labels)\n", 161 | "\n", 162 | "\n", 163 | "#### **If used cite this** \n", 164 | "```\n", 165 | "@article{mathew2020hatexplain,\n", 166 | " title={HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection},\n", 167 | " author={Mathew, Binny and Saha, Punyajoy and Yimam, Seid Muhie and Biemann, Chris and Goyal, Pawan and Mukherjee, Animesh},\n", 168 | " journal={arXiv preprint arXiv:2012.10289},\n", 169 | " year={2020}\n", 170 | "}\n", 171 | "```" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "V8_FUrHCC5FQ" 178 | }, 179 | "source": [ 180 | "model = modelPredRationale(model_path='Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two',device=device)" 181 | ], 182 | "execution_count": 34, 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "-M-ma7nPBHL7" 189 | }, 190 | "source": [ 191 | "#@title **How do you want to enter text ?**\n", 192 | "# @markdown You can either directly enter the text (text input) or uppload from a csv (file)\n", 193 | "input_type = \"text input\" #@param [\"file\", \"text input\"]" 194 | ], 195 | "execution_count": 82, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "e8Ut_xu0BV3A", 202 | "colab": { 203 | "base_uri": "https://localhost:8080/" 204 | }, 205 | "outputId": "d4d2cf03-3c64-4eb2-fbb1-a4fce38f141c" 206 | }, 207 | "source": [ 208 | "import io\n", 209 | "import pandas as pd\n", 210 | "\n", 211 | "if input_type == \"text input\":\n", 212 | " text_input = input(\"Write the post: \")\n", 213 | " dataset=[text_input]\n", 214 | "else:\n", 215 | " print(\"Please upload the csv file you want to get predictions\")\n", 216 | " print(\"Please make sure the column name of the csv should be Index, Sentences\")\n", 217 | " from google.colab import files\n", 218 | " uploaded = files.upload()\n", 219 | " dataset = pd.read_csv(io.BytesIO(uploaded[list(uploaded)[0]])).reset_index()" 220 | ], 221 | "execution_count": 83, 222 | "outputs": [ 223 | { 224 | "output_type": "stream", 225 | "text": [ 226 | "Write the post: you deserve death if you voted yes cruz that dumbass spic was destined to lose\n" 227 | ], 228 | "name": "stdout" 229 | } 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "metadata": { 235 | "id": "en7viZPEmF3j" 236 | }, 237 | "source": [ 238 | "def getDatasetPrediction(dataset,config):\n", 239 | " labels,attention,sents=model.return_rationales(dataset['Sentences'])\n", 240 | " predictions = {}\n", 241 | " for index, row in dataset.iterrows():\n", 242 | " dict1={}\n", 243 | " dict1['Sentence']=row['Sentences']\n", 244 | " dict_labels={}\n", 245 | " for ele in config:\n", 246 | " dict_labels[config[ele]]=round(labels[index][ele],3)\n", 247 | " dict1[\"Labels\"]=dict_labels\n", 248 | "\n", 249 | " dict1[\"Tokens\"] = sents[index]\n", 250 | " if (np.argmax(labels[index])==0):\n", 251 | " dict1[\"Rationale\"]=list(np.zeros(len(sents[index])))\n", 252 | " else:\n", 253 | " dict1[\"Rationale\"]=attention[index][0:len(dict1[\"Tokens\"])]\n", 254 | " predictions[row['Index']] = dict1\n", 255 | " return predictions\n", 256 | "\n", 257 | "def getRandomTextFromPred(pred = None):\n", 258 | " return random.choice(list(pred.items()))" 259 | ], 260 | "execution_count": 84, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "metadata": { 266 | "colab": { 267 | "base_uri": "https://localhost:8080/" 268 | }, 269 | "id": "_93XaDK9oIU6", 270 | "outputId": "f3f1c657-0c37-469b-8b2a-67961fbbe788" 271 | }, 272 | "source": [ 273 | "if input_type == \"text input\":\n", 274 | " labels,attention,sents=model.return_rationales(dataset)\n", 275 | " \n", 276 | "else:\n", 277 | " config=model.config.id2label\n", 278 | " predictions= getDatasetPrediction(dataset,config)" 279 | ], 280 | "execution_count": 85, 281 | "outputs": [ 282 | { 283 | "output_type": "stream", 284 | "text": [ 285 | "Running eval on test data...\n" 286 | ], 287 | "name": "stdout" 288 | } 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "metadata": { 294 | "id": "UAUm4mfCZPrG" 295 | }, 296 | "source": [ 297 | "def show_attention_rationale(tokenized_text,attention_vector):\n", 298 | " char_vals = [CharVal(c, v) for c, v in zip(tokenized_text, attention_vector)]\n", 299 | " char_df = pd.DataFrame(char_vals).transpose()\n", 300 | " char_df = char_df.style.applymap(color_charvals_rationale)\n", 301 | " return char_df" 302 | ], 303 | "execution_count": 86, 304 | "outputs": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "metadata": { 309 | "id": "Vo4gQwHN_Ewc" 310 | }, 311 | "source": [ 312 | "### Show a random text with rationales, No rationales will mean the text is not predicted as abusive by the model.\n", 313 | "if input_type != \"text input\":\n", 314 | " pred=getRandomTextFromPred(predictions)\n", 315 | " print(pred)\n", 316 | " char_df=show_attention_rationale(pred[1]['Tokens'],pred[1]['Rationale'])\n", 317 | "else:\n", 318 | " pred= {'Normal':labels[0][0], 'Abusive':labels[0][1]}\n", 319 | " if (np.argmax(labels[0])==0):\n", 320 | " attention=[list(np.zeros(len(sents[0])))]\n", 321 | " else:\n", 322 | " pass \n", 323 | " char_df=show_attention_rationale(sents[0], attention[0])" 324 | ], 325 | "execution_count": 87, 326 | "outputs": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "metadata": { 331 | "colab": { 332 | "base_uri": "https://localhost:8080/", 333 | "height": 79 334 | }, 335 | "id": "sp0aMAhszN-G", 336 | "outputId": "dc8b4100-1606-492c-f502-fad26d81a1a2" 337 | }, 338 | "source": [ 339 | "print(\"Prediction probablities:\", pred)\n", 340 | "char_df" 341 | ], 342 | "execution_count": 88, 343 | "outputs": [ 344 | { 345 | "output_type": "stream", 346 | "text": [ 347 | "Prediction probablities: {'Normal': 0.026733398, 'Abusive': 0.9732666}\n" 348 | ], 349 | "name": "stdout" 350 | }, 351 | { 352 | "output_type": "execute_result", 353 | "data": { 354 | "text/html": [ 355 | "\n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
0[CLS]youdeservedeathifyouvotedyescruzthatdumb##asssp##icwasdestinedtolose[SEP]
" 394 | ], 395 | "text/plain": [ 396 | "" 397 | ] 398 | }, 399 | "metadata": { 400 | "tags": [] 401 | }, 402 | "execution_count": 88 403 | } 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "id": "debOLbbeV8aj" 410 | }, 411 | "source": [ 412 | "### **Word of caution**\n", 413 | "\n", 414 | "> Model used here have any trained using a particular dataset and they may carry some bias or errors, they should be only used as a complementary labels in case of any analysis." 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "id": "fn2NNW1e1wWT" 421 | }, 422 | "source": [ 423 | "#**Download the file generated**\n", 424 | "#### Run this cell and select the destination folder.\n" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "metadata": { 430 | "colab": { 431 | "base_uri": "https://localhost:8080/" 432 | }, 433 | "id": "Ki5ACvSoCe2l", 434 | "outputId": "45125ca0-f41a-452b-d166-2467b6056067" 435 | }, 436 | "source": [ 437 | "from google.colab import files\n", 438 | "import json\n", 439 | "if input_type != \"text input\":\n", 440 | " with open('predictions.json', 'w') as f:\n", 441 | " json_string = json.dumps(predictions, cls=NumpyEncoder, sort_keys=True, indent=4)\n", 442 | " f.write(json_string)\n", 443 | " files.download('predictions.json')\n", 444 | "else:\n", 445 | " print(\"No file input given\")" 446 | ], 447 | "execution_count": null, 448 | "outputs": [ 449 | { 450 | "output_type": "stream", 451 | "text": [ 452 | "No file input given\n" 453 | ], 454 | "name": "stdout" 455 | } 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "metadata": { 461 | "id": "uNZr_e0IErnl" 462 | }, 463 | "source": [ 464 | "" 465 | ], 466 | "execution_count": null, 467 | "outputs": [] 468 | } 469 | ] 470 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Hate-ALERT 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub license](https://img.shields.io/github/license/Naereen/StrapDown.js.svg)](https://github.com/Naereen/StrapDown.js/blob/master/LICENSE) 2 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) 3 | [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fhate-alert%2FTutorial-ICWSM-2021&count_bg=%2379C83D&title_bg=%23555555&icon=peertube.svg&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com) 4 | 5 | # Hate speech detection, mitigation and beyond 6 | These are the resources and demos associated with the tutorial *"Hate speech detection, mitigation and beyond"* at [ICWSM 2021](https://www.icwsm.org/2021/index.html) and [AAAI 2022](https://hate-alert.github.io/talk/aaai_tutorial/) are noted here. 7 | 8 | # Abstract :bookmark: 9 | 10 | Social media sites such as Twitter and Facebook have connected billions of people and given the opportunity to the users to share their ideas and opinions instantly. That being said, there are several ill consequences as well such as online harassment, trolling, cyber-bullying, fake news, and hate speech. Out of these, hate speech presents a unique challenge as it is deep engraved into our society and is often linked with offline violence. Social media platforms rely on local moderators to identify hate speech and take necessary action, but with a prolific increase in such content over the social media many are turning toward automated hate speech detection and mitigation systems. This shift brings several challenges on the plate, and hence, is an important avenue to explore for the computation social science community. 11 | 12 | # Contributions and achievements :tada: :tada: 13 | 14 | * Our papers are accepted in **top conferences** like AAAI, WWW, CSCW, ICWSM, WebSci. Link to the papers [here](../../tags/our-papers/) 15 | * We have **open sourced** our codes and datasets under a single github organisation - [hate-alert](https://github.com/hate-alert) for the future research in this domain 16 | * We have stored different **transformers models** in [huggingface.co](https://huggingface.co/). Link to [hatealert organisation](https://huggingface.co/Hate-speech-CNERG) 17 | * **Dataset** from our recent accepted paper in AAAI - *"Hatexplain:A Benchmark Dataset for Explainable Hate Speech Detection"* is also stored in the [huggingface datsets forum](https://huggingface.co/datasets/hatexplain) 18 | * We also participate in several hate speech shared tasks, winning many of them - [hatealert@DLTEACL](https://www.aclweb.org/anthology/2021.dravidianlangtech-1.17.pdf), [hateminers@AMI](http://personales.upv.es/prosso/resources/FersiniEtAl_Evalita18.pdf), [hatemonitors@HASOC](https://dl.acm.org/doi/10.1145/3368567.3368584) and coming under 1% in [hatealert@Hatememe detection](https://www.drivendata.org/competitions/70/hateful-memes-phase-2/leaderboard/) by Facebook AI. 19 | * [Notion page](https://www.notion.so/punyajoy/Hate-speech-papers-resource-7fc20fa1bea64cbdb30862092ae197b3) containing hate speech papers. 20 | 21 | # Other Resources 22 | 23 | * A dataset resource created and maintained by Leon Derczynski and Bertie Vidgen. Click the link [here](https://hatespeechdata.com/) 24 | * This resource collates all the resources and links used in this information hub, for both teachers and young people. Click the link [here](https://www.stophateuk.org/resources-2/) 25 | 26 | # Few demos :abacus: 27 | 28 | We also provide some demos for the social scientists so that our opensource models can be used. Please provide feedback in the [issues](https://github.com/hate-alert/Tutorial-ICWSM-2021/issues). 29 | 30 | * **Multlingual abuse predictor** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hate-alert/Tutorial-ICWSM-2021/blob/main/Demos/Multilingual_abuse_predictor.ipynb) - This presents a suite of models which try to predict abuse in different languages. Different models are built upon the dataset found from that language. You can upload a file in the specified format and get back the predicitions of these models. 31 | * **Rationale predictor demo** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hate-alert/Tutorial-ICWSM-2021/blob/main/Demos/Rationale_predictor_demo.ipynb) - This is a model trained using rationale and classifier head. Along with predicting the abusive or non-abusive label, it can also predict the rationales i.e. parts of text which are abusive according to the model. 32 | * **Counter speech detection demo** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/binny-mathew/Countering_Hate_Speech/blob/master/DEMO_Counter_speech.ipynb) - These are some of the models which can detect counter speech. These models are simple in nature. [Link to the original github repository](https://github.com/hate-alert/Countering_Hate_Speech_ICWSM2019) 33 | 34 | > :rotating_light: **Check the individual colab demos to learn more about the how to use these tools. These models might carry potential biases, hence should be used with appropriate caution.** :rotating_light: 35 | 36 | ### :thumbsup: The repo is still in active developements. Feel free to create an [issue](https://github.com/hate-alert/Tutorial-ICWSM-2021/issues) for the demos as well as the notion page that we shared!! :thumbsup: 37 | 38 | --------------------------------------------------------------------------------