├── hate_speech_mlma.zip ├── notes ├── requirements.txt ├── LICENSE ├── constants.py ├── annotated_data_processing.py ├── keywords.txt ├── README.md ├── baseline_classifiers.py ├── predictors.py ├── run_sluice_net.py ├── utils.py ├── guidelines.tar ├── pilot_dataset_tweets_only.tar └── sluice_net.py /hate_speech_mlma.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/MLMA_hate_speech/HEAD/hate_speech_mlma.zip -------------------------------------------------------------------------------- /notes: -------------------------------------------------------------------------------- 1 | Although we have done our best to avoid scams and to counter common misconceptions, there are still, as in any 2 | complicated human annotation task, unreliable annotations. 3 | For instance, one common misconception is that some annotators in French were not aware that the word m****** 4 | is an insulting slur directed at people with Down syndrome rather than a reference to someone's origin. 5 | Similarly, some annotators in Arabic sometimes confused wordings referring to nationality with wordings referring to gender/gender identity. 6 | 7 | We will publish more insights about the data, our annotation guidelines, and the dataset with emojis soon. 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | To replicate our experiments, you need to download: 2 | Python 3.6 onwards 3 | dyNET 0.0.0 and its dependencies (follow the instructions at https://dynet.readthedocs.io/en/latest/python.html) 4 | 5 | On a side note, when you install DyNet, make sure that you are using CUDA 9 and CUDNN for CUDA 9. I used the following command: 6 | CUDNN_ROOT=/path_to_conda/pkgs/cudnn-7.3.1-cuda10.0_0 BACKEND=/path_to_conda/pkgs/cudatoolkit-10.0.130-0 pip install git+https://github.com/clab/dynet#egg=dynet 7 | 8 | Using CUDA 10 will generate an error when calling DyNet. 9 | 10 | Sluice Networks (Ruder et al., 2017) 11 | Babylon / MUSE embeddings 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 HKUST-KnowComp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants shared across files. 3 | """ 4 | import re 5 | 6 | # special tokens and number regex 7 | UNK = '_UNK' # unk/OOV word/char 8 | WORD_START = '' # word star 9 | WORD_END = '' # word end 10 | NUM = 'NUM' # number normalization string 11 | NUMBERREGEX = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+") 12 | 13 | # tasks 14 | 15 | TASK_NAMES = ['group', #The target group of the tweet 16 | 'annotator_sentiment', #The sentiment of the annotator with respect to the tweet 17 | 'directness', #Whether the tweet is direct or indirect hate speech 18 | 'target', #The characteristic based on which the tweet discriminates people (e.g., race). 19 | 'sentiment' ] #The sentiment expressed by the tweet 20 | 21 | # word embeddings 22 | EMBEDS = ['babylon', 'muse', 'umwe', None] 23 | 24 | EMBEDS_FILES = {'babylon': '../data/bi-embedding-babylon78/transformed_embeds/', 25 | 'muse': '../data/bi-embedding-muse/', 26 | 'umwe':' '} 27 | 28 | 29 | #Dictionary of tasks and corresponding labels 30 | LABELS = {'group':['arabs', 'other', 'african_descent', 'left_wing_people', 'asians', 31 | 'hispanics', 'muslims', 'individual', 'special_needs', 'christian', 'immigrants', 'jews' , 32 | 'women', 'indian/hindu', 'gay', 'refugees'], 33 | 'annotator_sentiment':['indifference', 'sadness', 'disgust', 'shock', 'confusion', 34 | 'anger', 'fear'], 35 | 'directness':['direct', 'indirect'], 36 | 'target':['origin', 'religion', 'disability', 'gender', 'sexual_orientation', 'other'], 37 | 'sentiment':['disrespectful', 'fearful', 'offensive', 'abusive', 'hateful', 'normal']} 38 | MODIFIED_LABELS = {'group':['arabs', 'other', 'african_descent', 'left_wing_people', 'asians', 39 | 'hispanics', 'muslims', 'individual', 'special_needs', 'christian', 'immigrants', 'jews' , 40 | 'women', 'indian/hindu', 'gay', 'refugees'], 41 | 'annotator_sentiment':['indifference', 'sadness', 'shock', 'confusion','anger', 'fear'], 42 | 'directness':['direct', 'indirect'], 43 | 'target':['origin', 'religion', 'disability', 'gender', 'sexual_orientation', 'other'], 44 | 'sentiment':['somewhatoffensive', 'offensive', 'veryoffensive', 'normal']} 45 | 46 | #'directness':['direct', 'indirect', 'none'], #to be added 47 | 48 | # languages 49 | LANGUAGES = ['ar', 'en', 'fr'] 50 | FULL_LANG = {'ar': 'Arabic', 'en': 'English', 'fr': 'French'} 51 | 52 | 53 | 54 | 55 | # optimizers 56 | SGD = 'sgd' 57 | ADAM = 'adam' 58 | 59 | 60 | # cross-stitch and layer-stitch initialization schemes 61 | BALANCED = 'balanced' 62 | IMBALANCED = 'imbalanced' 63 | -------------------------------------------------------------------------------- /annotated_data_processing.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | import numpy as np 4 | import pandas as pd 5 | 6 | def clean_text(text): 7 | """ 8 | text: a string 9 | 10 | return: modified initial string 11 | """ 12 | replace_by_blank_symbols = 
re.compile('\u00bb|\u00a0|\u00d7|\u00a3|\u00eb|\u00fb|\u00fb|\u00f4|\u00c7|\u00ab|\u00a0\ude4c|\udf99|\udfc1|\ude1b|\ude22|\u200b|\u2b07|\uddd0|\ude02|\ud83d|\u2026|\u201c|\udfe2|\u2018|\ude2a|\ud83c|\u2018|\u201d|\u201c|\udc69|\udc97|\ud83e|\udd18|\udffb|\ude2d|\udc80|\ud83e|\udd2a|\ud83e|\udd26|\u200d|\u2642|\ufe0f|\u25b7|\u25c1|\ud83e|\udd26|\udffd|\u200d|\u2642|\ufe0f|\udd21|\ude12|\ud83e|\udd14|\ude03|\ude03|\ude03|\ude1c|\udd81|\ude03|\ude10|\u2728|\udf7f|\ude48|\udc4d|\udffb|\udc47|\ude11|\udd26|\udffe|\u200d|\u2642|\ufe0f|\udd37|\ude44|\udffb|\u200d|\u2640|\udd23|\u2764|\ufe0f|\udc93|\udffc|\u2800|\u275b|\u275c|\udd37|\udffd|\u200d|\u2640|\ufe0f|\u2764|\ude48|\u2728|\ude05|\udc40|\udf8a|\u203c|\u266a|\u203c|\u2744|\u2665|\u23f0|\udea2|\u26a1|\u2022|\u25e1|\uff3f|\u2665|\u270b|\u270a|\udca6|\u203c|\u270c|\u270b|\u270a|\ude14|\u263a|\udf08|\u2753|\udd28|\u20ac|\u266b|\ude35|\ude1a|\u2622|\u263a|\ude09|\udd20|\udd15|\ude08|\udd2c|\ude21|\ude2b|\ude18|\udd25|\udc83|\ude24|\udc3e|\udd95|\udc96|\ude0f|\udc46|\udc4a|\udc7b|\udca8|\udec5|\udca8|\udd94|\ude08|\udca3|\ude2b|\ude24|\ude23|\ude16|\udd8d|\ude06|\ude09|\udd2b|\ude00|\udd95|\ude0d|\udc9e|\udca9|\udf33|\udc0b|\ude21|\udde3|\ude37|\udd2c|\ude21|\ude09|\ude39|\ude42|\ude41|\udc96|\udd24|\udf4f|\ude2b|\ude4a|\udf69|\udd2e|\ude09|\ude01|\udcf7|\ude2f|\ude21|\ude28|\ude43|\udc4a|\uddfa|\uddf2|\udc4a|\ude95|\ude0d|\udf39|\udded|\uddf7|\udded|\udd2c|\udd4a|\udc48|\udc42|\udc41|\udc43|\udc4c|\udd11|\ude0f|\ude29|\ude15|\ude18|\ude01|\udd2d|\ude43|\udd1d|\ude2e|\ude29|\ude00|\ude1f|\udd71|\uddf8|\ude20|\udc4a|\udeab|\udd19|\ude29|\udd42|\udc4a|\udc96|\ude08|\ude0d|\udc43|\udff3|\udc13|\ude0f|\udc4f|\udff9|\udd1d|\udc4a|\udc95|\udcaf|\udd12|\udd95|\udd38|\ude01|\ude2c|\udc49|\ude01|\udf89|\udc36|\ude0f|\udfff|\udd29|\udc4f|\ude0a|\ude1e|\udd2d|\uff46|\uff41|\uff54|\uff45|\uffe3|\u300a|\u300b|\u2708|\u2044|\u25d5|\u273f|\udc8b|\udc8d|\udc51|\udd8b|\udd54|\udc81|\udd80|\uded1|\udd27|\udc4b|\udc8b|\udc51|\udd90|\ude0e') 13 | replace_by_apostrophe_symbol = re.compile('\u2019') 14 | replace_by_dash_symbol = re.compile('\u2014') 15 | replace_by_u_symbols = re.compile('\u00fb|\u00f9') 16 | replace_by_a_symbols = re.compile('\u00e2|\u00e0') 17 | replace_by_c_symbols = re.compile('\u00e7') 18 | replace_by_i_symbols = re.compile('\u00ee|\u00ef') 19 | replace_by_o_symbols = re.compile('\u00f4') 20 | replace_by_oe_symbols = re.compile('\u0153') 21 | replace_by_e_symbols = re.compile('\u00e9|\u00ea|\u0117|\u00e8') 22 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|,;]') 23 | text = replace_by_e_symbols.sub('e', text) 24 | text = replace_by_a_symbols.sub('a', text) 25 | text = replace_by_o_symbols.sub('o', text) 26 | text = replace_by_oe_symbols.sub('oe', text) 27 | text = replace_by_u_symbols.sub('e', text) 28 | text = replace_by_i_symbols.sub('e', text) 29 | text = replace_by_u_symbols.sub('e', text) 30 | text = replace_by_apostrophe_symbol.sub("'", text) 31 | text = replace_by_dash_symbol.sub("_", text) 32 | text = replace_by_blank_symbols.sub('', text) 33 | 34 | #For English 35 | #text = ''.join([c for c in text if ord(c) < 128]) 36 | text = text.replace("\\", "") 37 | #text = text.encode("ascii", errors="ignore").decode() 38 | text = text.lower() # lowercase text 39 | STOPWORDS = set(stopwords.words('english')) 40 | text = text.lower() # lowercase text 41 | text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text 42 | text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors 
from text 43 | return text 44 | 45 | -------------------------------------------------------------------------------- /keywords.txt: -------------------------------------------------------------------------------- 1 | *** Below are keywords/phrases that we used for our search. They include different spellings of common slurs, controversial topics, 2 | insulting patterns during debates, ... *** 3 | 4 | ENGLISH 5 | ___________ 6 | Immigrants 7 | refugees 8 | Africa 9 | AllLivesMatter 10 | immigrant terror 11 | abusive women 12 | harassment 13 | fake empowerment 14 | African attack 15 | protect from immigrants 16 | Eurafrica 17 | Immigrant not welcome 18 | illegal immigrants 19 | It's okay to be white 20 | feminazi 21 | violence 22 | refugees impact 23 | asians 24 | terrorist leftist 25 | terrorist threat 26 | ching chong 27 | dirty jew 28 | dirty pig 29 | redneck 30 | bring Ebola 31 | bring disease 32 | negro 33 | illegal aliens 34 | intellectual terrorist 35 | intellectual terrorism 36 | women threat 37 | woman terrorist 38 | shithole country 39 | shithole countries 40 | go back to where you come from 41 | go back to where they come from 42 | loud mexican 43 | muslim countries 44 | muslim country 45 | muslim immigrant 46 | muslim refugee 47 | white christian 48 | christian country 49 | immigrant invasion 50 | muslim invasion 51 | refugee invasion 52 | antiwhite 53 | anti white 54 | eurafrica 55 | jihadi 56 | russian rat 57 | mongol 58 | mongoloid 59 | mongy 60 | downy 61 | retard 62 | boojie 63 | surrender monkey 64 | chinaman 65 | hillbilly 66 | whigger 67 | white nigger 68 | wigger 69 | wigerette 70 | bitter clinger 71 | conspiracy theorist 72 | redneck 73 | trailer trash 74 | white trash 75 | yobbo 76 | retard 77 | nigger 78 | white trash 79 | trailer trash 80 | raghead 81 | camel fucker 82 | moon cricket 83 | wetback 84 | spic 85 | cunt 86 | dyke 87 | twat 88 | chinaman 89 | muzzie 90 | faggot 91 | __________________________________________________________________________________________________ 92 | French 93 | _________ 94 | invasion réfugié 95 | invasion migration 96 | invasion musulman 97 | invasion islam 98 | invasion migrant 99 | complotiste 100 | complotisme 101 | réfugiés 102 | migrants 103 | Afrique 104 | migrants africains 105 | réfugiés 106 | migrants 107 | Afrique 108 | debile 109 | débile 110 | mongol 111 | attardé 112 | attarde 113 | facho 114 | e-facho 115 | action sociale 116 | vive front national 117 | djihadiste 118 | eurafrique 119 | anti blanc 120 | entiblanc 121 | beurres 122 | rebeus 123 | renois 124 | chinetoque 125 | mongol 126 | terroristes 127 | arabes 128 | invasion des réfugiés 129 | France musulmane 130 | islamisation 131 | triage des migrants 132 | contre l'avortement 133 | femme autonome 134 | liberte de la femme 135 | harcèlement 136 | blanc fier 137 | implantation des migrants 138 | violence 139 | blackos 140 | terrorisme musulman 141 | terrorisme islam 142 | terrorisme intellectuel 143 | expulser migrants 144 | islam gauchiste 145 | gauchiste 146 | expulser migrant 147 | ching chong 148 | sale arabe 149 | sale africain 150 | negre 151 | negre maghrebins 152 | noich 153 | sale rebeu 154 | sale renoi 155 | africain ramene madadie 156 | migrant maladie 157 | ramene maladie 158 | arabe chameau 159 | africain chameau 160 | sale russe 161 | ramene ebola 162 | negro 163 | Afreurope 164 | __________________________________________________________________________________________________ 165 | 166 | Arabic 167 | _________ 168 | اللاجئون 169 | اللاجئون ضرر 170 | 
اللاجئون خطر 171 | اللاجئون و الامراض 172 | افارقة ايبولا 173 | افارقة كوليرا 174 | افارقة مرض 175 | خنازير 176 | حرمة قاصر 177 | سوري وسخ 178 | هندوس 179 | بايرة 180 | مراة عورة 181 | مراة متسلطة 182 | شينغ شونغ 183 | لاجئ وسخ 184 | مراة قاصر 185 | افارقة نجس 186 | افارقة وسخ 187 | عرب وسخ 188 | بول البعير 189 | سواقة حريم 190 | مراة قاصر 191 | خنزير 192 | وسخ 193 | منحرفة 194 | العادات و التقاليد 195 | الحريم 196 | بهيم 197 | الافارقة 198 | السود 199 | متبرجة 200 | الزواج و الطلاق للمراة 201 | ديوث 202 | فمينيست 203 | العنف 204 | اسقاط ولاية المراة 205 | الصينيون 206 | افريقيا السمراء 207 | التحرش 208 | افريقيا 209 | اغبياء 210 | منغول 211 | مغول 212 | ديوث 213 | وسخ شيعة 214 | ايران وسخ 215 | طائفة 216 | طائفي 217 | علماني 218 | جهاد 219 | كحالش 220 | معاق ذهني 221 | شيعة رافضة 222 | شيعي 223 | رافضي 224 | علوي 225 | اكراد 226 | لجوء احتلال 227 | لاجئ احتلال 228 | مثلية 229 | مثليين 230 | كلب ايران 231 | فارسي وسخ 232 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataset and code of our EMNLP 2019 Paper (Multilingual and Multi-Aspect Hate Speech Analysis) 2 | If you use our dataset or our guidelines please cite this paper: 3 | 4 | @inproceedings{ousidhoum-etal-multilingual-hate-speech-2019, 5 | title = "Multilingual and Multi-Aspect Hate Speech Analysis", 6 | author = "Ousidhoum, Nedjma 7 | and Lin, Zizheng 8 | and Zhang, Hongming 9 | and Song, Yangqiu 10 | and Yeung, Dit-Yan", 11 | booktitle = "Proceedings of EMNLP", 12 | year = "2019", 13 | publisher = "Association for Computational Linguistics", 14 | } 15 | 16 | 17 | ## Update 18 | - If you need the individual labels of some data instances (unfortunately I could not find all the batches on the cloud years later), please send me an email OusidhoumN(at)cardiff(dot)ac(dot)uk 19 | 20 | - The dataset is available on HuggingFace https://huggingface.co/datasets/nedjmaou/MLMA_hate_speech 21 | 22 | ## Clarification 23 | The multi-labeled tasks are *the hostility type of the tweet* and the *annotator's sentiment*. (We kept labels on which at least two annotators agreed.) 24 | 25 | ## Taxonomy 26 | In further experiments that involved binary classification tasks of the hostility/hate/abuse type, we considered single-labeled *normal* instances to be *non-hate/non-toxic* and all the other instances to be *toxic*. 27 | 28 | ## Dataset 29 | Our dataset is composed of three csv files sorted by language. They contain the tweets and the annotations described in our paper: 30 | 31 | the hostility type *(column: tweet sentiment)* 32 | 33 | hostility directness *(column: directness)* 34 | 35 | target attribute *(column: target)* 36 | 37 | target group *(column: group)* 38 | 39 | annotator's sentiment *(column: annotator sentiment)*. 40 | 41 | ## Experiments 42 | 43 | To replicate our experiments, please follow the guidelines below. 44 | 45 | ### Requirements 46 | Python 3.6 onwards, 47 | 48 | dyNET 0.0.0 and its dependencies (follow the instructions on https://dynet.readthedocs.io/en/latest/python.html). 49 | 50 | [On a side note, when you install DyNet make sure that you are using CUDA 9 and CUDNN for CUDA 9. I used the following command: 51 | 52 | CUDNN_ROOT=/path/to/conda/pkgs/cudnn-7.3.1-cuda10.0_0 \ 53 | BACKEND=/path/to/conda/pkgs/cudatoolkit-10.0.130-0 \ 54 | pip install git+https://github.com/clab/dynet#egg=dynet 55 | 56 | Using CUDA 10 will generate an error when calling DyNet for GPUs.] 
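A minimal sanity check for the DyNet install, assuming the command above succeeded (this snippet is illustrative and not part of the repository): it should run without errors, whereas a CUDA/CUDNN mismatch typically shows up as a failure at import time or on the first forward pass.

    import dynet as dy              # fails here if DyNet was built against an incompatible CUDA/CUDNN

    pc = dy.ParameterCollection()   # small throwaway model
    W = pc.add_parameters((2, 2))
    x = dy.inputVector([1.0, 2.0])
    y = dy.parameter(W) * x         # tiny forward pass on the configured backend
    print(y.value())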
57 | 58 | Cross-lingual word embeddings (Babylon or MUSE; the reported results were obtained using Babylon). 59 | 60 | 61 | ### Python files 62 | 63 | - annotated_data_processing.py contains a normalization function that cleans the content of the tweets. 64 | 65 | - constants.py defines constants used across all the files. 66 | 67 | - utils.py contains utility functions for data processing. 68 | 69 | - baseline_classifiers.py allows you to run majority voting and logistic regression by calling: 70 | 71 | run_majority_voting(train_filename, dev_filename, test_filename, attribute) 72 | 73 | or 74 | 75 | run_logistic_regression(train_filename, dev_filename, test_filename, attribute) 76 | 77 | on csv files of the same form as the dataset (a short usage sketch is given at the end of this README). 78 | 79 | - predictors.py contains classes for sequence predictors and layers. 80 | 81 | - run_sluice_net.py: script to train, load, and evaluate SluiceNetwork. 82 | 83 | - sluice_net.py: the main logic of the SluiceNetwork. More details on the implementation of Sluice Networks can be found in Ruder et al. (2017). 84 | 85 | ### How to run the program 86 | To save and load the trained model, you need to create a directory (e.g., model/) and specify its name with the --model-dir argument on the command line. 87 | 88 | To save the log files of the training and evaluation, you need to create a directory (e.g., log/) and specify its name with the --log-dir argument on the command line. 89 | 90 | #### Example: 91 | python run_sluice_net.py --dynet-autobatch 1 --dynet-gpus 3 --dynet-seed 123 \ 92 | --h-layers 1 \ 93 | --cross-stitch \ 94 | --num-subspaces 2 --constraint-weight 0.1 \ 95 | --constrain-matrices 1 2 --patience 3 \ 96 | --languages ar en fr \ 97 | --test-languages ar en fr \ 98 | --model-dir model/ --log-dir log/ \ 99 | --task-names annotator_sentiment sentiment directness group target \ 100 | --train-dir '/path/to/train' \ 101 | --dev-dir '/path/to/dev' \ 102 | --test-dir '/path/to/test' \ 103 | --embeds babylon --h-dim 200 \ 104 | --cross-stitch-init-scheme imbalanced \ 105 | --threshold 0.1 106 | 107 | ### NB 108 | - The meaning of each argument can be found in run_sluice_net.py. 109 | 110 | - '--task-names' refers to a list of task names (e.g., annotator_sentiment). 111 | 112 | - '--languages' refers to the language datasets used for training. 113 | 114 | - '--test-languages' must be a subset of '--languages'.
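Below is a short, illustrative usage sketch of the baseline classifiers mentioned above. It is not a script shipped with the repository; the CSV paths are hypothetical placeholders and should point to files in the same format as the released dataset.

    # Hedged usage sketch for baseline_classifiers.py; the paths below are placeholders.
    from baseline_classifiers import run_majority_voting, run_logistic_regression

    train_csv = '/path/to/train/en_train.csv'  # CSV with the columns described above
    dev_csv = '/path/to/dev/en_dev.csv'
    test_csv = '/path/to/test/en_test.csv'

    # Single-labeled task: routed internally to the non-multilabel (DummyClassifier) baseline.
    run_majority_voting(train_csv, dev_csv, test_csv, 'target')

    # Multi-labeled task (hostility type): routed internally to the classifier-chain logistic regression.
    run_logistic_regression(train_csv, dev_csv, test_csv, 'sentiment')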
115 | 116 | 117 | -------------------------------------------------------------------------------- /baseline_classifiers.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | import os 4 | import matplotlib 5 | import numpy as np 6 | import pandas as pd 7 | from pandas import Series 8 | from sklearn.pipeline import Pipeline 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.feature_extraction.text import CountVectorizer 11 | from sklearn.feature_extraction.text import TfidfTransformer 12 | from sklearn.naive_bayes import MultinomialNB 13 | from sklearn.metrics import accuracy_score 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.preprocessing import LabelBinarizer, LabelEncoder 17 | from sklearn.metrics import classification_report 18 | from annotated_data_processing import clean_text 19 | from sklearn.preprocessing import MultiLabelBinarizer 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.metrics import f1_score 22 | from skmultilearn.problem_transform import ClassifierChain 23 | from sklearn.dummy import DummyClassifier 24 | from constants import LABELS 25 | 26 | #majority voting for multilabel tasks: annotator's sentiment and hostility type (tweet sentiment) 27 | def lr_multilabel_classification(train_filename, dev_filename, test_filename, attribute): 28 | df_train = pd.read_csv(train_filename) 29 | df_dev = pd.read_csv(dev_filename) 30 | df_test = pd.read_csv(test_filename) 31 | mlb = MultiLabelBinarizer() 32 | X_train = df_train.tweet.apply(clean_text) 33 | y_train_text = df_train[attribute].apply(lambda x: x.split('_')) 34 | y_train = mlb.fit_transform(y_train_text) 35 | X_dev = df_dev.tweet.apply(clean_text) 36 | y_dev_text = df_dev[attribute].apply(lambda x: x.split('_')) 37 | y_dev = mlb.fit_transform(y_dev_text) 38 | X_test = df_test.tweet.apply(clean_text) 39 | y_test_text = df_test[attribute].apply(lambda x: x.split('_')) 40 | y_test = mlb.fit_transform(y_test_text) 41 | count_vect = CountVectorizer() 42 | X_train_counts = count_vect.fit_transform(X_train) 43 | tfidf_transformer = TfidfTransformer() 44 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 45 | Y = mlb.fit_transform(y_train_text) 46 | classifier = Pipeline([ 47 | ('vectorizer', CountVectorizer()), 48 | ('tfidf', TfidfTransformer()), 49 | ('clf', ClassifierChain(LogisticRegression()))]) 50 | classifier.fit(X_train, y_train) 51 | y_pred = classifier.predict(X_test) 52 | print('accuracy %s' % accuracy_score(y_pred, y_test)) 53 | print('Test macro F1 score is %s' % f1_score(y_test, y_pred, average='macro')) 54 | print('Test micro F1 score is %s' % f1_score(y_test, y_pred, average='micro')) 55 | 56 | #majority voting for multilabel tasks: annotator's sentiment and hostility type (tweet sentiment) 57 | def majority_voting_multilabel_classification(train_filename, dev_filename, test_filename, attribute): 58 | df_train = pd.read_csv(train_filename) 59 | df_dev = pd.read_csv(dev_filename) 60 | df_test = pd.read_csv(test_filename) 61 | mlb = MultiLabelBinarizer() 62 | X_train = df_train.tweet.apply(clean_text) 63 | y_train_text = df_train[attribute].apply(lambda x: x.split('_')) 64 | y_train = mlb.fit_transform(y_train_text) 65 | X_dev = df_dev.tweet.apply(clean_text) 66 | y_dev_text = df_dev[attribute].apply(lambda x: x.split('_')) 67 | y_dev = mlb.fit_transform(y_dev_text) 68 | X_test = 
df_test.tweet.apply(clean_text) 69 | y_test_text = df_test[attribute].apply(lambda x: x.split('_')) 70 | y_test = mlb.fit_transform(y_test_text) 71 | count_vect = CountVectorizer() 72 | X_train_counts = count_vect.fit_transform(X_train) 73 | tfidf_transformer = TfidfTransformer() 74 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 75 | Y = mlb.fit_transform(y_train_text) 76 | classifier = Pipeline([ 77 | ('vectorizer', CountVectorizer()), 78 | ('tfidf', TfidfTransformer()), 79 | ('clf', ClassifierChain(DummyClassifier()))]) 80 | 81 | classifier.fit(X_train, y_train) 82 | y_pred = classifier.predict(X_test) 83 | print('Accuracy %s' % accuracy_score(y_pred, y_test)) 84 | print('Test macro F1 score is %s' % f1_score(y_test, y_pred, average='macro')) 85 | print('Test micro F1 score is %s' % f1_score(y_test, y_pred, average='micro')) 86 | 87 | 88 | #majority voting for non mumtilabel tasks namely: target, group and directness 89 | def majority_voting_non_multilabel_classification(train_filename, dev_filename, test_filename, attribute): 90 | my_labels=LABELS[attribute] 91 | df_train = pd.read_csv(train_filename) 92 | df_dev = pd.read_csv(dev_filename) 93 | df_test = pd.read_csv(test_filename) 94 | X_train = df_train.tweet.apply(clean_text) 95 | y_train = df_train[attribute] 96 | X_dev = df_dev.tweet.apply(clean_text) 97 | y_dev = df_dev[attribute] 98 | X_test = df_test.tweet.apply(clean_text) 99 | y_test = df_test[attribute] 100 | count_vect = CountVectorizer() 101 | X_train_counts = count_vect.fit_transform(X_train) 102 | tfidf_transformer = TfidfTransformer() 103 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 104 | dummy = Pipeline([('vect', CountVectorizer()), 105 | ('tfidf', TfidfTransformer()), 106 | ('clf', DummyClassifier()), 107 | ]) 108 | dummy.fit(X_train, y_train) 109 | y_pred = dummy.predict(X_test) 110 | print('Accuracy %s' % accuracy_score(y_pred, y_test)) 111 | print(classification_report(y_test, y_pred,target_names=my_labels,labels=my_labels)) 112 | print('Test macro F1 score is %s' % f1_score(y_test, y_pred, average='macro')) 113 | print('Test micro F1 score is %s' % f1_score(y_test, y_pred, average='micro')) 114 | 115 | 116 | #logistic regression for non mumtilabel tasks namely: target, group and directness 117 | def lr_non_multilabel_classification(train_filename, dev_filename, test_filename, attribute): 118 | my_labels=LABELS[attribute] 119 | df_train = pd.read_csv(train_filename) 120 | df_dev = pd.read_csv(dev_filename) 121 | df_test = pd.read_csv(test_filename) 122 | X_train = df_train.tweet.apply(clean_text) 123 | y_train = df_train[attribute] 124 | X_dev = df_dev.tweet.apply(clean_text) 125 | y_dev = df_dev[attribute] 126 | X_test = df_test.tweet.apply(clean_text) 127 | y_test = df_test[attribute] 128 | count_vect = CountVectorizer() 129 | X_train_counts = count_vect.fit_transform(X_train) 130 | tfidf_transformer = TfidfTransformer() 131 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 132 | logreg = Pipeline([('vect', CountVectorizer()), 133 | ('tfidf', TfidfTransformer()), 134 | ('clf', LogisticRegression(n_jobs=1, C=1e5)), 135 | ]) 136 | logreg.fit(X_train, y_train) 137 | y_pred = logreg.predict(X_test) 138 | print('accuracy %s' % accuracy_score(y_pred, y_test)) 139 | print('Test macro F1 score is %s' % f1_score(y_test, y_pred, average='macro')) 140 | print('Test micro F1 score is %s' % f1_score(y_test, y_pred, average='micro')) 141 | 142 | 143 | def run_majority_voting(train_filename, dev_filename, test_filename, 
attribute): 144 | #multilabel tasks 145 | if(attribute=='sentiment' or attribute=='annotator_sentiment'): 146 | return majority_voting_multilabel_classification(train_filename, dev_filename, test_filename, attribute) 147 | #non mutilabel tasks 148 | elif(attribute=='target' or attribute =='group' or attribute=='directness'): 149 | return majority_voting_non_multilabel_classification(train_filename, dev_filename, test_filename, attribute) 150 | 151 | def run_logistic_regression(train_filename, dev_filename, test_filename, attribute): 152 | #multilabel tasks 153 | if(attribute=='sentiment' or attribute=='annotator_sentiment'): 154 | return lr_multilabel_classification(train_filename, dev_filename, test_filename, attribute) 155 | #non mutilabel tasks 156 | elif(attribute=='target' or attribute =='group' or attribute=='directness'): 157 | return lr_non_multilabel_classification(train_filename, dev_filename, test_filename, attribute) 158 | -------------------------------------------------------------------------------- /predictors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for predictors and special layers. 3 | """ 4 | import dynet 5 | import numpy as np 6 | 7 | from constants import BALANCED, IMBALANCED 8 | 9 | 10 | class SequencePredictor: 11 | """Convenience class to wrap a sequence prediction model.""" 12 | def __init__(self, builder): 13 | """Initializes the model. Expects a LSTMBuilder or SimpleRNNBuilder.""" 14 | self.builder = builder 15 | 16 | def predict_sequence(self, inputs): 17 | """Predicts the output of a sequence.""" 18 | return [self.builder(x) for x in inputs] 19 | 20 | 21 | class RNNSequencePredictor(SequencePredictor): 22 | """Convenience class to wrap an RNN model.""" 23 | def predict_sequence(self, inputs): 24 | s_init = self.builder.initial_state() 25 | return [x.output() for x in s_init.add_inputs(inputs)] 26 | 27 | 28 | class BiRNNSequencePredictor(SequencePredictor): 29 | """Convenience class to wrap an LSTM builder.""" 30 | def predict_sequence(self, f_inputs, b_inputs): 31 | f_init = self.builder.initial_state() 32 | b_init = self.builder.initial_state() 33 | forward_sequence = [x.output() for x in f_init.add_inputs(f_inputs)] 34 | backward_sequence = [x.output() for x in b_init.add_inputs( 35 | reversed(b_inputs))] 36 | return forward_sequence, backward_sequence 37 | 38 | 39 | class CrossStitchLayer: 40 | """Cross-stitch layer class.""" 41 | def __init__(self, model, num_tasks, hidden_dim, num_subspaces=1, 42 | init_scheme=BALANCED): 43 | """ 44 | Initializes a CrossStitchLayer. 45 | :param model: the DyNet Model 46 | :param num_tasks: the number of tasks 47 | :param hidden_dim: the # of hidden dimensions of the previous LSTM layer 48 | :param num_subspaces: the number of subspaces 49 | :param init_scheme: the initialization scheme; balanced or imbalanced 50 | """ 51 | print('Using %d subspaces...' % num_subspaces, flush=True) 52 | alpha_params = np.full((num_tasks * num_subspaces, 53 | num_tasks * num_subspaces), 54 | 1. 
/ (num_tasks * num_subspaces)) 55 | if init_scheme == IMBALANCED: 56 | if num_subspaces == 1: 57 | alpha_params = np.full((num_tasks, num_tasks), 58 | 0.1 / (num_tasks - 1)) 59 | for i in range(num_tasks): 60 | alpha_params[i, i] = 0.9 61 | else: 62 | # 0 1 0 1 63 | # 0 1 0 1 64 | # 1 0 1 0 65 | # 1 0 1 0 66 | for (x, y), value in np.ndenumerate(alpha_params): 67 | if (y + 1) % num_subspaces == 0 and not \ 68 | (x in range(num_tasks, num_tasks+num_subspaces)): 69 | alpha_params[x, y] = 0.95 70 | elif (y + num_subspaces) % num_subspaces == 0 and x \ 71 | in range(num_tasks, num_tasks+num_subspaces): 72 | alpha_params[x, y] = 0.95 73 | else: 74 | alpha_params[x, y] = 0.05 75 | 76 | self.alphas = model.add_parameters( 77 | (num_tasks*num_subspaces, num_tasks*num_subspaces), 78 | init=dynet.NumpyInitializer(alpha_params)) 79 | print('Initializing cross-stitch units to:', flush=True) 80 | print(dynet.parameter(self.alphas).value(), flush=True) 81 | self.num_tasks = num_tasks 82 | self.num_subspaces = num_subspaces 83 | self.hidden_dim = hidden_dim 84 | 85 | def stitch(self, predictions): 86 | """ 87 | Takes as inputs a list of the predicted states of the previous layers of 88 | each task, e.g. for two tasks a list containing two lists of 89 | n-dimensional output states. For every time step, the predictions of 90 | each previous task layer are then multiplied with the cross-stitch 91 | units to obtain a linear combination. In the end, we obtain a list of 92 | lists of linear combinations of states for every subsequent task layer. 93 | :param predictions: a list of length num_tasks containing the predicted 94 | states for each task 95 | :return: a list of length num_tasks containing the linear combination of 96 | predictions for each task 97 | """ 98 | assert self.num_tasks == len(predictions) 99 | linear_combinations = [] 100 | # iterate over tuples of predictions of each task at every time step 101 | for task_predictions in zip(*predictions): 102 | # concatenate the predicted state for all tasks to a matrix of shape 103 | # (num_tasks*num_subspaces, hidden_dim/num_subspaces); 104 | # we can multiply this directly with the alpha values 105 | concat_task_predictions = dynet.reshape( 106 | dynet.concatenate_cols(list(task_predictions)), 107 | (self.num_tasks*self.num_subspaces, 108 | self.hidden_dim / self.num_subspaces)) 109 | 110 | # multiply the alpha matrix with the concatenated predictions to 111 | # produce a linear combination of predictions 112 | alphas = dynet.parameter(self.alphas) 113 | product = alphas * concat_task_predictions 114 | if self.num_subspaces != 1: 115 | product = dynet.reshape(product, 116 | (self.num_tasks, self.hidden_dim)) 117 | linear_combinations.append(product) 118 | 119 | stitched = [linear_combination for linear_combination in 120 | zip(*linear_combinations)] 121 | return stitched 122 | 123 | 124 | class LayerStitchLayer: 125 | """Layer-stitch layer class.""" 126 | def __init__(self, model, num_layers, hidden_dim, init_scheme=IMBALANCED): 127 | """ 128 | Initializes a LayerStitchLayer. 129 | :param model: the DyNet model 130 | :param num_layers: the number of layers 131 | :param hidden_dim: the hidden dimensions of the LSTM layers 132 | :param init_scheme: the initialisation scheme; balanced or imbalanced 133 | """ 134 | if init_scheme == IMBALANCED: 135 | beta_params = np.full((num_layers), 0.1 / (num_layers - 1)) 136 | beta_params[-1] = 0.9 137 | elif init_scheme == BALANCED: 138 | beta_params = np.full((num_layers), 1. 
/ num_layers) 139 | else: 140 | raise ValueError('Invalid initialization scheme for layer-stitch ' 141 | 'units: %s.' % init_scheme) 142 | self.betas = model.add_parameters( 143 | num_layers, init=dynet.NumpyInitializer(beta_params)) 144 | print('Initializing layer-stitch units to:', flush=True) 145 | print(dynet.parameter(self.betas).value(), flush=True) 146 | self.num_layers = num_layers 147 | self.hidden_dim = hidden_dim 148 | 149 | def stitch(self, layer_predictions): 150 | """ 151 | Takes as input the predicted states of all the layers of a task-specific 152 | network and produces a linear combination of them. 153 | :param layer_predictions: a list of length num_layers containing lists 154 | of length seq_len of predicted states for 155 | each layer 156 | :return: a list of linear combinations of the predicted states at every 157 | time step for each layer 158 | """ 159 | assert len(layer_predictions) == self.num_layers 160 | 161 | concatenated_layer_states = dynet.reshape(dynet.concatenate_cols(\ 162 | list(layer_predictions)), (self.num_layers, self.hidden_dim)) 163 | 164 | product = None 165 | if(self.num_layers > 1): 166 | product = dynet.transpose(dynet.parameter( 167 | self.betas)) * concatenated_layer_states 168 | else: 169 | product = dynet.parameter(self.betas) * concatenated_layer_states 170 | 171 | reshaped = dynet.reshape(product, (self.hidden_dim,)) 172 | 173 | return reshaped 174 | 175 | 176 | 177 | class Layer: 178 | """Class for a single layer or a two-layer MLP.""" 179 | def __init__(self, model, in_dim, output_dim, activation=dynet.tanh, 180 | mlp=False): 181 | """ 182 | Initialize the layer and add its parameters to the model. 183 | :param model: the DyNet Model 184 | :param in_dim: the input dimension 185 | :param output_dim: the output dimension 186 | :param activation: the activation function that should be used 187 | :param mlp: if True, add a hidden layer with 100 dimensions 188 | """ 189 | self.act = activation 190 | self.mlp = mlp 191 | if mlp: 192 | mlp_dim = 100 193 | self.W_mlp = model.add_parameters((mlp_dim, in_dim)) 194 | self.b_mlp = model.add_parameters((mlp_dim)) 195 | else: 196 | mlp_dim = in_dim 197 | self.W_out = model.add_parameters((output_dim, mlp_dim)) 198 | self.b_out = model.add_parameters((output_dim)) 199 | 200 | def __call__(self, x): 201 | if self.mlp: 202 | W_mlp = dynet.parameter(self.W_mlp) 203 | b_mlp = dynet.parameter(self.b_mlp) 204 | input = dynet.rectify(W_mlp*x + b_mlp) 205 | else: 206 | input = x 207 | W_out = dynet.parameter(self.W_out) 208 | b_out = dynet.parameter(self.b_out) 209 | act = self.act(W_out*input + b_out) 210 | return act 211 | 212 | -------------------------------------------------------------------------------- /run_sluice_net.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main script 3 | """ 4 | import argparse 5 | import os 6 | import random 7 | import sys 8 | 9 | import numpy as np 10 | 11 | import dynet 12 | 13 | from constants import TASK_NAMES, LANGUAGES, EMBEDS, BALANCED, IMBALANCED, SGD, ADAM 14 | from sluice_net import SluiceNetwork, load 15 | import utils 16 | 17 | 18 | def check_activation_function(arg): 19 | """Checks allowed argument for --ac option.""" 20 | try: 21 | functions = [dynet.rectify, dynet.tanh] 22 | functions = {function.__name__: function for function in functions} 23 | functions['None'] = None 24 | return functions[str(arg)] 25 | except: 26 | raise argparse.ArgumentTypeError( 27 | 'String {} does not match required format'.format(arg, 
)) 28 | 29 | 30 | def main(args): 31 | 32 | 33 | train_score = {task: 0 for task in args.task_names} 34 | dev_score = {task: 0 for task in args.task_names} 35 | avg_train_score = 0 36 | avg_dev_score = 0 37 | 38 | if args.load: 39 | assert os.path.exists(args.model_dir),\ 40 | ('Error: Trying to load the model but %s does not exist.' % 41 | args.model_dir) 42 | print('Loading model from directory %s...' % args.model_dir) 43 | 44 | model_file = None 45 | params_file = None 46 | 47 | #Load models from different directory based on the type (STSL, MTSL, STML, MTML) 48 | if(len(args.task_names) ==1): 49 | 50 | if(len(args.languages) == 1): 51 | 52 | model_file = os.path.join(args.model_dir, 'STSL/{}_{}.model'.format(args.languages[0],args.task_names[0])) 53 | params_file = os.path.join(args.model_dir, 'STSL/{}_{}.pkl'.format(args.languages[0],args.task_names[0])) 54 | 55 | else: 56 | 57 | model_file = os.path.join(args.model_dir, 'STML/{}.model'.format(args.task_names[0])) 58 | 59 | params_file = os.path.join(args.model_dir, 'STML/{}.pkl'.format(args.task_names[0])) 60 | 61 | else: 62 | 63 | if(len(args.languages) ==1): 64 | 65 | model_file = os.path.join(args.model_dir, 'MTSL/{}.model'.format(args.languages[0])) 66 | params_file = os.path.join(args.model_dir, 'MTSL/{}.pkl'.format(args.languages[0])) 67 | else: 68 | model_file = os.path.join(args.model_dir, 'MTML/MTML.model') 69 | 70 | params_file = os.path.join(args.model_dir, 'MTML/MTML.pkl') 71 | 72 | 73 | 74 | model, train_score, dev_score, avg_train_score, avg_dev_score = load(params_file, model_file, args) 75 | 76 | if(args.continue_train):#Continue to train the loaded model 77 | train_score, dev_score, avg_train_score, avg_dev_score= model.fit(args.languages, args.test_languages, args.epochs, args.patience, args.opt, args.threshold, 78 | train_dir=args.train_dir, dev_dir=args.dev_dir)#added args.threshold 79 | 80 | else: 81 | model = SluiceNetwork(args.h_dim, 82 | args.h_layers, 83 | args.model_dir, 84 | args.log_dir, 85 | embeds=args.embeds, 86 | activation=args.activation, 87 | lower=args.lower, 88 | noise_sigma=args.sigma, 89 | task_names=args.task_names, 90 | languages = args.languages, 91 | cross_stitch=args.cross_stitch, 92 | num_subspaces=args.num_subspaces, 93 | constraint_weight=args.constraint_weight, 94 | constrain_matrices=args.constrain_matrices, 95 | cross_stitch_init_scheme= 96 | args.cross_stitch_init_scheme, 97 | layer_stitch_init_scheme= 98 | args.layer_stitch_init_scheme) 99 | train_score, dev_score, avg_train_score, avg_dev_score = model.fit(args.languages, args.test_languages, args.epochs, args.patience, args.opt, args.threshold, train_dir=args.train_dir, dev_dir=args.dev_dir) 100 | 101 | 102 | 103 | print('='*50) 104 | print('Start testing', ','.join(args.test_languages)) 105 | 106 | for test_lang in args.test_languages: 107 | test_X, test_Y, _ = utils.get_data( 108 | [test_lang], model.task_names, model.word2id, 109 | model.task2label2id, data_dir=args.test_dir, train=False) 110 | 111 | test_score = model.evaluate(test_X, test_Y, test_lang, args.threshold) 112 | 113 | 114 | 115 | 116 | print('='*50) 117 | print('\tStart logging {}'.format(test_lang)) 118 | 119 | 120 | utils.log_score(args.log_dir, args.languages, [test_lang], args.task_names, args.embeds, args.h_dim, args.cross_stitch_init_scheme, 121 | args.constraint_weight, args.sigma, args.opt, train_score, dev_score, test_score) 122 | 123 | 124 | print('\tFinished logging{}'.format(test_lang)) 125 | 126 | 127 | 128 | 129 | if __name__ == '__main__': 130 | 
parser = argparse.ArgumentParser( 131 | description='Run the Sluice Network', 132 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 133 | 134 | # DyNet parameters 135 | 136 | parser.add_argument('--dynet-autobatch', type=int, #automatically batch some operations to speed up computations 137 | help='use auto-batching (1) (should be first argument)') 138 | parser.add_argument('--dynet-gpus', type=int, 139 | help='Specify how many GPUs you want to use, if DyNet is compiled with CUDA') 140 | 141 | parser.add_argument('--dynet-devices', nargs='+', choices=['CPU', 'GPU:0', 'GPU:1', 'GPU:2', 'GPU:3'], 142 | help='Specify which GPUs do use') 143 | parser.add_argument('--dynet-seed', type=int, help='random seed for DyNet') 144 | parser.add_argument('--dynet-mem', type=int, help='memory for DyNet') 145 | 146 | # languages, tasks, and paths 147 | parser.add_argument('--languages', nargs='+', choices=LANGUAGES, 148 | help='the language datasets to be trained on ') 149 | 150 | parser.add_argument('--test-languages', nargs='+', choices=LANGUAGES, 151 | help='the language datasets to be tested on') 152 | 153 | parser.add_argument('--train-dir', required=True, 154 | help='the directory containing the training data') 155 | parser.add_argument('--dev-dir', required=True, 156 | help='the directory containing the development data') 157 | parser.add_argument('--test-dir', required=True, 158 | help='the directory containing the test data') 159 | 160 | parser.add_argument('--load', action='store_true', 161 | help='load the pre-trained model') 162 | 163 | parser.add_argument('--load-action', default='test', 164 | choices=['train', 'test'], 165 | help='action after loading the model') 166 | 167 | parser.add_argument('--task-names', nargs='+', default=TASK_NAMES, 168 | choices=TASK_NAMES, 169 | help='the names of the tasks (main task is first)') 170 | parser.add_argument('--model-dir', required=True, 171 | help='directory where to save model and param files') 172 | parser.add_argument('--log-dir', required=True, 173 | help='the directory where the results should be logged') 174 | parser.add_argument('--w-in-dim', type=int, default=64, 175 | help='default word embeddings dimension [default: 64]') 176 | #parser.add_argument('--c-in-dim', type=int, default=100, 177 | # help='input dim for char embeddings [default:100]') 178 | parser.add_argument('--h-dim', type=int, default=100, 179 | help='hidden dimension [default: 100]') 180 | parser.add_argument('--h-layers', type=int, default=1, 181 | help='number of stacked LSTMs [default: 1=no stacking]') 182 | parser.add_argument('--lower', action='store_true', 183 | help='lowercase words (not used)') 184 | parser.add_argument('--embeds', nargs='?',help='word embeddings file', 185 | choices=EMBEDS, default=None) 186 | 187 | 188 | parser.add_argument('--sigma', help='noise sigma', default=0.2, type=float) 189 | parser.add_argument('--activation', default='tanh', 190 | help='activation function [rectify, tanh, ...]', 191 | type=check_activation_function) 192 | parser.add_argument('--opt', '--optimizer', default=SGD, 193 | choices=[SGD, ADAM], 194 | help='trainer [sgd, adam] default: sgd') 195 | 196 | # training hyperparameters 197 | parser.add_argument('--epochs', type=int, default=30, 198 | help='training epochs [default: 30]') 199 | parser.add_argument('--patience', default=1, type=int, 200 | help='patience for early stopping') 201 | 202 | parser.add_argument('--cross-stitch', action='store_true', 203 | help='use cross-stitch units between LSTM layers') 204 | 205 | 206 
| 207 | parser.add_argument('--num-subspaces', default=1, type=int, choices=[1, 2], 208 | help='the number of subspaces for cross-stitching; ' 209 | 'only 1 (no subspace) or 2 allowed currently') 210 | parser.add_argument('--constraint-weight', type=float, default=0., 211 | help='weighting factor for orthogonality constraint on ' 212 | 'cross-stitch subspaces; 0 = no constraint') 213 | parser.add_argument('--constrain-matrices', type=int, nargs='+', 214 | default=[1, 2], 215 | help='the indices of the LSTM matrices that should be ' 216 | 'constrained; indices correspond to: Wix,Wih,Wic,' 217 | 'bi,Wox,Woh,Woc,bo,Wcx,Wch,bc. Best indices so ' 218 | 'far: [1, 2] http://dynet.readthedocs.io/en/latest/python_ref.html#dynet.LSTMBuilder.get_parameter_expressions)') 219 | parser.add_argument('--cross-stitch-init-scheme', type=str, 220 | default=BALANCED, choices=[IMBALANCED, BALANCED], 221 | help='which initialisation scheme to use for the ' 222 | 'alpha matrix - currently available: imbalanced ' 223 | 'and balanced (which sets all to ' 224 | '1/(num_tasks*num_subspaces)). Only available ' 225 | 'with subspaces.') 226 | parser.add_argument('--layer-stitch-init-scheme', type=str, 227 | default=BALANCED, 228 | choices=[BALANCED, IMBALANCED], 229 | help='initialisation scheme for layer-stitch units; ' 230 | 'default: imbalanced (.9) for last layer weights;' 231 | 'other choice: balanced (1. / num_layers).') 232 | 233 | parser.add_argument('--threshold', type=float,default=0.5, 234 | help='threshold for classfication') 235 | args = parser.parse_args() 236 | main(args) 237 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility methods for data processing. 3 | """ 4 | import os 5 | from glob import glob 6 | import itertools 7 | import csv 8 | import pandas as pd 9 | from constants import NUM, NUMBERREGEX, UNK, WORD_START, WORD_END, EMBEDS_FILES, FULL_LANG, LABELS, MODIFIED_LABELS 10 | 11 | def print_task_labels(task_name, label2id, id_sequence, file): 12 | #Convert label_id sequence to label sequence and write to file 13 | #changed the original function completely 14 | with open(file, 'a+') as f: 15 | writer = csv.writer(f,delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL) 16 | writer.writerow(['ID', task_name]) 17 | label_list=dict() 18 | for task_id, labels_ids in label2id.items(): 19 | if task_name==task_id: 20 | for label, idx in labels_ids.items(): 21 | label_list[label] = idx 22 | #print(label_list) 23 | 24 | count = 1 25 | #with open(file, 'a+') as f: 26 | for label_idx_seq in id_sequence: 27 | #Create a label_sequence for each tweet 28 | label_seq = [] 29 | for task, label_idx in label_idx_seq.items(): 30 | #intialize_values 31 | #target_val='' 32 | #group_val='' 33 | #annotator_val=[] 34 | #sentiment_val=[] 35 | #Non multilabel_tasks, labels are of the form [1, [7], [12], ... 36 | if task==task_name: 37 | if task=='target' or task =='group' or task=='directness': 38 | for target_label, indice in label2id[task].items(): 39 | if indice==label_idx[0]: 40 | if task=='target': 41 | val=target_label 42 | else: 43 | val=target_label 44 | #Multilabel tasks, labels are of the form [1, 0, 0, 1, 0, 0], ... 
such that each column represents one label 45 | elif task=='annotator_sentiment': 46 | val=[] 47 | for j in range(len(label_idx)): 48 | if label_idx[j]>0: 49 | for label, indice in label2id[task].items(): 50 | #if labels[j]==1 or label number j ==1 append the name of the label 51 | if indice==j: 52 | val.append(label) 53 | elif task=='sentiment': 54 | val=[] 55 | for j in range(len(label_idx)): 56 | if label_idx[j]>0: 57 | for label, indice in label2id[task].items(): 58 | #if labels[j]==1 or label number j ==1 append the name of the label 59 | if indice==j: 60 | val.append(label) 61 | writer.writerow([count,val]) 62 | count+=1 63 | #target_val='' 64 | #group_val='' 65 | #annotator_val=[] 66 | #sentiment_val=[] 67 | 68 | 69 | f.close() 70 | 71 | 72 | 73 | 74 | #write functions for studying correlations 75 | def save_generated_labels_in_csv_file(label2id, id_sequence, file): 76 | #Convert label_id sequence to label sequence and write to file 77 | #changed the original function completely 78 | with open(file, 'a+') as f: 79 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 80 | writer.writerow(['ID','annotator_sentiment','sentiment','group','target']) 81 | label_list=dict() 82 | for task, labels_ids in label2id.items(): 83 | #print (task) 84 | for label, idx in labels_ids.items(): 85 | label_list[label] = idx 86 | #print(label_list) 87 | 88 | count = 1 89 | #with open(file, 'a+') as f: 90 | for label_idx_seq in id_sequence: 91 | #Create a label_sequence for each tweet 92 | label_seq = [] 93 | for task, label_idx in label_idx_seq.items(): 94 | #intialize_values 95 | #target_val='' 96 | #group_val='' 97 | #annotator_val=[] 98 | #sentiment_val=[] 99 | #Non multilabel_tasks, labels are of the form [1, [7], [12], ... 100 | if task=='target' or task =='group': 101 | for target_label, indice in label2id[task].items(): 102 | if indice==label_idx[0]: 103 | if task=='target': 104 | target_val=target_label 105 | else: 106 | group_val=target_label 107 | #Multilabel tasks, labels are of the form [1, 0, 0, 1, 0, 0], ... 
such that each column represents one label 108 | elif task=='annotator_sentiment': 109 | annotator_val=[] 110 | for j in range(len(label_idx)): 111 | if label_idx[j]>0: 112 | for label, indice in label2id[task].items(): 113 | #if labels[j]==1 or label number j ==1 append the name of the label 114 | if indice==j: 115 | annotator_val.append(label) 116 | elif task=='sentiment': 117 | sentiment_val=[] 118 | for j in range(len(label_idx)): 119 | if label_idx[j]>0: 120 | for label, indice in label2id[task].items(): 121 | #if labels[j]==1 or label number j ==1 append the name of the label 122 | if indice==j: 123 | sentiment_val.append(label) 124 | writer.writerow([count,sentiment_val,target_val,group_val,annotator_val]) 125 | #target_val='' 126 | #group_val='' 127 | #annotator_val=[] 128 | #sentiment_val=[] 129 | count+=1 130 | 131 | f.close() 132 | 133 | 134 | 135 | def get_label(label2id, id_sequence, file): 136 | #Convert label_id sequence to label sequence and write to file 137 | #changed the original function completely 138 | label_list=dict() 139 | for task, labels_ids in label2id.items(): 140 | #print (task) 141 | for label, idx in labels_ids.items(): 142 | label_list[label] = idx 143 | #print(label_list) 144 | 145 | 146 | count = 1 147 | with open(file, 'a+') as f: 148 | for label_idx_seq in id_sequence: 149 | #Create a label_sequence for each tweet 150 | label_seq = [] 151 | for task, label_idx in label_idx_seq.items(): 152 | #Non multilabel_tasks, labels are of the form [1, [7], [12], ... 153 | if task=='target' or task =='group': 154 | for target_label, indice in label2id[task].items(): 155 | if indice==label_idx[0]: 156 | label_seq.append(target_label) 157 | #Multilabel tasks, labels are of the form [1, 0, 0, 1, 0, 0], ... such that each column represents one label 158 | elif task=='annotator_sentiment' or task =='sentiment': 159 | for j in range(len(label_idx)): 160 | if label_idx[j]>0: 161 | for label, indice in label2id[task].items(): 162 | #if labels[j]==1 or label number j ==1 append the name of the label 163 | if indice==j: 164 | label_seq.append(label) 165 | f.write(str(count) +'.\t'+','.join(label_seq) +'\n') 166 | count+=1 167 | 168 | f.close() 169 | 170 | def normalize(word): 171 | """Normalize a word by lower-casing it or replacing it if it is a number.""" 172 | return NUM if NUMBERREGEX.match(word) else word.lower() 173 | 174 | def average_by_task(score_dict): 175 | #Compute unweighted average of all metrics among all tasks 176 | total = 0 177 | count = 0 178 | 179 | for key in score_dict: 180 | 181 | total+=(score_dict[key]['micro_f1'] + score_dict[key]['macro_f1']) 182 | count+=2 183 | 184 | 185 | return total/float(count) 186 | 187 | def average_by_lang(score_list, data_size_list, total_data_size): 188 | #Compute weighted average of all languages 189 | res = 0 190 | 191 | for idx in range(len(score_list)): 192 | ratio = float(data_size_list[idx]) / total_data_size 193 | res += ratio * score_list[idx] 194 | 195 | return res 196 | 197 | def load_embeddings_file(embeds, languages, sep=" ", lower=False): 198 | """Loads a word embedding file.""" 199 | 200 | 201 | embed_dir = EMBEDS_FILES[embeds] 202 | file_name_list = [] 203 | for f in os.listdir(embed_dir): 204 | if (any([f.endswith(lang+'.vec') for lang in languages])): 205 | file_name_list.append(os.path.join(embed_dir,f)) 206 | 207 | 208 | word2vec = {} 209 | total_num_words = 0 210 | embed_dim = 0 211 | encoding = None 212 | for file_name in file_name_list: 213 | print('\n\n Loading {}.....\n\n'.format(file_name)) 214 | 
if(file_name.endswith('ar.vec') or file_name.endswith('fr.vec')): 215 | encoding='utf-8' 216 | with open(file=file_name, mode='r', encoding=encoding) as f: 217 | (num_words, embed_dim) = (int(x) for x in f.readline().rstrip('\n').split(' ')) 218 | total_num_words+=num_words 219 | for idx, line in enumerate(f): 220 | if((idx+1)%(1e+5)==0): 221 | print('Loading {}/{} words'.format(idx+1, num_words)) 222 | fields = line.rstrip('\n').split(sep) 223 | vec = [float(x) for x in fields[1:]] 224 | word = fields[0] 225 | if lower: 226 | word = word.lower() 227 | word2vec[word] = vec 228 | print('Loaded pre-trained embeddings of dimension: {}, size: {}, lower: {}' 229 | .format(embed_dim, total_num_words, lower)) 230 | return word2vec, embed_dim 231 | 232 | 233 | 234 | 235 | 236 | 237 | def get_data(languages, task_names, word2id=None, task2label2id=None, data_dir=None, 238 | train=True, verbose=False): 239 | """ 240 | :param languages: a list of languages from which to obtain the data 241 | :param task_names: a list of task names 242 | :param word2id: a mapping of words to their ids 243 | :param char2id: a mapping of characters to their ids 244 | :param task2label2id: a mapping of tasks to a label-to-id dictionary 245 | :param data_dir: the directory containing the data 246 | :param train: whether data is used for training (default: True) 247 | :param verbose: whether to print more information re file reading 248 | :return X: a list of tuples containing a list of word indices and a list of 249 | a list of character indices; 250 | Y: a list of dictionaries mapping a task to a list of label indices; 251 | org_X: the original words; a list of lists of normalized word forms; 252 | org_Y: a list of dictionaries mapping a task to a list of labels; 253 | word2id: a word-to-id mapping; 254 | char2id: a character-to-id mapping; 255 | task2label2id: a dictionary mapping a task to a label-to-id mapping. 256 | """ 257 | X = [] 258 | Y = [] 259 | org_X = [] 260 | org_Y = [] 261 | 262 | # for training, we initialize all mappings; for testing, we require mappings 263 | if train: 264 | 265 | # create word-to-id, character-to-id, and task-to-label-to-id mappings 266 | word2id = {} 267 | 268 | 269 | # set the indices of the special characters 270 | word2id[UNK] = 0 # unk word / OOV 271 | 272 | 273 | for language in languages: 274 | num_sentences = 0 275 | num_tokens = 0 276 | 277 | full_lang = FULL_LANG[language] 278 | #file_reader = iter(()) 279 | language_path = os.path.join(data_dir, full_lang) 280 | 281 | 282 | assert os.path.exists(language_path), ('language path %s does not exist.' 
283 | % language_path) 284 | 285 | csv_file = os.path.join(language_path,os.listdir(language_path)[0]) 286 | 287 | df = pd.read_csv(csv_file) 288 | 289 | 290 | #Column headers are HITId, tweet, sentiment, directness, annotator_sentiment, target, group 291 | 292 | for index, instance in df.iterrows(): 293 | num_sentences+=1 294 | #sentence = instance['tweet'].split() 295 | sentence = instance['tweet'].split() 296 | 297 | sentence_word_indices = [] # sequence of word indices 298 | sentence_char_indices = [] # sequence of char indice 299 | 300 | # keep track of the label indices and labels for each task 301 | sentence_task2label_indices = {} 302 | 303 | for i, word in enumerate(sentence): 304 | num_tokens+=1 305 | 306 | if train and word not in word2id: 307 | word2id[word] = len(word2id) 308 | 309 | sentence_word_indices.append(word2id.get(word, word2id[UNK])) 310 | 311 | 312 | 313 | 314 | labels = None 315 | 316 | for task in task2label2id.keys(): 317 | if('sentiment' in task): 318 | labels = instance[task].split('_') 319 | else: 320 | labels = [instance[task]] 321 | 322 | if('sentiment' in task):#Multi-label 323 | 324 | sentence_task2label_indices[task]=[0]*len(task2label2id[task]) 325 | 326 | for label in labels: 327 | label_idx = task2label2id[task][label] 328 | sentence_task2label_indices[task][label_idx]=1 329 | 330 | 331 | else: 332 | 333 | sentence_task2label_indices[task] = [task2label2id[task][labels[0]]] 334 | 335 | 336 | X.append(sentence_word_indices) 337 | Y.append(sentence_task2label_indices) 338 | 339 | assert len(X) == len(Y) 340 | return X, Y, word2id 341 | 342 | 343 | 344 | #Log the training process 345 | 346 | def log_fit(log_dir, epoch, languages, test_lang, task_names, train_score, dev_score): 347 | if(len(task_names) ==1): 348 | task_name = task_names[0] 349 | 350 | if(len(languages) == 1): 351 | task_directory = os.path.join(log_dir,'STSL/') 352 | if not os.path.exists(task_directory): 353 | os.mkdir(task_directory) 354 | file = os.path.join(log_dir, 'STSL/{}_{}.csv'.format(languages[0],task_names[0])) 355 | 356 | else: 357 | task_directory = os.path.join(log_dir,'STML/') 358 | if not os.path.exists(task_directory): 359 | os.mkdir(task_directory) 360 | file = os.path.join(log_dir, 'STML/{}.csv'.format(task_names[0])) 361 | 362 | #This function needs to be changed 363 | if(os.path.exists(file)): 364 | with open(file, 'a') as f: 365 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 366 | 367 | 368 | writer.writerow([epoch, test_lang, train_score[task_name]['micro_f1'], train_score[task_name]['macro_f1'], 369 | dev_score[task_name]['micro_f1'], dev_score[task_name]['macro_f1']]) 370 | 371 | else: 372 | with open(file, 'a') as f: 373 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 374 | 375 | writer.writerow(['epoch', 'test_lang', task_name+'-train-micro-f1', task_name+'-train-macro-f1', 376 | task_name+'-dev-micro-f1', task_name+'-dev-macro-f1']) 377 | 378 | writer.writerow([epoch, test_lang, train_score[task_name]['micro_f1'], train_score[task_name]['macro_f1'], 379 | dev_score[task_name]['micro_f1'], dev_score[task_name]['macro_f1']]) 380 | 381 | f.close() 382 | 383 | else: 384 | 385 | if(len(languages) ==1): 386 | task_directory = os.path.join(log_dir,'MTSL/') 387 | if not os.path.exists(task_directory): 388 | os.mkdir(task_directory) 389 | file = os.path.join(log_dir, 'MTSL/{}.csv'.format(languages[0])) 390 | 391 | 392 | else: 393 | task_directory = os.path.join(log_dir,'MTML/') 394 | if not 
os.path.exists(task_directory): 395 | os.mkdir(task_directory) 396 | 397 | file = os.path.join(log_dir, 'MTML/log.csv') 398 | 399 | 400 | task_name_list = [] 401 | 402 | task_f1_list = [] 403 | #changed for task_name in task_names to for task_name in task_names: 404 | for task_name in task_names: 405 | task_name_list+=[task_name+'-train-micro-f1', task_name+'-train-macro-f1', 406 | task_name+'-dev-micro-f1', task_name+'-dev-macro-f1'] 407 | 408 | task_f1_list +=[train_score[task_name]['micro_f1'], train_score[task_name]['macro_f1'], 409 | dev_score[task_name]['micro_f1'], dev_score[task_name]['macro_f1']] 410 | 411 | 412 | if(os.path.exists(file)): 413 | #print("File exists: ") 414 | #print(file) 415 | #file = open(file, 'a') 416 | with open(file, 'a') as f: 417 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 418 | writer.writerow([epoch, test_lang]+ task_f1_list) 419 | 420 | f.close() 421 | 422 | else: 423 | #print("File does not exist: ") 424 | #print(file) 425 | with open(file, 'a') as f: 426 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 427 | writer.writerow(['epoch', 'test_lang'] + task_name_list ) 428 | writer.writerow([epoch, test_lang]+ task_f1_list ) 429 | 430 | 431 | f.close() 432 | 433 | 434 | 435 | 436 | #Log the final score 437 | 438 | def log_score(log_dir, languages, test_lang, task_names, embeds,h_dim, cross_stitch_init, 439 | constraint_weight, sigma, optimizer, train_score, dev_score, test_score): 440 | 441 | 442 | if(len(task_names) ==1): 443 | task_name = task_names[0] 444 | 445 | if(len(languages) == 1): 446 | task_directory = os.path.join(log_dir,'STSL/') 447 | if not os.path.exists(task_directory): 448 | os.mkdir(task_directory) 449 | file = os.path.join(log_dir, 'STSL/{}_{}.csv'.format(languages[0],task_names[0])) 450 | 451 | else: 452 | task_directory = os.path.join(log_dir,'STML/') 453 | if not os.path.exists(task_directory): 454 | os.mkdir(task_directory) 455 | file = os.path.join(log_dir, 'STML/{}.csv'.format(task_names[0])) 456 | 457 | 458 | if(os.path.exists(file)): 459 | with open(file, 'a') as f: 460 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 461 | writer.writerow([embeds,test_lang, h_dim, cross_stitch_init, constraint_weight, sigma, optimizer, 462 | train_score[task_name]['micro_f1'], train_score[task_name]['macro_f1'], 463 | dev_score[task_name]['micro_f1'], dev_score[task_name]['macro_f1'], 464 | test_score[task_name]['micro_f1'], test_score[task_name]['macro_f1']]) 465 | print([embeds,test_lang, h_dim, cross_stitch_init, constraint_weight, sigma, optimizer, 466 | train_score[task_name]['micro_f1'], train_score[task_name]['macro_f1'], 467 | dev_score[task_name]['micro_f1'], dev_score[task_name]['macro_f1'], 468 | test_score[task_name]['micro_f1'], test_score[task_name]['macro_f1']]) 469 | 470 | else: 471 | with open(file, 'a') as f: 472 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 473 | 474 | writer.writerow(['embeds', 'test_lang', 'h_dim', 'cross_stitch_init', 'constraint_weight', 'sigma', 'optimizer', 475 | task_name+'-train-micro-f1', task_name+'-train-macro-f1', task_name+'-dev-micro-f1', task_name+'-dev-macro-f1', 476 | task_name+'-test-micro-f1', task_name+'-test-macro-f1']) 477 | print(['embeds', 'test_lang', 'h_dim', 'cross_stitch_init', 'constraint_weight', 'sigma', 'optimizer', 478 | task_name+'-train-micro-f1', task_name+'-train-macro-f1', task_name+'-dev-micro-f1', task_name+'-dev-macro-f1', 479 | 
task_name+'-test-micro-f1', task_name+'-test-macro-f1']) 480 | 481 | writer.writerow([embeds,test_lang, h_dim, cross_stitch_init, constraint_weight, sigma, optimizer,\ 482 | train_score[task_name]['micro_f1'], train_score[task_name]['macro_f1'], 483 | dev_score[task_name]['micro_f1'], dev_score[task_name]['macro_f1'], 484 | test_score[task_name]['micro_f1'], test_score[task_name]['macro_f1']]) 485 | #added line 486 | #add test here 487 | #end of add 488 | print([embeds,test_lang, h_dim, cross_stitch_init, constraint_weight, sigma, optimizer,\ 489 | train_score[task_name]['micro_f1'], train_score[task_name]['macro_f1'], 490 | dev_score[task_name]['micro_f1'], dev_score[task_name]['macro_f1'], 491 | test_score[task_name]['micro_f1'], test_score[task_name]['macro_f1']]) 492 | 493 | 494 | f.close() 495 | 496 | else: 497 | 498 | if(len(languages) ==1): 499 | task_directory = os.path.join(log_dir,'MTSL/') 500 | if not os.path.exists(task_directory): 501 | os.mkdir(task_directory) 502 | file = os.path.join(log_dir, 'MTSL/{}.csv'.format(languages[0])) 503 | 504 | else: 505 | task_directory = os.path.join(log_dir,'MTML/') 506 | if not os.path.exists(task_directory): 507 | os.mkdir(task_directory) 508 | file = os.path.join(log_dir, 'MTML/log.csv') 509 | 510 | 511 | task_name_list = [] 512 | 513 | task_f1_list = [] 514 | 515 | for task in task_names: 516 | task_name_list+=[task+'-train-micro-f1', task+'-train-macro-f1', task+'-dev-micro-f1', task+'-dev-macro-f1', task+'-test-micro-f1', task+'-test-macro-f1'] 517 | 518 | task_f1_list +=[ train_score[task]['micro_f1'], train_score[task]['macro_f1'], dev_score[task]['micro_f1'], dev_score[task]['macro_f1'], test_score[task]['micro_f1'], test_score[task]['macro_f1']] 519 | 520 | if(os.path.exists(file)): 521 | with open(file, 'a') as f: 522 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 523 | writer.writerow([embeds, test_lang, h_dim, cross_stitch_init, constraint_weight, sigma,optimizer]+\ 524 | task_f1_list) 525 | print([embeds, test_lang, h_dim, cross_stitch_init, constraint_weight, sigma,optimizer]+\ 526 | task_f1_list) 527 | 528 | 529 | f.close() 530 | 531 | else: 532 | with open(file, 'a') as f: 533 | writer = csv.writer(f,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) 534 | writer.writerow(['embeds', 'test_lang', 'h_dim', 'cross_stitch_init', 'constraint_weight', 'sigma']\ 535 | +task_name_list) 536 | writer.writerow([embeds, test_lang,h_dim, cross_stitch_init, constraint_weight, sigma,optimizer]+\ 537 | task_f1_list ) 538 | print(['embeds', 'test_lang', 'h_dim', 'cross_stitch_init', 'constraint_weight', 'sigma']\ 539 | +task_name_list) 540 | print([embeds, test_lang,h_dim, cross_stitch_init, constraint_weight, sigma,optimizer]+\ 541 | task_f1_list ) 542 | 543 | 544 | f.close() 545 | 546 | 547 | 548 | 549 | -------------------------------------------------------------------------------- /guidelines.tar: -------------------------------------------------------------------------------- 1 | arabic_guidelines.html0000664000176200017620000001654213541352256016571 0ustar nousidhoumnousidhoum

.نقوم في ما يلي بدراسة خطاب الكراهية الذي يمكن أن يُعرف بكونه أي عبارات تؤيد التحريض على الضرر (خاصة التمييز أو العدوانية أو العنف) حسب الهدف الذي تم استهدافه وسط مجموعة اجتماعية أوسُكانية وتكون هذه المجموعات عادة من الضعفاء والأقليات

2 |

.نصنف في التالي خطاب الكراهية بناءا على شعور كاتب التغريدة، شعور القارئ و الفئة المستهدفة من التغريدة. عند تصنيف التغريدات يرجى أخذ أسلوب التغريدة و انطباعك عن كاتبها بعين الاعتبار

3 |

.خطاب الكراهية قد يكون مباشرا أو غير مباشر اعتمادا على نوعية الألفاظ المستعملة، صراحة التعبير عن الفكرة المروج لها في التغريدة و احتمال استعمال صورة بلاغية مهينة

4 |

.يندرج تصنيف شعور الكاتب في سياق كلام التغريدة فقد يكون مفعما بالكراهية إن كان محرضا .للعنف، متعسفا او بذيئا، قبيحا، خائفا، قليل الاحترام أو عاديا

5 |

.التغريدة قد تثير صدمتك، خوفك، غضبك، حزنك، حيرتك (إن كنت غير متأكد) أو لا مبالاتك إن ظننت أنها غير عدوانية

6 |

أخيرا، اختر الفئة المستهدفة من التغريدة بناءا على الصفة التي تشمل أفراد المجتمع المعنيين بها. قد تتمثل هذد الصفة في الجنسية، الديانة، العرق، الأصل، الجنس، الهوية الجنسية، التوجه الجنسي أو الاحتياجات الخاصة (عقلية كانت أم جسدية) إضافة إلى الفئة المحددة كالنساء، اللاجئين، ...

7 |

 

8 |

ملحوظة

9 |

.لقد تم إخفاء هوية مستعملي التويتر و كل الروابط المستعملة احتراما لخصوصيتهم

10 |

خطاب هذه التغريدة

11 |
 
12 |

${TWEET}

13 |
 
14 |
مباشر
غير مباشر
15 |

 

16 |

كيف يظهرلك كاتب التغريدة؟

17 |

متعسف/بذيء
مفعم بالكراهية
قبيح
خائف
قليل الإحترام
عادي

18 |

ما هو شعورك بعد قراءة التغريدة؟

19 |
صدمة
غضب
حزن
خوف
حيرة
لا مبالاة
20 |

فيم يشترك أفراد الفئة المستهدفة من التغريدة؟

21 |

الأصل
الديانة
العرق
الجنسية
الجِنْس
الإحتياجات الخاصة
التوجه الجنسي
الهوية الجنسية
أخرى

22 |

ما هي الفئة المستهدفة؟

23 |

الآسيويون
ذووالإحتياجات الخاصة
ذوو الأصول الإفريقية
اللاتينيون
اليساريون
الهنود
الهندوس
المسيحيون
النساء
ذوو البشرة السمراء
الصينيون
العرب
المسلمون
اليهود
المهاجرون
اللاجئون
المثليون
آخرون

24 |

french_guidelines.html

Nous conduisons une étude sur le discours haineux sur Twitter. Le discours haineux, le plus souvent utilisé sous le terme anglophone "hate speech", désigne un type de discours qui attaque une personne ou un groupe de personnes sur la base de caractéristiques discriminatoires telles que la race, l'âge, l'invalidité, le sexe, l'origine ethnique, l'orientation sexuelle, la religion, etc.

25 |

Nous voudrions classer ce discours selon le sentiment qui émane, selon vous, de la personne ayant rédigé le tweet, le vôtre, ainsi que la personne ou le groupe d'individus ciblés par ce tweet.

26 |

Pour ce faire, veuillez prendre en compte le contenu du tweet, le ton et l'impression que vous donne l'utilisateur l'ayant rédigé. Le tweet peut être direct ou indirect suivant le langage utilisé par son auteur, l'utilisation d'idiomes ou de métaphores et si l'auteur est insultant envers un individu ou un groupe de personnes, etc.

27 |

Le tweet peut être abusif si l'auteur vous semble dangereux ou menaçant, haineux, grossier, irrespectueux, craintif si l'auteur vous semble avoir peur d'un certain phénomène par ignorance ou incertitude ou bien normal.

28 |

Veuillez noter que l'utilisation d'obscénités, de vocabulaire toxique ou de termes calomnieux met les tweets dans des catégories variant d'irrespectueux et grossier à haineux et abusif, mais qu'un discours haineux ou toxique ne contient pas forcément de termes calomnieux directs.

29 |

Veuillez choisir l'attribut selon lequel le tweet vous semble discriminatoire. Cela peut être la race, la religion, l'ethnicité, la nationalité, le genre/sexe, l'identité sexuelle, l'orientation sexuelle ou l'invalidité. Puis, veuillez cocher le nom du groupe ciblé.

30 |

Le tweet peut provoquer en vous un choc, de la colère, de la tristesse, de la peur, de la confusion s'il vous semble ambivalent et peut vous laisser indifférent s'il vous semble normal.

31 |

NB.

32 |

 Les @mentions et les URLs sont masquées pour protéger l'anonymat des utilisateurs.

33 |

Ce Tweet est

34 |
direct
indirect
35 |

Quel adjectif qualifierait le mieux ce tweet?

36 |
Abusif
Haineux
Offensif
Craintif
Irrespectueux
Normal
37 |

Quel adjectif décrirait le mieux ce que vous ressentez ?

38 |
Choc
Colère
Tristesse
Peur
Confusion/Ambivalence
Indifférence
39 |

Sous quel attribut ce tweet vous semble discriminatoire?

40 |
Race
Religion
Ethnicité
Nationalité
Genre
Invalidité
Orientation sexuelle
Identité sexuelle
Autre
41 |

Quelle catégorie de personnes cible-t-il?

42 |

Les Asiatiques
Les personnes invalides
Les personnes d'origine africaine
Les Hispaniques
Les socialistes /les personnes de gauche
Les Indiens
Les Hindous
Les Chrétiens
Les femmes
Les personnes de couleur
Les Chinois
Les Arabes
Les Musulmans
Les Juifs
Les migrants
Les réfugiés
Les homosexuels
Autres

43 |

english_guidelines.html
44 |

We are conducting an academic survey about hate speech and offensive language on Twitter. The following tweets most likely contain hate speech that attacks an individual or group of people on the basis of attributes such as race, religion, ethnic origin, national origin, gender or gender identity, special needs or sexual orientation.

45 |

We would like to identify different types of hate speech given its target, the sentiment it may spread and what you feel when you read it.

46 |

Please take into consideration the content of the tweet, whether the attack is easily generalizable to a group of people, the tone used by the tweet's author, and the impression s/he gives you. The tweet can be direct or indirect hate speech, depending, for instance, on whether the name of the target group is explicitly expressed or the tweet's author uses an insulting metaphor or idiom, etc.

47 |

The tweet can be abusive, hateful, offensive, fearful, disrespectful or normal. If the tweet sounds dangerous, it should be labelled abusive. The use of slurs and/or obscene words puts it into categories that vary from disrespectful to offensive and hateful, depending on its tone, on whether it targets an individual or a group of people, and on its topic. Please remember that hate speech does not necessarily contain slurs, and some elusive hate may be hidden behind sarcasm. Finally, if the tweet expresses or spreads fear against a group of people, it should be annotated as fearful. Some tweets might be tagged as normal.
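
As an added illustration (not part of the original guidelines), the sketch below shows how an underscore-joined multi-label sentiment annotation such as 'offensive_hateful' can be expanded into a binary indicator vector over the six categories above, mirroring the '_'-splitting logic that get_data() in utils.py applies to the 'sentiment' column; the helper name encode_sentiment and the sample annotation are hypothetical, and the exact label order and spellings used in the released files may differ.

# Hypothetical helper mirroring the multi-label handling in utils.get_data().
SENTIMENT_LABELS = ['abusive', 'hateful', 'offensive', 'fearful', 'disrespectful', 'normal']
LABEL2ID = {label: idx for idx, label in enumerate(SENTIMENT_LABELS)}

def encode_sentiment(annotation):
    """Turn an underscore-joined annotation string into a 0/1 indicator vector."""
    vector = [0] * len(SENTIMENT_LABELS)
    for label in annotation.split('_'):
        vector[LABEL2ID[label]] = 1
    return vector

print(encode_sentiment('offensive_hateful'))  # -> [0, 1, 1, 0, 0, 0]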

48 |

We would also like to know whether this tweet leaves you disgusted, shocked, annoyed or angry, sad, unsure/confused if it sounds ambivalent, or indifferent if you consider it to be normal.

49 |

Please check, in the appropriate boxes, the discriminatory attribute of each tweet. Remember that race refers to a person's physical characteristics, such as bone structure and skin, hair, or eye color. Ethnicity, however, refers to cultural factors, including nationality, regional culture, ancestry, and language. Many nationalities can share one ethnic group, and several ethnic groups may have the same race.

50 |

Example: "Immigrants are stealing our jobs" is a fearful tweet. It targets people based on their nationality (immigrants, specifically), and it may provoke sadness in you.
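
To connect this worked example to the released data, here is a minimal, hypothetical sketch of how such an annotation could appear as a single row with the column headers read by get_data() in utils.py (HITId, tweet, sentiment, directness, annotator_sentiment, target, group); all values, including the directness choice and the exact label spellings, are illustrative only.

import pandas as pd

# One made-up annotated row; column names follow the headers used in utils.get_data().
row = {'HITId': 'EXAMPLE_HIT',
       'tweet': 'Immigrants are stealing our jobs',
       'sentiment': 'fearful',
       'directness': 'indirect',
       'annotator_sentiment': 'sadness',
       'target': 'nationality',
       'group': 'immigrants'}
df = pd.DataFrame([row])
print(df.loc[0, ['sentiment', 'target', 'group']].to_dict())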

51 |

Please, also keep in mind that racist or sexist comments fall into intersecting sub-categories. For example, a racist tweet can be hateful towards a certain race and a sexist one may be offensive towards women.

52 |

If the tweet targets more than one group, choose the one it may offend the most.

53 |

54 |

NB. The @mentions and URLs are anonymized for privacy reasons.

55 |

Further explanations 

56 |

We have used the Urban Dictionary to provide you with definitions of some derogatory terms and slurs you may encounter.

57 |

Ching Chong is an extremely racist slur that mocks Asian people, especially Chinese people.

58 |

Spic is a racial slur for people of Latino descent.

59 |

Rag head refers to any ethnic individual whose culture and/or religion dictates that they wear a cloth device/headdress on their head.

60 |

Muzzie refers to a muslim.

61 |

Yobbo refers to a stereotypical Australian.

62 |

Limey refers to a British person.

63 |

Plastic paddy refers to a person who retains a strong sense of Irish cultural identity despite not having been born in Ireland or being of only partial (if any) Irish descent; generally used in reference to Irish-English or Irish-Americans. Perceived as irritating poseurs by Irish nationals.

64 |

Bitter Clinger was defined in a public press conference as referring to people from the US who are not progressive, socialist, or liberal.

65 |

Boojie means fancy. The term is derived from the French word, bourgeoisie.

66 |

Rube, redneck and white trash are derogatory terms for American Southerners or very poor white people.

67 |

Moon cricket is a slur derived from early slave times when black people would come out at night and sing slave songs under the moonlight like crickets.

68 |

Surrender monkey refers to a Frenchman or a soldier in the French army.

69 |

Bint is the English slang for b****.

70 |

Trailer trash is a derogatory description for a person who seems well-suited to residential life in a mobile home park and is distinguished by poor hygiene, foul language, slovenly or slutty clothing, and general ignorance.

71 |

Dyke is a slur used to refer to Lesbians.

72 |

Downy refers to someone with Down Syndrome.

73 |

Soup-taker is used against a person who has sold out their beliefs, referring to the Irish potato famine when some Catholics converted to a Protestant faith in order to gain access to a free meal.

74 |

Wetback is a derogatory term used to describe Mexicans who have immigrated illegally to the United States by swimming or wading across the Rio Grande--the river that separates the U.S. from Mexico. U.S. Border Patrol began using the word in 1944 to refer to illegal Mexican immigrants who were easily identifiable by their wet clothing.

75 |

Is the following tweet

76 |
 
77 |

${TWEET}

78 |
 
79 |
Direct hate speech
Indirect hate speech
80 |

 

81 |

What adjective would describe the tweet the best?

Abusive
Hateful
Offensive
Fearful
Disrespectful
Normal

What feeling does this tweet provoke in you?

82 |
Shock
Disgust
Anger
Sadness
Fear
Confusion/Ambivalence
Indifference
83 |

Based on what attribute does it discriminate against people?

Race
Religion
Ethnicity
Nationality
Gender/ Gender identity
Special needs
Sexual orientation
Other
84 |

Who does it target?

It is a personal attack that cannot be generalized
Asians/ Nationals of an Asian country
People with special needs
People from an African descent
Hispanics
"Leftists"/ Socialists
Indians
Hindu
Christian
Women
People of color
Chinese
Arabs
Muslims
Jewish people
Immigrants
Refugees
Gay people
Other

85 |

-------------------------------------------------------------------------------- /pilot_dataset_tweets_only.tar: -------------------------------------------------------------------------------- 1 | pilot_english_tweets.csv0000664000176200017620000002452113533400713017207 0ustar nousidhoumnousidhoum0,Hopefully they investigate the diseases that immigrants bring into America like AIDS and EBOLA and @URL 2 | 1,Horrible 'Nigerian disease' called Monkeypox spreads in the UK for the first time @URL 3 | 2,@user You forgot the mass uncontrolled 3rd world immigration plus the crime and disease they bring into the UK 4 | 3,We should close the borders until the Wall is built. They never assimilate & bring disease that's been eradicated.\u2026 @URL 5 | 4,While they bring disease and ruin your walls and eat your foods without giving a shit about you. Fuck if I care. @URL 6 | 5,@user Stop all Travel until isolated! Stop illegals they bring damn disease 7 | 6,@user @user Lying Liberals. There is no line they won't cross to bring down the greatest President of ou\u2026 @URL 8 | 7,Lmfao. Feminism is really a disease now. Women always trying to bring men down in all aspects of life. 9 | 8,@user These feminazis are bring on a new Victorian era. Men became celibate to protect themselves from disease\u2026 @URL 10 | 9,@user @user Ching Chang Chong. Do you understand chinky boy? Does Communist government allow you to use\u2026 URL 11 | 10,RT @user: I HAVE NEVER EVER SEEN THIS MUCH \ud83d\udca9\ud83d\udca9\ud83d\udca9 PACKED AROUND A SINGLE #leftist #Terrorist #activist #liar -\n #ChristineBlaseyFord\nGU\u2026 12 | 11,RT @user : A feminist will spend hours telling you \u201cmen are trash\u201d\n\nThen start complaining about how they should be equal to men 13 | 12,@user The feminist movement has been the biggest threat to women in my lifetime! In their effort to empow\u2026 @URL 14 | 13,@user You cheating on me your black boyfriend with a WHIGGER 15 | 14,\fuck your optics\ ok whigger @URL 16 | 15,lol at how \leading HBD bloggers\= a Bengali cuckold with a jarringly gay voice a dravidian male posing as a whit\u2026 @URL 17 | 16,@user I'm going with the whigger on this one Mike. 18 | 17,@user call me a retarded cat thing alien thing again and ill fuck your cat in front if your gold fish faggot 19 | 18,u r ugly and retarded u stupid piece of shit. flaming faggot. kill ur self @URL 20 | 19,This faggot next to me at the gym is a Sox fan and everyone looking at him like he\u2019s retarded 21 | 20,When people get offended by insults and jokes with words like retarded autistic faggot etc. @URL 22 | 21,pal we're not always gonna be no the same wavelength you twat 23 | 22,HMMM BOMB SCARES Stocks Falling Gas Prices Interest Rates Rising Migrants Marching Leftist Violence Made up @URL 24 | 23,What do you expect from expired country like Nigeria a shithole and fantastically corrupt one indeed. Biafra Natio\u2026 @URL 25 | 24,I want to punch his face so fucking much disrespectful ching chong 26 | 25,@user Not everybody wants to hear ching chang Chong sir. Also I bet you dont even know Japanese 27 | 26,Ching Chong Ring Rong? I don't speak the language 28 | 27,@user Ching Chong lookin ass 29 | 28,since im japanese heres some japanese terms ill teach u: ching chong ling long long d0ng 30 | 29,why do niggas act like they are retarded 31 | 30,@user SEE U IN HELL DOWNY SCREW YOUR STUPID ASS 32 | 31,Faggot fucking sellout no iq pebble brain autistic slut 33 | 32,@user Are you a chinaman or a wetback? Looks like a combo to me. 
34 | 33,RT @user _Laflare: VibeHi did all that MAGA tap dancing and they still kicked him out the White House like the nigger he is. 35 | 34,@user is a disgrace it wants to trash the US and make it another shithole country.... @URL 36 | 35,@user Poor Sweden. Turning into shithole country. But those eurofruits don\u2019t want to help themselves so w\u2026 @URL 37 | 36,@user Boutta hit the raghead tire shop again with Zane \ud83d\ude02\ud83d\ude02 38 | 37,@user No one gives a crap about this raghead!! 39 | 38,I\u2019ve never seen a story so overhyped as this Jamal Khashoggi bullshit. Americans don\u2019t care if some raghead reporte\u2026 @URL 40 | 39,@user I wanna kill em raghead style pour acid on them and shove razor wire up their ass that's what th\u2026 @URL 41 | 40,@user @user Never see me going to a #RagHead Country. 42 | 41,You're obsessed negro jew.\n\nI'm not bowing to your bullshit. ever. You deaf lesion brained con.\n\nI will keep tellin\u2026 @URL 43 | 42,"@user : Hodgetwins react @user #KanyeWest was called a \u201ctoken negro\u201d and a \u201cdumb negro\u201d.\nCNN is full blown RACIST. #TRUMP2020 #MA\u2026""" 44 | 43,Black own businesses is gone bc of illegal immigrants?\nThink abt it if undocumented ppl are illegal then legal @URL 45 | 44,If you are making jokes about the scenes in Leicester right now you're a cunt 46 | 45,@user You are a green tea drinking muzzie doing taqqiya by drinking Brahminical hindu coffee and non-brahminical hindu tea!!\ud83d\ude02\ud83d\ude02\ud83d\ude02 47 | 46,@user Green tea? You muzzie!! Stop your taqqiya... \ud83d\ude02\ud83d\ude02 48 | 47,@user @user His fake rap is nauseating. He's a Muzzie and never did squat for Black Americans. He's ou\u2026 @URL 49 | 48,Brother Nature ain't black no more huh? He just a spic to y'all now right? Ain't trying to hold the dominirican homie down at all 50 | 49,Teddy fat sorry ass wrong for how he doing Jadah all because he want that wetback Tati \ud83d\ude12\ud83e\udd2c his lame no pussy getting\u2026 @URL 51 | 50,@user @user @user Go back to Guatemala you stinking wetback 52 | 51,Communist wetback douchebag @user is such a drag on his team send him back to cuba! @user 53 | 52,We got shitlib Twitter calling us Nazis and we got wigger-nationalists and low IQ right wing spergs calling us jews\u2026 @URL 54 | 53,@user WTF!!!! STOP THIS INVASION. OPERATION WETBACK IS HOW IKE RAN OUT THE ILLEGALS WITH ONE IF HIS TOP\u2026 @URL 55 | 54,@user 41 from an idiot TORY soup taker in denial. go do research on the Irish regiments . 56 | 55,@user @user @user He's a Marxist big mouthed yobbo. As much as I hate @user I wouldn't\u2026 @URL 57 | 56,@user Strewth mate ... don't be a yobbo. Just speak the Pommy English .... 58 | 57,@user If it weren't for her close resemblance to trailer park trash she could be a model. 59 | 58,@user 538 They may dress better they may speak \u201cbetter\u201d but make no mistake they are trailer park trash 60 | 59,@user Trash comes in many forms apparently. Moons in the trailer park. 61 | 60,I can\u2019t surround myself with fake boojie bitches lol. Especially when you been pass around and acting stuck up. Shit bothers me. 62 | 61,Macron can't you see your own country's sinking into a shithole your trying to be Merkel no cunts interested in you\u2026 @URL 63 | 62,@user Can I dress up as you for Halloween (retard with anger issues) you ugly cunt 64 | 63,@user Yes. He needs to fuck off to a Muslim country and practice there. 
Total cunt tgat started all the @URL 65 | 64,This SOCIALIST COMMUNIST MUSLIM HAD 8 YEARS ...\nHE DAMN NEAR DESTROYED \nAMERICA !!! \nIF YOU DARE LISTEN TO HIS RANT\u2026 @URL 66 | 65,Only Leicester i like is the cheese fucking yobbo cunts 67 | 66,@user We don't need URDU in our nation and schools. Just shut up @URL 68 | 67,@user Pappu don't feel bad. By kicking your videshi family out of our Country we are ONLY FOLLOWING your BA\u2026 @URL 69 | 68,It certainly will starve or go back to your country where you belong. @URL 70 | 69,@user Stupid nut lier dumbass transgender asshole. Go back to your fucking country you came from\u2026 @URL 71 | 70,RT @user: CONNECTICUT SPENDS MORE THAN $1.1 billion a year on illegals!! Go back to your country get your paperwork done and we wi\u2026 72 | 71,@user Good. You can go back to hitting yourself for Hussein and leave civilized people alone. Your coun\u2026 @URL 73 | 72,also you\u2019re not from around here aren\u2019t you ? bitch better get your fuckin\u2019 visa or get ready to go back to either\u2026 @URL 74 | 73,@user Traumatized? What your people in your home country do to women is worse! This liar was Groped NOT R\u2026 @URL 75 | 74,@user They are not thankful that they can live in Europe or Uk why demonstration for what go back to your country 76 | 75,@user But you just said Europe is for Europeans Germany must not become an Arab country immgrants should g\u2026 @URL 77 | 76,@user 1982 Seriously go back to your country with that shit dead ass 78 | 77,There is no fcking way I'm paying you cnts a toll for that completely retarded Bell St exit with its completely @URL 79 | 78,@user Why dont you go back to Vietnam for they rest of your useless being. You never was a patriot for th\u2026 @URL 80 | 79,@user you Have proven to be one of the Communist Party just go back to your other\u2026 @URL 81 | 80,This woman is a danger to woman\u2019s rights. She does NOT share our values. She and her ppl WILL force you to accept Sharia law @URL 82 | 81,@user You married your bother go back to the hell you came from you dont deserve to be in our great country..I WILL CALL ICE ASAP 83 | 82,@user There the woman sits in clothes that are probably hot & are cumbersome. Go back t\u2026 @URL 84 | 83,@user You could go back to your native India and start there tell your native country to stop raping little girls @URL 85 | 84,@user GO back to your own country!!! Leave ours alone....fix your own country take your brother husband home with you 86 | 85,@user Racist...seriously go back to Canada. You try to show him with black people to insult him? You really are\u2026 @URL 87 | 86,Gentle reminder: The atrocities in other nation states are theirs to fix and you don't have to accept refugees into yours. 88 | 87,@user @user @user There is a difference between refugees and migrant workers. The latter return home after they\u2026 89 | 88,RT @user : We need to do everything possible to secure the border. And we can do that at the same time as we welcome and celebrate legal\u2026 90 | 89,Illegal immigrants are going to destroy America. Saddest thing really. 91 | 90,@user If the leftists in Europe are happy to recieve muslim illegal immigrants - they are very welcome\n\nTake them all and keep them\u2026 92 | 91,RT @user Poland actually lost the best of our blood. Let's be honest - Polish immigrants pay taxes do not m\u2026 93 | 92,RT @user : Legal immigration is by far the most damaging form of immigration. 
So many of our people get caught up with the thousa\u2026 94 | 93,@user @user Conjectural statement. No country in the world; incl US can afford it. 2. More importantly US cl\u2026 @URL 95 | 94,RT @user : Thread\ud83d\udc47 on the impact of housing refugees in Toronto. What a mess. @URL 96 | 95,RT @user : The next question is why the government has housed refugees in a $150-$200 a night hotel for months 97 | 96,I have taught BOTH my daughters and son that this type of feminist bullshit is what keeps us in turmoil rather than\u2026 @URL 98 | 97,Call them what they are ILLEGAL ALIENS not undocumented immigrants!! @URL 99 | 98,@user We've heard 11 million illegal aliens for decades. My guess it's over 30 million now es @URL 100 | 99,@user LIVE BY OUR LAWS OR GO BACK FROM WHENCE YOU COME BUILD HEALTHY COMMUNITIES IN YOUR HOMELAND\u2026 @URL 101 | pilot_french_tweets.csv0000664000176200017620000002465213533400713017030 0ustar nousidhoumnousidhoum0,RT @user #France = #Nantes = #Traitrise = des #Gauchistes Liberaux Mondialistes encouragent l'#Invasion des #Migrants Musulmans\u2026 102 | 1,RT @user #Aquarius Cette haine qui n'a plus de frein venant de gens qui n'ont pas de problemes ou si peu. \nOn a denonce le #terrorism\u2026 103 | 2,RT @user Dresde (Allemagne) : Proteges par la police les migrants clandestins musulmans menacent de viol la population @URL 104 | 3,RT @user Hier soir sur #BFMTV #Kouchner a reclame encore plus de migrants en France. Ce qu'il oublie de dire c'est que c'est au\u2026 105 | 4,RT @user A Montpellier une trentaine de migrants albanais s'attaquent a des lyceens et poignardent un pere de famille : STOP a la\u2026 106 | 5,RT @user Avec 1000 agressions et 700 cambriolages par jour une arrivee de migrants record une menace terroriste extremement forte et\u2026 107 | 6,RT @user Bienvenue a #Cergy ou des migrants africains completement nus agressent sauvagement les Francaises. @URL 108 | 7,mdr macron demolit la franceaccepte le terrorismeles migrantsNON se serra pas lui qui protegera la france et le\u2026 @URL 109 | 8,RT @user \Alors que les dirigeants europeens sont reunis a Salzbourg pour evoquer notamment les questions migratoires et la creati\u2026 110 | 9,RT @user Dans chaque departement la priorite du prefet c'est aujourd'hui l'accueil le logement les loisirs... des immigres e\u2026 111 | 10,RT @user Saint-Brieuc : Boris Le Lay condamne a un an de prison pour avoir denonce l`islamisation induite par l`implantation de migra\u2026 112 | 11,RT @user Kouchner a attaque les chretiens de Yougoslavie en faveur des musulmans il espere la meme chose pour la France \n\nMigrants : po\u2026 113 | 12,RT @user \Migrants : comment Nantes degaine le plan geant (Ouest-France)\ @URL 114 | 13,RT @user @user Hey Manu C`est quand que tu portes tes couilles et tu demissionnes de ton poste de depute fantome ! Et ap\u2026 115 | 14,donc si on suit son raisonnement la france on est cense ne pas aider les migrants puisqu'on doit s occuper de not\u2026 @URL 116 | 15,RT @user \u00ab On nous qu'il n'y a pas de logements pour les SDF mais il y en a pour les migrants qui arrivent ! \u00bb \n\nMicro coupe lorsqu\u2026 117 | 16,RT @user @user @user Vu les circonstances de ces derniers mois se sont les migrants qu'on peut taxer de Nazis\u2026 118 | 17,Malheureusement en #France un de perdu dix de retrouves ! #migrants #stopimmigration @URL 119 | 18,Et ca ce sont les entrees legales. Les migrants viennent en plus. 
@URL 120 | 19,RT @user La Bretagne qui etait le symbole de la bobossitude pro-migrant commencerait-elle a changer d`avis ? \n\n@URL 121 | 20,RT @user Paris. Un centre d`hebergement pour refugies ouvre dans le 16e arrondissement et co\u00fbtera 35 millions d'euros par an a\u2026 122 | 21,RT @user Euh ca pourrait servir a :\n- devenir la premiere puissance du monde.\n- proteger et developper une civilisation unique et gen\u2026 123 | 22,@user @user Comment ne pas dire ? Purain de saloperie de negro cafard de migrants et surtout commen\u2026 @URL 124 | 23,RT @user Savez vous que les migrants nous ramenent d' essencieres maladies eradiques comme la rougeole d'ou les 11 vaccins obliga\u2026 125 | 24,Dans sa tete elle va etre avocate des droits de la femme en realite elle va finir assistante sociale 126 | 25,@user Pourquoi la peur des migrants serait sans raison alors que le nombre de crimes commis par ceux-ci est en augmentation ? 127 | 26,RT @user - Mme Le Pen si je vous dis \plage\?\n- MIGRANTS !\n- ligne ?\n- FRONTI\u00c8RES !\n- semoule ?\n- ISLAMISATION !\n- ah ouais...quand me\u2026 128 | 27,RT @user De #Pompidou a nos jours la meme ideologie mortifere pour les peuples plus de migrants a la demande du grand patrona\u2026 129 | 28,RT @user @user @user @user trouve peut-etre la charia la polygamie l`excision la burqa et les abattages r\u2026 130 | 29,Je ne sais pas. Je suis peux etre une revoltee du dimanche je ne defend pas les droits de la femme a tord et a tr\u2026 @URL 131 | 30,@user @user @user Y'a rien de illogique dans ce que j'ai dit a pars pour une personne avec un qi negatif\u2026 @URL 132 | 31,@user @user Oui enfin l'impérialisme Chinois ça se résume à: Cool l'Afrique à merde les mongols nous déf… @URL 133 | 32,RT @user @user Le cauchemar... Ca me fa 134 | 33,it penser à ces #migrants qui ont jeté de le merde et de la chaux vive sur des policiers p… 135 | 34,@user Pour moi c'est plus simple encore mdr ta un voile de merde tes pas française mais écrit français une… @URL 136 | 35,RT @user Il fait pas attardé lui en SSG 137 | 36,un vrai attardé mdr dans sa tête quand on dit que sahra maghribiya il croit qu'on parle de tout le désert @URL 138 | 37,Verratti encore attardé mental hier. 139 | 38,Il a déjà un jaune et tire le maillot gratuitement c'est vraiment un mongolie… @URL 140 | 39,@user @user @user Je suis d'accord avec lui ispice de bobo-gaucho complotiste de l'extrême à… @URL 141 | 40,Après la droite « diesel et clope » théorisée par Griveaux la gauche « débile et sectaire » par @user @URL 142 | 41,Bon j’ai plus rien à dire sur le negro debile mais l’autre fils de pute qui nous sert de coach il se dit pas 1 minu… @URL 143 | 42,Les français se font cracher dessus par la presse hébraique holocauster via l'immigration de masse étrangère musul… @URL 144 | 43,Ptdr mais j’ai juré dans sa tete il se disait « oh non ce sale negre a marqué » @URL 145 | 44,@user @user Pq tjr les sagitzires sont mechants sale pute negre 146 | 45,@user C un negre qui veut Persée 147 | 46,@user Il y a quand même plus de passion lorsqu'il s'agit de faire le panégyrique de quelque negre beuglant la… @URL 148 | 47,ptdr lui ca s'voi a droite c'est le seule negro que tu vois au meeting du Front National @URL 149 | 48,RT @user Nekfeu (Sale babtou d’negro) @URL 150 | 49,@user @user @user @user le gauchiste est toujours un ado attardéfaut pas lui en vouloir… @URL 151 | 50,@user Encore un écolo gauchiste débile qui n'a rien compris. 
C'est la démographie délirante de l'humanité couplé… @URL 152 | 51,@user Si on ne veut pas une invasion de « réfugiés » gauchistes en Guyane il faut fermer la frontière 153 | 52,@user @user Les muzz sont à l'assaut de la planète qu'ils veulent conquérir et islamiser. Ils s… @URL 154 | 53,@user @user Les Chrétiens ne sont pas aimés en France ! La gauche est vendue s'est donnée même à l… @URL 155 | 54,@user Où sont les femmes ? Voici une troisième! Euh... Ah re merde ! Encore une malade ( #gauchiasse… @URL 156 | 55,@user c'est sûr qu'il y a plus à en vouloir à la droite traitresse qu'à la gauche ouvertement pro islam pro… @URL 157 | 56,@user La gauche est trop maquée avec l'Islam qui traite la femme comme une chose appartenant à l'homme !!!!! 158 | 57,Quand Halloween remplace la Toussaint pourquoi ne pas mettre Noël à la poubelle ? Hollywood et le politiquement co… @URL 159 | 58,Imaginez vous élevez une bizu pendant 18 ans et elle est féministe gay et islamo-gauchiste..... Je préférerais que mon fils soit un violeur. 160 | 59,@user @user @user j'y serai pour combattre l' islamisation et emmerder les islamo-gauchist… @URL 161 | 60,@user Et là tu rajoutes à la fin: mongol!!!! 162 | 61,Il ma répondu (deux heure après le vu mais nique sa mere) jtm 163 | 62,@user 164 | 63, oh putin espèce de negre @user @user @user 165 | 64,Me rappelle un certain nègre du psg.. @URL 166 | 65,Attention un suspect pour un flic ce n’est pas forcément un Arabe ou un jeune là par exemple c’était un nègre. 167 | 66,Chez nous ça roule des battes pussy négro Viseur holo' shoote tes pattes pussy négro Dans mon clan pas d'pussy né… @URL 168 | 67,@user QUE PENSE YASSINE CHABLI DE TOI ??? sale putain a négre ! marochienne de merde va ! 169 | 68,ma parole tu fais ça à mon fils sale pute de nègre....et t'as plus de tête. @URL 170 | 69,@user @user sale batard 171 | 70,@user @user @user sale fils de chien jvais tretrouver 172 | 71,Mais espece de sale pute va ton gosse a cause de TOI il a un tympan percé mais tu souris fièrement pcq cetait pas… @URL 173 | 72,Mdr sale chienne de féministe de merde @URL 174 | 73,ale pute de féministe de mes couilles sale race @URL 175 | 74,@user mais wAllah t'as grave raison mec je suis serieux on s'en balec de ta vie nique ta mere sale feministe de pute 176 | 75,@user si tu n'es pas capable de t'occuper de ton enfant n'en fais pas sale conne. Les gens comme toi sont le ca… @URL 177 | 76,@user @user @user @user Bon comme j’imagine que t’es une ptn de feministe lgbt pour changer… @URL 178 | 77,@user Tout ça pour dire que c’est la faute de l’homme ? Sale pute laisser mourir son fils pck on est féministe ? 179 | 78,y a des gens qui font des films de merde sur le handicap mental je suppose que ça te dérange aussi sale féministe… @URL 180 | 79,@user Je suis pas juif sale bougnoule 181 | 80,@user @user bah pq tu viens sous mon tweet alors sale juif 182 | 81,@user Arrêtez de tout minimiser comme cela sale homme cisgenre blanc phallocrate non juif et dominateur!!!… @URL 183 | 82,@user @user Tg fdp sale chien sale juif gaz t encetr 184 | 83,@user @user Nique ta mere toi sale juif de merde 185 | 84,@user wesh sale pute ntm fdp sucer d ebite jvais tenculer ta mere ntm fdp tu fais le loveur mais t qun l… @URL 186 | 85,@user VA NIQUER TA GRAND MERE LA GROSSE PUTE SALE HOMOSEXUEL JUIF PERSONNE T’AS JAMAIS AIMÉ ET J’AI BAISÉ TA FEM… @URL 187 | 86,@user Sale payday juif arabophile retourne en cuisine ! 188 | 87,Vous êtes une arabe une terroriste qui met des bombes partout ! 
189 | 88,@user Pourquoi ne pourrait ont pas se déguiser en terroriste arabe?J'emmerde les gauchistes des réseaux sociaux! 190 | 89,@user RYAN SALE ARABE PUTAIN DE TERRORISTE JE VAIS PORTER PLAINTE AAAAAARGH TERRORISTE WRYYYYYYYYYYYYYYY @URL 191 | 90,La propagande de la Dilcrah contre la violence anti-homo est bidon. La propagande antiraciste est bidon aussi. Elle… 192 | 91,#septahuit De +en+ de #migrants #Refugies qq soient pays... car de +en+ de violence cruauté non #laicite corruption… @URL 193 | 92,@user On ne dira jamais assez que l’islam est une secte dangereuse. La gauche athée qui a la haine et l… @URL 194 | 93,#Rennes Autorisation du #Burkini par @user à la piscine : recul incompréhensible du droit des femmes par d… @URL 195 | 94,@user sale mongol dans 10 ans ça sera le feu quand tu seras au RSA mdr 196 | 95,@user @user j'aime pas sa facon de exprimé en diré un mongol de marseille a oui c'est un mongol de marseille ^^ 197 | 96,@user On a les mêmes déconnectés à la maison. Ils ne veulent pas voir la réalité de l’islamisation la violence ca… @URL 198 | 97,Je ne suis pas #raciste mais #Patriote. Regardez c’est dans leur sang la rage l'#agression la propension à la… @URL 199 | 98,L'UE finance-t-elle avec l'argent des #contribuables la violence que l'invasion islamique apporte aux pays europ… @URL 200 | pilot_arabic_tweets.csv0000664000176200017620000004013613533400713016777 0ustar nousidhoumnousidhoum0,يظل التحرش في نظر الانثى تحرشاََ حتي يقوم به الشخص المناسب.  فيصبح غزل. 201 | 1,وفي خطواتها عربية هيفاء حذرو الشباب التحرش بها كأن الشباب صخرة صماء 202 | 2,@user @user احمدي ربك انك في حماية رجل لو انفردتي بنفسك طمع القوم فيك قال الله تعالي ( الرجال قوامون علي النس… @URL 203 | 3,RT @user @user إعلان غير موفق وغير مقبول من مايسترو للأسف يتم استغلال تمكين المرأة بطريقة لا تمس بثقافتنا وقيمنا بصله 204 | 4,RT @user الثقافة السائدة اخرجت جيل من النساء ينتقصون انفسهم دون دراية ، حتى أنهم قد يدافعوا عن قناعتهم بأنهم أقل شأناً من الرجال .… 205 | 5,أغلب الإعلانات التجاريه اللي شاهدناها تخص اليوم الوطني عباره عن تبرج سفور اختلاط اغاني قيادة المرأه… 206 | 6,RT @user #كذبتم_الشعب_ضد_قياده_المراه 207 | 7,الشيخ محمد السعيدي يؤكد أن رؤيه 2030 هي رؤيه تغريبيه أمريكيه لإفساد المجتمع السعودي بمسمى الإ… RT @user #ارتقوا 208 | 8,ملابس المرأة تحكي عن تربية أبيها ، وغيرة أخيها ، ورجولة زوجها ، وحرص ومتابعة والدتها وقبل هذا كله أتشعارها بمراقبه… 209 | 9,RT @user أختي المباركه : المبادرة للزواج اذا تقدم الخاطب الكفوء من صالحك حتى ولوكان معددا خاصة لمن تعدى عمرها الثلاثون لإن عمر الم… 210 | 10,RT @user @user جميل جدا (والحضرية الحجازيه ) من اي بلد تشردت إلينا ؟ هل من صنادق حضرموت ،؟ ام من أحراش افريقيا ام من مستنقعات اسيا… 211 | 11,@user اللاجئون ضرر!!!!!!!!!! 212 | 12,@user @user اللاجئون محتلين البلد 213 | 13,بالتزامن مع محاولات تحريض المواطن على وطنه بدء المشردون اللاجئون في كندا وبريطانيا تحريض السفهاء على الهجرة من أو… @URL 214 | 14,@user @user ماذا تقول انت؟ ثامازغا/شمال افريقيا المحتلة اغلبية امازيغ وامازيغ مستعربين اخوتنا واقلية افارقة… @URL 215 | 15,RT @user يا جماعة فهمانين الموضوع غلط مساوات المرأة بالرجل مش يعني تحطوا حلقة وجدولة وتلبسوا وتمشوا متل النسوان 216 | 16,RT @user @user أما في دماغ المرأة فالخطوط متداخلة وهذا يفسر حبها لمعرفة أدق الأمور، وتمتعها بذاكرة قوية والثراء اللغوي وقدرتها… 217 | 17,@user هذا السؤال لم لم تبلغ؟ هو سؤال مشروع .... لماذا يتستر بعض النساء خجلاً على المتحرشين جنسياً و فج… @URL 218 | 18,RT @user فــي لـعـبـة الـشـطرنـج يــوجــد مـلـک ولا تــوجــد مــلـكـة ..!!!!تـعـرفـون لـمـاذا ؟؟؟؟!! 
لان الــمــرأة لاتـصـلـح ان تـ… 219 | 19,RT @user @user @user @user الموضوع ما هو استنقاص من العمل بقدر ماهو استغراب !!! حيث لما كانت المرأة في بيتها معز… 220 | 20,طالبوا بتدويل الحج من مسؤول الي اعلام طالبوا بمحاكمة سمو ولي العهد وغيره وغيره ويخرج علينا أغبى اغبياء شرب حليب ال… 221 | 21,@user فعلا عاصمتنا جميلة جدا و لكن اغلبية سكانها من العرب و القليل من السكان هم أكراد حكومتنا الفاسقة بدأت… @URL 222 | 22,لو كنتُ مكانكِ ( لاسمح الله)لَتبرأت من الكورد وغيرت قوميتي هذاأشرف لكِ وأشجع خطوة تخُطينها ولوان كوردستان لم ولن ت… @URL 223 | 23,@user لما #قردرقان يتحاكم الأول على قتله أكراد #تركيا وإنجابه إبن من #زوجتك التي عندك 224 | 24,@user لست مطبلا لأي أحد! لكن (أغلبية) الأتراك الموجودين في حضن ألمانيا أتراك علوية وأتراك أكراد وكلاب العلماني…@URL 225 | 25,@user اعقب يادشتي ياذنب ايران وسخ ياحقير ياتافه اقسم بالله ليجي يوم وناطاك بالنعول وتحت الاقدام حنا اهل السنة… @URL 226 | 26,@user @user روح إيران يا وسخ 227 | 27,@user الذي وسخ شوارع النجف هم زوار ايران وينظفون وسخ مواطنيهم 228 | 28,@user @user @user كس امك انت. سلمان ومحمد ابنه باذن الله فوق كل وسخ وذنب يا كلاب ايران. كلن… @URL 229 | 29,@user @user وينك يافيصل اسرائيل عن دلوعة ايران رئيسك(بشار)الذي عاث فسادا وقتل ابناءوطنك واغتصب نساء سورياال… @URL 230 | 30,الأخونجية معروف عنهم الغدر والخيانة والكذب مالهم أمان حزب عفن حزب وسخ حزب نجس استغلوا الدين وحرّفوه على حسب أهوائهم… @URL 231 | 31,@user عربي وسخ وحاط علم ايران السيخي المجوسي علم وشعار المشركين الكفار اعداء الله ورسوله المملكة شامخة عزيزة… @URL 232 | 32,@user @user الفضيحه يا رافضي هي انت تعطي عقلك وعرضك وارضك ومالك لمعممين ايران وتقتنع ان فيه واحد عا… @URL 233 | 33,@user خلص نفسك يا كلب إيران ودلتك من نجاسة الفرس وبعدين تفلسف يا وسخ 234 | 34,@user ي حشدي ي شيعي صدام زاغبكم زغب ولا كان لكم صوت يوم كان موجود اما الأقنعة ف شي طبيعي ي حمار إيران ي… @URL 235 | 35,@user @user @user اتمنى من دول الخليج الاستغناء عن الفلسطينيين واحلال مكانهم رجال عجم لاكنهم اوفياء… @URL 236 | 36,@user @user روح الشهيد نظيفه بإذن الله ونتي الله اعلم بحالك ع اي دين وع ايش عايشه عايشه حياتك من وسخ لوسخ يالوسخه ياكلبة ايران 237 | 37,@user بابه اييه ياجاهله حتي العربي مش عارفين تكتبوه قظر اييه دي خلاص نهايتكم بعد شهرين يابهايم والامير بتا… @URL 238 | 38,@user أوروبا أخذ لكي تكون أوروبا ٣٠٠ سنه وانت و بن سلمان جايين في ٥ سنوات تقلبوها أوروبا ؟ وليش انت وبن زف… @URL 239 | 39,@user اغبياء كل مازاد الضغط تلاحم الشعب مع سمو الامير محمد بن سلمان اكثر واكثر ... 
@URL 240 | 40,@user @user هههههههه الكذب ركن من اركان دينكم المجوسي اللعين،،، بس لانك اغبياء وملاعين تسموه تقية..… @URL 241 | 41,@user لانكم اغبياء لازم تحتوي على الجنسين عشان محد يستشرف والكل يفلها 242 | 42,@user الضحايا اكيد كانو اكراد لان صلاح الدين هزم دولة الفاطمية لهذا تجدهم يحقدون علينا 243 | 43,@user تطورت اربيل ودهوك والسليمانية لانكم اكراد وتتبعون العرق الاري وله اقسم بالله لو سكان الاقليم الاصليين كان… @URL 244 | 44,@user @user جاسم علي واضح انك ايراني وسخ ونقد ايران اوجعك لكن ابشرك ان الدعس على اتباع الخميني قريب جدا 245 | 45,@user @user الله يفنيك و يفني اهلك يا وسخ ..اكيد انت ابن متعة ..اللهم عجل بدمار و فناء و موت إيران و من… 246 | 46,@user معك حليب حمير تشربه والا زق وكله آخر واحد يتكلم انت لانكن شربت حليب حمير حتى ارتويت يا معفن انت تعرف من سلم 247 | 47,@user وسخ وحقير اظهر على حقيقتك ياكلب ايران 248 | 48,@user @user محمد بن سلمان تاج راسك ي زبال المشكله انت بس كلام انت من كلاب ايران ي وسخ ي ن 249 | 49,@user مجوسي ههههههههههههههه فارسي هههههههههههههه=ايران وسخ ههههههه 250 | 50,@user انسان تافه وحقير فيه من النذاله مايمنعنا من الرد على وسخ مثله لولا وساخه الدنبوع لما اوصل الحوثي… 251 | 51,@user @user @user وعندك وحدة بايرة يأسة من الحياة ومن العرس 252 | 52,@user ده كلام زوله بايره 253 | 53,@user @user بصي لنفسك الاول يا بايره 254 | 54,انتى صدقتى بجد ده انا كنت بعملك قيمه يا بايره — بايره كمان هشششش 255 | 55,@user معانا الاخت عافية بايرة بتقرأ في الزعيم الأزهري 256 | 56,@user @user @user يا شارب بول البعير انت نكرة وبلادك العاهرة التي انجبت لقطاء مثلك يا عار الامه 257 | 57,@user @user يا احقر خنزير يا شارب بول البعير واكل روثه يا صاحب البارات و المراقص يا عميل الاسرائليين و الامريكيي 258 | 58,@user @user @user انت ايها النتن نتانة عرقك المقرف تتطاول على بلادي التي هي اكبر من بلدك مساحة… 259 | 59,@user انت مثل بول البعير 260 | 60,@user @user @user @user @user ههههههههههه عد انا دريت انا بول البعير له اثر اقوء من الخمر 261 | 61,@user @user ههههه يكفينا فخرا أننا صمدنا و نسعى إلى تحرير أرضنا يا شاربي بول البعير .. بعدين عم تحكي عن ايران… @URL 262 | 62,@user @user @user تجرأت وتدخلت في اليمن بلادك ههههههه وهي تعاني مع الحوثي الى يومنا هذا ههههه ف… 263 | 63,هذا قائدشرطةدبي قد اكثر من شرب بول البعير حتى صار حماراً فوتشوبياً 264 | 64,@user لاتكون تكثر من شرب بول البعير فقد اكثرت من الشرب حتى اصبحت حمارا 265 | 65,@user كس امك اختتتتفوووو عليك وعلى اشكالك يا انجس من بول البعير .. 266 | 66,@user @user ههههههههه لا يا ابله بول البعير اوصى به الرسول واما بول البنجالي خلك في في شرب بول حميركم… 267 | 67,@user @user انت مش عاجبك أن الأجانب رواتبهم أعلي منكم عارفة هما أعلي منكم ليه عشان انتو بهايم ومينفعش ال… 268 | 68,@user @user هههه البقرة امك يا شارب بول البعير ويللي ابوك ناكح الماعز والحمير انتم ما انتم سوى حيوانات ن… 269 | 69,يضحكوا على الهندوسي لأنه بيشرب بول البقر وهم بيشربوا بول البعير! 270 | 70,@user هذه التغريدة توثق مقولة قثم إنما جئت لأتمم مكارم الأخلاق فاستبدل بول البقر ببول البعير 271 | 71,@user بول البعير افضل من بول السياح هههههههههههههههههههههههههههههههههههههههههههههههههههههههههههه 272 | 72,عقبالك ياحبيب أمك فى جهاد النكاح.وجمال كان الدكر وإنت نتايته ..اللبؤة.. 
ها ها ها 273 | 73,@user وانت ابن وشو ياحمار انت والعطية خنازير انجاس ياطراطير الشاذ تميم اميرك شاذ ياحمار يابن الحمار 274 | 74,@user خنازير ابناء خنازير اصدقاء خنازير تنبح علي الام 275 | 75,@user عاد هذا الفالي سيد كذابين الدين الشيعي دين خنازير لعنة الله على من أسسه الخنزير اليهودي إبن سبأ و الخن… 276 | 76,@user يبدو أنك كلب وراضع من حماره و تربى مع خنازير ( ما تفهم أن الرياض عاصمة دولة تقود العالم الإسلامي) جهز جوازك… @URL 277 | 77,@user @user هاذا اخبث انسان خبث نجاسة الفلسطينين كلها اجتمعت فية اقطع منة حليب خنازير يقلب 278 | 78,@user @user معصب من شيعة النعال عليهم 279 | 79,لعنة الله وودي يلطمون بالنعال بعد غمسه بترب خربلاء ع رواسهم ل… @URL 280 | 80,@user خنازير مرتدين باعوا آخرتهم ودنياهم من أجل إرضاء أمريكا ودول الصليب عنهم أخزاهم الله!! والقادم أدهى وأمر بإذنه تعالى 281 | 81,RT @user يا خنزير هذا ملكٌ أبا عن جد، خلفه ملوك ملؤوا الأرض عزا وفخرا، لم يكونوا يوما تابعين للاستعمار ولا منقلبين على الأهل والأعمام 282 | 82,@user اكيد هذا خنزير حوثي لعنة الله على الرافضه عبيد المجوس ليته يرسلي خاص ابسأله عن الموت لأسرائيل لعنة الله على تقيتهم الذميمه 283 | 83,@user كيف تسمى هذا الكائن الديووث بالاستاذ فهذا خنزير من ارباب العلمانية والفجور 284 | 84,@user ماذا عن السعودي !!؟؟ هههههههه عربان اوباش حضيرة خنزير والله اطهر من طاهركم ! سفله 285 | 85,@user @user يا خنزير ما دخل اختك فينا .. اذا ما بدكم اقصاكم لطيزي بس لاتسوقو الشرف علينا وتقولو استقبلنا 286 | 86,@user بل نبع منها الشرك وعبادة القبر والضريح والثارات واللطم والتطبير والزحف للقبور وأكل الطين والتراب القذر ب… 287 | 87,@user صحيح الاخونجي جرو ثم كلب ثم خنزي 288 | 88,@user @user @user امشي بعيد يا خنزير يازبال الارض 289 | 89,أسري أسري ياوااااااد لا أسريلك قال صقر قال والله ما أنت إلا خنزير مؤدلج 290 | 90,ديوث امه يرسل لي صورتها اححح #رتويت وتفاعل يلا 291 | 91,@user #علاء_الاسواني اذا ماكنش ديوث زيك يقول كدة امال مين اللي هايقول وانت كل كتاباتك قصص جنسيه ..محدش قلك ان دا حريه شخصيه زي… 292 | 92,@user @user اذا كان الروافض الفرس المجوس الصفويون الهندوس يؤمنون بهذا الكلام فاليشهد التاريخ انني ا 293 | 93,@user ههههههه منطق غبي وأسلوب فكر رافضي جبان 294 | 94,RT @user احمد الله جلا وعلا على سلامة المنهج والمعتقدت حزبي ولا بدعي ولا خرافي ولا قبوري ولا صوفي ولا رافضي ولا تكفيري...#سلفي_أ… 295 | 95,@user حين يكتب رافضي يشتم الأمة ودينها وقادتها أودرزي يؤز الفتن بين أبنائها أومهرطق يشكك بهويتها فإن الفضائيات تصفق طربا لذلك .… 296 | 96,@user خبيث مرتزق رافضي حاقد من بقايا الدولة العبيدية 297 | 97,@user @user @user اسمها اسراء العمر فلسطيزيه عايشه ومستمتعه بحقوق عمل مواطنات سعوديات عاطلات وهذا وجهها الق 298 | 98,@user هذه كائنات فضائية، حدكم يا حريم ترى مافي أحسن من سواقة رجالنا هههههه عنجد شر البلية ما يضحك 299 | 99,#ياسر_الفيصل انسان مستشرف لو فيه خير ومعه الحق كان منزل صورته وجنسيته مو يبابي ورا اسم مستعار ( سوري وسخ ) بنات الس… @URL 300 | -------------------------------------------------------------------------------- /sluice_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | """ 4 | Sluice Network model. 
5 | """ 6 | 7 | 8 | import random 9 | import os 10 | import numpy as np 11 | import pickle 12 | import dynet 13 | from progress.bar import Bar 14 | 15 | from predictors import SequencePredictor, Layer, RNNSequencePredictor, \ 16 | BiRNNSequencePredictor, CrossStitchLayer, LayerStitchLayer 17 | from utils import load_embeddings_file, get_data, log_fit, average_by_task, get_label, average_by_lang 18 | from constants import IMBALANCED, BALANCED, SGD, ADAM, LABELS, MODIFIED_LABELS, TASK_NAMES 19 | from sklearn.metrics import classification_report, f1_score 20 | 21 | 22 | 23 | def load(params_file, model_file, args): 24 | """ 25 | Loads a model by first initializing a model with the hyperparameters 26 | and then loading the weights of the saved model. 27 | :param params_file: the file containing the hyperparameters 28 | :param model_file: the file containing the weights of the saved model 29 | :return the loaded AdaptNN model 30 | """ 31 | params = pickle.load(open(params_file, 'rb')) 32 | model = SluiceNetwork(params['h_dim'], 33 | params['h_layers'], 34 | params['model_dir'], 35 | params['log_dir'], 36 | languages = params['languages'], 37 | word2id=params['w2i'], 38 | embeds = params['embeds'], 39 | activation=params['activation'], 40 | task_names=params['task_names'], 41 | cross_stitch=params['cross_stitch'], 42 | num_subspaces=params['num_subspaces'], 43 | constraint_weight=params['constraint_weight'], 44 | noise_sigma = params['noise_sigma'], 45 | constrain_matrices = params['constrain_matrices'], 46 | cross_stitch_init_scheme=params['cross_stitch_init_scheme'], 47 | layer_stitch_init_scheme=params['layer_stitch_init_scheme'], 48 | best_train_dict = params['best_train_dict'], 49 | best_dev_dict = params['best_dev_dict'], 50 | avg_train_score = params['avg_train_score'], 51 | avg_dev_score = params['avg_dev_score'], 52 | best_epoch = params['best_epoch'], 53 | oov_id = params['oov_id']) 54 | 55 | model.predictors = model.build_computation_graph() 56 | 57 | print('Model loaded from %s...' % model_file, flush=True) 58 | model.model.populate(model_file) 59 | 60 | return model, params['best_train_dict'], params['best_dev_dict'], params['avg_train_score'], params['avg_dev_score'] 61 | ''' 62 | def load_no_args(params_file, model_file): 63 | """ 64 | Loads a model by first initializing a model with the hyperparameters 65 | and then loading the weights of the saved model. 
66 | :param params_file: the file containing the hyperparameters 67 | :param model_file: the file containing the weights of the saved model 68 | :return the loaded AdaptNN model 69 | """ 70 | params = pickle.load(open(params_file, 'rb')) 71 | model = SluiceNetwork(params['h_dim'], 72 | params['h_layers'], 73 | params['model_dir'], 74 | params['log_dir'], 75 | languages = params['languages'], 76 | word2id=params['w2i'], 77 | embeds = params['embeds'], 78 | activation=params['activation'], 79 | task_names=params['task_names'], 80 | cross_stitch=params['cross_stitch'], 81 | num_subspaces=params['num_subspaces'], 82 | constraint_weight=params['constraint_weight'], 83 | noise_sigma = params['noise_sigma'], 84 | constrain_matrices = params['constrain_matrices'], 85 | cross_stitch_init_scheme=params['cross_stitch_init_scheme'], 86 | layer_stitch_init_scheme=params['layer_stitch_init_scheme'], 87 | best_train_dict = params['best_train_dict'], 88 | best_dev_dict = params['best_dev_dict'], 89 | avg_train_score = params['avg_train_score'], 90 | avg_dev_score = params['avg_dev_score'], 91 | best_epoch = params['best_epoch'], 92 | oov_id = params['oov_id']) 93 | 94 | model.predictors = model.build_computation_graph() 95 | 96 | print('Model loaded from %s...' % model_file, flush=True) 97 | model.model.populate(model_file) 98 | return model, params['best_train_dict'], params['best_dev_dict'], params['avg_train_score'], params['avg_dev_score'] 99 | ''' 100 | 101 | class SluiceNetwork(object): 102 | def __init__(self, h_dim, h_layers, model_dir, log_dir, task_names, languages, 103 | embeds=None, activation=dynet.tanh, lower=False, 104 | noise_sigma=0.1, cross_stitch=False, num_subspaces=1, 105 | constraint_weight=0, constrain_matrices=[1, 2], cross_stitch_init_scheme=IMBALANCED, 106 | layer_stitch_init_scheme=BALANCED, best_train_dict = {}, best_dev_dict = {}, 107 | avg_train_score=0, avg_dev_score =0, best_epoch=-1, word2id={}, oov_id = None): 108 | """ 109 | :param h_dim: The hidden dimension of the model. 110 | :param h_layers: The number of hidden layers. 
111 | :param model_dir: The directory where the model should be saved 112 | :param log_dir: The directory where the log should be saved 113 | :param task_names: the names of the tasks 114 | :param langauges: the training languages of the model 115 | :param embeds: the pre-trained embedding used by the model 116 | :param activation: the DyNet activation function that should be used 117 | :param lower: whether the words should be lower-cased 118 | :param noise_sigma: the stddev of the Gaussian noise that should be used 119 | during training if > 0.0 120 | :param cross_stitch: whether to use cross-stitch units 121 | 122 | :param num_subspaces: the number of subspaces to use (1 or 2) 123 | :param constraint_weight: weight of subspace orthogonality constraint 124 | (default: 0 = no constraint) 125 | :param constrain_matrices: indices of LSTM weight matrices that should 126 | be constrained (default: [1, 2]) 127 | :param cross_stitch_init_scheme: initialisation scheme for cross-stitch 128 | :param layer_stitch_init_scheme: initialisation scheme for layer-stitch 129 | 130 | :param best_train_dict: dictionary storing the best scores on training set 131 | :param best_dev_dict: dictionary storing the best scores on development set 132 | :param avg_train_score: best unweighted average training score over all tasks and all metrics 133 | :param avg_dev_score: best unweighted average development score over all tasks and all metrics 134 | :param best_epoch: the epoch of the best performance 135 | :param word2id: dictionary storing the words to the idx of the word embedding 136 | :param oov_id: the idx of the word which do not appear in the pre-trained word embedding 137 | 138 | 139 | """ 140 | self.word2id = word2id 141 | 142 | self.task_names = task_names 143 | self.model_dir = model_dir 144 | self.log_dir = log_dir 145 | self.w_in_dim = 0 146 | 147 | 148 | if(len(task_names) ==1): 149 | 150 | if(len(languages) == 1): 151 | task_directory = os.path.join(model_dir,'STSL/') 152 | if not os.path.exists(task_directory): 153 | os.mkdir(task_directory) 154 | self.model_file = os.path.join(model_dir, 'STSL/{}_{}.model'.format(languages[0],task_names[0])) 155 | self.params_file = os.path.join(model_dir, 'STSL/{}_{}.pkl'.format(languages[0],task_names[0])) 156 | else: 157 | task_directory = os.path.join(model_dir,'STML/') 158 | if not os.path.exists(task_directory): 159 | os.mkdir(task_directory) 160 | self.model_file = os.path.join(model_dir, 'STML/{}.model'.format(task_names[0])) 161 | 162 | self.params_file = os.path.join(model_dir, 'STML/{}.pkl'.format(task_names[0])) 163 | 164 | 165 | else: 166 | 167 | if(len(languages) ==1): 168 | task_directory = os.path.join(model_dir,'MTSL/') 169 | if not os.path.exists(task_directory): 170 | os.mkdir(task_directory) 171 | self.model_file = os.path.join(model_dir, 'MTSL/{}.model'.format(languages[0])) 172 | self.params_file = os.path.join(model_dir, 'MTSL/{}.pkl'.format(languages[0])) 173 | 174 | else: 175 | task_directory = os.path.join(model_dir,'MTML/') 176 | if not os.path.exists(task_directory): 177 | os.mkdir(task_directory) 178 | self.model_file = os.path.join(model_dir, 'MTML/MTML.model') 179 | self.params_file = os.path.join(model_dir, 'MTML/MTML.pkl') 180 | 181 | 182 | self.cross_stitch = cross_stitch 183 | self.num_subspaces = num_subspaces 184 | self.constraint_weight = constraint_weight 185 | self.constrain_matrices = constrain_matrices 186 | self.cross_stitch_init_scheme = cross_stitch_init_scheme 187 | self.layer_stitch_init_scheme = 
layer_stitch_init_scheme 188 | self.model = dynet.Model() # init model 189 | # term to capture sum of constraints over all subspaces 190 | self.subspace_penalty = self.model.add_parameters( 191 | 1, init=dynet.NumpyInitializer(np.zeros(1))) 192 | # weight of subspace constraint 193 | self.constraint_weight_param = self.model.add_parameters( 194 | 1, init=dynet.NumpyInitializer(np.array(self.constraint_weight))) 195 | 196 | 197 | task2label2id = {} 198 | 199 | for task in task_names: 200 | labels = LABELS[task]#TO BE CHANGED AGAIN to MODIFIED_LABELS[task] 201 | task2label2id[task] = {} 202 | count = 0 203 | 204 | for label in LABELS[task]: 205 | task2label2id[task][label] = count 206 | count+=1 207 | 208 | 209 | 210 | self.task2label2id = task2label2id # need one dictionary per task 211 | 212 | self.languages = languages 213 | self.h_dim = h_dim 214 | self.activation = activation 215 | self.lower = lower 216 | self.noise_sigma = noise_sigma 217 | self.h_layers = h_layers 218 | self.predictors = {} 219 | self.wembeds = None # lookup: embeddings for words 220 | self.embeds = embeds 221 | 222 | self.best_train_dict = best_train_dict 223 | self.best_dev_dict = best_dev_dict 224 | 225 | self.best_epoch = best_epoch 226 | 227 | self.avg_train_score = avg_train_score 228 | self.avg_dev_score = avg_dev_score 229 | self.oov_id = oov_id 230 | 231 | def save(self): 232 | """Save model. DyNet only saves parameters. Save rest separately.""" 233 | self.model.save(self.model_file) 234 | myparams = {"task_names": self.task_names, 235 | "languages": self.languages, 236 | "w2i": self.word2id, 237 | "task2tag2idx": self.task2label2id, 238 | "activation": self.activation, 239 | "h_dim": self.h_dim, 240 | "h_layers": self.h_layers, 241 | "embeds": self.embeds, 242 | 'model_dir': self.model_dir, 243 | 'cross_stitch': self.cross_stitch, 244 | 'num_subspaces': self.num_subspaces, 245 | 'constraint_weight': self.constraint_weight, 246 | 'cross_stitch_init_scheme': self.cross_stitch_init_scheme, 247 | 'layer_stitch_init_scheme': self.layer_stitch_init_scheme, 248 | 'constrain_matrices': self.constrain_matrices, 249 | 'noise_sigma': self.noise_sigma, 250 | 'best_train_dict': self.best_train_dict, 251 | 'best_dev_dict': self.best_dev_dict, 252 | 'best_epoch': self.best_epoch, 253 | 'oov_id': self.oov_id, 254 | 'log_dir': self.log_dir, 255 | 'avg_train_score': self.avg_train_score, 256 | 'avg_dev_score':self.avg_dev_score } 257 | pickle.dump(myparams, open(self.params_file, "wb")) 258 | 259 | 260 | 261 | 262 | def build_computation_graph(self): 263 | """Builds the computation graph.""" 264 | # initialize the word embeddings using the pre-trained embedding file 265 | 266 | embeddings, emb_dim = load_embeddings_file(self.embeds, self.languages, 267 | lower=self.lower) 268 | self.w_in_dim = emb_dim 269 | 270 | num_words = len(set(embeddings.keys()).union(set(self.word2id.keys()))) 271 | self.wembeds = self.model.add_lookup_parameters((num_words, emb_dim)) 272 | self.oov_id = set(range(num_words)) 273 | 274 | #Find words which do not appear in the pre-trained embeddings 275 | #by removing words which have appeared 276 | for i, word in enumerate(embeddings.keys()): 277 | if word not in self.word2id: 278 | self.word2id[word] = len(self.word2id.keys()) 279 | self.wembeds.init_row(self.word2id[word], embeddings[word]) 280 | 281 | self.oov_id.remove(self.word2id[word]) 282 | 283 | 284 | layers = [] # inner layers 285 | 286 | 287 | output_layers_dict = {} # from task_name to actual predictor 288 | 289 | 290 | # we have a 
separate layer for each task for cross-stitching; 291 | # otherwise just 1 layer for all tasks with hard parameter sharing 292 | num_task_layers = len(self.task_names) if self.cross_stitch else 1 293 | #print("task names") 294 | #print(self.task_names) 295 | #print("num_task_layers:") 296 | #print(len(self.task_names)) 297 | cross_stitch_layers = [] 298 | 299 | 300 | for layer_num in range(self.h_layers): 301 | print(">>> %d layer_num" % layer_num, flush=True) 302 | input_dim = self.w_in_dim if layer_num == 0 \ 303 | else self.h_dim 304 | 305 | task_layers = [] 306 | # get one layer per task for cross-stitching or just one layer 307 | for task_id in range(num_task_layers): 308 | builder = dynet.LSTMBuilder(1, input_dim, self.h_dim, self.model) 309 | task_layers.append(BiRNNSequencePredictor(builder)) 310 | layers.append(task_layers) 311 | if self.cross_stitch: 312 | print('Using cross-stitch units after layer %d...' % layer_num, 313 | flush=True) 314 | cross_stitch_layers.append( 315 | CrossStitchLayer(self.model, len(self.task_names), 316 | self.h_dim, self.num_subspaces, 317 | self.cross_stitch_init_scheme)) 318 | 319 | layer_stitch_layers = [] 320 | 321 | 322 | # store at which layer to predict task 323 | for task_name in self.task_names: 324 | task_num_labels = len(self.task2label2id[task_name]) 325 | 326 | print('Using an MLP for task losses.', flush=True) 327 | 328 | input_dim = self.h_dim * 2 329 | activation = dynet.softmax 330 | 331 | layer_output = None 332 | if('sentiment' in task_name):#Multi-label classification 333 | #use one binary classification layer for each label 334 | layer_output =[] 335 | for _ in range(task_num_labels): 336 | layer_output.append(Layer(self.model, input_dim, 2, 337 | activation, mlp=True)) 338 | 339 | else: 340 | layer_output = Layer(self.model, input_dim, task_num_labels, 341 | activation, mlp=True) 342 | 343 | output_layers_dict[task_name] = layer_output#sequence_predictor 344 | 345 | if(self.h_layers > 1): 346 | # w/o cross-stitching, we only use one LayerStitchLayer 347 | layer_stitch_layers.append( 348 | LayerStitchLayer(self.model, self.h_layers, self.h_dim, 349 | self.layer_stitch_init_scheme)) 350 | 351 | print('#\nOutput layers: %d\n' % len(output_layers_dict), flush=True) 352 | 353 | 354 | predictors = dict() 355 | predictors["inner"] = layers 356 | predictors['cross_stitch'] = cross_stitch_layers 357 | predictors['layer_stitch'] = layer_stitch_layers 358 | predictors["output_layers_dict"] = output_layers_dict 359 | return predictors 360 | 361 | 362 | 363 | 364 | 365 | def fit(self, train_languages, test_lang, num_epochs, patience, optimizer, threshold, train_dir, 366 | dev_dir): 367 | """ 368 | Train the model, return the train and dev score 369 | :param train_language: the language used for training 370 | :param num_epochs: the max number of epochs the model should be trained 371 | :param patience: the patience to use for early stopping 372 | :param optimizer: the optimizer that should be used 373 | :param train_dir: the directory containing the training files 374 | :param dev_dir: the directory containing the development files 375 | :param threshold added 376 | 377 | """ 378 | 379 | first_train = True if self.best_epoch==(-1) else False#Check whether this is a loaded model 380 | 381 | print("Reading training data from %s..." 
% train_dir, flush=True) 382 | train_X, train_Y, word2id = get_data(train_languages, self.task_names, word2id=self.word2id, task2label2id=self.task2label2id, 383 | data_dir=train_dir, train=first_train) 384 | print("Finished reading training data") 385 | 386 | 387 | print("Reading development data from %s..." % train_dir, flush=True) 388 | dev_X, dev_Y, _ = get_data(train_languages, self.task_names, word2id, self.task2label2id, 389 | data_dir=dev_dir, train=False) 390 | print("Finished reading development data") 391 | 392 | print('Length of training data:', len(train_X), flush=True) 393 | print('Length of development data:', len(dev_X), flush=True) 394 | 395 | 396 | if(first_train): 397 | self.word2id = word2id 398 | 399 | 400 | print('Building the computation graph...', flush=True) 401 | self.predictors= \ 402 | self.build_computation_graph() 403 | 404 | if optimizer == SGD: 405 | trainer = dynet.SimpleSGDTrainer(self.model) 406 | elif optimizer == ADAM: 407 | trainer = dynet.AdamTrainer(self.model) 408 | else: 409 | raise ValueError('%s is not a valid optimizer.' % optimizer) 410 | 411 | train_data = list(zip(train_X, train_Y)) 412 | 413 | num_iterations = 0 414 | num_epochs_no_improvement = 0 415 | 416 | 417 | train_score = {} 418 | dev_score = {} 419 | 420 | print('Training model with %s for %d epochs and patience of %d.' 421 | % (optimizer, num_epochs, patience)) 422 | 423 | for epoch in range(self.best_epoch+1, num_epochs): 424 | 425 | print('', flush=True) 426 | 427 | bar = Bar('Training epoch %d/%d...' % (epoch+1, num_epochs), 428 | max=len(train_data), flush=True) 429 | 430 | # keep track of the # of updates, total loss, and total # of 431 | # predicted instances per task 432 | task2num_updates = {task: 0 for task in self.task_names} 433 | task2total_loss = {task: 0.0 for task in self.task_names} 434 | task2total_predicted = {task: 0.0 for task in self.task_names} 435 | total_loss = 0.0 436 | total_penalty = 0.0 437 | total_predicted = 0.0 438 | 439 | random.shuffle(train_data) 440 | 441 | # for every instance, we optimize the loss of the corresponding task 442 | for word_indices, task2label_id_seq in train_data: 443 | # get the concatenated word and char-based features for every 444 | # word in the sequence 445 | features = self.get_word_features(word_indices) 446 | 447 | for task, y in task2label_id_seq.items(): 448 | 449 | output, penalty = self.predict(features, task, train=True) 450 | 451 | 452 | if task not in TASK_NAMES: 453 | raise NotImplementedError('Task %s has not been ' 454 | 'implemented yet.' % task) 455 | 456 | 457 | loss = dynet.esum([pick_neg_log(o, gold) for \ 458 | o, gold in zip(output, y)]) 459 | 460 | lv = loss.value() 461 | 462 | 463 | # sum the loss and the subspace constraint penalty 464 | 465 | combined_loss = loss + dynet.const_parameter(self.constraint_weight_param) * penalty 466 | 467 | total_loss += lv 468 | total_penalty += penalty.value() 469 | total_predicted += 1 470 | task2total_loss[task] += lv 471 | task2total_predicted[task] += 1 472 | task2num_updates[task] += 1 473 | 474 | # back-propagate through the combined loss 475 | combined_loss.backward() 476 | trainer.update() 477 | bar.next() 478 | num_iterations += 1 479 | 480 | print("\nEpoch %d. Loss per instance: %.3f. Penalty per instance: %.3f. " 481 | % (epoch+1, total_loss / total_predicted, 482 | total_penalty / total_predicted), end='', flush=True) 483 | 484 | print('Loss per instance by task: ') 485 | 486 | for task in task2total_loss.keys(): 487 | print('%s: %.3f. 
' % (task, task2total_loss[task] / 488 | task2total_predicted[task]), 489 | end='', flush=True) 490 | print('', flush=True) 491 | 492 | 493 | 494 | # evaluate after every epoch 495 | 496 | avg_train_score_by_task_list = []#Each item stores the avg train score (by task) for a particular language 497 | avg_dev_score_by_task_list = []#Each item stores the avg dev score (by task) for a particular language 498 | train_data_size_list = []#Each item stores the size for a particular language train set 499 | dev_data_size_list = []#Each item stores the size for a particular language dev set 500 | 501 | for lang in train_languages: 502 | #changed utils.get_data( 503 | #changed model to self everywhere,checkif it has to be replaced by self.model 504 | #changed args.train_dir to train_dir 505 | train_eval_X, train_eval_Y, _ = get_data( 506 | [lang], self.task_names, self.word2id, 507 | self.task2label2id, data_dir=train_dir, train=False) 508 | 509 | 510 | train_data_size_list+=[len(train_eval_Y)] 511 | 512 | #changed args.dev_dir to dev_dir 513 | dev_eval_X, dev_eval_Y, _ = get_data( 514 | [lang], self.task_names, self.word2id, 515 | self.task2label2id, data_dir= dev_dir, train=False) 516 | 517 | 518 | dev_data_size_list+=[len(dev_eval_Y)] 519 | 520 | 521 | #changed args.threshold to threshold 522 | train_score = self.evaluate(train_eval_X, train_eval_Y, lang, threshold) 523 | 524 | #changed args.threshold to threshold 525 | dev_score = self.evaluate(dev_eval_X, dev_eval_Y, lang, threshold) 526 | #changed utils.average_by_task 527 | avg_train_score_by_task_list.append(average_by_task(train_score)) 528 | avg_dev_score_by_task_list.append(average_by_task(dev_score)) 529 | 530 | 531 | 532 | print('='*50) 533 | print('\tStart logging for {} in epoch {}'.format(test_lang, epoch+1)) 534 | 535 | #changed utils.log_fit 536 | log_fit(self.log_dir, epoch+1, train_languages, test_lang, self.task_names, train_score, dev_score) 537 | 538 | 539 | print('\tFinish logging for {} in epoch {}'.format(test_lang, epoch+1)) 540 | 541 | 542 | 543 | #Compute the weighted average over all languages and use it to determine the overall performance of training 544 | total_train_size = len(train_Y) 545 | total_dev_size = len(dev_Y) 546 | 547 | #changed util.average_by_lang 548 | avg_train_score = average_by_lang(avg_train_score_by_task_list, train_data_size_list, 549 | total_train_size) 550 | 551 | #changed util.average_by_lang 552 | avg_dev_score = average_by_lang(avg_dev_score_by_task_list, dev_data_size_list, 553 | total_dev_size) 554 | 555 | if avg_dev_score > self.avg_dev_score: 556 | 557 | self.avg_dev_score = avg_dev_score 558 | self.avg_train_score = avg_train_score 559 | 560 | self.best_train_dict = train_score 561 | self.best_dev_dict = dev_score 562 | 563 | 564 | self.best_epoch = epoch 565 | num_epochs_no_improvement = 0 566 | print('Saving model to directory %s...' 
% self.model_dir, 567 | flush=True) 568 | self.save() 569 | else: 570 | 571 | num_epochs_no_improvement += 1 572 | 573 | 574 | 575 | if num_epochs_no_improvement == patience: 576 | #dynet.load(self.model_file, self.model) 577 | break 578 | 579 | 580 | print('Finished training', flush=True) 581 | print('Loading the best performing model from %s...'\ 582 | % self.model_dir, flush=True) 583 | 584 | self.model.populate(self.model_file) 585 | 586 | 587 | 588 | return self.best_train_dict, self.best_dev_dict, self.avg_train_score, self.avg_dev_score 589 | 590 | 591 | 592 | def predict(self, features, task_name, train=False): 593 | """ 594 | Steps through the computation graph and obtains predictions for the 595 | provided input features. 596 | :param features: a list of word embeddings for every word in the sequence 597 | :param task_name: the name of the task that should be predicted 598 | :param train: if the model is training; apply noise in this case 599 | :return output: the output predictions 600 | penalty: the summed subspace penalty (0 if no constraint) 601 | """ 602 | if train: # noise is added only at training time 603 | 604 | features = [dynet.noise(fe, self.noise_sigma) for fe in 605 | features] 606 | 607 | 608 | 609 | # only if we use cross-stitch we have a layer for each task; 610 | # otherwise we just have one layer for all tasks 611 | num_layers = self.h_layers 612 | inputs = [features] * len(self.task_names) 613 | inputs_rev = [features] * len(self.task_names) 614 | 615 | target_task_id = self.task_names.index( 616 | task_name) if self.cross_stitch else 0 617 | 618 | #added 619 | num_task_layers = len(self.task_names) if self.cross_stitch else 1 620 | 621 | 622 | # collect the forward and backward sequences for each task at every 623 | # layer for the layer connection units 624 | layer_forward_sequences = [] 625 | layer_backward_sequences = [] 626 | 627 | penalty = dynet.const_parameter(self.subspace_penalty) 628 | 629 | for i in range(0, num_layers): 630 | forward_sequences = [] 631 | backward_sequences = [] 632 | for j in range(num_task_layers): 633 | predictor = self.predictors['inner'][i][j] 634 | forward_sequence, backward_sequence = predictor.predict_sequence( 635 | inputs[j], inputs_rev[j]) 636 | if i > 0 and self.activation: 637 | # activation between LSTM layers 638 | forward_sequence = [self.activation(s) for s in 639 | forward_sequence] 640 | backward_sequence = [self.activation(s) for s in 641 | backward_sequence] 642 | forward_sequences.append(forward_sequence) 643 | backward_sequences.append(backward_sequence) 644 | 645 | if self.num_subspaces == 2 and self.constraint_weight != 0: 646 | # returns a list per layer, i.e. 
here a list with one item 647 | lstm_parameters = \ 648 | predictor.builder.get_parameter_expressions()[0] 649 | 650 | 651 | 652 | # lstm parameters consists of these weights: 653 | # Wix,Wih,Wic,bi,Wox,Woh,Woc,bo,Wcx,Wch,bc 654 | for param_idx in range(len(lstm_parameters)): 655 | if param_idx in self.constrain_matrices: 656 | W = lstm_parameters[param_idx] 657 | W_shape = np.array(W.value()).shape 658 | 659 | if(len(W_shape) <2): 660 | W_shape = [W_shape[0], 1] 661 | 662 | # split matrix into its two subspaces 663 | W_subspaces = dynet.reshape(W, ( 664 | self.num_subspaces, W_shape[0] / float( 665 | self.num_subspaces), W_shape[1])) 666 | subspace_1, subspace_2 = W_subspaces[0], W_subspaces[1] 667 | 668 | # calculate the matrix product of the two matrices 669 | matrix_product = dynet.transpose( 670 | subspace_1) * subspace_2 671 | 672 | # take the squared Frobenius norm by squaring 673 | # every element and then summing them 674 | squared_frobenius_norm = dynet.sum_elems( 675 | dynet.square(matrix_product)) 676 | penalty += squared_frobenius_norm 677 | 678 | if self.cross_stitch: 679 | # takes as input a list of input lists and produces a list of 680 | # outputs where the index indicates the task 681 | forward_sequences = self.predictors['cross_stitch'][ 682 | i].stitch(forward_sequences) 683 | backward_sequences = self.predictors['cross_stitch'][ 684 | i].stitch(backward_sequences) 685 | 686 | inputs = forward_sequences 687 | inputs_rev = backward_sequences 688 | layer_forward_sequences.append(forward_sequences) 689 | layer_backward_sequences.append(backward_sequences) 690 | 691 | if i == num_layers-1: 692 | output_predictor = \ 693 | self.predictors['output_layers_dict'][task_name] 694 | 695 | # get the forward/backward states of all task layers 696 | task_forward_sequences = [ 697 | layer_seq_list[target_task_id][-1] for 698 | layer_seq_list in layer_forward_sequences] 699 | 700 | task_backward_sequences = [ 701 | layer_seq_list[target_task_id][0] for 702 | layer_seq_list in layer_backward_sequences] 703 | 704 | 705 | if(num_layers > 1): 706 | forward_input = \ 707 | self.predictors['layer_stitch'][ 708 | target_task_id].stitch(task_forward_sequences) 709 | backward_input = \ 710 | self.predictors['layer_stitch'][ 711 | target_task_id].stitch(task_backward_sequences) 712 | 713 | 714 | else: 715 | forward_input = task_forward_sequences[0] 716 | backward_input = task_backward_sequences[0] 717 | 718 | 719 | 720 | 721 | concat_layer = dynet.concatenate([forward_input, backward_input]) 722 | 723 | if train and self.noise_sigma > 0.0: 724 | concat_layer = dynet.noise(concat_layer, self.noise_sigma) 725 | 726 | output = [] 727 | 728 | if('sentiment' in task_name):#Multi-label 729 | 730 | for i in range(len(output_predictor)): 731 | 732 | output.append(output_predictor[i](concat_layer)) 733 | 734 | 735 | else: 736 | output.append(output_predictor(concat_layer)) 737 | 738 | 739 | #output = output_predictor.predict_sequence(concat_layer) 740 | 741 | return output, penalty 742 | raise Exception('Error: This place should not be reached.') 743 | 744 | 745 | 746 | def evaluate(self, test_X, test_Y, test_lang, threshold): 747 | """ 748 | Computes accuracy on a test file. 
749 | :param test_X: the test data; a list of word index sequences 750 | :param test_Y: labels; a list of task-to-label sequence mappings 751 | :param test_lang: language of the test data 752 | :param threshold: threshold for classification in multi-label prediction 753 | 754 | :return: a dictionary storing the macro-f1 and micro-f1 scores of all tasks 755 | """ 756 | dynet.renew_cg(immediate_compute = True) # check_validity=True is not yet supported on CUDA 757 | 758 | #Display the parameters 759 | ''' 760 | if self.cross_stitch: 761 | for layer_num in range(self.h_layers): 762 | alphas = dynet.parameter( 763 | self.predictors['cross_stitch'][layer_num].alphas).value() 764 | print('Cross-stitch unit values at layer %d.' % layer_num, 765 | end=' ', flush=True) 766 | if self.num_subspaces > 1: 767 | print(np.array(alphas).flatten()) 768 | else: 769 | for i, task_i in enumerate(self.task_names): 770 | for j, task_j in enumerate(self.task_names): 771 | print('%s-%s: %3f.' % (task_i, task_j, 772 | alphas[i][j]), 773 | end=' ', flush=True) 774 | print('') 775 | 776 | 777 | ''' 778 | 779 | y_true_dict = {task: [] for task in self.task_names} 780 | 781 | y_pred_dict = {task: [] for task in self.task_names} 782 | 783 | 784 | for i, (word_indices, task2label_id_seq)\ 785 | in enumerate(zip(test_X, test_Y)): 786 | for task, label_id_seq in task2label_id_seq.items(): 787 | features = self.get_word_features(word_indices) 788 | output, _ = self.predict(features, task, train=False) 789 | 790 | y_true_dict[task].append(label_id_seq) 791 | 792 | if('sentiment' in task):#Multi-label classification 793 | output_seq = [] 794 | 795 | 796 | for o in output: 797 | o_val = o.value() 798 | 799 | if(o_val[1]>=threshold): 800 | output_seq.append(1) 801 | else: 802 | output_seq.append(0) 803 | 804 | y_pred_dict[task].append(output_seq) 805 | else: 806 | y_pred_dict[task].append([np.argmax(o.value()) for o in output]) 807 | 808 | 809 | 810 | 811 | 812 | res_dict = {} 813 | for task in self.task_names: 814 | 815 | res_dict[task] = {'micro_f1': 0, 'macro_f1': 0} 816 | 817 | 818 | for task in y_true_dict: 819 | 820 | 821 | clf_dict = classification_report(np.array(y_true_dict[task]), np.array(y_pred_dict[task]), 822 | output_dict=True) 823 | precision = clf_dict['micro avg']['precision'] 824 | recall = clf_dict['micro avg']['recall'] 825 | 826 | divisor = precision + recall 827 | if divisor<0.000001: 828 | divisor = 0.000001 829 | 830 | if(divisor > 0): 831 | 832 | res_dict[task]['micro_f1'] = (2*precision*recall)\ 833 | /(divisor) 834 | else: 835 | print("The sum of precision and recall equals zero.") 836 | res_dict[task]['micro_f1'] = 0 837 | 838 | precision = clf_dict['macro avg']['precision'] 839 | recall = clf_dict['macro avg']['recall'] 840 | divisor = precision + recall #recompute the denominator with the macro-averaged values 841 | if divisor<0.000001: 842 | divisor = 0.000001 843 | 844 | if(divisor > 0): 845 | res_dict[task]['macro_f1'] = (2*precision*recall)\ 846 | /(divisor) 847 | else: 848 | print("The sum of precision and recall equals zero.") 849 | res_dict[task]['macro_f1'] = 0 850 | print(test_lang) 851 | print(threshold) 852 | print(res_dict) 853 | return res_dict 854 | 855 | 856 | 857 | 858 | 859 | def get_word_features(self, word_indices): 860 | """ 861 | Produce the word embedding features that are used as input for the 862 | predictions. 
863 | :param word_indices: a list of word indices 864 | :return: a list of word embeddings 865 | """ 866 | dynet.renew_cg(immediate_compute = True) # new graph; check_validity=True is not yet supported on CUDA 867 | 868 | features = [] 869 | 870 | for w_idx in word_indices: 871 | update_flag = False 872 | if(w_idx in self.oov_id): 873 | #Allow words which are not in the pre-trained embeddings to 874 | #be updated during training 875 | update_flag = True 876 | 877 | embed_vec = dynet.lookup(self.wembeds, index=w_idx, update=update_flag) 878 | features.append(embed_vec) 879 | 880 | return features 881 | 882 | 883 | def pick_neg_log(pred, gold): 884 | """Return the negative log-likelihood of the gold label under the prediction.""" 885 | return -dynet.log(dynet.pick(pred, gold)) 886 | --------------------------------------------------------------------------------
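A note on restoring saved models: save() writes the DyNet weights to self.model_file and pickles the remaining hyperparameters to self.params_file, and the commented-out loader near the top of sluice_net.py reverses this by rebuilding the network from the pickled dictionary, rebuilding the computation graph, and then populating the weights. A minimal sketch of that pattern, assuming hypothetical MTML paths and showing only the main constructor arguments:

import pickle

params_file = 'models/MTML/MTML.pkl'    # hypothetical path
model_file = 'models/MTML/MTML.model'   # hypothetical path

params = pickle.load(open(params_file, 'rb'))
# Rebuild the network with the saved hyperparameters.
model = SluiceNetwork(params['h_dim'], params['h_layers'],
                      params['model_dir'], params['log_dir'],
                      task_names=params['task_names'],
                      languages=params['languages'],
                      embeds=params['embeds'],
                      word2id=params['w2i'],
                      cross_stitch=params['cross_stitch'])
model.predictors = model.build_computation_graph()
model.model.populate(model_file)        # load the trained DyNet parameters into the graph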
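On cross-stitch units: when cross_stitch=True, build_computation_graph() keeps one BiLSTM per task at every layer, and a CrossStitchLayer mixes the tasks' hidden states through a learned alpha matrix so each task can draw on the others' representations. A NumPy sketch of the mixing step only, with illustrative shapes and alpha values (this is not the repository's CrossStitchLayer implementation):

import numpy as np

num_tasks, h_dim = 2, 4
hidden = [np.random.randn(h_dim) for _ in range(num_tasks)]  # one hidden state per task

# An "imbalanced" initialisation keeps most of the weight on the task's own state.
alphas = np.array([[0.9, 0.1],
                   [0.1, 0.9]])

# Each task's stitched state is a weighted sum of all tasks' states.
stitched = [sum(alphas[i, j] * hidden[j] for j in range(num_tasks))
            for i in range(num_tasks)]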
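On the subspace constraint in predict(): when num_subspaces == 2 and constraint_weight != 0, each constrained LSTM weight matrix is split into two row blocks, and the squared Frobenius norm of the product of the blocks is added to the loss, pushing the two subspaces towards orthogonality. A NumPy sketch of the penalty term with an illustrative matrix shape:

import numpy as np

W = np.random.randn(8, 6)            # stand-in for one constrained LSTM weight matrix
sub1, sub2 = np.split(W, 2, axis=0)  # two (4, 6) subspaces

product = sub1.T @ sub2              # (6, 6) interactions between the subspaces
penalty = np.sum(product ** 2)       # squared Frobenius norm; zero when the column spaces
                                     # of the two blocks are orthogonal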
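On the multi-label 'sentiment' task: the output layer is one two-way softmax per label, and evaluate() marks a label as present when its positive-class probability reaches the threshold argument. A small sketch of that decision rule with made-up probabilities:

label_probs = [[0.8, 0.2], [0.3, 0.7], [0.45, 0.55]]  # [P(absent), P(present)] per label
threshold = 0.5

# A label is predicted as present when P(present) >= threshold.
predicted = [1 if probs[1] >= threshold else 0 for probs in label_probs]
# -> [0, 1, 1]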
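On the scores returned by evaluate(): micro- and macro-averaged precision and recall are read from sklearn's classification_report and recombined as F1 = 2PR / (P + R), with the denominator clamped away from zero. For comparison, sklearn can also compute the F1 scores directly on multi-label arrays; note that macro-F1 taken as the mean of per-label F1 values is not in general identical to the harmonic mean of macro precision and macro recall:

import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0, 1],
                   [0, 1, 0],
                   [1, 1, 0]])  # rows = instances, columns = labels
y_pred = np.array([[1, 0, 0],
                   [0, 1, 0],
                   [1, 0, 1]])

micro_f1 = f1_score(y_true, y_pred, average='micro')  # pools all label decisions
macro_f1 = f1_score(y_true, y_pred, average='macro')  # mean of per-label F1 scores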
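On the training loss: pick_neg_log() is the standard negative log-likelihood of the gold class under a softmax output, loss = -log p(gold), and fit() sums it over the outputs of an instance before adding the weighted subspace penalty. A NumPy illustration with made-up probabilities:

import numpy as np

probs = np.array([0.1, 0.7, 0.2])  # softmax output over three classes
gold = 1                           # index of the gold class
loss = -np.log(probs[gold])        # ~0.357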