├── .gitignore ├── Dockerfile ├── Python └── code │ ├── twitter_label.py │ └── wiki_badwords.py ├── R └── explore.R ├── README.md ├── app ├── __init__.py ├── app.py ├── config.py ├── core.py ├── data_science.py └── training.py ├── data ├── README.md ├── twitter-hate-speech.csv └── twitter-hate-speech2.csv ├── docker-compose-prebuilt.yml ├── docker-compose.yml ├── docs └── summarize_twitter-hate-speech2 │ ├── quantity_of_tweets_per_class_histogram.png │ └── tweet_length_histogram.png ├── flask_app.py ├── hate_speech_detector.Rproj ├── hatebase_api.py ├── nginx ├── Dockerfile └── conf.d ├── notebooks ├── .ipynb_checkpoints │ ├── Data Cleaning-checkpoint.ipynb │ └── Data Exploration-checkpoint.ipynb ├── Data Cleaning.ipynb ├── Data Exploration.ipynb ├── LSTM with Keras and TensorFlow.ipynb ├── best_multiclass_model.ipynb ├── clean.csv ├── labeled_data.csv └── multiclass_baseline.ipynb ├── requirements.txt └── research └── 23_Paper.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | *.ipynb 4 | *.xlsx 5 | Python/code/bad_words_from_wiki.txt 6 | Python/code/.DS_Store 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5-slim 2 | 3 | WORKDIR ./src 4 | 5 | ADD requirements.txt requirements.txt 6 | ADD app ./app/ 7 | ADD ./data/twitter-hate-speech2.csv ./data/ 8 | 9 | ENV TRAINING_DATA_LOCATION data/twitter-hate-speech2.csv 10 | 11 | RUN pip install -r requirements.txt 12 | 13 | RUN python -m app.training 14 | 15 | ENTRYPOINT ["gunicorn","-b", "0.0.0.0:8000", "app.app:app"] -------------------------------------------------------------------------------- /Python/code/twitter_label.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas 3 | import numpy as np 4 | import re 5 | 6 | tweets = pandas.read_csv('~/Documents/final_tweets_NLP+CSS_2016.csv', header = None) 7 | tweets['label'] = 0 8 | 9 | badwords = pandas.read_csv('~/Documents/list1.csv', header = None) 10 | 11 | for i in range(0, len(badwords)): 12 | 13 | print(badwords[0][i]) 14 | tweets["label"] = tweets["label"] + [1 if badwords[0][i] in ele else 0 for ele in tweets[1]] 15 | 16 | 17 | 18 | 19 | badwords = pandas.read_csv('~/Documents/list2.csv', header = None) 20 | 21 | for i in range(0, len(badwords)): 22 | text = re.findall('\"(.*?)\"', badwords.loc[i][0]) 23 | print(text) 24 | tweets["label"] = tweets["label"] + [1 if text[0] in ele else 0 for ele in tweets[1]] 25 | 26 | 27 | badwords = pandas.read_csv('~/Documents/list3.csv', header = None) 28 | 29 | for i in range(0, len(badwords)): 30 | 31 | print(badwords[0][i]) 32 | tweets["label"] = tweets["label"] + [1 if badwords[0][i] in ele else 0 for ele in tweets[1]] 33 | 34 | 35 | 36 | badwords = pandas.read_csv('~/Documents/list4.csv', header = None) 37 | 38 | for i in range(0, len(badwords)): 39 | 40 | print(badwords[0][i]) 41 | tweets["label"] = tweets["label"] + [1 if badwords[0][i] in ele else 0 for ele in tweets[1]] 42 | 43 | 44 | badwords = pandas.read_csv('~/Documents/list5.csv', header = None) 45 | 46 | for i in range(0, len(badwords)): 47 | text = badwords.loc[i][0].split(',') 48 | print(text) 49 | tweets["label"] = tweets["label"] + [1 if text[0] in ele else 0 for ele in tweets[1]] 50 | 51 | 52 | tweets['label'] = np.where(tweets['label']>=1, 1, 0) 53 | 54 | 55 | ## sanity check 56 | tweets['label'].sum() 57 | 
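The five word-list loops above repeat the same substring-matching pass with slightly different parsing for each list. A minimal consolidated sketch follows; it is not part of the repository, and the file paths, the one-term-per-row CSV layout, and the case-sensitive substring rule are assumptions carried over from the script.

```python
# Consolidation sketch (hypothetical; not in the repo). Assumes the same local
# CSV paths as twitter_label.py and one term per row in each word list.
import numpy as np
import pandas as pd


def load_terms(path):
    """Read a one-column CSV of terms and return them as stripped strings."""
    return pd.read_csv(path, header=None)[0].astype(str).str.strip().tolist()


def label_tweets(texts, term_list_paths):
    """Return 1 for each tweet containing any term as a plain substring, else 0."""
    labels = np.zeros(len(texts), dtype=int)
    for path in term_list_paths:
        for term in load_terms(path):
            hits = texts.astype(str).str.contains(term, regex=False).values
            labels = np.maximum(labels, hits.astype(int))
    return labels


# Hypothetical usage mirroring the script above:
# tweets = pd.read_csv('~/Documents/final_tweets_NLP+CSS_2016.csv', header=None)
# tweets['label'] = label_tweets(tweets[1], [f'~/Documents/list{i}.csv' for i in range(1, 6)])
```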
-------------------------------------------------------------------------------- /Python/code/wiki_badwords.py: -------------------------------------------------------------------------------- 1 | import wikipediaapi 2 | import re 3 | 4 | 5 | 6 | def process_word(word): 7 | # remove tags 8 | remove_list = ['', '', '', '', '\xa0', '"'] 9 | for r in remove_list: 10 | word = word.replace(r, '') 11 | # Remove span 12 | if 'span' in word: 13 | word = re.findall(r'>(.*?)<', word)[0] 14 | # Remove words in parentheses 15 | if '(' in word and ')' in word: 16 | word = word[:word.index('(')] + word[word.index(')')+1:] 17 | if '(' in word: 18 | word = word[:word.index('(')] 19 | # Replace different delimiters with a comma 20 | replace_list = ['/', ' or ', ' also spelled '] 21 | for r in replace_list: 22 | word = word.replace(r, ',') 23 | # Remove non-Latin characters 24 | stripped_text = '' 25 | for c in word: 26 | stripped_text += c if len(c.encode(encoding='utf_8'))==1 else '' 27 | word = stripped_text 28 | return(word) 29 | 30 | 31 | def process_extract(page_text): 32 | extract = re.findall(r'
<li>(.*?)</li>
', page_text) 33 | for word in extract: 34 | ind = extract.index(word) 35 | extract[ind] = process_word(word) 36 | 37 | bad_words = [] 38 | for word in extract: 39 | bad_words.extend(word.split(',')) 40 | #ethnic_words = [x.strip() for x in ethnic_words] 41 | 42 | bad_words[:] = [x for x in bad_words if x != ''] 43 | bad_words[:] = [x.strip().lower() for x in bad_words] 44 | return(bad_words) 45 | 46 | 47 | 48 | 49 | 50 | wiki_html = wikipediaapi.Wikipedia( 51 | language='en', 52 | extract_format=wikipediaapi.ExtractFormat.HTML 53 | ) 54 | 55 | page_ethnic = wiki_html.page("List_of_ethnic_slurs") 56 | page_religous = wiki_html.page("List_of_religious_slurs") 57 | 58 | 59 | ethnic_bad_words = process_extract(page_ethnic.text) 60 | religious_bad_words = process_extract(page_religous.text) 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /R/explore.R: -------------------------------------------------------------------------------- 1 | library(readr) 2 | library(dplyr) 3 | library(ggplot2) 4 | 5 | # Explore the data in 'twitter-hate-speech2.csv'. 6 | 7 | explore <- function() { 8 | encoded_tweets <- readr::read_csv("./data/twitter-hate-speech2.csv") 9 | names(encoded_tweets)[1] <- "id" 10 | 11 | # Bar chart of the number of tweets in each class 12 | encoded_tweets %>% 13 | dplyr::group_by(class) %>% 14 | dplyr::summarize(total = n()) %>% 15 | ggplot(aes(x = factor(class), y = total)) + 16 | geom_bar(stat = "identity") + 17 | geom_text(aes(label = total, vjust = -0.25)) + 18 | scale_x_discrete(labels = c("hate speech", "offensive language", "neither")) + 19 | xlab("class") + 20 | ylab("quantity") + 21 | ggtitle("Number of tweets in each class (data: twitter-hate-speech2.csv)") %>% 22 | print() 23 | 24 | # Histogram of tweet lengths 25 | encoded_tweets %>% 26 | mutate(tweet_length = nchar(tweet)) %>% 27 | ggplot(aes(x = tweet_length)) + 28 | geom_histogram(binwidth = 1) + 29 | scale_x_continuous(name = "Number of characters in tweet", 30 | limits = c(0, 300), 31 | expand = c(0, 0)) + 32 | ylab("Number of tweets") + 33 | ggtitle("Length of tweets (data: twitter-hate-speech2.csv)") %>% 34 | print() 35 | } 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hate_speech_detector 2 | 3 | ## Project Status 4 | 5 | We have built a few fully working hate speech detection models. This project is currently in hibernation, in that there's not an active use case at the moment but the models are ready to go. Every once in a while we'll tweak the models a bit but in general, there is not active development on this project. If you know of a possible application, please reach out to us. Also, if you're interested in helping, we're always looking for help, whether more data, more models, or any other interesting component of hate speech detection. If you're interested in using or working on this model, feel free to reach out to the Slack channel (#p-hate-speech) or Julius Simonelli (jss367 in Slack). 6 | 7 | ## Data 8 | 9 | We are currently working with the data collected by [Davidson et al.](https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665) for their research on hate speech detection. The dataset contains tweets that are labeled as either hate speech, offensive language, or neither. 
The data were pulled from [Hatebase.org](https://www.hatebase.org/), an organization that collects instances of potential hate speech. The data were then labeled using [CrowdFlower](https://www.crowdflower.com/), which uses non-specialists to clean and label data. Each tweet was reviewed by three or more people, and a majority-wins approach was taken when there was disagreement. 10 | 11 | All data used in this analysis is stored in the [data](https://github.com/Data4Democracy/hate_speech_detector/tree/master/data) folder of [this repository](https://github.com/Data4Democracy/hate_speech_detector). The original source of the data is: https://github.com/t-davidson/hate-speech-and-offensive-language 12 | 13 | The paper by Davidson et al. can be found here: 14 | Thomas Davidson, Dana Warmsley, Michael Macy, Ingmar Weber. 2017. "[Automated Hate Speech Detection and the Problem of Offensive Language](https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665)". Proceedings of the 11th International AAAI Conference on Web and Social Media (ICWSM). 15 | 16 | ## Demo REST API 17 | 18 | A REST API has been designed to demo the functionality of a basic model. The code will train a basic model as defined in data_science.py and use this model to make predictions. To get the current IP address, ask in the Data for Democracy #p-hate-speech Slack channel. 19 | 20 | ### Installation (Docker) 21 | 22 | Navigate to the hate_speech_detector/ directory and build the container: 23 | 24 | ```shell 25 | docker build -t [container-name] . 26 | ``` 27 | 28 | ### Usage 29 | 30 | When you run the container, you must also expose port 8000. For example: 31 | 32 | 33 | ```shell 34 | docker run -p 8000:8000 -t [container-name] 35 | ``` 36 | 37 | #### Labels 38 | 39 | Labels are Hate = 0, Offensive = 1, Not Offensive = 2. 40 | 41 | #### API Endpoints 42 | 43 | - /label 44 | 45 | The API can be called to predict the label of new text data via: 46 | 47 | ```shell 48 | curl -H "Content-Type: application/json" -X POST -d '{"text":"Text that might be offensive or hateful... or not."}' http://0.0.0.0:8000/label 49 | ``` 50 | 51 | Example output: 52 | 53 | ```shell 54 | { 55 | "label": 2, 56 | "text": "Text that might be offensive or hateful... or not." 57 | } 58 | 59 | ``` 60 | 61 | In this case, "text" is the input text and "label" is the predicted label from the model. 62 | 63 | 64 | - /demo 65 | 66 | You may also see the model predict on held-out test set values via: 67 | 68 | ```shell 69 | curl http://0.0.0.0:8000/demo 70 | ``` 71 | Example output: 72 | 73 | ```shell 74 | { 75 | "label": 2, 76 | "text": "#stateoftheunion would last 15mins if they let the President talk all that standing up clapping is for the birds", 77 | "true": 2 78 | } 79 | ``` 80 | 81 | Here, "text" is the text input, "label" is the predicted label from the model, and "true" is the actual label given by a human. 82 | 83 | ### Deploy with prebuilt containers via docker-compose 84 | 85 | 86 | ```shell 87 | docker-compose -f docker-compose-prebuilt.yml up 88 | ``` 89 | 90 | At this point, if you set this up on AWS, you can navigate to 91 | 92 | ```shell 93 | [your aws ip]/demo 94 | ``` 95 | 96 | in your browser to see the demo. 97 | 98 | ## To Do: 99 | 100 | ### Classifier 101 | There are currently two Jupyter Notebooks containing models to classify the data, but both could be greatly improved. Please feel free to take a look and let us know if you make any improvements! 
102 | 103 | ### Data preprocessing 104 | There is currently very little preprocessing done on this data. Would someone be interested in creating some useful categories for machine learning and plugging them back into the models? My guess is feature engineering has the most potential to improve the model. 105 | 106 | 107 | ### Front End 108 | 109 | A front end for the demo app which can demo random elements of the test set or allow the user to input their own text. 110 | 111 | ### Dataset 112 | The Davidson et al. paper remarked on some possible mislabelings in the dataset. Is mislabeling common in the dataset? Fixing any labels would definitely improve our ability to create a classifier. How big a problem is this? Does someone want to look at some of the misclassifications and see if any are incorrectly labeled? 113 | 114 | Also, there's a second data source, also containing labeled hate speech from Twitter, but we are yet to explore it. You can find the data here: https://github.com/zeerakw/hatespeech. If anyone wants to look into this dataset and assess its value, it would be very useful. Something else to consider - can these datasets be combined? 115 | 116 | ### Strategy 117 | What else do we want to do with this? 118 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/app/__init__.py -------------------------------------------------------------------------------- /app/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify, request 2 | 3 | from app.core import DataStore 4 | from app.core import Model 5 | 6 | app = Flask(__name__) 7 | 8 | @app.route('/') 9 | def index(): 10 | # TODO: return a front end page that can call /demo or /label 11 | return '' 12 | 13 | @app.route('/demo') 14 | def demo(): 15 | """ Endpoint to see the model's performance on a random test set example. """ 16 | tweet, label = DataStore.get_random_test() 17 | return_json = Model.predict([tweet])[0] 18 | return_json['true'] = int(label) 19 | return jsonify(return_json) 20 | 21 | @app.route('/label', methods=['POST']) 22 | def predict(): 23 | """ Applies the model to the 'text' entry of the payload. 
""" 24 | content = request.json 25 | return jsonify(Model.predict([content['text']])[0]) 26 | 27 | 28 | if __name__ == '__main__': 29 | app.run() 30 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | TRAINING_DATA_LOCATION = os.environ['TRAINING_DATA_LOCATION'] 4 | MODEL_LOCATION = 'model.pkl' -------------------------------------------------------------------------------- /app/core.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.externals import joblib 4 | from app.config import TRAINING_DATA_LOCATION, MODEL_LOCATION 5 | 6 | 7 | class GlobalDataLoad(type): 8 | """ Metaclass used to load data before app begins.""" 9 | def __new__(meta, name, bases, clzdict): 10 | cls = super().__new__(type,name,bases,clzdict) 11 | 12 | df = pd.read_csv(TRAINING_DATA_LOCATION, encoding='latin-1') 13 | 14 | cls.X=df['tweet'] 15 | cls.y=df['class'] 16 | 17 | return cls 18 | 19 | 20 | class DataStore(metaclass=GlobalDataLoad): 21 | 22 | 23 | @classmethod 24 | def get_random_test(cls): 25 | ind = np.random.choice(len(cls.X)) 26 | return cls.X.iloc[ind], cls.y.iloc[ind] 27 | 28 | 29 | class GlobalModelLoad(type): 30 | """ Metaclass used to load and train the model before app begins.""" 31 | def __new__(meta, name, bases, clzdict): 32 | cls = super().__new__(type,name,bases,clzdict) 33 | 34 | # Load model 35 | cls._model = joblib.load(MODEL_LOCATION) 36 | 37 | return cls 38 | 39 | 40 | class Model(metaclass=GlobalModelLoad): 41 | 42 | @classmethod 43 | def predict(cls, X): 44 | """ X should be a list of strings.""" 45 | return [{ "text" : k, "label" : int(v)} for k,v in zip(X,cls._model.predict(X))] # cast to int for json, remove numpy type 46 | 47 | 48 | -------------------------------------------------------------------------------- /app/data_science.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import nltk 4 | from nltk.stem.porter import * 5 | import string 6 | import re 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS 9 | from textstat.textstat import * 10 | from sklearn.base import BaseEstimator, TransformerMixin 11 | 12 | stopwords=stopwords = nltk.corpus.stopwords.words("english") 13 | 14 | other_exclusions = ["#ff", "ff", "rt"] 15 | stopwords.extend(other_exclusions) 16 | 17 | stemmer = PorterStemmer() 18 | 19 | def preprocess(text_string): 20 | """ 21 | Accepts a text string and replaces: 22 | 1) urls with URLHERE 23 | 2) lots of whitespace with one instance 24 | 3) mentions with MENTIONHERE 25 | 26 | This allows us to get standardized counts of urls and mentions 27 | Without caring about specific people mentioned 28 | """ 29 | space_pattern = '\s+' 30 | giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|' 31 | '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') 32 | mention_regex = '@[\w\-]+' 33 | parsed_text = re.sub(space_pattern, ' ', text_string) 34 | parsed_text = re.sub(giant_url_regex, '', parsed_text) 35 | parsed_text = re.sub(mention_regex, '', parsed_text) 36 | return parsed_text 37 | 38 | def tokenize(tweet): 39 | """Removes punctuation & excess whitespace, sets to lowercase, 40 | and stems tweets. 
Returns a list of stemmed tokens.""" 41 | tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip() 42 | tokens = [stemmer.stem(t) for t in tweet.split()] 43 | return tokens 44 | 45 | def basic_tokenize(tweet): 46 | """Same as tokenize but without the stemming""" 47 | tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip() 48 | return tweet.split() 49 | 50 | class PosTfidfVectorizer(BaseEstimator, TransformerMixin): 51 | """Get POS tags for tweets and transform via tfidf""" 52 | 53 | def __init__(self): 54 | self._pos_vectorizer = TfidfVectorizer( 55 | tokenizer=None, 56 | lowercase=False, 57 | preprocessor=None, 58 | ngram_range=(1, 3), 59 | stop_words=None, 60 | use_idf=False, 61 | smooth_idf=False, 62 | norm=None, 63 | decode_error='replace', 64 | max_features=5000, 65 | min_df=5, 66 | max_df=0.75, 67 | ) 68 | 69 | def _preprocess(self, X): 70 | tweet_tags = [] 71 | for t in X: 72 | tokens = basic_tokenize(preprocess(t)) 73 | tags = nltk.pos_tag(tokens) 74 | tag_list = [x[1] for x in tags] 75 | tag_str = " ".join(tag_list) 76 | tweet_tags.append(tag_str) 77 | return tweet_tags 78 | 79 | def fit(self, X, y=None): 80 | tweet_tags = self._preprocess(X) 81 | self._pos_vectorizer.fit(tweet_tags) 82 | 83 | return self 84 | 85 | def transform(self, X, y=None): 86 | tweet_tags = self._preprocess(X) 87 | return self._pos_vectorizer.transform(tweet_tags) 88 | 89 | class SentimentVectorizer(BaseEstimator, TransformerMixin): 90 | sentiment_analyzer = VS() 91 | 92 | def count_twitter_objs(self, text_string): 93 | """ 94 | Accepts a text string and replaces: 95 | 1) urls with URLHERE 96 | 2) lots of whitespace with one instance 97 | 3) mentions with MENTIONHERE 98 | 4) hashtags with HASHTAGHERE 99 | 100 | This allows us to get standardized counts of urls and mentions 101 | Without caring about specific people mentioned. 102 | 103 | Returns counts of urls, mentions, and hashtags. 104 | """ 105 | space_pattern = '\s+' 106 | giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|' 107 | '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') 108 | mention_regex = '@[\w\-]+' 109 | hashtag_regex = '#[\w\-]+' 110 | parsed_text = re.sub(space_pattern, ' ', text_string) 111 | parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text) 112 | parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text) 113 | parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text) 114 | return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE')) 115 | 116 | def other_features(self, tweet): 117 | """This function takes a string and returns a list of features. 
118 | These include Sentiment scores, Text and Readability scores, 119 | as well as Twitter specific features""" 120 | sentiment = self.sentiment_analyzer.polarity_scores(tweet) 121 | 122 | words = preprocess(tweet) #Get text only 123 | 124 | syllables = textstat.syllable_count(words) 125 | num_chars = sum(len(w) for w in words) 126 | num_chars_total = len(tweet) 127 | num_terms = len(tweet.split()) 128 | num_words = len(words.split()) 129 | avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4) 130 | num_unique_terms = len(set(words.split())) 131 | 132 | ###Modified FK grade, where avg words per sentence is just num words/1 133 | FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1) 134 | ##Modified FRE score, where sentence fixed to 1 135 | FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2) 136 | 137 | twitter_objs = self.count_twitter_objs(tweet) 138 | retweet = 0 139 | if "rt" in words: 140 | retweet = 1 141 | features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words, 142 | num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'], 143 | twitter_objs[2], twitter_objs[1], 144 | twitter_objs[0], retweet] 145 | return features 146 | 147 | def get_feature_array(self, tweets): 148 | feats=[] 149 | for t in tweets: 150 | feats.append(self.other_features(t)) 151 | return np.array(feats) 152 | 153 | def fit(self, X, y=None): 154 | return self 155 | 156 | def transform(self, X, y=None): 157 | return self.get_feature_array(X) -------------------------------------------------------------------------------- /app/training.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | # download necessary components 4 | nltk.download('stopwords') 5 | nltk.download('averaged_perceptron_tagger') 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from sklearn.externals import joblib 10 | from sklearn.pipeline import Pipeline, FeatureUnion 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | from sklearn.feature_selection import SelectFromModel 13 | from sklearn.linear_model import LogisticRegression 14 | import nltk 15 | 16 | from app.config import TRAINING_DATA_LOCATION, MODEL_LOCATION 17 | from app.data_science import SentimentVectorizer, preprocess, tokenize, PosTfidfVectorizer, stopwords 18 | 19 | 20 | # Train Model and Save 21 | 22 | vectorizer = TfidfVectorizer( 23 | tokenizer=tokenize, 24 | preprocessor=preprocess, 25 | ngram_range=(1, 3), 26 | stop_words=stopwords, 27 | use_idf=True, 28 | smooth_idf=False, 29 | norm=None, 30 | decode_error='replace', 31 | max_features=10000, 32 | min_df=5, 33 | max_df=0.75 34 | ) 35 | pos_vectorizer = PosTfidfVectorizer() 36 | sentiment_vectorizer = SentimentVectorizer() 37 | 38 | model = Pipeline( [('features', FeatureUnion([('tfidf', vectorizer),('pos_tfidf', pos_vectorizer), 39 | ('sentiment',sentiment_vectorizer)])), 40 | ('feature_selector', SelectFromModel(LogisticRegression(class_weight='balanced',penalty="l1",C=0.01))), 41 | ('model', LogisticRegression(class_weight='balanced',penalty='l2',C=0.01))] ) 42 | 43 | df = pd.read_csv(TRAINING_DATA_LOCATION, encoding='latin-1') 44 | 45 | X = df['tweet'] 46 | y = df['class'] 47 | 48 | model.fit(X,y) 49 | joblib.dump(model, MODEL_LOCATION) 50 | -------------------------------------------------------------------------------- /data/README.md: 
-------------------------------------------------------------------------------- 1 | # Data Definitions 2 | 3 | ## `twitter-hate-speech.csv` 4 | This file was copied from a [data.world](https://data.world/) repository: [crowdflower](https://data.world/crowdflower)/[Hate Speech Identification](https://data.world/crowdflower/hate-speech-identification/)/[twitter-hate-speech-classifier-DFE-a845520.csv](https://data.world/crowdflower/hate-speech-identification/workspace/file?filename=twitter-hate-speech-classifier-DFE-a845520.csv) 5 | 6 | _Open Issue_: What do the columns mean? 7 | 8 | ## `twitter-hate-speech2.csv` 9 | This file was copied from [Davidson et al.](https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665)'s [labeled_data.csv](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/data/labeled_data.csv). The same data is also in a [data.world](https://data.world/) repository: [thomasrdavidson](https://data.world/thomasrdavidson/)/[Hate Speech and Offensive Language](https://data.world/thomasrdavidson/hate-speech-and-offensive-language/)/[labeled_data.csv](https://data.world/thomasrdavidson/hate-speech-and-offensive-language/workspace/file?filename=labeled_data.csv) 10 | 11 | The file contains 5 columns: 12 | 13 | `count` = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF). 14 | 15 | `hate_speech` = number of CF users who judged the tweet to be hate speech. 16 | 17 | `offensive_language` = number of CF users who judged the tweet to be offensive. 18 | 19 | `neither` = number of CF users who judged the tweet to be neither offensive nor non-offensive. 20 | 21 | `class` = class label for majority of CF users. 22 | 0 - hate speech 23 | 1 - offensive language 24 | 2 - neither 25 | -------------------------------------------------------------------------------- /data/twitter-hate-speech.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/data/twitter-hate-speech.csv -------------------------------------------------------------------------------- /docker-compose-prebuilt.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | app: 4 | image: "srome/d4d-hate-speech-detector:latest" 5 | networks: 6 | - web_nw 7 | ports: 8 | - "80:8000" 9 | nginx: 10 | image: "srome/d4d-hate-speech-detector:nginx" 11 | networks: 12 | - web_nw 13 | networks: 14 | web_nw: 15 | driver: bridge -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | app: 4 | build: . 
5 | networks: 6 | - web_nw 7 | ports: 8 | - "80:8000" 9 | nginx: 10 | build: ./nginx 11 | networks: 12 | - web_nw 13 | networks: 14 | web_nw: 15 | driver: bridge -------------------------------------------------------------------------------- /docs/summarize_twitter-hate-speech2/quantity_of_tweets_per_class_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/docs/summarize_twitter-hate-speech2/quantity_of_tweets_per_class_histogram.png -------------------------------------------------------------------------------- /docs/summarize_twitter-hate-speech2/tweet_length_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/docs/summarize_twitter-hate-speech2/tweet_length_histogram.png -------------------------------------------------------------------------------- /flask_app.py: -------------------------------------------------------------------------------- 1 | 2 | # A very simple Flask Hello World app for you to get started with... 3 | 4 | from flask import Flask 5 | 6 | app = Flask(__name__) 7 | 8 | @app.route('/') 9 | def hello_world(): 10 | return 'Hello from Flask!' 11 | 12 | -------------------------------------------------------------------------------- /hate_speech_detector.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | -------------------------------------------------------------------------------- /hatebase_api.py: -------------------------------------------------------------------------------- 1 | ''' This is the code for using the Hatebase API 2 | It uses this Python wrapper: https://github.com/DanielJDufour/hatebase 3 | which can be installed with: pip install hatebase 4 | 5 | ''' 6 | from json import loads 7 | from hatebase import HatebaseAPI 8 | 9 | key = # get a key here: https://www.hatebase.org/request_api 10 | 11 | # Define parameters 12 | hatebase = HatebaseAPI({"key": key}) 13 | filters = {'language': 'eng'} 14 | output = 'json' 15 | query_type = 'sightings' 16 | 17 | # Query the database 18 | response = hatebase.performRequest(filters, output, query_type) 19 | 20 | # Convert to Python object 21 | resp = loads(response) -------------------------------------------------------------------------------- /nginx/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx 2 | 3 | COPY conf.d /etc -------------------------------------------------------------------------------- /nginx/conf.d: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | 3 | events { worker_connections 1024; } 4 | 5 | http { 6 | 7 | upstream docker-app { 8 | server app; 9 | } 10 | 11 | server { 12 | listen 80; 13 | 14 | location / { 15 | proxy_pass http://docker-app; 16 | proxy_redirect off; 17 | proxy_set_header Host $host; 18 | proxy_set_header X-Real-IP $remote_addr; 19 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 20 | proxy_set_header X-Forwarded-Host 
$server_name; 21 | } 22 | } 23 | 24 | } -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Data Exploration-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook is an attempt to explore the dataset. This notebook needs to be expanded upon." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import re\n", 20 | "from textstat.textstat import textstat\n", 21 | "from textblob import TextBlob\n", 22 | "import seaborn as sns\n", 23 | "%matplotlib inline\n", 24 | "sns.set_style(\"dark\")\n", 25 | "sns.set_context(\"talk\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "df = pd.read_csv('twitter-hate-speech.csv', encoding='latin-1')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "
\n", 48 | "\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | "
_unit_id_golden_unit_state_trusted_judgments_last_judgment_atdoes_this_tweet_contain_hate_speechdoes_this_tweet_contain_hate_speech:confidence_created_atorig__goldenorig__last_judgment_atorig__trusted_judgmentsorig__unit_idorig__unit_state_updated_atorig_does_this_tweet_contain_hate_speechdoes_this_tweet_contain_hate_speech_golddoes_this_tweet_contain_hate_speech_gold_reasondoes_this_tweet_contain_hate_speechconfidencetweet_idtweet_text
0853718217Truegolden86NaNThe tweet uses offensive language but not hate...0.6013NaNTrueNaN0.0615561535.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.01.666196e+09Warning: penny boards will make you a faggot
1853718218Truegolden92NaNThe tweet contains hate speech0.7227NaNTrueNaN0.0615561723.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.04.295121e+08Fuck dykes
2853718219Truegolden86NaNThe tweet contains hate speech0.5229NaNTrueNaN0.0615562039.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.03.956238e+08@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3853718220Truegolden98NaNThe tweet contains hate speech0.5184NaNTrueNaN0.0615562068.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.04.975147e+08\"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill...
4853718221Truegolden88NaNThe tweet uses offensive language but not hate...0.5185NaNTrueNaN0.0615562488.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.05.889236e+08@Zhugstubble You heard me bitch but any way I'...
\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " _unit_id _golden _unit_state _trusted_judgments _last_judgment_at \\\n", 209 | "0 853718217 True golden 86 NaN \n", 210 | "1 853718218 True golden 92 NaN \n", 211 | "2 853718219 True golden 86 NaN \n", 212 | "3 853718220 True golden 98 NaN \n", 213 | "4 853718221 True golden 88 NaN \n", 214 | "\n", 215 | " does_this_tweet_contain_hate_speech \\\n", 216 | "0 The tweet uses offensive language but not hate... \n", 217 | "1 The tweet contains hate speech \n", 218 | "2 The tweet contains hate speech \n", 219 | "3 The tweet contains hate speech \n", 220 | "4 The tweet uses offensive language but not hate... \n", 221 | "\n", 222 | " does_this_tweet_contain_hate_speech:confidence _created_at orig__golden \\\n", 223 | "0 0.6013 NaN True \n", 224 | "1 0.7227 NaN True \n", 225 | "2 0.5229 NaN True \n", 226 | "3 0.5184 NaN True \n", 227 | "4 0.5185 NaN True \n", 228 | "\n", 229 | " orig__last_judgment_at orig__trusted_judgments orig__unit_id \\\n", 230 | "0 NaN 0.0 615561535.0 \n", 231 | "1 NaN 0.0 615561723.0 \n", 232 | "2 NaN 0.0 615562039.0 \n", 233 | "3 NaN 0.0 615562068.0 \n", 234 | "4 NaN 0.0 615562488.0 \n", 235 | "\n", 236 | " orig__unit_state _updated_at orig_does_this_tweet_contain_hate_speech \\\n", 237 | "0 golden NaN The tweet contains hate speech \n", 238 | "1 golden NaN The tweet contains hate speech \n", 239 | "2 golden NaN The tweet contains hate speech \n", 240 | "3 golden NaN The tweet contains hate speech \n", 241 | "4 golden NaN The tweet contains hate speech \n", 242 | "\n", 243 | " does_this_tweet_contain_hate_speech_gold \\\n", 244 | "0 The tweet contains hate speech\\nThe tweet uses... \n", 245 | "1 The tweet contains hate speech\\nThe tweet uses... \n", 246 | "2 The tweet contains hate speech\\nThe tweet uses... \n", 247 | "3 The tweet contains hate speech\\nThe tweet uses... \n", 248 | "4 The tweet contains hate speech\\nThe tweet uses... \n", 249 | "\n", 250 | " does_this_tweet_contain_hate_speech_gold_reason \\\n", 251 | "0 NaN \n", 252 | "1 NaN \n", 253 | "2 NaN \n", 254 | "3 NaN \n", 255 | "4 NaN \n", 256 | "\n", 257 | " does_this_tweet_contain_hate_speechconfidence tweet_id \\\n", 258 | "0 1.0 1.666196e+09 \n", 259 | "1 1.0 4.295121e+08 \n", 260 | "2 1.0 3.956238e+08 \n", 261 | "3 1.0 4.975147e+08 \n", 262 | "4 1.0 5.889236e+08 \n", 263 | "\n", 264 | " tweet_text \n", 265 | "0 Warning: penny boards will make you a faggot \n", 266 | "1 Fuck dykes \n", 267 | "2 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon... \n", 268 | "3 \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill... \n", 269 | "4 @Zhugstubble You heard me bitch but any way I'... " 270 | ] 271 | }, 272 | "execution_count": 3, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "df.head()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 4, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
\n", 290 | "\n", 303 | "\n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | "
_unit_id_trusted_judgmentsdoes_this_tweet_contain_hate_speech:confidence_created_atorig__last_judgment_atorig__trusted_judgmentsorig__unit_id_updated_atdoes_this_tweet_contain_hate_speech_gold_reasondoes_this_tweet_contain_hate_speechconfidencetweet_id
count1.450900e+0414509.00000014509.0000000.00.067.06.700000e+010.00.067.01.450900e+04
mean8.537266e+083.4069890.865844NaNNaN0.06.155623e+08NaNNaN1.06.762014e+17
std4.205642e+035.9796990.178734NaNNaN0.01.089591e+03NaNNaN0.04.606417e+16
min8.537182e+083.0000000.333300NaNNaN0.06.155611e+08NaNNaN1.02.423319e+07
25%8.537230e+083.0000000.668400NaNNaN0.06.155612e+08NaNNaN1.06.790000e+17
50%8.537266e+083.0000001.000000NaNNaN0.06.155622e+08NaNNaN1.06.790000e+17
75%8.537303e+083.0000001.000000NaNNaN0.06.155625e+08NaNNaN1.06.800000e+17
max8.537339e+0898.0000001.000000NaNNaN0.06.155658e+08NaNNaN1.06.800000e+17
\n", 435 | "
" 436 | ], 437 | "text/plain": [ 438 | " _unit_id _trusted_judgments \\\n", 439 | "count 1.450900e+04 14509.000000 \n", 440 | "mean 8.537266e+08 3.406989 \n", 441 | "std 4.205642e+03 5.979699 \n", 442 | "min 8.537182e+08 3.000000 \n", 443 | "25% 8.537230e+08 3.000000 \n", 444 | "50% 8.537266e+08 3.000000 \n", 445 | "75% 8.537303e+08 3.000000 \n", 446 | "max 8.537339e+08 98.000000 \n", 447 | "\n", 448 | " does_this_tweet_contain_hate_speech:confidence _created_at \\\n", 449 | "count 14509.000000 0.0 \n", 450 | "mean 0.865844 NaN \n", 451 | "std 0.178734 NaN \n", 452 | "min 0.333300 NaN \n", 453 | "25% 0.668400 NaN \n", 454 | "50% 1.000000 NaN \n", 455 | "75% 1.000000 NaN \n", 456 | "max 1.000000 NaN \n", 457 | "\n", 458 | " orig__last_judgment_at orig__trusted_judgments orig__unit_id \\\n", 459 | "count 0.0 67.0 6.700000e+01 \n", 460 | "mean NaN 0.0 6.155623e+08 \n", 461 | "std NaN 0.0 1.089591e+03 \n", 462 | "min NaN 0.0 6.155611e+08 \n", 463 | "25% NaN 0.0 6.155612e+08 \n", 464 | "50% NaN 0.0 6.155622e+08 \n", 465 | "75% NaN 0.0 6.155625e+08 \n", 466 | "max NaN 0.0 6.155658e+08 \n", 467 | "\n", 468 | " _updated_at does_this_tweet_contain_hate_speech_gold_reason \\\n", 469 | "count 0.0 0.0 \n", 470 | "mean NaN NaN \n", 471 | "std NaN NaN \n", 472 | "min NaN NaN \n", 473 | "25% NaN NaN \n", 474 | "50% NaN NaN \n", 475 | "75% NaN NaN \n", 476 | "max NaN NaN \n", 477 | "\n", 478 | " does_this_tweet_contain_hate_speechconfidence tweet_id \n", 479 | "count 67.0 1.450900e+04 \n", 480 | "mean 1.0 6.762014e+17 \n", 481 | "std 0.0 4.606417e+16 \n", 482 | "min 1.0 2.423319e+07 \n", 483 | "25% 1.0 6.790000e+17 \n", 484 | "50% 1.0 6.790000e+17 \n", 485 | "75% 1.0 6.800000e+17 \n", 486 | "max 1.0 6.800000e+17 " 487 | ] 488 | }, 489 | "execution_count": 4, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "df.describe()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 5, 501 | "metadata": { 502 | "collapsed": true 503 | }, 504 | "outputs": [], 505 | "source": [ 506 | "data_path = 'twitter-hate-speech.csv'\n", 507 | "\n", 508 | "df = pd.read_csv(data_path, encoding='latin1')\n", 509 | "df = df.rename(columns={'does_this_tweet_contain_hate_speech': 'label', \n", 510 | " 'does_this_tweet_contain_hate_speech:confidence': 'confidence' })\n", 511 | "\n", 512 | "mapping = {'The tweet is not offensive': 'Not offensive', \n", 513 | " 'The tweet uses offensive language but not hate speech': 'Offensive',\n", 514 | " 'The tweet contains hate speech': 'Hate speech'\n", 515 | " }\n", 516 | "df['label'] = df['label'].map(lambda x: mapping[x])" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 6, 522 | "metadata": { 523 | "collapsed": true 524 | }, 525 | "outputs": [], 526 | "source": [ 527 | "text = df['tweet_text']" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 7, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/plain": [ 538 | "0 Warning: penny boards will make you a faggot\n", 539 | "1 Fuck dykes\n", 540 | "2 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...\n", 541 | "3 \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill...\n", 542 | "4 @Zhugstubble You heard me bitch but any way I'...\n", 543 | "5 @elaynay your a dirty terrorist and your relig...\n", 544 | "6 RT @ivanrabago_: @_WhitePonyJr_ looking like f...\n", 545 | "7 Well I thought you knew actually RT @KingHorse...\n", 546 | "8 @Stonisnipezz I know. 
It was a joke, faggot.\n", 547 | "9 I'm tired of people saying I look like my brot...\n", 548 | "Name: tweet_text, dtype: object" 549 | ] 550 | }, 551 | "execution_count": 7, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "text[:10]" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 8, 563 | "metadata": { 564 | "collapsed": true 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "def remove_handles(content):\n", 569 | " return ' '.join(re.sub(\"(@[A-Za-z0-9]+)\",\" \",content).split())" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 9, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/plain": [ 580 | "0 Warning: penny boards will make you a faggot\n", 581 | "1 Fuck dykes\n", 582 | "2 __ _chulo at least i dont look like jefree sta...\n", 583 | "3 \" : \" : Is a fag\" jackie jealous\" Neeeee\n", 584 | "4 You heard me bitch but any way I'm back th tex...\n", 585 | "5 your a dirty terrorist and your religion is a ...\n", 586 | "6 RT _: @_WhitePonyJr_ looking like faggots?\n", 587 | "7 Well I thought you knew actually RT : Man why ...\n", 588 | "8 I know. It was a joke, faggot.\n", 589 | "9 I'm tired of people saying I look like my brot...\n", 590 | "Name: tweet_text, dtype: object" 591 | ] 592 | }, 593 | "execution_count": 9, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "text.apply(remove_handles)[:10]" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 10, 605 | "metadata": { 606 | "collapsed": true 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "data = df[~df['_golden']].dropna(axis=1)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 11, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "data": { 620 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYoAAAEPCAYAAABcA4N7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XtYVNX+BvB3BmYYBhQhEK9HAUW7qBAokqSJmoUmpHA0\nDDUzhczMynshGWYXNbM8VmidJD2maJnpTysrLwmm5iVBTGU0FVTuCMNcmFm/PzzOcUJGUGYj+n56\neB5Ze89e3xmaeWff1pIJIQSIiIhqIG/oAoiI6PbGoCAiIpsYFEREZBODgoiIbGJQEBGRTQwKIiKy\nybGhC6hv+fmXG7oEIqJGx8urSY3LuEdBREQ2MSiIiMgmBgUREdnEoCAiIpsYFEREZBODgoiIbGJQ\nEBGRTQwKIiKyiUFBREQ2Ncid2UeOHMHzzz+P3bt3X3f5d999h/fffx+FhYUICQnBvHnz4OnpKXGV\n9rfs+9/w2R86u/ez75Xedu/jbuSy1AdOMNpt+wJA8cRzdtv+3e7xLeHQw37vv5i2sUjo8oLdti8l\nSfcohBBIS0vD2LFjYTRe/w2WnZ2NOXPmYNGiRcjIyICnpydmzpwpZZmS6L5wpyQhcbUvql/3LG0D\nZxjhANjtxxGA59I2cNieKN0TuwscK85C+JaH7BoSALDu7GqEb3nIrn1IRdKg+Pjjj7Fy5UrEx8fX\nuM6mTZvQr18/dOvWDSqVCq+++ip27dqFgoICCSu986z49XhDl3DHaLrUD3IAMgn6kgFwz/5Mgp7u\nHhPTx0na35qTqZL2Zw+SBsWwYcOwceNGdOnSpcZ1cnJy0KFDB8vv7u7ucHNzg0ajkaJESXyfeV7y\nPj/OuCh5n3cqBfQNXQI1Ip/9mdLQJdwySYOiefPmkMlsfw+rrKyESqWyanN2dkZlZaU9S5NUmK9H\nQ5dAt0A0dAHUqLR2btPQJdyy2+6qJ5VKBZ3O+thhZWUl1Gp1A1VU/9TOzpL3yRPa9ado9CHJwkIA\nMEnU192iW7MASftb/ggPPdU7Pz8/q8NMRUVFKC0thZ+fXwNWVf+k/OD+bmzNh/roJrh6oqjzGJhx\n5YPcnj9V4JVP9e39h/4FH7WvJH39u9d/4CBzkKQve7rtJi4aPHgwnn76aQwbNgxdunTBokWL0Lt3\nb7i7uzd0afWO3/IbL3O/ZBT2S27oMugmrXjky4YuoVG5LYIiMfHK5X9z587FvffeizfffBOzZ89G\nfn4+goODMX/+/AaukIjo7iUTQtxR5+Y4FSoRUd1xKlQiIrppDAoiIrKJQUFERDYxKIiIyCYGBRER\n2cSgICIimxgURERkE4OCiIhsYlAQEZFNDAoiIrKJQUFERDYxKIiIyCYGBRER2cSgICIimxgURERk\nE4OCiIhsYlAQEZFNDAoiIrJJ0qDIyspCdHQ0AgICEBkZiUOHDl13vS+++ALh4eEIDg7GpEmTUFBQ\nIGWZRER0DcmCQq/XIz4+HkOHDsW+ffsQFxeHhIQEVFRUWK23ZcsWLF26FAsXLkR6ejo6dOiAhIQE\nqcokIqK/kSwoMjIyIJfLERsbC4VCgejoaHh6emLHjh1W633//ff45z//icDAQCgUCkyaNAknT57E\n8ePHpSqViIiuIVlQaDQa+Pn5WbX5+PggJyfHqs1sNkOlUll+l8lkkMlkOHPmjCR1EhGRNcmCQqvV\nwtnZ2apNpVJBp9NZtYWHh2Pt2rXIzs6GwWDA0qVLodPpoNfrpSqViIiu4ShVR87OztVCQafTQa1W\nW7VFRUXh0qVLeP7552E0GhEdHQ0/Pz80bdpUqlKJiOgaku1R+Pr6QqPRWLVpNBp06NDBqu3SpUuI\niIjATz/9hF27duGZZ57BmTNncO+990pVKhERXUOyoAgNDYXBYEBqaiqMRiPS0tJQUFCAsLAwq/X2\n7NmDCRMmoKioCOXl5UhOTkavXr3QvHlzqUolIqJrSBYUSqUSKSkp2Lx5M3r06IEvv/wSy5Ytg1qt\nRmJiIhITEwEAkZGR6N27NyIiIhAeHg4hBN59912pyiQior+RCSFEQxdRn/LzLzd0CUREjY6XV5Ma\nl3EIDyIisolBQURENjEoiIjIJgYFERHZxKAgIiKbGBRERGQTg4KIiGxiUBARkU0MCiIisolBQURE\nNjEoiIjIJgYFERHZxKAgIiKbGBRERGQTg4KIiGxiUBARkU0MCiIisolBQURENjEoiIjIJkmDIisr\nC9HR0QgICEBkZCQOHTp03fXWrVuHfv36ISgoCCNGjMDRo0elLJOIiK4hWVDo9XrEx8dj6NCh2Ldv\nH+Li4pCQkICKigqr9bKzs7FgwQIsX74c+/btQ3h4OCZPnixVmURE9DeSBUVGRgbkcjliY2OhUCgQ\nHR0NT09P7Nixw2q9M2fOwGw2w2QyQQgBuVwOlUolVZlERPQ3jlJ1pNFo4OfnZ9Xm4+ODnJwcq7aw\nsDC0b98egwYNgoODA1xcXLBy5UqpyiQior+RbI9Cq9XC2dnZqk2lUkGn01m16fV6dOjQAWlpaTh4\n8CBGjx6NF154odp6REQkDcmCwtnZudqHvU6ng1qttmr76KOP0KJFC3Tp0gVOTk6YOHEijEYj9uzZ\nI1WpRER0DcmCwtfXFxqNxqpNo9GgQ4cOVm25ubkwGAyW32UyGRwcHODg4CBJnUREZE2yoAgNDYXB\nYEBqaiqMRiPS0tJQUFCAsLAwq/UeeeQRpKWlITMzE1VVVfj8889hMpkQFBQkValERHQNmRBCSNVZ\ndnY2kpKScPz4cbRr1w5JSUkICAhAYmIiAGDu3LkQQiAlJQVr1qxBWVkZ7r33Xrz++uvw9/evVR/5\n+Zft+RSIiO5IXl5NalwmaVBIgUFBRFR3toKiToeeCgsL8fHHH2P69OkoLCzEli1b8Oeff95ygURE\ndPuqdVBkZWVh4MCB+OWXX7B582ZotVrs2bMHMTExSE9Pt2eNRETUgGodFPPnz8fo0aOxZs0aKBQK\nAEBycjLi4uKwYMECuxVIREQNq9ZBkZmZiSFDhlRrHz58OE6dOlWvRRER0e2j1kHh5uaG3Nzcau2Z\nmZnw8PCo16KIiOj2UeugeOqpp5CYmIht27YBAI4fP45Vq1YhKSkJw4cPt1uBRETUsOp0eeyqVauw\nfPly5OXlAQA8PT0xbtw4jB49GjKZzG5F1gUvjyUiqrt6v49Cq9XCZDJBJpPB1dX1loqrbwwKIqK6\nq5f7KIqKihAfH48lS5ZArVajSZMmeOyxxzBx4kSUlpbWS6FERHT7qXVQJCUloby8HIMGDbK0rVix\nAmVlZZg3b55diiMiooZX60NPwcHB+Oqrr6pNPnT8+HGMGjUKe/futUuBdcVDT0REdVcvh56cnJxQ\nVFRUrf3vc14TEdGdpdZBERERgddeew27du1CcXExiouLsWfPHsyZMwePPfaYPWskIqIGVOs5s6dO\nnYqysjIkJCTAZDIBAORyOaKjozFjxgy7FUhERA2rzpfH
lpeXQ6PRQKFQoG3btnBxcbFXbTeF5yiI\niOrO1jmKWu9RAEBJSQmOHz+OqqoqCCFQUFBgWfb3meqIiOjOUOug2LBhA5KSkqzms75KJpPh2LFj\n9VoYERHdHmp96OmRRx5B//798dJLL912d2Nfi4eeiIjqrl4ujy0uLsaYMWNuKSSysrIQHR2NgIAA\nREZG4tChQ9XWSUxMRGBgoOUnICAAnTp1wqZNm266XyIiunm1DoqHHnoIe/bsuemO9Ho94uPjMXTo\nUOzbtw9xcXFISEiodh/G3LlzcfDgQcvPM888gx49evASXCKiBlLrcxT3338/5s2bh59++gk+Pj6W\nWe6uevnll20+PiMjA3K5HLGxsQCA6OhofPHFF9ixYwciIiKu+5ijR48iNTUVmzZtqtYfERFJo9ZB\nsXfvXnTt2hUVFRU4evSo1bLaDDGu0WiqDf/h4+ODnJycGh8zf/58jB8/Hi1btqxtmUREVM9qHRSp\nqam31JFWq4Wzs7NVm0qlgk6nu+76Bw4cwMmTJ/Hpp5/eUr9ERHRr6nQfRWFhIdatW4fTp09j6tSp\n2Lt3Lzp27IiOHTve8LHOzs7VQkGn00GtVl93/Q0bNmDIkCG33Q19RDfNpIfzH19AeXYHTK6tUNnt\nOZg8/Bu6KqIbqnVQZGVlYdSoUejQoQOOHj2KiRMn4tdff8XMmTPx8ccfIzQ01ObjfX198eWXX1q1\naTQaDB48+Lrr//zzz/joo49qW16jZxYCmkIt9p4pRoamCBcv6+GslOO+Fk0xMqgNHOQyyGQyGKtM\n2Jx1CfmXddBWmXFvc1dcNpigKdSiVVMnBLdthg5ertBXmdGiqRNUCoeGfmp3Jl0p3DY9DcWlg5Ym\nMxwggxkyCMvvclwZ7kYAkP335ypV1n9w9dp0GWT/Xet/TC4tcPnxFajy7ma3p0HWLmjzsOiPd3C8\nLBsKmSN6e/dFj+YPwbepL5o7e8NgMuBCZS6aO7fA+Ypz2HL2W1SaKhHs2R1/lf+F89qzCLgnCANb\nR8BRXqfv4be1Wt9HERcXhx49emDSpEkIDAzEt99+i7Zt22LBggVIT0/H+vXrbT7eYDCgX79+GD9+\nPEaMGIGNGzdi4cKF2L59e7W9irNnz+Lxxx/H77//DqVSWacn1Bjvo/jmSB4W/HwK+ipzvW5XrZBh\nUm8/RAe0qtft3vWEGZ4fd4DMXP3m03rvCkDx0I0wtQyye193k2J9Ef5z6kv8XrgfHZv6w9PJEz+c\n34ZL+os1PkYpc0KVMMIMM2SQQaDmj87ObvfhX72W26N0u6mXITwyMzORnJxcrX348OHV9hSuR6lU\nIiUlBUlJSVi0aBHatWuHZcuWQa1WIzExEcCVS2MB4Pz583Bzc6tzSDQmZiEQl3oAf+Zr7daH1ijw\nzvaT8PN0QWAbN7v1c7dRHVgqSUgAV/ZAmux8DSXD/0+S/u4GlVVajN4xAuVV5QCAnMsna/U4g9Bb\n/m0rJAAguzQLr++biaTgZDjIGv9efa2Dws3NDbm5uWjXrp1Ve2ZmJjw8PGq1jc6dO2PNmjXV2q8G\nxFU9e/bEr7/+WtvSGqUR/94HTdH1T+TXt5e/PoqfJ/WSpK+7gSprtaT9ORb8IWl/d7qvclZbQsKe\nfs3fgSe2DcCWx36ye1/2Vusb7p566ikkJiZi27ZtAK7MbLdq1SokJSVh+PDhdivwTmQWQrKQAIBy\ngwmGej6sdTcTDk4NXQLdgq9Pp0nWl86sw/qcryTrz15qvUcxfvx4uLi44O2330ZlZSVeeOEFeHp6\nIj4+HqNHj7ZnjXccc50Gdq8fB8+XIqSdu/Qd34GMHp2hLKnd4Qq6/VyuKpO0vy9OfoZhvo37y3Sd\nTsuPHDkSI0eOhFarhclkQpMmNZ/8oJo5ym98g2J9a4Au71jKv3Y0dAnUiJjNjX9v3mZQ7N69u9Yb\n4nwUt7fANs0auoQ7hkNV47uyjhpOM4fG/96zGRTjxo2z+l0mk0EIAWdnZzg6OuLy5ctwcHBA06ZN\nkZ6ebtdC6dYUluvh3VTV0GUQ3XXyjZcauoRbZjMosrOzLf/esGED1q5di3nz5lnGbDp79ixmzZqF\nvn372rfKO4ypAU5SGP47zzkRSasBTknWu1pf9bRw4UIkJSVZDezXtm1bzJ49G5988oldirtTOTTA\nCQMvV+5N1JcqjwcaugRqRLp6BDR0Cbes1kFRVVWF0tLSau0XL16Eg0Pjv6HkTqZ0AIfyqEe6+/4p\naX9mZeM/xn078VDU7r6v+vKkT7Sk/dlDrYNi6NChmD59OtLS0nD06FH88ccfWLVqFWbOnGmZY4Jq\nb/GT90vW16wBnSTr625glnggP73/EEn7u9O9GfSOZH21cm6Nnl62x8FrDGo91pPJZMKSJUuwbt06\nFBUVAQC8vLwwatQoPPfcc3Ytsi4a01hP54q1GPHFAehN9juKOej+5kh6rLPdtn9XEgJuX0dDmbfX\n7l2ZFa4oHr4VZrf2du/rbjJmRyz+qjht1z5aq9tgWa/P4Kq4+emjpWRrrKdaB8W1ioqKIJPJ4O5+\n+93A1ZiC4qrcUh0Ony+Fs0KOi5f12Jqdj8IKA1ydHNCtlRseu7c5urZqit/OlCAl/TTOlejQupkK\n3q5OOFmgRZHWgHKDyXKSXC4D2jRTIS64LaK6ctInuzBWwjkzFYrzGXC8dBhy7UXIAFQ5e0Hv+xic\nzmyHQ0U+hEwGyB2BKgNkqAIAyzixwskdZqUrYAZk+hLIqy5blpmVzaC/NxqVXZ6B2a1dzXXQTRFC\n4ONjH+L/zm6BzqSFCWYI1P5+B9l//1PJVagSJphEFWQyGeQyBzR38kJU+xgM/kcklA6NZ7y6mw6K\nr776Ck8++SSUSiW++sr2bei3yzAejTEo6oMQAqcKtWjq5IjmTTjEhNTk5bmQ6ctg8ugE2JrxUZjh\nUJgN4XwPzC7e1tuouAhZZSFM93QGZLU+KkxUL246KMLDw7F+/Xq4u7sjPDy85o3IZNi+ffutVVlP\n7tagICK6FTc9zPj48eMtQ33/9FPjHwGRiIjqzub+7fz581FSUgIAuPfeey0nsYmI6O5hc4+idevW\nmDRpEjp16gQhBJKTk+HkdP3j3/Pnz7dLgURE1LBs7lF8+OGH6NKlCwyGK7N56fX6Gn+IiOjOVKc5\ns5cuXYqmTZvau6ZbwpPZRER1V2/3UZhMJuTl5aGqqgp/f5iPj8/NV1iPGBRERHV301c9XWvnzp2Y\nNWsWCgsLLSFxddhxmUyGY8eO3XAbWVlZSExMxMmTJ9GuXTu88cYbCAioPmDW/v37MW/ePJw+fRpt\n2rTBrFmzEBra+G+DJyJqjGq9RzFw4EB06tQJEydOhKtr9VvSW7dubfPxer0eAwYMQHx8PGJiYrBx\n40YsXLgQP/74I1xcXCzrXbx4EYMHD0ZycjIeffRRbN68GUlJSdi9ezdUqhuPgMo9CiKiuquXPYq8\nvDwsX74cbdu
2vakiMjIyIJfLLQMIRkdH44svvsCOHTsQERFhWW/jxo146KGHMHDgQADA4MGD4ePj\nA7mcd6oSETWEWn/6duvWDZmZmTfdkUajsZrLArhyXiMnJ8eqLTMzE97e3pg4cSJCQkIwfPhwmEwm\ny41/REQkrVrvUQwcOBBz5szB/v370b59eygUCqvlNxrrSavVwtnZ2apNpVJBp9NZtZWWlmLnzp34\n8MMPsXjxYqxduxbjx4/Htm3b4ObmVttyiYiontQ6KD777DO4uLhcdygPmUx2w6BwdnauFgo6nQ5q\ntdqqTalUonfv3ggLCwMAjBw5EitWrMDvv//OKVeJiBpArYPiVsd68vX1xZdffmnVptFoMHjwYKs2\nHx8f/PXXX1ZtZrO52uW4REQkjTqdIRZC4Oeff0ZKSgo++eQT/PDDD7W+Kzs0NBQGgwGpqakwGo1I\nS0tDQUGBZc/hqsjISOzevRu//PILzGYzUlNTodfrERISUpdSiYiontT68ti8vDxMmDABZ8+ehY+P\nD0wmE86cOQNvb2+sXLkS3t7eN9xGdnY2kpKScPz4cbRr1w5JSUkICAhAYmIiAGDu3LkAgN27d2PB\nggU4c+YMfHx8MGfOHHTr1q1WT4iXxxIR1V293Jn9/PPPQ6/XY+HChWjW7Mpk70VFRXj11VfRpEkT\nfPDBB/VT7S1iUBAR1Z2toKj1oaf09HRMmzbNEhIA4OHhgWnTpuHXX3+9tQqJiOi2VeugcHV1rXbV\nEgBUVlbyZjgiojtYrT/hBwwYgDfeeAMnTpywtB0/fhxz585Fv3797FIcERE1vFqfoygvL8eLL76I\n9PR0y5hLOp0O/fr1w1tvvXXbDD/OcxRERHVXL2M9ubq6olevXnjwwQfh5+cHpVKJlStXIjg4+LYJ\nCSIiqn+1DooFCxZg48aNeOONNxAeHg4AuHDhAj755BOUl5fjhRdesFuRRETUcGp96CksLAyLFy9G\ncHCwVXt6ejpmzJiBHTt22KXAuuKhJyKiuquXy2O1Wu11B+Xz8vJCWVnZzVVGRES3vVoHRc+ePbFg\nwQKrUCgvL8eSJUvQvXt3uxRHREQNr05DeIwZMwYXL160TF507tw5tGnTBv/6179uekKj+sZDT0RE\ndVcvQ3gAgMFgwJ49e3Dq1CkoFAq0b98eYWFht9UNdwwKIqK6q7egaAwYFEREdVcvJ7OJiOjuxKAg\nIiKbGBRERGQTg4KIiGxiUBARkU0MCiIisknSoMjKykJ0dDQCAgIQGRmJQ4cOXXe9CRMmoGvXrggM\nDLT8EBFRw5AsKPR6PeLj4zF06FDs27cPcXFxSEhIQEVFRbV1s7KysGrVKhw8eNDyQ0REDUOyoMjI\nyIBcLkdsbCwUCgWio6Ph6elZbdTZwsJCFBUVwd/fX6rSiIjIBsmCQqPRwM/Pz6rNx8cHOTk5Vm1Z\nWVlwcXHBhAkT0LNnT4wYMYJ7FEREDUiyoNBqtXB2drZqU6lU0Ol0Vm16vR4BAQGYPXs2du7ciSFD\nhuC5555Dfn6+VKUSEdE1JAsKZ2fnaqGg0+mgVqut2vr3749PP/0UHTt2hFKpRGxsLFq2bIm9e/dK\nVSoREV1DsqDw9fWFRqOxatNoNOjQoYNV29atW7FlyxarNr1eDycnJ7vXSERE1UkWFKGhoTAYDEhN\nTYXRaERaWhoKCgoQFhZmtZ5Wq8W8efNw8uRJGI1GLF++HDqdDr169ZKqVCIiuoajVB0plUqkpKQg\nKSkJixYtQrt27bBs2TKo1WokJiYCAObOnYuhQ4ciPz8f48aNQ0lJCe677z6kpKRUO0RFRETS4HwU\nRETE+SiIiOjmMSiIiMgmBgUREdnEoCAiIpsYFEREZBODgoiIbGJQEBGRTQwKIiKyiUFBREQ2MSiI\niMgmBgUREdnEoCAiIpsYFEREZBODgoiIbGJQEBGRTQwKIiKyiUFBREQ2MSiIiMgmBgUREdkkaVBk\nZWUhOjoaAQEBiIyMxKFDh2yun56ejs6dO6OiokKiComI6O8kCwq9Xo/4+HgMHToU+/btQ1xcHBIS\nEmoMgdLSUsyaNQtCCKlKJCKi65AsKDIyMiCXyxEbGwuFQoHo6Gh4enpix44d110/KSkJERERUpVH\nREQ1kCwoNBoN/Pz8rNp8fHyQk5NTbd1vv/0WZWVleOqpp6Qqj4iIauAoVUdarRbOzs5WbSqVCjqd\nzqotNzcXH3zwAVavXg2j0ShVeUREVAPJ9iicnZ2rhYJOp4Narbb8bjabMX36dEyZMgXe3t5SlUZE\nRDZIFhS+vr7QaDRWbRqNBh06dLD8fuHCBRw+fBhJSUkIDg7GkCFDAAB9+vTB/v37pSqViIiuIdmh\np9DQUBgMBqSmpmLEiBHYuHEjCgoKEBYWZlmnVatWOHLkiOX3c+fOoV+/ftixYwdcXFykKpWIiK4h\n2R6FUqlESkoKNm/ejB49euDLL7/EsmXLoFarkZiYiMTERKlKISKiOpCJO+xGhfz8yw1dAhFRo+Pl\n1aTGZRzCg4iIbGJQEBGRTQwKIiKyiUFBREQ2MSiI6luVDorcDDiUVB+ehqgxkuw+CqpZbqkOaqUD\nzhRpsTnrIrQGE7q0bIqurZrAQ62EwSTQsqkTHB3+l+vnSyvRxMkRWoMJTo5yOMhluKyvwumiSvyQ\nfQkAMLBzc4T6eDTU07orKc7uQtPvn4dcVwwA0LfuhbLHPwGcmsExbz9c9r4L+eVzqPLqCm3ABAhX\nb5hdW1ke71CQBZmxAlXeDwJyh4Z6GncVk7kKFyov4B6VJ1QOKlRWaVGkL0ILdUs4yG78Nyg1lKDS\nVIkWzi0lqLZh8PLYBnS6SItZ32XhRL72huuqHOVo1dQJvp4u+DP/Mv4q1teqj9ZuKqx4KgD3uChv\ntVy6lkkP9d4FcNJsg1C5o6LrWDgWn4LL70shM1n/bcwyR5i87ofjpcOQXdMuAMgAGJt1gMxshPxy\nLuTC+N9lclR2Ho6K8HcBmQxkH7vyfsEHRxeiyFgItYMaXTy64WDBARiEAUq5EsPa/xPNVS1wn/v9\n6OjWCcX6IhwuPAh3Jw8IIfBVzir8VrAXAmZ0bNIJEf94Ajmlp6AzVyLgngcxoPVjcJQ3ju/jti6P\nZVA0oEGfZOBSucHu/fh7uWDVqCC793PXEALua/rBsejP/zUBsMfHudnBGYVjDwJKVzts/e51quwE\nXt8/HRd0F+zaTy/vh/Fm0Dt27aO+8D6K29DWYxclCQkA+DO/AlXmO+r7QINSnN9jFRKAfUICAOSm\nSrhtirPT1u9OBpMBU9JfsHtIAMCvF3fheMkxu/djbwyKBvLLiQJJ+3Pg0Yt645i3T9L+FBcOSNrf\nnW7LuU0oN0l35OFk2QnJ+rIXBkUDOVNc2dAl0M0SZok7lLq/O9u2s5sl7a+i6vrTPTcmDIoGonSQ\n9qU/drHxnLu53Zma+Uran3BQSdrfna5IXyhpf25KN0n7swcGRQOJuF/a
iZnkvHKm3pia/kPS/ip6\nTJW0vzvdP1zbS9aXQqZAL+/ekvVnLwyKBjKsa0uoFdK8/GqFHJ29a76igerG3MwHUl0aUOXcHLoH\nJ0jU291hSpdpkEvw0eckV2Fe8AK4Khr/FWu8PLYBFZTrMef/juPA2RKY/vtXUDrIYDDV359E5SjH\nkmFdENim8e/+3k6a/N94qHK21Lj86l9QVsPvV9v+vp93dT3h4IzLvefBcG80IOP3ufp2tvwvvHMk\nGcdKsiD+ew6ohXNLPNPxOZwo+xPpl35FoS4fAgIOMkdUmrQQ13w98FDeg85u90Jn0qHMWIJW6rY4\nW/EXLukuwNu5JUb4jkS/Vo9C1oj25HkfRSNzNK8M6w7mwslRjuZNlKgyCxhNAgGtm+JhP0/kleqw\n+sA5OCsd8LDvPTicWwYXpQP6+3uhicoRQggcyS1DpdGEB9s0g9KRHzT1zlwF50MpUGWvBXSlkBvK\nIeQOqLrnPlQGjIPRdyDkxafgsncBzE5uqOz+0pWb9H7/CI6Fx2H0DoT2gTFQ//EZZPrLMLTuCShd\nYfJ6QPJzIEQAg4KIiG6AN9wREdFNkzQosrKyEB0djYCAAERGRuLQoUPV1hFC4IMPPkBYWBgCAwMR\nFxeHEyf8X6uFAAATCklEQVQa/w0rRESNlWRBodfrER8fj6FDh2Lfvn2Ii4tDQkICKiqsb0ZJS0vD\ntm3bsH79ehw4cADBwcGYNm2aVGUSEdHfSBYUGRkZkMvliI2NhUKhQHR0NDw9PbFjxw6r9aKjo5GW\nlgZvb29otVpcvnwZ7u7uUpVJRER/I9n4txqNBn5+flZtPj4+yMmxntxFJpNBrVZjw4YNmDVrFlxd\nXfHZZ59JVSYREf2NZHsUWq0Wzs7OVm0qlQo6ne666w8ePBhHjhxBQkICxo0bh5KSEinKJCKiv5Es\nKJydnauFgk6ng1qtvu76SqUSSqUSzz77LFxdXfHbb79JUSYREf2NZEHh6+sLjUZj1abRaNChQwer\ntiVLluD999+3/C6EgMFgQJMmHIKCiKghSBYUoaGhMBgMSE1NhdFoRFpaGgoKChAWFma1Xrdu3fCf\n//wH2dnZMBgM+Oijj+Dq6ooHH3xQqlKJiOgakt6ZnZ2djaSkJBw/fhzt2rVDUlISAgICkJiYCACY\nO3cuAGDNmjVISUnB5cuXERgYiNdffx1t2rSRqkwiIrrGHTeEBxER1S8O4UFERDYxKIiIyCYGBRER\n2cSgICIimxgURERkE4OCiIhsYlAQEZFNDAo7O3v2bEOXQHRX43vw1jEobOjUqZPlrvFrhYeH4+ef\nf77h47dv344pU6bcVN//+te/EBQUhF69euHy5cuIjY1FQECA5e71+jZo0CDs3LnTLttuLNLT0zF6\n9Gg8+OCD6N69O0aOHIkff/zRsnz9+vUICQlB9+7dcf78eSQkJCAgIAAJCQl2qWfcuHH46quv7LLt\nxqJTp074888/q7WHhIRg7969N3z8rbwHbxd79+5FSEhIg9Yg2XwUjdXatWvRv39/9O7du86PLS0t\nhdlsvql+N2zYgJkzZyI6Ohr79+9HZmYm9uzZAxcXl5va3o1s3rzZLtttLDZt2oQ333wTU6dOxdKl\nS+Hk5IRffvkFiYmJOHfuHMaMGYNvv/0WsbGxmDx5MvLy8vDTTz/hxx9/RNu2be1S0/Lly+2y3bvJ\nrbwH6X+4R3EDMTExmDVrVo3zYRQUFOCVV15BSEgI+vTpg3fffRcGgwFHjhzBnDlzcOzYMfTq1eu6\nj/3uu+8QERGBoKAgjBgxAocPHwYADBw4EOfOncPcuXMxePBgjB07FjqdDmFhYTh48CBKSkowdepU\nhIaGIjw8HJ9++imujsQyY8YMJCcnIzY2FoGBgRg6dCgyMzMBAGVlZXj++efRo0cP9O3bF7Nnz4Ze\nrwfwv72k999/Hy+++KKlRiEEwsPDLTMRrl69Go8++ihCQkIwceJE5Ofn188L3YB0Oh2Sk5Mxd+5c\nxMTEwNXVFQqFAgMGDMCiRYuwYMECjBo1Cr/99htSUlIQHx+Pxx9/HAAwZMgQbNmyxbKNhx9+GGFh\nYXjnnXdgMBgAAB9++CFeffVVTJgwAYGBgYiIiMDu3bsBAAaDATNnzkRISAjCwsLw4osvori4GAAQ\nFxeHL7/8EmvXrsWwYcOsan766aexevVqAMD333+PwYMHIzg4GKNHj642SvOdLisrC2PGjEFYWBi6\ndeuGsWPHoqCg4LrvQVvvnb/797//jUceeQQhISEYOXIkjh49CuDK33PKlCkYNWoUAgICEBMTg2PH\njlket2/fPgwbNgzBwcGIiYnBkSNHLMtyc3MRHx+PkJAQPProo1i/fr1lWVlZGaZOnYrg4GCEhobi\n3XfftdQmhMDChQvx8MMPo2fPnlixYkW9v442CaqRv7+/yM7OFqNHjxaTJ0+2tPft21f89NNPQggh\nhg8fLl5++WVx+fJlceHCBTFs2DDx3nvvCSGEWL9+vXjyySevu+2dO3eKwMBA8dtvvwmj0SjWrVsn\ngoKCxKVLl6r1kZGRIXr06GF57HPPPSemTp0qKioqxNmzZ8WgQYNEWlqaEEKI6dOni+DgYHHs2DFR\nWVkpXnrpJTF27FghhBDvv/++eOGFF4RerxclJSUiMjJSrF271qq/kydPiq5du4ry8nIhhBD79u0T\nvXr1ElVVVWLLli2iT58+4s8//xQ6nU7Mnz9fjBw5st5e74ayZ88e8cADDwiDwXDd5X379hXr168X\nTz/9tEhNTRVCCHH27Fnh7+9veZ2SkpLEM888I4qKikRhYaF4+umnxQcffCCEEGLJkiXi/vvvF3v2\n7BF6vV68/fbb4tFHHxVCCLF27VoRExMjKioqhFarFc8++6xYvHixEEJY+istLRVdunQRZ86cEUII\nkZubK7p06SKKi4vF4cOHRVBQkNi/f78wGAzi888/FwMGDKjxuTQ2/v7+IjAwUAQFBVn9dOrUSWRk\nZAghhOjfv79YuXKlMJvNoqioSERHR4v3339fCFH9PWjrvXOt06dPi4CAAHHu3DlhNpvFkiVLRGxs\nrBDiyt+zU6dOYvPmzcJgMIgPP/xQPPLII0Kv14vz58+LwMBA8cMPPwij0Si2bNkievToIYqLi0VV\nVZV44oknxIIFC4RerxfHjh0TvXr1Eunp6UIIIV566SUxYcIEUVJSIgoKCkRERIRYs2aNyMjIEP7+\n/mLZsmWiqqpK/PLLL8Lf31/k5eXZ++W34B7FDchkMsyfPx+7d+/Gpk2brJb99ddfOHjwIGbPng1X\nV1d4e3tj8uTJ+Prrr2+43W+//RZRUVHo3r07HB0dER0dDT8/P6tj4teTn5+PnTt3YubMmVCr1WjT\npg2effZZrFu3zrJOeHg4OnfuDJVKhYiICJw+fRoA4OTkhMzMTGzevBlGoxEbNmxATEyM1fb9/PzQ\nsWNHbN++HcCVvZ5BgwbBwcE
[... base64-encoded PNG data truncated (histogram of the 'confidence' column) ...]\n", 664 | "text/plain": [ 665 | "" 666 | ] 667 | }, 668 | 
"metadata": {}, 669 | "output_type": "display_data" 670 | } 671 | ], 672 | "source": [ 673 | "data['confidence'].hist(bins=10);" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": { 680 | "collapsed": true 681 | }, 682 | "outputs": [], 683 | "source": [] 684 | } 685 | ], 686 | "metadata": { 687 | "kernelspec": { 688 | "display_name": "Python 3", 689 | "language": "python", 690 | "name": "python3" 691 | }, 692 | "language_info": { 693 | "codemirror_mode": { 694 | "name": "ipython", 695 | "version": 3 696 | }, 697 | "file_extension": ".py", 698 | "mimetype": "text/x-python", 699 | "name": "python", 700 | "nbconvert_exporter": "python", 701 | "pygments_lexer": "ipython3", 702 | "version": "3.6.7" 703 | } 704 | }, 705 | "nbformat": 4, 706 | "nbformat_minor": 2 707 | } 708 | -------------------------------------------------------------------------------- /notebooks/Data Exploration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook is an attempt to explore the dataset. This notebook needs to be expanded upon." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import re\n", 18 | "from textstat.textstat import textstat\n", 19 | "from textblob import TextBlob\n", 20 | "import seaborn as sns\n", 21 | "%matplotlib inline\n", 22 | "sns.set_style(\"dark\")\n", 23 | "sns.set_context(\"talk\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "df = pd.read_csv('../data/twitter-hate-speech.csv', encoding='latin-1')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "df.head()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "df.describe()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "data_path = '../data/twitter-hate-speech.csv'\n", 60 | "\n", 61 | "df = pd.read_csv(data_path, encoding='latin1')\n", 62 | "df = df.rename(columns={'does_this_tweet_contain_hate_speech': 'label', \n", 63 | " 'does_this_tweet_contain_hate_speech:confidence': 'confidence' })\n", 64 | "\n", 65 | "mapping = {'The tweet is not offensive': 'Not offensive', \n", 66 | " 'The tweet uses offensive language but not hate speech': 'Offensive',\n", 67 | " 'The tweet contains hate speech': 'Hate speech'\n", 68 | " }\n", 69 | "df['label'] = df['label'].map(lambda x: mapping[x])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "text = df['tweet_text']" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "text[:10]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "def remove_handles(content):\n", 97 | " return ' '.join(re.sub(\"(@[A-Za-z0-9]+)\",\" \",content).split())" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 
| "source": [ 106 | "text.apply(remove_handles)[:10]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "data = df[~df['_golden']].dropna(axis=1)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "sns.stripplot(x=\"label\", y=\"confidence\", data=data, size=6, jitter=True);" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "data['label'].value_counts()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "data['confidence'].hist(bins=10);" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.0" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /notebooks/LSTM with Keras and TensorFlow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook is an attempt to classify the dataset using a neural network with TensorFlow and Keras." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import seaborn as sns\n", 20 | "import numpy as np\n", 21 | "%matplotlib inline\n", 22 | "from sklearn.model_selection import train_test_split" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "data_path = 'twitter-hate-speech.csv'\n", 34 | "\n", 35 | "df = pd.read_csv(data_path, encoding='latin1')\n", 36 | "df = df.rename(columns={'does_this_tweet_contain_hate_speech': 'label', \n", 37 | " 'does_this_tweet_contain_hate_speech:confidence': 'confidence' })" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "
\n", 49 | "\n", 62 | "\n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
_unit_id_golden_unit_state_trusted_judgments_last_judgment_atlabelconfidence_created_atorig__goldenorig__last_judgment_atorig__trusted_judgmentsorig__unit_idorig__unit_state_updated_atorig_does_this_tweet_contain_hate_speechdoes_this_tweet_contain_hate_speech_golddoes_this_tweet_contain_hate_speech_gold_reasondoes_this_tweet_contain_hate_speechconfidencetweet_idtweet_text
0853718217Truegolden86NaNThe tweet uses offensive language but not hate...0.6013NaNTrueNaN0.0615561535.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.01.666196e+09Warning: penny boards will make you a faggot
1853718218Truegolden92NaNThe tweet contains hate speech0.7227NaNTrueNaN0.0615561723.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.04.295121e+08Fuck dykes
2853718219Truegolden86NaNThe tweet contains hate speech0.5229NaNTrueNaN0.0615562039.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.03.956238e+08@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3853718220Truegolden98NaNThe tweet contains hate speech0.5184NaNTrueNaN0.0615562068.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.04.975147e+08\"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill...
4853718221Truegolden88NaNThe tweet uses offensive language but not hate...0.5185NaNTrueNaN0.0615562488.0goldenNaNThe tweet contains hate speechThe tweet contains hate speech\\nThe tweet uses...NaN1.05.889236e+08@Zhugstubble You heard me bitch but any way I'...
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " _unit_id _golden _unit_state _trusted_judgments _last_judgment_at \\\n", 210 | "0 853718217 True golden 86 NaN \n", 211 | "1 853718218 True golden 92 NaN \n", 212 | "2 853718219 True golden 86 NaN \n", 213 | "3 853718220 True golden 98 NaN \n", 214 | "4 853718221 True golden 88 NaN \n", 215 | "\n", 216 | " label confidence _created_at \\\n", 217 | "0 The tweet uses offensive language but not hate... 0.6013 NaN \n", 218 | "1 The tweet contains hate speech 0.7227 NaN \n", 219 | "2 The tweet contains hate speech 0.5229 NaN \n", 220 | "3 The tweet contains hate speech 0.5184 NaN \n", 221 | "4 The tweet uses offensive language but not hate... 0.5185 NaN \n", 222 | "\n", 223 | " orig__golden orig__last_judgment_at orig__trusted_judgments \\\n", 224 | "0 True NaN 0.0 \n", 225 | "1 True NaN 0.0 \n", 226 | "2 True NaN 0.0 \n", 227 | "3 True NaN 0.0 \n", 228 | "4 True NaN 0.0 \n", 229 | "\n", 230 | " orig__unit_id orig__unit_state _updated_at \\\n", 231 | "0 615561535.0 golden NaN \n", 232 | "1 615561723.0 golden NaN \n", 233 | "2 615562039.0 golden NaN \n", 234 | "3 615562068.0 golden NaN \n", 235 | "4 615562488.0 golden NaN \n", 236 | "\n", 237 | " orig_does_this_tweet_contain_hate_speech \\\n", 238 | "0 The tweet contains hate speech \n", 239 | "1 The tweet contains hate speech \n", 240 | "2 The tweet contains hate speech \n", 241 | "3 The tweet contains hate speech \n", 242 | "4 The tweet contains hate speech \n", 243 | "\n", 244 | " does_this_tweet_contain_hate_speech_gold \\\n", 245 | "0 The tweet contains hate speech\\nThe tweet uses... \n", 246 | "1 The tweet contains hate speech\\nThe tweet uses... \n", 247 | "2 The tweet contains hate speech\\nThe tweet uses... \n", 248 | "3 The tweet contains hate speech\\nThe tweet uses... \n", 249 | "4 The tweet contains hate speech\\nThe tweet uses... \n", 250 | "\n", 251 | " does_this_tweet_contain_hate_speech_gold_reason \\\n", 252 | "0 NaN \n", 253 | "1 NaN \n", 254 | "2 NaN \n", 255 | "3 NaN \n", 256 | "4 NaN \n", 257 | "\n", 258 | " does_this_tweet_contain_hate_speechconfidence tweet_id \\\n", 259 | "0 1.0 1.666196e+09 \n", 260 | "1 1.0 4.295121e+08 \n", 261 | "2 1.0 3.956238e+08 \n", 262 | "3 1.0 4.975147e+08 \n", 263 | "4 1.0 5.889236e+08 \n", 264 | "\n", 265 | " tweet_text \n", 266 | "0 Warning: penny boards will make you a faggot \n", 267 | "1 Fuck dykes \n", 268 | "2 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon... \n", 269 | "3 \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill... \n", 270 | "4 @Zhugstubble You heard me bitch but any way I'... 
" 271 | ] 272 | }, 273 | "execution_count": 3, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "df.head()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 4, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "data = df[~df['_golden']].dropna(axis=1)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# Keras neural network with TensorFlow" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 5, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stderr", 307 | "output_type": "stream", 308 | "text": [ 309 | "Using TensorFlow backend.\n" 310 | ] 311 | } 312 | ], 313 | "source": [ 314 | "import os\n", 315 | "os.environ['KERAS_BACKEND'] = 'tensorflow'\n", 316 | "import keras" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 6, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "from keras.preprocessing import sequence\n", 328 | "from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer\n", 329 | "from keras.utils import to_categorical\n", 330 | "\n", 331 | "from keras.models import Sequential\n", 332 | "from keras.layers import Dense, Embedding\n", 333 | "from keras.layers import LSTM\n", 334 | "\n", 335 | "from sklearn.metrics import classification_report, confusion_matrix" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 7, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "X = data['tweet_text'].tolist()\n", 347 | "\n", 348 | "# Encode labels\n", 349 | "mapping = {l: i for i, l in enumerate(sorted(data['label'].unique()))}\n", 350 | "reverse_mapping = {i: l for l, i in mapping.items()}\n", 351 | "\n", 352 | "label_names = sorted(mapping.keys())\n", 353 | "y = data['label'].map(lambda x: mapping[x])\n", 354 | "y = to_categorical(y, num_classes=3)\n", 355 | "\n", 356 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 8, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "X_train shape: (11553, 30)\n", 369 | "X_test shape: (2889, 30)\n", 370 | "y_train shape: (11553, 3)\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "max_features = 4000\n", 376 | "maxlen = 30\n", 377 | "batch_size = 32\n", 378 | "\n", 379 | "tokenizer = Tokenizer(num_words=max_features)\n", 380 | "tokenizer.fit_on_texts(X_train)\n", 381 | "\n", 382 | "X_train = tokenizer.texts_to_sequences(X_train)\n", 383 | "X_test = tokenizer.texts_to_sequences(X_test)\n", 384 | "\n", 385 | "X_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n", 386 | "X_test = sequence.pad_sequences(X_test, maxlen=maxlen)\n", 387 | "\n", 388 | "print('X_train shape:', X_train.shape)\n", 389 | "print('X_test shape:', X_test.shape)\n", 390 | "print('y_train shape:', y_train.shape)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 9, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | "_________________________________________________________________\n", 403 | "Layer (type) Output Shape Param # \n", 404 | 
"=================================================================\n", 405 | "embedding_1 (Embedding) (None, None, 128) 512000 \n", 406 | "_________________________________________________________________\n", 407 | "lstm_1 (LSTM) (None, 128) 131584 \n", 408 | "_________________________________________________________________\n", 409 | "dense_1 (Dense) (None, 3) 387 \n", 410 | "=================================================================\n", 411 | "Total params: 643,971\n", 412 | "Trainable params: 643,971\n", 413 | "Non-trainable params: 0\n", 414 | "_________________________________________________________________\n" 415 | ] 416 | } 417 | ], 418 | "source": [ 419 | "model = Sequential()\n", 420 | "model.add(Embedding(max_features, 128))\n", 421 | "model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.2))\n", 422 | "model.add(Dense(3, activation='sigmoid'))\n", 423 | "\n", 424 | "model.compile(loss='categorical_crossentropy',\n", 425 | " optimizer='adam',\n", 426 | " metrics=['accuracy'])\n", 427 | "\n", 428 | "model.summary()" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 10, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "name": "stdout", 438 | "output_type": "stream", 439 | "text": [ 440 | "Train on 11553 samples, validate on 2889 samples\n", 441 | "Epoch 1/30\n", 442 | "11553/11553 [==============================] - 21s - loss: 0.6829 - acc: 0.7003 - val_loss: 0.5104 - val_acc: 0.7781\n", 443 | "Epoch 2/30\n", 444 | "11553/11553 [==============================] - 16s - loss: 0.4571 - acc: 0.7911 - val_loss: 0.4674 - val_acc: 0.7899\n", 445 | "Epoch 3/30\n", 446 | "11553/11553 [==============================] - 16s - loss: 0.3936 - acc: 0.8161 - val_loss: 0.4789 - val_acc: 0.7958\n", 447 | "Epoch 4/30\n", 448 | "11553/11553 [==============================] - 18s - loss: 0.3581 - acc: 0.8389 - val_loss: 0.4977 - val_acc: 0.7972\n", 449 | "Epoch 5/30\n", 450 | "11553/11553 [==============================] - 16s - loss: 0.3291 - acc: 0.8511 - val_loss: 0.5127 - val_acc: 0.7930\n", 451 | "Epoch 6/30\n", 452 | "11553/11553 [==============================] - 16s - loss: 0.3007 - acc: 0.8638 - val_loss: 0.5515 - val_acc: 0.7709\n", 453 | "Epoch 7/30\n", 454 | "11553/11553 [==============================] - 16s - loss: 0.2771 - acc: 0.8764 - val_loss: 0.5817 - val_acc: 0.7736\n", 455 | "Epoch 8/30\n", 456 | "11553/11553 [==============================] - 16s - loss: 0.2511 - acc: 0.8906 - val_loss: 0.6567 - val_acc: 0.7729\n", 457 | "Epoch 9/30\n", 458 | "11553/11553 [==============================] - 16s - loss: 0.2614 - acc: 0.8883 - val_loss: 0.6234 - val_acc: 0.7799\n", 459 | "Epoch 10/30\n", 460 | "11553/11553 [==============================] - 16s - loss: 0.2228 - acc: 0.9014 - val_loss: 0.7142 - val_acc: 0.7844\n", 461 | "Epoch 11/30\n", 462 | "11553/11553 [==============================] - 17s - loss: 0.2087 - acc: 0.9069 - val_loss: 0.6736 - val_acc: 0.7785\n", 463 | "Epoch 12/30\n", 464 | "11553/11553 [==============================] - 17s - loss: 0.1918 - acc: 0.9152 - val_loss: 0.7782 - val_acc: 0.7639\n", 465 | "Epoch 13/30\n", 466 | "11553/11553 [==============================] - 18s - loss: 0.1789 - acc: 0.9192 - val_loss: 0.7574 - val_acc: 0.7667\n", 467 | "Epoch 14/30\n", 468 | "11553/11553 [==============================] - 18s - loss: 0.1711 - acc: 0.9228 - val_loss: 0.8155 - val_acc: 0.7632\n", 469 | "Epoch 15/30\n", 470 | "11553/11553 [==============================] - 18s - loss: 0.1614 - acc: 0.9251 - val_loss: 0.8418 - val_acc: 
0.7601\n", 471 | "Epoch 16/30\n", 472 | "11553/11553 [==============================] - 19s - loss: 0.1442 - acc: 0.9321 - val_loss: 0.8810 - val_acc: 0.7629\n", 473 | "Epoch 17/30\n", 474 | "11553/11553 [==============================] - 17s - loss: 0.1373 - acc: 0.9355 - val_loss: 1.0003 - val_acc: 0.7615\n", 475 | "Epoch 18/30\n", 476 | "11553/11553 [==============================] - 16s - loss: 0.1371 - acc: 0.9340 - val_loss: 0.9598 - val_acc: 0.7670\n", 477 | "Epoch 19/30\n", 478 | "11553/11553 [==============================] - 17s - loss: 0.1331 - acc: 0.9385 - val_loss: 0.9785 - val_acc: 0.7529\n", 479 | "Epoch 20/30\n", 480 | "11553/11553 [==============================] - 16s - loss: 0.1376 - acc: 0.9364 - val_loss: 1.0256 - val_acc: 0.7560\n", 481 | "Epoch 21/30\n", 482 | "11553/11553 [==============================] - 16s - loss: 0.1156 - acc: 0.9444 - val_loss: 1.0967 - val_acc: 0.7522\n", 483 | "Epoch 22/30\n", 484 | "11553/11553 [==============================] - 16s - loss: 0.1088 - acc: 0.9449 - val_loss: 1.1062 - val_acc: 0.7511\n", 485 | "Epoch 23/30\n", 486 | "11553/11553 [==============================] - 17s - loss: 0.1119 - acc: 0.9464 - val_loss: 1.0966 - val_acc: 0.7529\n", 487 | "Epoch 24/30\n", 488 | "11553/11553 [==============================] - 17s - loss: 0.1080 - acc: 0.9467 - val_loss: 1.1104 - val_acc: 0.7632\n", 489 | "Epoch 25/30\n", 490 | "11553/11553 [==============================] - 16s - loss: 0.1006 - acc: 0.9479 - val_loss: 1.1826 - val_acc: 0.7612\n", 491 | "Epoch 26/30\n", 492 | "11553/11553 [==============================] - 16s - loss: 0.0972 - acc: 0.9510 - val_loss: 1.1623 - val_acc: 0.7629\n", 493 | "Epoch 27/30\n", 494 | "11553/11553 [==============================] - 16s - loss: 0.0968 - acc: 0.9522 - val_loss: 1.2643 - val_acc: 0.7584\n", 495 | "Epoch 28/30\n", 496 | "11553/11553 [==============================] - 15s - loss: 0.0969 - acc: 0.9523 - val_loss: 1.2078 - val_acc: 0.7549\n", 497 | "Epoch 29/30\n", 498 | "11553/11553 [==============================] - 15s - loss: 0.0956 - acc: 0.9537 - val_loss: 1.2662 - val_acc: 0.7525\n", 499 | "Epoch 30/30\n", 500 | "11553/11553 [==============================] - 16s - loss: 0.0915 - acc: 0.9555 - val_loss: 1.3384 - val_acc: 0.7522\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "model.fit(X_train, y_train,\n", 506 | " batch_size=batch_size,\n", 507 | " epochs=30,\n", 508 | " validation_data=(X_test, y_test));" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 11, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "2880/2889 [============================>.] 
- ETA: 0s\n", 521 | "Test score: 1.33836409898 Test accuracy: 0.752163378373\n", 522 | "\n", 523 | " precision recall f1-score support\n", 524 | "\n", 525 | " 0 0.45 0.43 0.44 476\n", 526 | " 1 0.91 0.91 0.91 1451\n", 527 | " 2 0.67 0.68 0.67 962\n", 528 | "\n", 529 | "avg / total 0.75 0.75 0.75 2889\n", 530 | "\n" 531 | ] 532 | }, 533 | { 534 | "data": { 535 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnMAAAIICAYAAAAFVWgMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XecFdX5x/HPdxdUelEgqChF1AgKxt57N4q9txjQJGKL\nxhpb7F1jLGDvvXfFKERUUERRFBv6s6AoCDZQWJ7fH3NWLuvusndZuNy73zevee29Z2bOPDN32Pvs\nOWdmFBGYmZmZWXEqK3QAZmZmZlZ/TubMzMzMipiTOTMzM7Mi5mTOzMzMrIg5mTMzMzMrYk7mzMzM\nzIqYkzkzMzOzIuZkzszMzKyIOZkzMzMzK2JNCh2AmVml+HCUH0kzn6lzj0KHUPJmffRGoUNoFMp6\nb6h5reNQta7z75yr47t53t784pY5MzMzsyLmljkzMzNrlJpooW1sy4uTOTMzM2uUSqV70smcmZmZ\nNUplpdEw52TOzMzMGie3zJmZmZkVsTKPmTMzMzMrXqXSMlcq+2FmZmaWlyaq+zQ3kq6XNFHSWzll\nF0h6V9Kbkh6Q1DaVd5U0TdLoNF2ds85qksZI+kDS5dLcmw+dzJmZmVmjJKnOUx3cCGxdpewZoHdE\nrAK8B5yQM+/DiOibpkNzyq8C+gM901S1zt9wMmdmZmaNUlke09xExFBgcpWypyNiZnr7MrB0bXVI\n6gy0joiXIyKAm4F+ddkPMzMzs0anTHWfGsCfgCdy3ndLXawvSNoglS0FfJazzGeprFa+AMLMzMwa\npXxatCQNAAbkFA2KiEF1XPckYCZwWyqaACwTEZMkrQY8KKlXHuHMwcmcmZmZNUr5PM4rJW51St5y\nSToQ2B7YLHWdEhE/Az+n169J+hBYHvicObtil05ltXI3q5mZmTVK87ubVdLWwD+AHSLip5zyDpLK\n0+vuZBc6fBQRE4DvJK2drmLdH3hobttxy5yZmZk1Sg3ZoiXpDmBjYAlJnwGnkl29uijwTLoi9uV0\n5eqGwBmSZgCzgEMjovLiib+SXRnbjGyMXe44u2o5mTMzM7NGqYyGewJEROxVTfF1NSx7H3BfDfNe\nBXrns20nc2ZmZtYoNdBVqgXnZM7MzMwapVK5cMDJnJmZmTVK+VzNujBzMmdmZmaNkrtZzczMzIqY\nu1nNzMzMiphb5szMzMyKWEPemqSQnMyZmZlZo+SWOTMzM7MiVu5kzszMzKx4uZvVzMzMrIi5m9XM\nzMysiPnWJGZmZmZFrEQa5pzMmZmZWeNU7sd5mZmZmRWv0kjlnMyZmdVowteTOO6iK5n07VQk2H3r\nzdi/3zZM+f4Hjj7nMj6f+A1LdVyCS044gjatWvLZV1+z3SF/p9vSSwLQZ4XlOH3gnwu8F8Vlwpdf\n8Y9/nsakSZOzY77LThyw954A3HLHXdx2972Ul5Wx0Qbr8Y8jDy9wtMVjwjeTOf7y65k09TsAdt9i\nQ/bffvNf59/w8NOcf9M9DL/hYtq1bsXnE79huyNOoduSnQDos3x3Tjtkv4LEPj85mbOiJmlxYEh6\n+zugAvga6Ap8EREr1bPejYFfImJ4A4RZXf1dgXUj4vb5UX/aRl9gyYh4fC7LrQ7sHxHz9I0i6Xng\nmIh4tY7Ld2U+H4OGIuljYPWI+KbQsdRHeXkZx/15X3ot140ffprGLoefyLp/WJkHnnmBtfv2ZsDu\nOzLo7ocYfM/DHPOnvQFYpnMnHrzi3AJHXrzKy8s5/ugj6PX7Ffnhxx/ZZe/9WW+tNflm8mSGPD+U\nh++6jUUWWYRJkycXOtSiUl5exj8O3I1e3Zflx2nT2eXYf7Fun5VYrsuSTPhmMi+OfpvOS7SfY50u\nnTrwwEWnFijiBUMl0s1aKhdyWJ4iYlJE9I2IvsDVwCXpdV9g1jxUvTGwbgOEWJOuwN7zsX7IjsG2\nc1soIl6d10Sunroy/4+BAR3bt6PXct0AaNm8GT2WWYqvvpnMkJdfo9/mGwLQb/MNefalOuXhVgcd\nOyxBr9+vCEDLFi3o3q0bX339NXfccx8DDjqARRZZBIDF27evrRqromO7tvTqviwALZotRo+lO/PV\n5CkAnHvDXRyz/64lk9jkQ3lMCzMnc1adckmDJb0t6WlJzQAk9ZD0pKTXJA2TtGLuSqnF6FDgKEmj\nJW0kabwybSVVSNowLTtUUk9JLSRdL2mEpNcl7Zjml0u6QNJISW9KOiRt5lxgg1T/UVUDl3ScpDGS\n3pB0birrK+nlVM8Dktql8uclnZe2/Z6kDSQtApwB7JG2sYekNSW9lOIbLmmFtP7Gkh5Nr09L+/G8\npI8kHZ7KW0h6LMXzlqQ9ajjmu+XGUXk803EelabKJHmOY1DLsco9LtXGIeljSeenYzZC0nKpvIOk\n+1KdIyWtl1NPTZ/XhanuNyUNzNn8wBT/mKrnTDH57KuveefDj+mz4nJMmjKVju3bAdChXVsmTZk6\ne7kvv6bfYcez7z9O59W33i1UuCXhsy++4J1x4+jTuxcff/J/vPr6aHbb7yD2PfgQ3nx7bKHDK1qf\nT/yGd8Z/Sp+e3RgyYjSd2rdjxa5dql1up7+fzn7/vIBXx75XgEjnv7I8poWZu1mtOj2BvSKiv6S7\ngV2AW4FBwKER8b6ktYArgU0rV4qIjyVdDfwQERcCSBoHrAR0A0aRJSGvAF1SPWcDz0XEnyS1BUZI\nehbYB5gaEWtIWhR4UdLTwPFkXZLbVw1a0jbAjsBaEfGTpMo/3W8GBkbEC5LOAE4FjkzzmkTEmpK2\nBU6NiM0lnULWNXhYqrc1sEFEzJS0OXB2OiZVrQhsArQCxkm6CtiarNt6u1RXmxqO+RxxAJsDE4Et\nImK6pJ7AHcDqVY+BpAHVHauIGJ9Tf21xTI2IlSXtD1wKbA9cRtZa+z9JywBPAb8HTqrh89qfrMWw\nbzpOuc0m30TEHyT9FTgGmGMQWYp/AMDVZ57EgD13ruEQFc6P06Zz+FmXcMKA/WnZvPkc8yT92qLR\nsX1bnrvp37Rr3Yq33v+Iw/51EY9efcFv1rG5+/Gnnzj8mOM58ZijadmyJRUVFUydOpW7b76eMW+P\n5ch/nMCQRx9slK1J8+LHadM5/IKrOP6gPSgvL2PQ/Y9z7T+P/M1yHdq1Y
cg159GuVUve/vATDjvv\nPzxy6em0bN6sAFHPP2Ulcv44mbPqjI+I0en1a0BXSS3Juk/vyfnluWgd6hoGbEiWzJ0D9AdeAEam\n+VsCO0g6Jr1fDFgmla8iaddU3oYsyfyllm1tDtwQET8BRMTklLS0jYgX0jI3AffkrHN/7n7WUG8b\n4KaUUAXQtIblHouIn4GfJU0EOgFjgIsknQc8GhHDali3ujiaAlcoG8NXASxfw7o1HavcZK62OO7I\n+XlJer05sFLOZ906nQM1fV6bA1dHxEzIjn0N+/abTC0iBpH9oUB8OCpq2MeCmTFzJoefdQl/3Hg9\ntlxvTQAWb9uGiZO/pWP7dkyc/C3t27QGYJGmTVmkaXZ69O7ZnS6dOzH+swmsvHyPgsVfjGbMmMnh\nxxzHH7fZii032wSATp06ssVmmyCJVXr3oqysjG+/nUL71EJqczdj5kyOuOAq/rjBWmy59h9475PP\n+Oyrb+j39zMA+GrSt+xy7Jncde6JdGjX5tdzuVePZenyuw58/MVX9F6uawH3oOGVRirnZM6q93PO\n6wqgGVkr85Q0ri4fQ4G/AEsCpwDHko2rq0wmBOwSEeNyV1KWRQyMiKeqlG+c5/bnpnJfK6j5/8O/\ngP9GxE7KupKfn0tdv9YXEe9J+gPZGLwzJQ2JiDPqGMdRwFdAH7LjP72G7VZ7rHLNJY7cBKrydRmw\ndkTMsc30uVT3edW06Zr2rShEBCdfOogeXZbkoJ23+7V807VX48FnhzJg9x158NmhbLb2agBMnvod\nbVq2pLy8jE8nfMUnX3xJl86dChV+UYoITjr9X3Tv1o2D9tvn1/LNN96IV0a+xtprrM74Tz5hxowZ\ntGvXtoCRFpeI4OQrb6L70p05cIctAVh+2aV58YaLf11ms0OP597zT6Jd61ZMnvo9bVq2yM7lL7/m\nkwkTWbpTh0KFP984mbNGJSK+Uzb+bbeIuCd9qa8SEW9UWfR7oHXO+xHALcBHqbtwNHAIWVceZN13\nAyUNjIiQtGpEvJ7K/yLpuYiYIWl54PNUf6sawnwGOEXSbZXdrKl17ltJG6TWqP3IWgZrU3UbbdK2\nAQ6cy7pzkLQkMDkibpU0hSpdjHPRBvgsImZJOgAoryG+ao9VRPxYxzj2IBuHtwfwUip7GhgIXJDW\n75taa2v6vJ4BDpH038pu1iqtc0Vp1NhxPPTcMJbv2oV+hx0PwFEH7EH/3XbgqHMu476nn2fJdGsS\ngJFj3uHft95DkyZNKJM47bCDaduqZSF3oei8NvoNHnrsCZbvuRw77pElc0cf9ld26bcDJ572L7bf\ndU+aNm3KuWec6i7WPIx69wMefuFlll9mKXb6++kAHLn3zmy02srVLv/q2Pe4/M6HaNqkHKmM0wbs\nS9tWLRZkyAuEn81qjdE+wFWSTibrArwTqJrMPQLcmwbGD4yIYZI+BV5O84cBe5F1+0HW6nUp8Kak\nMrKuwe2Ba8m6G0elxPFroB/wJlAh6Q3gxoio7BYkIp5MXZKvSvoFeBw4ETgAuFpSc+Aj4KC57Od/\ngeNT4nkOcD5ZN+vJwGN1O1S/Whm4QNIsYAZZK2VdXQncl8ayPQlUJmdzHAOy8W1d+e2xqmsc7SS9\nSdaCtlcqOxz4TypvQtbCeii1f17Lp/IZwGDgijz2daG0Wq8VeffxO6qdd+M5J/+mbKv112Kr9dea\n32GVtNVX7cu410dUO+/Cs6pr1La6WO33PXnnvsG1LjPk6tm31NlyndXYcp3V5ndYBacSaZtTxEI3\nRMXMFhAtZPeBWxjHzJUadfb4vflt1kdV/8a1+aGs94bznIk9ucSSdf6ds/U3Xyy0mZ9b5szMzKxR\ncjermRW9iOha6BjMzAqlrES6WZ3MmZmZWaNUGqmckzkzMzNrpErlgmgnc2ZmZtYolUgu52TOzMzM\nGqfyEmmaczJnZmZmjVJppHJO5szMzKyRKpVkrqzQAZiZmZkVgvL4N9e6pOslTZT0Vk5Ze0nPSHo/\n/WyXM+8ESR9IGidpq5zy1SSNSfMuVx2eW+dkzszMzBolqe5THdwIbF2l7HhgSET0BIak90haCdgT\n6JXWuVJS5fO3rwL6Az3TVLXO33AyZ2ZmZo1SWR7T3ETEUGByleIdgZvS65uY/dzsHYE7I+LniBgP\nfACsKakz0DoiXo7seas389tnbf+Gx8yZmZlZo1Q2/69m7RQRE9LrL4FO6fVSwMs5y32Wymak11XL\na+WWOTMzM2uUlM8kDZD0as40IJ9tpZa2aMj4K7llzszMzBqlfNrlImIQMCjPTXwlqXNETEhdqBNT\n+edAl5zllk5ln6fXVctr5ZY5MzMza5Qk1Xmqp4eBA9LrA4CHcsr3lLSopG5kFzqMSF2y30laO13F\nun/OOjVyy5yZmZk1SmUNOGRO0h3AxsASkj4DTgXOBe6WdDDwCbA7QES8LeluYCwwE/hbRFSkqv5K\ndmVsM+CJNNXKyZyZmZk1SmXlDZfNRcReNczarIblzwLOqqb8VaB3Ptt2MmdmZmaNUok8mtXJnJmZ\nmTVO8zAWbqHiZM7MzMwapRLJ5ZzMmZmZWePkljkzMzOzIlYiuZyTOTMzM2ucyhvy3iQF5GTOzMzM\nGiV3s5qZmZkVMZXIc7CczJmZmVmj5JY5MzMzsyJWIrmckzkzMzNrnNwyZ2ZmZlbEfDWrmZmZWREr\nkYY5J3NmtvBQx2ULHULJO7RFl0KHUPKunjyu0CFYHbmb1czMzKyIlUgu52TOzMzMGicnc2ZmZmZF\nrKy8NLI5J3NmZmbWKJWVSNOckzkzMzNrlEokl3MyZ2ZmZo2Tr2Y1MzMzK2Ilkss5mTMzM7PGyS1z\nZmZmZkWszI/zMjMzMyteKit0BA3DyZyZmZk1Su5mNTMzMytm7mY1MzMzK2JumTMzMzMrXu5mNTMz\nMytm5aVxBYSTOTMzM2uU5DFzZmZmZkXM3axmZmZmxcstc2ZmZmbFrERa5kpj5J+ZmZlZnlReVudp\nrnVJK0ganTN9J+lISadJ+jynfNucdU6Q9IGkcZK2qu9+uGXOzMzMGqcG7GaNiHFAXwBJ5cDnwAPA\nQcAlEXFh7vKSVgL2BHoBSwLPSlo+Iiry3bZb5szMzKxxkuo+5Wcz4MOI+KSWZXYE7oyInyNiPPAB\nsGZ9dsPJnJmZmTVKKqv7lKc9gTty3g+U9Kak6yW1S2VLAZ/mLPNZKsubkzkzMzNrnPJomZM0QNKr\nOdOA6qvUIsAOwD2p6CqgO1kX7ATgoobeDY+ZMzMzs0Ypn1uTRMQgYFAdFt0GGBURX6X1vvp1e9Jg\n4NH09nOgS856S6eyvLllzszMzBqn8rK6T3W3FzldrJI658zbCXgrvX4Y2FPSopK6AT2BEfXZDbfM\nmZmZWaOkBr7PnKQWwBbAITnF50vqCwTwceW8iHhb0t3AWGAm8Lf6XMkKTubMzMyssWrgJ0BExI/A\n4lXK9qtl+bOAs+Z1u07mzMzM
rHEqkSdAOJkzMzOzRqmhu1kLxcmcmZmZNU4N3M1aKE7mzMzqaNM/\n7kyL5s0pKy+nvLyc+2+5nieefY4rBl3Hh+M/5p6brmXllX5f6DCLwn7X/YeVt9+a7yd+zb9WXhuA\nP55xMn123JaYNYvvJ37DTQceytQJX9KifXsG3Hszy67xB16+8XbuHHjMr/WsvueubHPi34kIpn7x\nJdfv+2d+nDS5ULtVNG684x7uefgxJFi+R3fOOfk4Lh10Pf/933CaNmnKMksvyTknH0frVq0KHep8\nVZdnrhaD0tgLKxqSFs952PCXOQ8fniJp7DzUu7GkdRsy1ir1d5W0dw3zlpR073zcdr/0DL981ukg\n6RVJr0vaQNJukt6R9N8GjOtQSfs3VH3F4qZrruCh22/i/luuB7Ivwn+ffzZrrNq3wJEVl5duvI1/\nb73zHGXPXHAZZ/ZZl7NWXZ8xjz7JdqccB8CM6dN5+J9nct8xJ8+xfFl5Obtfdh4Xb7IdZ/ZZl8/f\nfItNDjsEq91XE7/m5rvv474bruHR22+kYtYsHnvmOdZbc3Ueve0GHrnterp26cI1N91e6FDnv/n3\nOK8FysmcLVARMSki+kZEX+BqsocP9yW7M/aseah6Y2C+JXNAV6DaZC4ivoiIXefjtvsBeSVzZM8F\nHBMRq0bEMOBgoH9EbNJQQUXE1RFxc0PVV6x6dOtK967LFjqMovPBsOH8NPnbOcqmf//9r68XadGc\niADgl59+4sMXX2bm9OlzVpLdlZ9FW7QAYLHWrZjyxYT5G3iJqKioYPrPPzNz5kymT59Oxw5LsP5a\na9CkSdZh17f3Snw58esCRzn/qUx1nhZmTuZsYVIuabCktyU9LakZgKQekp6U9JqkYZJWzF1JUlfg\nUOCo1Mq3kaTxyrSVVCFpw7TsUEk9JbVIz8gbkVqvdkzzyyVdIGlkeo5e5Z/55wIbpPqPqrp9SW+l\n171SnaPT+j2r7qSkHySdJekNSS9L6pRTz3NpvSGSlkmtjTsAF6Q6e1Sz7arr9AXOB3ZM65wKrA9c\nl/at2n1MrZvPS7pX0ruSblMaHSzpXElj0/IXprLTJB0jaUVJI6rENCa9Xk3SC+mze6rKzTOLj8RB\nfz2Cnfc9iLvuf7DQ0ZSkHc/8J2f/31jW3Gd3Hjml9js2zJo5kzv+chT/HPMS533xHp1XWpEXr2v0\nf1/MVaeOHfjTPnuwSb/dWX/7XWjZoiXrr7XGHMvc98jjbLhOvZ75XlzcMmfW4HoC/4mIXsAUYJdU\nPggYGBGrAccAV+auFBEfk9PKFxEvAOPIWrPWB0aRJWKLAl0i4n3gJOC5iFgT2IQsWWpB1oI1NSLW\nANYA+qc7cx8PDEv1X1LLPhwKXJZaG1cne3ByVS2AlyOiDzAU6J/K/w3cFBGrALcBl0fEcLK7hB+b\ntv1hlbqqW2c0cApwV1rndOBVYJ+IOLaWfQRYFTgyHbvuwHqSFie7a3mvtJ0zqxz/d4FFcurYA7hL\nUtMU367ps7ueBrifUiHdce3VPHT7TQy+/CJuu+d+Ro56vdAhlZyHTv4XJy6zEiNuu5uN59JlWtak\nCRv+5WDOWnUDjltyeT5/8222PuHvCyjS4jX1u+8ZMvRFhtx/J8MevY9p06fx0BNP/zr/qhtuobxJ\nOTtsvUUBo1xAylT3aSHmZM4WJuNTIgLwGtBVUkuy7tN7JI0GrgHq0rozDNgwTeeQJXVrACPT/C2B\n41OdzwOLAcuk8v1T+StkN3/8TetaLV4CTpR0HLBsREyrZplfmP1svtfIunAB1gEqB6nckmKem/qs\nU9s+joiIzyJiFjA6xTYVmE7Wsrcz8FM1dd5NlsSRft4FrAD0Bp5J2zqZ7NmDc1DOw6sH3XBTHcIv\nnE4dOwCwePv2bLHxhrz59jsFjqh0jbjtblbdZYdal+nSdxUAvvloPACv3n0/PdZda77HVuyGj3yN\npZfsTPt2bWnapAlbbrwhr495G4D7H32C5198iQtPP7lkbttRG5WX1XlamPlqVluY/JzzugJoRvYH\nx5TU0pWPocBfgCXJWqmOJRtXNyzNF7BLRIzLXSl1Kw6MiKeqlG9cl41GxO2SXgG2Ax6XdEhEPFdl\nsRlRORgo288F/f+wtn2s+hk0iYiZktYkG4e3K3AYsGmVOu8iS7jvByIi3pe0MvB2RKxTWzBzPLz6\n+0lR27KF9NO0acyaNYuWLVrw07RpvPjKCP765z8VOqyS0nG5Hkz8IGt87rPjdnz17nu1Lj/l8y/o\nvNKKtFxicX74ZhK/32JTJrwzrtZ1DJbs1JE33hrLtOnTWWzRRXnp1VH0XnEFhr70Ctfeeie3XnUZ\nzRZbrNBhLhglkrA6mbOFWkR8l8a/7RYR96Rka5WIeKPKot8DrXPejyBrqfooIqanlqFDgO3T/KeA\ngZIGRkRIWjUiXk/lf5H0XETMkLQ88Hmqf67X6EvqnrZ5uaRlgFWAqslcTYYDe6a492F24lnbtmta\npzY17WO1Uuto84h4XNKLwEdVl4mIDyVVAP8kS+wg6+ruIGmdiHgpdbsuHxFv1yHGhc6kSZP527En\nANng8e232oIN112bZ/77Av+64GImfzuFQ448ht8v35Prrri0wNEu/A6+/XqW33h9Wi6xOOd8+g6P\nnHo2vbfdkk4r9CRmzWLyJ59y+6FH/rr8WePHsFjr1pQv0pQ+/bbj8i37MeGdcTx6+rn8feiTVMyY\nweRPPuWmA/9SwL0qDn16r8RWm27ETgf0p0l5Ob9fvid79Nue7fY+kF9+mcFBh//91+XOOK7Eu62d\nzJktMPsAV0k6GWgK3AlUTeYeAe5NFzIMjIhhkj4FXk7zhwF7AWPS+38BlwJvSioDxpMleteSdS2O\nSonj12RXk74JVEh6A7ixlnFzuwP7SZoBfAmcncd+DgRukHRs2u5BqfxOYLCkw8nGn31Yh3VqU9M+\n1qQV8JCkxcha9Y6uYbm7gAuAbgAR8YukXYHLJbUh+31zKVCUyVyXpZfi4Tt+O7h+i002YotNNipA\nRMXtur1/26o5/Ppbalz+pG4rV1s+7JrrGXbN9Q0WV2NxeP+DOLz/nL8unrm3EdyKpKoSSeY0u7fH\nzKzAFuJu1lJxaOtuc1/I5snVk93Vu0C06zzPmdjMo3aq8++cJpc8sNBmfm6ZMzMzs8apRFrmnMyZ\nmZlZ41S2cF+lWldO5szMzKxxcjJnZmZmVsTczWpmZmZWxJzMmZmZmRUxJ3NmZmZmRay8vNARNAgn\nc2ZmZtY4uWXOzMzMrIg5mTMzMzMrXvKtSczMzMyKmFvmzMzMzIqYkzkzMzOzIuarWc3MzMyKmFvm\nzMzMzIqYkzkzMzOzIuZkzszMzKyI+dYkZmZmZkXMLXNmZmZmRcxXs5qZmZkVsRJpmSuNzmIzMzOz\nfEl1n+pUnT6WNEbSaEmvprL2kp6R9H762S5n+RMkfSBpnKSt6rsbTubMzMyscWrgZC7ZJ
CL6RsTq\n6f3xwJCI6AkMSe+RtBKwJ9AL2Bq4UlK9+n2dzJmZmVnjVFZW96n+dgRuSq9vAvrllN8ZET9HxHjg\nA2DNeu3GvERnZmZmVrQaPpkL4FlJr0kakMo6RcSE9PpLoFN6vRTwac66n6WyvPkCCDMzM2ucVPc2\nrZScDcgpGhQRg6ostn5EfC6pI/CMpHdzZ0ZESIp6x1sDJ3NmZmbWOJXVfSxcStyqJm9Vl/k8/Zwo\n6QGybtOvJHWOiAmSOgMT0+KfA11yVl86leXN3axmZmbWOKms7tPcqpJaSGpV+RrYEngLeBg4IC12\nAPBQev0wsKekRSV1A3oCI+qzG26ZMzMzs8apYe8z1wl4QFmdTYDbI+JJSSOBuyUdDHwC7A4QEW9L\nuhsYC8wE/hYRFfXZsJM5MzMza5wa8NmsEfER0Kea8knAZjWscxZw1rxu28mcmZmZNU5lfpyXmZmZ\nWfEqkcd5OZkzs4VGTPyk0CGUvKsmvlXoEErezDP+WugQGoUmlzww75U0YDdrITmZMzMzs8bJLXNm\nZmZmRSyPmwYvzJzMmZmZWeOUx02DF2ZO5szMzKxx8tWsZmZmZkXM3axmZmZmRczdrGZmZmZFzFez\nmpmZmRUxd7OamZmZFbFyXwBhZmZmVrzczWpmZmZWxNzNamZmZlbEfDWrmZmZWRFzy5yZmZlZEfOY\nOTMzM7Mi5qtZzczMzIqYu1nNzMzMipi7Wc3MzMyKWJlb5szMzMyKl1vmzMzMzIqYx8yZmZmZFbEy\nX81qZmZmVrz8BAgzMzOzIuZuVjMzM7Mi5gsgzMzMzIqYW+bMzMzMipd8AYSZmZlZEXPLnJmZmVkR\n89WsZmZmZkXMLXNmZmZmRaxErmYtjZTUzMzMLF8qq/s0t6qkLpL+K2mspLclHZHKT5P0uaTRado2\nZ50TJH0gaZykreq7G26ZMzOrwYSvJ3HcRVcy6dupSLD71puxf79tmPL9Dxx9zmV8PvEbluq4BJec\ncARtWrXW2z9WAAAgAElEQVT8db0vJn7D9ocew9/22ZWDd9m+gHtQHE48+0KeH/4Ki7dryyO3DAZg\nynffcfQpZ/H5l1+y1O9+xyVnnEyb1q2YMXMmJ597MWPfe5+Kigp23HoLDtlvrwLvQZFYrDlle/4N\n/W4ZACruuIKyFfuitbeAH78DYNZjtxLvjAJAm+1M2VqbQ8xi1v3XEuNGFyz0+aZhr2adCfw9IkZJ\nagW8JumZNO+SiLgwd2FJKwF7Ar2AJYFnJS0fERX5brjWVFPS4jmZ5Jc5meUUSWPz3VhOvRtLWre+\n69eh/q6S9p5f9c8rSYdLekfSbZIWlfRsOq57NOA2hjdQPT80RD2FlM63R/Ncp1/6j9YQ26/xfKxn\nbAdKWrIhYpufUpxXFDqOeVFeXsZxf96Xx665kDsv/he3Pfo0H/zfZwy++yHW7tubp669hLX79mbw\nPQ/Psd65g29hg9X7Fijq4rPTtlsy+KKz5ygbfOtdrL3aqjx1502svdqqDL71TgCefG4oM2bM4JGb\nB3PfdVdy10OP8dmELwsRdtEp2/nPxDuvU3HuQCouOAq++hSAWS88QsWFR1Nx4dG/JnJ0WpqyVden\n4rzDqbjmDMp2PaRkxpfNoUx1n+YiIiZExKj0+nvgHWCpWlbZEbgzIn6OiPHAB8Ca9dqNuQQ2KSL6\nRkRf4GqyzLIv0BeYVZ8NJhsD8y2ZA7oCC20yB/wV2CIi9gFWBUjH+a6G2kBEzM/j2xj0AxokmaPh\nz8cDyf6Ks/msY/t29FquGwAtmzejxzJL8dU3kxny8mv023xDAPptviHPvvTqr+s8O3wkS/+uI8st\ns3RBYi5Ga/RdhTatW81RNmTYcPptswUA/bbZgmeHZX+fSvDTtOnMnFnB9J9/oWmTJrRs0XyBx1x0\nFmuOuq9EvPJs9r5iJkz/qcbF1XtNZr3+v2y5yROJbybAMj0XULALUAN2s85RrdSV7Pv9lVQ0UNKb\nkq6X1C6VLQV8mrPaZ9Se/NVoXtLsckmDU7/w05KaAUjqIelJSa9JGiZpxdyV0g4eChyVWqM2kjRe\nmbaSKiRtmJYdKqmnpBbpAIyQ9LqkHdP8ckkXSBqZDtIhaTPnAhuk+o+qsv05WkIkXSHpwPT63NTX\n/aakC1NZB0n3pW2MlLReKt8op9Xy9dSkSpVtHS3prTQdmcquBroDT0g6DrgVWCPV00PSapJeSMfv\nKUmd03rPSzovHYP3JG2QynulstEp7p6p/If0805J2+XEdKOkXWs5dtWS1FLSEEmjJI3J+Qy6plbG\n6s6FNVLdo9O23krlc7TYSHpU0sbp9VWSXk11nZ6zzLaS3k3H5fLKz7Cmc6MarSU9pmxcwtVS9j9T\nOS2P6bjcqKzVeAfggsrPpcqxuDHFMFzSR5J2TeWq3M90jCpbWms8H5OWku5N+3eblI3IlXRK+nze\nkjQo1b8rsDpwW6qvWU3nTJWYd0v1vCFpaM7n8FA6t96XdGrO8vvmnFfXSCpP5VtKeimdB/dIapnz\nWQ9P9Y/Q7P8PSyr7ffC+pPNr+GyKwmdffc07H35MnxWXY9KUqXRsn/0+7tCuLZOmTAXgx2nTGXzv\nI/xt710KGWpJmPTtt3RcYnEAOizenknffgvAVptsSPNmi7FBvz3YdJd9+NNeu9G2detChloc2neE\nH76jbK+BlP/9Isr2+CsssigAZRtsS/mxl1C252HQrAUAarM4TJk0e/0pk1Db9oWIfP6S6jxJGpC+\nnyqnAdVXqZbAfcCREfEdcBXZ935fYAJwUUPvxrwkcz2B/0REL2AKUPnbaxAwMCJWA44BrsxdKSI+\nJqeVLyJeAMaRtYKsD4wi++JbFOgSEe8DJwHPRcSawCZkX7ItgIOBqRGxBrAG0F9SN+B4YFiq/5K6\n7IykxYGdgF4RsQpwZpp1WYp1jbSP16byY4C/pZbKDYBpVepbDTgIWAtYO8W2akQcCnwBbBIR5wF/\nrowV+D/g38Cu6fhdD5yVU22TdAyOBCq/eA8FLkvrr06W2ee6C9g9xbQIsBnwWC3HribTgZ0i4g9k\nn8FFlUkHNZ8LNwCHpNjqOgbgpIhYHVgF2EjSKpIWA64BtknHpUPu8lR/blS1JjCQ7DzrAexcUwAR\nMRx4GDg2nUMfVrNYZ7LzdXuyZI1UZ1+gD7B5iqUzcz8fVyX7TFci+w+/Xiq/IiLWiIjeQDNg+4i4\nF3gV2Ccd15nUfs5UOgXYKiL6kCWqucdlF7LjvZuk1SX9HtgDWC/ns9tH0hLAycDm6Tx4FTg6nVd3\nAUek+jdn9v+HvqmulYE9JHWpGljuL8hBd95fTeiF9+O06Rx+1iWcMGB/WjafsxVI2S95AK647V4O\n7LcNLZotVogwS5YkRHaMx4x9l7KyMoY+eCfP3nMzN9x5L59+PqHAERaB8nJYujuzXnySiov+Dr/8\nTNlmO2fvz/wLFRceDd99S9mOBxU60gUrj5a5
iBgUEavnTIN+U53UlCyRuy0i7geIiK8ioiIiZgGD\nmd2V+jmQ+ztx6VSWt3m5AGJ8RFSOhnwN6Jqy0XWBe2Z/z7NoHeoaBmwIdAPOAfoDLwAj0/wtgR0k\nHZPeLwYsk8pXqWwZAdqQJRa/1GN/ppIlLNelVp/K1rvNgZVy9qd12s8XgYsl3QbcHxFVk6j1gQci\n4kcASfeTJX2v1xLDCkBv4Jm0vXKyLL5S5Tfda2RddwAvASdJWjrF8X6VOp8ALkvJ8dbA0IiYJqmm\nYze+htgEnK2s1XQWWVNwpzSvunOhLdAqIl5K5beTJT5zs3v6a6cJWcK0EtkfHR+lMQUAdwCVfxHV\ndG68U6XeERHxEYCkO8g+n3vrEE9NHkz/McdKqjwO6wN3pMGrX0l6gSxR/m4udY2oPH8kjSb7bP8H\nbCLpH0BzoD3wNvBIlXXnds5UehG4UdLdzD6PAJ6JiElp2/enfZgJrAaMTHU2AyaS/VGyEvBiKl+E\n7PxbAZgQESMB0l+ilQnOkIiYmt6PBZZlzm4F0i/EQQDx4aiYy7Fa4GbMnMnhZ13CHzdejy3Xy34H\nL962DRMnf0vH9u2YOPlb2rfJWobeHPcBT/3vFS64/na+//EnyiQWXaQp+/6x3hepNVqLt2vHxG8m\n0XGJxZn4zSTat2sLwKPPPMcGa61O0yZNWLxdO/6wci/eevc9uiz1mwZpyzVlEkydBP+XfUXMemM4\nZZvtDD9M/XWRWS89TXn/kwGIqZOg7eKz12+7ODFl8gINeYFowAsgUgPHdcA7EXFxTnnniKj8vbwT\n8FZ6/TBwu6SLyYbO9ARG1Gfb85LM/ZzzuoLsF34ZMCX9NZ+PocBfyHbmFOBYsnF1w9J8AbtExLjc\nldKBGxgRT1Up37iWbc1kzhbJxQAiYqakNclarnYFDgM2TcuuHRHTq9RzrqTHgG3Jvty2ioh367S3\nNRPwdkSsU8P8ymNeQfrsIuJ2Sa8A2wGPSzokIp6rXCEipkt6HtiKrIXkzpxt/ebY1WIfshax1SJi\nhqSPSceO6s+F2lT7GaSWwWOANSLiW0k35myjJtWeG9WomiRENeX5NKfk7vO83qio6vFrklojrwRW\nj4hPJZ1WQ3xzO2cAiIhDJa1Fdp68llqOofrjIuCmiDhhjg1JfyRL/vaqUr5yPvtWW5wLm4jg5EsH\n0aPLkhy086+jFdh07dV48NmhDNh9Rx58diibrZ0dztsuOO3XZf596700b7aYE7l62nT9dXjwiWcY\nsN+ePPjEM2y2QTYMuHOnjrw8ajQ7br0FP02bxhtj3+GA3WtsaLdK30+BKd9AhyXh6y8o67kKfPkZ\ntG4H32Vd2FplbWLCJwDE2yMp3/coKp5/GNq0Rx06/5oIlpSyBr2oYz1gP2BM+sMc4ERgL0l9yX6/\nfgwcAhARb6c/sMeSfS/+rT5XskID32cu/UU+XtJu8OsYoj7VLPo9kDvGbARZi96slDSNJtvZoWn+\nU2SDByvHEq2aU/6X1KyJpOVTF1vV+nN9QtbStmhqPdosrdsSaBMRjwNHkXWVATxN1j1HWq5v+tkj\nIsakrtKRwBxjA8kS0X6SmqeYdmJ2clqTcUAHSeukbTSV1Ku2FSR1J2u1uhx4iKy7rKq7yLp8NwCe\nTGU1HbuatAEmpkRuE7IWlhpFxBTg+5RAQHb5daWPgb6SylK3W2WTc2vgR2Bqau3aJpWPA7orG28J\nWVJaqaZzo6o1JXVTNlZuD7KWL8ha0H6fynfKWb62c6gmw8i6EssldSBrbR5Rz7oqE7dv0rm5a868\n3PrqdM6k8/WViDgF+JrZTftbSGqvbJxjP7IWvCHArpI6pnXbS1oWeBlYT9JyqbyFpOVTDJ0lrZHK\nW0kqqqStJqPGjuOh54bx8htv0++w4+l32PG8MPJ1+u+2A8NfH8NWfz6Kl0a/Rf/daxqqaXVx9Kln\nsdehRzD+/z5lo5324t5Hn6D/vnsy/NXX2GrPA3jp1VH03zf7b7/3zjvy00/T2H7fP7Nb/8PYedut\nWGG57gXeg+JQcd9gyvc7ivJjL4GlujHr2Xsp++P+lB97KeXHXoKW682sh27IFv7yU2aNHk758f+m\n/JBTmHXvYIh5ue5x4VQ5TKIu09xExP8iQhGxSuXFoxHxeETsFxErp/IdclrpiIizIqJHRKwQEU/U\ndz/mxy/cfYCrJJ0MNCVrCXqjyjKPAPcqG6w+MCKGSfqU7MsCsi/FvYAx6f2/gEuBN9OX7niyLrtr\nybqkRqUv86/JvpDeBCokvQHcmDtOKbVy3E3WzDme2d2erYCHUouIgKNT+eHAfyS9SXa8hpKNUzsy\nJTWzyLq/5vgQIrvPzI3MbjK9NiJq62IlIn5J3Z6XS2qTtndpqr8muwP7SZoBfAmcXc0yTwO3AA9F\nRGUXdE3Hria3AY9IGkM2VqourZAHA4MlzSLrNq9sz3+R7NiPJesOrbyU+w1Jr6e6P03LkbqF/wo8\nKelHZne/Q83nRlUjgSuA5YD/Ag+k8uPJutS/TvtVebOwO1Psh5ONR6tu3FxVDwDrkJ3vAfwjIr6U\nNIkazseaRMQUSYPJztMvq+zzjcDVkqal7dXlnLlA2cUxIkvW3iAbzzaCbHzH0sCtEfEqQPr/+3Q6\npjPI/mJ8WdnFQnco67YHODki3lN2sce/U1I4jWx4QtFbrdeKvPv4HdXOu/Gck2tdd+C+u9Y632a7\n+PSTqi2/8bILflPWonkzLjvzlPkdUmn64mMqLj52jqJZt11W4+Lx7L1UPDsvo1GKQIncbkURC90Q\nFSsRklpGROVVtccDnSPiiHmpKyWe/wHer+vFLVa9lJitHhGHFTqWSgvjmLmS03qJQkdQ8irOrtev\nOctTk0semOdnccX7I+v8O0c911hon/1VEl0httDaTtIJZOfZJ2T3R6uv/pIOIBt0/zrZ1a1mZmb1\nVyItc07mbL6J7CbIDXIj5NQK55a4BhQRN5J12ZqZNU7lDfo4r4JxMmdmZmaNUx0ubCgGTubMzMys\ncXI3q5mZmVkRc8ucmZmZWTFzMmdmZmZWvNwyZ2ZmZlbEPGbOzMzMrIi5Zc7MzMysiJVGLudkzszM\nzBqr0sjmnMyZmZlZ4+RuVjMzM7Mi5gsgzMzMzIqYW+bMzMzMipmTOTMzM7Pi5ZY5MzMzsyLmZM7M\nzMysmDmZMzMzMytaKvPVrGZmZmZFzC1zZmZmZsXLY+bMzMzMipiTOTMzM7Ni5mTOzMzMrHi5Zc7M\nzMysiDmZMzMzMytiTubMzMzMipiTOTMzM7Ni5mTOzMzMrHi5Zc7MzMysiMmP8zIzMzMrXiXSMqeI\nKHQMZmZFS9KAiBhU6DhKmY/xguHjXLxKo33RzKxwBhQ6gEbAx3jB8HEuUk7mzMzMzIqYkzkzMzOz\nIuZkzsxs3niM0fznY7xg+DgXKV8AYWZmZlbE3DJnZmZmVsSczJmZmZkVMd802MzMrBGStBSwLDm5\nQEQ
kvclWZpkxyRfT/J3\nSV7ROpdGYzEnSVL/HFdV64AXAncBTwLe1DSRRmYxJ0lS/8w1QL4A+GJVrW0ZRuOxm1WSpP65KMmt\nwAPAHyXZC/hV40wakQ0QkiT1UJI9gLVV9VCS3YClVfXT1rk0PGfmJEnqmST/et738199Ztun0bgs\n5iRJ6p/D5n2/C/Ac4Hos5qaSy6ySJPVckmXAn1XV81pn0fDsZpUkSb8ElrcOodG4zCpJUs8kuRCY\nW5pbBBwEfKFdIo3DZVZJknomyT+b97gJuLuq/rpVHo3HYk6SJGmKuWdOkiRpilnMSZIkTTGLOUmS\neibJGQsZ03SwmJMkqX9O2crYqds6hCbDo0kkSeqJJC8DTgaWJ/nqvFdLgPvapNK4LOYkSeqPa4B7\ngD2BD8wbXw/c1CSRxubRJJIk9VCS/YCVVXV5kl2BHapqfetcGp575iRJ6pkkrwb+O/DxwdA+wFfa\nJdI4LOYkSeqf1wNHAesAqup2YO+miTQyizlJkvpnQ1U9OPeQZAcevqtVU8ZiTpKk/rkiyVuAXZMc\nC3wRuLBxJo3IBghJknomySLgNOA4IMBlwCfLomAqWcxJktRDgw7WfavqttZZNB6XWSVJ6pkkq4Eb\ngUsHz4dscYiwpojFnCRJ/fMO4HDgfoCquhFY3jSRRmYxJ0lS/2ysqrVbjLnvakp5nZckSf1zS5KT\ngcVJVgKn0131pSnkzJwkSf3zBuApwAbgfGAt8MamiTQyu1klSeqZJCuq6ketc2gyLOYkSeqZJFfQ\n3cf6beAq4MqqurltKo3KYk6SpB5KshNwGHAM8Fpg96rao2kojcQGCEmSeibJKuBZg88y4CK6GTpN\nIWfmJEnqmSSbgDXAWcDFVfVg40gag8WcJEk9k2QZcBRwNN1S62bg2qp6e9NgGonLrJIk9UxV3Z/k\nDuCJdI0QRwI7tk2lUTkzJ0lSzwwKuVvp9sldBVznUuv0spiTJKlnkhxdVVduMXZUVX2zVSaNzmJO\nkqSeSXJ9VR36aGOaDu6ZkySpJ5IcQbc/bq8kZ857tRRY3CaVxmUxJ0lSf+wE7E7393/JvPF1wIlN\nEmlsLrNKktQzSfarqrtb59BkWMxJkiRNsUWtA0iSJGl0FnOSJElTzGJOkqSeSXJgkq8n+d7g+eAk\nb2udS6OxmJMkqX8+AbwZ2AhQVTcBf9g0kUZmMSdJUv/sVlXXbTG2qUkSjc1iTpKk/vl5khVAASQ5\nEbinbSSNyqNJJEnqmSQHAOfQ3QbxC+BO4BVVdVfLXBqNxZwkST2V5DHAoqpa3zqLRmcxJ0lSzyTZ\nGTgB2J95V3tW1btaZdLovJtVkqT+uQBYC6wBNjTOojE5MydJUs8k+V5VPbV1Dk2G3aySJPXPNUme\n1jqEJsOZOUmSemJw48Nmum1WK4E76JZZA1RVHdwwnkbknjlJkvrjCcAhrUNosizmJEnqjzur6u7W\nITRZFnOSJPXH3knO/HUvq+qD2zKMJsNiTpKk/lgM7E63R04zwgYISZJ6Isn1VXVo6xyaLI8mkSSp\nP5yRm0HOzEmS1BNJ9qiq+1rn0GRZzEmSJE0xl1klSZKmmMWcJEnSFLOYkyRJmmIWc5IkSVPMYk6S\nJGmK/V8fJXbq2GmtTwAAAABJRU5ErkJggg==\n", 536 | "text/plain": [ 537 | "" 538 | ] 539 | }, 540 | "metadata": {}, 541 | "output_type": "display_data" 542 | } 543 | ], 544 | "source": [ 545 | "score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)\n", 546 | "print('\\nTest score:', score, 'Test accuracy:', acc)\n", 547 | "\n", 548 | "pred_probas = model.predict(X_test)\n", 549 | "pred_labels = np.argmax(pred_probas, axis=1)\n", 550 | "true_labels = np.argmax(y_test, axis=1)\n", 551 | "print('\\n', classification_report(true_labels, pred_labels))\n", 552 | "\n", 553 | "cnf = confusion_matrix(true_labels, pred_labels)\n", 554 | "sns.heatmap(cnf, annot=True, fmt='d', xticklabels=label_names, yticklabels=label_names, cmap=\"Reds\");" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": { 561 | "collapsed": true 562 | }, 563 | "outputs": [], 564 | "source": [] 565 | } 566 | ], 567 | "metadata": { 568 | "kernelspec": { 569 | "display_name": "Python 3", 570 | "language": "python", 571 | "name": "python3" 572 | }, 573 | "language_info": { 574 | "codemirror_mode": { 575 | "name": "ipython", 576 | "version": 3 577 | }, 578 | "file_extension": ".py", 579 | "mimetype": "text/x-python", 580 | "name": "python", 581 | "nbconvert_exporter": "python", 582 | "pygments_lexer": "ipython3", 583 | "version": "3.6.2" 584 | } 585 | }, 586 | "nbformat": 4, 587 | "nbformat_minor": 2 588 | } 589 | -------------------------------------------------------------------------------- /notebooks/best_multiclass_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook contains different models that attempt to classify hate speech from Twitter. 
It was built as part of this research: https://arxiv.org/pdf/1703.04009.pdf " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import sys\n", 19 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 20 | "import nltk\n", 21 | "from nltk.stem.porter import *\n", 22 | "import string\n", 23 | "import re\n", 24 | "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS\n", 25 | "from textstat.textstat import *\n", 26 | "from sklearn.linear_model import LogisticRegression\n", 27 | "from sklearn.feature_selection import SelectFromModel\n", 28 | "from sklearn.metrics import classification_report\n", 29 | "from sklearn.svm import LinearSVC\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "import seaborn\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "nltk.download('averaged_perceptron_tagger')\n", 42 | "nltk.download('stopwords')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df = pd.read_csv('../data/twitter-hate-speech2.csv', encoding='latin-1')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df.head()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "scrolled": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "df.describe()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "df['class'].hist()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit\n", 90 | "X=df['tweet']\n", 91 | "y=df['class']\n", 92 | "\n", 93 | "sss = StratifiedShuffleSplit(n_splits=1, test_size=.15, random_state=0) #TODO: Coordinate random seed between notebooks\n", 94 | "train_index, test_index = next(sss.split(X,y))\n", 95 | "\n", 96 | "X_train = X.iloc[train_index]\n", 97 | "X_test = X.iloc[test_index]\n", 98 | "y_train = y.iloc[train_index]\n", 99 | "y_test = y.iloc[test_index]" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Feature Engineering" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "stopwords=stopwords = nltk.corpus.stopwords.words(\"english\")\n", 116 | "\n", 117 | "other_exclusions = [\"#ff\", \"ff\", \"rt\"]\n", 118 | "stopwords.extend(other_exclusions)\n", 119 | "\n", 120 | "stemmer = PorterStemmer()\n", 121 | "\n", 122 | "def preprocess(text_string):\n", 123 | " \"\"\"\n", 124 | " Accepts a text string and replaces:\n", 125 | " 1) urls with URLHERE\n", 126 | " 2) lots of whitespace with one instance\n", 127 | " 3) mentions with MENTIONHERE\n", 128 | "\n", 129 | " This allows us to get standardized counts of urls and mentions\n", 130 | " Without caring about specific people mentioned\n", 131 | " \"\"\"\n", 132 | " space_pattern = '\\s+'\n", 133 | " giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'\n", 134 | " 
'[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n", 135 | " mention_regex = '@[\\w\\-]+'\n", 136 | " parsed_text = re.sub(space_pattern, ' ', text_string)\n", 137 | " parsed_text = re.sub(giant_url_regex, '', parsed_text)\n", 138 | " parsed_text = re.sub(mention_regex, '', parsed_text)\n", 139 | " return parsed_text\n", 140 | "\n", 141 | "def tokenize(tweet):\n", 142 | " \"\"\"Removes punctuation & excess whitespace, sets to lowercase,\n", 143 | " and stems tweets. Returns a list of stemmed tokens.\"\"\"\n", 144 | " tweet = \" \".join(re.split(\"[^a-zA-Z]*\", tweet.lower())).strip()\n", 145 | " tokens = [stemmer.stem(t) for t in tweet.split()]\n", 146 | " return tokens\n", 147 | "\n", 148 | "def basic_tokenize(tweet):\n", 149 | " \"\"\"Same as tokenize but without the stemming\"\"\"\n", 150 | " tweet = \" \".join(re.split(\"[^a-zA-Z.,!?]*\", tweet.lower())).strip()\n", 151 | " return tweet.split()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 161 | "class PosTfidfVectorizer(BaseEstimator, TransformerMixin):\n", 162 | " \"\"\"Get POS tags for tweets and transform via tfidf\"\"\"\n", 163 | " \n", 164 | " def __init__(self):\n", 165 | " self._pos_vectorizer = TfidfVectorizer(\n", 166 | " tokenizer=None,\n", 167 | " lowercase=False,\n", 168 | " preprocessor=None,\n", 169 | " ngram_range=(1, 3),\n", 170 | " stop_words=None,\n", 171 | " use_idf=False,\n", 172 | " smooth_idf=False,\n", 173 | " norm=None,\n", 174 | " decode_error='replace',\n", 175 | " max_features=5000,\n", 176 | " min_df=5,\n", 177 | " max_df=0.75,\n", 178 | " ) \n", 179 | " \n", 180 | " def _preprocess(self, X):\n", 181 | " tweet_tags = []\n", 182 | " for t in X:\n", 183 | " tokens = basic_tokenize(preprocess(t))\n", 184 | " tags = nltk.pos_tag(tokens)\n", 185 | " tag_list = [x[1] for x in tags]\n", 186 | " tag_str = \" \".join(tag_list)\n", 187 | " tweet_tags.append(tag_str)\n", 188 | " return tweet_tags\n", 189 | " \n", 190 | " def fit(self, X, y=None):\n", 191 | " tweet_tags = self._preprocess(X)\n", 192 | " self._pos_vectorizer.fit(X)\n", 193 | " \n", 194 | " return self\n", 195 | " \n", 196 | " def transform(self, X, y=None):\n", 197 | " tweet_tags = self._preprocess(X)\n", 198 | " return self._pos_vectorizer.transform(X)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "sentiment_analyzer = VS()\n", 208 | "\n", 209 | "def count_twitter_objs(text_string):\n", 210 | " \"\"\"\n", 211 | " Accepts a text string and replaces:\n", 212 | " 1) urls with URLHERE\n", 213 | " 2) lots of whitespace with one instance\n", 214 | " 3) mentions with MENTIONHERE\n", 215 | " 4) hashtags with HASHTAGHERE\n", 216 | "\n", 217 | " This allows us to get standardized counts of urls and mentions\n", 218 | " Without caring about specific people mentioned.\n", 219 | " \n", 220 | " Returns counts of urls, mentions, and hashtags.\n", 221 | " \"\"\"\n", 222 | " space_pattern = '\\s+'\n", 223 | " giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'\n", 224 | " '[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n", 225 | " mention_regex = '@[\\w\\-]+'\n", 226 | " hashtag_regex = '#[\\w\\-]+'\n", 227 | " parsed_text = re.sub(space_pattern, ' ', text_string)\n", 228 | " parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)\n", 229 | " parsed_text = re.sub(mention_regex, 'MENTIONHERE', 
parsed_text)\n", 230 | " parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)\n", 231 | " return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))\n", 232 | "\n", 233 | "def other_features(tweet):\n", 234 | " \"\"\"This function takes a string and returns a list of features.\n", 235 | " These include Sentiment scores, Text and Readability scores,\n", 236 | " as well as Twitter specific features\"\"\"\n", 237 | " sentiment = sentiment_analyzer.polarity_scores(tweet)\n", 238 | " \n", 239 | " words = preprocess(tweet) #Get text only\n", 240 | " \n", 241 | " syllables = textstat.syllable_count(words)\n", 242 | " num_chars = sum(len(w) for w in words)\n", 243 | " num_chars_total = len(tweet)\n", 244 | " num_terms = len(tweet.split())\n", 245 | " num_words = len(words.split())\n", 246 | " avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)\n", 247 | " num_unique_terms = len(set(words.split()))\n", 248 | " \n", 249 | " ###Modified FK grade, where avg words per sentence is just num words/1\n", 250 | " FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)\n", 251 | " ##Modified FRE score, where sentence fixed to 1\n", 252 | " FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)\n", 253 | " \n", 254 | " twitter_objs = count_twitter_objs(tweet)\n", 255 | " retweet = 0\n", 256 | " if \"rt\" in words:\n", 257 | " retweet = 1\n", 258 | " features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,\n", 259 | " num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],\n", 260 | " twitter_objs[2], twitter_objs[1],\n", 261 | " twitter_objs[0], retweet]\n", 262 | " return features\n", 263 | "\n", 264 | "def get_feature_array(tweets):\n", 265 | " feats=[]\n", 266 | " for t in tweets:\n", 267 | " feats.append(other_features(t))\n", 268 | " return np.array(feats)\n", 269 | "\n", 270 | "class SentimentVectorizer(BaseEstimator, TransformerMixin): \n", 271 | "\n", 272 | " def fit(self, X, y=None):\n", 273 | " return self\n", 274 | " \n", 275 | " def transform(self, X, y=None):\n", 276 | " return get_feature_array(X)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "from sklearn.pipeline import FeatureUnion, Pipeline\n", 286 | "\n", 287 | "vectorizer = TfidfVectorizer(\n", 288 | " tokenizer=tokenize,\n", 289 | " preprocessor=preprocess,\n", 290 | " ngram_range=(1, 3),\n", 291 | " stop_words=stopwords,\n", 292 | " use_idf=True,\n", 293 | " smooth_idf=False,\n", 294 | " norm=None,\n", 295 | " decode_error='replace',\n", 296 | " max_features=10000,\n", 297 | " min_df=5,\n", 298 | " max_df=0.75\n", 299 | " )\n", 300 | "pos_vectorizer = PosTfidfVectorizer()\n", 301 | "sentiment_vectorizer = SentimentVectorizer()\n", 302 | "\n", 303 | "model = Pipeline( [('features', FeatureUnion([('tfidf', vectorizer),('pos_tfidf', pos_vectorizer), \n", 304 | " ('sentiment',sentiment_vectorizer)])),\n", 305 | " ('feature_selector', SelectFromModel(LogisticRegression(class_weight='balanced',penalty=\"l1\",C=0.01))),\n", 306 | " ('model', LogisticRegression(class_weight='balanced',penalty='l2',C=0.01))] )" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "# Running the model" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": 
[], 321 | "source": [ 322 | "model.fit(X_train,y_train)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "y_preds = model.predict(X_test)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "## Evaluating the results on the test set" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "report = classification_report( y_test, y_preds)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "print(report)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from sklearn.metrics import confusion_matrix\n", 366 | "\n", 367 | "confusion_matrix = confusion_matrix(y_test, y_preds)\n", 368 | "matrix_proportions = np.zeros((3, 3))\n", 369 | "for i in range(0, 3):\n", 370 | " matrix_proportions[i, :] = confusion_matrix[i, :] / \\\n", 371 | " float(confusion_matrix[i, :].sum())\n", 372 | "names = ['Hate', 'Offensive', 'Neither']\n", 373 | "confusion_df = pd.DataFrame(matrix_proportions, index=names, columns=names)\n", 374 | "plt.figure(figsize=(5, 5))\n", 375 | "seaborn.heatmap(confusion_df, annot=True, annot_kws={\"size\": 12}, square=True, cmap=\"Reds\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [] 384 | } 385 | ], 386 | "metadata": { 387 | "kernelspec": { 388 | "display_name": "Python 3", 389 | "language": "python", 390 | "name": "python3" 391 | }, 392 | "language_info": { 393 | "codemirror_mode": { 394 | "name": "ipython", 395 | "version": 3 396 | }, 397 | "file_extension": ".py", 398 | "mimetype": "text/x-python", 399 | "name": "python", 400 | "nbconvert_exporter": "python", 401 | "pygments_lexer": "ipython3", 402 | "version": "3.7.0" 403 | } 404 | }, 405 | "nbformat": 4, 406 | "nbformat_minor": 2 407 | } 408 | -------------------------------------------------------------------------------- /notebooks/multiclass_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "%matplotlib inline" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": false, 23 | "deletable": true, 24 | "editable": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "df = pd.read_csv('../data/twitter-hate-speech2.csv', encoding='latin-1')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": { 35 | "collapsed": false, 36 | "deletable": true, 37 | "editable": true 38 | }, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | "
Unnamed: 0counthate_speechoffensive_languageneitherclasstweet
0030032!!! RT @mayasolovely: As a woman you shouldn't...
1130301!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2230301!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3330211!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | " Unnamed: 0 count hate_speech offensive_language neither class \\\n", 104 | "0 0 3 0 0 3 2 \n", 105 | "1 1 3 0 3 0 1 \n", 106 | "2 2 3 0 3 0 1 \n", 107 | "3 3 3 0 2 1 1 \n", 108 | "\n", 109 | " tweet \n", 110 | "0 !!! RT @mayasolovely: As a woman you shouldn't... \n", 111 | "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... \n", 112 | "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... \n", 113 | "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... " 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "df.head(4)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "metadata": { 129 | "collapsed": false, 130 | "deletable": true, 131 | "editable": true, 132 | "scrolled": true 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/html": [ 138 | "
\n", 139 | "\n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | "
Unnamed: 0counthate_speechoffensive_languageneitherclass
count24783.00000024783.00000024783.00000024783.00000024783.00000024783.000000
mean12681.1920273.2434730.2805152.4137110.5492471.110277
std7299.5538630.8830600.6318511.3994591.1132990.462089
min0.0000003.0000000.0000000.0000000.0000000.000000
25%6372.5000003.0000000.0000002.0000000.0000001.000000
50%12703.0000003.0000000.0000003.0000000.0000001.000000
75%18995.5000003.0000000.0000003.0000000.0000001.000000
max25296.0000009.0000007.0000009.0000009.0000002.000000
\n", 226 | "
" 227 | ], 228 | "text/plain": [ 229 | " Unnamed: 0 count hate_speech offensive_language \\\n", 230 | "count 24783.000000 24783.000000 24783.000000 24783.000000 \n", 231 | "mean 12681.192027 3.243473 0.280515 2.413711 \n", 232 | "std 7299.553863 0.883060 0.631851 1.399459 \n", 233 | "min 0.000000 3.000000 0.000000 0.000000 \n", 234 | "25% 6372.500000 3.000000 0.000000 2.000000 \n", 235 | "50% 12703.000000 3.000000 0.000000 3.000000 \n", 236 | "75% 18995.500000 3.000000 0.000000 3.000000 \n", 237 | "max 25296.000000 9.000000 7.000000 9.000000 \n", 238 | "\n", 239 | " neither class \n", 240 | "count 24783.000000 24783.000000 \n", 241 | "mean 0.549247 1.110277 \n", 242 | "std 1.113299 0.462089 \n", 243 | "min 0.000000 0.000000 \n", 244 | "25% 0.000000 1.000000 \n", 245 | "50% 0.000000 1.000000 \n", 246 | "75% 0.000000 1.000000 \n", 247 | "max 9.000000 2.000000 " 248 | ] 249 | }, 250 | "execution_count": 4, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "df.describe()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "deletable": true, 263 | "editable": true 264 | }, 265 | "source": [ 266 | "# Cleaning, etc." 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 7, 272 | "metadata": { 273 | "collapsed": true, 274 | "deletable": true, 275 | "editable": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "df['hate_class'] = (df['class'] == 0)*1\n", 280 | "\n", 281 | "#TODO: more" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "deletable": true, 288 | "editable": true 289 | }, 290 | "source": [ 291 | "# Train/Test Definition" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 5, 297 | "metadata": { 298 | "collapsed": false, 299 | "deletable": true, 300 | "editable": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit\n", 305 | "X=df['tweet']\n", 306 | "y=df['class']\n", 307 | "\n", 308 | "sss = StratifiedShuffleSplit(n_splits=1, test_size=.15, random_state=0) #TODO: Coordinate random seed between notebooks\n", 309 | "train_index, test_index = next(sss.split(X,y))\n", 310 | "\n", 311 | "X_train = X.iloc[train_index]\n", 312 | "X_test = X.iloc[test_index]\n", 313 | "y_train = y.iloc[train_index]\n", 314 | "y_test = y.iloc[test_index]" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 6, 320 | "metadata": { 321 | "collapsed": false, 322 | "deletable": true, 323 | "editable": true 324 | }, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "(21065,) (3718,)\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "print(X_train.shape,X_test.shape)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "deletable": true, 342 | "editable": true 343 | }, 344 | "source": [ 345 | "# Explore/Feature Engineer\n", 346 | "\n", 347 | "Engineering only using the training set." 
348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 7, 353 | "metadata": { 354 | "collapsed": true, 355 | "deletable": true, 356 | "editable": true 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "# TODO" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "deletable": true, 367 | "editable": true 368 | }, 369 | "source": [ 370 | "## Model Training" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 8, 376 | "metadata": { 377 | "collapsed": true, 378 | "deletable": true, 379 | "editable": true 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 384 | "from sklearn.ensemble import ExtraTreesClassifier\n", 385 | "from sklearn.pipeline import Pipeline\n", 386 | "from sklearn.model_selection import cross_val_score, GridSearchCV" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 9, 392 | "metadata": { 393 | "collapsed": false, 394 | "deletable": true, 395 | "editable": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "pipe = Pipeline([\n", 400 | " ('tfidf', TfidfVectorizer(\n", 401 | " max_features = 2000, # wild guess\n", 402 | " stop_words = 'english',\n", 403 | " min_df=2, \n", 404 | " ngram_range = (1,3)\n", 405 | " )),\n", 406 | " ('et',ExtraTreesClassifier(n_estimators=50,verbose=True))\n", 407 | "])\n", 408 | "\n", 409 | "param_grid = {\n", 410 | " 'tfidf__min_df' : [2,5]\n", 411 | "}\n", 412 | "\n", 413 | "model = GridSearchCV(pipe, param_grid)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 11, 419 | "metadata": { 420 | "collapsed": false, 421 | "deletable": true, 422 | "editable": true 423 | }, 424 | "outputs": [ 425 | { 426 | "name": "stderr", 427 | "output_type": "stream", 428 | "text": [ 429 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 4.0s finished\n", 430 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n", 431 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n", 432 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.7s finished\n", 433 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n", 434 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n", 435 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 4.0s finished\n", 436 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n", 437 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n", 438 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.9s finished\n", 439 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n", 440 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n", 441 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.7s finished\n", 442 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n", 443 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n", 444 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.9s finished\n", 445 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n", 446 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n", 447 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 7.0s finished\n" 448 | ] 449 | }, 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "GridSearchCV(cv=None, error_score='raise',\n", 454 | " estimator=Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 455 | " dtype=, 
encoding='utf-8', input='content',\n", 456 | " lowercase=True, max_df=1.0, max_features=2000, min_df=2,\n", 457 | " ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,\n", 458 | " ...ators=50, n_jobs=1, oob_score=False, random_state=None,\n", 459 | " verbose=True, warm_start=False))]),\n", 460 | " fit_params={}, iid=True, n_jobs=1,\n", 461 | " param_grid={'tfidf__min_df': [2, 5]}, pre_dispatch='2*n_jobs',\n", 462 | " refit=True, return_train_score=True, scoring=None, verbose=0)" 463 | ] 464 | }, 465 | "execution_count": 11, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "model.fit(X_train, y_train)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "deletable": true, 478 | "editable": true 479 | }, 480 | "source": [ 481 | "# Held Out Test" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 12, 487 | "metadata": { 488 | "collapsed": false, 489 | "deletable": true, 490 | "editable": true 491 | }, 492 | "outputs": [ 493 | { 494 | "name": "stderr", 495 | "output_type": "stream", 496 | "text": [ 497 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.1s finished\n", 498 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.1s finished\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "y_pred = model.predict(X_test)\n", 504 | "y_proba = model.predict_proba(X_test)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 13, 510 | "metadata": { 511 | "collapsed": false, 512 | "deletable": true, 513 | "editable": true 514 | }, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | " precision recall f1-score support\n", 521 | "\n", 522 | " 0 0.37 0.12 0.18 214\n", 523 | " 1 0.92 0.96 0.94 2879\n", 524 | " 2 0.83 0.87 0.85 625\n", 525 | "\n", 526 | "avg / total 0.88 0.90 0.88 3718\n", 527 | "\n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "from sklearn.metrics import classification_report\n", 533 | "report = classification_report(y_test, y_pred)\n", 534 | "print(report)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 14, 540 | "metadata": { 541 | "collapsed": false, 542 | "deletable": true, 543 | "editable": true 544 | }, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "array([[ 25, 159, 30],\n", 550 | " [ 34, 2767, 78],\n", 551 | " [ 8, 73, 544]])" 552 | ] 553 | }, 554 | "execution_count": 14, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "from sklearn.metrics import confusion_matrix\n", 561 | "confusion_matrix(y_test,y_pred)" 562 | ] 563 | } 564 | ], 565 | "metadata": { 566 | "kernelspec": { 567 | "display_name": "Python 3", 568 | "language": "python", 569 | "name": "python3" 570 | }, 571 | "language_info": { 572 | "codemirror_mode": { 573 | "name": "ipython", 574 | "version": 3 575 | }, 576 | "file_extension": ".py", 577 | "mimetype": "text/x-python", 578 | "name": "python", 579 | "nbconvert_exporter": "python", 580 | "pygments_lexer": "ipython3", 581 | "version": "3.5.2" 582 | } 583 | }, 584 | "nbformat": 4, 585 | "nbformat_minor": 2 586 | } 587 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.3.2 2 | numpy==1.22.0 3 | pandas==0.19.2 4 | scikit-learn==0.18.1 5 | scipy==1.10.0 6 | Werkzeug==2.2.3 7 | gunicorn==19.7.1 8 | 
vaderSentiment==2.5 9 | nltk==3.6.6 10 | textstat==0.4.1 11 | requests==2.31.0 12 | -------------------------------------------------------------------------------- /research/23_Paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/research/23_Paper.pdf --------------------------------------------------------------------------------
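
For readers who want to try the approach from notebooks/best_multiclass_model.ipynb outside of Jupyter, the sketch below is a stripped-down, stand-alone version of the same recipe: the 85/15 stratified split on data/twitter-hate-speech2.csv, a word n-gram TF-IDF over lightly cleaned tweets, and a balanced logistic regression over the three classes (0 = hate, 1 = offensive, 2 = neither). It deliberately drops the POS-tag and sentiment feature unions, and it assumes it is run from the repository root so the relative CSV path resolves; this code is a sketch and not one of the repository's files.

import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

def preprocess(text):
    # Same idea as the notebook's preprocess(): collapse whitespace,
    # drop URLs and @mentions before tokenization.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'http\S+', '', text)
    return re.sub(r'@[\w\-]+', '', text)

df = pd.read_csv('data/twitter-hate-speech2.csv', encoding='latin-1')
X, y = df['tweet'], df['class']   # 0 = hate, 1 = offensive, 2 = neither

# Same stratified 85/15 split used in both multiclass notebooks.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=0)
train_idx, test_idx = next(sss.split(X, y))

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocess, ngram_range=(1, 3),
                              min_df=5, max_df=0.75, max_features=10000)),
    ('clf', LogisticRegression(class_weight='balanced', C=0.01)),
])
pipe.fit(X.iloc[train_idx], y.iloc[train_idx])

print('held-out accuracy:', pipe.score(X.iloc[test_idx], y.iloc[test_idx]))
print(pipe.predict(['have a nice day everyone']))

As the baseline notebook's held-out report shows, overall accuracy is dominated by the majority "offensive" class while recall on the small hate-speech class stays low, so the per-class classification_report and confusion matrix are the numbers to watch.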