├── .gitignore
├── Dockerfile
├── Python
└── code
│ ├── twitter_label.py
│ └── wiki_badwords.py
├── R
└── explore.R
├── README.md
├── app
├── __init__.py
├── app.py
├── config.py
├── core.py
├── data_science.py
└── training.py
├── data
├── README.md
├── twitter-hate-speech.csv
└── twitter-hate-speech2.csv
├── docker-compose-prebuilt.yml
├── docker-compose.yml
├── docs
└── summarize_twitter-hate-speech2
│ ├── quantity_of_tweets_per_class_histogram.png
│ └── tweet_length_histogram.png
├── flask_app.py
├── hate_speech_detector.Rproj
├── hatebase_api.py
├── nginx
├── Dockerfile
└── conf.d
├── notebooks
├── .ipynb_checkpoints
│ ├── Data Cleaning-checkpoint.ipynb
│ └── Data Exploration-checkpoint.ipynb
├── Data Cleaning.ipynb
├── Data Exploration.ipynb
├── LSTM with Keras and TensorFlow.ipynb
├── best_multiclass_model.ipynb
├── clean.csv
├── labeled_data.csv
└── multiclass_baseline.ipynb
├── requirements.txt
└── research
└── 23_Paper.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__
3 | *.ipynb
4 | *.xlsx
5 | Python/code/bad_words_from_wiki.txt
6 | Python/code/.DS_Store
7 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Image for the hate-speech demo API: installs the Python dependencies,
# trains the model at build time, and serves the Flask app with gunicorn.
FROM python:3.5-slim

WORKDIR /src

# Install dependencies before copying the application code so this layer is
# reused from cache when only the app changes.
COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY app ./app/
COPY ./data/twitter-hate-speech2.csv ./data/

# Read by app/config.py when the app package is imported.
ENV TRAINING_DATA_LOCATION=data/twitter-hate-speech2.csv

# Train during the build so the image already contains the serialized model.
RUN python -m app.training

EXPOSE 8000

ENTRYPOINT ["gunicorn", "-b", "0.0.0.0:8000", "app.app:app"]
--------------------------------------------------------------------------------
/Python/code/twitter_label.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas
3 | import numpy as np
4 | import re
5 |
def _add_matches(frame, term):
    """Add 1 to frame['label'] for every tweet text (column 1) containing `term`.

    Empty terms are skipped: '' is a substring of every string, so it would
    flag the whole dataset.
    """
    if not term:
        return
    frame["label"] = frame["label"] + [1 if term in ele else 0 for ele in frame[1]]


def _quoted_parts(cell):
    """Return every double-quoted fragment found in `cell`."""
    return re.findall(r'"(.*?)"', cell)


def _comma_parts(cell):
    """Split `cell` on commas."""
    return cell.split(',')


tweets = pandas.read_csv('~/Documents/final_tweets_NLP+CSS_2016.csv', header = None)
tweets['label'] = 0

# (path, splitter) pairs, processed in order. A splitter of None means the
# cell itself is the term; otherwise the first fragment returned is the term.
WORD_LISTS = (
    ('~/Documents/list1.csv', None),
    ('~/Documents/list2.csv', _quoted_parts),
    ('~/Documents/list3.csv', None),
    ('~/Documents/list4.csv', None),
    ('~/Documents/list5.csv', _comma_parts),
)

for path, splitter in WORD_LISTS:
    badwords = pandas.read_csv(path, header = None)
    for cell in badwords[0]:
        if splitter is None:
            print(cell)
            term = cell
        else:
            parts = splitter(cell)
            print(parts)
            # Rows without a usable fragment are skipped instead of raising IndexError.
            term = parts[0] if parts else ''
        _add_matches(tweets, term)

# Collapse the per-term hit counts into a binary label.
tweets['label'] = np.where(tweets['label']>=1, 1, 0)

## sanity check (printed: a bare expression shows nothing when run as a script)
print(tweets['label'].sum())
57 |
--------------------------------------------------------------------------------
/Python/code/wiki_badwords.py:
--------------------------------------------------------------------------------
1 | import wikipediaapi
2 | import re
3 |
4 |
5 |
def process_word(word):
    """Normalise one raw list entry into a comma-separated string of terms.

    Steps, in order: drop unwanted substrings, unwrap a span element, drop a
    parenthesised note, turn alternative-spelling delimiters into commas, and
    discard every character that is not a single byte in UTF-8.

    :param word: raw text of one entry.
    :return: cleaned string; alternatives are separated by ','.
    """
    # NOTE(review): the four empty strings are no-ops as written; they look like
    # markup tags that were lost from this source -- confirm and restore.
    remove_list = ['', '', '', '', '\xa0', '"']
    for r in remove_list:
        word = word.replace(r, '')
    # Unwrap a span element; leave the word alone when "span" is only part of
    # ordinary text (previously an IndexError).
    if 'span' in word:
        inner = re.findall(r'>(.*?)<', word)
        if inner:
            word = inner[0]
    # Remove the first parenthesised note. The closing bracket is searched for
    # *after* the opening one so a stray earlier ')' cannot corrupt the slice.
    open_idx = word.find('(')
    if open_idx != -1:
        close_idx = word.find(')', open_idx)
        if close_idx != -1:
            word = word[:open_idx] + word[close_idx + 1:]
    # Anything after a remaining '(' is dropped.
    if '(' in word:
        word = word[:word.index('(')]
    # Replace different delimiters with commas
    replace_list = ['/', ' or ', ' also spelled ']
    for r in replace_list:
        word = word.replace(r, ',')
    # Keep only characters that encode to a single UTF-8 byte
    word = ''.join(c for c in word if len(c.encode(encoding='utf_8')) == 1)
    return word
29 |
30 |
def process_extract(page_text, item_pattern=r'<dt>(.*?)</dt>'):
    """Extract a flat list of lower-cased terms from a page's HTML text.

    Every match of `item_pattern` is cleaned with process_word(), split on
    commas, stripped and lower-cased; empty fragments are discarded.

    :param page_text: HTML text of the page.
    :param item_pattern: regex with one capture group selecting each entry.
    :return: list of cleaned terms.
    """
    # NOTE(review): the pattern literal in the original source was damaged (its
    # markup was stripped, leaving an unterminated string). The '<dt>...</dt>'
    # default is a reconstruction -- confirm it against the real page HTML or
    # pass the correct pattern explicitly.
    bad_words = []
    for raw in re.findall(item_pattern, page_text):
        bad_words.extend(process_word(raw).split(','))

    # Strip before filtering so whitespace-only fragments are removed as well.
    cleaned = [x.strip().lower() for x in bad_words]
    return [x for x in cleaned if x != '']
45 |
46 |
47 |
48 |
49 |
# Wikipedia client for the English wiki, configured with the HTML extract format.
wiki_html = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.HTML,
)

# Ethnic slurs
page_ethnic = wiki_html.page("List_of_ethnic_slurs")
# Religious slurs
page_religous = wiki_html.page("List_of_religious_slurs")

# Run both pages through the same extraction routine.
ethnic_bad_words = process_extract(page_ethnic.text)
religious_bad_words = process_extract(page_religous.text)
61 |
62 |
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/R/explore.R:
--------------------------------------------------------------------------------
1 | library(readr)
2 | library(dplyr)
3 | library(ggplot2)
4 |
5 | # Explore the data in 'twitter-hate-speech2.csv'.
6 |
# Read 'twitter-hate-speech2.csv' and draw two plots: the number of tweets per
# class and the distribution of tweet lengths.
# Returns the second plot invisibly.
explore <- function() {
  encoded_tweets <- readr::read_csv("./data/twitter-hate-speech2.csv")
  names(encoded_tweets)[1] <- "id"

  # Bar chart of the number of tweets in each class.
  # The plot is built first and printed afterwards: `%>%` binds tighter than
  # `+`, so ending the chain with `ggtitle(...) %>% print()` would print only
  # the title object and never draw the plot.
  class_counts_plot <- encoded_tweets %>%
    dplyr::group_by(class) %>%
    dplyr::summarize(total = dplyr::n()) %>%
    ggplot(aes(x = factor(class), y = total)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = total, vjust = -0.25)) +
    scale_x_discrete(labels = c("hate speech", "offensive language", "neither")) +
    xlab("class") +
    ylab("quantity") +
    ggtitle("Number of tweets in each class (data: twitter-hate-speech2.csv)")
  print(class_counts_plot)

  # Histogram of tweet lengths
  tweet_length_plot <- encoded_tweets %>%
    mutate(tweet_length = nchar(tweet)) %>%
    ggplot(aes(x = tweet_length)) +
    geom_histogram(binwidth = 1) +
    scale_x_continuous(name = "Number of characters in tweet",
                       limits = c(0, 300),
                       expand = c(0, 0)) +
    ylab("Number of tweets") +
    ggtitle("Length of tweets (data: twitter-hate-speech2.csv)")
  print(tweet_length_plot)

  invisible(tweet_length_plot)
}
36 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hate_speech_detector
2 |
3 | ## Project Status
4 |
5 | We have built a few fully working hate speech detection models. This project is currently in hibernation, in that there's not an active use case at the moment but the models are ready to go. Every once in a while we'll tweak the models a bit but in general, there is not active development on this project. If you know of a possible application, please reach out to us. Also, if you're interested in helping, we're always looking for help, whether more data, more models, or any other interesting component of hate speech detection. If you're interested in using or working on this model, feel free to reach out to the Slack channel (#p-hate-speech) or Julius Simonelli (jss367 in Slack).
6 |
7 | ## Data
8 |
9 | We are currently working with the data collected by [Davidson et al.](https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665) for their research on hate speech detection. The dataset contains tweets that are labeled as either hate speech, offensive language, or neither. The data were pulled from [Hatebase.org](https://www.hatebase.org/), an organization that collects instances of potential hate speech. The data were then labeled using [CrowdFlower](https://www.crowdflower.com/), which uses non-specialists to clean and label data. Each tweet was reviewed by three or more people and a majority-wins approach was taken when there was disagreement.
10 |
11 | All data used in this analysis is stored in the [data](https://github.com/Data4Democracy/hate_speech_detector/tree/master/data) folder of [this repository](https://github.com/Data4Democracy/hate_speech_detector). The original source of the data is: https://github.com/t-davidson/hate-speech-and-offensive-language
12 |
13 | The paper by Davidson et al. can be found here:
14 | Thomas Davidson, Dana Warmsley, Michael Macy, Ingmar Weber. 2017. "[Automated Hate Speech Detection and the Problem of Offensive Language](https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665)". Proceedings of the 11th International AAAI Conference on Web and Social Media (ICWSM).
15 |
16 | ## Demo REST API
17 |
18 | A REST API has been designed to demo the functionality of a basic model. The code trains the basic model defined in `app/training.py` (using the feature extractors in `app/data_science.py`) and uses this model to make predictions. To get the current IP address, ask in the Data for Democracy #p-hate-speech Slack channel.
19 |
20 | ### Installation (Docker)
21 |
22 | Navigate to the hate_speech_detector/ directory and build the container:
23 |
24 | ```shell
25 | docker build -t [container-name] .
26 | ```
27 |
28 | ### Usage
29 |
30 | When you run the container, you must also expose the port 8000. For example:
31 |
32 |
33 | ```shell
34 | docker run -p 8000:8000 -t [container-name]
35 | ```
36 |
37 | #### Labels
38 |
39 | Labels are Hate = 0, Offensive = 1, Not Offensive = 2.
40 |
41 | #### API Endpoints
42 |
43 | - /label
44 |
45 | Then, the API can be called to predict the label on new text data via:
46 |
47 | ```shell
48 | curl -H "Content-Type: application/json" -X POST -d '{"text":"Text that might be offensive or hateful... or not."}' http://0.0.0.0:8000/label
49 | ```
50 |
51 | Output Ex:
52 |
53 | ```shell
54 | {
55 | "label": 2,
56 | "text": "Text that might be offensive or hateful... or not."
57 | }
58 |
59 | ```
60 |
61 | In this case, "text" is the input text and label is the predicted label from the model.
62 |
63 |
64 | - /demo
65 |
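66 | You may also see the model predict on a randomly selected labeled tweet (note: the demo samples from the same data file the model was trained on, so this is not a true held-out test) via: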
67 |
68 | ```shell
69 | curl http://0.0.0.0:8000/demo
70 | ```
71 | Output Ex:
72 |
73 | ```shell
74 | {
75 | "label": 2,
76 | "text": "#stateoftheunion would last 15mins if they let the President talk all that standing up clapping is for the birds",
77 | "true": 2
78 | }
79 | ```
80 |
81 | Here, "text" is the text input, "label" is the predicted label from the model, and "true" is the actual label given by a human.
82 |
83 | ### Deploy with prebuilt containers via docker-compose
84 |
85 |
86 | ```shell
87 | docker-compose -f docker-compose-prebuilt.yml up
88 | ```
89 |
90 | At this point, if you set this up on AWS, then you could navigate to
91 |
92 | ```shell
93 | [your aws ip]/demo
94 | ```
95 |
96 | in your browser to see the demo.
97 |
98 | ## To Do:
99 |
100 | ### Classifier
101 | There are currently two Jupyter Notebooks containing models to classify the data, but both could be greatly improved. Please feel free to take a look and let us know if you make any improvements!
102 |
103 | ### Data preprocessing
104 | There is currently very little preprocessing done on this data. Would someone be interested in creating some useful categories for machine learning and plugging them back into the models? My guess is feature engineering has the most potential to improve the model.
105 |
106 |
107 | ### Front End
108 |
109 | A front end for the demo app which can demo random elements of the test set or allow the user to input their own text.
110 |
111 | ### Dataset
112 | The Davidson et al. paper remarked on some possible mislabelings in the dataset. Is mislabeling common in the dataset? Fixing any labels would definitely improve our ability to create a classifier. How big a problem is this? Does someone want to look at some of the misclassifications and see if any are incorrectly labeled?
113 |
114 | Also, there's a second data source, also containing labeled hate speech from Twitter, but we are yet to explore it. You can find the data here: https://github.com/zeerakw/hatespeech. If anyone wants to look into this dataset and assess its value, it would be very useful. Something else to consider - can these datasets be combined?
115 |
116 | ### Strategy
117 | What else do we want to do with this?
118 |
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/app/__init__.py
--------------------------------------------------------------------------------
/app/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, jsonify, request
2 |
3 | from app.core import DataStore
4 | from app.core import Model
5 |
6 | app = Flask(__name__)
7 |
@app.route('/')
def index():
    """Placeholder landing page; currently responds with an empty body."""
    # TODO: return a front end page that can call /demo or /label
    body = ''
    return body
12 |
@app.route('/demo')
def demo():
    """Predict on one randomly drawn labeled tweet and report the true label too.

    The JSON response carries the model's prediction entry plus a "true" key
    holding the stored label.
    """
    sample_text, true_label = DataStore.get_random_test()
    predictions = Model.predict([sample_text])
    payload = predictions[0]
    payload['true'] = int(true_label)  # plain int so it serializes to JSON
    return jsonify(payload)
20 |
@app.route('/label', methods=['POST'])
def predict():
    """Apply the model to the 'text' entry of the JSON payload.

    Returns the prediction entry as JSON, or a 400 response with an "error"
    message when the body is not a JSON object containing a string "text".
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising.
    content = request.get_json(silent=True)
    if not isinstance(content, dict) or not isinstance(content.get('text'), str):
        return jsonify({'error': 'Request body must be JSON with a string "text" field.'}), 400
    return jsonify(Model.predict([content['text']])[0])
26 |
27 |
28 | if __name__ == '__main__':
29 | app.run()
30 |
--------------------------------------------------------------------------------
/app/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
# Path of the labeled-tweet CSV. Taken from the environment when set; the
# fallback is the same relative path the project's Dockerfile configures, so
# the package can also be imported from a repository checkout.
TRAINING_DATA_LOCATION = os.environ.get('TRAINING_DATA_LOCATION', 'data/twitter-hate-speech2.csv')
# Path the trained model is written to and loaded from.
MODEL_LOCATION = os.environ.get('MODEL_LOCATION', 'model.pkl')
--------------------------------------------------------------------------------
/app/core.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.externals import joblib
4 | from app.config import TRAINING_DATA_LOCATION, MODEL_LOCATION
5 |
6 |
class GlobalDataLoad(type):
    """Metaclass that loads the labeled tweets when a class using it is defined."""
    def __new__(meta, name, bases, clzdict):
        # Pass `meta` (not `type`) so the created class is an instance of this
        # metaclass rather than of plain `type`.
        cls = super().__new__(meta, name, bases, clzdict)

        df = pd.read_csv(TRAINING_DATA_LOCATION, encoding='latin-1')

        # Attach the tweet texts and their class labels to the new class.
        cls.X = df['tweet']
        cls.y = df['class']

        return cls
18 |
19 |
class DataStore(metaclass=GlobalDataLoad):
    """Holds the tweets (X) and labels (y) attached by GlobalDataLoad."""

    @classmethod
    def get_random_test(cls):
        """Pick one row uniformly at random and return its (tweet, label) pair."""
        row = np.random.choice(len(cls.X))
        tweet = cls.X.iloc[row]
        label = cls.y.iloc[row]
        return tweet, label
27 |
28 |
class GlobalModelLoad(type):
    """Metaclass that loads the serialized model when a class using it is defined.

    No training happens here; the model file is read from MODEL_LOCATION.
    """
    def __new__(meta, name, bases, clzdict):
        # Pass `meta` (not `type`) so the created class is an instance of this
        # metaclass rather than of plain `type`.
        cls = super().__new__(meta, name, bases, clzdict)

        # Load model
        cls._model = joblib.load(MODEL_LOCATION)

        return cls
38 |
39 |
class Model(metaclass=GlobalModelLoad):
    """Prediction front end for the model attached by GlobalModelLoad."""

    @classmethod
    def predict(cls, X):
        """Label each string in the list X.

        Returns one {"text", "label"} dict per input, in input order.
        """
        predicted = cls._model.predict(X)
        results = []
        for text, label in zip(X, predicted):
            # int() drops the numpy scalar type so the value is JSON-serializable
            results.append({"text": text, "label": int(label)})
        return results
46 |
47 |
48 |
--------------------------------------------------------------------------------
/app/data_science.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import nltk
4 | from nltk.stem.porter import *
5 | import string
6 | import re
7 | from sklearn.feature_extraction.text import TfidfVectorizer
8 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
9 | from textstat.textstat import *
10 | from sklearn.base import BaseEstimator, TransformerMixin
11 |
# English stop words from NLTK, extended with Twitter-specific tokens.
stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords += other_exclusions

# Shared stemmer instance used by tokenize().
stemmer = PorterStemmer()
18 |
def preprocess(text_string):
    """
    Accepts a text string and:
    1) collapses each run of whitespace into a single space
    2) removes urls
    3) removes @mentions

    Urls and mentions are deleted, not replaced by placeholders, so the
    result may contain consecutive spaces where they used to be.
    """
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text
37 |
def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    # '+' rather than '*': a pattern that can match the empty string makes
    # re.split cut between every character on Python 3.7 and later.
    tweet = " ".join(re.split("[^a-zA-Z]+", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens
44 |
def basic_tokenize(tweet):
    """Lowercase the tweet and split it into tokens made of letters and the
    punctuation marks . , ! ? -- same idea as tokenize but without stemming."""
    # '+' rather than '*': a pattern that can match the empty string makes
    # re.split cut between every character on Python 3.7 and later.
    tweet = " ".join(re.split("[^a-zA-Z.,!?]+", tweet.lower())).strip()
    return tweet.split()
49 |
class PosTfidfVectorizer(BaseEstimator, TransformerMixin):
    """Get POS tags for tweets and transform via tfidf"""

    def __init__(self):
        self._pos_vectorizer = TfidfVectorizer(
            tokenizer=None,
            lowercase=False,
            preprocessor=None,
            ngram_range=(1, 3),
            stop_words=None,
            use_idf=False,
            smooth_idf=False,
            norm=None,
            decode_error='replace',
            max_features=5000,
            min_df=5,
            max_df=0.75,
        )

    def _preprocess(self, X):
        """Turn each tweet into a space-joined string of its POS tags."""
        tweet_tags = []
        for t in X:
            tokens = basic_tokenize(preprocess(t))
            tags = nltk.pos_tag(tokens)
            tag_list = [x[1] for x in tags]
            tag_str = " ".join(tag_list)
            tweet_tags.append(tag_str)
        return tweet_tags

    def fit(self, X, y=None):
        """Fit the inner vectorizer on the POS-tag strings of X."""
        tweet_tags = self._preprocess(X)
        # Fit on the tag strings; the raw tweets were passed here before, which
        # left the computed tags unused.
        self._pos_vectorizer.fit(tweet_tags)

        return self

    def transform(self, X, y=None):
        """Return the inner vectorizer's matrix for the POS-tag strings of X."""
        tweet_tags = self._preprocess(X)
        return self._pos_vectorizer.transform(tweet_tags)
88 |
class SentimentVectorizer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing 17 numeric features per tweet:
    readability, size, sentiment and Twitter-object counts."""

    sentiment_analyzer = VS()

    def count_twitter_objs(self, text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        This allows us to get standardized counts of urls and mentions
        Without caring about specific people mentioned.

        Returns counts of urls, mentions, and hashtags.
        """
        # Raw strings keep the regex escapes from being read as string escapes.
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        hashtag_regex = r'#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return (parsed_text.count('URLHERE'),
                parsed_text.count('MENTIONHERE'),
                parsed_text.count('HASHTAGHERE'))

    def other_features(self, tweet):
        """This function takes a string and returns a list of features.
        These include Sentiment scores, Text and Readability scores,
        as well as Twitter specific features"""
        sentiment = self.sentiment_analyzer.polarity_scores(tweet)

        words = preprocess(tweet)  # Get text only (a single string)

        syllables = textstat.syllable_count(words)
        # `words` is a string, so this counts every character of the cleaned
        # text, spaces included (same value the per-character sum produced).
        num_chars = len(words)
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        # The 0.001 terms avoid a division by zero for empty text.
        avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
        num_unique_terms = len(set(words.split()))

        ###Modified FK grade, where avg words per sentence is just num words/1
        FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
        ##Modified FRE score, where sentence fixed to 1
        FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

        twitter_objs = self.count_twitter_objs(tweet)
        # Retweet flag: look for a standalone "rt" token, ignoring case and
        # surrounding punctuation. A plain substring test on the text also
        # fired on words such as "part" and missed the usual upper-case "RT".
        retweet = int(any(tok.strip(string.punctuation).lower() == "rt"
                          for tok in words.split()))
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                    num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1],
                    twitter_objs[0], retweet]
        return features

    def get_feature_array(self, tweets):
        """Stack other_features() of every tweet into one numpy array."""
        return np.array([self.other_features(t) for t in tweets])

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """Return the feature array for the tweets in X."""
        return self.get_feature_array(X)
--------------------------------------------------------------------------------
/app/training.py:
--------------------------------------------------------------------------------
1 | import nltk
2 |
3 | # download necessary components
4 | nltk.download('stopwords')
5 | nltk.download('averaged_perceptron_tagger')
6 |
7 | import pandas as pd
8 | import numpy as np
9 | from sklearn.externals import joblib
10 | from sklearn.pipeline import Pipeline, FeatureUnion
11 | from sklearn.feature_extraction.text import TfidfVectorizer
12 | from sklearn.feature_selection import SelectFromModel
13 | from sklearn.linear_model import LogisticRegression
14 | import nltk
15 |
16 | from app.config import TRAINING_DATA_LOCATION, MODEL_LOCATION
17 | from app.data_science import SentimentVectorizer, preprocess, tokenize, PosTfidfVectorizer, stopwords
18 |
19 |
20 | # Train Model and Save
21 |
# Word n-gram TF-IDF features over stemmed tokens.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.75
)
pos_vectorizer = PosTfidfVectorizer()
sentiment_vectorizer = SentimentVectorizer()

# The solver is named explicitly: the L1 penalty is not supported by the
# default solver of newer scikit-learn releases, and pinning 'liblinear' for
# both estimators keeps the fit independent of that default.
model = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', vectorizer),
        ('pos_tfidf', pos_vectorizer),
        ('sentiment', sentiment_vectorizer),
    ])),
    ('feature_selector', SelectFromModel(
        LogisticRegression(class_weight='balanced', penalty="l1", C=0.01, solver='liblinear'))),
    ('model', LogisticRegression(class_weight='balanced', penalty='l2', C=0.01, solver='liblinear')),
])

df = pd.read_csv(TRAINING_DATA_LOCATION, encoding='latin-1')

X = df['tweet']
y = df['class']

# Fit on the full dataset and write the fitted pipeline to MODEL_LOCATION.
model.fit(X,y)
joblib.dump(model, MODEL_LOCATION)
50 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | # Data Definitions
2 |
3 | ## `twitter-hate-speech.csv`
4 | This file was copied from a [data.world](https://data.world/) repository: [crowdflower](https://data.world/crowdflower)/[Hate Speech Identification](https://data.world/crowdflower/hate-speech-identification/)/[twitter-hate-speech-classifier-DFE-a845520.csv](https://data.world/crowdflower/hate-speech-identification/workspace/file?filename=twitter-hate-speech-classifier-DFE-a845520.csv)
5 |
6 | _Open Issue_: What do the columns mean?
7 |
8 | ## `twitter-hate-speech2.csv`
9 | This file was copied from [Davidson et al.](https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665)'s [labeled_data.csv](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/data/labeled_data.csv). The same data is also in a [data.world](https://data.world/) repository: [thomasrdavidson](https://data.world/thomasrdavidson/)/[Hate Speech and Offensive Language](https://data.world/thomasrdavidson/hate-speech-and-offensive-language/)/[labeled_data.csv](https://data.world/thomasrdavidson/hate-speech-and-offensive-language/workspace/file?filename=labeled_data.csv)
10 |
11 | The file contains 5 columns:
12 |
13 | `count` = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).
14 |
15 | `hate_speech` = number of CF users who judged the tweet to be hate speech.
16 |
17 | `offensive_language` = number of CF users who judged the tweet to be offensive.
18 |
19 | `neither` = number of CF users who judged the tweet to be neither hate speech nor offensive language.
20 |
21 | `class` = class label for majority of CF users.
22 | 0 - hate speech
23 | 1 - offensive language
24 | 2 - neither
25 |
--------------------------------------------------------------------------------
/data/twitter-hate-speech.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/data/twitter-hate-speech.csv
--------------------------------------------------------------------------------
/docker-compose-prebuilt.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | app:
4 | image: "srome/d4d-hate-speech-detector:latest"
5 | networks:
6 | - web_nw
7 | ports:
8 | - "80:8000"
9 | nginx:
10 | image: "srome/d4d-hate-speech-detector:nginx"
11 | networks:
12 | - web_nw
13 | networks:
14 | web_nw:
15 | driver: bridge
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | app:
4 | build: .
5 | networks:
6 | - web_nw
7 | ports:
8 | - "80:8000"
9 | nginx:
10 | build: ./nginx
11 | networks:
12 | - web_nw
13 | networks:
14 | web_nw:
15 | driver: bridge
--------------------------------------------------------------------------------
/docs/summarize_twitter-hate-speech2/quantity_of_tweets_per_class_histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/docs/summarize_twitter-hate-speech2/quantity_of_tweets_per_class_histogram.png
--------------------------------------------------------------------------------
/docs/summarize_twitter-hate-speech2/tweet_length_histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/docs/summarize_twitter-hate-speech2/tweet_length_histogram.png
--------------------------------------------------------------------------------
/flask_app.py:
--------------------------------------------------------------------------------
1 |
2 | # A very simple Flask Hello World app for you to get started with...
3 |
4 | from flask import Flask
5 |
6 | app = Flask(__name__)
7 |
@app.route('/')
def hello_world():
    """Return the fixed greeting served at the site root."""
    greeting = 'Hello from Flask!'
    return greeting
11 |
12 |
--------------------------------------------------------------------------------
/hate_speech_detector.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 |
--------------------------------------------------------------------------------
/hatebase_api.py:
--------------------------------------------------------------------------------
1 | ''' This is the code for using the Hatebase API
2 | It uses this Python wrapper: https://github.com/DanielJDufour/hatebase
3 | which can be installed with: pip install hatebase
4 |
5 | '''
6 | from json import loads
7 | from hatebase import HatebaseAPI
8 |
import os

# get a key here: https://www.hatebase.org/request_api
# The key is read from the environment so it never has to be written into the
# source (the bare `key =` placeholder was a syntax error).
key = os.environ.get('HATEBASE_API_KEY')
if not key:
    raise SystemExit('Set the HATEBASE_API_KEY environment variable to your Hatebase API key.')

# Define parameters
hatebase = HatebaseAPI({"key": key})
filters = {'language': 'eng'}
output = 'json'
query_type = 'sightings'

# Query the database
response = hatebase.performRequest(filters, output, query_type)

# Convert to Python object
resp = loads(response)
--------------------------------------------------------------------------------
/nginx/Dockerfile:
--------------------------------------------------------------------------------
FROM nginx

# The file named conf.d is a complete main configuration (it declares
# worker_processes, events and http), so it has to replace nginx's main config
# file; copied to /etc it is never read.
COPY conf.d /etc/nginx/nginx.conf
--------------------------------------------------------------------------------
/nginx/conf.d:
--------------------------------------------------------------------------------
# Reverse proxy in front of the gunicorn application container.
worker_processes 1;

events { worker_connections 1024; }

http {

    upstream docker-app {
        # "app" is the compose service name; gunicorn binds port 8000 there.
        # Without an explicit port nginx would try port 80.
        server app:8000;
    }

    server {
        listen 80;

        location / {
            proxy_pass         http://docker-app;
            proxy_redirect     off;
            # Forward the original host and client address to the application.
            proxy_set_header   Host $host;
            proxy_set_header   X-Real-IP $remote_addr;
            proxy_set_header   X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header   X-Forwarded-Host $server_name;
        }
    }

}
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/Data Exploration-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook is an attempt to explore the dataset. This notebook needs to be expanded upon."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import re\n",
20 | "from textstat.textstat import textstat\n",
21 | "from textblob import TextBlob\n",
22 | "import seaborn as sns\n",
23 | "%matplotlib inline\n",
24 | "sns.set_style(\"dark\")\n",
25 | "sns.set_context(\"talk\")"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "df = pd.read_csv('twitter-hate-speech.csv', encoding='latin-1')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/html": [
47 | "\n",
48 | "\n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " | \n",
65 | " _unit_id | \n",
66 | " _golden | \n",
67 | " _unit_state | \n",
68 | " _trusted_judgments | \n",
69 | " _last_judgment_at | \n",
70 | " does_this_tweet_contain_hate_speech | \n",
71 | " does_this_tweet_contain_hate_speech:confidence | \n",
72 | " _created_at | \n",
73 | " orig__golden | \n",
74 | " orig__last_judgment_at | \n",
75 | " orig__trusted_judgments | \n",
76 | " orig__unit_id | \n",
77 | " orig__unit_state | \n",
78 | " _updated_at | \n",
79 | " orig_does_this_tweet_contain_hate_speech | \n",
80 | " does_this_tweet_contain_hate_speech_gold | \n",
81 | " does_this_tweet_contain_hate_speech_gold_reason | \n",
82 | " does_this_tweet_contain_hate_speechconfidence | \n",
83 | " tweet_id | \n",
84 | " tweet_text | \n",
85 | "
\n",
86 | " \n",
87 | " \n",
88 | " \n",
89 | " 0 | \n",
90 | " 853718217 | \n",
91 | " True | \n",
92 | " golden | \n",
93 | " 86 | \n",
94 | " NaN | \n",
95 | " The tweet uses offensive language but not hate... | \n",
96 | " 0.6013 | \n",
97 | " NaN | \n",
98 | " True | \n",
99 | " NaN | \n",
100 | " 0.0 | \n",
101 | " 615561535.0 | \n",
102 | " golden | \n",
103 | " NaN | \n",
104 | " The tweet contains hate speech | \n",
105 | " The tweet contains hate speech\\nThe tweet uses... | \n",
106 | " NaN | \n",
107 | " 1.0 | \n",
108 | " 1.666196e+09 | \n",
109 | " Warning: penny boards will make you a faggot | \n",
110 | "
\n",
111 | " \n",
112 | " 1 | \n",
113 | " 853718218 | \n",
114 | " True | \n",
115 | " golden | \n",
116 | " 92 | \n",
117 | " NaN | \n",
118 | " The tweet contains hate speech | \n",
119 | " 0.7227 | \n",
120 | " NaN | \n",
121 | " True | \n",
122 | " NaN | \n",
123 | " 0.0 | \n",
124 | " 615561723.0 | \n",
125 | " golden | \n",
126 | " NaN | \n",
127 | " The tweet contains hate speech | \n",
128 | " The tweet contains hate speech\\nThe tweet uses... | \n",
129 | " NaN | \n",
130 | " 1.0 | \n",
131 | " 4.295121e+08 | \n",
132 | " Fuck dykes | \n",
133 | "
\n",
134 | " \n",
135 | " 2 | \n",
136 | " 853718219 | \n",
137 | " True | \n",
138 | " golden | \n",
139 | " 86 | \n",
140 | " NaN | \n",
141 | " The tweet contains hate speech | \n",
142 | " 0.5229 | \n",
143 | " NaN | \n",
144 | " True | \n",
145 | " NaN | \n",
146 | " 0.0 | \n",
147 | " 615562039.0 | \n",
148 | " golden | \n",
149 | " NaN | \n",
150 | " The tweet contains hate speech | \n",
151 | " The tweet contains hate speech\\nThe tweet uses... | \n",
152 | " NaN | \n",
153 | " 1.0 | \n",
154 | " 3.956238e+08 | \n",
155 | " @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon... | \n",
156 | "
\n",
157 | " \n",
158 | " 3 | \n",
159 | " 853718220 | \n",
160 | " True | \n",
161 | " golden | \n",
162 | " 98 | \n",
163 | " NaN | \n",
164 | " The tweet contains hate speech | \n",
165 | " 0.5184 | \n",
166 | " NaN | \n",
167 | " True | \n",
168 | " NaN | \n",
169 | " 0.0 | \n",
170 | " 615562068.0 | \n",
171 | " golden | \n",
172 | " NaN | \n",
173 | " The tweet contains hate speech | \n",
174 | " The tweet contains hate speech\\nThe tweet uses... | \n",
175 | " NaN | \n",
176 | " 1.0 | \n",
177 | " 4.975147e+08 | \n",
178 | " \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill... | \n",
179 | "
\n",
180 | " \n",
181 | " 4 | \n",
182 | " 853718221 | \n",
183 | " True | \n",
184 | " golden | \n",
185 | " 88 | \n",
186 | " NaN | \n",
187 | " The tweet uses offensive language but not hate... | \n",
188 | " 0.5185 | \n",
189 | " NaN | \n",
190 | " True | \n",
191 | " NaN | \n",
192 | " 0.0 | \n",
193 | " 615562488.0 | \n",
194 | " golden | \n",
195 | " NaN | \n",
196 | " The tweet contains hate speech | \n",
197 | " The tweet contains hate speech\\nThe tweet uses... | \n",
198 | " NaN | \n",
199 | " 1.0 | \n",
200 | " 5.889236e+08 | \n",
201 | " @Zhugstubble You heard me bitch but any way I'... | \n",
202 | "
\n",
203 | " \n",
204 | "
\n",
205 | "
"
206 | ],
207 | "text/plain": [
208 | " _unit_id _golden _unit_state _trusted_judgments _last_judgment_at \\\n",
209 | "0 853718217 True golden 86 NaN \n",
210 | "1 853718218 True golden 92 NaN \n",
211 | "2 853718219 True golden 86 NaN \n",
212 | "3 853718220 True golden 98 NaN \n",
213 | "4 853718221 True golden 88 NaN \n",
214 | "\n",
215 | " does_this_tweet_contain_hate_speech \\\n",
216 | "0 The tweet uses offensive language but not hate... \n",
217 | "1 The tweet contains hate speech \n",
218 | "2 The tweet contains hate speech \n",
219 | "3 The tweet contains hate speech \n",
220 | "4 The tweet uses offensive language but not hate... \n",
221 | "\n",
222 | " does_this_tweet_contain_hate_speech:confidence _created_at orig__golden \\\n",
223 | "0 0.6013 NaN True \n",
224 | "1 0.7227 NaN True \n",
225 | "2 0.5229 NaN True \n",
226 | "3 0.5184 NaN True \n",
227 | "4 0.5185 NaN True \n",
228 | "\n",
229 | " orig__last_judgment_at orig__trusted_judgments orig__unit_id \\\n",
230 | "0 NaN 0.0 615561535.0 \n",
231 | "1 NaN 0.0 615561723.0 \n",
232 | "2 NaN 0.0 615562039.0 \n",
233 | "3 NaN 0.0 615562068.0 \n",
234 | "4 NaN 0.0 615562488.0 \n",
235 | "\n",
236 | " orig__unit_state _updated_at orig_does_this_tweet_contain_hate_speech \\\n",
237 | "0 golden NaN The tweet contains hate speech \n",
238 | "1 golden NaN The tweet contains hate speech \n",
239 | "2 golden NaN The tweet contains hate speech \n",
240 | "3 golden NaN The tweet contains hate speech \n",
241 | "4 golden NaN The tweet contains hate speech \n",
242 | "\n",
243 | " does_this_tweet_contain_hate_speech_gold \\\n",
244 | "0 The tweet contains hate speech\\nThe tweet uses... \n",
245 | "1 The tweet contains hate speech\\nThe tweet uses... \n",
246 | "2 The tweet contains hate speech\\nThe tweet uses... \n",
247 | "3 The tweet contains hate speech\\nThe tweet uses... \n",
248 | "4 The tweet contains hate speech\\nThe tweet uses... \n",
249 | "\n",
250 | " does_this_tweet_contain_hate_speech_gold_reason \\\n",
251 | "0 NaN \n",
252 | "1 NaN \n",
253 | "2 NaN \n",
254 | "3 NaN \n",
255 | "4 NaN \n",
256 | "\n",
257 | " does_this_tweet_contain_hate_speechconfidence tweet_id \\\n",
258 | "0 1.0 1.666196e+09 \n",
259 | "1 1.0 4.295121e+08 \n",
260 | "2 1.0 3.956238e+08 \n",
261 | "3 1.0 4.975147e+08 \n",
262 | "4 1.0 5.889236e+08 \n",
263 | "\n",
264 | " tweet_text \n",
265 | "0 Warning: penny boards will make you a faggot \n",
266 | "1 Fuck dykes \n",
267 | "2 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon... \n",
268 | "3 \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill... \n",
269 | "4 @Zhugstubble You heard me bitch but any way I'... "
270 | ]
271 | },
272 | "execution_count": 3,
273 | "metadata": {},
274 | "output_type": "execute_result"
275 | }
276 | ],
277 | "source": [
278 | "df.head()"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 4,
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "data": {
288 | "text/html": [
289 | "\n",
290 | "\n",
303 | "
\n",
304 | " \n",
305 | " \n",
306 | " | \n",
307 | " _unit_id | \n",
308 | " _trusted_judgments | \n",
309 | " does_this_tweet_contain_hate_speech:confidence | \n",
310 | " _created_at | \n",
311 | " orig__last_judgment_at | \n",
312 | " orig__trusted_judgments | \n",
313 | " orig__unit_id | \n",
314 | " _updated_at | \n",
315 | " does_this_tweet_contain_hate_speech_gold_reason | \n",
316 | " does_this_tweet_contain_hate_speechconfidence | \n",
317 | " tweet_id | \n",
318 | "
\n",
319 | " \n",
320 | " \n",
321 | " \n",
322 | " count | \n",
323 | " 1.450900e+04 | \n",
324 | " 14509.000000 | \n",
325 | " 14509.000000 | \n",
326 | " 0.0 | \n",
327 | " 0.0 | \n",
328 | " 67.0 | \n",
329 | " 6.700000e+01 | \n",
330 | " 0.0 | \n",
331 | " 0.0 | \n",
332 | " 67.0 | \n",
333 | " 1.450900e+04 | \n",
334 | "
\n",
335 | " \n",
336 | " mean | \n",
337 | " 8.537266e+08 | \n",
338 | " 3.406989 | \n",
339 | " 0.865844 | \n",
340 | " NaN | \n",
341 | " NaN | \n",
342 | " 0.0 | \n",
343 | " 6.155623e+08 | \n",
344 | " NaN | \n",
345 | " NaN | \n",
346 | " 1.0 | \n",
347 | " 6.762014e+17 | \n",
348 | "
\n",
349 | " \n",
350 | " std | \n",
351 | " 4.205642e+03 | \n",
352 | " 5.979699 | \n",
353 | " 0.178734 | \n",
354 | " NaN | \n",
355 | " NaN | \n",
356 | " 0.0 | \n",
357 | " 1.089591e+03 | \n",
358 | " NaN | \n",
359 | " NaN | \n",
360 | " 0.0 | \n",
361 | " 4.606417e+16 | \n",
362 | "
\n",
363 | " \n",
364 | " min | \n",
365 | " 8.537182e+08 | \n",
366 | " 3.000000 | \n",
367 | " 0.333300 | \n",
368 | " NaN | \n",
369 | " NaN | \n",
370 | " 0.0 | \n",
371 | " 6.155611e+08 | \n",
372 | " NaN | \n",
373 | " NaN | \n",
374 | " 1.0 | \n",
375 | " 2.423319e+07 | \n",
376 | "
\n",
377 | " \n",
378 | " 25% | \n",
379 | " 8.537230e+08 | \n",
380 | " 3.000000 | \n",
381 | " 0.668400 | \n",
382 | " NaN | \n",
383 | " NaN | \n",
384 | " 0.0 | \n",
385 | " 6.155612e+08 | \n",
386 | " NaN | \n",
387 | " NaN | \n",
388 | " 1.0 | \n",
389 | " 6.790000e+17 | \n",
390 | "
\n",
391 | " \n",
392 | " 50% | \n",
393 | " 8.537266e+08 | \n",
394 | " 3.000000 | \n",
395 | " 1.000000 | \n",
396 | " NaN | \n",
397 | " NaN | \n",
398 | " 0.0 | \n",
399 | " 6.155622e+08 | \n",
400 | " NaN | \n",
401 | " NaN | \n",
402 | " 1.0 | \n",
403 | " 6.790000e+17 | \n",
404 | "
\n",
405 | " \n",
406 | " 75% | \n",
407 | " 8.537303e+08 | \n",
408 | " 3.000000 | \n",
409 | " 1.000000 | \n",
410 | " NaN | \n",
411 | " NaN | \n",
412 | " 0.0 | \n",
413 | " 6.155625e+08 | \n",
414 | " NaN | \n",
415 | " NaN | \n",
416 | " 1.0 | \n",
417 | " 6.800000e+17 | \n",
418 | "
\n",
419 | " \n",
420 | " max | \n",
421 | " 8.537339e+08 | \n",
422 | " 98.000000 | \n",
423 | " 1.000000 | \n",
424 | " NaN | \n",
425 | " NaN | \n",
426 | " 0.0 | \n",
427 | " 6.155658e+08 | \n",
428 | " NaN | \n",
429 | " NaN | \n",
430 | " 1.0 | \n",
431 | " 6.800000e+17 | \n",
432 | "
\n",
433 | " \n",
434 | "
\n",
435 | "
"
436 | ],
437 | "text/plain": [
438 | " _unit_id _trusted_judgments \\\n",
439 | "count 1.450900e+04 14509.000000 \n",
440 | "mean 8.537266e+08 3.406989 \n",
441 | "std 4.205642e+03 5.979699 \n",
442 | "min 8.537182e+08 3.000000 \n",
443 | "25% 8.537230e+08 3.000000 \n",
444 | "50% 8.537266e+08 3.000000 \n",
445 | "75% 8.537303e+08 3.000000 \n",
446 | "max 8.537339e+08 98.000000 \n",
447 | "\n",
448 | " does_this_tweet_contain_hate_speech:confidence _created_at \\\n",
449 | "count 14509.000000 0.0 \n",
450 | "mean 0.865844 NaN \n",
451 | "std 0.178734 NaN \n",
452 | "min 0.333300 NaN \n",
453 | "25% 0.668400 NaN \n",
454 | "50% 1.000000 NaN \n",
455 | "75% 1.000000 NaN \n",
456 | "max 1.000000 NaN \n",
457 | "\n",
458 | " orig__last_judgment_at orig__trusted_judgments orig__unit_id \\\n",
459 | "count 0.0 67.0 6.700000e+01 \n",
460 | "mean NaN 0.0 6.155623e+08 \n",
461 | "std NaN 0.0 1.089591e+03 \n",
462 | "min NaN 0.0 6.155611e+08 \n",
463 | "25% NaN 0.0 6.155612e+08 \n",
464 | "50% NaN 0.0 6.155622e+08 \n",
465 | "75% NaN 0.0 6.155625e+08 \n",
466 | "max NaN 0.0 6.155658e+08 \n",
467 | "\n",
468 | " _updated_at does_this_tweet_contain_hate_speech_gold_reason \\\n",
469 | "count 0.0 0.0 \n",
470 | "mean NaN NaN \n",
471 | "std NaN NaN \n",
472 | "min NaN NaN \n",
473 | "25% NaN NaN \n",
474 | "50% NaN NaN \n",
475 | "75% NaN NaN \n",
476 | "max NaN NaN \n",
477 | "\n",
478 | " does_this_tweet_contain_hate_speechconfidence tweet_id \n",
479 | "count 67.0 1.450900e+04 \n",
480 | "mean 1.0 6.762014e+17 \n",
481 | "std 0.0 4.606417e+16 \n",
482 | "min 1.0 2.423319e+07 \n",
483 | "25% 1.0 6.790000e+17 \n",
484 | "50% 1.0 6.790000e+17 \n",
485 | "75% 1.0 6.800000e+17 \n",
486 | "max 1.0 6.800000e+17 "
487 | ]
488 | },
489 | "execution_count": 4,
490 | "metadata": {},
491 | "output_type": "execute_result"
492 | }
493 | ],
494 | "source": [
495 | "df.describe()"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 5,
501 | "metadata": {
502 | "collapsed": true
503 | },
504 | "outputs": [],
505 | "source": [
506 | "data_path = 'twitter-hate-speech.csv'\n",
507 | "\n",
508 | "df = pd.read_csv(data_path, encoding='latin1')\n",
509 | "df = df.rename(columns={'does_this_tweet_contain_hate_speech': 'label', \n",
510 | " 'does_this_tweet_contain_hate_speech:confidence': 'confidence' })\n",
511 | "\n",
512 | "mapping = {'The tweet is not offensive': 'Not offensive', \n",
513 | " 'The tweet uses offensive language but not hate speech': 'Offensive',\n",
514 | " 'The tweet contains hate speech': 'Hate speech'\n",
515 | " }\n",
516 | "df['label'] = df['label'].map(lambda x: mapping[x])"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": 6,
522 | "metadata": {
523 | "collapsed": true
524 | },
525 | "outputs": [],
526 | "source": [
527 | "text = df['tweet_text']"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 7,
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "data": {
537 | "text/plain": [
538 | "0 Warning: penny boards will make you a faggot\n",
539 | "1 Fuck dykes\n",
540 | "2 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...\n",
541 | "3 \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill...\n",
542 | "4 @Zhugstubble You heard me bitch but any way I'...\n",
543 | "5 @elaynay your a dirty terrorist and your relig...\n",
544 | "6 RT @ivanrabago_: @_WhitePonyJr_ looking like f...\n",
545 | "7 Well I thought you knew actually RT @KingHorse...\n",
546 | "8 @Stonisnipezz I know. It was a joke, faggot.\n",
547 | "9 I'm tired of people saying I look like my brot...\n",
548 | "Name: tweet_text, dtype: object"
549 | ]
550 | },
551 | "execution_count": 7,
552 | "metadata": {},
553 | "output_type": "execute_result"
554 | }
555 | ],
556 | "source": [
557 | "text[:10]"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 8,
563 | "metadata": {
564 | "collapsed": true
565 | },
566 | "outputs": [],
567 | "source": [
568 | "def remove_handles(content):  # strip @handles (letters, digits, underscores) and collapse runs of whitespace\n",
569 | " return ' '.join(re.sub(\"(@[A-Za-z0-9_]+)\",\" \",content).split())"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 9,
575 | "metadata": {},
576 | "outputs": [
577 | {
578 | "data": {
579 | "text/plain": [
580 | "0 Warning: penny boards will make you a faggot\n",
581 | "1 Fuck dykes\n",
582 | "2 __ _chulo at least i dont look like jefree sta...\n",
583 | "3 \" : \" : Is a fag\" jackie jealous\" Neeeee\n",
584 | "4 You heard me bitch but any way I'm back th tex...\n",
585 | "5 your a dirty terrorist and your religion is a ...\n",
586 | "6 RT _: @_WhitePonyJr_ looking like faggots?\n",
587 | "7 Well I thought you knew actually RT : Man why ...\n",
588 | "8 I know. It was a joke, faggot.\n",
589 | "9 I'm tired of people saying I look like my brot...\n",
590 | "Name: tweet_text, dtype: object"
591 | ]
592 | },
593 | "execution_count": 9,
594 | "metadata": {},
595 | "output_type": "execute_result"
596 | }
597 | ],
598 | "source": [
599 | "text.apply(remove_handles)[:10]"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 10,
605 | "metadata": {
606 | "collapsed": true
607 | },
608 | "outputs": [],
609 | "source": [
610 | "data = df[~df['_golden']].dropna(axis=1)"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 11,
616 | "metadata": {},
617 | "outputs": [
618 | {
619 | "data": {
620 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEPCAYAAABcA4N7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XtYVNX+BvB3BmYYBhQhEK9HAUW7qBAokqSJmoUmpHA0\nDDUzhczMynshGWYXNbM8VmidJD2maJnpTysrLwmm5iVBTGU0FVTuCMNcmFm/PzzOcUJGUGYj+n56\neB5Ze89e3xmaeWff1pIJIQSIiIhqIG/oAoiI6PbGoCAiIpsYFEREZBODgoiIbGJQEBGRTQwKIiKy\nybGhC6hv+fmXG7oEIqJGx8urSY3LuEdBREQ2MSiIiMgmBgUREdnEoCAiIpsYFEREZBODgoiIbGJQ\nEBGRTQwKIiKyiUFBREQ2Ncid2UeOHMHzzz+P3bt3X3f5d999h/fffx+FhYUICQnBvHnz4OnpKXGV\n9rfs+9/w2R86u/ez75Xedu/jbuSy1AdOMNpt+wJA8cRzdtv+3e7xLeHQw37vv5i2sUjo8oLdti8l\nSfcohBBIS0vD2LFjYTRe/w2WnZ2NOXPmYNGiRcjIyICnpydmzpwpZZmS6L5wpyQhcbUvql/3LG0D\nZxjhANjtxxGA59I2cNieKN0TuwscK85C+JaH7BoSALDu7GqEb3nIrn1IRdKg+Pjjj7Fy5UrEx8fX\nuM6mTZvQr18/dOvWDSqVCq+++ip27dqFgoICCSu986z49XhDl3DHaLrUD3IAMgn6kgFwz/5Mgp7u\nHhPTx0na35qTqZL2Zw+SBsWwYcOwceNGdOnSpcZ1cnJy0KFDB8vv7u7ucHNzg0ajkaJESXyfeV7y\nPj/OuCh5n3cqBfQNXQI1Ip/9mdLQJdwySYOiefPmkMlsfw+rrKyESqWyanN2dkZlZaU9S5NUmK9H\nQ5dAt0A0dAHUqLR2btPQJdyy2+6qJ5VKBZ3O+thhZWUl1Gp1A1VU/9TOzpL3yRPa9ado9CHJwkIA\nMEnU192iW7MASftb/ggPPdU7Pz8/q8NMRUVFKC0thZ+fXwNWVf+k/OD+bmzNh/roJrh6oqjzGJhx\n5YPcnj9V4JVP9e39h/4FH7WvJH39u9d/4CBzkKQve7rtJi4aPHgwnn76aQwbNgxdunTBokWL0Lt3\nb7i7uzd0afWO3/IbL3O/ZBT2S27oMugmrXjky4YuoVG5LYIiMfHK5X9z587FvffeizfffBOzZ89G\nfn4+goODMX/+/AaukIjo7iUTQtxR5+Y4FSoRUd1xKlQiIrppDAoiIrKJQUFERDYxKIiIyCYGBRER\n2cSgICIimxgURERkE4OCiIhsYlAQEZFNDAoiIrKJQUFERDYxKIiIyCYGBRER2cSgICIimxgURERk\nE4OCiIhsYlAQEZFNDAoiIrJJ0qDIyspCdHQ0AgICEBkZiUOHDl13vS+++ALh4eEIDg7GpEmTUFBQ\nIGWZRER0DcmCQq/XIz4+HkOHDsW+ffsQFxeHhIQEVFRUWK23ZcsWLF26FAsXLkR6ejo6dOiAhIQE\nqcokIqK/kSwoMjIyIJfLERsbC4VCgejoaHh6emLHjh1W633//ff45z//icDAQCgUCkyaNAknT57E\n8ePHpSqViIiuIVlQaDQa+Pn5WbX5+PggJyfHqs1sNkOlUll+l8lkkMlkOHPmjCR1EhGRNcmCQqvV\nwtnZ2apNpVJBp9NZtYWHh2Pt2rXIzs6GwWDA0qVLodPpoNfrpSqViIiu4ShVR87OztVCQafTQa1W\nW7VFRUXh0qVLeP7552E0GhEdHQ0/Pz80bdpUqlKJiOgaku1R+Pr6QqPRWLVpNBp06NDBqu3SpUuI\niIjATz/9hF27duGZZ57BmTNncO+990pVKhERXUOyoAgNDYXBYEBqaiqMRiPS0tJQUFCAsLAwq/X2\n7NmDCRMmoKioCOXl5UhOTkavXr3QvH
lzqUolIqJrSBYUSqUSKSkp2Lx5M3r06IEvv/wSy5Ytg1qt\nRmJiIhITEwEAkZGR6N27NyIiIhAeHg4hBN59912pyiQior+RCSFEQxdRn/LzLzd0CUREjY6XV5Ma\nl3EIDyIisolBQURENjEoiIjIJgYFERHZxKAgIiKbGBRERGQTg4KIiGxiUBARkU0MCiIisolBQURE\nNjEoiIjIJgYFERHZxKAgIiKbGBRERGQTg4KIiGxiUBARkU0MCiIisolBQURENjEoiIjIJkmDIisr\nC9HR0QgICEBkZCQOHTp03fXWrVuHfv36ISgoCCNGjMDRo0elLJOIiK4hWVDo9XrEx8dj6NCh2Ldv\nH+Li4pCQkICKigqr9bKzs7FgwQIsX74c+/btQ3h4OCZPnixVmURE9DeSBUVGRgbkcjliY2OhUCgQ\nHR0NT09P7Nixw2q9M2fOwGw2w2QyQQgBuVwOlUolVZlERPQ3jlJ1pNFo4OfnZ9Xm4+ODnJwcq7aw\nsDC0b98egwYNgoODA1xcXLBy5UqpyiQior+RbI9Cq9XC2dnZqk2lUkGn01m16fV6dOjQAWlpaTh4\n8CBGjx6NF154odp6REQkDcmCwtnZudqHvU6ng1qttmr76KOP0KJFC3Tp0gVOTk6YOHEijEYj9uzZ\nI1WpRER0DcmCwtfXFxqNxqpNo9GgQ4cOVm25ubkwGAyW32UyGRwcHODg4CBJnUREZE2yoAgNDYXB\nYEBqaiqMRiPS0tJQUFCAsLAwq/UeeeQRpKWlITMzE1VVVfj8889hMpkQFBQkValERHQNmRBCSNVZ\ndnY2kpKScPz4cbRr1w5JSUkICAhAYmIiAGDu3LkQQiAlJQVr1qxBWVkZ7r33Xrz++uvw9/evVR/5\n+Zft+RSIiO5IXl5NalwmaVBIgUFBRFR3toKiToeeCgsL8fHHH2P69OkoLCzEli1b8Oeff95ygURE\ndPuqdVBkZWVh4MCB+OWXX7B582ZotVrs2bMHMTExSE9Pt2eNRETUgGodFPPnz8fo0aOxZs0aKBQK\nAEBycjLi4uKwYMECuxVIREQNq9ZBkZmZiSFDhlRrHz58OE6dOlWvRRER0e2j1kHh5uaG3Nzcau2Z\nmZnw8PCo16KIiOj2UeugeOqpp5CYmIht27YBAI4fP45Vq1YhKSkJw4cPt1uBRETUsOp0eeyqVauw\nfPly5OXlAQA8PT0xbtw4jB49GjKZzG5F1gUvjyUiqrt6v49Cq9XCZDJBJpPB1dX1loqrbwwKIqK6\nq5f7KIqKihAfH48lS5ZArVajSZMmeOyxxzBx4kSUlpbWS6FERHT7qXVQJCUloby8HIMGDbK0rVix\nAmVlZZg3b55diiMiooZX60NPwcHB+Oqrr6pNPnT8+HGMGjUKe/futUuBdcVDT0REdVcvh56cnJxQ\nVFRUrf3vc14TEdGdpdZBERERgddeew27du1CcXExiouLsWfPHsyZMwePPfaYPWskIqIGVOs5s6dO\nnYqysjIkJCTAZDIBAORyOaKjozFjxgy7FUhERA2rzpfHlpeXQ6PRQKFQoG3btnBxcbFXbTeF5yiI\niOrO1jmKWu9RAEBJSQmOHz+OqqoqCCFQUFBgWfb3meqIiOjOUOug2LBhA5KSkqzms75KJpPh2LFj\n9VoYERHdHmp96OmRRx5B//798dJLL912d2Nfi4eeiIjqrl4ujy0uLsaYMWNuKSSysrIQHR2NgIAA\nREZG4tChQ9XWSUxMRGBgoOUnICAAnTp1wqZNm266XyIiunm1DoqHHnoIe/bsuemO9Ho94uPjMXTo\nUOzbtw9xcXFISEiodh/G3LlzcfDgQcvPM888gx49evASXCKiBlLrcxT3338/5s2bh59++gk+Pj6W\nWe6uevnll20+PiMjA3K5HLGxsQCA6OhofPHFF9ixYwciIiKu+5ijR48iNTUVmzZtqtYfERFJo9ZB\nsX
fvXnTt2hUVFRU4evSo1bLaDDGu0WiqDf/h4+ODnJycGh8zf/58jB8/Hi1btqxtmUREVM9qHRSp\nqam31JFWq4Wzs7NVm0qlgk6nu+76Bw4cwMmTJ/Hpp5/eUr9ERHRr6nQfRWFhIdatW4fTp09j6tSp\n2Lt3Lzp27IiOHTve8LHOzs7VQkGn00GtVl93/Q0bNmDIkCG33Q19RDfNpIfzH19AeXYHTK6tUNnt\nOZg8/Bu6KqIbqnVQZGVlYdSoUejQoQOOHj2KiRMn4tdff8XMmTPx8ccfIzQ01ObjfX198eWXX1q1\naTQaDB48+Lrr//zzz/joo49qW16jZxYCmkIt9p4pRoamCBcv6+GslOO+Fk0xMqgNHOQyyGQyGKtM\n2Jx1CfmXddBWmXFvc1dcNpigKdSiVVMnBLdthg5ertBXmdGiqRNUCoeGfmp3Jl0p3DY9DcWlg5Ym\nMxwggxkyCMvvclwZ7kYAkP335ypV1n9w9dp0GWT/Xet/TC4tcPnxFajy7ma3p0HWLmjzsOiPd3C8\nLBsKmSN6e/dFj+YPwbepL5o7e8NgMuBCZS6aO7fA+Ypz2HL2W1SaKhHs2R1/lf+F89qzCLgnCANb\nR8BRXqfv4be1Wt9HERcXhx49emDSpEkIDAzEt99+i7Zt22LBggVIT0/H+vXrbT7eYDCgX79+GD9+\nPEaMGIGNGzdi4cKF2L59e7W9irNnz+Lxxx/H77//DqVSWacn1Bjvo/jmSB4W/HwK+ipzvW5XrZBh\nUm8/RAe0qtft3vWEGZ4fd4DMXP3m03rvCkDx0I0wtQyye193k2J9Ef5z6kv8XrgfHZv6w9PJEz+c\n34ZL+os1PkYpc0KVMMIMM2SQQaDmj87ObvfhX72W26N0u6mXITwyMzORnJxcrX348OHV9hSuR6lU\nIiUlBUlJSVi0aBHatWuHZcuWQa1WIzExEcCVS2MB4Pz583Bzc6tzSDQmZiEQl3oAf+Zr7daH1ijw\nzvaT8PN0QWAbN7v1c7dRHVgqSUgAV/ZAmux8DSXD/0+S/u4GlVVajN4xAuVV5QCAnMsna/U4g9Bb\n/m0rJAAguzQLr++biaTgZDjIGv9efa2Dws3NDbm5uWjXrp1Ve2ZmJjw8PGq1jc6dO2PNmjXV2q8G\nxFU9e/bEr7/+WtvSGqUR/94HTdH1T+TXt5e/PoqfJ/WSpK+7gSprtaT9ORb8IWl/d7qvclZbQsKe\nfs3fgSe2DcCWx36ye1/2Vusb7p566ikkJiZi27ZtAK7MbLdq1SokJSVh+PDhdivwTmQWQrKQAIBy\ngwmGej6sdTcTDk4NXQLdgq9Pp0nWl86sw/qcryTrz15qvUcxfvx4uLi44O2330ZlZSVeeOEFeHp6\nIj4+HqNHj7ZnjXccc50Gdq8fB8+XIqSdu/Qd34GMHp2hLKnd4Qq6/VyuKpO0vy9OfoZhvo37y3Sd\nTsuPHDkSI0eOhFarhclkQpMmNZ/8oJo5ym98g2J9a4Au71jKv3Y0dAnUiJjNjX9v3mZQ7N69u9Yb\n4nwUt7fANs0auoQ7hkNV47uyjhpOM4fG/96zGRTjxo2z+l0mk0EIAWdnZzg6OuLy5ctwcHBA06ZN\nkZ6ebtdC6dYUluvh3VTV0GUQ3XXyjZcauoRbZjMosrOzLf/esGED1q5di3nz5lnGbDp79ixmzZqF\nvn372rfKO4ypAU5SGP47zzkRSasBTknWu1pf9bRw4UIkJSVZDezXtm1bzJ49G5988oldirtTOTTA\nCQMvV+5N1JcqjwcaugRqRLp6BDR0Cbes1kFRVVWF0tLSau0XL16Eg0Pjv6HkTqZ0AIfyqEe6+/4p\naX9mZeM/xn078VDU7r6v+vKkT7Sk/dlDrYNi6NChmD59OtLS0nD06FH88ccfWLVqFWbOnGmZY4Jq\nb/GT90vW16wBnSTr625glnggP73/EEn7u9O9GfSOZH21cm6Nnl62
x8FrDGo91pPJZMKSJUuwbt06\nFBUVAQC8vLwwatQoPPfcc3Ytsi4a01hP54q1GPHFAehN9juKOej+5kh6rLPdtn9XEgJuX0dDmbfX\n7l2ZFa4oHr4VZrf2du/rbjJmRyz+qjht1z5aq9tgWa/P4Kq4+emjpWRrrKdaB8W1ioqKIJPJ4O5+\n+93A1ZiC4qrcUh0Ony+Fs0KOi5f12Jqdj8IKA1ydHNCtlRseu7c5urZqit/OlCAl/TTOlejQupkK\n3q5OOFmgRZHWgHKDyXKSXC4D2jRTIS64LaK6ctInuzBWwjkzFYrzGXC8dBhy7UXIAFQ5e0Hv+xic\nzmyHQ0U+hEwGyB2BKgNkqAIAyzixwskdZqUrYAZk+hLIqy5blpmVzaC/NxqVXZ6B2a1dzXXQTRFC\n4ONjH+L/zm6BzqSFCWYI1P5+B9l//1PJVagSJphEFWQyGeQyBzR38kJU+xgM/kcklA6NZ7y6mw6K\nr776Ck8++SSUSiW++sr2bei3yzAejTEo6oMQAqcKtWjq5IjmTTjEhNTk5bmQ6ctg8ugE2JrxUZjh\nUJgN4XwPzC7e1tuouAhZZSFM93QGZLU+KkxUL246KMLDw7F+/Xq4u7sjPDy85o3IZNi+ffutVVlP\n7tagICK6FTc9zPj48eMtQ33/9FPjHwGRiIjqzub+7fz581FSUgIAuPfeey0nsYmI6O5hc4+idevW\nmDRpEjp16gQhBJKTk+HkdP3j3/Pnz7dLgURE1LBs7lF8+OGH6NKlCwyGK7N56fX6Gn+IiOjOVKc5\ns5cuXYqmTZvau6ZbwpPZRER1V2/3UZhMJuTl5aGqqgp/f5iPj8/NV1iPGBRERHV301c9XWvnzp2Y\nNWsWCgsLLSFxddhxmUyGY8eO3XAbWVlZSExMxMmTJ9GuXTu88cYbCAioPmDW/v37MW/ePJw+fRpt\n2rTBrFmzEBra+G+DJyJqjGq9RzFw4EB06tQJEydOhKtr9VvSW7dubfPxer0eAwYMQHx8PGJiYrBx\n40YsXLgQP/74I1xcXCzrXbx4EYMHD0ZycjIeffRRbN68GUlJSdi9ezdUqhuPgMo9CiKiuquXPYq8\nvDwsX74cbdu2vakiMjIyIJfLLQMIRkdH44svvsCOHTsQERFhWW/jxo146KGHMHDgQADA4MGD4ePj\nA7mcd6oSETWEWn/6duvWDZmZmTfdkUajsZrLArhyXiMnJ8eqLTMzE97e3pg4cSJCQkIwfPhwmEwm\ny41/REQkrVrvUQwcOBBz5szB/v370b59eygUCqvlNxrrSavVwtnZ2apNpVJBp9NZtZWWlmLnzp34\n8MMPsXjxYqxduxbjx4/Htm3b4ObmVttyiYiontQ6KD777DO4uLhcdygPmUx2w6BwdnauFgo6nQ5q\ntdqqTalUonfv3ggLCwMAjBw5EitWrMDvv//OKVeJiBpArYPiVsd68vX1xZdffmnVptFoMHjwYKs2\nHx8f/PXXX1ZtZrO52uW4REQkjTqdIRZC4Oeff0ZKSgo++eQT/PDDD7W+Kzs0NBQGgwGpqakwGo1I\nS0tDQUGBZc/hqsjISOzevRu//PILzGYzUlNTodfrERISUpdSiYiontT68ti8vDxMmDABZ8+ehY+P\nD0wmE86cOQNvb2+sXLkS3t7eN9xGdnY2kpKScPz4cbRr1w5JSUkICAhAYmIiAGDu3LkAgN27d2PB\nggU4c+YMfHx8MGfOHHTr1q1WT4iXxxIR1V293Jn9/PPPQ6/XY+HChWjW7Mpk70VFRXj11VfRpEkT\nfPDBB/VT7S1iUBAR1Z2toKj1oaf09HRMmzbNEhIA4OHhgWnTpuHXX3+9tQqJiOi2VeugcHV1rXbV\nEgBUVlbyZjgiojtYrT/hBwwYgDfeeAMnTpywtB0/fhxz585Fv3797FIcERE1vFqfoygvL8eLL76I\n9PR0y5hLOp0O/fr1w1tvvXXb
DD/OcxRERHVXL2M9ubq6olevXnjwwQfh5+cHpVKJlStXIjg4+LYJ\nCSIiqn+1DooFCxZg48aNeOONNxAeHg4AuHDhAj755BOUl5fjhRdesFuRRETUcGp96CksLAyLFy9G\ncHCwVXt6ejpmzJiBHTt22KXAuuKhJyKiuquXy2O1Wu11B+Xz8vJCWVnZzVVGRES3vVoHRc+ePbFg\nwQKrUCgvL8eSJUvQvXt3uxRHREQNr05DeIwZMwYXL160TF507tw5tGnTBv/6179uekKj+sZDT0RE\ndVcvQ3gAgMFgwJ49e3Dq1CkoFAq0b98eYWFht9UNdwwKIqK6q7egaAwYFEREdVcvJ7OJiOjuxKAg\nIiKbGBRERGQTg4KIiGxiUBARkU0MCiIisknSoMjKykJ0dDQCAgIQGRmJQ4cOXXe9CRMmoGvXrggM\nDLT8EBFRw5AsKPR6PeLj4zF06FDs27cPcXFxSEhIQEVFRbV1s7KysGrVKhw8eNDyQ0REDUOyoMjI\nyIBcLkdsbCwUCgWio6Ph6elZbdTZwsJCFBUVwd/fX6rSiIjIBsmCQqPRwM/Pz6rNx8cHOTk5Vm1Z\nWVlwcXHBhAkT0LNnT4wYMYJ7FEREDUiyoNBqtXB2drZqU6lU0Ol0Vm16vR4BAQGYPXs2du7ciSFD\nhuC5555Dfn6+VKUSEdE1JAsKZ2fnaqGg0+mgVqut2vr3749PP/0UHTt2hFKpRGxsLFq2bIm9e/dK\nVSoREV1DsqDw9fWFRqOxatNoNOjQoYNV29atW7FlyxarNr1eDycnJ7vXSERE1UkWFKGhoTAYDEhN\nTYXRaERaWhoKCgoQFhZmtZ5Wq8W8efNw8uRJGI1GLF++HDqdDr169ZKqVCIiuoajVB0plUqkpKQg\nKSkJixYtQrt27bBs2TKo1WokJiYCAObOnYuhQ4ciPz8f48aNQ0lJCe677z6kpKRUO0RFRETS4HwU\nRETE+SiIiOjmMSiIiMgmBgUREdnEoCAiIpsYFEREZBODgoiIbGJQEBGRTQwKIiKyiUFBREQ2MSiI\niMgmBgUREdnEoCAiIpsYFEREZBODgoiIbGJQEBGRTQwKIiKyiUFBREQ2MSiIiMgmBgUREdkkaVBk\nZWUhOjoaAQEBiIyMxKFDh2yun56ejs6dO6OiokKiComI6O8kCwq9Xo/4+HgMHToU+/btQ1xcHBIS\nEmoMgdLSUsyaNQtCCKlKJCKi65AsKDIyMiCXyxEbGwuFQoHo6Gh4enpix44d110/KSkJERERUpVH\nREQ1kCwoNBoN/Pz8rNp8fHyQk5NTbd1vv/0WZWVleOqpp6Qqj4iIauAoVUdarRbOzs5WbSqVCjqd\nzqotNzcXH3zwAVavXg2j0ShVeUREVAPJ9iicnZ2rhYJOp4Narbb8bjabMX36dEyZMgXe3t5SlUZE\nRDZIFhS+vr7QaDRWbRqNBh06dLD8fuHCBRw+fBhJSUkIDg7GkCFDAAB9+vTB/v37pSqViIiuIdmh\np9DQUBgMBqSmpmLEiBHYuHEjCgoKEBYWZlmnVatWOHLkiOX3c+fOoV+/ftixYwdcXFykKpWIiK4h\n2R6FUqlESkoKNm/ejB49euDLL7/EsmXLoFarkZiYiMTERKlKISKiOpCJO+xGhfz8yw1dAhFRo+Pl\n1aTGZRzCg4iIbGJQEBGRTQwKIiKyiUFBREQ2MSiI6luVDorcDDiUVB+ehqgxkuw+CqpZbqkOaqUD\nzhRpsTnrIrQGE7q0bIqurZrAQ62EwSTQsqkTHB3+l+vnSyvRxMkRWoMJTo5yOMhluKyvwumiSvyQ\nfQkAMLBzc4T6eDTU07orKc7uQtPvn4dcVwwA0LfuhbLHPwGcmsExbz9c9r4L+eVzqPLqCm3ABAhX\nb5hdW1ke71CQBZmxAlXeDwJyh4Z6GncVk7kKFyov4B6VJ1QOKlRWaVGkL0ILdUs4yG78Nyg1lK
DS\nVIkWzi0lqLZh8PLYBnS6SItZ32XhRL72huuqHOVo1dQJvp4u+DP/Mv4q1teqj9ZuKqx4KgD3uChv\ntVy6lkkP9d4FcNJsg1C5o6LrWDgWn4LL70shM1n/bcwyR5i87ofjpcOQXdMuAMgAGJt1gMxshPxy\nLuTC+N9lclR2Ho6K8HcBmQxkH7vyfsEHRxeiyFgItYMaXTy64WDBARiEAUq5EsPa/xPNVS1wn/v9\n6OjWCcX6IhwuPAh3Jw8IIfBVzir8VrAXAmZ0bNIJEf94Ajmlp6AzVyLgngcxoPVjcJQ3ju/jti6P\nZVA0oEGfZOBSucHu/fh7uWDVqCC793PXEALua/rBsejP/zUBsMfHudnBGYVjDwJKVzts/e51quwE\nXt8/HRd0F+zaTy/vh/Fm0Dt27aO+8D6K29DWYxclCQkA+DO/AlXmO+r7QINSnN9jFRKAfUICAOSm\nSrhtirPT1u9OBpMBU9JfsHtIAMCvF3fheMkxu/djbwyKBvLLiQJJ+3Pg0Yt645i3T9L+FBcOSNrf\nnW7LuU0oN0l35OFk2QnJ+rIXBkUDOVNc2dAl0M0SZok7lLq/O9u2s5sl7a+i6vrTPTcmDIoGonSQ\n9qU/drHxnLu53Zma+Uran3BQSdrfna5IXyhpf25KN0n7swcGRQOJuF/aiZnkvHKm3pia/kPS/ip6\nTJW0vzvdP1zbS9aXQqZAL+/ekvVnLwyKBjKsa0uoFdK8/GqFHJ29a76igerG3MwHUl0aUOXcHLoH\nJ0jU291hSpdpkEvw0eckV2Fe8AK4Khr/FWu8PLYBFZTrMef/juPA2RKY/vtXUDrIYDDV359E5SjH\nkmFdENim8e/+3k6a/N94qHK21Lj86l9QVsPvV9v+vp93dT3h4IzLvefBcG80IOP3ufp2tvwvvHMk\nGcdKsiD+ew6ohXNLPNPxOZwo+xPpl35FoS4fAgIOMkdUmrQQ13w98FDeg85u90Jn0qHMWIJW6rY4\nW/EXLukuwNu5JUb4jkS/Vo9C1oj25HkfRSNzNK8M6w7mwslRjuZNlKgyCxhNAgGtm+JhP0/kleqw\n+sA5OCsd8LDvPTicWwYXpQP6+3uhicoRQggcyS1DpdGEB9s0g9KRHzT1zlwF50MpUGWvBXSlkBvK\nIeQOqLrnPlQGjIPRdyDkxafgsncBzE5uqOz+0pWb9H7/CI6Fx2H0DoT2gTFQ//EZZPrLMLTuCShd\nYfJ6QPJzIEQAg4KIiG6AN9wREdFNkzQosrKyEB0djYCAAERGRuLQoUPV1hFC4IMPPkBYWBgCAwMR\nFxeHEyf8X6uFAAATCklEQVQa/w0rRESNlWRBodfrER8fj6FDh2Lfvn2Ii4tDQkICKiqsb0ZJS0vD\ntm3bsH79ehw4cADBwcGYNm2aVGUSEdHfSBYUGRkZkMvliI2NhUKhQHR0NDw9PbFjxw6r9aKjo5GW\nlgZvb29otVpcvnwZ7u7uUpVJRER/I9n4txqNBn5+flZtPj4+yMmxntxFJpNBrVZjw4YNmDVrFlxd\nXfHZZ59JVSYREf2NZHsUWq0Wzs7OVm0qlQo6ne666w8ePBhHjhxBQkICxo0bh5KSEinKJCKiv5Es\nKJydnauFgk6ng1qtvu76SqUSSqUSzz77LFxdXfHbb79JUSYREf2NZEHh6+sLjUZj1abRaNChQwer\ntiVLluD999+3/C6EgMFgQJMmHIKCiKghSBYUoaGhMBgMSE1NhdFoRFpaGgoKChAWFma1Xrdu3fCf\n//wH2dnZMBgM+Oijj+Dq6ooHH3xQqlKJiOgakt6ZnZ2djaSkJBw/fhzt2rVDUlISAgICkJiYCACY\nO3cuAGDNmjVISUnB5cuXERgYiNdffx1t2rSRqkwiIrrGHTeEBxER1S8O4UFERDYxKIiIyCYGBRER\n2cSgICIimxgURERkE4OCiIhsYlAQEZFNDAo7O3v2bEOXQH
RX43vw1jEobOjUqZPlrvFrhYeH4+ef\nf77h47dv344pU6bcVN//+te/EBQUhF69euHy5cuIjY1FQECA5e71+jZo0CDs3LnTLttuLNLT0zF6\n9Gg8+OCD6N69O0aOHIkff/zRsnz9+vUICQlB9+7dcf78eSQkJCAgIAAJCQl2qWfcuHH46quv7LLt\nxqJTp074888/q7WHhIRg7969N3z8rbwHbxd79+5FSEhIg9Yg2XwUjdXatWvRv39/9O7du86PLS0t\nhdlsvql+N2zYgJkzZyI6Ohr79+9HZmYm9uzZAxcXl5va3o1s3rzZLtttLDZt2oQ333wTU6dOxdKl\nS+Hk5IRffvkFiYmJOHfuHMaMGYNvv/0WsbGxmDx5MvLy8vDTTz/hxx9/RNu2be1S0/Lly+2y3bvJ\nrbwH6X+4R3EDMTExmDVrVo3zYRQUFOCVV15BSEgI+vTpg3fffRcGgwFHjhzBnDlzcOzYMfTq1eu6\nj/3uu+8QERGBoKAgjBgxAocPHwYADBw4EOfOncPcuXMxePBgjB07FjqdDmFhYTh48CBKSkowdepU\nhIaGIjw8HJ9++imujsQyY8YMJCcnIzY2FoGBgRg6dCgyMzMBAGVlZXj++efRo0cP9O3bF7Nnz4Ze\nrwfwv72k999/Hy+++KKlRiEEwsPDLTMRrl69Go8++ihCQkIwceJE5Ofn188L3YB0Oh2Sk5Mxd+5c\nxMTEwNXVFQqFAgMGDMCiRYuwYMECjBo1Cr/99htSUlIQHx+Pxx9/HAAwZMgQbNmyxbKNhx9+GGFh\nYXjnnXdgMBgAAB9++CFeffVVTJgwAYGBgYiIiMDu3bsBAAaDATNnzkRISAjCwsLw4osvori4GAAQ\nFxeHL7/8EmvXrsWwYcOsan766aexevVqAMD333+PwYMHIzg4GKNHj642SvOdLisrC2PGjEFYWBi6\ndeuGsWPHoqCg4LrvQVvvnb/797//jUceeQQhISEYOXIkjh49CuDK33PKlCkYNWoUAgICEBMTg2PH\njlket2/fPgwbNgzBwcGIiYnBkSNHLMtyc3MRHx+PkJAQPProo1i/fr1lWVlZGaZOnYrg4GCEhobi\n3XfftdQmhMDChQvx8MMPo2fPnlixYkW9v442CaqRv7+/yM7OFqNHjxaTJ0+2tPft21f89NNPQggh\nhg8fLl5++WVx+fJlceHCBTFs2DDx3nvvCSGEWL9+vXjyySevu+2dO3eKwMBA8dtvvwmj0SjWrVsn\ngoKCxKVLl6r1kZGRIXr06GF57HPPPSemTp0qKioqxNmzZ8WgQYNEWlqaEEKI6dOni+DgYHHs2DFR\nWVkpXnrpJTF27FghhBDvv/++eOGFF4RerxclJSUiMjJSrF271qq/kydPiq5du4ry8nIhhBD79u0T\nvXr1ElVVVWLLli2iT58+4s8//xQ6nU7Mnz9fjBw5st5e74ayZ88e8cADDwiDwXDd5X379hXr168X\nTz/9tEhNTRVCCHH27Fnh7+9veZ2SkpLEM888I4qKikRhYaF4+umnxQcffCCEEGLJkiXi/vvvF3v2\n7BF6vV68/fbb4tFHHxVCCLF27VoRExMjKioqhFarFc8++6xYvHixEEJY+istLRVdunQRZ86cEUII\nkZubK7p06SKKi4vF4cOHRVBQkNi/f78wGAzi888/FwMGDKjxuTQ2/v7+IjAwUAQFBVn9dOrUSWRk\nZAghhOjfv79YuXKlMJvNoqioSERHR4v3339fCFH9PWjrvXOt06dPi4CAAHHu3DlhNpvFkiVLRGxs\nrBDiyt+zU6dOYvPmzcJgMIgPP/xQPPLII0Kv14vz58+LwMBA8cMPPwij0Si2bNkievToIYqLi0VV\nVZV44oknxIIFC4RerxfHjh0TvXr1Eunp6UIIIV566SUxYcIEUVJSIgoKCkRERIRYs2aNyMjIEP7+\n/mLZsmWiqqpK/PLLL8
Lf31/k5eXZ++W34B7FDchkMsyfPx+7d+/Gpk2brJb99ddfOHjwIGbPng1X\nV1d4e3tj8uTJ+Prrr2+43W+//RZRUVHo3r07HB0dER0dDT8/P6tj4teTn5+PnTt3YubMmVCr1WjT\npg2effZZrFu3zrJOeHg4OnfuDJVKhYiICJw+fRoA4OTkhMzMTGzevBlGoxEbNmxATEyM1fb9/PzQ\nsWNHbN++HcCVvZ5BgwbBwcEBaWlpGDNmDDp27AgnJye8/PLLOHz4cKP/BltQUIBmzZpBoVBcd7mn\npycKCgpqfLwQAhs2bMCrr74Kd3d3eHh4YNKkSVi7dq1lnYCAAISGhkKpVOKJJ57AmTNnAFz5m5w5\ncwZff/01iouL8emnn2Ly5MlW22/atCn69u1rOTz43XffoXfv3mjWrBnS0tIQFRWFoKAgKBQKjBkz\nBlVVVbU6ft9YrFmzBvv377f6cXNzsyxfsWIFRo4cicrKSly8eBHu7u64ePFite3U5r1zlaOjI4xG\nI9auXYvs7GxMnDgRq1atsiwPDQ1FREQEFAoFEhISoNVq8fvvv+O7775DSEgI+vfvD0dHRzz++OPw\n9/fHtm3b8McffyAvLw9TpkyBUqlE586dMWLECKxbtw4GgwE//PADXnrpJbi5ueGee+7BsmXL0KdP\nHwCAQqHAuHHj4ODggD59+sDFxQXnzp2zw6t9fTxHUQstW7bEa6+9hrlz56J79+6W9sLCQqjVanh4\neFjaWrVqhYKCAhiNRpvbLCoqQufOna3aWrVqhQsXLth8XF5eHoQQGDBggKXNbDajWbNmlt+vrcfR\n0dGy+zp+/HgAwGeffYZZs2YhKCgIycnJaN++vVUfUVFR2LJlCyIiIrB161bLnOV5eXlYvHgxPvro\nI8u6MpkMubm58PHxsVn37czT0xOFhYUwGAxQKpXVlufm5sLT07PGxxcVFUGn0yEuLg4ymQzAlfAw\nGo2WQ3s1/U2GDBmC8vJybNiwAfPmzYO/vz/mzp2Lrl27WvURFRWFRYsWISEhAd999x1eeOEFAFf+\nJnv37sU333xjWddoNCIvL+8mX43G58iRI3juuedQUVGBTp06obS01Or1vqo2752rWrdujZSUFCxf\nvhz//ve/4ebmhsmTJ1sOAf7jH/+wrOvg4AAvLy8UFBQgNzcXu3btQnBwsGV5VVUVgoKC0KRJE5SX\nl6NHjx6WZSaTCffffz9KS0thNBrh7e1tWXa1jzNnzsDFxQWOjv/7uFYoFDCZTDfzct0UBkUtRUVF\nYfv27Zg1a5blTd6qVStotVoUFxfD3d0dAHDu3Dmb306vatmyJXJzc63azp07d8MJmry8vODo6Ig9\ne/ZYPtRKS0tRUVFxw+dw4sQJREZGIiEhARcvXsRbb72FN998s9rxzoiICCxcuBA//PADPD09cd99\n91n6Hjt2LKKjoy3rnjp1ym4nc6USFBSEpk2bYtOmTdXOBezatQslJSXo3bt3jXuKV//e33zzjeW1\n0Gq1KCgogJOTk82+T58+jZ49eyI2NhbFxcVYunQppk2bhq1bt1qt9/DDD2PWrFn48ccfceHCBcs3\nTS8vLzz77LNWeyGnT5+2+sC5k124cAHTp0/H6tWr0a1bNwDAzJkzr3veoS7vnaKiIqjVaqxYsQJ6\nvR5bt27F9OnTLROtXbp0ybJuVVUVLl26hBYtWsDLywsRERF49913LcvPnj0Ld3d3ZGdnw9vbG7/8\n8otlWUFBAYQQ8PDwgEKhsOwRAf/7f6958+a3/kLdIh56qoM33ngDx48ft3zAe3t7IzQ0FG+99RYq\nKipw8eJFLFmyBE888QSAK/N+V1RUXPd/2qioKHzzzTfYv38/qqqqkJaWhpMnT6J///42a2jZsiWC\ngoLw3nvvQafToaSkBC+++KLV9LE1Wbt2LebMmYPy8nK4u7tDpVJd99uUh4cHevbsiXfe
eQdDhgyx\ntD/55JP4/PPPcebMGZjNZqSmpuKf//wnKisrb9j37UypVGLOnDl4++23sW7dOpSXl6OyshLbtm3D\n9OnTMWXKFJt7FA4ODnjiiSewYMEClJWVQavVIjExETNmzLhh39u3b8crr7yCgoICuLm5wcXF5bp/\nE0dHRwwaNAjJycl47LHHLB90UVFRWLduHTIzMyGEwA8//IDBgwffNXsUV99fKpUKQgjs2LEDW7du\ntezRX/serMt75/z583jmmWeQmZkJJycnuLu7w8nJCWq1GsCVD/E9e/bAaDRi6dKlcHd3R2BgIAYN\nGoSff/4Z6enpEELgwIEDGDJkCP744w9069YNKpUKy5cvh9FoxIULF/DMM89g1apVcHBwQEREBJYs\nWYLy8nLk5+fj3XffhU6nk/T1rAmDog48PDzw5ptvWrUtWLAAVVVV6NevHyIjIxEUFISpU6cCgOUw\nVffu3S2HIK4KDg5GUlISEhMT0b17d8usfi1btrxhHYsWLUJhYSHCw8MxcOBANG/eHHPmzLnh46ZM\nmQIXFxf069cPPXv2RGlpKWbOnHnddaOionDx4kWroIiMjERMTAyee+45BAcHY+PGjfjkk0+sjhc3\nVo8//jiWLl2KrVu3om/fvggLC8PKlSvxxhtv4Nlnn73h42fPng13d3cMGjQIffr0QXl5ea3Ce9So\nUejatSueeOIJBAUF4ffff8f8+fOvu25UVBTy8vIQGRlpaevRowdmzJiBadOm4cEHH8QHH3yAxYsX\nw9fXt/ZPvhHz8/PD888/j9GjRyMkJATLli3DiBEjkJOTA6D6e7C2750uXbrglVdewaRJkxAQEIC3\n334bixcvRpMmTQAAXbt2RUpKCkJCQrB//3588skncHBwQPv27bF48WK89957CAoKwvTp0zFz5kyE\nhoZCoVDg008/xW+//YawsDAMHTrUcvUgALz++utwc3PDo48+iqioKAwYMKDaOcSGwhnuiIjq4MMP\nP8SJEyewZMmShi5FMtyjICIimxgURERkEw89ERGRTdyjICIimxgURERkE4OCiIhsYlAQ3YRz586h\nU6dOOHXqVL2sV5MNGzbUOPowkVQYFEREZBODgoiIbGJQEN2inJwcjB8/HkFBQXjggQcQFRVVbZjv\nn3/+GeHh4QgICMBLL72E0tJSy7JTp05h7Nix6NatG8LDw7F48eIbjj5MJCUGBdEtSkhIgIeHB9LS\n0rBhwwa0aNGi2lzrq1atwrx587By5UqcPHnSMve5Xq/HuHHj4O/vj2+++QZvvfUWtm7dWqtxooik\nwqAgugVmsxkxMTGYPXs2fHx84O/vj1GjRuH06dNWewXTpk1DaGgounbtitdeew1btmxBcXExNm3a\nBLVajRkzZsDHxwc9e/bEa6+9htTUVEnnGyCyhfNREN0CuVyOkSNH4ttvv8XRo0eh0Wgsc5SbzWbL\negEBAZZ/P/DAAzCbzTh9+jROnToFjUaDwMBAy3IhBAwGA86fPy/dEyGygUFBdAtMJhNiYmKgVqvR\nv39/9O/fHwaDwTID3VVy+f923q8GiFKptMx+lpycXG3bLVq0sG/xRLXEoCC6Bdu3b8dff/2F/fv3\nWyYTujq38rXDqF2d3QwADh8+DIVCgXbt2sHPzw9bt25Fy5YtLY/PyMjA6tWr8d5770n8bIiuj+co\niG5BaGgo9Ho9tm3bhvPnz2Pz5s2WeQoMBoNlvXnz5mH//v04cOAAkpOTMXz4cLi6ulomhpoxYwZO\nnDiBffv2Yfbs2XB0dLzhNKpEUuEeBdEtaNKkCSZPnoy3334bWq0Wvr6+mDNnDqZNm4bMzEzLHNpj\nxozBlClTUF5ejkGDBllmQbw6L/P8+fMRHR0NtVqNAQMG1GoaVSKpcJhxIiKyiYeeiIjIJgYFERHZ\nxKAgIiKbGBRERGQTg4KIiGxiUBARkU0MCiIisolB
QURENjEoiIjIpv8HKAOCrTvWENEAAAAASUVO\nRK5CYII=\n",
621 | "text/plain": [
622 | ""
623 | ]
624 | },
625 | "metadata": {},
626 | "output_type": "display_data"
627 | }
628 | ],
629 | "source": [
630 | "sns.stripplot(x=\"label\", y=\"confidence\", data=data, size=6, jitter=True);"
631 | ]
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": 12,
636 | "metadata": {},
637 | "outputs": [
638 | {
639 | "data": {
640 | "text/plain": [
641 | "Not offensive 7253\n",
642 | "Offensive 4807\n",
643 | "Hate speech 2382\n",
644 | "Name: label, dtype: int64"
645 | ]
646 | },
647 | "execution_count": 12,
648 | "metadata": {},
649 | "output_type": "execute_result"
650 | }
651 | ],
652 | "source": [
653 | "data['label'].value_counts()"
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": 13,
659 | "metadata": {},
660 | "outputs": [
661 | {
662 | "data": {
663 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD8CAYAAACVZ8iyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGrhJREFUeJzt3H9s1PXhx/HntYVejzpaVnCSEAdFGXPQK22lIGs7zswp\n/aGum1p/EDLg0M5l0zLHOlaywnCTdrRj05qBUUI0ljmglWglE8VU1FprI1Iz15OwdI2cHYfi9Yd3\n7+8fhvtyA+ynQO968/VI/IN7v6+f1/v6iS8+n/vwthljDCIi8qUWF+0AIiISfSoDERFRGYiIiMpA\nRERQGYiICJAQ7QDnwxjDRx+dJJYehLLZbHz1qxOUOwJiMTModyTFYma48NyTJ19yzrGYvDKw2WzE\nxVjyuDjljpRYzAzKHUmxmBlGN3eMfRQiIjIaVAYiIqIyEBERlYGIiKAyEBERVAYiIoLKQEREUBmI\niAgqAxERIUa3oxARiaacmpejduwPHlwyKj9XVwYiIqIyEBERlYGIiKAyEBERVAYiIoLKQEREUBmI\niAgqAxERQWUgIiKoDEREBJWBiIigMhAREVQGIiKCykBERFAZiIgIKgMREUFlICIiqAxERASLZdDe\n3s7NN9/MvHnzuO6662hqagLA5/NRXl5OVlYWBQUFNDY2ht5jjKGmpobc3FxycnJYv349gUAgNN7c\n3IzL5cLpdOJ2u/F6vRd5aSIiYtWwZRAIBCgvL2flypW0t7ezYcMGfvGLX/Cvf/2LtWvX4nA4aG1t\npb6+nk2bNtHR0QHAjh072L9/P3v27GHv3r20t7ezbds2ALq6uqiqqqK2tpaDBw+SlpbGmjVrRnel\nIiJyTgnDTThx4gR9fX0EAgGMMdhsNsaNG0d8fDz79u3j+eefJzExkblz51JYWMiuXbtwOp3s3r2b\npUuXMmXKFADcbjd1dXWsWLGCpqYmXC4XGRkZAFRUVLBgwQK8Xi9paWmWgsfF2S5g2ZF3Kq9yj75Y\nzAzKHUmxmPl0o5F72DJITU2lrKyM++67j9WrVxMMBtmwYQP/+c9/SEhIYNq0aaG506dPp6WlBYDu\n7m5mzpwZNubxeDDG0N3dTWZmZtgxJk6ciMfjsVwGKSkTLC9yLFHuyInFzKDckRSLmWF0cg9bBsFg\nELvdTl1dHYsXL6a1tZX777+fhx9+GLvdHjbXbrfT398PgN/vDxtPSkoiGAwyODh4xtipcb/fbzn4\n8eMnCQaN5fnRFhdnIyVlgnJHQCxmBuWOpFjMfLrzzT1pUvI5x4Ytg5aWFjo7O3nggQcAKCgooKCg\ngD/+8Y8MDAyEze3v78fhcACfF8Pp436/n4SEBBITE8NK4/TxU++1Ihg0BAKx90tU7siJxcyg3JEU\ni5lhdHIP+wXyv//9bwYHB8NeS0hI4KqrrmJoaIienp7Q6x6PJ3RrKD09HY/HEzY2Y8aMs4719fXh\n8/lIT0+/sNWIiMh5GbYMFi5cyOHDh/nrX/+KMYbXX3+dF154gSVLluByuaipqcHv99PZ2UlzczNF\nRUUAFBcXs3XrVnp7e/F6vTQ0NFBSUgJAYWEhLS0ttLW1MTAwQG1tLXl5eaSmpo7uakVE5KyGvU00\na9Ys6uvrqaurY8OGDUydOpXf/e53zJkzh+rqaqqqqsjPz8fhcLB69erQE0JlZWV4vV5KS0sZGhqi\nqKiIZcuWATB79myqq6uprKzk2LFjZGdns3HjxtFdqYiInJPNGBN7N8yAvr5PYupeX3y8jUmTkpU7\nAmIxMyh3JF1o5pyal0chlTUfPLjkvHNPnnzJOce0HYWIiKgMREREZSAiIqgMREQElYGIiKAyEBER\nVAYiIoLKQEREUBmIiAgqAxERQWUgIiKoDEREBJWBiIigMhAREVQGIiKCykBERFAZiIgIKgMREUFl\nICIiqAxERASVgYiIoDIQERFUBiIigs
pARERQGYiICCoDERFBZSAiIqgMREQElYGIiKAyEBERVAYi\nIoLKQEREUBmIiAgqAxERQWUgIiKoDEREBItl0Nvbi9vtZt68eeTl5fHEE08A4PP5KC8vJysri4KC\nAhobG0PvMcZQU1NDbm4uOTk5rF+/nkAgEBpvbm7G5XLhdDpxu914vd6LvDQREbFq2DIwxnDPPfcw\nY8YMXnvtNbZu3cqWLVtob29n7dq1OBwOWltbqa+vZ9OmTXR0dACwY8cO9u/fz549e9i7dy/t7e1s\n27YNgK6uLqqqqqitreXgwYOkpaWxZs2a0V2piIicU8JwE95++20+/PBDKioqiI+P54orruCpp54i\nMTGRffv28fzzz5OYmMjcuXMpLCxk165dOJ1Odu/ezdKlS5kyZQoAbreburo6VqxYQVNTEy6Xi4yM\nDAAqKipYsGABXq+XtLQ0S8Hj4mwXsOzIO5VXuUdfLGYG5Y6kWMx8utHIPWwZHDp0iCuuuIKHHnqI\npqYmkpOTWbVqFbNmzSIhIYFp06aF5k6fPp2WlhYAuru7mTlzZtiYx+PBGEN3dzeZmZmhsdTUVCZO\nnIjH47FcBikpEywvcixR7siJxcyg3JEUi5lhdHIPWwY+n4/XXnuN3NxcXnzxRd555x2WL1/Oo48+\nit1uD5trt9vp7+8HwO/3h40nJSURDAYZHBw8Y+zUuN/vtxz8+PGTBIPG8vxoi4uzkZIyQbkjIBYz\ng3JHUixmPt355p40KfmcY8OWwfjx45k4cSJutxuAefPmcd1111FfX8/AwEDY3P7+fhwOB/B5MZw+\n7vf7SUhIIDExMaw0Th8/9V4rgkFDIBB7v0TljpxYzAzKHUmxmBlGJ/ewXyBPnz6dQCAQ9iRQIBDg\nm9/8JkNDQ/T09IRe93g8oVtD6enpeDyesLEZM2acdayvrw+fz0d6evqFr0hEREZs2DK45pprsNvt\nbNmyhc8++4z29nZeeOEFvve97+FyuaipqcHv99PZ2UlzczNFRUUAFBcXs3XrVnp7e/F6vTQ0NFBS\nUgJAYWEhLS0ttLW1MTAwQG1tLXl5eaSmpo7uakVE5KyGvU1kt9vZvn07v/nNb1i4cCHJycn86le/\nwul0Ul1dTVVVFfn5+TgcDlavXh16QqisrAyv10tpaSlDQ0MUFRWxbNkyAGbPnk11dTWVlZUcO3aM\n7OxsNm7cOLorFRGRc7IZY2LvhhnQ1/dJTN3ri4+3MWlSsnJHQCxmBuWOpAvNnFPz8iiksuaDB5ec\nd+7Jky8555i2oxAREZWBiIioDEREBJWBiIigMhAREVQGIiKCykBERFAZiIgIKgMREUFlICIiqAxE\nRASVgYiIoDIQERFUBiIigspARERQGYiICCoDERFBZSAiIqgMREQElYGIiKAyEBERVAYiIoLKQERE\nUBmIiAgqAxERQWUgIiKoDEREBJWBiIigMhAREVQGIiKCykBERFAZiIgIKgMREUFlICIiqAxERASV\ngYiIMIIy8Hq9LFiwgBdffBEAn89HeXk5WVlZFBQU0NjYGJprjKGmpobc3FxycnJYv349gUAgNN7c\n3IzL5cLpdOJ2u/F6vRdxSSIiMlKWy6CyspLjx4+H/rx27VocDgetra3U19ezadMmOjo6ANixYwf7\n9+9nz5497N27l/b2drZt2wZAV1cXVVVV1NbWcvDgQdLS0lizZs1FXpaIiIxEgpVJTz75JElJSVx2\n2WUAnDx5kn379vH888+TmJjI3LlzKSwsZNeuXTidTnbv3s3SpUuZMmUKAG63m7q6OlasWEFTUxMu\nl4uMjAwAKioqWLBgAV6vl7S0NMvB4+JsI11rVJ3Kq9yjLxYzg3JHUixmPt1o5B62DDweD4899hhP\nP/00N998MwBHjhwhISGBadOmheZNnz6dlpYWALq7u5k5c2bYmMfjwRhDd3c3mZmZobHU1FQmTpyI\nx+
MZURmkpEywPHcsUe7IicXMoNyRFIuZYXRyf2EZfPbZZ/z85z+nsrKSlJSU0Ouffvopdrs9bK7d\nbqe/vx8Av98fNp6UlEQwGGRwcPCMsVPjfr9/RMGPHz9JMGhG9J5oiouzkZIyQbkjIBYzg3JHUixm\nPt355p40KfmcY19YBn/+85+ZPXs2+fn5Ya8nJSUxMDAQ9lp/fz8OhwP4vBhOH/f7/SQkJJCYmBhW\nGqePn3qvVcGgIRCIvV+ickdOLGYG5Y6kWMwMo5P7C79A3rt3L88++yzZ2dlkZ2fT09PDfffdx/79\n+xkaGqKnpyc01+PxhG4Npaen4/F4wsZmzJhx1rG+vj58Ph/p6ekXdWEiImLdF5bBc889x5tvvklb\nWxttbW1MnTqV2tpaysvLcblc1NTU4Pf76ezspLm5maKiIgCKi4vZunUrvb29eL1eGhoaKCkpAaCw\nsJCWlhba2toYGBigtraWvLw8UlNTR3+1IiJyVpaeJjqb6upqqqqqyM/Px+FwsHr16tATQmVlZXi9\nXkpLSxkaGqKoqIhly5YBMHv2bKqrq6msrOTYsWNkZ2ezcePGi7MaERE5LzZjTOzdMAP6+j6JqXt9\n8fE2Jk1KVu4IiMXMoNyRdKGZc2peHoVU1nzw4JLzzj158iXnHNN2FCIiojIQERGVgYiIoDIQERFU\nBiIigspARERQGYiICCoDERFBZSAiIqgMREQElYGIiKAyEBERVAYiIoLKQEREUBmIiAgqAxERQWUg\nIiKoDEREBJWBiIigMhAREVQGIiKCykBERFAZiIgIKgMREQESoh1A5H9NTs3LUTnuG/fnReW48r9B\nVwYiIqIyEBERlYGIiKAyEBER9AWy/I/6+i+ejXYEkZiiKwMREVEZiIiIykBERFAZiIgIKgMREUFl\nICIiWCyDtrY2fvCDH5CVlcW1117LU089BYDP56O8vJysrCwKCgpobGwMvccYQ01NDbm5ueTk5LB+\n/XoCgUBovLm5GZfLhdPpxO124/V6L/LSRETEqmHLwOfzcc8993DXXXfxxhtvUFdXR21tLa2traxd\nuxaHw0Frayv19fVs2rSJjo4OAHbs2MH+/fvZs2cPe/fupb29nW3btgHQ1dVFVVUVtbW1HDx4kLS0\nNNasWTO6KxURkXMatgx6enrIz8+nqKiIuLg4rrrqKubPn097ezv79u3jJz/5CYmJicydO5fCwkJ2\n7doFwO7du1m6dClTpkxh8uTJuN1u/va3vwHQ1NSEy+UiIyMDu91ORUUFBw4c0NWBiEiUDPsvkGfP\nns1DDz0U+rPP56OtrY1Zs2aRkJDAtGnTQmPTp0+npaUFgO7ubmbOnBk25vF4MMbQ3d1NZmZmaCw1\nNZWJEyfi8XhIS0uzFDwuzmZp3lhxKq9yj75YynoxxcdHZ92xfI7EUubTjUbuEW1H8fHHH7Nq1arQ\n1cETTzwRNm632+nv7wfA7/djt9tDY0lJSQSDQQYHB88YOzXu9/stZ0lJmTCS6GOGcstomTQpOarH\nj8VzJBYzw+jktlwGR48eZdWqVUybNo3Nmzfzz3/+k4GBgbA5/f39OBwO4PNiOH3c7/eTkJBAYmJi\nWGmcPn7qvVYcP36SYNBYnh9tcXE2UlImKHcExOrf9i5UX98nUTlurJ4jsZb5dOeb+4v+wmCpDA4d\nOsTy5cspLi7mgQceIC4ujssvv5yhoSF6enqYOnUqAB6PJ3RrKD09HY/HQ0ZGRmhsxowZYWOn9PX1\n4fP5SE9Pt7yoYNAQCMTeL1G5ZbRE+/cTi+dILGaG0ck97BfIXq+X5cuXs2zZMtasWUNc3OdvSU5O\nxuVyUVNTg9/vp7Ozk+bmZoqKigAoLi5m69at9Pb24vV6aWhooKSkBIDCwkJaWlpoa2tjYGCA2tpa\n8vLySE1NvaiLExERa4a9Mti5cyd9fX08/PDDPPzww6HX77rrLqqr
q6mqqiI/Px+Hw8Hq1atDVwJl\nZWV4vV5KS0sZGhqiqKiIZcuWAZ9/KV1dXU1lZSXHjh0jOzubjRs3jtISRURkODZjTOxdI/H5/dFY\nuryLj7cxaVKyckdAfLyNeb9/KdoxIu6N+/OictxYPUcuJHNOzcujkMqaDx5cct65J0++5Jxj2o5C\nRERUBiIiojIQERFUBiIigspARERQGYiICCoDERFBZSAiIqgMREQElYGIiKAyEBERVAYiIoLKQERE\nUBmIiAgqAxERQWUgIiKoDEREBJWBiIigMhAREVQGIiKCykBERFAZiIgIKgMREUFlICIiqAxERASV\ngYiIoDIQERFUBiIigspARERQGYiICCoDERFBZSAiIkBCtANEQ07Ny1E57gcPLonKcUVEhqMrAxER\nURmIiIjKQEREiGIZvPvuu5SWluJ0OikpKaGjoyNaUUREvvSiUgYDAwOsWrWKm2++mTfeeIM777yT\nu+++m5MnT0YjjojIl15UyuDgwYPExcVRVlbGuHHjKC0tJS0tjZdeeikacUREvvSi8mipx+MhPT09\n7LXp06fT3d1t+WfExdkudqyIiLXcp/LGUu5YynoxxcdHZ92xfI7EUubTjUbuqJTBp59+SlJSUthr\ndrud/v5+yz8jJWXCeR8/ms/7X0juaIq13Po3HZEXa+cInH/maJ9fo/FZR+U2UVJS0hn/4+/v78fh\ncEQjjojIl15UymDGjBl4PJ6w1zweDzNnzoxGHBGRL72olMGCBQsYHBxk+/btDA0NsXPnTrxeL4sW\nLYpGHBGRLz2bMcZE48BdXV2sW7eO9957j8svv5x169bhdDqjEUVE5EsvamUgIiJjh7ajEBERlYGI\niKgMREQElYGIiDCGy8DKrqbGGOrq6li0aBGZmZnceeed/OMf/4hC2s+NdCfWV199lW984xtR36DP\nam63283cuXPJzMwM/RdNVnO3tbVx0003kZmZSVFREa+++mqEk/4/K5l//etfh33GTqeTWbNm0dTU\nFIXEn7P6WTc2NuJyucjKyuLWW2/lnXfeiXDScFZzP/744yxevJjs7GzuvfdevF5vhJOeqbOz8wsf\nt29ubsblcuF0OnG73Ree2YxB/f395tvf/rbZsWOHGRwcNI2NjSY3N9d88sknYfOefvppc/3115ve\n3l4TCATM5s2bzY033jimM59y/PhxU1BQYK688spzzomEkeRetGiR6ezsjELKM1nN3dvba7Kzs81z\nzz1ngsGgaWpqMllZWcbv94/ZzP9t8+bN5o477jCDg4MRShrOau7Dhw+bq6++2nR3d5tAIGAaGhrM\n4sWLo5LZGOu5n332WZOTk2Pa29vN4OCg2bx5syktLY1SamOCwaBpbGw0WVlZ5uqrrz7rnMOHD5t5\n8+aZjo4O4/f7zS9/+UuzfPnyCzrumLwysLqraWlpKTt37uTSSy/l008/5eOPPyY1NXVMZz5l3bp1\n3HDDDRFOeSaruT/66CP6+vq48soro5Q0nNXcu3fvZuHChVx33XXYbDYKCwt5/PHHiYuL/Kl/Prv1\nvvPOO2zfvp3f//73jBs3LoJp/5/V3EeOHCEYDBIIBDDGEBcXh91uj0pmsJ67paWFH/7wh2RmZjJu\n3Djuvfde3n//fd57772o5H7kkUd44oknWLVq1TnnNDU14XK5yMjIwG63U1FRwYEDBy7o6mBMloHV\nXU1tNhsOh4NnnnmG7Oxsdu3axU9/+tNIRg0ZyU6se/bs4cSJE9x2222RindOVnO/++67TJgwAbfb\nTW5uLrfeeitvvfVWJKOGsZr70KFDXHrppZSXlzN//nxuueUWAoEA48ePj2Rc4Px26924cSMrV67k\nsssuG+1452Q196JFi/j617/OkiVLmDNnDg0NDWzatCmSUcNYzR0MBsNKy2azYbPZOHLkSERy/rfv\nf//77N69mzlz5pxzTnd3d9j2
PampqUycOPGMbX5GYkyWwUh3NS0sLKSzs5O7776b5cuXc/z48UjE\nDGM1c09PD3V1dfz2t7+NZLxzspp7YGAAp9NJZWUlL7/8MsXFxaxYsYJjx45FMm6I1dw+n4/GxkZu\nu+02XnnlFYqLi1m5ciU+ny+ScYGRn9dvvvkm77//Prfffnsk4p3TSM6RmTNnsnPnTt566y2WLl3K\nj3/84xHtRnwxWc29ePFinn76abq6uhgcHORPf/oT/f39DAwMRDJuyJQpU7DZvniLar/ff8ZVV1JS\nEn6//7yPOybLYKS7mo4fP57x48fzox/9iOTkZF5//fVIxAxjJXMwGOSBBx7gZz/7GZdeemmkI56V\n1c/62muv5dFHH+WKK65g/PjxlJWVcdlll/Haa69FMm6I1dzjx48nLy+PRYsWMW7cOG6//XYcDgft\n7e2RjAuM/Lx+5plnKC4uZsKE6G4NbTX3li1b+NrXvsacOXNITEykvLycoaEhWltbIxk3xGruG2+8\nkTvuuIN77rkHl8tFIBAgPT2dr3zlK5GMOyJnKzW/339BOz+PyTKwuqtpfX09f/jDH0J/NsYwODjI\nJZdcEpGcp7OSube3l7fffpt169aRnZ1NcXExAPn5+bS1tUU07ylWP+vnnnuOvXv3hr02MDBAYmLi\nqGc8G6u5p0+fzuDgYNhrwWAQE4VdWEa6W++LL77I9ddfH4loX8hq7p6enrDP2mazER8fT3x8fERy\n/jeruT/88ENuuOEG/v73v3PgwAGWLVvGkSNHmD17diTjjkh6enrY2vr6+vD5fGfcFhuJMVkGVnc1\nzcjI4Mknnwxd3m3ZsoXk5GTmzZs3JjNPnTqVzs5O2traaGtrY8+ePQC89NJLZGdnRzyz1dzw+SX3\nhg0beP/99xkaGuIvf/kL/f39XHPNNWM6d0lJCa+88gr79+8nGAyyfft2BgYGmD9//pjNDHD06FFO\nnDjBt771rYjn/G9WcxcUFLBz504OHTrEZ599xmOPPUYgECArK2tM525tbcXtdtPX18cnn3zC+vXr\nueaaa5gyZUpUcltRWFhIS0sLbW1tDAwMUFtbS15e3oU9QHNBzyKNosOHD5tbbrnFOJ1OU1JSYt56\n6y1jjDFr1641a9euDc178sknzeLFi01OTo5ZuXKlOXr0aLQiW858ytGjR6P+aKkx1nM/8sgjJj8/\n32RkZJjbbrvNdHV1RSuyMcZ67gMHDpiSkhLjdDrNTTfdZDo6OqIV2XLmV1991SxcuDBaMc9gJXcw\nGDQNDQ3mO9/5jsnKyjJ33HGHee+996IZ23LuBx980MyfP9/k5OSYiooKc+LEiWjGNsYYc/DgwbBH\nS//7HHn22WfNd7/7XZOZmWlWrFhhvF7vBR1Pu5aKiMjYvE0kIiKRpTIQERGVgYiIqAxERASVgYiI\noDIQERFUBiIigspARESA/wMxP+RvIT9yYgAAAABJRU5ErkJggg==\n",
664 | "text/plain": [
665 | ""
666 | ]
667 | },
668 | "metadata": {},
669 | "output_type": "display_data"
670 | }
671 | ],
672 | "source": [
673 | "data['confidence'].hist(bins=10);"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": null,
679 | "metadata": {
680 | "collapsed": true
681 | },
682 | "outputs": [],
683 | "source": []
684 | }
685 | ],
686 | "metadata": {
687 | "kernelspec": {
688 | "display_name": "Python 3",
689 | "language": "python",
690 | "name": "python3"
691 | },
692 | "language_info": {
693 | "codemirror_mode": {
694 | "name": "ipython",
695 | "version": 3
696 | },
697 | "file_extension": ".py",
698 | "mimetype": "text/x-python",
699 | "name": "python",
700 | "nbconvert_exporter": "python",
701 | "pygments_lexer": "ipython3",
702 | "version": "3.6.7"
703 | }
704 | },
705 | "nbformat": 4,
706 | "nbformat_minor": 2
707 | }
708 |
--------------------------------------------------------------------------------
/notebooks/Data Exploration.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
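7 | "This notebook is a first exploration of the Twitter hate-speech dataset: it previews the tweets, tries removing @-handles from the text, inspects the label counts, and plots the confidence scores. It is a work in progress and needs to be expanded upon."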
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import re\n",
18 | "from textstat.textstat import textstat\n",
19 | "from textblob import TextBlob\n",
20 | "import seaborn as sns\n",
21 | "%matplotlib inline\n",
22 | "sns.set_style(\"dark\")\n",
23 | "sns.set_context(\"talk\")"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "df = pd.read_csv('../data/twitter-hate-speech.csv', encoding='latin-1')"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "df.head()"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "df.describe()"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "data_path = '../data/twitter-hate-speech.csv'\n",
60 | "\n",
61 | "df = pd.read_csv(data_path, encoding='latin1')\n",
62 | "df = df.rename(columns={'does_this_tweet_contain_hate_speech': 'label', \n",
63 | " 'does_this_tweet_contain_hate_speech:confidence': 'confidence' })\n",
64 | "\n",
65 | "mapping = {'The tweet is not offensive': 'Not offensive', \n",
66 | " 'The tweet uses offensive language but not hate speech': 'Offensive',\n",
67 | " 'The tweet contains hate speech': 'Hate speech'\n",
68 | " }\n",
69 | "df['label'] = df['label'].map(lambda x: mapping[x])"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "text = df['tweet_text']"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "text[:10]"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "def remove_handles(content):\n",
97 | " return ' '.join(re.sub(\"(@[A-Za-z0-9]+)\",\" \",content).split())"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "text.apply(remove_handles)[:10]"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "data = df[~df['_golden']].dropna(axis=1)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "sns.stripplot(x=\"label\", y=\"confidence\", data=data, size=6, jitter=True);"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "data['label'].value_counts()"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "data['confidence'].hist(bins=10);"
143 | ]
144 | }
145 | ],
146 | "metadata": {
147 | "kernelspec": {
148 | "display_name": "Python 3",
149 | "language": "python",
150 | "name": "python3"
151 | },
152 | "language_info": {
153 | "codemirror_mode": {
154 | "name": "ipython",
155 | "version": 3
156 | },
157 | "file_extension": ".py",
158 | "mimetype": "text/x-python",
159 | "name": "python",
160 | "nbconvert_exporter": "python",
161 | "pygments_lexer": "ipython3",
162 | "version": "3.7.0"
163 | }
164 | },
165 | "nbformat": 4,
166 | "nbformat_minor": 2
167 | }
168 |
--------------------------------------------------------------------------------
/notebooks/LSTM with Keras and TensorFlow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook is an attempt to classify the tweets in the dataset into three classes using an LSTM neural network built with Keras on the TensorFlow backend."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import seaborn as sns\n",
20 | "import numpy as np\n",
21 | "%matplotlib inline\n",
22 | "from sklearn.model_selection import train_test_split"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "metadata": {
29 | "collapsed": true
30 | },
31 | "outputs": [],
32 | "source": [
33 | "data_path = 'twitter-hate-speech.csv'\n",
34 | "\n",
35 | "df = pd.read_csv(data_path, encoding='latin1')\n",
36 | "df = df.rename(columns={'does_this_tweet_contain_hate_speech': 'label', \n",
37 | " 'does_this_tweet_contain_hate_speech:confidence': 'confidence' })"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "data": {
47 | "text/html": [
48 | "\n",
49 | "\n",
62 | "
\n",
63 | " \n",
64 | " \n",
65 | " | \n",
66 | " _unit_id | \n",
67 | " _golden | \n",
68 | " _unit_state | \n",
69 | " _trusted_judgments | \n",
70 | " _last_judgment_at | \n",
71 | " label | \n",
72 | " confidence | \n",
73 | " _created_at | \n",
74 | " orig__golden | \n",
75 | " orig__last_judgment_at | \n",
76 | " orig__trusted_judgments | \n",
77 | " orig__unit_id | \n",
78 | " orig__unit_state | \n",
79 | " _updated_at | \n",
80 | " orig_does_this_tweet_contain_hate_speech | \n",
81 | " does_this_tweet_contain_hate_speech_gold | \n",
82 | " does_this_tweet_contain_hate_speech_gold_reason | \n",
83 | " does_this_tweet_contain_hate_speechconfidence | \n",
84 | " tweet_id | \n",
85 | " tweet_text | \n",
86 | "
\n",
87 | " \n",
88 | " \n",
89 | " \n",
90 | " 0 | \n",
91 | " 853718217 | \n",
92 | " True | \n",
93 | " golden | \n",
94 | " 86 | \n",
95 | " NaN | \n",
96 | " The tweet uses offensive language but not hate... | \n",
97 | " 0.6013 | \n",
98 | " NaN | \n",
99 | " True | \n",
100 | " NaN | \n",
101 | " 0.0 | \n",
102 | " 615561535.0 | \n",
103 | " golden | \n",
104 | " NaN | \n",
105 | " The tweet contains hate speech | \n",
106 | " The tweet contains hate speech\\nThe tweet uses... | \n",
107 | " NaN | \n",
108 | " 1.0 | \n",
109 | " 1.666196e+09 | \n",
110 | " Warning: penny boards will make you a faggot | \n",
111 | "
\n",
112 | " \n",
113 | " 1 | \n",
114 | " 853718218 | \n",
115 | " True | \n",
116 | " golden | \n",
117 | " 92 | \n",
118 | " NaN | \n",
119 | " The tweet contains hate speech | \n",
120 | " 0.7227 | \n",
121 | " NaN | \n",
122 | " True | \n",
123 | " NaN | \n",
124 | " 0.0 | \n",
125 | " 615561723.0 | \n",
126 | " golden | \n",
127 | " NaN | \n",
128 | " The tweet contains hate speech | \n",
129 | " The tweet contains hate speech\\nThe tweet uses... | \n",
130 | " NaN | \n",
131 | " 1.0 | \n",
132 | " 4.295121e+08 | \n",
133 | " Fuck dykes | \n",
134 | "
\n",
135 | " \n",
136 | " 2 | \n",
137 | " 853718219 | \n",
138 | " True | \n",
139 | " golden | \n",
140 | " 86 | \n",
141 | " NaN | \n",
142 | " The tweet contains hate speech | \n",
143 | " 0.5229 | \n",
144 | " NaN | \n",
145 | " True | \n",
146 | " NaN | \n",
147 | " 0.0 | \n",
148 | " 615562039.0 | \n",
149 | " golden | \n",
150 | " NaN | \n",
151 | " The tweet contains hate speech | \n",
152 | " The tweet contains hate speech\\nThe tweet uses... | \n",
153 | " NaN | \n",
154 | " 1.0 | \n",
155 | " 3.956238e+08 | \n",
156 | " @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon... | \n",
157 | "
\n",
158 | " \n",
159 | " 3 | \n",
160 | " 853718220 | \n",
161 | " True | \n",
162 | " golden | \n",
163 | " 98 | \n",
164 | " NaN | \n",
165 | " The tweet contains hate speech | \n",
166 | " 0.5184 | \n",
167 | " NaN | \n",
168 | " True | \n",
169 | " NaN | \n",
170 | " 0.0 | \n",
171 | " 615562068.0 | \n",
172 | " golden | \n",
173 | " NaN | \n",
174 | " The tweet contains hate speech | \n",
175 | " The tweet contains hate speech\\nThe tweet uses... | \n",
176 | " NaN | \n",
177 | " 1.0 | \n",
178 | " 4.975147e+08 | \n",
179 | " \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill... | \n",
180 | "
\n",
181 | " \n",
182 | " 4 | \n",
183 | " 853718221 | \n",
184 | " True | \n",
185 | " golden | \n",
186 | " 88 | \n",
187 | " NaN | \n",
188 | " The tweet uses offensive language but not hate... | \n",
189 | " 0.5185 | \n",
190 | " NaN | \n",
191 | " True | \n",
192 | " NaN | \n",
193 | " 0.0 | \n",
194 | " 615562488.0 | \n",
195 | " golden | \n",
196 | " NaN | \n",
197 | " The tweet contains hate speech | \n",
198 | " The tweet contains hate speech\\nThe tweet uses... | \n",
199 | " NaN | \n",
200 | " 1.0 | \n",
201 | " 5.889236e+08 | \n",
202 | " @Zhugstubble You heard me bitch but any way I'... | \n",
203 | "
\n",
204 | " \n",
205 | "
\n",
206 | "
"
207 | ],
208 | "text/plain": [
209 | " _unit_id _golden _unit_state _trusted_judgments _last_judgment_at \\\n",
210 | "0 853718217 True golden 86 NaN \n",
211 | "1 853718218 True golden 92 NaN \n",
212 | "2 853718219 True golden 86 NaN \n",
213 | "3 853718220 True golden 98 NaN \n",
214 | "4 853718221 True golden 88 NaN \n",
215 | "\n",
216 | " label confidence _created_at \\\n",
217 | "0 The tweet uses offensive language but not hate... 0.6013 NaN \n",
218 | "1 The tweet contains hate speech 0.7227 NaN \n",
219 | "2 The tweet contains hate speech 0.5229 NaN \n",
220 | "3 The tweet contains hate speech 0.5184 NaN \n",
221 | "4 The tweet uses offensive language but not hate... 0.5185 NaN \n",
222 | "\n",
223 | " orig__golden orig__last_judgment_at orig__trusted_judgments \\\n",
224 | "0 True NaN 0.0 \n",
225 | "1 True NaN 0.0 \n",
226 | "2 True NaN 0.0 \n",
227 | "3 True NaN 0.0 \n",
228 | "4 True NaN 0.0 \n",
229 | "\n",
230 | " orig__unit_id orig__unit_state _updated_at \\\n",
231 | "0 615561535.0 golden NaN \n",
232 | "1 615561723.0 golden NaN \n",
233 | "2 615562039.0 golden NaN \n",
234 | "3 615562068.0 golden NaN \n",
235 | "4 615562488.0 golden NaN \n",
236 | "\n",
237 | " orig_does_this_tweet_contain_hate_speech \\\n",
238 | "0 The tweet contains hate speech \n",
239 | "1 The tweet contains hate speech \n",
240 | "2 The tweet contains hate speech \n",
241 | "3 The tweet contains hate speech \n",
242 | "4 The tweet contains hate speech \n",
243 | "\n",
244 | " does_this_tweet_contain_hate_speech_gold \\\n",
245 | "0 The tweet contains hate speech\\nThe tweet uses... \n",
246 | "1 The tweet contains hate speech\\nThe tweet uses... \n",
247 | "2 The tweet contains hate speech\\nThe tweet uses... \n",
248 | "3 The tweet contains hate speech\\nThe tweet uses... \n",
249 | "4 The tweet contains hate speech\\nThe tweet uses... \n",
250 | "\n",
251 | " does_this_tweet_contain_hate_speech_gold_reason \\\n",
252 | "0 NaN \n",
253 | "1 NaN \n",
254 | "2 NaN \n",
255 | "3 NaN \n",
256 | "4 NaN \n",
257 | "\n",
258 | " does_this_tweet_contain_hate_speechconfidence tweet_id \\\n",
259 | "0 1.0 1.666196e+09 \n",
260 | "1 1.0 4.295121e+08 \n",
261 | "2 1.0 3.956238e+08 \n",
262 | "3 1.0 4.975147e+08 \n",
263 | "4 1.0 5.889236e+08 \n",
264 | "\n",
265 | " tweet_text \n",
266 | "0 Warning: penny boards will make you a faggot \n",
267 | "1 Fuck dykes \n",
268 | "2 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon... \n",
269 | "3 \"@jayswaggkillah: \"@JacklynAnnn: @jayswaggkill... \n",
270 | "4 @Zhugstubble You heard me bitch but any way I'... "
271 | ]
272 | },
273 | "execution_count": 3,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "df.head()"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 4,
285 | "metadata": {
286 | "collapsed": true
287 | },
288 | "outputs": [],
289 | "source": [
290 | "data = df[~df['_golden']].dropna(axis=1)"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "# Keras neural network with TensorFlow"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 5,
303 | "metadata": {},
304 | "outputs": [
305 | {
306 | "name": "stderr",
307 | "output_type": "stream",
308 | "text": [
309 | "Using TensorFlow backend.\n"
310 | ]
311 | }
312 | ],
313 | "source": [
314 | "import os\n",
315 | "os.environ['KERAS_BACKEND'] = 'tensorflow'\n",
316 | "import keras"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 6,
322 | "metadata": {
323 | "collapsed": true
324 | },
325 | "outputs": [],
326 | "source": [
327 | "from keras.preprocessing import sequence\n",
328 | "from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer\n",
329 | "from keras.utils import to_categorical\n",
330 | "\n",
331 | "from keras.models import Sequential\n",
332 | "from keras.layers import Dense, Embedding\n",
333 | "from keras.layers import LSTM\n",
334 | "\n",
335 | "from sklearn.metrics import classification_report, confusion_matrix"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 7,
341 | "metadata": {
342 | "collapsed": true
343 | },
344 | "outputs": [],
345 | "source": [
346 | "X = data['tweet_text'].tolist()\n",
347 | "\n",
348 | "# Encode labels\n",
349 | "mapping = {l: i for i, l in enumerate(sorted(data['label'].unique()))}\n",
350 | "reverse_mapping = {i: l for l, i in mapping.items()}\n",
351 | "\n",
352 | "label_names = sorted(mapping.keys())\n",
353 | "y = data['label'].map(lambda x: mapping[x])\n",
354 | "y = to_categorical(y, num_classes=3)\n",
355 | "\n",
356 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 8,
362 | "metadata": {},
363 | "outputs": [
364 | {
365 | "name": "stdout",
366 | "output_type": "stream",
367 | "text": [
368 | "X_train shape: (11553, 30)\n",
369 | "X_test shape: (2889, 30)\n",
370 | "y_train shape: (11553, 3)\n"
371 | ]
372 | }
373 | ],
374 | "source": [
375 | "max_features = 4000\n",
376 | "maxlen = 30\n",
377 | "batch_size = 32\n",
378 | "\n",
379 | "tokenizer = Tokenizer(num_words=max_features)\n",
380 | "tokenizer.fit_on_texts(X_train)\n",
381 | "\n",
382 | "X_train = tokenizer.texts_to_sequences(X_train)\n",
383 | "X_test = tokenizer.texts_to_sequences(X_test)\n",
384 | "\n",
385 | "X_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n",
386 | "X_test = sequence.pad_sequences(X_test, maxlen=maxlen)\n",
387 | "\n",
388 | "print('X_train shape:', X_train.shape)\n",
389 | "print('X_test shape:', X_test.shape)\n",
390 | "print('y_train shape:', y_train.shape)"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": 9,
396 | "metadata": {},
397 | "outputs": [
398 | {
399 | "name": "stdout",
400 | "output_type": "stream",
401 | "text": [
402 | "_________________________________________________________________\n",
403 | "Layer (type) Output Shape Param # \n",
404 | "=================================================================\n",
405 | "embedding_1 (Embedding) (None, None, 128) 512000 \n",
406 | "_________________________________________________________________\n",
407 | "lstm_1 (LSTM) (None, 128) 131584 \n",
408 | "_________________________________________________________________\n",
409 | "dense_1 (Dense) (None, 3) 387 \n",
410 | "=================================================================\n",
411 | "Total params: 643,971\n",
412 | "Trainable params: 643,971\n",
413 | "Non-trainable params: 0\n",
414 | "_________________________________________________________________\n"
415 | ]
416 | }
417 | ],
418 | "source": [
419 | "model = Sequential()\n",
420 | "model.add(Embedding(max_features, 128))\n",
421 | "model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.2))\n",
422 | "model.add(Dense(3, activation='sigmoid'))\n",
423 | "\n",
424 | "model.compile(loss='categorical_crossentropy',\n",
425 | " optimizer='adam',\n",
426 | " metrics=['accuracy'])\n",
427 | "\n",
428 | "model.summary()"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": 10,
434 | "metadata": {},
435 | "outputs": [
436 | {
437 | "name": "stdout",
438 | "output_type": "stream",
439 | "text": [
440 | "Train on 11553 samples, validate on 2889 samples\n",
441 | "Epoch 1/30\n",
442 | "11553/11553 [==============================] - 21s - loss: 0.6829 - acc: 0.7003 - val_loss: 0.5104 - val_acc: 0.7781\n",
443 | "Epoch 2/30\n",
444 | "11553/11553 [==============================] - 16s - loss: 0.4571 - acc: 0.7911 - val_loss: 0.4674 - val_acc: 0.7899\n",
445 | "Epoch 3/30\n",
446 | "11553/11553 [==============================] - 16s - loss: 0.3936 - acc: 0.8161 - val_loss: 0.4789 - val_acc: 0.7958\n",
447 | "Epoch 4/30\n",
448 | "11553/11553 [==============================] - 18s - loss: 0.3581 - acc: 0.8389 - val_loss: 0.4977 - val_acc: 0.7972\n",
449 | "Epoch 5/30\n",
450 | "11553/11553 [==============================] - 16s - loss: 0.3291 - acc: 0.8511 - val_loss: 0.5127 - val_acc: 0.7930\n",
451 | "Epoch 6/30\n",
452 | "11553/11553 [==============================] - 16s - loss: 0.3007 - acc: 0.8638 - val_loss: 0.5515 - val_acc: 0.7709\n",
453 | "Epoch 7/30\n",
454 | "11553/11553 [==============================] - 16s - loss: 0.2771 - acc: 0.8764 - val_loss: 0.5817 - val_acc: 0.7736\n",
455 | "Epoch 8/30\n",
456 | "11553/11553 [==============================] - 16s - loss: 0.2511 - acc: 0.8906 - val_loss: 0.6567 - val_acc: 0.7729\n",
457 | "Epoch 9/30\n",
458 | "11553/11553 [==============================] - 16s - loss: 0.2614 - acc: 0.8883 - val_loss: 0.6234 - val_acc: 0.7799\n",
459 | "Epoch 10/30\n",
460 | "11553/11553 [==============================] - 16s - loss: 0.2228 - acc: 0.9014 - val_loss: 0.7142 - val_acc: 0.7844\n",
461 | "Epoch 11/30\n",
462 | "11553/11553 [==============================] - 17s - loss: 0.2087 - acc: 0.9069 - val_loss: 0.6736 - val_acc: 0.7785\n",
463 | "Epoch 12/30\n",
464 | "11553/11553 [==============================] - 17s - loss: 0.1918 - acc: 0.9152 - val_loss: 0.7782 - val_acc: 0.7639\n",
465 | "Epoch 13/30\n",
466 | "11553/11553 [==============================] - 18s - loss: 0.1789 - acc: 0.9192 - val_loss: 0.7574 - val_acc: 0.7667\n",
467 | "Epoch 14/30\n",
468 | "11553/11553 [==============================] - 18s - loss: 0.1711 - acc: 0.9228 - val_loss: 0.8155 - val_acc: 0.7632\n",
469 | "Epoch 15/30\n",
470 | "11553/11553 [==============================] - 18s - loss: 0.1614 - acc: 0.9251 - val_loss: 0.8418 - val_acc: 0.7601\n",
471 | "Epoch 16/30\n",
472 | "11553/11553 [==============================] - 19s - loss: 0.1442 - acc: 0.9321 - val_loss: 0.8810 - val_acc: 0.7629\n",
473 | "Epoch 17/30\n",
474 | "11553/11553 [==============================] - 17s - loss: 0.1373 - acc: 0.9355 - val_loss: 1.0003 - val_acc: 0.7615\n",
475 | "Epoch 18/30\n",
476 | "11553/11553 [==============================] - 16s - loss: 0.1371 - acc: 0.9340 - val_loss: 0.9598 - val_acc: 0.7670\n",
477 | "Epoch 19/30\n",
478 | "11553/11553 [==============================] - 17s - loss: 0.1331 - acc: 0.9385 - val_loss: 0.9785 - val_acc: 0.7529\n",
479 | "Epoch 20/30\n",
480 | "11553/11553 [==============================] - 16s - loss: 0.1376 - acc: 0.9364 - val_loss: 1.0256 - val_acc: 0.7560\n",
481 | "Epoch 21/30\n",
482 | "11553/11553 [==============================] - 16s - loss: 0.1156 - acc: 0.9444 - val_loss: 1.0967 - val_acc: 0.7522\n",
483 | "Epoch 22/30\n",
484 | "11553/11553 [==============================] - 16s - loss: 0.1088 - acc: 0.9449 - val_loss: 1.1062 - val_acc: 0.7511\n",
485 | "Epoch 23/30\n",
486 | "11553/11553 [==============================] - 17s - loss: 0.1119 - acc: 0.9464 - val_loss: 1.0966 - val_acc: 0.7529\n",
487 | "Epoch 24/30\n",
488 | "11553/11553 [==============================] - 17s - loss: 0.1080 - acc: 0.9467 - val_loss: 1.1104 - val_acc: 0.7632\n",
489 | "Epoch 25/30\n",
490 | "11553/11553 [==============================] - 16s - loss: 0.1006 - acc: 0.9479 - val_loss: 1.1826 - val_acc: 0.7612\n",
491 | "Epoch 26/30\n",
492 | "11553/11553 [==============================] - 16s - loss: 0.0972 - acc: 0.9510 - val_loss: 1.1623 - val_acc: 0.7629\n",
493 | "Epoch 27/30\n",
494 | "11553/11553 [==============================] - 16s - loss: 0.0968 - acc: 0.9522 - val_loss: 1.2643 - val_acc: 0.7584\n",
495 | "Epoch 28/30\n",
496 | "11553/11553 [==============================] - 15s - loss: 0.0969 - acc: 0.9523 - val_loss: 1.2078 - val_acc: 0.7549\n",
497 | "Epoch 29/30\n",
498 | "11553/11553 [==============================] - 15s - loss: 0.0956 - acc: 0.9537 - val_loss: 1.2662 - val_acc: 0.7525\n",
499 | "Epoch 30/30\n",
500 | "11553/11553 [==============================] - 16s - loss: 0.0915 - acc: 0.9555 - val_loss: 1.3384 - val_acc: 0.7522\n"
501 | ]
502 | }
503 | ],
504 | "source": [
505 | "model.fit(X_train, y_train,\n",
506 | " batch_size=batch_size,\n",
507 | " epochs=30,\n",
508 | " validation_data=(X_test, y_test));"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 11,
514 | "metadata": {},
515 | "outputs": [
516 | {
517 | "name": "stdout",
518 | "output_type": "stream",
519 | "text": [
520 | "2880/2889 [============================>.] - ETA: 0s\n",
521 | "Test score: 1.33836409898 Test accuracy: 0.752163378373\n",
522 | "\n",
523 | " precision recall f1-score support\n",
524 | "\n",
525 | " 0 0.45 0.43 0.44 476\n",
526 | " 1 0.91 0.91 0.91 1451\n",
527 | " 2 0.67 0.68 0.67 962\n",
528 | "\n",
529 | "avg / total 0.75 0.75 0.75 2889\n",
530 | "\n"
531 | ]
532 | },
533 | {
534 | "data": {
535 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnMAAAIICAYAAAAFVWgMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XecFdX5x/HPdxdUelEgqChF1AgKxt57N4q9txjQJGKL\nxhpb7F1jLGDvvXfFKERUUERRFBv6s6AoCDZQWJ7fH3NWLuvusndZuNy73zevee29Z2bOPDN32Pvs\nOWdmFBGYmZmZWXEqK3QAZmZmZlZ/TubMzMzMipiTOTMzM7Mi5mTOzMzMrIg5mTMzMzMrYk7mzMzM\nzIqYkzkzMzOzIuZkzszMzKyIOZkzMzMzK2JNCh2AmVml+HCUH0kzn6lzj0KHUPJmffRGoUNoFMp6\nb6h5reNQta7z75yr47t53t784pY5MzMzsyLmljkzMzNrlJpooW1sy4uTOTMzM2uUSqV70smcmZmZ\nNUplpdEw52TOzMzMGie3zJmZmZkVsTKPmTMzMzMrXqXSMlcq+2FmZmaWlyaq+zQ3kq6XNFHSWzll\nF0h6V9Kbkh6Q1DaVd5U0TdLoNF2ds85qksZI+kDS5dLcmw+dzJmZmVmjJKnOUx3cCGxdpewZoHdE\nrAK8B5yQM+/DiOibpkNzyq8C+gM901S1zt9wMmdmZmaNUlke09xExFBgcpWypyNiZnr7MrB0bXVI\n6gy0joiXIyKAm4F+ddkPMzMzs0anTHWfGsCfgCdy3ndLXawvSNoglS0FfJazzGeprFa+AMLMzMwa\npXxatCQNAAbkFA2KiEF1XPckYCZwWyqaACwTEZMkrQY8KKlXHuHMwcmcmZmZNUr5PM4rJW51St5y\nSToQ2B7YLHWdEhE/Az+n169J+hBYHvicObtil05ltXI3q5mZmTVK87ubVdLWwD+AHSLip5zyDpLK\n0+vuZBc6fBQRE4DvJK2drmLdH3hobttxy5yZmZk1Sg3ZoiXpDmBjYAlJnwGnkl29uijwTLoi9uV0\n5eqGwBmSZgCzgEMjovLiib+SXRnbjGyMXe44u2o5mTMzM7NGqYyGewJEROxVTfF1NSx7H3BfDfNe\nBXrns20nc2ZmZtYoNdBVqgXnZM7MzMwapVK5cMDJnJmZmTVK+VzNujBzMmdmZmaNkrtZzczMzIqY\nu1nNzMzMiphb5szMzMyKWEPemqSQnMyZmZlZo+SWOTMzM7MiVu5kzszMzKx4uZvVzMzMrIi5m9XM\nzMysiPnWJGZmZmZFrEQa5pzMmZmZWeNU7sd5mZmZmRWv0kjlnMyZmdVowteTOO6iK5n07VQk2H3r\nzdi/3zZM+f4Hjj7nMj6f+A1LdVyCS044gjatWvLZV1+z3SF/p9vSSwLQZ4XlOH3gnwu8F8Vlwpdf\n8Y9/nsakSZOzY77LThyw954A3HLHXdx2972Ul5Wx0Qbr8Y8jDy9wtMVjwjeTOf7y65k09TsAdt9i\nQ/bffvNf59/w8NOcf9M9DL/hYtq1bsXnE79huyNOoduSnQDos3x3Tjtkv4LEPj85mbOiJmlxYEh6\n+zugAvga6Ap8EREr1bPejYFfImJ4A4RZXf1dgXUj4vb5UX/aRl9gyYh4fC7LrQ7sHxHz9I0i6Xng\nmIh4tY7Ld2U+H4OGIuljYPWI+KbQsdRHeXkZx/15X3ot140ffprGLoefyLp/WJkHnnmBtfv2ZsDu\nOzLo7ocYfM/DHPOnvQFYpnMnHrzi3AJHXrzKy8s5/ugj6PX7Ffnhxx/ZZe/9WW+tNflm8mSGPD+U\nh++6jUUWWYRJkycXOtSiUl5exj8O3I1e3Zflx2nT2eXYf7Fun5VYrsuSTPhmMi+OfpvOS7SfY50u\nnTrwwEWnFijiBUMl0s1aKhdyWJ4iYlJE9I2IvsDVwCXpdV9g1jxUvTGwbgOEWJOuwN7zsX7IjsG2\nc1soIl6d10Sunroy/4+BAR3bt6PXct
0AaNm8GT2WWYqvvpnMkJdfo9/mGwLQb/MNefalOuXhVgcd\nOyxBr9+vCEDLFi3o3q0bX339NXfccx8DDjqARRZZBIDF27evrRqromO7tvTqviwALZotRo+lO/PV\n5CkAnHvDXRyz/64lk9jkQ3lMCzMnc1adckmDJb0t6WlJzQAk9ZD0pKTXJA2TtGLuSqnF6FDgKEmj\nJW0kabwybSVVSNowLTtUUk9JLSRdL2mEpNcl7Zjml0u6QNJISW9KOiRt5lxgg1T/UVUDl3ScpDGS\n3pB0birrK+nlVM8Dktql8uclnZe2/Z6kDSQtApwB7JG2sYekNSW9lOIbLmmFtP7Gkh5Nr09L+/G8\npI8kHZ7KW0h6LMXzlqQ9ajjmu+XGUXk803EelabKJHmOY1DLsco9LtXGIeljSeenYzZC0nKpvIOk\n+1KdIyWtl1NPTZ/XhanuNyUNzNn8wBT/mKrnTDH57KuveefDj+mz4nJMmjKVju3bAdChXVsmTZk6\ne7kvv6bfYcez7z9O59W33i1UuCXhsy++4J1x4+jTuxcff/J/vPr6aHbb7yD2PfgQ3nx7bKHDK1qf\nT/yGd8Z/Sp+e3RgyYjSd2rdjxa5dql1up7+fzn7/vIBXx75XgEjnv7I8poWZu1mtOj2BvSKiv6S7\ngV2AW4FBwKER8b6ktYArgU0rV4qIjyVdDfwQERcCSBoHrAR0A0aRJSGvAF1SPWcDz0XEnyS1BUZI\nehbYB5gaEWtIWhR4UdLTwPFkXZLbVw1a0jbAjsBaEfGTpMo/3W8GBkbEC5LOAE4FjkzzmkTEmpK2\nBU6NiM0lnULWNXhYqrc1sEFEzJS0OXB2OiZVrQhsArQCxkm6CtiarNt6u1RXmxqO+RxxAJsDE4Et\nImK6pJ7AHcDqVY+BpAHVHauIGJ9Tf21xTI2IlSXtD1wKbA9cRtZa+z9JywBPAb8HTqrh89qfrMWw\nbzpOuc0m30TEHyT9FTgGmGMQWYp/AMDVZ57EgD13ruEQFc6P06Zz+FmXcMKA/WnZvPkc8yT92qLR\nsX1bnrvp37Rr3Yq33v+Iw/51EY9efcFv1rG5+/Gnnzj8mOM58ZijadmyJRUVFUydOpW7b76eMW+P\n5ch/nMCQRx9slK1J8+LHadM5/IKrOP6gPSgvL2PQ/Y9z7T+P/M1yHdq1Ycg159GuVUve/vATDjvv\nPzxy6em0bN6sAFHPP2Ulcv44mbPqjI+I0en1a0BXSS3Juk/vyfnluWgd6hoGbEiWzJ0D9AdeAEam\n+VsCO0g6Jr1fDFgmla8iaddU3oYsyfyllm1tDtwQET8BRMTklLS0jYgX0jI3AffkrHN/7n7WUG8b\n4KaUUAXQtIblHouIn4GfJU0EOgFjgIsknQc8GhHDali3ujiaAlcoG8NXASxfw7o1HavcZK62OO7I\n+XlJer05sFLOZ906nQM1fV6bA1dHxEzIjn0N+/abTC0iBpH9oUB8OCpq2MeCmTFzJoefdQl/3Hg9\ntlxvTQAWb9uGiZO/pWP7dkyc/C3t27QGYJGmTVmkaXZ69O7ZnS6dOzH+swmsvHyPgsVfjGbMmMnh\nxxzHH7fZii032wSATp06ssVmmyCJVXr3oqysjG+/nUL71EJqczdj5kyOuOAq/rjBWmy59h9475PP\n+Oyrb+j39zMA+GrSt+xy7Jncde6JdGjX5tdzuVePZenyuw58/MVX9F6uawH3oOGVRirnZM6q93PO\n6wqgGVkr85Q0ri4fQ4G/AEsCpwDHko2rq0wmBOwSEeNyV1KWRQyMiKeqlG+c5/bnpnJfK6j5/8O/\ngP9GxE7KupKfn0tdv9YXEe9J+gPZGLwzJQ2JiDPqGMdRwFdAH7LjP72G7VZ7rHLNJY7cBKrydRmw\ndkTMsc30uVT3edW06Zr2rShEBCdfOogeXZbkoJ23+7V807VX48FnhzJg9x158NmhbLb2agBMnvod\nbV
q2pLy8jE8nfMUnX3xJl86dChV+UYoITjr9X3Tv1o2D9tvn1/LNN96IV0a+xtprrM74Tz5hxowZ\ntGvXtoCRFpeI4OQrb6L70p05cIctAVh+2aV58YaLf11ms0OP597zT6Jd61ZMnvo9bVq2yM7lL7/m\nkwkTWbpTh0KFP984mbNGJSK+Uzb+bbeIuCd9qa8SEW9UWfR7oHXO+xHALcBHqbtwNHAIWVceZN13\nAyUNjIiQtGpEvJ7K/yLpuYiYIWl54PNUf6sawnwGOEXSbZXdrKl17ltJG6TWqP3IWgZrU3UbbdK2\nAQ6cy7pzkLQkMDkibpU0hSpdjHPRBvgsImZJOgAoryG+ao9VRPxYxzj2IBuHtwfwUip7GhgIXJDW\n75taa2v6vJ4BDpH038pu1iqtc0Vp1NhxPPTcMJbv2oV+hx0PwFEH7EH/3XbgqHMu476nn2fJdGsS\ngJFj3uHft95DkyZNKJM47bCDaduqZSF3oei8NvoNHnrsCZbvuRw77pElc0cf9ld26bcDJ572L7bf\ndU+aNm3KuWec6i7WPIx69wMefuFlll9mKXb6++kAHLn3zmy02srVLv/q2Pe4/M6HaNqkHKmM0wbs\nS9tWLRZkyAuEn81qjdE+wFWSTibrArwTqJrMPQLcmwbGD4yIYZI+BV5O84cBe5F1+0HW6nUp8Kak\nMrKuwe2Ba8m6G0elxPFroB/wJlAh6Q3gxoio7BYkIp5MXZKvSvoFeBw4ETgAuFpSc+Aj4KC57Od/\ngeNT4nkOcD5ZN+vJwGN1O1S/Whm4QNIsYAZZK2VdXQncl8ayPQlUJmdzHAOy8W1d+e2xqmsc7SS9\nSdaCtlcqOxz4TypvQtbCeii1f17Lp/IZwGDgijz2daG0Wq8VeffxO6qdd+M5J/+mbKv112Kr9dea\n32GVtNVX7cu410dUO+/Cs6pr1La6WO33PXnnvsG1LjPk6tm31NlyndXYcp3V5ndYBacSaZtTxEI3\nRMXMFhAtZPeBWxjHzJUadfb4vflt1kdV/8a1+aGs94bznIk9ucSSdf6ds/U3Xyy0mZ9b5szMzKxR\ncjermRW9iOha6BjMzAqlrES6WZ3MmZmZWaNUGqmckzkzMzNrpErlgmgnc2ZmZtYolUgu52TOzMzM\nGqfyEmmaczJnZmZmjVJppHJO5szMzKyRKpVkrqzQAZiZmZkVgvL4N9e6pOslTZT0Vk5Ze0nPSHo/\n/WyXM+8ESR9IGidpq5zy1SSNSfMuVx2eW+dkzszMzBolqe5THdwIbF2l7HhgSET0BIak90haCdgT\n6JXWuVJS5fO3rwL6Az3TVLXO33AyZ2ZmZo1SWR7T3ETEUGByleIdgZvS65uY/dzsHYE7I+LniBgP\nfACsKakz0DoiXo7seas389tnbf+Gx8yZmZlZo1Q2/69m7RQRE9LrL4FO6fVSwMs5y32Wymak11XL\na+WWOTMzM2uUlM8kDZD0as40IJ9tpZa2aMj4K7llzszMzBqlfNrlImIQMCjPTXwlqXNETEhdqBNT\n+edAl5zllk5ln6fXVctr5ZY5MzMza5Qk1Xmqp4eBA9LrA4CHcsr3lLSopG5kFzqMSF2y30laO13F\nun/OOjVyy5yZmZk1SmUNOGRO0h3AxsASkj4DTgXOBe6WdDDwCbA7QES8LeluYCwwE/hbRFSkqv5K\ndmVsM+CJNNXKyZyZmZk1SmXlDZfNRcReNczarIblzwLOqqb8VaB3Ptt2MmdmZmaNUok8mtXJnJmZ\nmTVO8zAWbqHiZM7MzMwapRLJ5ZzMmZmZWePkljkzMzOzIlYiuZyTOTMzM2ucyhvy3iQF5GTOzMzM\nGiV3s5qZmZkVMZXIc7CczJmZmVmj5JY5MzMzsyJWIrmckzkzMzNrnNwyZ2ZmZlbEfDWrmZmZWREr\nkYY5J3NmtvBQx2ULHULJO7RFl0KHUPKunjyu0CFYHbmb1czMzKyI
lUgu52TOzMzMGicnc2ZmZmZF\nrKy8NLI5J3NmZmbWKJWVSNOckzkzMzNrlEokl3MyZ2ZmZo2Tr2Y1MzMzK2Ilkss5mTMzM7PGyS1z\nZmZmZkWszI/zMjMzMyteKit0BA3DyZyZmZk1Su5mNTMzMytm7mY1MzMzK2JumTMzMzMrXu5mNTMz\nMytm5aVxBYSTOTMzM2uU5DFzZmZmZkXM3axmZmZmxcstc2ZmZmbFrERa5kpj5J+ZmZlZnlReVudp\nrnVJK0ganTN9J+lISadJ+jynfNucdU6Q9IGkcZK2qu9+uGXOzMzMGqcG7GaNiHFAXwBJ5cDnwAPA\nQcAlEXFh7vKSVgL2BHoBSwLPSlo+Iiry3bZb5szMzKxxkuo+5Wcz4MOI+KSWZXYE7oyInyNiPPAB\nsGZ9dsPJnJmZmTVKKqv7lKc9gTty3g+U9Kak6yW1S2VLAZ/mLPNZKsubkzkzMzNrnPJomZM0QNKr\nOdOA6qvUIsAOwD2p6CqgO1kX7ATgoobeDY+ZMzMzs0Ypn1uTRMQgYFAdFt0GGBURX6X1vvp1e9Jg\n4NH09nOgS856S6eyvLllzszMzBqn8rK6T3W3FzldrJI658zbCXgrvX4Y2FPSopK6AT2BEfXZDbfM\nmZmZWaOkBr7PnKQWwBbAITnF50vqCwTwceW8iHhb0t3AWGAm8Lf6XMkKTubMzMyssWrgJ0BExI/A\n4lXK9qtl+bOAs+Z1u07mzMzMrHEqkSdAOJkzMzOzRqmhu1kLxcmcmZmZNU4N3M1aKE7mzMzqaNM/\n7kyL5s0pKy+nvLyc+2+5nieefY4rBl3Hh+M/5p6brmXllX5f6DCLwn7X/YeVt9+a7yd+zb9WXhuA\nP55xMn123JaYNYvvJ37DTQceytQJX9KifXsG3Hszy67xB16+8XbuHHjMr/WsvueubHPi34kIpn7x\nJdfv+2d+nDS5ULtVNG684x7uefgxJFi+R3fOOfk4Lh10Pf/933CaNmnKMksvyTknH0frVq0KHep8\nVZdnrhaD0tgLKxqSFs952PCXOQ8fniJp7DzUu7GkdRsy1ir1d5W0dw3zlpR073zcdr/0DL981ukg\n6RVJr0vaQNJukt6R9N8GjOtQSfs3VH3F4qZrruCh22/i/luuB7Ivwn+ffzZrrNq3wJEVl5duvI1/\nb73zHGXPXHAZZ/ZZl7NWXZ8xjz7JdqccB8CM6dN5+J9nct8xJ8+xfFl5Obtfdh4Xb7IdZ/ZZl8/f\nfItNDjsEq91XE7/m5rvv474bruHR22+kYtYsHnvmOdZbc3Ueve0GHrnterp26cI1N91e6FDnv/n3\nOK8FysmcLVARMSki+kZEX+BqsocP9yW7M/aseah6Y2C+JXNAV6DaZC4ivoiIXefjtvsBeSVzZM8F\nHBMRq0bEMOBgoH9EbNJQQUXE1RFxc0PVV6x6dOtK967LFjqMovPBsOH8NPnbOcqmf//9r68XadGc\niADgl59+4sMXX2bm9OlzVpLdlZ9FW7QAYLHWrZjyxYT5G3iJqKioYPrPPzNz5kymT59Oxw5LsP5a\na9CkSdZh17f3Snw58esCRzn/qUx1nhZmTuZsYVIuabCktyU9LakZgKQekp6U9JqkYZJWzF1JUlfg\nUOCo1Mq3kaTxyrSVVCFpw7TsUEk9JbVIz8gbkVqvdkzzyyVdIGlkeo5e5Z/55wIbpPqPqrp9SW+l\n171SnaPT+j2r7qSkHySdJekNSS9L6pRTz3NpvSGSlkmtjTsAF6Q6e1Sz7arr9AXOB3ZM65wKrA9c\nl/at2n1MrZvPS7pX0ruSblMaHSzpXElj0/IXprLTJB0jaUVJI6rENCa9Xk3SC+mze6rKzTOLj8RB\nfz2Cnfc9iLvuf7DQ0ZSkHc/8J2f/31jW3Gd3Hjml9js2zJo5kzv+chT/HPMS533xHp1XWpEXr2v0\nf1/MVaeOHfjTPnuwSb/dWX/7
XWjZoiXrr7XGHMvc98jjbLhOvZ75XlzcMmfW4HoC/4mIXsAUYJdU\nPggYGBGrAccAV+auFBEfk9PKFxEvAOPIWrPWB0aRJWKLAl0i4n3gJOC5iFgT2IQsWWpB1oI1NSLW\nANYA+qc7cx8PDEv1X1LLPhwKXJZaG1cne3ByVS2AlyOiDzAU6J/K/w3cFBGrALcBl0fEcLK7hB+b\ntv1hlbqqW2c0cApwV1rndOBVYJ+IOLaWfQRYFTgyHbvuwHqSFie7a3mvtJ0zqxz/d4FFcurYA7hL\nUtMU367ps7ueBrifUiHdce3VPHT7TQy+/CJuu+d+Ro56vdAhlZyHTv4XJy6zEiNuu5uN59JlWtak\nCRv+5WDOWnUDjltyeT5/8222PuHvCyjS4jX1u+8ZMvRFhtx/J8MevY9p06fx0BNP/zr/qhtuobxJ\nOTtsvUUBo1xAylT3aSHmZM4WJuNTIgLwGtBVUkuy7tN7JI0GrgHq0rozDNgwTeeQJXVrACPT/C2B\n41OdzwOLAcuk8v1T+StkN3/8TetaLV4CTpR0HLBsREyrZplfmP1svtfIunAB1gEqB6nckmKem/qs\nU9s+joiIzyJiFjA6xTYVmE7Wsrcz8FM1dd5NlsSRft4FrAD0Bp5J2zqZ7NmDc1DOw6sH3XBTHcIv\nnE4dOwCwePv2bLHxhrz59jsFjqh0jbjtblbdZYdal+nSdxUAvvloPACv3n0/PdZda77HVuyGj3yN\npZfsTPt2bWnapAlbbrwhr495G4D7H32C5198iQtPP7lkbttRG5WX1XlamPlqVluY/JzzugJoRvYH\nx5TU0pWPocBfgCXJWqmOJRtXNyzNF7BLRIzLXSl1Kw6MiKeqlG9cl41GxO2SXgG2Ax6XdEhEPFdl\nsRlRORgo288F/f+wtn2s+hk0iYiZktYkG4e3K3AYsGmVOu8iS7jvByIi3pe0MvB2RKxTWzBzPLz6\n+0lR27KF9NO0acyaNYuWLVrw07RpvPjKCP765z8VOqyS0nG5Hkz8IGt87rPjdnz17nu1Lj/l8y/o\nvNKKtFxicX74ZhK/32JTJrwzrtZ1DJbs1JE33hrLtOnTWWzRRXnp1VH0XnEFhr70Ctfeeie3XnUZ\nzRZbrNBhLhglkrA6mbOFWkR8l8a/7RYR96Rka5WIeKPKot8DrXPejyBrqfooIqanlqFDgO3T/KeA\ngZIGRkRIWjUiXk/lf5H0XETMkLQ88Hmqf67X6EvqnrZ5uaRlgFWAqslcTYYDe6a492F24lnbtmta\npzY17WO1Uuto84h4XNKLwEdVl4mIDyVVAP8kS+wg6+ruIGmdiHgpdbsuHxFv1yHGhc6kSZP527En\nANng8e232oIN112bZ/77Av+64GImfzuFQ448ht8v35Prrri0wNEu/A6+/XqW33h9Wi6xOOd8+g6P\nnHo2vbfdkk4r9CRmzWLyJ59y+6FH/rr8WePHsFjr1pQv0pQ+/bbj8i37MeGdcTx6+rn8feiTVMyY\nweRPPuWmA/9SwL0qDn16r8RWm27ETgf0p0l5Ob9fvid79Nue7fY+kF9+mcFBh//91+XOOK7Eu62d\nzJktMPsAV0k6GWgK3AlUTeYeAe5NFzIMjIhhkj4FXk7zhwF7AWPS+38BlwJvSioDxpMleteSdS2O\nSonj12RXk74JVEh6A7ixlnFzuwP7SZoBfAmcncd+DgRukHRs2u5BqfxOYLCkw8nGn31Yh3VqU9M+\n1qQV8JCkxcha9Y6uYbm7gAuAbgAR8YukXYHLJbUh+31zKVCUyVyXpZfi4Tt+O7h+i002YotNNipA\nRMXtur1/26o5/Ppbalz+pG4rV1s+7JrrGXbN9Q0WV2NxeP+DOLz/nL8unrm3EdyKpKoSSeY0u7fH\nzKzAFuJu1lJxaOtuc1/I5snVk93Vu0C06zzPmdjMo3aq8++cJpc8sNBmfm6ZMzMzs8apRFrmnM
yZ\nmZlZ41S2cF+lWldO5szMzKxxcjJnZmZmVsTczWpmZmZWxJzMmZmZmRUxJ3NmZmZmRay8vNARNAgn\nc2ZmZtY4uWXOzMzMrIg5mTMzMzMrXvKtSczMzMyKmFvmzMzMzIqYkzkzMzOzIuarWc3MzMyKmFvm\nzMzMzIqYkzkzMzOzIuZkzszMzKyI+dYkZmZmZkXMLXNmZmZmRcxXs5qZmZkVsRJpmSuNzmIzMzOz\nfEl1n+pUnT6WNEbSaEmvprL2kp6R9H762S5n+RMkfSBpnKSt6rsbTubMzMyscWrgZC7ZJCL6RsTq\n6f3xwJCI6AkMSe+RtBKwJ9AL2Bq4UlK9+n2dzJmZmVnjVFZW96n+dgRuSq9vAvrllN8ZET9HxHjg\nA2DNeu3GvERnZmZmVrQaPpkL4FlJr0kakMo6RcSE9PpLoFN6vRTwac66n6WyvPkCCDMzM2ucVPc2\nrZScDcgpGhQRg6ostn5EfC6pI/CMpHdzZ0ZESIp6x1sDJ3NmZmbWOJXVfSxcStyqJm9Vl/k8/Zwo\n6QGybtOvJHWOiAmSOgMT0+KfA11yVl86leXN3axmZmbWOKms7tPcqpJaSGpV+RrYEngLeBg4IC12\nAPBQev0wsKekRSV1A3oCI+qzG26ZMzMzs8apYe8z1wl4QFmdTYDbI+JJSSOBuyUdDHwC7A4QEW9L\nuhsYC8wE/hYRFfXZsJM5MzMza5wa8NmsEfER0Kea8knAZjWscxZw1rxu28mcmZmZNU5lfpyXmZmZ\nWfEqkcd5OZkzs4VGTPyk0CGUvKsmvlXoEErezDP+WugQGoUmlzww75U0YDdrITmZMzMzs8bJLXNm\nZmZmRSyPmwYvzJzMmZmZWeOUx02DF2ZO5szMzKxx8tWsZmZmZkXM3axmZmZmRczdrGZmZmZFzFez\nmpmZmRUxd7OamZmZFbFyXwBhZmZmVrzczWpmZmZWxNzNamZmZlbEfDWrmZmZWRFzy5yZmZlZEfOY\nOTMzM7Mi5qtZzczMzIqYu1nNzMzMipi7Wc3MzMyKWJlb5szMzMyKl1vmzMzMzIqYx8yZmZmZFbEy\nX81qZmZmVrz8BAgzMzOzIuZuVjMzM7Mi5gsgzMzMzIqYW+bMzMzMipd8AYSZmZlZEXPLnJmZmVkR\n89WsZmZmZkXMLXNmZmZmRaxErmYtjZTUzMzMLF8qq/s0t6qkLpL+K2mspLclHZHKT5P0uaTRado2\nZ50TJH0gaZykreq7G26ZMzOrwYSvJ3HcRVcy6dupSLD71puxf79tmPL9Dxx9zmV8PvEbluq4BJec\ncARtWrXW2z9WAAAgAElEQVT8db0vJn7D9ocew9/22ZWDd9m+gHtQHE48+0KeH/4Ki7dryyO3DAZg\nynffcfQpZ/H5l1+y1O9+xyVnnEyb1q2YMXMmJ597MWPfe5+Kigp23HoLDtlvrwLvQZFYrDlle/4N\n/W4ZACruuIKyFfuitbeAH78DYNZjtxLvjAJAm+1M2VqbQ8xi1v3XEuNGFyz0+aZhr2adCfw9IkZJ\nagW8JumZNO+SiLgwd2FJKwF7Ar2AJYFnJS0fERX5brjWVFPS4jmZ5Jc5meUUSWPz3VhOvRtLWre+\n69eh/q6S9p5f9c8rSYdLekfSbZIWlfRsOq57NOA2hjdQPT80RD2FlM63R/Ncp1/6j9YQ26/xfKxn\nbAdKWrIhYpufUpxXFDqOeVFeXsZxf96Xx665kDsv/he3Pfo0H/zfZwy++yHW7tubp669hLX79mbw\nPQ/Psd65g29hg9X7Fijq4rPTtlsy+KKz5ygbfOtdrL3aqjx1502svdqqDL71TgCefG4oM2bM4JGb\nB3PfdVdy10OP8dmELwsRdtEp2/nPxDuvU3HuQCouOAq++hSAWS88QsWFR1Nx4dG/JnJ0WpqyVden\n4rzDqbjmDMp2PaRkxpfNoUx1n+YiIiZExKj0+nvgHWCpWl
bZEbgzIn6OiPHAB8Ca9dqNuQQ2KSL6\nRkRf4GqyzLIv0BeYVZ8NJhsD8y2ZA7oCC20yB/wV2CIi9gFWBUjH+a6G2kBEzM/j2xj0AxokmaPh\nz8cDyf6Ks/msY/t29FquGwAtmzejxzJL8dU3kxny8mv023xDAPptviHPvvTqr+s8O3wkS/+uI8st\ns3RBYi5Ga/RdhTatW81RNmTYcPptswUA/bbZgmeHZX+fSvDTtOnMnFnB9J9/oWmTJrRs0XyBx1x0\nFmuOuq9EvPJs9r5iJkz/qcbF1XtNZr3+v2y5yROJbybAMj0XULALUAN2s85RrdSV7Pv9lVQ0UNKb\nkq6X1C6VLQV8mrPaZ9Se/NVoXtLsckmDU7/w05KaAUjqIelJSa9JGiZpxdyV0g4eChyVWqM2kjRe\nmbaSKiRtmJYdKqmnpBbpAIyQ9LqkHdP8ckkXSBqZDtIhaTPnAhuk+o+qsv05WkIkXSHpwPT63NTX\n/aakC1NZB0n3pW2MlLReKt8op9Xy9dSkSpVtHS3prTQdmcquBroDT0g6DrgVWCPV00PSapJeSMfv\nKUmd03rPSzovHYP3JG2QynulstEp7p6p/If0805J2+XEdKOkXWs5dtWS1FLSEEmjJI3J+Qy6plbG\n6s6FNVLdo9O23krlc7TYSHpU0sbp9VWSXk11nZ6zzLaS3k3H5fLKz7Cmc6MarSU9pmxcwtVS9j9T\nOS2P6bjcqKzVeAfggsrPpcqxuDHFMFzSR5J2TeWq3M90jCpbWms8H5OWku5N+3eblI3IlXRK+nze\nkjQo1b8rsDpwW6qvWU3nTJWYd0v1vCFpaM7n8FA6t96XdGrO8vvmnFfXSCpP5VtKeimdB/dIapnz\nWQ9P9Y/Q7P8PSyr7ffC+pPNr+GyKwmdffc07H35MnxWXY9KUqXRsn/0+7tCuLZOmTAXgx2nTGXzv\nI/xt710KGWpJmPTtt3RcYnEAOizenknffgvAVptsSPNmi7FBvz3YdJd9+NNeu9G2detChloc2neE\nH76jbK+BlP/9Isr2+CsssigAZRtsS/mxl1C252HQrAUAarM4TJk0e/0pk1Db9oWIfP6S6jxJGpC+\nnyqnAdVXqZbAfcCREfEdcBXZ935fYAJwUUPvxrwkcz2B/0REL2AKUPnbaxAwMCJWA44BrsxdKSI+\nJqeVLyJeAMaRtYKsD4wi++JbFOgSEe8DJwHPRcSawCZkX7ItgIOBqRGxBrAG0F9SN+B4YFiq/5K6\n7IykxYGdgF4RsQpwZpp1WYp1jbSP16byY4C/pZbKDYBpVepbDTgIWAtYO8W2akQcCnwBbBIR5wF/\nrowV+D/g38Cu6fhdD5yVU22TdAyOBCq/eA8FLkvrr06W2ee6C9g9xbQIsBnwWC3HribTgZ0i4g9k\nn8FFlUkHNZ8LNwCHpNjqOgbgpIhYHVgF2EjSKpIWA64BtknHpUPu8lR/blS1JjCQ7DzrAexcUwAR\nMRx4GDg2nUMfVrNYZ7LzdXuyZI1UZ1+gD7B5iqUzcz8fVyX7TFci+w+/Xiq/IiLWiIjeQDNg+4i4\nF3gV2Ccd15nUfs5UOgXYKiL6kCWqucdlF7LjvZuk1SX9HtgDWC/ns9tH0hLAycDm6Tx4FTg6nVd3\nAUek+jdn9v+HvqmulYE9JHWpGljuL8hBd95fTeiF9+O06Rx+1iWcMGB/WjafsxVI2S95AK647V4O\n7LcNLZotVogwS5YkRHaMx4x9l7KyMoY+eCfP3nMzN9x5L59+PqHAERaB8nJYujuzXnySiov+Dr/8\nTNlmO2fvz/wLFRceDd99S9mOBxU60gUrj5a5iBgUEavnTIN+U53UlCyRuy0i7geIiK8ioiIiZgGD\nmd2V+jmQ+ztx6VSWt3m5AGJ8RFSOhnwN6Jqy0XWBe2Z/z7NoHeoaBmwIdAPOAfoDLwAj0/wtgR0k\nHZPeLwYsk8pXqWwZAd
qQJRa/1GN/ppIlLNelVp/K1rvNgZVy9qd12s8XgYsl3QbcHxFVk6j1gQci\n4kcASfeTJX2v1xLDCkBv4Jm0vXKyLL5S5Tfda2RddwAvASdJWjrF8X6VOp8ALkvJ8dbA0IiYJqmm\nYze+htgEnK2s1XQWWVNwpzSvunOhLdAqIl5K5beTJT5zs3v6a6cJWcK0EtkfHR+lMQUAdwCVfxHV\ndG68U6XeERHxEYCkO8g+n3vrEE9NHkz/McdKqjwO6wN3pMGrX0l6gSxR/m4udY2oPH8kjSb7bP8H\nbCLpH0BzoD3wNvBIlXXnds5UehG4UdLdzD6PAJ6JiElp2/enfZgJrAaMTHU2AyaS/VGyEvBiKl+E\n7PxbAZgQESMB0l+ilQnOkIiYmt6PBZZlzm4F0i/EQQDx4aiYy7Fa4GbMnMnhZ13CHzdejy3Xy34H\nL962DRMnf0vH9u2YOPlb2rfJWobeHPcBT/3vFS64/na+//EnyiQWXaQp+/6x3hepNVqLt2vHxG8m\n0XGJxZn4zSTat2sLwKPPPMcGa61O0yZNWLxdO/6wci/eevc9uiz1mwZpyzVlEkydBP+XfUXMemM4\nZZvtDD9M/XWRWS89TXn/kwGIqZOg7eKz12+7ODFl8gINeYFowAsgUgPHdcA7EXFxTnnniKj8vbwT\n8FZ6/TBwu6SLyYbO9ARG1Gfb85LM/ZzzuoLsF34ZMCX9NZ+PocBfyHbmFOBYsnF1w9J8AbtExLjc\nldKBGxgRT1Up37iWbc1kzhbJxQAiYqakNclarnYFDgM2TcuuHRHTq9RzrqTHgG3Jvty2ioh367S3\nNRPwdkSsU8P8ymNeQfrsIuJ2Sa8A2wGPSzokIp6rXCEipkt6HtiKrIXkzpxt/ebY1WIfshax1SJi\nhqSPSceO6s+F2lT7GaSWwWOANSLiW0k35myjJtWeG9WomiRENeX5NKfk7vO83qio6vFrklojrwRW\nj4hPJZ1WQ3xzO2cAiIhDJa1Fdp68llqOofrjIuCmiDhhjg1JfyRL/vaqUr5yPvtWW5wLm4jg5EsH\n0aPLkhy086+jFdh07dV48NmhDNh9Rx58diibrZ0dztsuOO3XZf596700b7aYE7l62nT9dXjwiWcY\nsN+ePPjEM2y2QTYMuHOnjrw8ajQ7br0FP02bxhtj3+GA3WtsaLdK30+BKd9AhyXh6y8o67kKfPkZ\ntG4H32Vd2FplbWLCJwDE2yMp3/coKp5/GNq0Rx06/5oIlpSyBr2oYz1gP2BM+sMc4ERgL0l9yX6/\nfgwcAhARb6c/sMeSfS/+rT5XskID32cu/UU+XtJu8OsYoj7VLPo9kDvGbARZi96slDSNJtvZoWn+\nU2SDByvHEq2aU/6X1KyJpOVTF1vV+nN9QtbStmhqPdosrdsSaBMRjwNHkXWVATxN1j1HWq5v+tkj\nIsakrtKRwBxjA8kS0X6SmqeYdmJ2clqTcUAHSeukbTSV1Ku2FSR1J2u1uhx4iKy7rKq7yLp8NwCe\nTGU1HbuatAEmpkRuE7IWlhpFxBTg+5RAQHb5daWPgb6SylK3W2WTc2vgR2Bqau3aJpWPA7orG28J\nWVJaqaZzo6o1JXVTNlZuD7KWL8ha0H6fynfKWb62c6gmw8i6EssldSBrbR5Rz7oqE7dv0rm5a868\n3PrqdM6k8/WViDgF+JrZTftbSGqvbJxjP7IWvCHArpI6pnXbS1oWeBlYT9JyqbyFpOVTDJ0lrZHK\nW0kqqqStJqPGjuOh54bx8htv0++w4+l32PG8MPJ1+u+2A8NfH8NWfz6Kl0a/Rf/daxqqaXVx9Kln\nsdehRzD+/z5lo5324t5Hn6D/vnsy/NXX2GrPA3jp1VH03zf7b7/3zjvy00/T2H7fP7Nb/8PYedut\nWGG57gXeg+JQcd9gyvc7ivJjL4GlujHr2Xsp++P+lB97KeXHXoKW682sh27IFv7yU2aN
Hk758f+m\n/JBTmHXvYIh5ue5x4VQ5TKIu09xExP8iQhGxSuXFoxHxeETsFxErp/IdclrpiIizIqJHRKwQEU/U\ndz/mxy/cfYCrJJ0MNCVrCXqjyjKPAPcqG6w+MCKGSfqU7MsCsi/FvYAx6f2/gEuBN9OX7niyLrtr\nybqkRqUv86/JvpDeBCokvQHcmDtOKbVy3E3WzDme2d2erYCHUouIgKNT+eHAfyS9SXa8hpKNUzsy\nJTWzyLq/5vgQIrvPzI3MbjK9NiJq62IlIn5J3Z6XS2qTtndpqr8muwP7SZoBfAmcXc0yTwO3AA9F\nRGUXdE3Hria3AY9IGkM2VqourZAHA4MlzSLrNq9sz3+R7NiPJesOrbyU+w1Jr6e6P03LkbqF/wo8\nKelHZne/Q83nRlUjgSuA5YD/Ag+k8uPJutS/TvtVebOwO1Psh5ONR6tu3FxVDwDrkJ3vAfwjIr6U\nNIkazseaRMQUSYPJztMvq+zzjcDVkqal7dXlnLlA2cUxIkvW3iAbzzaCbHzH0sCtEfEqQPr/+3Q6\npjPI/mJ8WdnFQnco67YHODki3lN2sce/U1I4jWx4QtFbrdeKvPv4HdXOu/Gck2tdd+C+u9Y632a7\n+PSTqi2/8bILflPWonkzLjvzlPkdUmn64mMqLj52jqJZt11W4+Lx7L1UPDsvo1GKQIncbkURC90Q\nFSsRklpGROVVtccDnSPiiHmpKyWe/wHer+vFLVa9lJitHhGHFTqWSgvjmLmS03qJQkdQ8irOrtev\nOctTk0semOdnccX7I+v8O0c911hon/1VEl0httDaTtIJZOfZJ2T3R6uv/pIOIBt0/zrZ1a1mZmb1\nVyItc07mbL6J7CbIDXIj5NQK55a4BhQRN5J12ZqZNU7lDfo4r4JxMmdmZmaNUx0ubCgGTubMzMys\ncXI3q5mZmVkRc8ucmZmZWTFzMmdmZmZWvNwyZ2ZmZlbEPGbOzMzMrIi5Zc7MzMysiJVGLudkzszM\nzBqr0sjmnMyZmZlZ4+RuVjMzM7Mi5gsgzMzMzIqYW+bMzMzMipmTOTMzM7Pi5ZY5MzMzsyLmZM7M\nzMysmDmZMzMzMytaKvPVrGZmZmZFzC1zZmZmZsXLY+bMzMzMipiTOTMzM7Ni5mTOzMzMrHi5Zc7M\nzMysiDmZMzMzMytiTubMzMzMipiTOTMzM7Ni5mTOzMzMrHi5Zc7MzMysiMmP8zIzMzMrXiXSMqeI\nKHQMZmZFS9KAiBhU6DhKmY/xguHjXLxKo33RzKxwBhQ6gEbAx3jB8HEuUk7mzMzMzIqYkzkzMzOz\nIuZkzsxs3niM0fznY7xg+DgXKV8AYWZmZlbE3DJnZmZmVsSczJmZmZkVMd802MzMrBGStBSwLDm5\nQEQMLVxEVl9O5szM8iBpeeBYfvsluGnBgioxkjoBZwNLRsQ2klYC1omI6wocWsmQdB6wBzAWqEjF\nATiZK0K+AMLMLA+S3gCuBl5j9pcgEfFawYIqMZKeAG4AToqIPpKaAK9HxMoFDq1kSBoHrBIRPxc6\nFpt3bpkzM8vPzIi4qtBBlLglIuJuSScARMRMSRVzW8ny8hHQFHAyVwKczJmZ1YGk9unlI5L+CjxA\nzhdhREwuSGCl6UdJi5N1+yFpbWBqYUMqDZL+TXZcfwJGSxrCnOfx4YWKzerP3axmZnUgaTzZl6Cq\nmR0R0X0Bh1SyJK0GXA70Bt4COgC7RsSbBQ2sBEg6oLb5EXHTgorFGo6TOTMzW+ikcXIrkCXP4yJi\nRoFDKimSWgDTI6IivS8HFo2InwobmdWH7zNnZpYHSX+T1DbnfbvU7WoNRNKbwD/Iko23nMjNF0OA\nZjnvmwHPFigWm0dO5szM8tM/IqZUvomIb4H+BYynFP0RmAncLWmkpGMkLVPooErMYhHxQ+Wb9Lp5\nAeOxeeBkzswsP+WSfh03l7qnFilgPCUnIj6JiPMj
YjVgb2AVYHyBwyo1P0r6Q+WbNE5xWgHjsXng\nq1nNzPLzJHCXpGvS+0NSmTUgScuS3dR2D7L7+f2jsBGVnCOBeyR9QTYu8Xdkx9qKkC+AMDPLg6Qy\nsgRus1T0DHBt5UBym3eSXiG7B9o9wF0R8VGBQypJkpqSXWQCvsikqDmZMzPLk6RmwDIRMa7QsZQi\nSSv42M5fkpoDRwPLRkR/ST2BFSLi0QKHZvXgZM7MLA+SdgAuABaJiG6S+gJnRMQOBQ6t6EnaNyJu\nlXR0dfMj4uIFHVOpknQX2SPp9o+I3im5Gx4RfQscmtWDL4AwM8vPqcCawBSAiBgNdCtoRKWjRfrZ\nqobJGk6PiDgfmAGQ7i9X3Q2xrQj4Aggzs/zMiIipORe0QnrslM2biLgm/Ty90LE0Ar+k4QKVj0zr\ngZ/TWrTcMmdmlp+3Je1NdouSnulZl8MLHVQpkXS+pNaSmkoaIulrSfsWOq4ScyrZVdhdJN1GdhNh\nXzFcpDxmzswsD2ls0UnAlqnoKeDMiJheuKhKi6TREdFX0k7A9mQD9YdGRJ8Ch1ZSJC0OrE3Wvfpy\nRHxT4JCsntzNamaWhzS26CRJZ/k5lvNN5XfTdsA91XRr2zxKN77eBugeEWdIWkbSmhExotCxWf7c\nzWpmlgdJ60oaC7yb3veRdGWBwyo1j0p6F1gNGCKpA+CWz4Z1JbAOsFd6/z3wn8KFY/PC3axmZnlI\nN7TdFXg4IlZNZW9FRO/CRlZaJLUHpkZERerabh0RXxY6rlIhaVRE/EHS6znn8Rvuyi5O7mY1M8tT\nRHxapdvPT39oeCsCXSXlfk/dXKhgStCM9FzhyqtZOwCzChuS1ZeTOTOz/HwqaV0g0uOQjgDeKXBM\nJUXSLUAPYDSzE+XAyVxDuhx4AOgk6Syy1uaTCxuS1Ze7Wc3M8iBpCeAyYHOyccdPAUdExKSCBlZC\nJL0DrBT+gpqvJK3I7GcMPxcR/qOkSLllzswsD+n2DfsUOo4S9xbwO2BCoQMpcc2Byq7WZgWOxeaB\nr2Y1M8uDpO6SHkk3sp0o6SFJ3QsdV4lZAhgr6SlJD1dOhQ6qlEg6BbgJaE92vG+Q5G7WIuVuVjOz\nPEh6mewWDnekoj2BgRGxVuGiKi2SNqquPCJeWNCxlCpJ44A+lTe7To/2Gh0RKxQ2MqsPt8yZmeWn\neUTcEhEz03QrsFihgyolKWn7GGiaXo8ERhU0qNLzBXOet4sCnxcoFptHbpkzM8uDpPOAb4E7ycYa\n7QG0Ay4AiIjJhYuuNEjqDwwA2kdED0k9gasjYrO5rGp1JOlBYA3gGbLzeAtgBPAZQEQcXrjoLF9O\n5szM8iBpfC2zIyI8fm4eSRoNrAm8knND2zERsXJhIysdkg6obX5E3LSgYrF556tZzczyEBHdCh1D\nI/BzRPxSeWPmdONgtzw0oNxkTVI7oEtEvFnAkGweeMycmVkeJO0mqVV6fbKk+yWtWui4SswLkk4E\nmknaArgHeKTAMZUUSc9Lap0emzYKGCzp4kLHZfXjZM7MLD//jIjvJa1PduPg64CrCxxTqTke+BoY\nAxwCPI6fTtDQ2kTEd8DOwM3pauzNCxyT1ZOTOTOz/FQ+Xmo7YFBEPAYsUsB4SoakIenlORExOCJ2\ni4hd02t3szasJpI6A7sDjxY6GJs3HjNnZpafzyVdQ3b133mSFsV/GDeUzum5tztIuhNQ7syI8O1J\nGs4ZZI+i+19EjEw3vn6/wDFZPflqVjOzPEhqDmwNjImI91PrxsoR8XSBQyt6knYFDgbWB16tMjsi\nYtMFH5XZws/JnJmZLRQkrRcRL0o6JSLOKHQ8ZsXCXQNmZrawuDz97FfQKMyKjMfMmZnZwmKGpEHA\nUpIurzrTTyUwq55b5szM8iRpWUmbp9fNKu87Z/Nse+A5YDrwWjWTNRBJnSRdJ+mJ9H4lSQcXOi6r\nH4+ZMzPLg58b
Ov9J6hMRbxQ6jlKWkrgbgJMiok96ysbrfmRacXLLnJlZfv4GrAd8BxAR7wMdCxpR\n6Zkk6QFJE9N0n6SlCx1UiVkiIu4GZgFExExm30PRioyTOTOz/PwcEb9UvvFzQ+eLG4CHgSXT9Egq\ns4bzo6TFSeeupLWBqYUNyerLyZyZWX783ND5r2NE3BARM9N0I9Ch0EGVmKPJEuYekl4EbgZ8gUmR\n8pg5M7M8SCoju7HtlmRPKHgqIgYXNqrSkh7rdQNwRyraCzjI4xIbTnpySQWwAtl5PA4oi4ifCxqY\n1YuTOTOzPEg6IiIum1uZ1Z+kZYF/A+uQdQMOBw6PiP8raGAlRNKoiPjD3MqsODiZMzPLQw1fgq9H\nxKqFismsriT9DlgKuBXYm9nPv21NdlX2ioWKzerPNw02M6sDSXuRffl1k/RwzqxWwOTCRGWWt62A\nA4GlgYtzyr8HTixEQDbv3DJnZlYHqeuvG3AOcHzOrO+BN9OtHcyKgqRdIuK+QsdhDcPJnJmZLVQk\ndYuI8XMrs3kjaTugF7BYZVlEnFG4iKy+fGsSM7M8SFpb0khJP0j6RVKFpO8KHVeJqa7F6N4FHkUJ\nk3Q1sAcwkGzc3G7AsgUNyurNY+bMzPJzBbAn2f3lVgf2B5YvaEQlQtKKZC1FbSTtnDOrNTmtR9Yg\n1o2IVSS9GRGnS7oIeKLQQVn9OJkzM8tTRHwgqTwiKoAbJL0OnFDouErACsD2QFvgjznl3wP9CxJR\n6ZqWfv4kaUlgEtC5gPHYPHAyZ2aWn58kLQKMlnQ+MAEPWWkQEfEQ8JCkdSLipULHU+IeldQWuAAY\nRXY/v2sLG5LVly+AMDPLQ7qq9StgEeAooA1wZUR8UNDASoikpcluGrxeKhoGHBERnxUuqtKVngax\nWET42axFysmcmZktVCQ9A9wO3JKK9gX2iYgtChdV6ZG0LtCVnF66iLi5YAFZvTmZMzPLg6T1gNPI\nrvzL/RLsXqiYSo2kNyKiT5Wy0RHRt1AxlRpJtwA9gNFkz2gFiIg4vHBRWX15zJyZWX6uI+tefY3Z\nX4LWsL6RtC9wR3q/F9kAfWs4qwMrhVt0SoKTOTOz/EyNCN/CYf76E9mYuUvIBuYPBw4qaESl5y3g\nd2QX8FiRczermVkdSPpDerk7UA7cD/xcOT8iRhUiLrN8SHqELEFuBfQFRjDnebxDgUKzeeBkzsys\nDiT9t5bZERGbLrBgSpykDmT3levKnOMS/1SomEqFpI1qmx8RLyyoWKzhOJkzM7OFiqThZLcjmWNc\noh8Mb1Y9J3NmZrZQ8ZWrZvnxXcvNzGxh86ikbQsdhFmxcDJn9v/t3XuwpVWZ3/Hvr5u7dNtDAGNE\noGkbZ1ARyYADtISoMDpKJwroiCZg8JLRCA4pK/FWWmpC1NHSUOUFRScqMqOJiiCXDJbDRYhoA4Io\nyMhFZwbjCNLdY7Dppp/88e4zHDqtnH1Jr977/X6qdp3zrvdQ9SvqVJ2n11rPWpK2N2fQFXQPJFmX\nZH2Sda1DzZIkZyxkTNPBYk6ShpDkpCRLBt+/LcmX5nW6agKqaklVLaqqXatq6eB5aetcM+aUrYyd\nuq1DaDI8Z06ShvP2qvpiklXAc+kuKv8o8My2saRHl+RlwMnA8iRfnfdqCXBfm1Qal8WcJA1nrrvy\nBcA5VfW1JO9pGUgawjV0BwXvCXxg3vh64KYmiTQ2u1klaQhJLgL+BjgWOBR4ALhuy7tEpe1dkscB\nhw0er6uqn7XMo9G5Z06ShvMS4DLg96vqfmAP4E1tI82WJCuS7Dz4/pgkpydZ1jrXLElyEt3tDyfR\n/U5/K8mJbVNpVM7MSdKQkiwGHscjbyf4cbtEsyXJjXQXwe8PXAxcADylqjyuZEKSfBc4dm42bnDr\nxuXOME8n98xJ0hCSvAF4B/C/gc2D4QIObhZq9myuqk1JXgScXVVnJ7mhdagZs2
iLZdV7cbVualnM\nSdJwzgCeXFX3tg4ywzYOui5PAY4fjO3YMM8sujTJZcD5g+eX0s2Cagq5zCpJQ0jyDbrlqU2ts8yq\nJAcB/xa4tqrOT7IceElVvbdxtJmS5ATgqMHjVVX15ZZ5NDqLOUkaQpJzgScDXwM2zI1X1QebhZLU\nay6zStJwfjz47DT4aEKSfKGqXpLkZrp9iI9QVe5LnJAkLwbeC+wNZPApb9qYTs7MSZK2C0keX1X3\nJNlva++r6u5tnWlWJfkr4Piq+kHrLBqfxZwkLUCSD1XVG5NcyNZnjVY3iCWNJMk3q+qoR/9JTQOX\nWSVpYT47+PonTVNIk/GdJH8OfIVH7v38UrtIGpUzc5Ik9UyST29luKrq32zzMBqbxZwkDSHJSuAs\n4CBgl7nxqjqgWagZk+SMqvrwo41J6njasyQN59PAR4FNwD8HPgN8rmmi2XPKVsZO3dYhpGnhzJwk\nDSHJmqr6p0lurqqnzR9rnW3aDW59OBlYBVw179USuiu+ntMkmLSdswFCkoazIcki4PYk/w74G2D3\nxiAP3EAAABR4SURBVJlmxTXAPcCewAfmja8HbmqSaEYlWV5Vdz7amKaDM3OSNIQkhwE/AJYB7waW\nAu+rqm81DTZjBmfNrayqy5PsCuxQVetb55oVSa6vqkO3GHOGeUo5MydJw9m/qr4N/D3wSoAkJwEW\ncxOS5NXAa4A9gBXAPsDHAJdZx5Tkt4GnAI8d3AIxZynzGno0XWyAkKThvHmBYxrd6+kugF8HUFW3\n0107pfE9GXgh3czy8fM+hwKvbphLY3BmTpIWIMnzgT8AnpDkv857tZSus1WTs6GqHkwCQJId2Mqt\nGxpeVV0AXJDkiKq6tnUeTYbFnCQtzN8C3wFWA2vmja8H/rhJotl1RZK3ALsmORZ4HXBh40yz5jWD\n5exH8NDg6WQDhCQNIcmOVbWxdY5ZNugWPg04DghwGfDJ8g/WxCQ5Yd7jLsCLgL+tqtMbRdIYLOYk\naQhJjgLeCexHt7oRumuQvAFiggYdrPtW1W2ts/TBoIC+uqqObJ1Fw3OZVZKGcy7dsuoa4KHGWWZS\nktXA+4GdgOVJDgHeVVWr2yabaSuxyWRqWcxJ0nDWVtUlrUPMuHcAhwN/CVBVNyZZ3jTRjEmynq6p\nJIOvPwX+Q9NQGpnFnCQN5xtJ3g98CdgwN1hV17eLNHM2VtXauW7WAfcETVBVLWmdQZNjMSdJw3nm\n4Ovvzhsr4NkNssyqW5KcDCxOshI4ne6qL03Q4NDgVXS/v1dV1VcaR9KIbICQJG1XkuwGvJWHu1kv\nBd5TVb9qGmyGJPkI8CTg/MHQS4EfVdXr26XSqCzmJGkISR5Lt6fr6MHQFXSb89e2SzVbkqyoqh+1\nzjHLktwK/M7ccS+DbtZbqup32ibTKLzOS5KG8ym6g4JfMvisAz7dNNHs+VSSHyX5sySvT/K01oFm\n0F8B+857fuJgTFPImTlJGkKSG6vqkEcb03iS7AQcBhwDvBbYvar2aBpqBiS5kG6P3GPp/v9eN3h+\nJnBdVR3TLp1GZQOEJA3ngSSrqupq+IdDhB9onGmmJFkFPGvwWQZcBFzVNNTs+JPWATR5zsxJ0hAG\nB9j+N7qZDYBfAKdW1XfbpZotSTbRHcp8FnBxVT3YOJK0XbOYk6QRJFkKUFXrWmeZNUmWAUfRNZkc\nBmwGrq2qtzcNJm2nbICQpCEk+c9JllXVuqpal+S3krynda5ZUlX3A3cAdwL3ACt4uHtY0hacmZOk\nISS5oaqescXY9VV1aKtMsybJHcCtdPvkrqLbmO9S64Ql2RXYt6pua51F43FmTpKGszjJznMPgz+I\nO/+Gn9fwTq2qP6iqs6rq6qp6cNBooglJcjxwI92BzCQ5JMlX26bSqCzmJGk45wFfT3JaktOAv6Br\niNDkfGgrY2dv8xSz7Z3A4cD9AFV1I7C8ZS
CNzqNJJGkIVfXeJN8FnjsYendVXdYy06xIcgRwJLBX\nkjPnvVoKLG6TamZtrKq1SeaPue9qSlnMSdKQqupSBstTmqidgN3p/jYtmTe+DjixSaLZdUuSk+m2\nDawETgeuaZxJI7IBQpK0XUmyX1Xd3TrHLEuyG/BW4DggwGV0s8y/ahpMI7GYkyRJmmIus0rSEJKc\nUVUffrQxaXs2747W+dYC3wE+7gzddLGbVZKGc8pWxk7d1iGkMd0B/D3wicFnHbAeOHDwrCnizJwk\nLUCSlwEnA8u3OI9rCXBfm1SzKcmBwEeBx1XVU5McDKyuKm/amJwjq+qwec8XJvl2VR2W5JZmqTQS\nizlJWphr6K6W2hP4wLzx9cBNTRLNrk8AbwI+DlBVNyX5PGAxNzm7J9m3qn4MkGRfuk5iAG/bmDIW\nc5K0AIPuyruBI5LsB6ysqssHN0DsSlfUaTJ2q6rrtjgDbVOrMDPq3wNXJ/kRXTfrcuB1SR6Dh2BP\nHYs5SRpCklcDrwH2oLsAfh/gY8BzWuaaMT9PsoLBBv0kJ9LNimpCquriwflyvz0Yum1e08PWbuDQ\ndsyjSSRpCElupLsG6VtV9YzB2M1V9bS2yWZHkgOAc+hug/gFcCfwiqq6q2WuWZPkqcBBwC5zY1X1\nmXaJNCpn5iRpOBsGF78DkGQHvAZpoqrqDuC5gyW/RVXlEvaEJXkHcAxdMXcx8HzgasBibgpZzEnS\ncK5I8hZg1yTHAq8DLmycaaYk2Rk4Adgf2GGucK6qdzWMNWtOBJ4O3FBVr0zyOOBzjTNpRJ4zJ0nD\n+Y/A3wE3A6+lm9V4W9NEs+cC4F/QNT38ct5Hk/NAVW0GNiVZCvwMeGLjTBqRM3OSNISq2pzkc8CV\nVXVb6zwzap+qel7rEDPuO0mW0R0Ds4buAOFr20bSqGyAkKQhJFkNvB/YqaqWJzkEeFdVrW4cbWYk\nOQc4u6pubp2lD5LsDyytKs9LnFIWc5I0hCRrgGcDf2k362Ql+R6wmW7VaCXdlVMb6M5Bq6o6uGG8\nmZLk6K2NV9WV2zqLxucyqyQNZ2NVrd3iQFv/VTwZTwAOaR2iJ9407/td6I7bmfuHiqaMxZwkDeeW\nJCcDiweHrp5Od9WXxnfn4KYN/X9WVcfPf07yRDwseGq5zCpJQ0iyG/BW4Di65b9LgffMOz1fI0ry\n18AHf937qvq17zSedFPNt1TVQa2zaHjOzEnScB5fVW+lK+g0WYvpLnvPo/2gxpPkbB7eHrCIbnn7\n+naJNA5n5iRpCEmuoLuP9dvAVXRHlNh1OQFJrq+qQ1vn6IMkp8x73ATcVVXfbJVH47GYk6QhJdkJ\nOIzuOqTXArtX1R5NQ82AJDfMdQhLWjiXWSVpCElWAc8afJYBF9HN0Gl8z2kdoC+S3Mz/24W9FvgO\n3R7Qe7d9Ko3KmTlJGkKSTXRHOJwFXFxVDzaOJA0tyfuAh4DPD4b+ENgN+CmwastuV23fLOYkaQiD\nK5COAo6mW2rdDFxbVW9vGkwawtb2J86NeQj29FnUOoAkTZOqup/uZoI7gXuAFXSFnTRNFic5fO4h\nyWF03cTQNURoijgzJ0lDSHIHcCvdPrmrgOtcatW0GRRvn+Lho2DWAa8CbgFeUFVfaBhPQ7KYk6Qh\nJDl6y/srkxzlsQ6aRkkeC1BVa1tn0egs5iRpCL9pr1GrTNKwkuwMnADsz7yTLarqXa0yaXQeTSJJ\nC5DkCOBIYK8kZ857tZSH9xpJ0+ICuqNI1gAbGmfRmCzmJGlhdqLbX7QDsGTe+DrgxCaJpNHtU1XP\nax1Ck+EyqyQNIcl+VXV36xzSOJKcA5ztVXSzwWJOkqSeSfJ94El0R+xsoOtorao6uGkwjcRiTpKk\nnkmy39bGnXWeTu6ZkySpZ+aKtiR7A7s0jqMxeQOEJA0hyYFJvp7ke4Png5O8rXUuaRhJVie5nW6Z\n9QrgLu
CSpqE0Mos5SRrOJ4A3AxsBquomukvKpWnybuD3gB9W1XLgOcD/ahtJo7KYk6Th7FZV120x\n5l2WmjYbq+peYFGSRVX1DeB3W4fSaNwzJ0nD+XmSFUABJDkRuKdtJGlo9yfZHbgSOC/Jz4BfNs6k\nEdnNKklDSHIAcA7dbRC/oNtz9IqquqtlLmkYSR4D/IruSJKXA48FzhvM1mnKWMxJ0ggGfwwXVdX6\n1lkk9ZvFnCQNwQvKNc2SrGewRWDLV3SHBi/dxpE0Ae6Zk6TheEG5plZVLXn0n9K0cWZOkoaQ5HtV\n9dTWOSRpjkeTSNJwrknytNYhJGmOM3OStACDGx82021PWQncgReUS9oOuGdOkhbmCcAhrUNIk5Jk\nP2BlVV2eZFdgB7uzp5PFnCQtzJ1zl5NL0y7Jq4HXAHsAK4B9gI/RXeulKWMxJ0kLs3eSM3/dy6r6\n4LYMI43p9cDhwLcAqur2JHu3jaRRWcxJ0sIsBnan2yMnTbsNVfVg0v06J9mBrZ8/pylgMSdJC3OP\nBwNrhlyR5C3ArkmOBV4HXNg4k0ZkN6skLUCSG6rqGa1zSJOQZBFwGnAc3WzzZcAny6JgKlnMSdIC\nJNmjqu5rnUOahCQvBr5WVd5iMgM8NFiSFsBCTjPmeOCHST6b5IWDPXOaUs7MSZLUQ0l2BJ4PvBRY\nBfxFVb2qbSqNwmJOkqSeGhR0zwNeCRxdVXs2jqQRuMwqSVLPJHl+kj8FbgdOAD4J/OOmoTQyZ+Yk\nSeqZJOcDfw5cYhPE9LOYkyRJmmJ2r0iS1BNJrq6qVUnW88gbHwJUVS1tFE1jcGZOkiRpitkAIUlS\nzyRZkWTnwffHJDk9ybLWuTQaizlJkvrnfwAPJXkScA7wRODzbSNpVBZzkiT1z+aq2gS8CDi7qt4E\nPL5xJo3IYk6SpP7ZmORlwCnARYOxHRvm0Rgs5iRJ6p9XAkcA/6mq7kyyHPhs40wakd2skiRJU8xz\n5iRJ6pkkRwHvBPajqwXmzpk7oGUujcaZOUmSeibJrcAfA2uAh+bGq+reZqE0MmfmJEnqn7VVdUnr\nEJoMZ+YkSeqZJP8FWAx8CdgwN15V1zcLpZFZzEmS1DNJvrGV4aqqZ2/zMBqbxZwkSdIU85w5SZJ6\nJsnjkpyb5JLB80FJTmudS6OxmJMkqX/+FLgM+CeD5x8Cb2yWRmOxmJMkqX/2rKovAJsBBve0PvSb\n/xNtryzmJEnqn18m+UdAAST5PWBt20galefMSZLUP2cCXwVWJPkmsBdwYttIGpXdrJIk9USSk6rq\ni0mWAz8Bnkx3lddtVbWxbTqNymJOkqSeSHJ9VR0697V1Hk2GxZwkST2R5HK6pofDgSu3fF9Vq7d5\nKI3NYk6SpJ5IshNwKPBZ4FVbvq+qK7Z5KI3NBghJkvrj3Kr6V0k+YeE2O5yZkySpJ5J8H3gucAlw\nDF3zwz+oqvsaxNKYnJmTJKk/PgZ8HTgAWMMji7kajGvKODMnSVLPJPloVf1R6xyaDIs5SZJ6KMnT\ngWcNHq+sqpta5tHovM5LkqSeSXI6cB6w9+BzXpI3tE2lUTkzJ0lSzyS5CTiiqn45eH4McG1VHdw2\nmUbhzJwkSf0T4KF5zw+xRWerpofdrJIk9c+ngW8l+fLg+V8C5zbMozG4zCpJUg8lORRYNXi8qqpu\naJlHo7OYkyRJmmLumZMkSZpiFnOSJElTzGJOkqSeSfKYJIsG3x+YZHWSHVvn0mjcMydJUs8kWUN3\n+8NvAd8Evg08WFUvbxpMI3FmTpKk/klV/R/gxcBHquok4CmNM2lEFnOSJPVPkhwBvBz42mBsccM8\nGoPFnCRJ/fNG4M3Al6vqliQHAN9onEkjcs+cJEk9lWS3wXKrppgzc5Ik9UySI5J8H7h18Pz0JB9p\nHEsjspiTJKl/PgT8PnAvQFV9Fzi6aSKNzGJOkqQeqqqfbDH0UJMgGtsO
rQNIkqRt7idJjgRqcFjw\nGcAPGmfSiGyAkCSpZ5LsCXwYeC4Q4H8CZ1TVvU2DaSQWc5IkSVPMPXOSJPVMkvclWZpkxyRfT/J3\nSV7ROpdGYzEnSVL/HFdV64AXAncBTwLe1DSRRmYxJ0lS/8w1QL4A+GJVrW0ZRuOxm1WSpP65KMmt\nwAPAHyXZC/hV40wakQ0QkiT1UJI9gLVV9VCS3YClVfXT1rk0PGfmJEnqmST/et738199Ztun0bgs\n5iRJ6p/D5n2/C/Ac4Hos5qaSy6ySJPVckmXAn1XV81pn0fDsZpUkSb8ElrcOodG4zCpJUs8kuRCY\nW5pbBBwEfKFdIo3DZVZJknomyT+b97gJuLuq/rpVHo3HYk6SJGmKuWdOkiRpilnMSZIkTTGLOUmS\neibJGQsZ03SwmJMkqX9O2crYqds6hCbDo0kkSeqJJC8DTgaWJ/nqvFdLgPvapNK4LOYkSeqPa4B7\ngD2BD8wbXw/c1CSRxubRJJIk9VCS/YCVVXV5kl2BHapqfetcGp575iRJ6pkkrwb+O/DxwdA+wFfa\nJdI4LOYkSeqf1wNHAesAqup2YO+miTQyizlJkvpnQ1U9OPeQZAcevqtVU8ZiTpKk/rkiyVuAXZMc\nC3wRuLBxJo3IBghJknomySLgNOA4IMBlwCfLomAqWcxJktRDgw7WfavqttZZNB6XWSVJ6pkkq4Eb\ngUsHz4dscYiwpojFnCRJ/fMO4HDgfoCquhFY3jSRRmYxJ0lS/2ysqrVbjLnvakp5nZckSf1zS5KT\ngcVJVgKn0131pSnkzJwkSf3zBuApwAbgfGAt8MamiTQyu1klSeqZJCuq6ketc2gyLOYkSeqZJFfQ\n3cf6beAq4MqqurltKo3KYk6SpB5KshNwGHAM8Fpg96rao2kojcQGCEmSeibJKuBZg88y4CK6GTpN\nIWfmJEnqmSSbgDXAWcDFVfVg40gag8WcJEk9k2QZcBRwNN1S62bg2qp6e9NgGonLrJIk9UxV3Z/k\nDuCJdI0QRwI7tk2lUTkzJ0lSzwwKuVvp9sldBVznUuv0spiTJKlnkhxdVVduMXZUVX2zVSaNzmJO\nkqSeSXJ9VR36aGOaDu6ZkySpJ5IcQbc/bq8kZ857tRRY3CaVxmUxJ0lSf+wE7E7393/JvPF1wIlN\nEmlsLrNKktQzSfarqrtb59BkWMxJkiRNsUWtA0iSJGl0FnOSJElTzGJOkqSeSXJgkq8n+d7g+eAk\nb2udS6OxmJMkqX8+AbwZ2AhQVTcBf9g0kUZmMSdJUv/sVlXXbTG2qUkSjc1iTpKk/vl5khVAASQ5\nEbinbSSNyqNJJEnqmSQHAOfQ3QbxC+BO4BVVdVfLXBqNxZwkST2V5DHAoqpa3zqLRmcxJ0lSzyTZ\nGTgB2J95V3tW1btaZdLovJtVkqT+uQBYC6wBNjTOojE5MydJUs8k+V5VPbV1Dk2G3aySJPXPNUme\n1jqEJsOZOUmSemJw48Nmum1WK4E76JZZA1RVHdwwnkbknjlJkvrjCcAhrUNosizmJEnqjzur6u7W\nITRZFnOSJPXH3knO/HUvq+qD2zKMJsNiTpKk/lgM7E63R04zwgYISZJ6Isn1VXVo6xyaLI8mkSSp\nP5yRm0HOzEmS1BNJ9qiq+1rn0GRZzEmSJE0xl1klSZKmmMWcJEnSFLOYkyRJmmIWc5IkSVPMYk6S\nJGmK/V8fJXbq2GmtTwAAAABJRU5ErkJggg==\n",
536 | "text/plain": [
537 | ""
538 | ]
539 | },
540 | "metadata": {},
541 | "output_type": "display_data"
542 | }
543 | ],
544 | "source": [
545 | "score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)\n",
546 | "print('\\nTest score:', score, 'Test accuracy:', acc)\n",
547 | "\n",
548 | "pred_probas = model.predict(X_test)\n",
549 | "pred_labels = np.argmax(pred_probas, axis=1)\n",
550 | "true_labels = np.argmax(y_test, axis=1)\n",
551 | "print('\\n', classification_report(true_labels, pred_labels))\n",
552 | "\n",
553 | "cnf = confusion_matrix(true_labels, pred_labels)\n",
554 | "sns.heatmap(cnf, annot=True, fmt='d', xticklabels=label_names, yticklabels=label_names, cmap=\"Reds\");"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": null,
560 | "metadata": {
561 | "collapsed": true
562 | },
563 | "outputs": [],
564 | "source": []
565 | }
566 | ],
567 | "metadata": {
568 | "kernelspec": {
569 | "display_name": "Python 3",
570 | "language": "python",
571 | "name": "python3"
572 | },
573 | "language_info": {
574 | "codemirror_mode": {
575 | "name": "ipython",
576 | "version": 3
577 | },
578 | "file_extension": ".py",
579 | "mimetype": "text/x-python",
580 | "name": "python",
581 | "nbconvert_exporter": "python",
582 | "pygments_lexer": "ipython3",
583 | "version": "3.6.2"
584 | }
585 | },
586 | "nbformat": 4,
587 | "nbformat_minor": 2
588 | }
589 |
--------------------------------------------------------------------------------
/notebooks/best_multiclass_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook contains different models that attempt to classify hate speech from Twitter. It was built as part of this research: https://arxiv.org/pdf/1703.04009.pdf "
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "import sys\n",
19 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
20 | "import nltk\n",
21 | "from nltk.stem.porter import *\n",
22 | "import string\n",
23 | "import re\n",
24 | "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS\n",
25 | "from textstat.textstat import *\n",
26 | "from sklearn.linear_model import LogisticRegression\n",
27 | "from sklearn.feature_selection import SelectFromModel\n",
28 | "from sklearn.metrics import classification_report\n",
29 | "from sklearn.svm import LinearSVC\n",
30 | "import matplotlib.pyplot as plt\n",
31 | "import seaborn\n",
32 | "%matplotlib inline"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "nltk.download('averaged_perceptron_tagger')\n",
42 | "nltk.download('stopwords')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "df = pd.read_csv('../data/twitter-hate-speech2.csv', encoding='latin-1')"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "df.head()"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "scrolled": true
68 | },
69 | "outputs": [],
70 | "source": [
71 | "df.describe()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "df['class'].hist()"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit\n",
90 | "X=df['tweet']\n",
91 | "y=df['class']\n",
92 | "\n",
93 | "sss = StratifiedShuffleSplit(n_splits=1, test_size=.15, random_state=0) #TODO: Coordinate random seed between notebooks\n",
94 | "train_index, test_index = next(sss.split(X,y))\n",
95 | "\n",
96 | "X_train = X.iloc[train_index]\n",
97 | "X_test = X.iloc[test_index]\n",
98 | "y_train = y.iloc[train_index]\n",
99 | "y_test = y.iloc[test_index]"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "# Feature Engineering"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "stopwords = nltk.corpus.stopwords.words(\"english\")\n",
116 | "# Extra Twitter-specific tokens to exclude alongside the NLTK English stop words.\n",
117 | "other_exclusions = [\"#ff\", \"ff\", \"rt\"]\n",
118 | "stopwords.extend(other_exclusions)\n",
119 | "\n",
120 | "stemmer = PorterStemmer()\n",
121 | "\n",
122 | "def preprocess(text_string):\n",
123 | "    \"\"\"\n",
124 | "    Accepts a text string and normalizes it by:\n",
125 | "    1) collapsing each run of whitespace into a single space\n",
126 | "    2) removing urls\n",
127 | "    3) removing mentions (@name)\n",
128 | "\n",
129 | "    Unlike count_twitter_objs, nothing is substituted for the removed\n",
130 | "    urls and mentions; they are deleted from the returned text.\n",
131 | "    \"\"\"\n",
132 | "    space_pattern = r'\\s+'\n",
133 | "    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'\n",
134 | "                       r'[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n",
135 | "    mention_regex = r'@[\\w\\-]+'\n",
136 | "    parsed_text = re.sub(space_pattern, ' ', text_string)\n",
137 | "    parsed_text = re.sub(giant_url_regex, '', parsed_text)\n",
138 | "    parsed_text = re.sub(mention_regex, '', parsed_text)\n",
139 | "    return parsed_text\n",
140 | "\n",
141 | "def tokenize(tweet):\n",
142 | "    \"\"\"Lowercases the tweet, splits it on runs of non-letters (so punctuation,\n",
143 | "    digits and whitespace are dropped) and returns the list of stemmed tokens.\"\"\"\n",
144 | "    tweet = \" \".join(re.split(\"[^a-zA-Z]+\", tweet.lower())).strip()\n",
145 | "    tokens = [stemmer.stem(t) for t in tweet.split()]\n",
146 | "    return tokens\n",
147 | "\n",
148 | "def basic_tokenize(tweet):\n",
149 | "    \"\"\"Same as tokenize but without the stemming; the characters . , ! ? are kept\"\"\"\n",
150 | "    tweet = \" \".join(re.split(\"[^a-zA-Z.,!?]+\", tweet.lower())).strip()\n",
151 | "    return tweet.split()"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "from sklearn.base import BaseEstimator, TransformerMixin\n",
161 | "class PosTfidfVectorizer(BaseEstimator, TransformerMixin):\n",
162 | "    \"\"\"Get POS tags for tweets and transform via tfidf.\n",
163 | "    Both fit and transform vectorize the POS-tag strings, not the raw tweet text.\"\"\"\n",
164 | "    def __init__(self):\n",
165 | "        self._pos_vectorizer = TfidfVectorizer(\n",
166 | "            tokenizer=None,\n",
167 | "            lowercase=False,\n",
168 | "            preprocessor=None,\n",
169 | "            ngram_range=(1, 3),\n",
170 | "            stop_words=None,\n",
171 | "            use_idf=False,\n",
172 | "            smooth_idf=False,\n",
173 | "            norm=None,\n",
174 | "            decode_error='replace',\n",
175 | "            max_features=5000,\n",
176 | "            min_df=5,\n",
177 | "            max_df=0.75,\n",
178 | "        )\n",
179 | "    # Turn each tweet into one space-joined string of its POS tags.\n",
180 | "    def _preprocess(self, X):\n",
181 | "        tweet_tags = []\n",
182 | "        for t in X:\n",
183 | "            tokens = basic_tokenize(preprocess(t))\n",
184 | "            tags = nltk.pos_tag(tokens)\n",
185 | "            tag_list = [x[1] for x in tags]\n",
186 | "            tag_str = \" \".join(tag_list)\n",
187 | "            tweet_tags.append(tag_str)\n",
188 | "        return tweet_tags\n",
189 | "\n",
190 | "    def fit(self, X, y=None):\n",
191 | "        tweet_tags = self._preprocess(X)\n",
192 | "        # Learn the vocabulary from the POS-tag strings rather than the raw tweets.\n",
193 | "        self._pos_vectorizer.fit(tweet_tags)\n",
194 | "        return self\n",
195 | "\n",
196 | "    def transform(self, X, y=None):\n",
197 | "        tweet_tags = self._preprocess(X)\n",
198 | "        return self._pos_vectorizer.transform(tweet_tags)"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "sentiment_analyzer = VS()\n",
208 | "\n",
209 | "def count_twitter_objs(text_string):\n",
210 | "    \"\"\"\n",
211 | "    Counts the urls, mentions and hashtags in a text string.\n",
212 | "\n",
213 | "    Whitespace runs are collapsed to one space, then urls, mentions and\n",
214 | "    hashtags are swapped (in that order) for the placeholder tokens\n",
215 | "    URLHERE, MENTIONHERE and HASHTAGHERE, and the placeholders are counted.\n",
216 | "    This gives standardized counts without caring about the specific\n",
217 | "    people or tags mentioned.\n",
218 | "\n",
219 | "    Returns a tuple (url_count, mention_count, hashtag_count).\n",
220 | "    \"\"\"\n",
221 | "    substitutions = [\n",
222 | "        (r'\\s+', ' '),\n",
223 | "        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'\n",
224 | "         r'[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'URLHERE'),\n",
225 | "        (r'@[\\w\\-]+', 'MENTIONHERE'),\n",
226 | "        (r'#[\\w\\-]+', 'HASHTAGHERE'),\n",
227 | "    ]\n",
228 | "    marked = text_string\n",
229 | "    for pattern, placeholder in substitutions:\n",
230 | "        marked = re.sub(pattern, placeholder, marked)\n",
231 | "    return tuple(marked.count(tag) for tag in ('URLHERE', 'MENTIONHERE', 'HASHTAGHERE'))\n",
232 | "\n",
233 | "def other_features(tweet):\n",
234 | "    \"\"\"This function takes a string and returns a list of features.\n",
235 | "    These include Sentiment scores, Text and Readability scores,\n",
236 | "    as well as Twitter specific features (hashtag/mention/url counts, retweet flag)\"\"\"\n",
237 | "    sentiment = sentiment_analyzer.polarity_scores(tweet)\n",
238 | "    \n",
239 | "    text = preprocess(tweet) #Get text only (urls and mentions removed)\n",
240 | "    words = text.split() #Whitespace-separated tokens of that text\n",
241 | "    syllables = textstat.syllable_count(text)\n",
242 | "    num_chars = sum(len(w) for w in words) #Characters inside words, whitespace excluded\n",
243 | "    num_chars_total = len(tweet)\n",
244 | "    num_terms = len(tweet.split())\n",
245 | "    num_words = len(words)\n",
246 | "    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)\n",
247 | "    num_unique_terms = len(set(words))\n",
248 | "    \n",
249 | "    ###Modified FK grade, where avg words per sentence is just num words/1\n",
250 | "    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)\n",
251 | "    ##Modified FRE score, where sentence fixed to 1\n",
252 | "    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)\n",
253 | "    \n",
254 | "    twitter_objs = count_twitter_objs(tweet)\n",
255 | "    # Retweet marker: look for a standalone, case-insensitive \"rt\" token; a plain\n",
256 | "    # substring test misses \"RT\" and fires on words such as \"party\".\n",
257 | "    retweet = 1 if re.search(r'\\brt\\b', text, re.IGNORECASE) else 0\n",
258 | "    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,\n",
259 | "                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],\n",
260 | "                twitter_objs[2], twitter_objs[1],\n",
261 | "                twitter_objs[0], retweet]\n",
262 | "    return features\n",
263 | "\n",
264 | "def get_feature_array(tweets):\n",
265 | "    \"\"\"Build the feature matrix for an iterable of tweets:\n",
266 | "    one other_features() row per tweet, in input order.\"\"\"\n",
267 | "    rows = [other_features(tweet) for tweet in tweets]\n",
268 | "    return np.array(rows)\n",
269 | "\n",
270 | "class SentimentVectorizer(BaseEstimator, TransformerMixin):\n",
271 | "    \"\"\"Stateless transformer that maps tweets to the get_feature_array() matrix.\"\"\"\n",
272 | "    def fit(self, X, y=None):\n",
273 | "        return self\n",
274 | "\n",
275 | "    def transform(self, X, y=None):\n",
276 | "        feature_matrix = get_feature_array(X); return feature_matrix"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "from sklearn.pipeline import FeatureUnion, Pipeline\n",
286 | "\n",
287 | "vectorizer = TfidfVectorizer(\n",
288 | " tokenizer=tokenize,\n",
289 | " preprocessor=preprocess,\n",
290 | " ngram_range=(1, 3),\n",
291 | " stop_words=stopwords,\n",
292 | " use_idf=True,\n",
293 | " smooth_idf=False,\n",
294 | " norm=None,\n",
295 | " decode_error='replace',\n",
296 | " max_features=10000,\n",
297 | " min_df=5,\n",
298 | " max_df=0.75\n",
299 | " )\n",
300 | "pos_vectorizer = PosTfidfVectorizer()\n",
301 | "sentiment_vectorizer = SentimentVectorizer()\n",
302 | "\n",
303 | "model = Pipeline([\n",
304 | "    ('features', FeatureUnion([('tfidf', vectorizer), ('pos_tfidf', pos_vectorizer), ('sentiment', sentiment_vectorizer)])),\n",
305 | "    ('feature_selector', SelectFromModel(LogisticRegression(class_weight='balanced', penalty='l1', C=0.01, solver='liblinear'))),\n",
306 | "    ('model', LogisticRegression(class_weight='balanced', penalty='l2', C=0.01))])"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "# Running the model"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "model.fit(X_train,y_train)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "y_preds = model.predict(X_test)"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "## Evaluating the results on the test set"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "report = classification_report( y_test, y_preds)"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": null,
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "print(report)"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "from sklearn.metrics import confusion_matrix\n",
366 | "\n",
367 | "# Keep the result under its own name so the imported confusion_matrix() is not\n",
368 | "# shadowed (the cell can then be re-run); labels pins the 0/1/2 row order.\n",
369 | "conf_mat = confusion_matrix(y_test, y_preds, labels=[0, 1, 2])\n",
370 | "# Normalize each row so a cell shows the share of that true class.\n",
371 | "matrix_proportions = conf_mat / conf_mat.sum(axis=1, keepdims=True).astype(float)\n",
372 | "names = ['Hate', 'Offensive', 'Neither']\n",
373 | "confusion_df = pd.DataFrame(matrix_proportions, index=names, columns=names)\n",
374 | "plt.figure(figsize=(5, 5))\n",
375 | "seaborn.heatmap(confusion_df, annot=True, annot_kws={\"size\": 12}, square=True, cmap=\"Reds\")"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": []
384 | }
385 | ],
386 | "metadata": {
387 | "kernelspec": {
388 | "display_name": "Python 3",
389 | "language": "python",
390 | "name": "python3"
391 | },
392 | "language_info": {
393 | "codemirror_mode": {
394 | "name": "ipython",
395 | "version": 3
396 | },
397 | "file_extension": ".py",
398 | "mimetype": "text/x-python",
399 | "name": "python",
400 | "nbconvert_exporter": "python",
401 | "pygments_lexer": "ipython3",
402 | "version": "3.7.0"
403 | }
404 | },
405 | "nbformat": 4,
406 | "nbformat_minor": 2
407 | }
408 |
--------------------------------------------------------------------------------
/notebooks/multiclass_baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false,
8 | "deletable": true,
9 | "editable": true
10 | },
11 | "outputs": [],
12 | "source": [
13 | "import pandas as pd\n",
14 | "import numpy as np\n",
15 | "%matplotlib inline"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {
22 | "collapsed": false,
23 | "deletable": true,
24 | "editable": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "df = pd.read_csv('../data/twitter-hate-speech2.csv', encoding='latin-1')"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {
35 | "collapsed": false,
36 | "deletable": true,
37 | "editable": true
38 | },
39 | "outputs": [
40 | {
41 | "data": {
42 | "text/html": [
43 | "\n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " | \n",
48 | " Unnamed: 0 | \n",
49 | " count | \n",
50 | " hate_speech | \n",
51 | " offensive_language | \n",
52 | " neither | \n",
53 | " class | \n",
54 | " tweet | \n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " \n",
59 | " 0 | \n",
60 | " 0 | \n",
61 | " 3 | \n",
62 | " 0 | \n",
63 | " 0 | \n",
64 | " 3 | \n",
65 | " 2 | \n",
66 | " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
67 | "
\n",
68 | " \n",
69 | " 1 | \n",
70 | " 1 | \n",
71 | " 3 | \n",
72 | " 0 | \n",
73 | " 3 | \n",
74 | " 0 | \n",
75 | " 1 | \n",
76 | " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
77 | "
\n",
78 | " \n",
79 | " 2 | \n",
80 | " 2 | \n",
81 | " 3 | \n",
82 | " 0 | \n",
83 | " 3 | \n",
84 | " 0 | \n",
85 | " 1 | \n",
86 | " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
87 | "
\n",
88 | " \n",
89 | " 3 | \n",
90 | " 3 | \n",
91 | " 3 | \n",
92 | " 0 | \n",
93 | " 2 | \n",
94 | " 1 | \n",
95 | " 1 | \n",
96 | " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
97 | "
\n",
98 | " \n",
99 | "
\n",
100 | "
"
101 | ],
102 | "text/plain": [
103 | " Unnamed: 0 count hate_speech offensive_language neither class \\\n",
104 | "0 0 3 0 0 3 2 \n",
105 | "1 1 3 0 3 0 1 \n",
106 | "2 2 3 0 3 0 1 \n",
107 | "3 3 3 0 2 1 1 \n",
108 | "\n",
109 | " tweet \n",
110 | "0 !!! RT @mayasolovely: As a woman you shouldn't... \n",
111 | "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... \n",
112 | "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... \n",
113 | "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... "
114 | ]
115 | },
116 | "execution_count": 3,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 | "df.head(4)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 4,
128 | "metadata": {
129 | "collapsed": false,
130 | "deletable": true,
131 | "editable": true,
132 | "scrolled": true
133 | },
134 | "outputs": [
135 | {
136 | "data": {
137 | "text/html": [
138 | "\n",
139 | "
\n",
140 | " \n",
141 | " \n",
142 | " | \n",
143 | " Unnamed: 0 | \n",
144 | " count | \n",
145 | " hate_speech | \n",
146 | " offensive_language | \n",
147 | " neither | \n",
148 | " class | \n",
149 | "
\n",
150 | " \n",
151 | " \n",
152 | " \n",
153 | " count | \n",
154 | " 24783.000000 | \n",
155 | " 24783.000000 | \n",
156 | " 24783.000000 | \n",
157 | " 24783.000000 | \n",
158 | " 24783.000000 | \n",
159 | " 24783.000000 | \n",
160 | "
\n",
161 | " \n",
162 | " mean | \n",
163 | " 12681.192027 | \n",
164 | " 3.243473 | \n",
165 | " 0.280515 | \n",
166 | " 2.413711 | \n",
167 | " 0.549247 | \n",
168 | " 1.110277 | \n",
169 | "
\n",
170 | " \n",
171 | " std | \n",
172 | " 7299.553863 | \n",
173 | " 0.883060 | \n",
174 | " 0.631851 | \n",
175 | " 1.399459 | \n",
176 | " 1.113299 | \n",
177 | " 0.462089 | \n",
178 | "
\n",
179 | " \n",
180 | " min | \n",
181 | " 0.000000 | \n",
182 | " 3.000000 | \n",
183 | " 0.000000 | \n",
184 | " 0.000000 | \n",
185 | " 0.000000 | \n",
186 | " 0.000000 | \n",
187 | "
\n",
188 | " \n",
189 | " 25% | \n",
190 | " 6372.500000 | \n",
191 | " 3.000000 | \n",
192 | " 0.000000 | \n",
193 | " 2.000000 | \n",
194 | " 0.000000 | \n",
195 | " 1.000000 | \n",
196 | "
\n",
197 | " \n",
198 | " 50% | \n",
199 | " 12703.000000 | \n",
200 | " 3.000000 | \n",
201 | " 0.000000 | \n",
202 | " 3.000000 | \n",
203 | " 0.000000 | \n",
204 | " 1.000000 | \n",
205 | "
\n",
206 | " \n",
207 | " 75% | \n",
208 | " 18995.500000 | \n",
209 | " 3.000000 | \n",
210 | " 0.000000 | \n",
211 | " 3.000000 | \n",
212 | " 0.000000 | \n",
213 | " 1.000000 | \n",
214 | "
\n",
215 | " \n",
216 | " max | \n",
217 | " 25296.000000 | \n",
218 | " 9.000000 | \n",
219 | " 7.000000 | \n",
220 | " 9.000000 | \n",
221 | " 9.000000 | \n",
222 | " 2.000000 | \n",
223 | "
\n",
224 | " \n",
225 | "
\n",
226 | "
"
227 | ],
228 | "text/plain": [
229 | " Unnamed: 0 count hate_speech offensive_language \\\n",
230 | "count 24783.000000 24783.000000 24783.000000 24783.000000 \n",
231 | "mean 12681.192027 3.243473 0.280515 2.413711 \n",
232 | "std 7299.553863 0.883060 0.631851 1.399459 \n",
233 | "min 0.000000 3.000000 0.000000 0.000000 \n",
234 | "25% 6372.500000 3.000000 0.000000 2.000000 \n",
235 | "50% 12703.000000 3.000000 0.000000 3.000000 \n",
236 | "75% 18995.500000 3.000000 0.000000 3.000000 \n",
237 | "max 25296.000000 9.000000 7.000000 9.000000 \n",
238 | "\n",
239 | " neither class \n",
240 | "count 24783.000000 24783.000000 \n",
241 | "mean 0.549247 1.110277 \n",
242 | "std 1.113299 0.462089 \n",
243 | "min 0.000000 0.000000 \n",
244 | "25% 0.000000 1.000000 \n",
245 | "50% 0.000000 1.000000 \n",
246 | "75% 0.000000 1.000000 \n",
247 | "max 9.000000 2.000000 "
248 | ]
249 | },
250 | "execution_count": 4,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "df.describe()"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {
262 | "deletable": true,
263 | "editable": true
264 | },
265 | "source": [
266 | "# Cleaning, etc."
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 7,
272 | "metadata": {
273 | "collapsed": true,
274 | "deletable": true,
275 | "editable": true
276 | },
277 | "outputs": [],
278 | "source": [
279 | "df['hate_class'] = (df['class'] == 0)*1\n",
280 | "\n",
281 | "#TODO: more"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {
287 | "deletable": true,
288 | "editable": true
289 | },
290 | "source": [
291 | "# Train/Test Definition"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 5,
297 | "metadata": {
298 | "collapsed": false,
299 | "deletable": true,
300 | "editable": true
301 | },
302 | "outputs": [],
303 | "source": [
304 | "from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit\n",
305 | "X=df['tweet']\n",
306 | "y=df['class']\n",
307 | "\n",
308 | "sss = StratifiedShuffleSplit(n_splits=1, test_size=.15, random_state=0) #TODO: Coordinate random seed between notebooks\n",
309 | "train_index, test_index = next(sss.split(X,y))\n",
310 | "\n",
311 | "X_train = X.iloc[train_index]\n",
312 | "X_test = X.iloc[test_index]\n",
313 | "y_train = y.iloc[train_index]\n",
314 | "y_test = y.iloc[test_index]"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 6,
320 | "metadata": {
321 | "collapsed": false,
322 | "deletable": true,
323 | "editable": true
324 | },
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "(21065,) (3718,)\n"
331 | ]
332 | }
333 | ],
334 | "source": [
335 | "print(X_train.shape,X_test.shape)"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {
341 | "deletable": true,
342 | "editable": true
343 | },
344 | "source": [
345 | "# Explore/Feature Engineer\n",
346 | "\n",
347 | "Engineering only using the training set."
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 7,
353 | "metadata": {
354 | "collapsed": true,
355 | "deletable": true,
356 | "editable": true
357 | },
358 | "outputs": [],
359 | "source": [
360 | "# TODO"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {
366 | "deletable": true,
367 | "editable": true
368 | },
369 | "source": [
370 | "## Model Training"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 8,
376 | "metadata": {
377 | "collapsed": true,
378 | "deletable": true,
379 | "editable": true
380 | },
381 | "outputs": [],
382 | "source": [
383 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
384 | "from sklearn.ensemble import ExtraTreesClassifier\n",
385 | "from sklearn.pipeline import Pipeline\n",
386 | "from sklearn.model_selection import cross_val_score, GridSearchCV"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 9,
392 | "metadata": {
393 | "collapsed": false,
394 | "deletable": true,
395 | "editable": true
396 | },
397 | "outputs": [],
398 | "source": [
399 | "pipe = Pipeline([\n",
400 | " ('tfidf', TfidfVectorizer(\n",
401 | " max_features = 2000, # wild guess\n",
402 | " stop_words = 'english',\n",
403 | " min_df=2, \n",
404 | " ngram_range = (1,3)\n",
405 | " )),\n",
406 | " ('et',ExtraTreesClassifier(n_estimators=50,verbose=True))\n",
407 | "])\n",
408 | "\n",
409 | "param_grid = {\n",
410 | " 'tfidf__min_df' : [2,5]\n",
411 | "}\n",
412 | "\n",
413 | "model = GridSearchCV(pipe, param_grid)"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 11,
419 | "metadata": {
420 | "collapsed": false,
421 | "deletable": true,
422 | "editable": true
423 | },
424 | "outputs": [
425 | {
426 | "name": "stderr",
427 | "output_type": "stream",
428 | "text": [
429 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 4.0s finished\n",
430 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n",
431 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n",
432 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.7s finished\n",
433 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n",
434 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n",
435 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 4.0s finished\n",
436 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n",
437 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n",
438 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.9s finished\n",
439 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n",
440 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n",
441 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.7s finished\n",
442 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n",
443 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n",
444 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 3.9s finished\n",
445 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.2s finished\n",
446 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.3s finished\n",
447 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 7.0s finished\n"
448 | ]
449 | },
450 | {
451 | "data": {
452 | "text/plain": [
453 | "GridSearchCV(cv=None, error_score='raise',\n",
454 | " estimator=Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
455 | " dtype=, encoding='utf-8', input='content',\n",
456 | " lowercase=True, max_df=1.0, max_features=2000, min_df=2,\n",
457 | " ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,\n",
458 | " ...ators=50, n_jobs=1, oob_score=False, random_state=None,\n",
459 | " verbose=True, warm_start=False))]),\n",
460 | " fit_params={}, iid=True, n_jobs=1,\n",
461 | " param_grid={'tfidf__min_df': [2, 5]}, pre_dispatch='2*n_jobs',\n",
462 | " refit=True, return_train_score=True, scoring=None, verbose=0)"
463 | ]
464 | },
465 | "execution_count": 11,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "model.fit(X_train, y_train)"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {
477 | "deletable": true,
478 | "editable": true
479 | },
480 | "source": [
481 | "# Held Out Test"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 12,
487 | "metadata": {
488 | "collapsed": false,
489 | "deletable": true,
490 | "editable": true
491 | },
492 | "outputs": [
493 | {
494 | "name": "stderr",
495 | "output_type": "stream",
496 | "text": [
497 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.1s finished\n",
498 | "[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 0.1s finished\n"
499 | ]
500 | }
501 | ],
502 | "source": [
503 | "y_pred = model.predict(X_test)\n",
504 | "y_proba = model.predict_proba(X_test)"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": 13,
510 | "metadata": {
511 | "collapsed": false,
512 | "deletable": true,
513 | "editable": true
514 | },
515 | "outputs": [
516 | {
517 | "name": "stdout",
518 | "output_type": "stream",
519 | "text": [
520 | " precision recall f1-score support\n",
521 | "\n",
522 | " 0 0.37 0.12 0.18 214\n",
523 | " 1 0.92 0.96 0.94 2879\n",
524 | " 2 0.83 0.87 0.85 625\n",
525 | "\n",
526 | "avg / total 0.88 0.90 0.88 3718\n",
527 | "\n"
528 | ]
529 | }
530 | ],
531 | "source": [
532 | "from sklearn.metrics import classification_report\n",
533 | "report = classification_report(y_test, y_pred)\n",
534 | "print(report)"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 14,
540 | "metadata": {
541 | "collapsed": false,
542 | "deletable": true,
543 | "editable": true
544 | },
545 | "outputs": [
546 | {
547 | "data": {
548 | "text/plain": [
549 | "array([[ 25, 159, 30],\n",
550 | " [ 34, 2767, 78],\n",
551 | " [ 8, 73, 544]])"
552 | ]
553 | },
554 | "execution_count": 14,
555 | "metadata": {},
556 | "output_type": "execute_result"
557 | }
558 | ],
559 | "source": [
560 | "from sklearn.metrics import confusion_matrix\n",
561 | "confusion_matrix(y_test,y_pred)"
562 | ]
563 | }
564 | ],
565 | "metadata": {
566 | "kernelspec": {
567 | "display_name": "Python 3",
568 | "language": "python",
569 | "name": "python3"
570 | },
571 | "language_info": {
572 | "codemirror_mode": {
573 | "name": "ipython",
574 | "version": 3
575 | },
576 | "file_extension": ".py",
577 | "mimetype": "text/x-python",
578 | "name": "python",
579 | "nbconvert_exporter": "python",
580 | "pygments_lexer": "ipython3",
581 | "version": "3.5.2"
582 | }
583 | },
584 | "nbformat": 4,
585 | "nbformat_minor": 2
586 | }
587 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==2.3.2
2 | numpy==1.22.0
3 | pandas==0.19.2
4 | scikit-learn==0.18.1
5 | scipy==1.10.0
6 | Werkzeug==2.2.3
7 | gunicorn==19.7.1
8 | vaderSentiment==2.5
9 | nltk==3.6.6
10 | textstat==0.4.1
11 | requests==2.31.0
12 |
--------------------------------------------------------------------------------
/research/23_Paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Data4Democracy/hate_speech_detector/c527aa6d286c663e74f538aad16e085301144250/research/23_Paper.pdf
--------------------------------------------------------------------------------