├── datasets
└── .gitkeep
├── models
└── .gitkeep
├── nlp
├── __init__.py
├── dataset.py
└── utils.py
├── requirements.txt
├── requirements.gpu.txt
├── .env.sample
├── .gitignore
├── README.md
├── query_relations.json
└── notebooks
├── Fetch Tweets.ipynb
├── Validate API Data.ipynb
├── Predict Emotion.ipynb
├── Train Sentiment Analysis.ipynb
├── Train Emotion Recognition Model.ipynb
├── Sentiment Analysis Score.ipynb
├── Emotion Recognition Model Validation.ipynb
└── Check Emotion Labeled Dataset.ipynb
/datasets/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | nltk.download('stopwords')
3 |
4 | from .dataset import Dataset
5 | from .utils import preprocess
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dotenv==0.1.0
2 | tweepy==3.5.0
3 | jupyter==1.0.0
4 | tensorflow==1.14.0
5 | pandas==0.24.2
6 | python-dotenv==0.1.0
7 | nltk==3.4.3
8 | scikit-learn==0.21.2
9 | emoji==0.5.2
10 | seaborn==0.9.0
11 | tqdm==4.32.2
12 | matplotlib==3.0.3
--------------------------------------------------------------------------------
/requirements.gpu.txt:
--------------------------------------------------------------------------------
1 | python-dotenv==0.1.0
2 | tweepy==3.5.0
3 | jupyter==1.0.0
4 | tensorflow-gpu==1.14.0
5 | pandas==0.24.2
6 | python-dotenv==0.1.0
7 | nltk==3.4.3
8 | scikit-learn==0.21.2
9 | emoji==0.5.2
10 | seaborn==0.9.0
11 | tqdm==4.32.2
12 | matplotlib==3.0.3
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | CONSUMER_KEY=consumer-key.get-from-https://developer.twitter.com/
2 | CONSUMER_SECRET=consumer-secret.get-from-https://developer.twitter.com/
3 | ACCESS_TOKEN=access_token.get-from-https://developer.twitter.com/
4 | ACCESS_TOKEN_SECRET=access_token_secret.get-from-https://developer.twitter.com/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .DS_Store
3 | .env
4 |
5 | # Python related
6 | __pycache__
7 | .ipynb_checkpoints
8 |
9 | # Dataset files
10 | datasets/**/*.csv
11 | datasets/**/*.pickle
12 |
13 | # Model files
14 | models/**/*.h5
15 | models/**/*.pickle
16 |
17 | # Tensorboard logs
18 | models/**/logs/**/*.*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Emotion From Tweet
2 |
3 | This repository contains the source code for the article
4 | **From Sentiment Analysis to Emotion Recognition: A NLP story**
5 |
6 | ## Setup
7 |
8 | 1. Install the dependencies (use ***requirements.gpu.txt*** instead of
9 | *requirements.txt* when using GPU processing):
10 |
11 | ```bash
12 | pip install -r requirements.txt
13 | ```
14 | 1. Create a `.env` file:
15 |
16 | ```bash
17 | cp .env.sample .env
18 | ```
19 | 1. Set the environment variables inside the created `.env` file
20 |
21 | ## Running
22 |
23 | 1. Start the jupyter notebook:
24 |
25 | ```bash
26 | jupyter notebook
27 | ```
28 | 1. Go to the `notebooks` folder
29 | 1. Open and run the notebook you want
30 |
31 | ***Note***: *Check the releases if you want the training step output files*
32 |
--------------------------------------------------------------------------------
/nlp/dataset.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 | from time import time
4 | from pathlib import Path
5 | from .utils import preprocess
6 |
class Dataset:
    """CSV-backed labeled text dataset.

    Wraps a pandas DataFrame read from ``filename`` and exposes its label and
    text columns under the normalized column names ``label`` and ``text``.

    Args:
        filename: Path of the CSV file read by :meth:`load`.
        label_col: Name of the column that holds the labels.
        text_col: Name of the column that holds the raw texts.
    """

    def __init__(self, filename, label_col='label', text_col='text'):
        self.filename = filename
        self.label_col = label_col
        self.text_col = text_col
        # Filled in by load(); declared here so the attribute always exists.
        self.dataframe = None

    def _require_dataframe(self):
        """Return the loaded DataFrame, or fail with an actionable message."""
        frame = getattr(self, 'dataframe', None)
        if frame is None:
            # AttributeError is the exception type callers already got when
            # the attribute was missing, so existing handlers keep working.
            raise AttributeError(
                "Dataset has no dataframe yet; call load() first"
            )
        return frame

    @property
    def data(self):
        """Copy of the label and raw-text columns, renamed to label/text."""
        frame = self._require_dataframe()
        data = frame[[self.label_col, self.text_col]].copy()
        data.columns = ['label', 'text']
        return data

    @property
    def cleaned_data(self):
        """Copy of the label and 'cleaned' columns, renamed to label/text.

        The 'cleaned' column is created by :meth:`preprocess_texts`; selecting
        it before that call raises ``KeyError``.
        """
        frame = self._require_dataframe()
        # .copy() mirrors `data`: callers get an independent frame they can
        # mutate without touching (or being warned about) self.dataframe.
        data = frame[[self.label_col, 'cleaned']].copy()
        data.columns = ['label', 'text']
        return data

    def load(self):
        """Read the CSV at ``self.filename`` into ``self.dataframe``."""
        self.dataframe = pd.read_csv(Path(self.filename).resolve())

    def preprocess_texts(self, quiet=False):
        """Store the preprocessed text column as ``dataframe['cleaned']``.

        Args:
            quiet: Forwarded to ``preprocess``.
        """
        frame = self._require_dataframe()
        frame['cleaned'] = preprocess(frame[self.text_col], quiet)
31 |
--------------------------------------------------------------------------------
/query_relations.json:
--------------------------------------------------------------------------------
1 | {
2 | ":face_screaming_in_fear:": "fear",
3 | ":face_with_tears_of_joy:": "joy",
4 | ":grinning_face_with_smiling_eyes:": "joy",
5 | ":pouting_face:": "anger",
6 | ":crying_face:": "sadness",
7 | ":fearful_face:": "fear",
8 | ":face_with_steam_from_nose:": "anger",
9 | "#anxious": "fear",
10 | "#sad": "sadness",
11 | "#happiness": "joy",
12 | "#fear": "fear",
13 | "#joy": "joy",
14 | "#pissed": "anger",
15 | "#angry": "anger",
16 | "#mad": "anger",
17 | "#excited": "joy",
18 | "#furious": "anger",
19 | "#depressed": "sadness",
20 | ":pensive_face:": "sadness",
21 | "#afraid": "fear",
22 | "#scared": "fear",
23 | "#worried": "fear",
24 | "#scary": "fear",
25 | ":anxious_face_with_sweat:": "fear",
26 | "#hateyou": "anger",
27 | ":loudly_crying_face:": "sadness",
28 | ":broken_heart:": "sadness",
29 | ":red_heart:": "joy",
30 | ":face_with_symbols_on_mouth:": "anger",
31 | ":anger_face:": "anger",
32 | ":smiling_face_with_smiling_eyes:": "joy",
33 | "#depression": "sadness",
34 | "#pissedoff": "anger"
35 | }
--------------------------------------------------------------------------------
/nlp/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import nltk
3 | from time import time
4 | from emoji import demojize
5 |
def preprocess(texts, quiet=False):
    """Normalize a pandas Series of tweet texts for model input.

    Steps, in order: lowercase; drop URLs and @mentions; turn emojis into
    their ``:alias:`` form; keep only ``a-z``, ``'``, ``:`` and ``_``;
    collapse runs of three or more identical characters to one; expand
    "can't"/"cannot"/"n't" negations; remove English stop words except the
    negations "not", "nor" and "no".

    Args:
        texts: pandas Series of strings (the ``.str`` accessor and
            ``.apply`` are used on it).
        quiet: When False, print the elapsed time once cleaning is done.

    Returns:
        A Series with the cleaned texts.
    """
    start = time()

    # Lowercasing
    texts = texts.str.lower()

    # Remove special chars.
    # regex=True is passed explicitly: these are regular expressions, and
    # newer pandas releases treat the pattern as a literal string by default.
    texts = texts.str.replace(r"(http|@)\S+", "", regex=True)
    texts = texts.apply(demojize)
    # Separate back-to-back emoji aliases (":a::b:" -> ":a: :b:")
    texts = texts.str.replace(r"::", ": :", regex=True)
    texts = texts.str.replace(r"’", "'", regex=True)
    texts = texts.str.replace(r"[^a-z\':_]", " ", regex=True)

    # Remove repetitions
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = texts.str.replace(pattern, r"\1", regex=True)

    # Transform short negation form
    texts = texts.str.replace(r"(can't|cannot)", 'can not', regex=True)
    texts = texts.str.replace(r"n't", ' not', regex=True)

    # Remove stop words, keeping the negations.
    # A set gives O(1) membership tests per word instead of scanning a list.
    stopwords = set(nltk.corpus.stopwords.words('english')) - {'not', 'nor', 'no'}
    texts = texts.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stopwords)
    )

    if not quiet:
        print("Time to clean up: {:.2f} sec".format(time() - start))

    return texts
39 |
--------------------------------------------------------------------------------
/notebooks/Fetch Tweets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fetch Tweets\n",
8 | "\n",
9 | "Download and save tweets, using a **query** value"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/plain": [
20 | "True"
21 | ]
22 | },
23 | "execution_count": 1,
24 | "metadata": {},
25 | "output_type": "execute_result"
26 | }
27 | ],
28 | "source": [
29 | "from dotenv import load_dotenv\n",
30 | "from pathlib import Path\n",
31 | "\n",
32 | "env_path = Path('../.env').resolve()\n",
33 | "load_dotenv(dotenv_path=env_path)"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## API access\n",
41 | "\n",
42 | "First of all, we'll connect to the Twitter API"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "import os"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "consumer_key = os.getenv(\"CONSUMER_KEY\")\n",
61 | "consumer_secret = os.getenv(\"CONSUMER_SECRET\")\n",
62 | "access_token = os.getenv(\"ACCESS_TOKEN\")\n",
63 | "access_token_secret = os.getenv(\"ACCESS_TOKEN_SECRET\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from tweepy import OAuthHandler, API, TweepError"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "Successfully connected to the Twitter API.\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "auth = OAuthHandler(consumer_key, consumer_secret)\n",
90 | "auth.set_access_token(access_token, access_token_secret)\n",
91 | "api = API(auth)\n",
92 | "print('Successfully connected to the Twitter API.')"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## Search Tweets\n",
100 | "\n",
101 | "Now we can define our query and search for the tweets containing it.\n",
102 | "\n",
103 | "- **query**: *hashtag* or *emoji* that will be used to fetch the tweets\n",
104 | "- **max_requests**: Maximum number of requests to the API.\n",
105 | " - Restriction: 180 requests / 15 min window"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 6,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "query = '#angry'\n",
115 | "max_requests = 180"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 7,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# Converts aliases to the real emoji representation (e.g. :thumbs_up: => 👍)\n",
125 | "\n",
126 | "from emoji import emojize"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 8,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "q = emojize(query) + ' -filter:retweets'\n",
136 | "searched_tweets = []\n",
137 | "last_id = -1\n",
138 | "request_count = 0\n",
139 | "while request_count < max_requests:\n",
140 | " try:\n",
141 | " new_tweets = api.search(q=q,\n",
142 | " lang='en',\n",
143 | " count=100,\n",
144 | " max_id=str(last_id - 1),\n",
145 | " tweet_mode='extended')\n",
146 | " if not new_tweets:\n",
147 | " break\n",
148 | " searched_tweets.extend(new_tweets)\n",
149 | " last_id = new_tweets[-1].id\n",
150 | " request_count += 1\n",
151 | " except TweepError as e:\n",
152 | " print(e)\n",
153 | " break"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "## Format and save\n",
161 | "\n",
162 | "Format the API data to the desired structure and save a `.csv` file"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 9,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import pandas as pd"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 10,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "66 #angry tweets\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "data = []\n",
189 | "for tweet in searched_tweets:\n",
190 | " data.append([tweet.id, tweet.created_at, tweet.user.screen_name, tweet.full_text])\n",
191 | "df = pd.DataFrame(data=data, columns=['id', 'date', 'user', 'text'])\n",
192 | "print(str(len(data)) + ' ' + query + ' tweets')"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 11,
198 | "metadata": {
199 | "scrolled": true
200 | },
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/html": [
205 | "
\n",
206 | "\n",
219 | "
\n",
220 | " \n",
221 | " \n",
222 | " | \n",
223 | " id | \n",
224 | " date | \n",
225 | " user | \n",
226 | " text | \n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " \n",
231 | " | 0 | \n",
232 | " 1151133382627057664 | \n",
233 | " 2019-07-16 14:16:00 | \n",
234 | " DaradeAbhijeet | \n",
235 | " Don't promise when you are #Happy\\n&\\nDon'... | \n",
236 | "
\n",
237 | " \n",
238 | " | 1 | \n",
239 | " 1151124672496324608 | \n",
240 | " 2019-07-16 13:41:23 | \n",
241 | " TheRealFakeJack | \n",
242 | " @realDonaldTrump 4:20 am it is a sign u need t... | \n",
243 | "
\n",
244 | " \n",
245 | " | 2 | \n",
246 | " 1151118984793776129 | \n",
247 | " 2019-07-16 13:18:47 | \n",
248 | " masterofnaps | \n",
249 | " There's a special place in hell for people who... | \n",
250 | "
\n",
251 | " \n",
252 | " | 3 | \n",
253 | " 1151115966220328960 | \n",
254 | " 2019-07-16 13:06:47 | \n",
255 | " TiknisArts | \n",
256 | " We know #Trump needs #attention to survive. It... | \n",
257 | "
\n",
258 | " \n",
259 | " | 4 | \n",
260 | " 1151113082099232768 | \n",
261 | " 2019-07-16 12:55:20 | \n",
262 | " emilieraddish | \n",
263 | " Get your Instagram photo elsewhere not on top ... | \n",
264 | "
\n",
265 | " \n",
266 | "
\n",
267 | "
"
268 | ],
269 | "text/plain": [
270 | " id date user \\\n",
271 | "0 1151133382627057664 2019-07-16 14:16:00 DaradeAbhijeet \n",
272 | "1 1151124672496324608 2019-07-16 13:41:23 TheRealFakeJack \n",
273 | "2 1151118984793776129 2019-07-16 13:18:47 masterofnaps \n",
274 | "3 1151115966220328960 2019-07-16 13:06:47 TiknisArts \n",
275 | "4 1151113082099232768 2019-07-16 12:55:20 emilieraddish \n",
276 | "\n",
277 | " text \n",
278 | "0 Don't promise when you are #Happy\\n&\\nDon'... \n",
279 | "1 @realDonaldTrump 4:20 am it is a sign u need t... \n",
280 | "2 There's a special place in hell for people who... \n",
281 | "3 We know #Trump needs #attention to survive. It... \n",
282 | "4 Get your Instagram photo elsewhere not on top ... "
283 | ]
284 | },
285 | "execution_count": 11,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "df.head()"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 12,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "name": "stdout",
301 | "output_type": "stream",
302 | "text": [
303 | "Saved under: \"/home/rmohashi/Workspace/emotion-from-tweets/datasets/tweepy\"\n"
304 | ]
305 | }
306 | ],
307 | "source": [
308 | "PATH = Path('../datasets/tweepy').resolve()\n",
309 | "filename = query + '.csv'\n",
310 | "df.to_csv(os.path.join(PATH, filename), index=None)\n",
311 | "print('Saved under: \"' + PATH.as_posix() + '\"')"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": []
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 3",
325 | "language": "python",
326 | "name": "python3"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 3
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython3",
338 | "version": "3.6.8"
339 | }
340 | },
341 | "nbformat": 4,
342 | "nbformat_minor": 2
343 | }
344 |
--------------------------------------------------------------------------------
/notebooks/Validate API Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Validate API Data\n",
8 | "\n",
9 | "Validate and create an emotion-labeled dataset"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import json\n",
34 | "from pathlib import Path"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Load Relations\n",
42 | "\n",
43 | "Load the relations between queries and emotions"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "relations_path = Path('../query_relations.json').resolve()"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 4,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "with relations_path.open('rb') as file:\n",
62 | " relations = json.load(file)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Load Tokenizer\n",
70 | "\n",
71 | "Load the tokenizer, created at the model training process"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 5,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "import pickle"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 6,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "tokenizer_path = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
90 | "with tokenizer_path.open('rb') as file:\n",
91 | " tokenizer = pickle.load(file)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## Load Model\n",
99 | "\n",
100 | "Load the model, using the saved weights"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 7,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "from tensorflow.keras.layers import Input, Embedding, GRU\n",
110 | "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
111 | "from tensorflow.keras.layers import Bidirectional, Dense\n",
112 | "from tensorflow.keras.models import Sequential"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 8,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
122 | "embedding_dim = 200\n",
123 | "input_length = 100\n",
124 | "gru_units = 128\n",
125 | "gru_dropout = 0.1\n",
126 | "recurrent_dropout = 0.1\n",
127 | "dropout = 0.1"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 9,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "name": "stderr",
137 | "output_type": "stream",
138 | "text": [
139 | "WARNING: Logging before flag parsing goes to stderr.\n",
140 | "W0719 09:43:55.179866 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
141 | "Instructions for updating:\n",
142 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
143 | "W0719 09:43:55.207387 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
144 | "Instructions for updating:\n",
145 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
146 | "W0719 09:43:55.215560 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
147 | "Instructions for updating:\n",
148 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
149 | "W0719 09:43:55.216914 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
150 | "Instructions for updating:\n",
151 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
152 | "W0719 09:43:55.219862 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
153 | "Instructions for updating:\n",
154 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "model = Sequential()\n",
160 | "model.add(Embedding(\n",
161 | " input_dim=input_dim,\n",
162 | " output_dim=embedding_dim,\n",
163 | " input_shape=(input_length,)\n",
164 | "))\n",
165 | "\n",
166 | "model.add(Bidirectional(GRU(\n",
167 | " gru_units,\n",
168 | " return_sequences=True,\n",
169 | " dropout=gru_dropout,\n",
170 | " recurrent_dropout=recurrent_dropout\n",
171 | ")))\n",
172 | "model.add(GlobalMaxPooling1D())\n",
173 | "model.add(Dense(32, activation='relu'))\n",
174 | "model.add(Dropout(dropout))\n",
175 | "\n",
176 | "model.add(Dense(1, activation='sigmoid'))"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 10,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "name": "stdout",
186 | "output_type": "stream",
187 | "text": [
188 | "Model: \"sequential\"\n",
189 | "_________________________________________________________________\n",
190 | "Layer (type) Output Shape Param # \n",
191 | "=================================================================\n",
192 | "embedding (Embedding) (None, 100, 200) 2000000 \n",
193 | "_________________________________________________________________\n",
194 | "bidirectional (Bidirectional (None, 100, 256) 252672 \n",
195 | "_________________________________________________________________\n",
196 | "global_max_pooling1d (Global (None, 256) 0 \n",
197 | "_________________________________________________________________\n",
198 | "dense (Dense) (None, 32) 8224 \n",
199 | "_________________________________________________________________\n",
200 | "dropout (Dropout) (None, 32) 0 \n",
201 | "_________________________________________________________________\n",
202 | "dense_1 (Dense) (None, 1) 33 \n",
203 | "=================================================================\n",
204 | "Total params: 2,260,929\n",
205 | "Trainable params: 2,260,929\n",
206 | "Non-trainable params: 0\n",
207 | "_________________________________________________________________\n",
208 | "None\n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "print(model.summary())"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 11,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "weights_path = Path('../models/sentiment_analysis/model_weights.h5').resolve()\n",
223 | "model.load_weights(weights_path.as_posix())"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "## Group data by emotion\n",
231 | "\n",
232 | "Use the emotions to group the data"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 12,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "import os\n",
242 | "import re\n",
243 | "import pandas as pd\n",
244 | "from tqdm import tqdm"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 13,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "files_dir = Path('../datasets/tweepy').resolve()"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 14,
259 | "metadata": {},
260 | "outputs": [
261 | {
262 | "name": "stderr",
263 | "output_type": "stream",
264 | "text": [
265 | "100%|██████████| 19/19 [00:00<00:00, 27.29it/s]\n"
266 | ]
267 | }
268 | ],
269 | "source": [
270 | "emotion_data_dict = {}\n",
271 | "\n",
272 | "filenames = os.listdir(files_dir)\n",
273 | "with tqdm(total=len(filenames)) as t:\n",
274 | " for filename in filenames:\n",
275 | " query = re.findall(r'(#[^.]+|:.+:)', filename)[0]\n",
276 | " emotion = relations[query]\n",
277 | "\n",
278 | " file_data = pd.read_csv(os.path.join(files_dir, filename))\n",
279 | " dict_data = emotion_data_dict[emotion] if emotion in emotion_data_dict else None\n",
280 | " emotion_data_dict[emotion] = pd.concat([dict_data, file_data])\n",
281 | " t.update()"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "## Predict emotion and filter data\n",
289 | "\n",
290 | "Predict emotion and filter rows for each group created in the step above"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 15,
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "name": "stderr",
300 | "output_type": "stream",
301 | "text": [
302 | "[nltk_data] Downloading package stopwords to\n",
303 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
304 | "[nltk_data] Package stopwords is already up-to-date!\n"
305 | ]
306 | }
307 | ],
308 | "source": [
309 | "import re\n",
310 | "import numpy as np\n",
311 | "from emoji import demojize\n",
312 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
313 | "from nlp import preprocess"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 16,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "def get_score_range(mean):\n",
323 | " if mean < 0.5:\n",
324 | " return (0.0, mean)\n",
325 | " return (mean, 1.0)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 17,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stderr",
335 | "output_type": "stream",
336 | "text": [
337 | "Processing \"joy\" data: 100%|██████████| 4/4 [03:28<00:00, 51.09s/it] "
338 | ]
339 | },
340 | {
341 | "name": "stdout",
342 | "output_type": "stream",
343 | "text": [
344 | "Fear: Score Range: 0.000000 - 0.434182\n",
345 | "Sadness: Score Range: 0.000000 - 0.220770\n",
346 | "Anger: Score Range: 0.000000 - 0.410283\n",
347 | "Joy: Score Range: 0.870705 - 1.000000\n"
348 | ]
349 | },
350 | {
351 | "name": "stderr",
352 | "output_type": "stream",
353 | "text": [
354 | "\n"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "result_data = []\n",
360 | "\n",
361 | "messages = []\n",
362 | "with tqdm(total=len(emotion_data_dict.items())) as t:\n",
363 | " for emotion, dataset in emotion_data_dict.items():\n",
364 | " t.set_description('Processing \"' + emotion + '\" data')\n",
365 | "\n",
366 | " cleaned_texts = preprocess(dataset.text, quiet=True)\n",
367 | " predict_sequences = [text.split() for text in cleaned_texts]\n",
368 | " list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)\n",
369 | " x_predict = pad_sequences(list_tokenized_predict, maxlen=100)\n",
370 | "\n",
371 | " result = model.predict(x_predict)\n",
372 | " mean = np.mean(result)\n",
373 | " std = np.std(result)\n",
374 | " low, high = get_score_range(mean)\n",
375 | " messages.append(emotion.capitalize() + \": Score Range: {:4f} - {:4f}\".format(low, high))\n",
376 | " dataset = dataset[np.all([(result >= low), (result <= high)], axis=0)]\n",
377 | " dataset.insert(0, 'label', emotion)\n",
378 | "\n",
379 | " result_data = result_data + [dataset]\n",
380 | " t.update()\n",
381 | "\n",
382 | "for message in messages:\n",
383 | " print(message)"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "## Save dataset\n",
391 | "\n",
392 | "Save the resulting data"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 18,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "Files saved under \"/Users/rmohashi/Workspace/emotion-from-tweet/datasets/sentiment_analysis/dataset.csv\"\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "if len(result_data) > 0:\n",
410 | " result_data = pd.concat(result_data)\n",
411 | "\n",
412 | " path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()\n",
413 | " result_data.to_csv(path, index=None)\n",
414 | "\n",
415 | " print('Files saved under \"' + path.as_posix() + '\"')"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": []
424 | }
425 | ],
426 | "metadata": {
427 | "kernelspec": {
428 | "display_name": "Python 3",
429 | "language": "python",
430 | "name": "python3"
431 | },
432 | "language_info": {
433 | "codemirror_mode": {
434 | "name": "ipython",
435 | "version": 3
436 | },
437 | "file_extension": ".py",
438 | "mimetype": "text/x-python",
439 | "name": "python",
440 | "nbconvert_exporter": "python",
441 | "pygments_lexer": "ipython3",
442 | "version": "3.6.8"
443 | }
444 | },
445 | "nbformat": 4,
446 | "nbformat_minor": 2
447 | }
448 |
--------------------------------------------------------------------------------
/notebooks/Predict Emotion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Predict Emotion\n",
8 | "\n",
9 | "The main objective of this notebook is to predict emotions from tweets"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pickle"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## Load Tokenizer\n",
41 | "\n",
42 | "Load `.pickle` file with the tokenizer"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
52 | "with tokenizer_path.open('rb') as file:\n",
53 | " tokenizer = pickle.load(file)"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Load Model\n",
61 | "\n",
62 | "Load the trained emotion recognition model"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 4,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
72 | "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
73 | "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
74 | "from tensorflow.keras.models import Model"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
84 | "num_classes = 4\n",
85 | "embedding_dim = 500\n",
86 | "input_length = 100\n",
87 | "lstm_units = 128\n",
88 | "lstm_dropout = 0.1\n",
89 | "recurrent_dropout = 0.1\n",
90 | "spatial_dropout=0.2\n",
91 | "filters=64\n",
92 | "kernel_size=3"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stderr",
102 | "output_type": "stream",
103 | "text": [
104 | "WARNING: Logging before flag parsing goes to stderr.\n",
105 | "W0719 10:47:51.968286 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
106 | "Instructions for updating:\n",
107 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
108 | "W0719 10:47:52.031774 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
109 | "Instructions for updating:\n",
110 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
111 | "W0719 10:47:52.039301 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
112 | "Instructions for updating:\n",
113 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
114 | "W0719 10:47:52.040482 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
115 | "Instructions for updating:\n",
116 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
117 | "W0719 10:47:52.041715 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
118 | "Instructions for updating:\n",
119 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "input_layer = Input(shape=(input_length,))\n",
125 | "output_layer = Embedding(\n",
126 | " input_dim=input_dim,\n",
127 | " output_dim=embedding_dim,\n",
128 | " input_shape=(input_length,)\n",
129 | ")(input_layer)\n",
130 | "\n",
131 | "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
132 | "\n",
133 | "output_layer = Bidirectional(\n",
134 | "LSTM(lstm_units, return_sequences=True,\n",
135 | " dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
136 | ")(output_layer)\n",
137 | "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
138 | " kernel_initializer='glorot_uniform')(output_layer)\n",
139 | "\n",
140 | "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
141 | "max_pool = GlobalMaxPooling1D()(output_layer)\n",
142 | "output_layer = concatenate([avg_pool, max_pool])\n",
143 | "\n",
144 | "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
145 | "\n",
146 | "model = Model(input_layer, output_layer)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 7,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
156 | "model.load_weights(model_weights_path.as_posix())"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## Load data\n",
164 | "\n",
165 | "Load the tweets whose emotion labels will be predicted by the model\n",
166 | "\n",
167 | "**data_path**: Path to the `.csv` file that will be used"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 8,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "import pandas as pd"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 9,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/html": [
187 | "\n",
188 | "\n",
201 | "
\n",
202 | " \n",
203 | " \n",
204 | " | \n",
205 | " id | \n",
206 | " date | \n",
207 | " user | \n",
208 | " text | \n",
209 | "
\n",
210 | " \n",
211 | " \n",
212 | " \n",
213 | " | 0 | \n",
214 | " 1151893341782585349 | \n",
215 | " 2019-07-18 16:35:48 | \n",
216 | " Ozzzylot | \n",
217 | " ⚡️ Fans share what Kyoto Animation studio mean... | \n",
218 | "
\n",
219 | " \n",
220 | " | 1 | \n",
221 | " 1151893322908102657 | \n",
222 | " 2019-07-18 16:35:43 | \n",
223 | " rosyutori | \n",
224 | " Deep condolences to all who are passed away at... | \n",
225 | "
\n",
226 | " \n",
227 | " | 2 | \n",
228 | " 1151893318101377024 | \n",
229 | " 2019-07-18 16:35:42 | \n",
230 | " met_bit | \n",
231 | " Striking news... How on earth can someone be s... | \n",
232 | "
\n",
233 | " \n",
234 | " | 3 | \n",
235 | " 1151893304117813248 | \n",
236 | " 2019-07-18 16:35:39 | \n",
237 | " Destructo_Dan | \n",
238 | " I don’t know if I had any favorite anime from ... | \n",
239 | "
\n",
240 | " \n",
241 | " | 4 | \n",
242 | " 1151893302863650816 | \n",
243 | " 2019-07-18 16:35:39 | \n",
244 | " KDiscavage | \n",
245 | " The news about Kyoto Animation Studios hit me ... | \n",
246 | "
\n",
247 | " \n",
248 | "
\n",
249 | "
"
250 | ],
251 | "text/plain": [
252 | " id date user \\\n",
253 | "0 1151893341782585349 2019-07-18 16:35:48 Ozzzylot \n",
254 | "1 1151893322908102657 2019-07-18 16:35:43 rosyutori \n",
255 | "2 1151893318101377024 2019-07-18 16:35:42 met_bit \n",
256 | "3 1151893304117813248 2019-07-18 16:35:39 Destructo_Dan \n",
257 | "4 1151893302863650816 2019-07-18 16:35:39 KDiscavage \n",
258 | "\n",
259 | " text \n",
260 | "0 ⚡️ Fans share what Kyoto Animation studio mean... \n",
261 | "1 Deep condolences to all who are passed away at... \n",
262 | "2 Striking news... How on earth can someone be s... \n",
263 | "3 I don’t know if I had any favorite anime from ... \n",
264 | "4 The news about Kyoto Animation Studios hit me ... "
265 | ]
266 | },
267 | "execution_count": 9,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "data_path = Path('../datasets/predict/1151893341782585349-1151863653320159233_kyoto_animation.csv').resolve()\n",
274 | "data = pd.read_csv(data_path)\n",
275 | "data.head()"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "## Load Encoder\n",
283 | "\n",
284 | "Load the `.pickle` file that contains the label encoder"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 10,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()\n",
294 | "with encoder_path.open('rb') as file:\n",
295 | " encoder = pickle.load(file)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "## Preprocess data\n",
303 | "\n",
304 | "Preprocess the data that will be used"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 11,
310 | "metadata": {},
311 | "outputs": [
312 | {
313 | "name": "stderr",
314 | "output_type": "stream",
315 | "text": [
316 | "[nltk_data] Downloading package stopwords to\n",
317 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
318 | "[nltk_data] Package stopwords is already up-to-date!\n"
319 | ]
320 | }
321 | ],
322 | "source": [
323 | "from nlp import preprocess\n",
324 | "from tensorflow.keras.preprocessing.sequence import pad_sequences"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 12,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | "Time to clean up: 1.41 sec\n"
337 | ]
338 | }
339 | ],
340 | "source": [
341 | "cleaned_data = preprocess(data.text)\n",
342 | "sequences = [text.split() for text in cleaned_data]\n",
343 | "list_tokenized = tokenizer.texts_to_sequences(sequences)\n",
344 | "x_data = pad_sequences(list_tokenized, maxlen=100)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "## Results\n",
352 | "\n",
353 | "Predict the labels and report the share of each predicted emotion"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 13,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "import numpy as np"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 14,
368 | "metadata": {},
369 | "outputs": [],
370 | "source": [
371 | "y_pred = model.predict(x_data)"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 15,
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "name": "stdout",
381 | "output_type": "stream",
382 | "text": [
383 | "angry: 0.0977998\n",
384 | "fear: 0.3991122\n",
385 | "joy: 0.03104621\n",
386 | "sadness: 0.4720413\n"
387 | ]
388 | }
389 | ],
390 | "source": [
391 | "for index, value in enumerate(np.sum(y_pred, axis=0) / len(y_pred)):\n",
392 | " print(encoder.classes_[index] + \": \" + str(value))"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 16,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "angry: 0.09889558232931726\n",
405 | "fear: 0.4011044176706827\n",
406 | "joy: 0.030622489959839357\n",
407 | "sadness: 0.46937751004016065\n"
408 | ]
409 | }
410 | ],
411 | "source": [
412 | "y_pred_argmax = y_pred.argmax(axis=1)\n",
413 | "data_len = len(y_pred_argmax)\n",
414 | "for value in np.unique(y_pred_argmax):  # iterate predicted class ids; a class that is never predicted is skipped\n",
415 | " print(encoder.classes_[value] + \": \" + str(len(y_pred_argmax[y_pred_argmax == value]) / data_len))"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 17,
421 | "metadata": {},
422 | "outputs": [
423 | {
424 | "data": {
425 | "text/plain": [
426 | "array([3, 3, 3, 3, 3])"
427 | ]
428 | },
429 | "execution_count": 17,
430 | "metadata": {},
431 | "output_type": "execute_result"
432 | }
433 | ],
434 | "source": [
435 | "y_pred[5:10].argmax(axis=1)"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": 18,
441 | "metadata": {},
442 | "outputs": [
443 | {
444 | "data": {
445 | "text/plain": [
446 | "'My heart goes out to the people who died in the fire at Kyoto Animation Studio. \\n\\n#PrayForKyoani https://t.co/Jvg9R8f6Oc'"
447 | ]
448 | },
449 | "execution_count": 18,
450 | "metadata": {},
451 | "output_type": "execute_result"
452 | }
453 | ],
454 | "source": [
455 | "data.text.iloc[6]"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {},
462 | "outputs": [],
463 | "source": []
464 | }
465 | ],
466 | "metadata": {
467 | "kernelspec": {
468 | "display_name": "Python 3",
469 | "language": "python",
470 | "name": "python3"
471 | },
472 | "language_info": {
473 | "codemirror_mode": {
474 | "name": "ipython",
475 | "version": 3
476 | },
477 | "file_extension": ".py",
478 | "mimetype": "text/x-python",
479 | "name": "python",
480 | "nbconvert_exporter": "python",
481 | "pygments_lexer": "ipython3",
482 | "version": "3.6.8"
483 | }
484 | },
485 | "nbformat": 4,
486 | "nbformat_minor": 2
487 | }
488 |
--------------------------------------------------------------------------------
/notebooks/Train Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Train Sentiment Analysis\n",
8 | "\n",
9 | "Here we'll train a sentiment analysis model to validate the data from the API."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import warnings\n",
19 | "warnings.filterwarnings('ignore')"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "from pathlib import Path\n",
29 | "import pandas as pd"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "sentiment140_path = Path('../datasets/sentiment140/sentiment140.csv')\n",
39 | "data = pd.read_csv(sentiment140_path)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/html": [
50 | "\n",
51 | "\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | " | \n",
68 | " label | \n",
69 | " tweet | \n",
70 | "
\n",
71 | " \n",
72 | " \n",
73 | " \n",
74 | " | 0 | \n",
75 | " 0 | \n",
76 | " @whiskey_kitten www.Pandora.com - plays music ... | \n",
77 | "
\n",
78 | " \n",
79 | " | 1 | \n",
80 | " 0 | \n",
81 | " studying for a test I hope not to fail....most... | \n",
82 | "
\n",
83 | " \n",
84 | " | 2 | \n",
85 | " 4 | \n",
86 | " @BlowhornOz Oh! Doesn't sound so good, I got t... | \n",
87 | "
\n",
88 | " \n",
89 | " | 3 | \n",
90 | " 0 | \n",
91 | " tomorrow is my last day at A&D HS fml and... | \n",
92 | "
\n",
93 | " \n",
94 | " | 4 | \n",
95 | " 0 | \n",
96 | " Journalism has no future? That sounds pretty m... | \n",
97 | "
\n",
98 | " \n",
99 | "
\n",
100 | "
"
101 | ],
102 | "text/plain": [
103 | " label tweet\n",
104 | "0 0 @whiskey_kitten www.Pandora.com - plays music ...\n",
105 | "1 0 studying for a test I hope not to fail....most...\n",
106 | "2 4 @BlowhornOz Oh! Doesn't sound so good, I got t...\n",
107 | "3 0 tomorrow is my last day at A&D HS fml and...\n",
108 | "4 0 Journalism has no future? That sounds pretty m..."
109 | ]
110 | },
111 | "execution_count": 4,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "data.head()"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "## Data preprocessing\n",
125 | "\n",
126 | "Preprocess the texts:\n",
127 | "- Convert to Lowercase: Convert all characters from the text to lowercase\n",
128 | "- Remove special characters: Remove links and usernames and transform emojis to text\n",
129 | "- Remove repetitions: Remove char repetitions (e.g. whaaaaaat => what)\n",
130 | "- Remove stop words: Remove common English stop words, keeping the negations `not`, `nor` and `no`"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 5,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stderr",
140 | "output_type": "stream",
141 | "text": [
142 | "[nltk_data] Downloading package stopwords to\n",
143 | "[nltk_data] /home/rmohashi/nltk_data...\n",
144 | "[nltk_data] Package stopwords is already up-to-date!\n"
145 | ]
146 | },
147 | {
148 | "data": {
149 | "text/plain": [
150 | "True"
151 | ]
152 | },
153 | "execution_count": 5,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "import re\n",
160 | "from time import time\n",
161 | "import nltk\n",
162 | "from emoji import demojize\n",
163 | "\n",
164 | "nltk.download('stopwords')"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 6,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "Time to clean up: 78.03 sec\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "texts = data.tweet\n",
182 | "\n",
183 | "start = time()\n",
184 | "# Lowercasing\n",
185 | "texts = texts.str.lower()\n",
186 | "\n",
187 | "# Remove special chars\n",
188 | "texts = texts.str.replace(r\"(http|@)\\S+\", \"\")\n",
189 | "texts = texts.apply(demojize)\n",
190 | "texts = texts.str.replace(r\"::\", \": :\")\n",
191 | "texts = texts.str.replace(r\"’\", \"'\")\n",
192 | "texts = texts.str.replace(r\"[^a-z\\':_]\", \" \")\n",
193 | "\n",
194 | "# Remove repetitions\n",
195 | "pattern = re.compile(r\"(.)\\1{2,}\", re.DOTALL)\n",
196 | "texts = texts.str.replace(pattern, r\"\\1\")\n",
197 | "\n",
198 | "# Transform short negation form\n",
199 | "texts = texts.str.replace(r\"(can't|cannot)\", 'can not')\n",
200 | "texts = texts.str.replace(r\"n't\", ' not')\n",
201 | "\n",
202 | "# Remove stop words\n",
203 | "stopwords = nltk.corpus.stopwords.words('english')\n",
204 | "stopwords.remove('not')\n",
205 | "stopwords.remove('nor')\n",
206 | "stopwords.remove('no')\n",
207 | "texts = texts.apply(\n",
208 | " lambda x: ' '.join([word for word in x.split() if word not in stopwords])\n",
209 | ")\n",
210 | "\n",
211 | "print(\"Time to clean up: {:.2f} sec\".format(time() - start))\n",
212 | "\n",
213 | "data.tweet = texts"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## Tokenize\n",
221 | "\n",
222 | "Transform the text corpus to a vector representation\n",
223 | "\n",
224 | "- **num_words**: Maximum number of words to keep, ranked by frequency"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 7,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "num_words = 10000"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 8,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "import pickle\n",
243 | "from tensorflow.keras.preprocessing.text import Tokenizer"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 9,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "tokenizer = Tokenizer(num_words=num_words, lower=True)\n",
253 | "tokenizer.fit_on_texts(data.tweet)\n",
254 | "\n",
255 | "file_to_save = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
256 | "with file_to_save.open('wb') as file:\n",
257 | " pickle.dump(tokenizer, file)"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "## Split data\n",
265 | "\n",
266 | "Split the dataset into train and validation data"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 10,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "from sklearn.model_selection import train_test_split"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 11,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "train = pd.DataFrame(columns=['label', 'tweet'])\n",
285 | "validation = pd.DataFrame(columns=['label', 'tweet'])\n",
286 | "for label in data.label.unique():\n",
287 | " label_data = data[data.label == label]\n",
288 | " train_data, validation_data = train_test_split(label_data, test_size=0.3)\n",
289 | " train = pd.concat([train, train_data])\n",
290 | " validation = pd.concat([validation, validation_data])"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "## Model\n",
298 | "\n",
299 | "Define the Bidirectional GRU model"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 12,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "from tensorflow.keras.layers import Input, Embedding, GRU\n",
309 | "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
310 | "from tensorflow.keras.layers import Bidirectional, Dense\n",
311 | "from tensorflow.keras.models import Sequential"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 13,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
321 | "embedding_dim = 200\n",
322 | "input_length = 100\n",
323 | "gru_units = 128\n",
324 | "gru_dropout = 0.1\n",
325 | "recurrent_dropout = 0.1\n",
326 | "dropout = 0.1"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 14,
332 | "metadata": {},
333 | "outputs": [
334 | {
335 | "name": "stderr",
336 | "output_type": "stream",
337 | "text": [
338 | "WARNING: Logging before flag parsing goes to stderr.\n",
339 | "W0716 13:36:20.397812 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
340 | "Instructions for updating:\n",
341 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
342 | "W0716 13:36:20.410246 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
343 | "Instructions for updating:\n",
344 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
345 | "W0716 13:36:20.413324 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
346 | "Instructions for updating:\n",
347 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
348 | "W0716 13:36:20.413828 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
349 | "Instructions for updating:\n",
350 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
351 | "W0716 13:36:20.414215 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
352 | "Instructions for updating:\n",
353 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
354 | ]
355 | }
356 | ],
357 | "source": [
358 | "model = Sequential()\n",
359 | "model.add(Embedding(\n",
360 | " input_dim=input_dim,\n",
361 | " output_dim=embedding_dim,\n",
362 | " input_shape=(input_length,)\n",
363 | "))\n",
364 | "\n",
365 | "model.add(Bidirectional(GRU(\n",
366 | " gru_units,\n",
367 | " return_sequences=True,\n",
368 | " dropout=gru_dropout,\n",
369 | " recurrent_dropout=recurrent_dropout\n",
370 | ")))\n",
371 | "model.add(GlobalMaxPooling1D())\n",
372 | "model.add(Dense(32, activation='relu'))\n",
373 | "model.add(Dropout(dropout))\n",
374 | "\n",
375 | "model.add(Dense(1, activation='sigmoid'))"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 15,
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "name": "stderr",
385 | "output_type": "stream",
386 | "text": [
387 | "W0716 13:36:20.902724 140315330369344 deprecation.py:323] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
388 | "Instructions for updating:\n",
389 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n"
390 | ]
391 | },
392 | {
393 | "name": "stdout",
394 | "output_type": "stream",
395 | "text": [
396 | "Model: \"sequential\"\n",
397 | "_________________________________________________________________\n",
398 | "Layer (type) Output Shape Param # \n",
399 | "=================================================================\n",
400 | "embedding (Embedding) (None, 100, 200) 2000000 \n",
401 | "_________________________________________________________________\n",
402 | "bidirectional (Bidirectional (None, 100, 256) 252672 \n",
403 | "_________________________________________________________________\n",
404 | "global_max_pooling1d (Global (None, 256) 0 \n",
405 | "_________________________________________________________________\n",
406 | "dense (Dense) (None, 32) 8224 \n",
407 | "_________________________________________________________________\n",
408 | "dropout (Dropout) (None, 32) 0 \n",
409 | "_________________________________________________________________\n",
410 | "dense_1 (Dense) (None, 1) 33 \n",
411 | "=================================================================\n",
412 | "Total params: 2,260,929\n",
413 | "Trainable params: 2,260,929\n",
414 | "Non-trainable params: 0\n",
415 | "_________________________________________________________________\n",
416 | "None\n"
417 | ]
418 | }
419 | ],
420 | "source": [
421 | "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
422 | "print(model.summary())"
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | "## Prepare the data\n",
430 | "\n",
431 | "Prepare the model input data"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": 16,
437 | "metadata": {},
438 | "outputs": [],
439 | "source": [
440 | "from tensorflow.keras.preprocessing.sequence import pad_sequences"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 17,
446 | "metadata": {},
447 | "outputs": [],
448 | "source": [
449 | "train_sequences = [text.split() for text in train.tweet]\n",
450 | "validation_sequences = [text.split() for text in validation.tweet]\n",
451 | "list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)\n",
452 | "list_tokenized_validation = tokenizer.texts_to_sequences(validation_sequences)\n",
453 | "\n",
454 | "x_train = pad_sequences(list_tokenized_train, maxlen=input_length)\n",
455 | "x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)\n",
456 | "y_train = train.label.replace(4, 1)\n",
457 | "y_validation = validation.label.replace(4, 1)"
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "metadata": {},
463 | "source": [
464 | "## Train model\n",
465 | "\n",
466 | "Do the training process with the given data"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 18,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "batch_size = 128\n",
476 | "epochs = 1"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 19,
482 | "metadata": {},
483 | "outputs": [
484 | {
485 | "name": "stdout",
486 | "output_type": "stream",
487 | "text": [
488 | "Train on 280000 samples, validate on 120000 samples\n",
489 | "280000/280000 [==============================] - 374s 1ms/sample - loss: 0.4637 - acc: 0.7804 - val_loss: 0.4366 - val_acc: 0.7937\n"
490 | ]
491 | },
492 | {
493 | "data": {
494 | "text/plain": [
495 | ""
496 | ]
497 | },
498 | "execution_count": 19,
499 | "metadata": {},
500 | "output_type": "execute_result"
501 | }
502 | ],
503 | "source": [
504 | "model.fit(\n",
505 | " x_train,\n",
506 | " y=y_train,\n",
507 | " batch_size=batch_size,\n",
508 | " epochs=epochs,\n",
509 | " validation_data=(x_validation, y_validation),\n",
510 | ")"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 20,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": [
519 | "model_file = Path('../models/sentiment_analysis/gru_model.h5').resolve()\n",
520 | "model.save_weights(model_file.as_posix())"
521 | ]
522 | }
523 | ],
524 | "metadata": {
525 | "kernelspec": {
526 | "display_name": "Python 3",
527 | "language": "python",
528 | "name": "python3"
529 | },
530 | "language_info": {
531 | "codemirror_mode": {
532 | "name": "ipython",
533 | "version": 3
534 | },
535 | "file_extension": ".py",
536 | "mimetype": "text/x-python",
537 | "name": "python",
538 | "nbconvert_exporter": "python",
539 | "pygments_lexer": "ipython3",
540 | "version": "3.6.8"
541 | }
542 | },
543 | "nbformat": 4,
544 | "nbformat_minor": 2
545 | }
546 |
--------------------------------------------------------------------------------
/notebooks/Train Emotion Recognition Model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Train Emotion Recognition Model\n",
8 | "\n",
9 | "Here we'll train an emotion recognition model, using the output data from the sentiment analysis."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Load Dataset\n",
32 | "\n",
33 | "Load the emotion labeled dataset"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stderr",
43 | "output_type": "stream",
44 | "text": [
45 | "[nltk_data] Downloading package stopwords to\n",
46 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
47 | "[nltk_data] Package stopwords is already up-to-date!\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "from pathlib import Path\n",
53 | "import pandas as pd\n",
54 | "from nlp import Dataset"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "Time to clean up: 19.33 sec\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "dataset_path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()\n",
72 | "dataset = Dataset(dataset_path)\n",
73 | "dataset.load()\n",
74 | "dataset.preprocess_texts()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 4,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/html": [
85 | "\n",
86 | "\n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " | \n",
103 | " label | \n",
104 | " text | \n",
105 | "
\n",
106 | " \n",
107 | " \n",
108 | " \n",
109 | " | 0 | \n",
110 | " fear | \n",
111 | " sometimes afraid thing set free gt | \n",
112 | "
\n",
113 | " \n",
114 | " | 1 | \n",
115 | " fear | \n",
116 | " delayed post afraid | \n",
117 | "
\n",
118 | " \n",
119 | " | 2 | \n",
120 | " fear | \n",
121 | " eyeson seesomethingsaysomething cia clowns dee... | \n",
122 | "
\n",
123 | " \n",
124 | " | 3 | \n",
125 | " fear | \n",
126 | " happybirthdaystevenavery corruptiwoccounty afr... | \n",
127 | "
\n",
128 | " \n",
129 | " | 4 | \n",
130 | " fear | \n",
131 | " fight fire fire think reign fire comment check... | \n",
132 | "
\n",
133 | " \n",
134 | "
\n",
135 | "
"
136 | ],
137 | "text/plain": [
138 | " label text\n",
139 | "0 fear sometimes afraid thing set free gt\n",
140 | "1 fear delayed post afraid\n",
141 | "2 fear eyeson seesomethingsaysomething cia clowns dee...\n",
142 | "3 fear happybirthdaystevenavery corruptiwoccounty afr...\n",
143 | "4 fear fight fire fire think reign fire comment check..."
144 | ]
145 | },
146 | "execution_count": 4,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "dataset.cleaned_data.head()"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "## Tokenize\n",
160 | "\n",
161 | "Fit a tokenizer that maps each word of the text corpus to an integer index, and save it to a `.pickle` file for later use\n",
162 | "\n",
163 | "- **num_words**: Maximum number of words to keep in the vocabulary, based on word frequency"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 5,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "import pickle\n",
173 | "from tensorflow.keras.preprocessing.text import Tokenizer"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 6,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "num_words = 10000\n",
183 | "\n",
184 | "tokenizer = Tokenizer(num_words=num_words, lower=True)\n",
185 | "tokenizer.fit_on_texts(dataset.cleaned_data.text)\n",
186 | "\n",
187 | "file_to_save = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
188 | "with file_to_save.open('wb') as file:\n",
189 | " pickle.dump(tokenizer, file)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "## Split data\n",
197 | "\n",
198 | "Split the dataset into train (70%) and validation (30%) data, label by label, so that both sets keep the same label proportions"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 7,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "from sklearn.model_selection import train_test_split"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 8,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "data = dataset.cleaned_data.copy()\n",
217 | "\n",
218 | "train = pd.DataFrame(columns=['label', 'text'])\n",
219 | "validation = pd.DataFrame(columns=['label', 'text'])\n",
220 | "for label in data.label.unique():\n",
221 | " label_data = data[data.label == label]\n",
222 | " train_data, validation_data = train_test_split(label_data, test_size=0.3)\n",
223 | " train = pd.concat([train, train_data])\n",
224 | " validation = pd.concat([validation, validation_data])"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "## Model\n",
232 | "\n",
233 | "Define the **LSTM** + **CNN** model"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 9,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
243 | "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
244 | "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
245 | "from tensorflow.keras.models import Model"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 10,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
255 | "num_classes = len(data.label.unique())\n",
256 | "embedding_dim = 500\n",
257 | "input_length = 100\n",
258 | "lstm_units = 128\n",
259 | "lstm_dropout = 0.1\n",
260 | "recurrent_dropout = 0.1\n",
261 | "spatial_dropout=0.2\n",
262 | "filters=64\n",
263 | "kernel_size=3"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 11,
269 | "metadata": {
270 | "scrolled": false
271 | },
272 | "outputs": [
273 | {
274 | "name": "stderr",
275 | "output_type": "stream",
276 | "text": [
277 | "WARNING: Logging before flag parsing goes to stderr.\n",
278 | "W0719 10:32:00.331336 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
279 | "Instructions for updating:\n",
280 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
281 | "W0719 10:32:00.392153 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
282 | "Instructions for updating:\n",
283 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
284 | "W0719 10:32:00.397410 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
285 | "Instructions for updating:\n",
286 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
287 | "W0719 10:32:00.399722 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
288 | "Instructions for updating:\n",
289 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
290 | "W0719 10:32:00.403119 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
291 | "Instructions for updating:\n",
292 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
293 | ]
294 | }
295 | ],
296 | "source": [
297 | "input_layer = Input(shape=(input_length,))\n",
298 | "output_layer = Embedding(\n",
299 | " input_dim=input_dim,\n",
300 | " output_dim=embedding_dim,\n",
301 | " input_shape=(input_length,)\n",
302 | ")(input_layer)\n",
303 | "\n",
304 | "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
305 | "\n",
306 | "output_layer = Bidirectional(\n",
307 | "LSTM(lstm_units, return_sequences=True,\n",
308 | " dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
309 | ")(output_layer)\n",
310 | "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
311 | " kernel_initializer='glorot_uniform')(output_layer)\n",
312 | "\n",
313 | "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
314 | "max_pool = GlobalMaxPooling1D()(output_layer)\n",
315 | "output_layer = concatenate([avg_pool, max_pool])\n",
316 | "\n",
317 | "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
318 | "\n",
319 | "model = Model(input_layer, output_layer)"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 12,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "Model: \"model\"\n",
332 | "__________________________________________________________________________________________________\n",
333 | "Layer (type) Output Shape Param # Connected to \n",
334 | "==================================================================================================\n",
335 | "input_1 (InputLayer) [(None, 100)] 0 \n",
336 | "__________________________________________________________________________________________________\n",
337 | "embedding (Embedding) (None, 100, 500) 5000000 input_1[0][0] \n",
338 | "__________________________________________________________________________________________________\n",
339 | "spatial_dropout1d (SpatialDropo (None, 100, 500) 0 embedding[0][0] \n",
340 | "__________________________________________________________________________________________________\n",
341 | "bidirectional (Bidirectional) (None, 100, 256) 644096 spatial_dropout1d[0][0] \n",
342 | "__________________________________________________________________________________________________\n",
343 | "conv1d (Conv1D) (None, 98, 64) 49216 bidirectional[0][0] \n",
344 | "__________________________________________________________________________________________________\n",
345 | "global_average_pooling1d (Globa (None, 64) 0 conv1d[0][0] \n",
346 | "__________________________________________________________________________________________________\n",
347 | "global_max_pooling1d (GlobalMax (None, 64) 0 conv1d[0][0] \n",
348 | "__________________________________________________________________________________________________\n",
349 | "concatenate (Concatenate) (None, 128) 0 global_average_pooling1d[0][0] \n",
350 | " global_max_pooling1d[0][0] \n",
351 | "__________________________________________________________________________________________________\n",
352 | "dense (Dense) (None, 4) 516 concatenate[0][0] \n",
353 | "==================================================================================================\n",
354 | "Total params: 5,693,828\n",
355 | "Trainable params: 5,693,828\n",
356 | "Non-trainable params: 0\n",
357 | "__________________________________________________________________________________________________\n"
358 | ]
359 | }
360 | ],
361 | "source": [
362 | "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
363 | "model.summary()"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "## Prepare the data\n",
371 | "\n",
372 | "Prepare the model input data: convert the texts to padded integer sequences and binarize the labels"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 13,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
382 | "from sklearn.preprocessing import LabelBinarizer"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 14,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "train_sequences = [text.split() for text in train.text]\n",
392 | "validation_sequences = [text.split() for text in validation.text]\n",
393 | "list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)\n",
394 | "list_tokenized_validation = tokenizer.texts_to_sequences(validation_sequences)\n",
395 | "x_train = pad_sequences(list_tokenized_train, maxlen=input_length)\n",
396 | "x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)\n",
397 | "\n",
398 | "encoder = LabelBinarizer()\n",
399 | "encoder.fit(data.label.unique())\n",
400 | "\n",
401 | "encoder_path = Path('../models/emotion_recognition', 'encoder.pickle')\n",
402 | "with encoder_path.open('wb') as file:\n",
403 | " pickle.dump(encoder, file)\n",
404 | "\n",
405 | "y_train = encoder.transform(train.label)\n",
406 | "y_validation = encoder.transform(validation.label)"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "## Train model\n",
414 | "\n",
415 | "Train the model on the training data, evaluating it on the validation data at the end of each epoch"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 15,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "batch_size = 128\n",
425 | "epochs = 1"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 16,
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "name": "stdout",
435 | "output_type": "stream",
436 | "text": [
437 | "Train on 25454 samples, validate on 10911 samples\n"
438 | ]
439 | },
440 | {
441 | "name": "stderr",
442 | "output_type": "stream",
443 | "text": [
444 | "W0719 10:32:03.006144 4686337472 deprecation.py:323] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
445 | "Instructions for updating:\n",
446 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n"
447 | ]
448 | },
449 | {
450 | "name": "stdout",
451 | "output_type": "stream",
452 | "text": [
453 | "25454/25454 [==============================] - 570s 22ms/sample - loss: 0.5621 - acc: 0.7593 - val_loss: 0.3839 - val_acc: 0.8381\n"
454 | ]
455 | },
456 | {
457 | "data": {
458 | "text/plain": [
459 | ""
460 | ]
461 | },
462 | "execution_count": 16,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | }
466 | ],
467 | "source": [
468 | "model.fit(\n",
469 | " x_train,\n",
470 | " y=y_train,\n",
471 | " batch_size=batch_size,\n",
472 | " epochs=epochs,\n",
473 | " validation_data=(x_validation, y_validation)\n",
474 | ")"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": 17,
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "model_file = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
484 | "model.save_weights(model_file.as_posix())"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": []
493 | }
494 | ],
495 | "metadata": {
496 | "kernelspec": {
497 | "display_name": "Python 3",
498 | "language": "python",
499 | "name": "python3"
500 | },
501 | "language_info": {
502 | "codemirror_mode": {
503 | "name": "ipython",
504 | "version": 3
505 | },
506 | "file_extension": ".py",
507 | "mimetype": "text/x-python",
508 | "name": "python",
509 | "nbconvert_exporter": "python",
510 | "pygments_lexer": "ipython3",
511 | "version": "3.6.8"
512 | }
513 | },
514 | "nbformat": 4,
515 | "nbformat_minor": 2
516 | }
517 |
--------------------------------------------------------------------------------
/notebooks/Sentiment Analysis Score.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Sentiment Analysis Score\n",
8 | "\n",
9 | "Use a deep learning model to predict the sentiment polarity of the texts collected for each query/emotion inside the relations file"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Load Tokenizer\n",
32 | "\n",
33 | "Import and load the tokenizer from a `.pickle` file"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import pickle\n",
43 | "from pathlib import Path"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "tokenizer_file = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
53 | "with tokenizer_file.open('rb') as file:\n",
54 | " tokenizer = pickle.load(file)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Load Model\n",
62 | "\n",
63 | "Load the sentiment analysis model, using the saved weights"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 6,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from tensorflow.keras.layers import Input, Embedding, GRU\n",
73 | "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
74 | "from tensorflow.keras.layers import Bidirectional, Dense\n",
75 | "from tensorflow.keras.models import Sequential"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 7,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
85 | "embedding_dim = 200\n",
86 | "input_length = 100\n",
87 | "gru_units = 128\n",
88 | "gru_dropout = 0.1\n",
89 | "recurrent_dropout = 0.1\n",
90 | "dropout = 0.1"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 8,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "name": "stderr",
100 | "output_type": "stream",
101 | "text": [
102 | "WARNING: Logging before flag parsing goes to stderr.\n",
103 | "W0719 09:56:43.758275 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
104 | "Instructions for updating:\n",
105 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
106 | "W0719 09:56:43.802737 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
107 | "Instructions for updating:\n",
108 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
109 | "W0719 09:56:43.809999 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
110 | "Instructions for updating:\n",
111 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
112 | "W0719 09:56:43.811434 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
113 | "Instructions for updating:\n",
114 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
115 | "W0719 09:56:43.813139 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
116 | "Instructions for updating:\n",
117 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "model = Sequential()\n",
123 | "model.add(Embedding(\n",
124 | " input_dim=input_dim,\n",
125 | " output_dim=embedding_dim,\n",
126 | " input_shape=(input_length,)\n",
127 | "))\n",
128 | "\n",
129 | "model.add(Bidirectional(GRU(\n",
130 | " gru_units,\n",
131 | " return_sequences=True,\n",
132 | " dropout=gru_dropout,\n",
133 | " recurrent_dropout=recurrent_dropout\n",
134 | ")))\n",
135 | "model.add(GlobalMaxPooling1D())\n",
136 | "model.add(Dense(32, activation='relu'))\n",
137 | "model.add(Dropout(dropout))\n",
138 | "\n",
139 | "model.add(Dense(1, activation='sigmoid'))"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "weights_path = Path('../models/sentiment_analysis/model_weights.h5').resolve()\n",
149 | "model.load_weights(weights_path.as_posix())"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "## Load Query Relations\n",
157 | "\n",
158 | "Load the relations between queries and emotions from a `.json` file"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 10,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "import json"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 12,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "relations_path = Path('../query_relations.json')\n",
177 | "with relations_path.open('r') as file:\n",
178 | " relations = json.load(file)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "## Predict polarity\n",
186 | "\n",
187 | "Predict the polarity of the texts, using the sentiment analysis model"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 13,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stderr",
197 | "output_type": "stream",
198 | "text": [
199 | "[nltk_data] Downloading package stopwords to\n",
200 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
201 | "[nltk_data] Package stopwords is already up-to-date!\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
207 | "from nlp import preprocess\n",
208 | "from tqdm import tqdm\n",
209 | "import pandas as pd\n",
210 | "import numpy as np\n",
211 | "import re"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 14,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "dataset_dir = Path('../datasets/tweepy').resolve()"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 15,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "name": "stderr",
230 | "output_type": "stream",
231 | "text": [
232 | "100%|██████████| 19/19 [02:59<00:00, 12.95s/it]\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "data_dict = {}\n",
238 | "\n",
239 | "query_dict = {\n",
240 | " 'query': [],\n",
241 | " 'mean': [],\n",
242 | " 'max': [],\n",
243 | " 'min': [],\n",
244 | " 'std': [],\n",
245 | " 'count': [],\n",
246 | " 'emotion': []\n",
247 | "}\n",
248 | "\n",
249 | "dir_files = os.listdir(dataset_dir)\n",
250 | "\n",
251 | "with tqdm(total=len(dir_files)) as t:\n",
252 | " for filename in dir_files:\n",
253 | " dataset = pd.read_csv(os.path.join(dataset_dir, filename))\n",
254 | " cleaned_texts = preprocess(dataset.text, quiet=True)\n",
255 | "\n",
256 | " query = re.findall(r'(#[^.]+|:.+:)', filename)[0]\n",
257 | "\n",
258 | " predict_sequences = [text.split() for text in cleaned_texts]\n",
259 | " list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)\n",
260 | " x_predict = pad_sequences(list_tokenized_predict, maxlen=100)\n",
261 | "\n",
262 | " result = model.predict(x_predict)\n",
263 | " \n",
264 | " emotion = relations[query]\n",
265 | " query_dict['query'].append(query)\n",
266 | " query_dict['mean'].append(np.mean(result))\n",
267 | " query_dict['max'].append(np.amax(result))\n",
268 | " query_dict['min'].append(np.amin(result))\n",
269 | " query_dict['count'].append(len(dataset))\n",
270 | " query_dict['std'].append(np.std(result))\n",
271 | " query_dict['emotion'].append(emotion)\n",
272 | "\n",
273 | " if emotion in data_dict:\n",
274 | " data_dict[emotion] = np.concatenate([data_dict[emotion], result])\n",
275 | " else:\n",
276 | " data_dict[emotion] = result\n",
277 | " \n",
278 | " t.update()"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "## Print Results\n",
286 | "\n",
287 | "Display the polarity statistics (mean, max, min, std and count) for each query, grouped by emotion, and then for each emotion as a whole"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 10,
293 | "metadata": {
294 | "scrolled": false
295 | },
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/html": [
300 | "\n",
301 | "\n",
314 | "
\n",
315 | " \n",
316 | " \n",
317 | " | \n",
318 | " query | \n",
319 | " mean | \n",
320 | " max | \n",
321 | " min | \n",
322 | " std | \n",
323 | " count | \n",
324 | " emotion | \n",
325 | "
\n",
326 | " \n",
327 | " \n",
328 | " \n",
329 | " | 0 | \n",
330 | " :anxious_face_with_sweat: | \n",
331 | " 0.428562 | \n",
332 | " 0.983642 | \n",
333 | " 0.004371 | \n",
334 | " 0.274272 | \n",
335 | " 199 | \n",
336 | " fear | \n",
337 | "
\n",
338 | " \n",
339 | " | 6 | \n",
340 | " #worried | \n",
341 | " 0.205504 | \n",
342 | " 0.879476 | \n",
343 | " 0.004883 | \n",
344 | " 0.210547 | \n",
345 | " 196 | \n",
346 | " fear | \n",
347 | "
\n",
348 | " \n",
349 | "
\n",
350 | "
"
351 | ],
352 | "text/plain": [
353 | " query mean max min std count \\\n",
354 | "0 :anxious_face_with_sweat: 0.428562 0.983642 0.004371 0.274272 199 \n",
355 | "6 #worried 0.205504 0.879476 0.004883 0.210547 196 \n",
356 | "\n",
357 | " emotion \n",
358 | "0 fear \n",
359 | "6 fear "
360 | ]
361 | },
362 | "metadata": {},
363 | "output_type": "display_data"
364 | },
365 | {
366 | "data": {
367 | "text/html": [
368 | "\n",
369 | "\n",
382 | "
\n",
383 | " \n",
384 | " \n",
385 | " | \n",
386 | " query | \n",
387 | " mean | \n",
388 | " max | \n",
389 | " min | \n",
390 | " std | \n",
391 | " count | \n",
392 | " emotion | \n",
393 | "
\n",
394 | " \n",
395 | " \n",
396 | " \n",
397 | " | 1 | \n",
398 | " #sad | \n",
399 | " 0.073413 | \n",
400 | " 0.873629 | \n",
401 | " 0.002289 | \n",
402 | " 0.127914 | \n",
403 | " 200 | \n",
404 | " sadness | \n",
405 | "
\n",
406 | " \n",
407 | " | 2 | \n",
408 | " :crying_face: | \n",
409 | " 0.438269 | \n",
410 | " 0.996975 | \n",
411 | " 0.005851 | \n",
412 | " 0.296389 | \n",
413 | " 197 | \n",
414 | " sadness | \n",
415 | "
\n",
416 | " \n",
417 | "
\n",
418 | "
"
419 | ],
420 | "text/plain": [
421 | " query mean max min std count emotion\n",
422 | "1 #sad 0.073413 0.873629 0.002289 0.127914 200 sadness\n",
423 | "2 :crying_face: 0.438269 0.996975 0.005851 0.296389 197 sadness"
424 | ]
425 | },
426 | "metadata": {},
427 | "output_type": "display_data"
428 | },
429 | {
430 | "data": {
431 | "text/html": [
432 | "\n",
433 | "\n",
446 | "
\n",
447 | " \n",
448 | " \n",
449 | " | \n",
450 | " query | \n",
451 | " mean | \n",
452 | " max | \n",
453 | " min | \n",
454 | " std | \n",
455 | " count | \n",
456 | " emotion | \n",
457 | "
\n",
458 | " \n",
459 | " \n",
460 | " \n",
461 | " | 3 | \n",
462 | " :red_heart: | \n",
463 | " 0.770384 | \n",
464 | " 0.996633 | \n",
465 | " 0.042774 | \n",
466 | " 0.225747 | \n",
467 | " 200 | \n",
468 | " joy | \n",
469 | "
\n",
470 | " \n",
471 | " | 7 | \n",
472 | " #joy | \n",
473 | " 0.832007 | \n",
474 | " 0.997057 | \n",
475 | " 0.208914 | \n",
476 | " 0.152068 | \n",
477 | " 191 | \n",
478 | " joy | \n",
479 | "
\n",
480 | " \n",
481 | "
\n",
482 | "
"
483 | ],
484 | "text/plain": [
485 | " query mean max min std count emotion\n",
486 | "3 :red_heart: 0.770384 0.996633 0.042774 0.225747 200 joy\n",
487 | "7 #joy 0.832007 0.997057 0.208914 0.152068 191 joy"
488 | ]
489 | },
490 | "metadata": {},
491 | "output_type": "display_data"
492 | },
493 | {
494 | "data": {
495 | "text/html": [
496 | "\n",
497 | "\n",
510 | "
\n",
511 | " \n",
512 | " \n",
513 | " | \n",
514 | " query | \n",
515 | " mean | \n",
516 | " max | \n",
517 | " min | \n",
518 | " std | \n",
519 | " count | \n",
520 | " emotion | \n",
521 | "
\n",
522 | " \n",
523 | " \n",
524 | " \n",
525 | " | 4 | \n",
526 | " :face_with_symbols_on_mouth: | \n",
527 | " 0.403210 | \n",
528 | " 0.997371 | \n",
529 | " 0.010545 | \n",
530 | " 0.261377 | \n",
531 | " 194 | \n",
532 | " angry | \n",
533 | "
\n",
534 | " \n",
535 | " | 5 | \n",
536 | " #pissed | \n",
537 | " 0.230712 | \n",
538 | " 0.912333 | \n",
539 | " 0.008014 | \n",
540 | " 0.180684 | \n",
541 | " 200 | \n",
542 | " angry | \n",
543 | "
\n",
544 | " \n",
545 | "
\n",
546 | "
"
547 | ],
548 | "text/plain": [
549 | " query mean max min std \\\n",
550 | "4 :face_with_symbols_on_mouth: 0.403210 0.997371 0.010545 0.261377 \n",
551 | "5 #pissed 0.230712 0.912333 0.008014 0.180684 \n",
552 | "\n",
553 | " count emotion \n",
554 | "4 194 angry \n",
555 | "5 200 angry "
556 | ]
557 | },
558 | "metadata": {},
559 | "output_type": "display_data"
560 | }
561 | ],
562 | "source": [
563 | "df = pd.DataFrame(data=query_dict)\n",
564 | "for emotion in df.emotion.unique():\n",
565 | " display(df[df.emotion == emotion])"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 11,
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "data": {
575 | "text/html": [
576 | "\n",
577 | "\n",
590 | "
\n",
591 | " \n",
592 | " \n",
593 | " | \n",
594 | " emotion | \n",
595 | " mean | \n",
596 | " max | \n",
597 | " min | \n",
598 | " std | \n",
599 | " count | \n",
600 | "
\n",
601 | " \n",
602 | " \n",
603 | " \n",
604 | " | 0 | \n",
605 | " fear | \n",
606 | " 0.317880 | \n",
607 | " 0.983642 | \n",
608 | " 0.004371 | \n",
609 | " 0.268948 | \n",
610 | " 395 | \n",
611 | "
\n",
612 | " \n",
613 | " | 1 | \n",
614 | " sadness | \n",
615 | " 0.254463 | \n",
616 | " 0.996975 | \n",
617 | " 0.002289 | \n",
618 | " 0.291740 | \n",
619 | " 397 | \n",
620 | "
\n",
621 | " \n",
622 | " | 2 | \n",
623 | " joy | \n",
624 | " 0.800486 | \n",
625 | " 0.997057 | \n",
626 | " 0.042774 | \n",
627 | " 0.195736 | \n",
628 | " 391 | \n",
629 | "
\n",
630 | " \n",
631 | " | 3 | \n",
632 | " angry | \n",
633 | " 0.315648 | \n",
634 | " 0.997371 | \n",
635 | " 0.008014 | \n",
636 | " 0.240100 | \n",
637 | " 394 | \n",
638 | "
\n",
639 | " \n",
640 | "
\n",
641 | "
"
642 | ],
643 | "text/plain": [
644 | " emotion mean max min std count\n",
645 | "0 fear 0.317880 0.983642 0.004371 0.268948 395\n",
646 | "1 sadness 0.254463 0.996975 0.002289 0.291740 397\n",
647 | "2 joy 0.800486 0.997057 0.042774 0.195736 391\n",
648 | "3 angry 0.315648 0.997371 0.008014 0.240100 394"
649 | ]
650 | },
651 | "metadata": {},
652 | "output_type": "display_data"
653 | }
654 | ],
655 | "source": [
656 | "emotion_dict = {\n",
657 | " 'emotion': [],\n",
658 | " 'mean': [],\n",
659 | " 'max': [],\n",
660 | " 'min': [],\n",
661 | " 'std': [],\n",
662 | " 'count': []\n",
663 | "}\n",
664 | "\n",
665 | "for emotion, result in data_dict.items():\n",
666 | " emotion_dict['emotion'].append(emotion)\n",
667 | " emotion_dict['mean'].append(np.mean(result))\n",
668 | " emotion_dict['max'].append(np.amax(result))\n",
669 | " emotion_dict['min'].append(np.amin(result))\n",
670 | " emotion_dict['std'].append(np.std(result))\n",
671 | " emotion_dict['count'].append(len(result))\n",
672 | " \n",
673 | "emotion_df = pd.DataFrame(data=emotion_dict)\n",
674 | "display(emotion_df)"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": null,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": []
683 | }
684 | ],
685 | "metadata": {
686 | "kernelspec": {
687 | "display_name": "Python 3",
688 | "language": "python",
689 | "name": "python3"
690 | },
691 | "language_info": {
692 | "codemirror_mode": {
693 | "name": "ipython",
694 | "version": 3
695 | },
696 | "file_extension": ".py",
697 | "mimetype": "text/x-python",
698 | "name": "python",
699 | "nbconvert_exporter": "python",
700 | "pygments_lexer": "ipython3",
701 | "version": "3.6.8"
702 | }
703 | },
704 | "nbformat": 4,
705 | "nbformat_minor": 2
706 | }
707 |
--------------------------------------------------------------------------------
/notebooks/Emotion Recognition Model Validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Emotion Recognition Model Validation\n",
8 | "\n",
9 | "The main objective of this notebook is to validate the trained model for emotion recognition"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pickle"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## Load Tokenizer\n",
41 | "\n",
42 | "Load `.pickle` file with the tokenizer"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
52 | "with tokenizer_path.open('rb') as file:\n",
53 | " tokenizer = pickle.load(file)"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Load Model\n",
61 | "\n",
62 | "Rebuild the model architecture, then load the trained emotion recognition weights from `models/emotion_recognition/model_weights.h5`."
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 4,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
72 | "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
73 | "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
74 | "from tensorflow.keras.models import Model"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
84 | "num_classes = 4\n",
85 | "embedding_dim = 500\n",
86 | "input_length = 100\n",
87 | "lstm_units = 128\n",
88 | "lstm_dropout = 0.1\n",
89 | "recurrent_dropout = 0.1\n",
90 | "spatial_dropout=0.2\n",
91 | "filters=64\n",
92 | "kernel_size=3"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stderr",
102 | "output_type": "stream",
103 | "text": [
104 | "WARNING: Logging before flag parsing goes to stderr.\n",
105 | "W0719 10:46:16.952994 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
106 | "Instructions for updating:\n",
107 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
108 | "W0719 10:46:17.039670 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
109 | "Instructions for updating:\n",
110 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
111 | "W0719 10:46:17.047888 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
112 | "Instructions for updating:\n",
113 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
114 | "W0719 10:46:17.049386 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
115 | "Instructions for updating:\n",
116 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
117 | "W0719 10:46:17.050548 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
118 | "Instructions for updating:\n",
119 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "input_layer = Input(shape=(input_length,))\n",
125 | "output_layer = Embedding(\n",
126 | " input_dim=input_dim,\n",
127 | " output_dim=embedding_dim,\n",
128 | " input_shape=(input_length,)\n",
129 | ")(input_layer)\n",
130 | "\n",
131 | "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
132 | "\n",
133 | "output_layer = Bidirectional(\n",
134 | "LSTM(lstm_units, return_sequences=True,\n",
135 | " dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
136 | ")(output_layer)\n",
137 | "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
138 | " kernel_initializer='glorot_uniform')(output_layer)\n",
139 | "\n",
140 | "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
141 | "max_pool = GlobalMaxPooling1D()(output_layer)\n",
142 | "output_layer = concatenate([avg_pool, max_pool])\n",
143 | "\n",
144 | "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
145 | "\n",
146 | "model = Model(input_layer, output_layer)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 7,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
156 | "model.load_weights(model_weights_path.as_posix())"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## Load test dataset\n",
164 | "\n",
165 | "Load the dataset that will be used to test the model"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 8,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "import pandas as pd"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 9,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "test_data_path = Path('../datasets/sentiment_analysis/test.csv').resolve()\n",
184 | "test_data = pd.read_csv(test_data_path)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 10,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/html": [
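197 | "<div>\n",
198 | "<style scoped>\n",
199 | "    .dataframe tbody tr th:only-of-type {\n",
200 | "        vertical-align: middle;\n",
201 | "    }\n",
202 | "\n",
203 | "    .dataframe tbody tr th {\n",
204 | "        vertical-align: top;\n",
205 | "    }\n",
206 | "\n",
207 | "    .dataframe thead th {\n",
208 | "        text-align: right;\n",
209 | "    }\n",
210 | "</style>\n",
211 | "<table border=\"1\" class=\"dataframe\">\n",
212 | "  <thead>\n",
213 | "    <tr style=\"text-align: right;\">\n",
214 | "      <th></th>\n",
215 | "      <th>label</th>\n",
216 | "      <th>id</th>\n",
217 | "      <th>date</th>\n",
218 | "      <th>user</th>\n",
219 | "      <th>text</th>\n",
220 | "    </tr>\n",
221 | "  </thead>\n",
222 | "  <tbody>\n",
223 | "    <tr>\n",
224 | "      <th>0</th>\n",
225 | "      <td>fear</td>\n",
226 | "      <td>1151474078131339264</td>\n",
227 | "      <td>2019-07-17 12:49:48</td>\n",
228 | "      <td>13thSnipers</td>\n",
229 | "      <td>It's so obvious Ashley Young @youngy18 is not ...</td>\n",
230 | "    </tr>\n",
231 | "    <tr>\n",
232 | "      <th>1</th>\n",
233 | "      <td>fear</td>\n",
234 | "      <td>1151474075723870208</td>\n",
235 | "      <td>2019-07-17 12:49:47</td>\n",
236 | "      <td>ShukrahFirdaus</td>\n",
237 | "      <td>Engaging in a staring competition with this wo...</td>\n",
238 | "    </tr>\n",
239 | "    <tr>\n",
240 | "      <th>2</th>\n",
241 | "      <td>fear</td>\n",
242 | "      <td>1151473913668313089</td>\n",
243 | "      <td>2019-07-17 12:49:09</td>\n",
244 | "      <td>EvinErvian</td>\n",
245 | "      <td>@savage2ooo yah me too. worst? can't stand wat...</td>\n",
246 | "    </tr>\n",
247 | "    <tr>\n",
248 | "      <th>3</th>\n",
249 | "      <td>fear</td>\n",
250 | "      <td>1151473830398976000</td>\n",
251 | "      <td>2019-07-17 12:48:49</td>\n",
252 | "      <td>oliviaakuhn</td>\n",
253 | "      <td>i was with @regiannoni EXACTLY 2 years and 3 y...</td>\n",
254 | "    </tr>\n",
255 | "    <tr>\n",
256 | "      <th>4</th>\n",
257 | "      <td>fear</td>\n",
258 | "      <td>1151473618318176257</td>\n",
259 | "      <td>2019-07-17 12:47:58</td>\n",
260 | "      <td>zaaboogie_</td>\n",
261 | "      <td>This heat different 😰</td>\n",
262 | "    </tr>\n",
263 | "  </tbody>\n",
264 | "</table>\n",
265 | "</div>"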
266 | ],
267 | "text/plain": [
268 | " label id date user \\\n",
269 | "0 fear 1151474078131339264 2019-07-17 12:49:48 13thSnipers \n",
270 | "1 fear 1151474075723870208 2019-07-17 12:49:47 ShukrahFirdaus \n",
271 | "2 fear 1151473913668313089 2019-07-17 12:49:09 EvinErvian \n",
272 | "3 fear 1151473830398976000 2019-07-17 12:48:49 oliviaakuhn \n",
273 | "4 fear 1151473618318176257 2019-07-17 12:47:58 zaaboogie_ \n",
274 | "\n",
275 | " text \n",
276 | "0 It's so obvious Ashley Young @youngy18 is not ... \n",
277 | "1 Engaging in a staring competition with this wo... \n",
278 | "2 @savage2ooo yah me too. worst? can't stand wat... \n",
279 | "3 i was with @regiannoni EXACTLY 2 years and 3 y... \n",
280 | "4 This heat different 😰 "
281 | ]
282 | },
283 | "execution_count": 10,
284 | "metadata": {},
285 | "output_type": "execute_result"
286 | }
287 | ],
288 | "source": [
289 | "test_data.head()"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "## Load Encoder\n",
297 | "\n",
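298 | "Load the pickled label encoder from `models/emotion_recognition/encoder.pickle`; it converts the emotion labels into the encoded targets used for evaluation."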
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 11,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()\n",
308 | "with encoder_path.open('rb') as file:\n",
309 | " encoder = pickle.load(file)"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "## Preprocess data\n",
317 | "\n",
318 | "Clean the text column, convert it to padded token sequences of length 100, and encode the labels."
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 12,
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "name": "stderr",
328 | "output_type": "stream",
329 | "text": [
330 | "[nltk_data] Downloading package stopwords to\n",
331 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
332 | "[nltk_data] Package stopwords is already up-to-date!\n"
333 | ]
334 | }
335 | ],
336 | "source": [
337 | "from nlp.utils import preprocess\n",
338 | "from tensorflow.keras.preprocessing.sequence import pad_sequences"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 13,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "name": "stdout",
348 | "output_type": "stream",
349 | "text": [
350 | "Time to clean up: 0.71 sec\n"
351 | ]
352 | }
353 | ],
354 | "source": [
355 | "test_data['text'] = preprocess(test_data.text)\n",
356 | "sequences = [text.split() for text in test_data.text]\n",
357 | "list_tokenized = tokenizer.texts_to_sequences(sequences)\n",
358 | "x_test = pad_sequences(list_tokenized, maxlen=100)\n",
359 | "y_test = encoder.transform(test_data.label)"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "## Results\n",
367 | "\n",
368 | "Predict the labels and generate a confusion matrix"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 14,
374 | "metadata": {},
375 | "outputs": [],
376 | "source": [
377 | "y_pred = model.predict(x_test)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 15,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "y_pred = y_pred.argmax(axis=1)\n",
387 | "y_test = y_test.argmax(axis=1)"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 16,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "import numpy as np\n",
397 | "import matplotlib.pyplot as plt\n",
398 | "from sklearn.metrics import confusion_matrix\n",
399 | "from sklearn.utils.multiclass import unique_labels"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 17,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "def plot_confusion_matrix(y_true, y_pred, classes,\n",
409 | " normalize=False,\n",
410 | " title=None,\n",
411 | " cmap=plt.cm.Blues):\n",
412 | " \"\"\"\n",
413 | " This function prints and plots the confusion matrix.\n",
414 | " Normalization can be applied by setting `normalize=True`.\n",
415 | " \"\"\"\n",
416 | " if not title:\n",
417 | " if normalize:\n",
418 | " title = 'Normalized confusion matrix'\n",
419 | " else:\n",
420 | " title = 'Confusion matrix, without normalization'\n",
421 | "\n",
422 | " # Compute confusion matrix\n",
423 | " cm = confusion_matrix(y_true, y_pred)\n",
424 | " # Only use the labels that appear in the data\n",
425 | " classes = classes[unique_labels(y_true, y_pred)]\n",
426 | " if normalize:\n",
427 | " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
428 | " print(\"Normalized confusion matrix\")\n",
429 | " else:\n",
430 | " print('Confusion matrix, without normalization')\n",
431 | "\n",
432 | " print(cm)\n",
433 | "\n",
434 | " fig, ax = plt.subplots()\n",
435 | " im = ax.imshow(cm, interpolation='nearest', cmap=cmap)\n",
436 | " ax.figure.colorbar(im, ax=ax)\n",
437 | " # We want to show all ticks...\n",
438 | " ax.set(xticks=np.arange(cm.shape[1]),\n",
439 | " yticks=np.arange(cm.shape[0]),\n",
440 | " # ... and label them with the respective list entries\n",
441 | " xticklabels=classes, yticklabels=classes,\n",
442 | " title=title,\n",
443 | " ylabel='True label',\n",
444 | " xlabel='Predicted label')\n",
445 | "\n",
446 | " # Rotate the tick labels and set their alignment.\n",
447 | " plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\",\n",
448 | " rotation_mode=\"anchor\")\n",
449 | "\n",
450 | " # Loop over data dimensions and create text annotations.\n",
451 | " fmt = '.2f' if normalize else 'd'\n",
452 | " thresh = cm.max() / 2.\n",
453 | " for i in range(cm.shape[0]):\n",
454 | " for j in range(cm.shape[1]):\n",
455 | " ax.text(j, i, format(cm[i, j], fmt),\n",
456 | " ha=\"center\", va=\"center\",\n",
457 | " color=\"white\" if cm[i, j] > thresh else \"black\")\n",
458 | " fig.tight_layout()\n",
459 | " return fig, ax"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 18,
465 | "metadata": {
466 | "scrolled": false
467 | },
468 | "outputs": [
469 | {
470 | "name": "stdout",
471 | "output_type": "stream",
472 | "text": [
473 | "Normalized confusion matrix\n",
474 | "[[0.83657588 0.07782101 0.01167315 0.07392996]\n",
475 | " [0.12653061 0.76326531 0.00408163 0.10612245]\n",
476 | " [0.06028369 0.02836879 0.90425532 0.0070922 ]\n",
477 | " [0.0929368 0.05947955 0.00371747 0.84386617]]\n"
478 | ]
479 | },
480 | {
481 | "data": {
482 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUIAAAEYCAYAAAApuP8NAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3Xd4FWX2wPHvSUKRmhBUSIJ0CAm9gwgISA3ggnRQwL4WxNXVVVdd7GLD9nN1RbHSOwjYUBExNFFpEjAISSx0BQkmnN8fMwk3/QLJvTfkfJ5nntyZee/Mmdybk/d9Z+YdUVWMMaYkC/J3AMYY42+WCI0xJZ4lQmNMiWeJ0BhT4lkiNMaUeJYIjTElniXCc5yIPCgi77ivLxKRP0QkuJD3kSgiPQpzm17s80YR+cU9nvCz2M4fIlKnMGPzFxHZLCJd/R1HcWSJ8Cy5SeBXESnvsewaEVnpx7Bypao/qWoFVU33dyxnQ0RKAc8APd3j2X+m23Lfv6vwoit8IvKmiDxcUDlVjVXVlT4I6ZxjibBwBAMTznYj4rDPpGAXAmWBzf4OJBCISIi/Yyju7I+ucEwG7hCR0NxWikhHEVkrIofdnx091q0UkUdE5EvgGFDHXfawiKx2m26LRCRcRN4VkSPuNmp5bGOKiOxx160XkUvyiKOWiKiIhIhIB3fbGdNxEUl0ywWJyN0islNE9ovITBGp4rGdMSKy2113b36/GBE5T0SedssfFpFVInKeu26A25w75B5zI4/3JYrIHSLyrfu+GSJSVkQaANvdYodE5BPP48r2e73GfV1PRD5zt7NPRGZ4lFMRqee+riwib4nIb26892X8YxKRsW7sT4nIQRH5UUT65HPciSJypxv/URF5XUQuFJEPROR3EflIRMI8ys8SkZ/dGD8XkVh3+XXAKOCfGd8Fj+3fJSLfAkfdzzSzi0JElorI0x7bny4iU/P7rEo0VbXpLCYgEegBzAUedpddA6x0X1cBDgJjgBBghDsf7q5fCfwExLrrS7nLEoC6QGVgC/CDu58Q4C3gDY8YRgPh7rp/AD8DZd11DwLvuK9rAQqEZDuGUsBnwGPu/ARgDRAFlAH+C7zvrosB/gA6u+ueAdKAHnn8fl5yjycSp+bc0X1fA+AocJm7/3+6x1za4/caD0S4v8OtwA25HUdux+Xu8xr39fvAvTj/+MsCnTzKKVDPff0WsACo6G7zB+Bqd91Y4C/gWvc4bgSSAcnne7EGp/YaCfwKbABauDF8AjzgUX68u98ywHPANx7r3sT9bmXb/jdADeA8z++i+7qau89uOIl0F1DR338vgTr5PYDiPnEqETYGDgPnkzURjgHis73nK2Cs+3olMCnb+pXAvR7zTwMfeMz39/xDySWmg0Az9/WDFJwI/w9YDAS581uB7h7rq7tJIAS4H5jusa48cIJcEqGbeP7MiCXbun8DM7OVTQK6evxeR3usfxJ4JbfjyO24yJoI3wJeBaJyiUOBejjJ7QQQ47Hueo/PcSyQ4LGunPveavl8L0Z5zM8B/s9j/hZgfh7vDXW3Xdmdf5PcE+H43L6LHvODgT3APjySv005J2saFxJV/R4nmdydbVUEsDvbst04tYQMe3LZ5C8er//MZb5CxozbhNzqNqsO4dQiq3oTt4hcD3QFRqrqSXdxTWCe22Q9hJMY03FqNxGe8arqUSCvkxVVcWo/O3NZl+X34u57D1l/Lz97vD6GxzGfpn8CAsS7TfHxecRaiqyfVfbPKTMeVT3mvswvJq8+QxEJFpHH3a6IIzgJLSOm/OT2vfG0CCfBb1fVVQWULdEsERauB3CaTp5/PMk4icXTRTi1nwxnPASQ2x/4T2AoEKaqoTg1U/HyvQ8BA1X1iMeqPUAfVQ31mMqqahKQgtMcy9hGOZxmeW72AcdxmvjZZfm9iIi4203KpWxBjro/y3ksq5bxQlV/VtVrVTUC
p5b3cka/YLZY/yLrZ5X9cyoqI4GBOC2Lyjg1XDj1Geb1/Sjoe/MIzj+x6iIy4ixjPKdZIixEqpoAzABu9Vi8FGggIiPdDu1hOP1siwtptxVx+uh+A0JE5H6gUkFvEpEawEzgSlX9IdvqV4BHRKSmW/Z8ERnorpsNxIlIJxEpDUwij++RW8ubCjwjIhFuzaeDiJRx991PRLqLcznMP4BUYPVpHb2zn99wEtZodx/j8Ui+IjJERKLc2YM4CeRktm2kuzE9IiIV3WO/HXjndOM5AxVxjn0/TjJ/NNv6X4DTutZRRDoD44ArgauAF0QkMv93lVyWCAvfJJx+MwDUucYtDucPfT9O7S1OVfcV0v6WA8twOvZ349TACmoyAXTHaerOllNnjjMuR5kCLARWiMjvOJ3+7dzj2QzcBLyHUzs8COzNZz93AN8Ba4EDwBM4fZHbcU7yvIBTG+sP9FfVE14ed3bXAnfi/I5jyZpQ2wBfi8gf7nFN0NyvHbwFp3a5C1jlHqMvzrS+hfPZJeGcGFuTbf3rQIzbVTG/oI2JSCV3mzerapKqfuFu4w235m2yEbdT1RhjSiyrERpjSjxLhMaYEs8SoTGmxLNEaIwp8exm7VxI6fIqZcMKLhigmtS90N8hnLXgoOJ9cjP9ZPE+Cbn3p93s37+v0D6E4Eo1VdP+LLCc/vnbclXtXVj79ZYlwlxI2TDKtL3F32GcsWVzJvo7hLNW+bxS/g7hrPx+PM3fIZyVnl3aF+r2NO1PyjQcWmC549+85NUdUYXNEqExpuiJQFChjgdcqCwRGmN8I4CH2rREaIzxjQC+qcUSoTHGB6xpbIwp6QRrGhtjSjqxprExxljT2BhTwok1jY0xJZxgTWNjTEknEBS46SZwIzPGnFsC+P5xS4TGmKJnl88YYwzWR2iMKenszhJjjLGmsTGmhBO7s8QYY6xpbIwp6QL7zpLAjayYuqxNbTa9cQ3fT7uOO4a3y7G+xgUVWfbUcL56ZSzxr46jV9s6Odb/tmgitw1p66uQs/j0o+V0at2Yji0a8cKzk3OsT01N5fpxo+jYohH9undiz+5EAP766y8m3HA13Tq2pHPbprzwzJM+jtyxYvkymjeOpkmj+jw1+fEc61NTU7ly1HCaNKpPl07t2Z2YCMD+/fvp07MbF1SpyO0TbvZx1Fl98tFyLm4VS/vmjXL9PaampnLd2JG0b96IPt0u5if3M5gz8z26d2qdOVUPLcP3337j4+jzkdE8zm/yajPSW0S2i0iCiNydy/qLRORTEdkoIt+KSN+CtmmJsBAFBQnP3XIZA++ZRYur/8eQS2OIvig8S5m7RnVkzmfb6HDDm1z58EKm3Nozy/onbujOivhdvgw7U3p6OvfcMYF3Zy9k5debWDB7Bj9s25qlzPtvv0FoaCirN27l2r/fysMP3gvAovlzSD2RyierN7Bs5RrefuN/mUnSl/HfPuFm5i1cyvpNm5k1Yzpbt27JUmbaG68TGhrKd1t3cPOtt/Hve52/o7Jly/LvBybx6OM5k78vpaen869/TOC92Yv4PH4T8+bMYPu2rMfw3ltvEBoaxppvtnL932/l4QfuAWDw0JF8vGodH69ax4v/fYOLatamcdPm/jiMnMS9s6SgqcDNSDDwEtAHiAFGiEhMtmL3ATNVtQUwHHi5oO2e04lQRHza9G/TsDo7kw+RmHKYv9JOMmvlVuIurp+ljCpUKlcGgMrly5Cy/4/Mdf071ifx50Ns2b3Pl2Fn2rh+LbXq1KVmrTqULl2agYOHsnzpoixlli9dxJARYwCIGziIVZ99iqoiIhw7epS0tDSOH/+T0qVLUaFSJZ/Gv25tPHXq1qN2HSf+K4YOY/GiBVnKLF60kFFjrgLgb4OuYOWnH6OqlC9fno4Xd6JM2bI+jTm7jevXUrtOXWrWdo7h8kFDWb4k52cwdKT7GVw+OPMz8DRv9gwuHzzEZ3F7pXBqhG2BBFXdpaongOnAwGxlFMj48lUG
kgvaaEAlQhGZLyLrRWSziFznLvtDRB4RkU0iskZELnSX13XnvxORh0XkD3d5VxH5QkQWAltEZJKI3Oaxj0dEZEJRxB9RtSJ7fz2SOZ/02+9EhlfIUuaRt1YxvEcsCe//nXmPDuH2Fz8EoHzZUvxjeDseeevLogjNKz+nJBMRWSNzvnpEJCkpSbmUiQIgJCSESpUqceDAfuIGDqJc+fI0b1iTNo3rccMtEwkLq+LT+JOTk4iqEZU5HxkZRUpSUs4yUTU84q/M/v37fRpnflKSkzJ/vwDVIyNJScn6d5ySkpTlM6hYqTIHDmQ9hgVzZ3P5FcOKPuDTIUEFT1BVRNZ5TNdl20oksMdjfq+7zNODwGgR2QssBQp8JGVAJUJgvKq2AloDt4pIOFAeWKOqzYDPgWvdslOAKaraBOeX4aklMEFVGwBTgSsBRCQIp6r8TpEfSR6GXhrDO8u/o96Il/nbPbN4/e44ROC+Kzvxwpx1HD3+l79COysb168lODiYjdsS+XrTdl558Tl2J/qniV/SbVgXz3nlzqNRTGN/h3JKxlPsCppgn6q29phePYO9jQDeVNUooC/wtvu3n6dAS4S3isgmYA1QA6gPnAAWu+vXA7Xc1x2AWe7r97JtJ15VfwRQ1URgv4i0AHoCG1U1RxVARK7L+C+kfx09o+CT9/1O1AWnmoOR51ckyaPpC3BVn6bM+WwbAF9vTaZs6RCqVi5Hm0bVeeTarmx75wZuHtSaO0e054aBLc8ojjNVrXoEyUmn/tmmJCdRvXpkLmWc/ztpaWkcOXKEKlXCmTd7Opd270mpUqWoev4FtGnXkU0bN/g0/oiISPbuOfU/MSlpL9UjI3OW2bvHI/7DhIdn7cf1p+oRkZm/X4CUpCSqV4/IWqZ6ZJbP4Pcjh6lS5dQxzJ8zk78NDrDaIBRW0zgJJzdkiHKXeboamAmgql8BZYF8n5ccMIlQRLoCPYAObu1vI84B/KWnOkDS8e6Sn+yZ7H/AWGAcTg0xB1V9NeO/kJQqf/oHAKzbnkK9yDBqVqtMqZAghnRtxJLVCVnK7Pn1CF1b1ASg4UXhlC0VzG+HjtFj4ntEj36F6NGv8OLcdUx+fw2vLPBtImnesjU/7kzgp8QfOXHiBAvmzKRnn7gsZXr2iWPW+28DsHjBXDp17oqIEBl1Eas+XwnAsaNH2bDua+rVb+jT+Fu1bsPOhB0k/ujEP3vmDPrFDchSpl9cf959exoA8+bOpkvXbkgAXejbvGVrdu1MYLf7GcyfO5OefbN9Bn3jmPme+xnMn8PF7mcAcPLkSRbOm83lgwt+mLqviUiBkxfWAvVFpLaIlMZp4S3MVuYnoLu7z0Y4eeS3/DYaSNcRVgYOquoxEYkG2hdQfg0wGJiB88vIzzxgElAKGHm2geYl/aQy8YUPWfT4UIKDhGnLvmPr7n38+6pObPjhZ5Z8lcDdr3zCy7f35pbBbVBVrp28tKjCOW0hISE8Mvk5Rg6OIz09neGjx9KwUQxPPvIfmrVoSa++/RkxZhy3Xj+Oji0aERpWhf+b6vxBjrvmBibedC1d2zdHVRk26kpiGjfxefxPP/cCA+N6k56ezpVjxxETE8tD/7mfli1b06//AK4adzXXjLuSJo3qE1alCtPefj/z/Y0a1Ob3I0c4ceIEixYtYOGS5TRqlP2EZNEfw6NPPceIQf1ITz/JiNFXEd0oliceeZDmLVrRq29/Ro4Zx83XjaV980aEhoXx36mnenq++vILIiKjqFm7Tj578T0RkEIYhktV00TkZmA5EAxMVdXNIjIJWKeqC4F/AK+JyEScEydjNfvZpOzxFbDeZ0SkDDAfp+m7HQjF6fRcrKoV3DJXAHGqOlZE6uP09Z0HLANGqWqkW7O8Q1Xjsm3/FeCQqua47ii7oEpRWqZtgf2rAWvXnIn+DuGsVT6vlL9DOCu/H0/zdwhnpWeX9mzauL7QqsrBVWrreT0eKLDc0Vnj1qtq68Lar7cCpkaoqqk4
1wZlV8GjzGxgtjubBLRXVRWR4UBDt8xKYKXnBtyO0vZAgF1PYEzJEUhdENkFTCI8A62AF8X57R4CxudWyL3YcjEwT1V3+DA+Y4yHoKCAOSWRQ7FNhKr6BdDMi3JbgMDqMDGmpBF3ClDFNhEaY4oPweuzwn5hidAY4xPWNDbGlHhWIzTGlGzWR2iMKekEsaaxMcZY09gYYwI3D1oiNMb4gNhZY2OMsaaxMaZkswuqjTGmkIbhKiqWCI0xPmE1QmNMiWeJ0BhT4lnT2BhTop3GM0n8whKhMcYnLBEWM7F1LmD+e7f6O4wzFn3jDH+HcNZ+eWuMv0M4K8f/Svd3CGelKB5lZInQGFPiWR+hMaZkE6sRGmNKOMF5tnGgskRojPEBIciaxsaYks6axsaYkk2saWyMKeEECA4O3ExoidAY4xPWNDbGlGzWNDbGlHT2FDtjjMFqhMYYY32ExpiSTQS7oNoYYwK4Qkjg9l4aY84pGYOz5jd5uZ3eIrJdRBJE5O48ygwVkS0isllE3itom1YjNMYUvUJqGotIMPAScBmwF1grIgtVdYtHmfrAv4CLVfWgiFxQ0HatRmiMKXIZo88UNHmhLZCgqrtU9QQwHRiYrcy1wEuqehBAVX8taKOWCI0xPlBws9htGlcVkXUe03XZNhQJ7PGY3+su89QAaCAiX4rIGhHpXVB01jQ2xviEl03jfara+ix3FQLUB7oCUcDnItJEVQ/lGdtZ7tAYYwrmRbPYy6ZxElDDYz7KXeZpL7BQVf9S1R+BH3ASY54sERayzz5ZwWUdm9GtXWNeef6pHOvjv1rFgB4daBhRkQ8WzctcnrTnJwb06ED/bu3o3bkV7017zZdhZ+reNIJ1Tw1g4zMDmdg/Nsf6R0e35otH+/HFo/1Y//RAdr82LHNdVHg55t3dnfjJA/j6yf5cVLW8L0PPtGL5MprGNiQ2uh6Tn3w8x/rU1FRGjxxGbHQ9LunYjt2JiZnrJj/xGLHR9Wga25APVyz3YdSnrPx4Bd3aNaVLm1henjI5x/qvV6+i36UdqHthBZYunJtl3ZVDB9CkTjXGjxjkq3C94vQRFspZ47VAfRGpLSKlgeHAwmxl5uPUBhGRqjhN5V35bbRYNY1F5FbgRmCDqo7ydzzZpaen8+DdE5k2czHVIiIZ1OsSuvfqR/2GjTLLRETW4Mkpr/K//5uS5b3nX1iNWUtWUqZMGY4e/YO+XVrTvVc/LqwW4bP4g0R4elxbLn/sI5L2H+PTh/uwdMNeticdzixzzzvrMl9f17MhTWtVyZx/5caLeXr+93z6fQrly4RwsigehVaA9PR0brv1JpZ88CGRUVF0at+GuLgBNIqJySzz5tTXCQsNY/O2BGbOmM6999zFO+/NYOuWLcyaMZ0NmzaTkpxM3949+G7LDwQHB/s0/vvvuo13Zi+hWkQkAy7rxGW947J+h6Jq8NSLr/LaS8/leP/1N0/kz2PHeG/a6z6L2VuFcdZYVdNE5GZgORAMTFXVzSIyCVinqgvddT1FZAuQDtypqvvzje2sI/OtvwOXnU0SFJEiS/6bNqyjZu26XFSrNqVLl6bf5Vfw0bLFWcpEXVST6NgmOW5AL126NGXKlAHgRGoqJ0+eLKow89SqXji7fvmdxF//4K/0k8z9ajf9WtXIs/wVHWsxZ3UiAA0jKxMSHMSn36cAcDQ1jT9P+P6Rlmvj46lbtx6169ShdOnSDBk2nMWLFmQps3jRAkaNuQqAQYOvYOUnH6OqLF60gCHDhlOmTBlq1a5N3br1WBsf79P4v9mwNst3qP/fhrDig6zfoRoX1aRRbBMkl0EMLu58KeUrVPRVuKelsK4jVNWlqtpAVeuq6iPusvvdJIg6blfVGFVtoqrTC9pmsUmEIvIKUAf4QETuFZGpIhIvIhtFZKBbppaIfCEiG9ypo7u8q7t8IbAln92clV9+TqZ6xKkTWNUiIvnl52Sv35+ctJd+XdtyScsGXHfz7T6tDQJEhJUj
af/RzPmkA0epXuW8XMvWqFqemudX4LPNPwNQr3olDh89wTu3deGLR/vx0MiWBPnhVoLk5CSiok4l78jIKJKSknKWqeGUCQkJoVLlyuzfv5+kpJzvTU7O3v1UtH5JSSYiIipzvnpEJL+k+DaGIlF4fYRFotgkQlW9AUgGLgXKA5+oalt3frKIlAd+xakxtgSGAc97bKIlMEFVG+S2fRG5LuOU/YH9+4ryUPIUERnFkpXxfLzmO+bNeJd9v/7ilzi8MbhDLRbE/5TZ/A0JEjpEX8B9762n631LqXVBBUZ1qevnKE2gEPfhTQVN/pJnIhSRSvlNvgwyFz2Bu0XkG2AlUBa4CCgFvCYi3wGzgBiP98S7Z5BypaqvqmprVW1dJbzqGQV1YbUIUjxqED8nJ51Rre7CahE0iI5h7derzyiOM5V88BiR4adOcERWKU/KgT9zLTu4Qy1mu81igKQDx/hu90ESf/2D9JPK4nV7aObRf+grERGR7N176jKzpKS9REZG5iyzxymTlpbGkcOHCQ8PJzIy53sjIrJfola0LqweQXLy3sz5lOQkLqzu2xiKSpBIgZPfYstn3Wbge/fn5mzz3xd9aPkSYLCqNneni1R1KzAR+AVoBrQGSnu852gu2ylUTVu0YveuBPbsTuTEiRMsmT+b7r36efXelOS9HP/TSTqHDx1kXfxX1Kmb7xn/Qrdh537qVqtIzfMrUCo4iEEdarJ0/Z4c5epHVKJy+dLE7/gty3srlytFeEWnn7NzbDW2JeV52VaRad2mDQkJO0j88UdOnDjBrBnT6Rc3IEuZfnEDePftaQDMnTObLpd2Q0ToFzeAWTOmk5qaSuKPP5KQsIM2bdv6NP5mLVqT6PEdWjRvFpf19u47FOgCuWmc54kDVc27l9z/lgO3iMgtqqoi0kJVNwKVgb2qelJErsI5q+QzISEhPPDYM4wbPoD09HSGjLiSBtExPPfEJBo3a0mP3nF8u3EdN44bzpFDh/hkxVKmTH6YZZ+vZ+eO7Tz2wL8QEVSVa26cQMOYxr4Mn/STyh1vxjP37u4EBwnvrExgW9Jh7rmiGRt37eeDDU5NZXCHWsz9KjHLe0+q8u93N7Dw3ssQ4JsfDzDtkwSfxg/OZ/DslBfp368X6enpXDV2PDGxsUx68H5atmpNXP8BjB1/NePHjiE2uh5hYVV4+12nLz0mNpbBQ4bSomkMISEhPPf8Sz49Y5wR/6THn+XKIf1JP5nO0JFX0SA6hmcem0ST5i25rE8cmzas4/qrhnH48CE+Xr6UZ594mA+/3ADAkLju7NzxA0eP/kH7JnV5YsordOl2mU+PITciEBzAw3CJenGJg4gMB+qo6qMiEgVcqKrrizy6nHEk4tT0jgLPAR1xarU/qmqce7P1HECBZcBNqlpBRLoCd6hqnDf7adK8pc5f8WURHIFvtLxttr9DOGu/vDXG3yGclV8OH/d3CGelf/eL+fab9YWWuSrXbKQX/2tageU+uLHd+kK4s+S0FXgpiYi8iNP31hl4FDgGvAK0KdrQclLVWh6z1+eyfgfQ1GPRXe7ylTh9icYYPwnk8Qi9uaauo6q2FJGNAKp6wL2i2xhjvCI4Z44DlTeJ8C8RCcJpbiIi4YDvr/Y1xhRfIgHdR+jNdYQv4fS7nS8i/wFWAU8UaVTGmHNOsTxrnEFV3xKR9UAPd9EQVfX35TPGmGJEwK/XCRbE2/tug4G/cJrHxeZuFGNM4Ajkp9gVmNRE5F7gfSACZ+yv90TkX0UdmDHm3OFNszigm8bAlUALVT0GICKPABuBx4oyMGPMuaW4N41TspULcZcZY4zXimUiFJFncfoEDwCbRWS5O98TZ5RYY4zxinOyxN9R5C2/GmHGmeHNwBKP5WuKLhxjzDnpNAZe9Yf8Bl0IvLG+jTHFViCfNfbmXuO6wCM4Y/uVzVie1wCnxhiTXaA3jb25JvBN4A2cY+kDzARmFGFMxphzUGE9s6QoeJMIy6nqcgBV3amq
9+EkRGOM8YoIBIsUOPmLN5fPpLqDLuwUkRtwHqYcmI/JMsYErAA+V+JVIpyI87CkW3H6CisD44syKGPMuadYnjXOoKpfuy9/B4r3sMHGGL8QAnsYrvwuqJ6HOwZhblR1UJFEZIw59/j5XuKC5FcjfNFnUQSY4CCh0nneDswTePZMHeXvEM5aWJub/R3CWTm4tnj/+ZQKLvysVSybxqr6sS8DMcacuwT8ela4IMW32mOMKVYCuIvQEqExxjfOiUQoImVUNbUogzHGnJsC/QHv3oxQ3VZEvgN2uPPNROSFIo/MGHNOCeQRqr25xe55IA7YD6Cqm4BLizIoY8y5JePhTQVN/uJN0zhIVXdnO/WdXkTxGGPOUUVwRU6h8SYR7hGRtoCKSDBwC/BD0YZljDmXiJ9rfAXxpml8I3A7cBHwC9DeXWaMMV4rrD5CEektIttFJEFE7s6n3GARURFpXdA2vbnX+FdguHchGmNMTgKEFMJZY7dV+hJwGbAXWCsiC1V1S7ZyFYEJwNc5t5KTNyNUv0Yu9xyr6nXe7MAYY6DQzgq3BRJUdZezTZkODAS2ZCv3EPAEcKc3G/WmafwR8LE7fQlcANj1hMYY74lzQXVBE1BVRNZ5TNkrXJHAHo/5ve6yU7sSaQnUUFXPh87ly5umcZZh+UXkbWCVtzswxhhwhuLywj5VLbBPL899OINIPwOMPZ33ncktdrWBC8/gfcaYEsrpIyyUTSUBNTzmo9xlGSoCjYGV7iV/1YCFIjJAVdfltVFv+ggPcqqPMAjnge95nqkxxpjcFNIwXGuB+iJSGycBDgdGZqxU1cNAVY99rgTuyC8JQgGJUJzIm3Eq455U1TwHazXGmNwU1uM8VTVNRG4GlgPBwFRV3Swik4B1qrrwTLabbyJUVRWRpara+Ew2bowxABTioAuquhRYmm3Z/XmU7erNNr1ptX8jIi282ZgxxuQmo0boxVljv8jvmSUhqpoGtMC5aHEncBTnmFRVW/ooRmPMOSCA77DLt0YY7/4cADQE+gJDgCvcnyYXn3y4nA4tY2nbrBHPP/NkjvWpqalcO3YkbZs1ovelF/PT7sTMdZu//5Y+3S/hkrbN6NK+BcePH/dh5I6PViyjVdNGNI9twDOTn8jghDvdAAAgAElEQVSxPjU1lbGjh9M8tgHdLunAbjf+9Wvj6dSuJZ3ateTiti1YtGCejyN3XNaxEZvm/ZvvFzzAHeMuy7H+ouphLH3lFuJn/Ivlr00g8oLQzHWj+rfjuwX3892C+xnVv50vw85ixfJlNI1tSGx0PSY/+XiO9ampqYweOYzY6Hpc0rEduxMTAdi/fz+9elxK1dAK3HZrYD3zRSj44e7+HMo/v0QoAKq6M7fJR/F5RURW+zsGgPT0dO76xwTen7OIVWs3MXf2DLZvy3rB+7tvvUHl0DDiN23l+ptu5aEH7gEgLS2Nv187lsnPvcgX8ZuYt+QjSpUq5fP4/3HbLcxesIT4jd8zZ9Z0tm3NGv9bb04lNCyMbzb/wN9vmcAD9zoXEDSKbczKL+NZ9fUG5ixYym233EhaWppP4w8KEp67eygDb36ZFoMfZkjvVkTXqZalzGMT/8a7S+JpO+wxHn31AybdMgCAsErluPe6PnQe8xSXjJ7Mvdf1IbTieT6NH5zP4LZbb2LBog/Y+O0WZk1/n61bsn4Gb059nbDQMDZvS+CWCRO59567AChbtiz3P/gQjz3xlM/jLpD3F1T7RX6J8HwRuT2vyWcRekFVO/o7BoAN69ZSu05datWuQ+nSpfnb4KEsW7IoS5llSxYxbITzeOj+lw/mi5Wfoqqs/PhDYmKb0LhJMwCqhIcTHBzs0/jXr42nTt261HbjHzRkGEsWZz0Jt3TxAkaOuhKAywddwWcrP0FVKVeuHCEhTk/L8dTjfnliWZvGtdi5Zx+JSfv5Ky2dWcs3ENe1aZYy0XWq81n8dgA+W/sDcV2bAE5N8uM12zh45BiHfv+Tj9dso+fF
MT4/hrXx8dStW4/adZzPYMiw4SxetCBLmcWLFjBqzFUADBp8BSs/+RhVpXz58lzcqRNly5b1edzeCOTxCPNLhMFABZwLFHObAoaI/CGOySLyvYh8JyLD3HVvicjlHmXfFZGBRRHHzylJREZFZc5Xj4gkJTk5zzIhISFUrFSZAwf2szNhByLC0Mv70f2StrzwnO//qycnJxEZdepa1cjISFKSkrKUSUlOziwTEhJCpUqVObB/PwDr4r+mXcsmdGzdjGeffzkzMfpKxAWV2fvLwcz5pF8OEnl+5SxlvvshiYHdmgMwsFszKlU4jyqVyxNxfmjW9/56iIjzQ/G15OQkorJ8BlEkZfsMkpOTiKrh8RlUrsx+9zMIVIJz1rigyV/y+6amqOokn0Vy9gYBzXGue6yKc4Lnc+B1YCIwX0QqAx2Bq7K/2b2n8TqAqBoX+SrmTGnpacSvWc3ylas577xyDO7fi2bNW9K5azefx3KmWrdtx9cbvmP7tq3ccM04LuvVJ+BqJ/96dh7P3jWE0QPa8eWGBJJ+OUh6+kl/h1UiFNeTJQEcdq46Ae+rarqq/gJ8BrRR1c9wrkQ/HxgBzHHPhmehqq+qamtVbR1etWr21V6pVj2SpL17M+dTkpOoHhGRZ5m0tDR+P3KYKlXCiYiIpH3HToSHV6VcuXL06NmbbzdtPKM4zlRERCRJe0/dz56UlET1yCz3s1M9IiKzTFpaGkeOHKZKeHiWMg2jG1G+QgW2bP6+6IP2kPzrYaIuDMucj7wwjKTfDmcpk/LbYYbf8T86jHiCB150ui0O//Enyb8dyvreC0JJ/u2QbwL3EBERyd4sn8FeIrN9BhERkezd4/EZHD5MeLbPINAITrIpaPKX/Pbd3WdRFL23gNHAOGBqUe2kRavW7NqVwO7EHzlx4gTz5sykV9+4LGV69Y1jxvtvA7Bo/hw6demKiHBp955s3fI9x44dIy0tjdVffkHDho2KKtRctWzdhp0JCSS68c+dNYO+/fpnKdO33wDee/ctAObPnU3nLpciIiQm/ph5cuSn3bvZsX0bNWvW8mn86zbvpt5F51MzIpxSIcEM6dWSJSu/zVImPLR8Zv/lneN7MW3BGgA+XL2VHh2iCa14HqEVz6NHh2g+XL3Vp/EDtG7ThoSEHST+6HwGs2ZMp1/cgCxl+sUN4N23pwEwd85sulzazS99sqdFAruPMM+msaoe8GUgheAL4HoRmQZUATpzaiyyN3EuB/o5+wCOhSkkJITHJz/HsL/1Iz39JCPHXEV0o1gef/hBmrdsRe++/Rl15Thuum4sbZs1IiwsjP++8Q4AoWFh3HDTBHp17YCI0L1nby7r3beoQs0z/qeefZ5B/fuQnp7O6KvG0SgmlkcmPUCLlq3oGzeAMWPHc934K2ke24CwsCpMffs9ANasXsWzTz1JqVKlkKAgnp7yImdasz5T6eknmfjETBa9fBPBQcK0BWvYuutn/n1jPzZs+Ykln31H59b1mXTLAFRh1YYEbntsJgAHjxzjsdeWseqdfwLw6KvLOHjkmE/jB+czeHbKi/Tv14v09HSuGjuemNhYJj14Py1btSau/wDGjr+a8WPHEBtdj7CwKrz97vTM9zesV4vfjxzhxIkTLFo4n8VLV9AoxvcnfbLLeHhToJJz4dZhEfkdqAQ8CfTBGSTiYc8hxERkGTBfVV8paHvNW7bSDz9bU1ThFrkypXx7trkoXNjhVn+HcFYOrn3R3yGclYvbtWb9+nWFlrnqxDTVh95eWmC50a1rrD+bYbjOlG9P6xUBEQkHDriDQdxJLiPSikg5oD7wvo/DM8YAIAQV5we8BzIRiQC+AvK81kREegBbgRfcIXqMMT4W6CdLinWNUFWTgQYFlPkIqOmbiIwxeQnkEzrFOhEaY4oJCeyTJZYIjTFFLqNpHKgsERpjfMKaxsaYEi+ATxpbIjTGFD2naRy4mdASoTHGJwK4ZWyJ0BjjC/69l7gglgiNMUXOmsbG
GCPWNDbGGGsaG2NKtoznGgcqS4TGGJ8Q6yM0xpR01jQ2xpRo1jQ2xhjEmsbGmBLOLp8pfk4qpKYV32fdBvIoH94q7s/8COv5qL9DOCupO1IKdXsCBAfw99ISoTHGJwI3DVoiNMb4SgBnQkuExhifCOTLZwJ59GxjzDlEvJi82o5IbxHZLiIJInJ3LutvF5EtIvKtiHwsIgU+vM0SoTHGNwohE4pIMPAS0AeIAUaISEy2YhuB1qraFJgNPFnQdi0RGmOKnLhPsSto8kJbIEFVd6nqCWA6MNCzgKp+qqrH3Nk1QFRBG7VEaIzxCS8rhFVFZJ3HdF22zUQCezzm97rL8nI18EFBsdnJEmOMb3jXCbhPVVsXyu5ERgOtgS4FlbVEaIzxgUIbqj8JqOExH+Uuy7o3kR7AvUAXVU0taKPWNDbGFDlvmsVepsm1QH0RqS0ipYHhwMIs+xJpAfwXGKCqv3qzUUuExhjfKIRMqKppwM3AcmArMFNVN4vIJBEZ4BabDFQAZonINyKyMI/NZbKmsTHGJwrrgmpVXQoszbbsfo/XPU53m5YIjTE+Ebj3lVgiNMb4wuncOuIHlgiNMUXOGaE6cDOhJUJjjE8Ebhq0RGiM8ZUAzoSWCI0xPmFNY2NMiRe4adASoTHGVwI4E9qdJYXs049W0KVtEzq1iuGl5ybnWJ+amsqN40fTqVUM/Xtcwp6fEgE4ceIEt990LT0ubkXPS9rw1arPfBy54+MPl9O+RSxtmkUz5emcw7ilpqZyzVUjadMsml6XduSn3YmZ6zZ//y19unWiU5tmdG7XnOPHj/sw8lNWLF9G09iGxEbXY/KTj+dYn5qayuiRw4iNrsclHduxOzExc93kJx4jNroeTWMb8uGK5T6M+pTL2tRh07Tr+f7tG7hjRIcc62tcUIllT4/iq/+OJ/61a+jVrm6O9b8tuYPbhrbzVcgFKsRhuIpEwCVCEaklIt/7O44zkZ6ezn3/nMBbMxfwyVffsGDOTH7YtjVLmenvvEloaCir1m/hmhtv4dEH7wPgvbemAvDRl+t5b+4SHvr33Zw86dsn6aWnp3P3P25l+txFfLn2W+bNns72bVuylHn3ramEhoaydtM2brhpApPuvweAtLQ0/n7NVUye8hKr1m5i/tKPKVWqlE/jzziG2269iQWLPmDjt1uYNf19tm7JegxvTn2dsNAwNm9L4JYJE7n3nrsA2LplC7NmTGfDps0sXLyMCbf8nfT0dJ/GHxQkPDehFwPvnkGLca8ypFsM0TWrZilz1+iLmfPZVjpcP5UrH57PlAm9sqx/4sYerIjf6cuwvVJYI1QXhYBLhMXZN+vXUqt2XWrWqkPp0qUZMGgIKz5YlKXMiqWLuGL4aAD6DRzEl59/iqqyY/tWLu7cFYCq519ApcqV2bRxvU/j37Aunlp16lKrthP/5YOH8cHirPF/sGQRw0aOAaD/5YP5YuUnqCqffvwhMY2b0LhJMwCqhIcTHBzs0/gB1sbHU7duPWrXcY5hyLDhLF60IEuZxYsWMGrMVQAMGnwFKz/5GFVl8aIFDBk2nDJlylCrdm3q1q3H2vh4n8bfJjqCnUkHSUw5xF9pJ5n1yRbiOtbPUkYVKpUrDUDl8mVI2f9H5rr+Fzcg8edDbEnc59O4vRLAmbDIEqGIlBeRJSKySUS+F5FhInK/iKx1518V9wG8ItLKLbcJuMljG2NFZK6ILBORHSLypMe6niLylYhsEJFZIlLBXf64x/MKnnKXDXH3uUlEPi+qY/45JZmIyFOD4VaPiOTnlOQ8y4SEhFCxUiUOHthPTGwTPvxgCWlpafy0+0e++2YjKUl7iyrUXKWkJBPpEX9EZCQpKVlHOPo5OZnIqBqZ8VeqXJkD+/ezM+EHRIQhl/elW6c2vPDsUz6NPUNychJRUadGaYqMjCIpKSlnmRpZj2H//v0kJeV8b3JyjhGeilRE1Yrs/fVI5nzSvt+JPL9iljKPTPuc4T0akzDjZuY9
NpTbn18BQPmypfjH8PY8Mu0Ln8bsnYKbxf5sGhflyZLeQLKq9gMQkcrAh6o6yZ1/G4gDFgFvADer6ucikr1jrTnQAkgFtovIC8CfwH1AD1U9KiJ3AbeLyEvA34BoVVURCXW3cT/QS1WTPJZl4Y6Eex2Q+YfuS8NGj2XHD9vp160jkTUuolXb9gT5oUZ1ptLT0vn6q9WsWPkV55Urx+C4njRr0ZLOXbv5O7RzztBusbyz/FumzIqnXUwkr/9rAK2ufpX7xl7CC7PXcvT4X/4OMQd/N30LUpSJ8DvgaRF5Alisql+IyGAR+SdQDqgCbBaRL4BQVc2oqb2N82CWDB+r6mEAEdkC1ARCcR7c8qVbqSwNfAUcBo4Dr4vIYmCxu40vgTdFZCYwN7dgVfVV4FWApi1a6ZkccLXqESR71OJSkpOoVj0i1zLVI6NIS0vj9yNHCKsSjojw4KOn/gdc3qsrdepmbRIVterVI0jyiD85KYnq1bOOgl4tIoKkvXuIcOM/cvgwVcLDiYiMpH3HToRXdfqzevTqw7ffbPR5IoyIiGTv3lMjuScl7SUyMjJnmT17iIo6dQzh4eFERuZ8b0REfqPAF77kfb8TdUGlzPnIqhVJ+u33LGWu6tuMgXdNB+DrLUmULR1M1crlaBMdyd86R/PI9ZdSuUJZTp5Ujp9I45X5vu1iyVMAZ8Iiaxqr6g9AS5yE+LCI3A+8DFyhqk2A14CyXmzKc3TZdJzkLTi1y+buFKOqV7tjlbXFeXJVHLDMjeUGnBpkDWC9iIQXykFm06xlaxJ3JfDT7h85ceIEC+fO4rLecVnKXNYnjtnT3wFgyYK5XHxJV0SEP48d49jRowB8/ulHBIcE0yC6UVGEmacWrdrw484Edic68c+fM4Pe/bLG37tvHDPeexuARfPn0KnLpYgIl3bvydYt33Ps2DHS0tJYvepzn8cP0LpNGxISdpD4o3MMs2ZMp1/cgCxl+sUN4N23pwEwd85sulzaDRGhX9wAZs2YTmpqKok//khCwg7atG3r0/jXbUumXmQYNatVplRIEEO6xbDkqx1Zyuz55QhdW9YCoOFF4ZQtHcJvh47R47a3iR75MtEjX+bFOWuZ/N7qwEmCBPZZ4yKrEYpIBHBAVd8RkUPANe6qfW5/3hXAbFU9JCKHRKSTqq4CRnmx+TXASyJST1UTRKQ8zgNckoFyqrpURL4Edrmx1FXVr4GvRaQPTkLcX6gHjNPf9NCTzzH6iv6kp6czbNRVNGwUw1OP/oemLVrRs08cw0eP5bYbxtOpVQyhYVV46X9vOb+Ufb8y+or+BEkQ1SIimPLK1MIOz6v4H3tqCkMv78fJk+mMGDOW6EaxPP7wgzRv0Yre/foz6srx/P3asbRpFk1YWBivvvEuAKFhYdx482307NIBEaFHz9707N3XL8fw7JQX6d+vF+np6Vw1djwxsbFMevB+WrZqTVz/AYwdfzXjx44hNroeYWFVePtdp3YVExvL4CFDadE0hpCQEJ57/iWfn/BJP6lMfGEFi54YTnBwENM+2MTWxH38e2xnNvyQwpLVO7j7lY95+R99uOWKtqjCtU8uLnjDASCAK4SI6hm1AgvesEgvnJFiTwJ/ATcClwMjgJ+BH4DdqvqgiLQCpgIKrAD6qmpjERmL83zSm91tLgaeUtWVItINeAIo4+7yPpxhvBfg1DTFLTtNROYC9d1lHwO3aT4H3rRFK136yerC+2X4WNlSxadvMS8Vyhbva/3Dej7q7xDOSuq6lzh5JKnQcpe3f1M1qpRdX1gPbzodRfZtU9XlOMNpe1qHk7Cyl10PNPNY9E93+ZvAmx7l4jxefwK0yWXXOdoyqjrI+8iNMUUjcOuExfvfrjGmWHDGI/R3FHmzRGiM8YkAHnzGEqExxjfEmsbGmJLOaoTGmBJNxBKhMcZY09gYY6xGaIwp8SwRGmNKOLGmsTGmZBOsRmiMMZYIjTHGmsbGmBLNeYqdv6PImyVCY4xvWCI0
xpR01jQ2xpR41jQ2xhhLhMaYki6Qm8ZF9syS4kxEfgN2F+EuqgL7inD7vlDcj8Hiz19NVT2/sDYmIstwYi7IPlXtXVj79ZYlQj8QkXX+eEBNYSrux2DxG09F9lxjY4wpLiwRGmNKPEuE/vGqvwMoBMX9GCx+k8n6CI0xJZ7VCI0xJZ4lQmNMiWeJ0BhT4lki9AMRKeXvGM6EiDO0ZsbP4kpE6otIQ3/HcSay/+6L+2cRKCwR+piINADGuq+D/RuN90RE9NSZtcZ+DeYMiaMs8G+gl7/jOV2en4GIdARQO9tZKCwR+l4HYACAqqb7ORavefwBXgnMFJEKxa02oo7jwH+BEcWtVujxGdwEvCwiF/k5pHOGJUIfEZFyAKo6DQgSkZv9HNJpE5FuwE1Af1X9AyhONdrGItJDRKqr6pfA58AF7rridBx9gXFAT1X9SUQaiIgNnnKWLBH6gNscvlVExrmLXgPK+TEkr3jW+Nw/tnJALeBKAFVNK0a1wn44NfG5ItIOqADcLiLBgVwzz+X3WxZYArQXkYfc1wtFxJsBDUweLBEWMRGJA54HdgA3iMj9QHvg6ox+nkCUrT+qMlBGVRfj1EZaisiN4DTXAi0Zun2BGSd2GopILPCyqt4KPAMMBkKBLkDvjPf4K968ZPsMrhCRzsBqnO6VocAaoAnwB9DGb4GeA6xKXYREpA1wLTBJVVeLyEqgB1APCAOGiMh64ESgdXp7/AHeDlwClBOR/6rqXBFR4FoRKauqzwZw7HE4iW8bEC4iz6jqLBFZjDMk1F9AT2BJoB0DZDmOO4HLgetV9WcR6a+qqe66fkBDYLP/Ij0HqKpNRTDhNL2mAgl5rB8ELAfC/R1rPsdwI/ApUB6YAaQDY911A4H3gFB/x+kRbw3gNfd1FeBjoKU7Pw54PWPe4z1f4Yy95/f48zimGOAz93U54FLgRnd+OLAeaOLvOIv7ZE3jIiAiDdQ5mfAUsFdEpnisKw2gqnOBNJwaYsBxm4pHcf7YrgcUpxn5qoiMUdUFwLWqesiPYWahqnuAF0WkpqoeAPbg9Gmiqm8A+4F/ZpQXkVY4NcOjvo82d7k00Q/i1Man4tRuxwH/EpFbgKXA5ar6nY/DPOdYIixkIlIfWC8iU1R1C/B3IFREngRQ1RMiEiwiFwLhOP/R/crtUgvymC+tjrdwviN9gLtU9UNgBfC4iFRU1YBLIKq6CfifiGwGNgFVRKS5W2whsM/jLOuvQDdVDYiRqrP1CfYSkdY4tfGxOMn6ZVW9EieZl1fVI27yN2fJRp8pRCIyABgFJAJjgHmqepOINAImAXtVdaJH+QpuzdGvPOMQkduAujg1pXtxhoN/GicBng9EA4+rarKfwvWKiLwP9MVpDpfBSeiX4iT0Bf6MrSAiMhGnT/AjnBbDtar6g7vuBpxLmIarqvULFhZ/t83PlQnnP/dKYKA7Hwb8ADzjzjcGWvg7zlziHgC87r4eDXyC0xf1E07CA5gIvAB8R4D3RwFBHq/fxfmn1A6nid/ZXS7+jjOf+BsAS93XTwFzcZJ4RaAm8AHQ2N9xnmuT1QgLiXtR7mvAS6q63l3WB5gFTFbV//gzvtyISDjOSZCbgWPA7cA7OIkjDqf/KdWjfGVVPeyPWE+HiASp6kn39SKglLoPBPJcFwiy3bqIiEQDd+Ek8DbAFap63G1trAL+VNU//RLsOcz6CM+SiNQWkfLqXJS7GXgn4y4S4Hec27n6udeABZoTOCdsHgCew+mHehToDgxQ1VQRecC99hHgiH/CPD2qejKjz1NV+wPHPfpoAzIJZlwQrarbgPNwulaGuEnwGuBunNquJcEiYNcRngUR6YVTC/xMRHYBD+JctrFaRFYAI3GanunuFFBU9XcR+QS4H/gP8CbwBfAwUFVEugB/A0a45YtN8yEjGbqJbyHQRURCVDXN37FBjiR4CzBARJKB+3D+ef4KzBeRz3Eunh6lAXJS51xkTeMz5F4sfTlOnw1Af6A0
cAfQFudkw3bgQpz+tUGqussPoeZLRGoC9YEXcU7o7MFpKitQGbhTVb/3X4RnT0S6Ar+qcxY/IGQkaRG5HLgBuA3nH9IB4A2cpvEQ4DCwXt2TJaZoWCI8AyJSBueWuV9UtY27rBVwBc4lMfercwdALM5Zy+vVuawjYLnxz8AZomomTrdJueLQJ1icuLdVHlfVDSLSApiMc8H0Q24/87M4/1BfVtVv/RlrSWJ9hKdJROrhnMHrDFwkIncDuCdI5uNcABvuFt8L9Av0JAiZ8Q/Gqb1er6p/WRIsEq1xhjFrDOwC4oFuItJFVdPVuR+6FHCN+w/X+IDVCE+DiPTH6T/bjdPs/QynX+1JVX3SLVNJVYvFSYXcuH+gf6rqTn/Hci7Jdib7PzjXOI7AuUwp49rNd1X1c7fMhar6i7/iLWnsZImXRKQ9Th/OZe70KvAnzlX/s8UZzumx4pwEAYp7f2Cg8kiCN+OMg5gGTMc5O/wSzn3dN4hIuqp+aUnQt6xG6CURiQKq41wo/TDOGeH/Ask4ZyUPqXMLmjG5EpG2OANVdMFp/vbHuXd4KM5Z4nHATFVN8VuQJZT1EXpJVfeq6lqcL/G7qpoATMMZHWSNqn4YiGPaGf/J+D54fC/+Ar5W1SScJvH/gJ04Vx6cDzxvSdA/rGl8+r4DrhfnSXSDgFvVvfG9OF1nZ4pWtjtGyuJ0o+wAmonIvar6CPCniKzDufD+pH1//MeaxqdJRCrhXGQ8AJiqqkv8HJIJYCJyPdAJWIvThRIEzMEZXXo3zj3QfTXAB7E411kiPEMZdylkv1fUmAwici3OybSJOHcgbcLpV96OMzxbEDBHbTxBv7Om8ZlLB2sOm1Oy3TYXjTNaTD+codkOAwnABGCKqk7yW6AmB0uEZ8gSoPGULQlm1PZew7nFsr+qdhaRCJzHB8SJyCYNgLEojcPOGhtTCDyS4PU4l8EsUNXdOPdrX+SeXGuF8yCpZy0JBharERpTSETkPJzHGtwHHHNHk74QiMIZ8LYSMEZVf/VflCY3drLEmEIkItfh3CWyB6f2twsnGS4EkiwJBiZLhMYUIhEpi/PQ9Z2qekBERgHX4FwiY4OqBihLhMYUAXeE7HE4AyqMsHu4A5v1ERpTNMoCJ4GhqrrV38GY/FmN0JgiYhfbFx+WCI0xJZ5dR2iMKfEsERpjSjxLhMaYEs8SoTGmxLNEaBCRdBH5RkS+F5FZIlLuLLbVVUQWu68HZDzlL4+yoe4ABae7jwdF5A5vl2cr86aIXHEa+6olInYN4DnOEqEB56l1zVW1MXAC54HjmcRx2t8VVV2oqo/nUyQUZ1w+Y/zKEqHJ7gugnlsT2i4ibwHfAzVEpKeIfCUiG9yaYwUAEektIttEZAPO4wtwl48VkRfd1xeKyDwR2eROHYHHgbpubXSyW+5OEVkrIt+6j73M2Na9IvKDiKwCGhZ0ECJyrbudTSIyJ1stt4eIrHO3F+eWDxaRyR77vv5sf5Gm+LBEaDKJSAjO6CkZIybXB15W1VjgKM6oKj1UtSWwDrjdvbf2NZwnsrUCquWx+eeBz1S1GdAS2AzcjXNPbnNVvVNEerr7bAs0B1qJSGcRaYUzpH1znOcBt/HicOaqaht3f1uBqz3W1XL30Q94xT2Gq4HDqtrG3f61IlLbi/2Yc4DdYmcAzhORb9zXXwCvAxHAblVd4y5vj/PEvi/dh7KVBr4CooEfVXUHgIi8A1yXyz66AVcCqGo6cFhEwrKV6elOG935CjiJsSIwT1WPuftY6MUxNRaRh3Ga3xWA5R7rZrrPGd4hIrvcY+gJNPXoP6zs7vsHL/ZlijlLhAbcPkLPBW6yO+q5CPhQVUdkK5flfWdJgMdU9b/Z9nHbGWzrTeByVf3/9u4YJWIgCuP4/7MRWYKdjc2isuAZbDzCNhaihWjjFuIF9CJiLx5AZAsbQVYtFmwtbQTTitg9ixkhLMoGu3W+XxPITDIhxWNmSN57krQHbDbaJn+nijz2
UUQ0AyaSun8Y22aMl8bW1j2wIWkNQFJHUo+Uc68raTX32/7l+htSnr7v/bhFUhnLqtFnCOw39h6XJXPN3EsAAAC/SURBVC0Bt0Bf0oKkirQMn6YCXnNm6J2Jti1Jc/mZV0jFlIbAIPdHUk9Sp8U49g94RmitRESdZ1YXkubz6ZOIeM7JSK8kfZCW1tUPtzgGziQdkApfDSJiJOkuf55ynfcJ14FRnpG+A7sRMZZ0SaoC90YqjTnNKfAA1PnYfKYX4JGUMfowIj4lnZP2DsdKg9dAv93bsVnnpAtmVjwvjc2seA6EZlY8B0IzK54DoZkVz4HQzIrnQGhmxXMgNLPifQEkp62Mjgwh+QAAAABJRU5ErkJggg==\n",
483 | "text/plain": [
484 | ""
485 | ]
486 | },
487 | "metadata": {
488 | "needs_background": "light"
489 | },
490 | "output_type": "display_data"
491 | }
492 | ],
493 | "source": [
494 | "fig, ax = plot_confusion_matrix(y_test, y_pred, encoder.classes_, normalize=True)\n",
495 | "fig.savefig('confusion_matrix.png')"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {},
502 | "outputs": [],
503 | "source": []
504 | }
505 | ],
506 | "metadata": {
507 | "kernelspec": {
508 | "display_name": "Python 3",
509 | "language": "python",
510 | "name": "python3"
511 | },
512 | "language_info": {
513 | "codemirror_mode": {
514 | "name": "ipython",
515 | "version": 3
516 | },
517 | "file_extension": ".py",
518 | "mimetype": "text/x-python",
519 | "name": "python",
520 | "nbconvert_exporter": "python",
521 | "pygments_lexer": "ipython3",
522 | "version": "3.6.8"
523 | }
524 | },
525 | "nbformat": 4,
526 | "nbformat_minor": 2
527 | }
528 |
--------------------------------------------------------------------------------
/notebooks/Check Emotion Labeled Dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Check Emotion Labeled Dataset\n",
8 | "\n",
9 | "The main objective of this notebook is to inspect the emotion-labeled dataset produced by the sentiment analysis model."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "import pandas as pd\n",
20 | "from pathlib import Path"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "dataset_path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "dataset = pd.read_csv(dataset_path)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 4,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "\n",
50 | "\n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " | \n",
67 | " label | \n",
68 | " id | \n",
69 | " date | \n",
70 | " user | \n",
71 | " text | \n",
72 | "
\n",
73 | " \n",
74 | " \n",
75 | " \n",
76 | " | 0 | \n",
77 | " fear | \n",
78 | " 1148914607152619520 | \n",
79 | " 2019-07-10 11:19:22 | \n",
80 | " cheri_shapley | \n",
81 | " Sometimes what you're most #afraid of doing is... | \n",
82 | "
\n",
83 | " \n",
84 | " | 1 | \n",
85 | " fear | \n",
86 | " 1148837283812073473 | \n",
87 | " 2019-07-10 06:12:07 | \n",
88 | " Dronearl_RSA | \n",
89 | " Delayed post \\n#Afraid \\n@TallRacksRec https:/... | \n",
90 | "
\n",
91 | " \n",
92 | " | 2 | \n",
93 | " fear | \n",
94 | " 1148719897788084224 | \n",
95 | " 2019-07-09 22:25:40 | \n",
96 | " wavetossed | \n",
97 | " #EyesOn #SeeSomethingSaySomething #CIA #Clowns... | \n",
98 | "
\n",
99 | " \n",
100 | " | 3 | \n",
101 | " fear | \n",
102 | " 1148653069003034630 | \n",
103 | " 2019-07-09 18:00:07 | \n",
104 | " Misspiggychop | \n",
105 | " #HappyBirthdayStevenAvery\\n\\n#CorruptiwocCount... | \n",
106 | "
\n",
107 | " \n",
108 | " | 4 | \n",
109 | " fear | \n",
110 | " 1148593210756947968 | \n",
111 | " 2019-07-09 14:02:15 | \n",
112 | " HorrorBitsVids | \n",
113 | " \"Fight Fire With Fire\"\\n\\nWhat did you think o... | \n",
114 | "
\n",
115 | " \n",
116 | "
\n",
117 | "
"
118 | ],
119 | "text/plain": [
120 | " label id date user \\\n",
121 | "0 fear 1148914607152619520 2019-07-10 11:19:22 cheri_shapley \n",
122 | "1 fear 1148837283812073473 2019-07-10 06:12:07 Dronearl_RSA \n",
123 | "2 fear 1148719897788084224 2019-07-09 22:25:40 wavetossed \n",
124 | "3 fear 1148653069003034630 2019-07-09 18:00:07 Misspiggychop \n",
125 | "4 fear 1148593210756947968 2019-07-09 14:02:15 HorrorBitsVids \n",
126 | "\n",
127 | " text \n",
128 | "0 Sometimes what you're most #afraid of doing is... \n",
129 | "1 Delayed post \\n#Afraid \\n@TallRacksRec https:/... \n",
130 | "2 #EyesOn #SeeSomethingSaySomething #CIA #Clowns... \n",
131 | "3 #HappyBirthdayStevenAvery\\n\\n#CorruptiwocCount... \n",
132 | "4 \"Fight Fire With Fire\"\\n\\nWhat did you think o... "
133 | ]
134 | },
135 | "execution_count": 4,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "dataset.head()"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 5,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "import seaborn as sns"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Label count\n",
158 | "\n",
159 | "Check the count of each label"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 6,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/plain": [
170 | ""
171 | ]
172 | },
173 | "execution_count": 6,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | },
177 | {
178 | "data": {
179 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEKCAYAAADaa8itAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFs9JREFUeJzt3Xu0XnV95/H3x0REsRKEU0YTbBjNaNG2S8ggSgcvOIDWGsaiwmiJyGrGEbW2s6o4dkkr0lGxpV6qHVoiF6lI8ULGopgVb62VSxDkKpIFImFxORJAkaIGv/PH8zvwkJ7gMfmd5+Ek79dazzp7f/dv7/3bO1nP5+zrSVUhSVIPjxp3ByRJ2w5DRZLUjaEiSerGUJEkdWOoSJK6MVQkSd0YKpKkbgwVSVI3hookqZv54+7AqO222261ePHicXdDkuaUSy655AdVNfGL2m13obJ48WLWrl077m5I0pyS5MaZtPP0lySpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpm+3uiXqN1vff/Rvj7sIjxlPedcW4uyDNOo9UJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSupm1UEmyMsntSa4cqp2Y5DtJLk/y2SQLhqa9I8m6JNcmOXiofkirrUty7FB9zyQXtvqnkuwwW9siSZqZ2TxSORU4ZJPaauBZVfWbwHeBdwAk2Qs4HHhmm+ejSeYlmQf8DfASYC/giNYW4H3ASVX1NOBO4OhZ3BZJ0gzMWqhU1deBDZvUvlRVG9voBcCiNrwMOKuqflJVNwDrgH3bZ11VXV9VPwXOApYlCfAi4Jw2/2nAobO1LZKkmRnnNZXXA19owwuBm4amrW+1zdV3Be4aCqipuiRpjMYSKkneCWwEzhzR+lYkWZtk7eTk5ChWKUnbpZGHSpLXAS8DXlNV1co3A3sMNVvUapur3wEsSDJ/k/q0qurkqlpaVUsnJia6bIck6d8baagkOQR4G/Dyqrp3aNIq4PAkj0myJ7AEuAi4GFjS7vTagcHF/FUtjL4CHNbmXw6cO6rtkCRNbzZvKf4k8E3g6UnWJzka+AjwK8DqJJcl+VuAqroKOBu4GvgicExV3d+umbwJOB+4Bji7tQV4O/DHSdYxuMZyymxtiyRpZmbtj3RV1RHTlDf7xV9VJwAnTFM/Dzhvmvr1DO4OkyQ9QvhEvSSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuZi1UkqxMcnuSK4dqT0yyOsl17ecurZ4kH0qyLsnlSfYemmd5a39dkuVD9X2SXNHm+VCSzNa2SJJmZjaPVE4FDtmkdiywpqqWAGvaOMBLgCXtswL4GAxCCDgOeA6wL3DcVBC1Nn8wNN+m65IkjdishUpVfR3YsEl5GXBaGz4NOHSofnoNXAAsSPIk4GBgdVVtqKo7gdXAIW3aE6rqgqoq4PShZUmSxmTU11R2r6pb2vCtwO5teCFw01C79a32cPX109QlSWM0tgv17QijRrGuJCuSrE2ydnJychSrlKTt0qhD5bZ26or28/ZWvxnYY6jdolZ7uPqiaerTqqqTq2ppVS2dmJjY6o2QJE1v1KGyCpi6g2s5cO5Q/ch2F9h+wN3tNNn5wEFJdmkX6A8Czm/Tfphkv3bX15FDy5Ikjcn82Vpwkk8CLwB2S7KewV1c7wXOTnI0cCPwqtb8POClwDrgXuAogKrakOR44OLW7t1VNXXx/40M7jB7LPCF9pEkjdGshUpVHbGZSQdO07aAYzaznJXAymnqa4FnbU0fJUl9+US9JKkbQ0WS1I2hIknqxlCRJHVjqEiSujFUJEnd
GCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSujFUJEndzNqr7yX1tf+H9x93Fx4xvvHmb4y7C9oMj1QkSd0YKpKkbgwVSVI3hookqRtDRZLUjaEiSerGUJEkdWOoSJK6MVQkSd2MJVSS/FGSq5JcmeSTSXZMsmeSC5OsS/KpJDu0to9p4+va9MVDy3lHq1+b5OBxbIsk6UEjD5UkC4G3AEur6lnAPOBw4H3ASVX1NOBO4Og2y9HAna1+UmtHkr3afM8EDgE+mmTeKLdFkvRQ4zr9NR94bJL5wOOAW4AXAee06acBh7bhZW2cNv3AJGn1s6rqJ1V1A7AO2HdE/ZckTWPkoVJVNwMfAL7PIEzuBi4B7qqqja3ZemBhG14I3NTm3dja7zpcn2aeh0iyIsnaJGsnJyf7bpAk6QHjOP21C4OjjD2BJwM7MTh9NWuq6uSqWlpVSycmJmZzVZK0XRvH6a8XAzdU1WRV/Qz4DLA/sKCdDgNYBNzchm8G9gBo03cG7hiuTzOPJGkMxvH3VL4P7JfkccC/AQcCa4GvAIcBZwHLgXNb+1Vt/Jtt+perqpKsAv4hyV8xOOJZAly0tZ3b509O39pFbDMuOfHIcXdB0hwz8lCpqguTnAN8C9gIXAqcDPwTcFaS97TaKW2WU4AzkqwDNjC444uquirJ2cDVbTnHVNX9I90YSdJDjOUvP1bVccBxm5SvZ5q7t6rqPuCVm1nOCcAJ3TsoSdoiPlEvSerGUJEkdWOoSJK6mVGoJFkzk5okafv2sBfqk+zI4DUqu7WHFtMmPYHNPL0uSdp+/aK7v/4H8FYGz4FcwoOh8kPgI7PYL0nSHPSwoVJVHwQ+mOTNVfXhEfVJkjRHzeg5lar6cJLnAYuH56kqHz+XJD1gRqGS5AzgqcBlwNRT6wUYKpKkB8z0ifqlwF5VVbPZGUnS3DbT51SuBP7DbHZEkjT3zfRIZTfg6iQXAT+ZKlbVy2elV5KkOWmmofJns9kJSdK2YaZ3f31ttjsiSaP0tQOeP+4uPGI8/+v9vuJnevfXjxjc7QWwA/Bo4MdV9YRuPZEkzXkzPVL5lanhJGHwN+b3m61OSZLmpl/6LcU18Dng4FnojyRpDpvp6a9XDI0+isFzK/fNSo8kSXPWTO/++t2h4Y3A9xicApMk6QEzvaZy1Gx3RJI09830j3QtSvLZJLe3z6eTLJrtzkmS5paZXqj/OLCKwd9VeTLw/1pNkqQHzDRUJqrq41W1sX1OBSZmsV+SpDlopqFyR5LXJpnXPq8F7tjSlSZZkOScJN9Jck2S5yZ5YpLVSa5rP3dpbZPkQ0nWJbk8yd5Dy1ne2l+XZPmW9keS1MdMQ+X1wKuAW4FbgMOA123Fej8IfLGqngH8FnANcCywpqqWAGvaOMBLgCXtswL4GECSJwLHAc8B9gWOmwoiSdJ4zDRU3g0sr6qJqvpVBiHz51uywiQ7AwcApwBU1U+r6i4Gtyif1pqdBhzahpcBp7eHLi8AFiR5EoOHL1dX1YaquhNYDRyyJX2SJPUx01D5zfbFDUBVbQCevYXr3BOYBD6e5NIkf59kJ2D3qrqltbkV2L0NLwRuGpp/fattri5JGpOZhsqjhk8ttVNPM31wclPzgb2Bj1XVs4Ef8+CpLmDwKhgefIHlVkuyIsnaJGsnJyd7LVaStImZhspfAt9McnyS44F/Bd6/hetcD6yvqgvb+DkMQua2dlqL9vP2Nv1mYI+h+Re12ubq/05VnVxVS6tq6cSEN61J0myZUahU1enAK4Db2ucVVXXGlqywqm4Fbkry9FY6ELiawXMwU3dwLQfObcOrgCPbXWD7AXe302TnAwcl2aUdRR3UapKkMZnxKayquprBl38PbwbOTLIDcD1wFIOAOzvJ0cCNDO42AzgPeCmwDri3taWqNrSjpotbu3e3az2SpDHZ0usiW6WqLmPwpuNNHThN2wKO2cxyVgIr+/ZOkrSlfum/pyJJ0uYYKpKk
bgwVSVI3hookqRtDRZLUjaEiSerGUJEkdWOoSJK6MVQkSd0YKpKkbgwVSVI3hookqRtDRZLUjaEiSerGUJEkdWOoSJK6MVQkSd0YKpKkbgwVSVI3hookqRtDRZLUjaEiSerGUJEkdTO2UEkyL8mlST7fxvdMcmGSdUk+lWSHVn9MG1/Xpi8eWsY7Wv3aJAePZ0skSVPGeaTyh8A1Q+PvA06qqqcBdwJHt/rRwJ2tflJrR5K9gMOBZwKHAB9NMm9EfZckTWMsoZJkEfA7wN+38QAvAs5pTU4DDm3Dy9o4bfqBrf0y4Kyq+klV3QCsA/YdzRZIkqYzriOVvwbeBvy8je8K3FVVG9v4emBhG14I3ATQpt/d2j9Qn2YeSdIYjDxUkrwMuL2qLhnhOlckWZtk7eTk5KhWK0nbnXEcqewPvDzJ94CzGJz2+iCwIMn81mYRcHMbvhnYA6BN3xm4Y7g+zTwPUVUnV9XSqlo6MTHRd2skSQ8YeahU1TuqalFVLWZwof3LVfUa4CvAYa3ZcuDcNryqjdOmf7mqqtUPb3eH7QksAS4a0WZIkqYx/xc3GZm3A2cleQ9wKXBKq58CnJFkHbCBQRBRVVclORu4GtgIHFNV94++25KkKWMNlar6KvDVNnw909y9VVX3Aa/czPwnACfMXg8lSb8Mn6iXJHVjqEiSujFUJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSujFUJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSujFUJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqZuShkmSPJF9JcnWSq5L8Yas/McnqJNe1n7u0epJ8KMm6JJcn2XtoWctb++uSLB/1tkiSHmocRyobgf9VVXsB+wHHJNkLOBZYU1VLgDVtHOAlwJL2WQF8DAYhBBwHPAfYFzhuKogkSeMx8lCpqluq6ltt+EfANcBCYBlwWmt2GnBoG14GnF4DFwALkjwJOBhYXVUbqupOYDVwyAg3RZK0ibFeU0myGHg2cCGwe1Xd0ibdCuzehhcCNw3Ntr7VNlefbj0rkqxNsnZycrJb/yVJDzW2UEnyeODTwFur6ofD06qqgOq1rqo6uaqWVtXSiYmJXouVJG1iLKGS5NEMAuXMqvpMK9/WTmvRft7e6jcDewzNvqjVNleXJI3JOO7+CnAKcE1V/dXQpFXA1B1cy4Fzh+pHtrvA9gPubqfJzgcOSrJLu0B/UKtJksZk/hjWuT/w+8AVSS5rtf8NvBc4O8nRwI3Aq9q084CXAuuAe4GjAKpqQ5LjgYtbu3dX1YbRbIIkaTojD5Wq+hcgm5l84DTtCzhmM8taCazs1ztJ0tbwiXpJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3cz5UElySJJrk6xLcuy4+yNJ27M5HSpJ5gF/A7wE2As4Isle4+2VJG2/5nSoAPsC66rq+qr6KXAWsGzMfZKk7dZcD5WFwE1D4+tbTZI0BvPH3YFRSLICWNFG70ly7Tj7M0O7AT8YZwfygeXjXH1PY9+XAByXcfegl7Hvz7xlm9mX8AjYn2RG+/PXZtJorofKzcAeQ+OLWu0hqupk4ORRdaqHJGuraum4+7EtcF/25f7sa1vbn3P99NfFwJIkeybZATgcWDXmPknSdmtOH6lU1cYkbwLOB+YBK6vqqjF3S5K2W3M6VACq6jzgvHH3YxbMqdN1j3Duy77cn31tU/szVTXuPkiSthFz/ZqKJOkRxFAZgyRvSXJNkjPH3ZdtSZLFSa4cdz+k6ST513H3YRTm/DWVOeqNwIurav2WLiDJ/Kra2LFP0qxJEgan238+7r6MS1U9b9x9GAWPVEYsyd8C/xH4
QpJ3JlmZ5KIklyZZ1tosTvLPSb7VPs9r9Re0+irg6jFuxqxKslOSf0ry7SRXJnl1knclubiNn9y+pEiyT2v3beCYoWW8LslnknwxyXVJ3j807aAk32z79h+TPL7V35vk6iSXJ/lAq72yrfPbSb4+4l0x65J8LsklSa5qDwmT5J4kJ7RtviDJ7q3+1DZ+RZL3JLlnaDl/0v59Lk/y5622uL3s9XTgSh76TNl2p+3XJDmx/Z+6Ismr27TTkxw61PbMqe+DOaeq/Iz4A3yPwVO0fwG8ttUWAN8FdgIeB+zY6kuAtW34BcCPgT3HvQ2zvH9+D/i7ofGdgScOjZ8B/G4bvhw4oA2fCFzZhl8HXN/m3RG4kcGX2m7A14GdWru3A+8CdgWu5cGbVxa0n1cAC4dr29Jnar8Cj2Xwxb8rUEP79/3An7bhzwNHtOE3APe04YMY3MEUBr+ofh44AFgM/BzYb9zb+Uj4APe0/9urGTwCsTvwfeBJwPOBz7V2OwM3APPH3ect+XikMl4HAccmuQz4KoMvv6cAjwb+LskVwD8yeAPzlIuq6oZRd3TErgD+a5L3JfkvVXU38MIkF7Z98iLgmUkWMPiinzqCOGOT5aypqrur6j4GR3a/BuzHYH9+o+335a1+N3AfcEqSVwD3tmV8Azg1yR8w+CLY1rylHeVdwCB0lwA/ZRAMAJcwCAeA5zL4/wjwD0PLOKh9LgW+BTyjLQfgxqq6YLY6Pwf9NvDJqrq/qm4Dvgb856r6GoMHuSeAI4BP1xw9ve01lfEK8HtV9ZB3kSX5M+A24LcY/OZ339DkH4+sd2NSVd9NsjfwUuA9SdYwOLW1tKpuavtnxxks6idDw/cz+P8eYHVVHbFp4yT7AgcChwFvAl5UVW9I8hzgd4BLkuxTVXdsxeY9YiR5AfBi4LlVdW+SrzLYrz+r9iszD+63h10U8H+q6v9usvzFbAf/Xzs6HXgtgzeDHDXmvmwxj1TG63zgzUPXB57d6jsDt9Tgoubvs23+hrxZSZ4M3FtVn2BwSmvvNukH7frHYQBVdRdwV5LfbtNfM4PFXwDsn+RpbV07JflPbbk71+Bh2j9iEOgkeWpVXVhV7wIm2bauC+wM3NkC5RkMjuIezgUMTt/A4ItvyvnA64euTS1M8qvde7tt+Gfg1UnmtaOSA4CL2rRTgbcCVNWcvWbqkcp4HQ/8NXB5kkcxOI/6MuCjwKeTHAl8ke3vt73fAE5M8nPgZ8D/BA5lcM7/VgbvfJtyFLAySQFf+kULrqrJJK8DPpnkMa38p8CPgHOT7MjgN+8/btNOTLKk1dYA397KbXsk+SLwhiTXMLie9ItOU70V+ESSd7Z57waoqi8l+XXgm+33o3sY/MZ9/2x1fI4q4LMMTiN+u42/rapuBaiq29q/xefG18Wt5xP1kmYkyeOAf6uqSnI4g4v2c/MOpRFLsivwrara7Ovj2/69Ati7XUeckzxSkTRT+wAfaadr7wJeP+b+zAntdO5XgQ88TJsXA6cAJ83lQAGPVCRJHXmhXpLUjaEiSerGUJEkdWOoSLNo+P1Ym5n+S79ZOcmpSQ7bup5Js8NQkSR1Y6hII5Dk8UnWtDcjX7HJG2jnt7fSXpPknPa8wtQbmL/W3iJ8fpInjan70owZKtJo3Af8t6raG3gh8JdTr+cBng58tKp+Hfgh8MYkjwY+DBxWVfsAK4ETxtBv6Zfiw4/SaAT4iyQHMHgd/EIGrz4HuKmqvtGGPwG8hcFrUJ4FrG7ZMw+4ZaQ9lraAoSKNxmuACWCfqvpZku/x4JuWN30CuRiE0FVV9dzRdVHaep7+kkZjZ+D2FigvZPA3XKY8JclUePx34F8YvOBxYqqe5NFJnjnSHktbwFCRRuNMYGn7I2NHAt8ZmnYtcEx7Q+0uwMeq6qcMXvH/vvZHtC4Dtou/ca65zXd/SZK68UhFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpm/8P4cnv
L2MVrpUAAAAASUVORK5CYII=\n",
180 | "text/plain": [
181 | ""
182 | ]
183 | },
184 | "metadata": {
185 | "needs_background": "light"
186 | },
187 | "output_type": "display_data"
188 | }
189 | ],
190 | "source": [
191 | "sns.countplot(x='label', data=dataset)"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "## Text length\n",
199 | "\n",
200 | "Check the distribution of tweet lengths, measured in characters"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 7,
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "data": {
210 | "text/plain": [
211 | ""
212 | ]
213 | },
214 | "execution_count": 7,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | },
218 | {
219 | "data": {
220 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xt0nPV95/H3d2Y0I2lkWTffb5KxCRgSDBgIkFtDEqDNwm6WbIBeOAln2Z7CbnrbLuyeTVtOOV22PUmzbdKEllxOUkIoSVovS0ISSAoJ1GAwNxuMhe/GF1mWdfVImpnv/jHPyGNZl5E0N0mf1zk6nnnmeZ75PXrg+ej3+z2/32PujoiISKjcBRARkcqgQBAREUCBICIiAQWCiIgACgQREQkoEEREBFAgiIhIQIEgIiKAAkFERAKRchdgKlpaWry1tbXcxRARmTVefPHF4+6+KJ91Z1UgtLa2snXr1nIXQ0Rk1jCzffmuqyYjEREBFAgiIhJQIIiICKBAEBGRgAJBREQABYKIiAQUCCIiAigQCiad1qNIRWR2m1UD0yrVM7s6+PTXX2BJfTXnLK5j48qF3PnhdcQi4XIXTUQkbwqEaXhoy/4z3j/26jsAtNRF2XW0l6ff6mB1c5ybLl1ZjuKJiEyLmowKYP+JAVY11fKpy1Zz16+soyke5QfbDpa7WCIiU6JAmKGhZJp3Tp5idVMtAGbGxlUNPPt2J4e7T5W5dCIi+VMgzNChk6dIO6wJAgFg46oG3GHzy++UsWQiIlOjQJih/Z39AKzKCYSWuhgXr27gB9sOlatYIiJTpkCYoX0nBmipixGPndk//4mLV/DmkV52vNNTppKJiEyNAmEG3J39JwbOaC7K+rX3LCcSMv7pZdUSRGR2UCDMQGffEANDKVY3nx0ITfEoH3rXYv755UOkNGhNRGYBBcIM7DsxADByh9Fon7hkBUd7Btmyp7OUxRIRmRYFwgzsP9FPdVWIRQtiY37+wXMXEQkZz+w6XuKSiYhMXV6BYGbXmdlOM2s3s7vH+DxmZt8NPt9iZq05n90TLN9pZtfmLN9rZq+Z2ctmNisflLyvc4DVTbWEzMb8PB6LZMYktCsQRKTyTRoIZhYGvgRcD2wAbjGzDaNWux3ocvd1wBeA+4NtNwA3AxcA1wFfDvaX9SvuvtHdN834SErs1FCKY72DrG6KT7jeVetaeO1QN92nhktUMhGR6cmnhnA50O7uu919CHgYuHHUOjcC3wxePwpcY2YWLH/Y3QfdfQ/QHuxv1jvQlek/WDNGh3Kuq89pJu3wr7vVjyAilS2fQFgBHMh5fzBYNuY67p4EuoHmSbZ14Mdm9qKZ3TH1opfXke4EACsaaiZc7+LVjdRUhdVsJCIVr5yznb7P3Q+Z2WLgJ2b2prs/PXqlICzuAFi9enWpyziu3sQw0UiI6qqJp7iORkJc1tbEL99WDUFEKls+NYRDwKqc9yuDZWOuY2YRYCHQOdG27p799xjwA8ZpSnL3B9x9k7tvWrRoUR7FLY3ewSQLYvnl6dXnNNN+rI+jPYkil0pEZPryuaK9AKw3szYyF/ObgVtHrbMZuA14DrgJeMrd3cw2Aw+Z2eeB5cB64HkziwMhd+8NXn8MuLcgR1QifYkkddXj//pyn5nQk0gC8JdP7OTi1Y1nrHfrFZVT6xGR+W3SQHD3pJndBTwBhIGvuft2M7sX2Orum4EHgW+ZWTtwgkxoEKz3CLADSAJ3unvKzJYAP8j0OxMBHnL3HxXh+IqmdzDJknHGH4y2bGE1NVVh3u7oPysQREQqRV5tHu7+OPD4qGWfy3mdAD45zrb3AfeNWrYbuGiqha0kfYkk5yyqy2vdkBnnLIrzdkcf7o6NM25BRKScNFJ5GpKpNKeGUyyYoMlotHMW19F9apjO/qEilkxEZPoUCNPQN5jpE8i3UxkYqU283dFXlDKJiMyUAmEa
eoNO4ok6lUdrjkepr46w53h/sYolIjIjCoRpOF1DqMp7GzNj7aI69nT0467psEWk8igQpmE6NQSAtpY4vYNJOvoGi1EsEZEZUSBMQ+9gZqK6uin0IQCsbclMhKdmIxGpRAqEaehLJKmNhgmHpnb7aFPQj7C7Q4EgIpVHgTANvYnklG45zRrpRziufgQRqTwKhGnoG0xOqUM519qWOH2DSTp61Y8gIpVFgTANvYnhKXcoZ7UF/Qi71Y8gIhVGgTBF7h7UEKYXCE3xKAtrqhQIIlJxFAhT1DeYZDjl064hmBlrW+LsCeY1EhGpFAqEKcq2/U+nUzmrrSVOf/BMZhGRSqFAmKJsINRNs1MZYK3mNRKRCqRAmKLsKOOZ1BCa4lGa41HeOtpbqGKJiMyYAmGKRpqMptmpnHXe0gXs7uhnYChZiGKJiMyYAmGKOnoHCZtRHQ3PaD/vWlpPMu38sr2zQCUTEZkZBcIUdfQOUlcdITTDp561ttQSi4R46s2jBSqZiMjMKBCmqKNvcMqT2o0lEgqxbnEdT715TLefikhFUCBMUUfv4Iw6lHOdt7Seoz2DbH+npyD7ExGZCQXCFHX0FqaGAHDukjrM4Kk3jxVkfyIiM6FAmIJU2unsH5r2KOXRFlRX8Z6VDTypQBCRCqBAmIKugSFSaZ/xLae5rjlvMa8cOKnZT0Wk7BQIUzAySrl6+qOUR/vweYsB+JlqCSJSZgqEKSjUoLRcFyyvp60lzre37NPdRiJSVgqEKSjExHajmRn/8f1refVgN8++rUFqIlI+CoQpyM5jVKhO5axPXLKCRQtifOVf3i7ofkVEpkKBMAVd/UNEIyFikZlNWzFadVWYz1zdxjO7jvPawe6C7ltEJF8KhCnoSQxTX8AO5Vy//t7VLIhF+MrTqiWISHkoEKagJ5GkvqawzUVZ9dVV/MaVa/jha4fZq8drikgZ5BUIZnadme00s3Yzu3uMz2Nm9t3g8y1m1prz2T3B8p1mdu2o7cJmts3MHpvpgZRCz6lhFhSphgDw6atbiYRD/N0zu4v2HSIi45k0EMwsDHwJuB7YANxiZhtGrXY70OXu64AvAPcH224AbgYuAK4DvhzsL+uzwBszPYhS6U0kqS9wh3KuxQuq+bcbl/O9lw5ycmCoaN8jIjKWfK5ulwPt7r4bwMweBm4EduSscyPwJ8HrR4G/MTMLlj/s7oPAHjNrD/b3nJmtBH4NuA/4/QIcS9H1JIZZ0VBT0H0+tGX/Ge+X1FeTGE5z9/de4wPnLgLg1itWF/Q7RUTGkk+T0QrgQM77g8GyMddx9yTQDTRPsu1fAX8EpCf6cjO7w8y2mtnWjo6OPIpbPL1F7EPIWrawhrUtcZ7b3UkqrYFqIlI6ZelUNrOPA8fc/cXJ1nX3B9x9k7tvWrRoUQlKN77eRHH7ELKuOqeF7lPD7DisabFFpHTyCYRDwKqc9yuDZWOuY2YRYCHQOcG2VwM3mNle4GHgw2b27WmUv2SGkmkSw+mi9iFknbdsAU3xKM+2Hy/6d4mIZOUTCC8A682szcyiZDqJN49aZzNwW/D6JuApz0zMsxm4ObgLqQ1YDzzv7ve4+0p3bw3295S7/0YBjqdoehPDACWpIYTMuHJtM/tODHCwa6Do3yciAnkEQtAncBfwBJk7gh5x9+1mdq+Z3RCs9iDQHHQa/z5wd7DtduARMh3QPwLudPdU4Q+j+HoSSaCw8xhN5NI1jUQjIbbsPlGS7xMRyevq5u6PA4+PWva5nNcJ4JPjbHsfmTuJxtv3z4Gf51OOcsrWEOqrq0gMF//ZBdVVYTYsq+eNIz2k0k44ZEX/ThGZ3zRSOU89p0pbQwA4b+kCBoZSvLS/q2TfKSLzlwIhTyM1hJri9yFknbtkAWEzfvrG0ZJ9p4jMXwqEPPWMdCqXroZQXRWmrSXOT3coEESk+BQIeeoNOpVLWUOAzC2ob3f0s0cT3olIkSkQ8tSTSGIGddHS1RAAzl9aD8CTajYS
kSJTIOSp59QwdbEIoRLf7dMYj3Le0gXqRxCRolMg5Ckz02lpm4uyrjl/MS/s7aJ7YLgs3y8i84MCIU89ieGSdijn+sj5S0ilnZ+/daws3y8i84MCIU+9RXx85mQuWtlAS12Mn76hQBCR4lEg5KnnVLJsNYRQyHjfuma27O4kM0WUiEjhKRDy1Ds4XPJbTnNd2trEsd5BDpw4VbYyiMjcpkDIUzlrCACb1jQCsHWfJrsTkeJQIOTB3ekbLN9dRpCZxmJBdYSt+zSvkYgUhwIhDwNDKVJpL2sNIRwyLlndyNa9qiGISHEoEPLQU4aJ7cayaU0jbx3t03gEESkKBUIeekv8cJzxbGptAtB02CJSFAqEPPScOv1wnHLauKqBSMh4Qc1GIlIECoQ8VEoNoSYa5oLl9epYFpGiUCDkoVL6EAAuXdPEKwdOMpRMl7soIjLHKBDy0FMhNQSAy1obGUymef2d7nIXRUTmGAVCHiqlDwHg0tbMALUX96rZSEQKq/x/8s4CvYkk0XCIWKQ8+fnQlv1nvG+KR/nBtkPEY2eevluvWF3KYonIHKMaQh56g6mvzUr7cJzxrGmqZV9nvya6E5GCUiDkoSeRrIgO5azWljj9QymO9w2VuygiMocoEPLQW8aH44xlTXMtAPs6+8tcEhGZSxQIeeg5Vb6H44xlUV2M2miYvQoEESkgBUIeehPlnfp6NDOjtTnO3s6BchdFROYQBUIeesr4+MzxtDbXcqJ/aGTQnIjITCkQ8lBpNQSANc1xAPapliAiBZJXIJjZdWa208zazezuMT6Pmdl3g8+3mFlrzmf3BMt3mtm1wbJqM3vezF4xs+1m9qeFOqBCS6bSDAylKuouI4DlDTVUhY29x9WPICKFMWkgmFkY+BJwPbABuMXMNoxa7Xagy93XAV8A7g+23QDcDFwAXAd8OdjfIPBhd78I2AhcZ2bvLcwhFValTGw3WjhkrGqqVceyiBRMPjWEy4F2d9/t7kPAw8CNo9a5Efhm8PpR4BrLjOK6EXjY3QfdfQ/QDlzuGX3B+lXBT0WOsjodCJVVQwBobY5zpDtBYjhV7qKIyByQTyCsAA7kvD8YLBtzHXdPAt1A80TbmlnYzF4GjgE/cfct0zmAYhuZ6bTCagiQCQQH9p9QP4KIzFzZOpXdPeXuG4GVwOVmduFY65nZHWa21cy2dnR0lLaQnA6ESqwhrGqqIWSo2UhECiKfQDgErMp5vzJYNuY6ZhYBFgKd+Wzr7ieBn5HpYziLuz/g7pvcfdOiRYvyKG5h9ZzKNBnV11ReDSEWCbNsYQ17j6uGICIzl08gvACsN7M2M4uS6STePGqdzcBtweubgKc8M/PaZuDm4C6kNmA98LyZLTKzBgAzqwE+Crw588MpvN5E5Ux9PZZzFtWx/0Q/A0PJchdFRGa5Sf/sdfekmd0FPAGEga+5+3YzuxfY6u6bgQeBb5lZO3CCTGgQrPcIsANIAne6e8rMlgHfDO44CgGPuPtjxTjAmco+HKdSA+HCFfU8vauDNw73lLsoIjLL5dUO4u6PA4+PWva5nNcJ4JPjbHsfcN+oZa8CF0+1sOWQrSHUVWCnMsCKhhoaa6t47ZCeoCYiM6ORypPoTSSpi0UIhyrjWQijmRkXrlhI+7E+Tg5oOmwRmT4FwiR6E8PUxSqzdpD17hULSTv8eMfRchdFRGYxBcIkKnEeo9GyzUaPv3a4YPv80etH6BtUR7XIfKJAmETfYOUHQrbZ6Be7jhek2aj9WB+//e0X+fyP3ypA6URktlAgTKInkaSuQu8wyvXuFQtJpr0gzUYv7e8C4JGtB0Y61UVk7lMgTKIvMcyCCu9DgEyz0crGmoI0G7184CRVYaNvMMkjWw8WoHQiMhsoECYxG/oQINNs9PH3LOeZXcc53H1qRvvatv8k713bzKY1jXzj2T2k0hU576CIFJgCYRJ9g8mKv8so69evWI27863n9k17HwNDSXYe6eHiVQ185n1tHDhx
ip/o7iWReUGBMIHsw3EqcWK7saxqquUj5y/hO8/vn/aU2K8e7CbtsHF1Ax/bsIQVDTV87Zd7prSPxHCKft2hJDLrKBAm0D+YuahW6ijlsXzmfW10DQzzT9tGzz+Yn5cPnATgopUNRMIhPn11K8/vOcFrB/MbCf3awW4+9Bc/55NfeY60mppEZhUFwgROT309ewLhirYmzl9Wz9d/uZfM/IJT8/L+k6xprqW5LgbAf7hsFZGQ8cPXJ++sfuzVd/jkV5+lbzDJjsM9PJ7HNiJSORQIE8gOzJoNdxllmRmfvrqVnUd7efbtzilvv+1AFxtXNYy8r6+uYu2iOG8d7Z1wuwd/sYe7HtrGBcsX8tQffJB1i+v44k93qUNaZBaZPVe6MhgJhFnSh5B1w0XLuf+Hb/L1X+7h6nUteW93uPsUR3sGuXhVAw9t2T+yvLoqzIv7us5YBnDrFasBcHf+9udvc9U5zXz905cRi4T57DXr+c/f2cbjrx3m31y0vDAHJiJFpRrCBCp9ptPxVFeFufWK1Tz55jEOTOHxmtv2Z/oPNq5uPGP5kvpqugaGGRyno/pg1ymO9w1y/buXEYuEAfjVdy9j/eI6vvikagkis4UCYQK9iWwNYXYFAsDNl2f+ev/HrQcmWfO0lw+cJBoOcf6yBWcsX1pfDcDR3sExt9sWdERfnNPUFA4Zn/3IetqP9fH/CjjHkogUz+y70pXQSCDMkj6E0U066xfX8Y1n97JoQfXI9N3ZZp6xvLz/JBesqB/5Kz9rSTYQuhOsbqo96/v+76vvUBU2tu0/yas5dyOl3Vm8IMbfP7ObG9RsJFLxVEOYQLYPYbY1GWVtWtNETyLJrmMTdwgDDKfSvHro5BkdylkNtVVEwyGO9CbG3PbAiQFWNNSe9cyIkBnvWrqAN4/06hZUkVlAgTCB3sQw4ZBRUxWefOUKdP6yeuKxCC/s7Zp03R3v9JAYTnPxqP4DyFzYl9THONp9diAMp9IcPnlmzSFXSzzGUDLNOzOcTkNEik+BMIG+4GlpZpX5tLTJhEPGpasb2HmkZ2RMxXie2dUBwFXnNI/5+ZL6ao70JM4a23D45ClS7qxuqhlzu+YFUQD2HO+favFFpMQUCBOYLRPbTWRTaxNph5f2TVxLeHrXcS5YXk9LMCBttCX11QwMpc56aM7+4C6mlRPUEECBIDIbKBAm0DuLJrYbT0tdjLaWOC/sPUF6nJHLvYlhXtrXxQfOXTTufpYuDDqWe86802h/1ykaaquoH2esxoLqCLXRsAJBZBZQIEygNzE87oVuNrm8tYmugeFxRxs/93YnybTzgfXjB8LInUY9Z/YjHDgxMG7/AWRGTre1xBUIIrPA7P7zt8BG37a5/8QAC2JVZy2fbS5csZAfbT/CL3YdH/PzZ3YdpzYa5tI1Z3coZ9XFIsRjEY7kBEL3qWG6Tw2zqnH8QABobYmz/VB+k+OJSPmohjCBxHCa6qrZ/ysKh4yrzmlm9/F+Xh/jwvz0rg6uXNtMNDLxsS6tj51RQ8iOgp6ohgCwtiXOga5TDCXT0yi9iJTK7L/aFdHgcIrYLL3ldLTLWpuIRUL83TO7z1i+r7OffZ0DE/YfZC2pr+ZoT2KkL+LAiQHCIWNZ0L8wnraWOKm0c6Ar/2k0RKT0FAgTGEymqY7MjUCorgqzaU0jj716mHdOnh4T8HTQjJRPICytr2Y45XT1D5EYTrHrWB8rGmqIhCf+z6i1JQ7AXvUjiFQ0BcI4kqk0ybTPiSajrKuCmU+/8ezekWVPv9XBysYaWpsnbvaB0x3L//JWB5//yVsc7UlwyRgD2UZbGwSCOpZFKps6lceRCNq750qTEUBjbZTrL1zKQ1v2UxsNc97Sep57u5MbNi7Pa/Dd4vrMmIKt+7pY2VjDb125hpWTdCgDNNRGaaitUiCIVDgFwjiyUz1XT9LROtv87kfOZdfRPr745C6ywxI+mEdzEUAsEub6C5dSGw1z8epGQlMYwa1bT0Uq
nwJhHCM1hDnSh5C1bnEdT/zeBxgYSvLW0T6OdCf46PlL8t7+/ROMVZhIW0uc56bxBDcRKZ28/vw1s+vMbKeZtZvZ3WN8HjOz7wafbzGz1pzP7gmW7zSza4Nlq8zsZ2a2w8y2m9lnC3VAhZLI1hDmUB9CrtpohI2rGrjuwqWEQsWfq6mtOc7h7gSnhsZ+yI6IlN+kNQQzCwNfAj4KHAReMLPN7r4jZ7XbgS53X2dmNwP3A58ysw3AzcAFwHLgp2Z2LpAE/sDdXzKzBcCLZvaTUfssq8HhudeHAGcPviuVtkXBnUad/Zy/rL4sZRCRieXz5+/lQLu773b3IeBh4MZR69wIfDN4/ShwjWV6KW8EHnb3QXffA7QDl7v7YXd/CcDde4E3gBUzP5zCSSTnZh9CubTp1lORipfP1W4FkPscxoOcffEeWcfdk0A30JzPtkHz0sXAlvyLXXzZTuW5VkMol9bmTCDsViCIVKyy/vlrZnXA94Dfdfeecda5w8y2mtnWjo6OkpUt26msGkJhxGMRltTHdKeRSAXL52p3CFiV835lsGzMdcwsAiwEOifa1syqyITBP7j798f7cnd/wN03ufumRYumd4fLdAwOp4iEbNJRuJK/1ua4moxEKlg+V7sXgPVm1mZmUTKdxJtHrbMZuC14fRPwlGcerbUZuDm4C6kNWA88H/QvPAi84e6fL8SBFFoimVZzUYG1tcTZ26lAEKlUk95l5O5JM7sLeAIIA19z9+1mdi+w1d03k7m4f8vM2oETZEKDYL1HgB1k7iy6091TZvY+4DeB18zs5eCr/ru7P17oA5yuxHBKzUUFtri+ms7+IVJpJ1yCW11FZGryGpgWXKgfH7XsczmvE8Anx9n2PuC+Uct+AVT0FWFwOE21aggF1VIXxR26BobGfVSniJSP/gQeRyKZIqYaQkE1xaMAnOgfKnNJRGQsuuKNY3BYfQiF1hzP1AqO9w1OsqaIlIMCYRyJpPoQCq2lLlND6OxTDUGkEumKNw7VEAqvOeg36FQNQaQiKRDG4O4MJlNzdmK7cmmoqSJk6kMQqVS64o1hOOWknTnz+MxKEQoZTfEoxxUIIhVJgTCGxMg8Rvr1FFpzPKYmI5EKpSveGE7PdKoaQqE1xaPqVBapUAqEMWSfhaA+hMJrrouqD0GkQumKN4ZsDWGuPT6zErTUxTQOQaRCKRDGcLqGoEAotOZ4lJ5EkqFgenERqRx5zWU036hTubByH9vZ3tEHwIO/2MPCmqqR5bdesbrk5RKRM+mKN4bBkYfjqIZQaHWxzN8g/YPJMpdEREZTIIxBNYTiUSCIVC5d8cYwmEwTDYcIWUXP0D0rxaOZQOhTIIhUHAXCGBLDmraiWOLZGsJQqswlEZHRdNUbQ2I4pYntiqS6KkTYTE1GIhVIgTCG/qEU8agCoRjMjHgsrCYjkQqkQBhD32BypGlDCi8ei6iGIFKBFAhj6FcgFFWdAkGkIikQRkm7c2ooNXI3jBRePBZRk5FIBVIgjDIwlMKBupj6EIolHg3TP6i7jEQqjQJhlGxThpqMiqcuFmEoldZ8RiIVRoEwigKh+E6PRVCzkUglUSCM0qdAKDpNXyFSmRQIo2RH0GocQvFkw1YdyyKVRYEwSvav1lrdZVQ0cdUQRCqSAmGU/sEktdEw4ZAmtiuW001GutNIpJIoEEbpH0xqDEKRRSMhqsKmJiORCqNAGKV/KEVcYxCKTtNXiFSevALBzK4zs51m1m5md4/xeczMvht8vsXMWnM+uydYvtPMrs1Z/jUzO2ZmrxfiQApF8xiVRp1GK4tUnEkDwczCwJeA64ENwC1mtmHUarcDXe6+DvgCcH+w7QbgZuAC4Drgy8H+AL4RLKsomseoNOLRiMYhiFSYfGoIlwPt7r7b3YeAh4EbR61zI/DN4PWjwDVmZsHyh9190N33AO3B/nD3p4ETBTiGgtE8RqWTaTJSp7JIJcknEFYAB3LeHwyWjbmOuyeBbqA5z20n
ZGZ3mNlWM9va0dExlU2nLDuPkfoQiq8ueCaCu5e7KCISqPhOZXd/wN03ufumRYsWFfW7sp2cdWoyKrr6mipSaVc/gkgFyScQDgGrct6vDJaNuY6ZRYCFQGee21YMzWNUOk21UQC6BobLXBIRyconEF4A1ptZm5lFyXQSbx61zmbgtuD1TcBTnmkL2AzcHNyF1AasB54vTNEL7/S0FQqEYmuMB4HQP1TmkohI1qSBEPQJ3AU8AbwBPOLu283sXjO7IVjtQaDZzNqB3wfuDrbdDjwC7AB+BNzp7ikAM/sO8BzwLjM7aGa3F/bQpu70xHbqQyi2xpEaggJBpFLk9aewuz8OPD5q2edyXieAT46z7X3AfWMsv2VKJS0BzWNUOtFIiHgswgnVEEQqRsV3KpdS/2CSmirNY1QqTbVVqiGIVBAFQo7+waTuMCqhhtqoOpVFKogCIYfmMSqtpniU7oFh0hqLIFIRFAg5NG1FaTXWRkm503NKtQSRSqBAyNGnqa9LqjFeBcAJ9SOIVAQFQiCVDuYxUg2hZEYGp/WrhiBSCRQIgZMDQ5rHqMQW1lZhaCyCSKVQIAQ6g/vhVUMonUgoRH1NlUYri1QIBUKgsy9zUdJtp6XVqLEIIhVDgRDIjphVp3JpNWosgkjFUCAEOvsHAfUhlFpjPErPqWEGk3pYjki5KRAC2SYjzWNUWk21URx452Si3EURmfcUCIET/UOax6gMstNgH+waKHNJRESBEOjsH9QdRmXQWJsZnHbgxKkyl0REFAiBQycT1FcrEEqtvqaKsBkHVEMQKTsFAjCUTPPG4R6WN9SUuyjzTsiMhbVVHDihQBApNwUC8NbRXoaSaVY2KhDKoak2yoEuNRmJlJsCAXjl4EkAVjbWlrkk81NjvIqDqiGIlJ0CAXj1QDeNtVUjHZxSWo21UTr7h+jWNNgiZaVAIFNDeM/KBsx0y2k5rF1UB8ATrx8pc0lE5rd5HwgDQ0neOtrLRSsXlrso89aqxhrWtsR59KWD5S6KyLw27wNh+zs9pB3es7Kh3EWZt8yMT1yyguf3nNDdRiJlNO8D4ZUDmQ7l96xSDaGc/t0lKwH4/kuHylwSkflLgXCwm+ULq1m8oLrcRZnXVjTUcOXaZr6/7SBwRarkAAAJj0lEQVTuXu7iiMxL8z4QXg06lKX8/v2lK9nXOcCL+7rKXRSReWleB8LJgSH2dQ6ouahCXHfhUmqqwnxPncsiZTGvA+GVg90AbFQNoSLUxSJcf+FSHnvlMCf1FDWRkpvXgfBq0KF8oW45rRi/dVUrg8k0n/rqv3K0R89IECmleR0IL+3vYu2iOPXVGqFcKTauauDrn76Mg10D3PSVZ9l7vL/cRRKZN/IKBDO7zsx2mlm7md09xucxM/tu8PkWM2vN+eyeYPlOM7s2330W2zef3cvPdnbw0fOXlPqrZQwPbdk/8rOvc4Dbrmqls2+IX/vrX/A7336Rv396Nw9t2V/uYorMaZM+AMDMwsCXgI8CB4EXzGyzu+/IWe12oMvd15nZzcD9wKfMbANwM3ABsBz4qZmdG2wz2T6L5tEXD/LHm7fz0Q1L+MNr31WKr5QpWtlYyx3vX8v3tx3i8deP8OMdR7lgeT0A5y1bwHlLF+hxpyIFls//UZcD7e6+G8DMHgZuBHIv3jcCfxK8fhT4G8tMDHQj8LC7DwJ7zKw92B957LOguk8Nc6jrFFv3neBPNm/n6nXN/PUtF1MVntetZhVtcX01v/3BczjSneD5vZ28fOAk//0Hr418XhsNs6A6woLqKmKRENFIiGg4RHVVmOqqEDVVYWpjERZUR6ivriIeDROPRaiLRYhGQmSHO6TdSbuTSoMZVIVP7ysaMarCIcIhI2SGGRiGk9nYPbN97r/ZURQGhEOZbcIhIxIywqFQ8G/mvY3sM7Nddh/Z7QFCwbqhbBlgpBxpz5TEs/+mz/49hsOZ7bPfO96cXe5OMu2kgp9k2jP79dNlyy5PpZyUO6l0
mrRzxu9yZJ20A5ntzSAWCY/8XqsiIaqC8njwexxz/ElwnGT2BH7m72nkdz5q0+whZn5nmeduhCz4vYeNqlDmnIaDzyf6naT99L+ZY0uTTDlDqTRDyTRDqTTpdOb3kU5DNGJEw2FiVZljjVWFiEVmx+N58wmEFcCBnPcHgSvGW8fdk2bWDTQHy/911LYrgteT7bMg0mnnkj/7CScHTs+kecnqBh74zU1UV4WL8ZVSYEsXVnPDRSv4+HuWc3JgmCPdCY71JhgYSpEYzvwMp5zBZJJkKk0y7Qyn0sGyNIPDKZJpDXbLCgUBlXuhTaUzF7z5KjcPij0ucqxcsCDoR/7oCIIwW66WuhhP/9GvFLdg5BcIZWVmdwB3BG/7zGznTPe5D6i786zFLcDxme57lphPxwrz63h1rHOU/bdpH++afFfMJxAOAaty3q8Mlo21zkEziwALgc5Jtp1snwC4+wPAA3mUc0bMbKu7byr291SC+XSsML+OV8c6d5XiePNpQH8BWG9mbWYWJdNJvHnUOpuB24LXNwFPeaZBcDNwc3AXUhuwHng+z32KiEgJTVpDCPoE7gKeAMLA19x9u5ndC2x1983Ag8C3gk7jE2Qu8ATrPUKmszgJ3OnuKYCx9ln4wxMRkXyZZpbMMLM7guapOW8+HSvMr+PVsc5dpTheBYKIiADzfOoKERE5bd4HQrmn0Cg0M1tlZj8zsx1mtt3MPhssbzKzn5jZruDfxmC5mdn/CY7/VTO7pLxHMD1mFjazbWb2WPC+LZhGpT2YViUaLB93mpXZwMwazOxRM3vTzN4wsyvn8rk1s98L/jt+3cy+Y2bVc+XcmtnXzOyYmb2es2zK59LMbgvW32Vmt431Xfma14Fgp6fluB7YANximek2ZrMk8AfuvgF4L3BncEx3A0+6+3rgyeA9ZI59ffBzB/C3pS9yQXwWeCPn/f3AF9x9HdBFZnoVyJlmBfhCsN5s8kXgR+5+HnARmWOek+fWzFYA/wXY5O4XkrkBJTs1zlw4t98Arhu1bErn0syagD8mM7D3cuCPsyEyLZmh6fPzB7gSeCLn/T3APeUuV4GP8Z/JzBm1E1gWLFsG7AxefxW4JWf9kfVmyw+ZcSxPAh8GHiMz68NxIDL6PJO5s+3K4HUkWM/KfQx5HudCYM/o8s7Vc8vpGRCagnP1GHDtXDq3QCvw+nTPJXAL8NWc5WesN9WfeV1DYOxpOVaMs+6sE1SZLwa2AEvc/XDw0REgO83rXPgd/BXwR0B2Jp9m4KS7J4P3ucd0xjQrQHaaldmgDegAvh40j/29mcWZo+fW3Q8BfwnsBw6TOVcvMjfPbdZUz2VBz/F8D4Q5y8zqgO8Bv+vuPbmfeeZPiTlxe5mZfRw45u4vlrssJRABLgH+1t0vBvo53aQAzLlz20hm0ss2MrMlxzm7iWXOKse5nO+BkM+0HLOOmVWRCYN/cPfvB4uPmtmy4PNlwLFg+Wz/HVwN3GBme4GHyTQbfRFosMw0KnDmMY0cr505zcpscBA46O5bgvePkgmIuXpuPwLscfcOdx8Gvk/mfM/Fc5s11XNZ0HM83wNhzk2hYWZGZuT4G+7++ZyPcqcXuY1M30J2+W8FdzG8F+jOqbJWPHe/x91XunsrmfP3lLv/OvAzMtOowNnHO9Y0KxXP3Y8AB8ws+xCPa8jMAjAnzy2ZpqL3mllt8N919njn3LnNMdVz+QTwMTNrDGpUHwuWTU+5O1XK/QP8KvAW8DbwP8pdngIcz/vIVDNfBV4Ofn6VTFvqk8Au4KdAU7C+kbnT6m3gNTJ3dJT9OKZ57B8CHgteryUzb1Y78I9ALFheHbxvDz5fW+5yT/EYNwJbg/P7T0DjXD63wJ8CbwKvA98CYnPl3ALfIdM3Mkym9nf7dM4l8JngmNuBT8+kTBqpLCIigJqMREQkoEAQERFAgSAiIgEFgoiIAAoEEREJKBBEJhDMLvo709y21cxuLXSZRIpFgSAysQZgWoFAZuIy
BYLMGgoEkYn9L+AcM3vZzP7CzP6rmb0QzEn/pwBmdlnwvtrM4sH8/RcG274/2Pb3ynoUInnQwDSRCQQzxj7m7hea2cfITInwn8iMHN0M/G93f9rM/ozMSNkaMvMN/bmZfQj4Q3f/eFkKLzJFkclXEZHAx4KfbcH7OjIPLHkauJfM3FgJMg91EZl1FAgi+TPgz939q2N81kwmIKrI1BT6S1kwkUJQH4LIxHqBBcHrJ4DPBM+awMxWmNni4LOvAv8T+AdOP7oxd1uRiqcagsgE3L3TzH4ZPAj9h8BDwHOZ2ZjpA37DzK4Dht39oeA53c+a2YeBZ4CUmb0CfMPdv1CmwxDJizqVRUQEUJORiIgEFAgiIgIoEEREJKBAEBERQIEgIiIBBYKIiAAKBBERCSgQREQEgP8PyfQIh8tMYAgAAAAASUVORK5CYII=\n",
221 | "text/plain": [
222 | ""
223 | ]
224 | },
225 | "metadata": {
226 | "needs_background": "light"
227 | },
228 | "output_type": "display_data"
229 | }
230 | ],
231 | "source": [
232 | "sns.distplot(dataset.text.apply(lambda text: len(text)), bins=30)"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "## Word count\n",
240 | "\n",
241 | "Check the word count"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 8,
247 | "metadata": {},
248 | "outputs": [
249 | {
250 | "data": {
251 | "text/plain": [
252 | ""
253 | ]
254 | },
255 | "execution_count": 8,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | },
259 | {
260 | "data": {
261 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xt8XVWd9/HP7+SetLk0TUubXukFSbmV1gIiILdaHKSOwgg4wji8BnmAGRX1GXyeB0YZZ4QZZ3BUxhGFEVEEBJUKlTpcFBAoTUtbWkpp6D1tadomaZvbyUl+zx9nB0LI5SQ5Obd836/XeWWfvdfe57ez2/PLWmuvtc3dERERCSU7ABERSQ1KCCIiAighiIhIQAlBREQAJQQREQkoIYiICKCEICIiASUEEREBlBBERCSQnewABmP8+PE+Y8aMZIchIpJWVq9efcDdKwYql1YJYcaMGVRXVyc7DBGRtGJmO2IppyYjEREBlBBERCSghCAiIoASgoiIBJQQREQEUEIQEZGAEoKIiABKCAmlx5WKSCpTQkiQF2sOsOifn+auZ2uSHYqISK+UEBLg/pe289l7X+FwSzv/9vvNVG8/lOyQRETeJ6aEYGZLzGyzmdWY2c29bM8zs4eC7SvNbEawfpGZrQ1e68zsz7vts93MXgu2ZeR8FO7OPzy2gVse28hH5lbwx6+ey5SyQr740FoOt7YnOzwRkfcYMCGYWRZwF3ARUAVcYWZVPYpdA9S7+2zgTuCOYP0GYKG7nwIsAX5oZt3nTzrX3U9x94XDPI+U9FbdUe57aQefOW0ad1+1kGNK8rnz06ewt7GVW3+zIdnhiYi8Ryw1hEVAjbtvdfcw8CCwtEeZpcB9wfIjwPlmZu7e7O6RYH0+MKp6VVfvqAfgc2fOJCtkACyYXsbfnTeH36zdw2/X7UlmeCIi7xFLQqgEdnV7vztY12uZIAE0AuUAZnaamW0EXgOu65YgHPi9ma02s2uHfgqpa82OBkoKcjh2fNF71t9w7ixOqCzm9t+9QTjSmaToRETea8Q7ld19pbvPAz4IfM3M8oNNH3b3U4k2Rd1gZmf3tr+ZXWtm1WZWXVdXN9LhxtWanfXMn1ZKKKgddMnOCvGVxcdR29DCw9W7+thbRCSxYkkItcDUbu+nBOt6LRP0EZQAB7sXcPdNwFHghOB9bfBzP/Brok1T7+Pud7v7QndfWFEx4PMdUkZjSztb9h/l1GllvW4/Z24FC6eX8b1nttDa3pHg6ERE3i+WhLAKmGNmM80sF7gcWNajzDLg6mD5UuAZd/dgn2wAM5sOfADYbmZFZjY2WF8ELCbaAZ0x1u1qAOgzIZgZX158HG8fbuPnK3cmMjQRkV4NmBCCNv8bgRXAJuBhd99oZreZ2SVBsXuAcjOrAW4Cum5N/TCwzszWEq0FXO/uB4CJwAtmtg54BXjC3Z+M54kl25qd9ZjByVNL+ixzxqxyPjSrnB/8oYamtkif5UREEiGmR2i6+3JgeY91t3ZbbgUu62W/+4H7e1m/FTh5sMGmkzU7G5g7YSxj83P6LfflxXP51A9e4icvbueGc2cnKDoRkffTSOUR0NnpvLqznlOnlw5YdsH0cVxw/AT+89ka9h9uTUB0IiK9U0IYAVsPHOVIa4T5ffQf9PT//qyK9g7n9iffGOHIRET6poQwAtbs6L9DuacZ44u45qyZ/GpNLWt21o9kaCIifVJCGAFrdtb3OiCtPzeeO5uJxXl8fdlGOjtH1YBuEUkRSggjoK8Baf0pysvmaxcdz/rdjfxytQariUjiKSHE2eHW/gek9WfpKZP54Iwy/nn5G7ytDmYRSTAlhDh7a/9R3KFqUvGg9zUz7vjUSbRFOvj7R9frCWsiklBKCHG2pyH6l31lWcGQ9j+2Ygw3L/kAf9hcx0Or1HQkIomjhBBnextbAJhcOrSEAHDVGTP40Kxy/vHx19l1
qDleoYmI9EsJIc5qG1ooys2iOD+mQeC9CoWMf7n0pOh8R79cp7uORCQhlBDibE9DC5NLCzCL/Q6j3kwpK+TWj1fxyrZD3PPCtjhFJyLSNyWEONvT0Dqs5qLuLlswhQurJvKvKzazed+RuBxTRKQvSghxtrexhcml+QMXjIGZ8a1PnkhxQTZffGgtbRE9N0FERo4SQhy1tndw4GiYySXxqSEAjB+Tx+2fPIlNew/znae2xO24IiI9KSHE0d7G6C2n8Woy6nJB1UT+YuEU7n5uK2++raYjERkZSghxtLchesvppDg1GXX3tYuOZ2x+Nrc+tkED1kRkRCghxFFtkBAq41xDACgryuUri4/j5a2HeHz93rgfX0RECSGOukYpH1MS/xoCwBWLpjFvcjH/9MQmPXJTROJOCSGO9ja2MH5MHnnZWSNy/KyQcdvSE9h3uJXvPVMzIp8hIqOXEkIc1Ta0UDkC/QfdLZhexidPreTeF7bpkZsiElcxJQQzW2Jmm82sxsxu7mV7npk9FGxfaWYzgvWLzGxt8FpnZn8e6zHTUdco5ZH2d+fNIdLZyb1/2j7inyUio8eACcHMsoC7gIuAKuAKM6vqUewaoN7dZwN3AncE6zcAC939FGAJ8EMzy47xmGnF3dnT0MqkOI5B6MuM8UV87MRJ/PzlHRxubR/xzxOR0SGWGsIioMbdt7p7GHgQWNqjzFLgvmD5EeB8MzN3b3b3rt7PfKDrfslYjplWGlvaaWnviNso5YFcd84sjrRF+PnLOxPyeSKS+WKZkrMS6D4x/27gtL7KuHvEzBqBcuCAmZ0G3AtMBz4bbI/lmGlluLecPrBy8F/scyaM4a5nayjMzSIna3DdQVeeNm3QnycimW3EO5XdfaW7zwM+CHzNzAb1J7SZXWtm1WZWXVdXNzJBxkHXLaeTEtCH0OXsuRUcbYuwZmd9wj5TRDJXLAmhFpja7f2UYF2vZcwsGygBDnYv4O6bgKPACTEes2u/u919obsvrKioiCHc5Hj3wTiJaTICOHZ8EVPKCnh+ywE6NXpZRIYploSwCphjZjPNLBe4HFjWo8wy4Opg+VLgGXf3YJ9sADObDnwA2B7jMdNKbUMLuVkhxhflJewzzYwzZ43nUFOYbQeaEva5IpKZBuxDCNr8bwRWAFnAve6+0cxuA6rdfRlwD3C/mdUAh4h+wQN8GLjZzNqBTuB6dz8A0Nsx43xuCbWnoZVjSvIJhYb3YJzBqppcTH5OiDU76plVMSahny0imSWm5zy6+3JgeY91t3ZbbgUu62W/+4H7Yz1mOtvbEL/nIAxGTlaIEytLWburnkvaJ5OXMzKjpEUk82mkcpwkalBabxZMK6W9w9mw53BSPl9EMoMSQhxEOjrZd7g1rg/GGYyp4wopL8rV3UYiMixKCHHw9pE2Oj3+D8aJlZmxYHoZ2w40cagpnJQYRCT9KSHEwb7GkXswTqxOmVqKAa+qliAiQ6SEEAcHjkb/Kq8Yk7hbTnsqLcxlVsUY1uys15gEERkSJYQ46GqmKSvKTWoc86eVUt/cTm19S1LjEJH0pIQQB10JYVxhchPC8ZOKyTLjtdrGpMYhIulJCSEODjWFKcjJoiA3uWMA8nOymD1hDBv2NOJqNhKRQVJCiIP6pjDjktxc1OWEyhIamtvfmX1VRCRWSghxcDCFEsLxk8YSMthQq0FqIjI4SghxUN+cOgmhMDebWRVqNhKRwVNCiINDKVRDgGiz0aGmMHsbW5MdioikESWEOEi1hFA1qTjabLRHdxuJSOyUEIaptb2D5nBHSiWEorxsZo4vYkOtmo1EJHZKCMP0zhiEFEoIEG02OnA0zNuH25IdioikCSWEYXpnlHKSB6X1VDWpGAPW1zYkOxQRSRNKCMOUqjWEsfk5HFtRxGu71WwkIrFRQhim+ubUTAgAJ1WWclB3G4lIjJQQhung0dRNCPMmR+82Wr9bdxuJyMCUEIapvjlMyKCkICfZobxPYV42syeM
4bXaBjUbiciAsmMpZGZLgP8AsoAfu/vtPbbnAT8FFgAHgU+7+3YzuxC4HcgFwsBX3f2ZYJ8/AJOArkl3Frv7/mGfURw9sHLngGVWbj1Efk4WD63alYCIBu/EyhIeXVPL7voWpo4rTHY4IpLCBqwhmFkWcBdwEVAFXGFmVT2KXQPUu/ts4E7gjmD9AeDj7n4icDVwf4/9PuPupwSvlEoGsWoKRyjKiymvJkXVpBJNiS0iMYmlyWgRUOPuW909DDwILO1RZilwX7D8CHC+mZm7v+rue4L1G4GCoDaRMZrDHRQledrr/hTkZjFn4hheq23Uk9REpF+xJIRKoHt7yO5gXa9l3D0CNALlPcp8Cljj7t1HSv23ma01s1vMzAYVeYpoakvtGgJEm40aW9rZfrAp2aGISApLSKeymc0j2oz0+W6rPxM0JZ0VvD7bx77Xmlm1mVXX1dWNfLCD1BTuoDA3tRPCvMkl5OeEeGXboWSHIiIpLJaEUAtM7fZ+SrCu1zJmlg2UEO1cxsymAL8GrnL3t7p2cPfa4OcR4AGiTVPv4+53u/tCd19YUVERyzklTKc7LeEIRXmp22QEkJsdYv7UMjbWHuZoWyTZ4YhIioolIawC5pjZTDPLBS4HlvUos4xopzHApcAz7u5mVgo8Adzs7n/qKmxm2WY2PljOAS4GNgzvVBKvtb2DToeiFK8hACyaOY4Od1bvqE92KCKSogZMCEGfwI3ACmAT8LC7bzSz28zskqDYPUC5mdUANwE3B+tvBGYDtwZ9BWvNbAKQB6wws/XAWqI1jB/F88QSoamtAyDlawgAE4vzmTm+iFe2HVTnsoj0KqY/bd19ObC8x7pbuy23Apf1st83gW/2cdgFsYeZmprD0eaXVO9D6HLazHE8uGoXW94+muxQRCQFaaTyMDQF7fGpfpdRl6rJxYzJy+aVbQeTHYqIpCAlhGFoCgdNRik8DqG77FCIhdPLeGPfEWobWgbeQURGFSWEYeiqIaRLkxFEO5fN4EfPbU12KCKSYpQQhqE53EFOlpGbnT6/xtLCXOZPK+MXr+xk/xFNiy0i70qfb7IUlA6jlHvzkbkVtHd0qpYgIu+hhDAMTeFIWoxB6Kl8TB6fOKWSn728k4NH9cxlEYlSQhiG5nAHhWnSodzTDefNpjXSwY9f2JbsUEQkRSghDEO6NhkBzKoYw8UnTeanL26nPngutIiMbkoIw9CU4lNfD+Rvz5tNc3sH33+2JtmhiEgKUEIYovaOTsKRzrStIQDMnTiWyz84jfte3E7N/iPJDkdEkkwJYYiag0Fp6TQGoTdfWTyXgtwsvvHb1/XcZZFRTglhiN6dtiJ9m4wgesfRTRfO5fktB/if199OdjgikkRKCEPUlGYT2/XnL0+fztyJY/jHJ16ntb0j2eGISJIoIQzRu01G6V1DAMjJCvEPH5/HrkMt/OuKzckOR0SSRAlhiFoyKCEAnDl7PFedMZ17XtjGo6t3JzscEUkCJYQhagmaVgpyMiMhANxycRVnHFvO1379Gmt3NSQ7HBFJMCWEIWoJJrbLzsqcX2FOVoi7PnMqE4vzuPan1bx9WJPfiYwmmfNtlmAt4Y6Mqh10GVeUy4+v+iBNbRGu+9lq2iLqZBYZLZQQhqilvSMj7jDqzXHHjOXf/uJkXt3ZwD88tlHjE0RGCSWEIWoOd5CfgTWELktOmMSN587mwVW7+PnKnckOR0QSIKaEYGZLzGyzmdWY2c29bM8zs4eC7SvNbEaw/kIzW21mrwU/z+u2z4JgfY2ZfdfMLF4nlQit7ek702msvnThXM49roJv/HYj1dsPJTscERlhAyYEM8sC7gIuAqqAK8ysqkexa4B6d58N3AncEaw/AHzc3U8Ergbu77bPD4C/AeYEryXDOI+Eaw5HMrIPobuskPGdy+czsTifWx7bSGenmo5EMlksNYRFQI27b3X3MPAgsLRHmaXAfcHyI8D5Zmbu/qq77wnWbwQKgtrEJKDY3V/2
aAP1T4FPDPtsEqilvYOCDK8hAJQU5PDlxXPZtPcwKzbuS3Y4IjKCYkkIlcCubu93B+t6LePuEaARKO9R5lPAGndvC8p3H/3U2zEBMLNrzazazKrr6upiCHfkRTo6ae/wUZEQAC45uZJjK4q486k3VUsQyWAJ6VQ2s3lEm5E+P9h93f1ud1/o7gsrKiriH9wQZOKgtP5khYwvXjCXN98+yhOv7U12OCIyQmJJCLXA1G7vpwTrei1jZtlACXAweD8F+DVwlbu/1a38lAGOmbK6pq0YLTUEgItPnMTciWP4zlNv0qFagkhGiuVG+lXAHDObSfRL+3Lgyh5llhHtNH4JuBR4xt3dzEqBJ4Cb3f1PXYXdfa+ZHTaz04GVwFXA94Z9NgmSCTWEB4ZwK+nC6eN44JWd3PzoeuZPKxvUvleeNm3QnyciiTVgDSHoE7gRWAFsAh52941mdpuZXRIUuwcoN7Ma4Cag69bUG4HZwK1mtjZ4TQi2XQ/8GKgB3gJ+F6+TGmmZNrFdrKomF3NMcT5/fLNOg9VEMlBMQ23dfTmwvMe6W7sttwKX9bLfN4Fv9nHMauCEwQSbKpozoIYwFCEzzpxdzqNratl6oIlZFWOSHZKIxJFGKg/BaOxD6HLSlFKKcrN4seZAskMRkThTQhiCrj6ETJ66oi85WSEWzRzHG/uOcKgpnOxwRCSOlBCGoCXcQX5OiFB6zbYRN6fNLMcMXnpLtQSRTKKEMAQt7Zk59XWsigtyOKGyhOod9ZoeWySDKCEMQUt4dExb0Z8PzRpPW6STNTv1ZDWRTKGEMAQt7R0U5mTmsxBiNW1cIVPLCvhTzQENVBPJEEoIQ9Ac7iB/lNcQAM49bgKHmsK8urM+2aGISBwoIQxBtIaghHDcMWOZUlbAs5v3E+nsTHY4IjJMSgiD5O60hCOjvg8BwMw4/wMTqG9u59Ud6ksQSXdKCIMU7uik00ffKOW+zJ04lqmqJYhkBCWEQRrNo5R7Y2acf/xEGlraWb1DfQki6UwJYZAyYabTeJszYQzTxhXy7Bv7CUdUSxBJV0oIg6QawvuZGRedcAyHWyM8tyU1nmonIoOnhDBIzWHVEHozvbyIEytLeH5LHY0t7ckOR0SGQAlhkFrbR+ezEGKxZN4xuMOKjfuSHYqIDIESwiCphtC3sqJczpw9nrW7Gth1qDnZ4YjIICkhDFJLewchg9xs/ep6c87cCsbkZbP8tb16qppImtG32iC1hKMzndoonfp6IPk5WVxw/ER2HGrmjX1Hkh2OiAyCEsIgtbR3UJA7uie2G8iC6WWUF+Xy+9f30alagkjaUEIYpGgNQb+2/mSFjAurJvL24TbW7dKUFiLpIqZvNjNbYmabzazGzG7uZXuemT0UbF9pZjOC9eVm9qyZHTWz7/fY5w/BMdcGrwnxOKGR1tLeQaFqCAM6obKEySX5PLXpbU1pIZImBkwIZpYF3AVcBFQBV5hZVY9i1wD17j4buBO4I1jfCtwCfKWPw3/G3U8JXvuHcgKJFm0y0h1GAwmZsXjeMdQ3t7Nq26FkhyMiMYilhrAIqHH3re4eBh4ElvYosxS4L1h+BDjfzMzdm9z9BaKJISM0hyPk65bTmMyZMIaZ44t4dnMdzeFIssMRkQHEkhAqgV3d3u8O1vVaxt0jQCNQHsOx/ztoLrrF+rhtx8yuNbNqM6uuq0vutAid7rS1d2pQWozMjMVVEznaFuFnL+9IdjgiMoBk9o5+xt1PBM4KXp/trZC73+3uC919YUVFRUID7Km1vQNHg9IGY3p5EbMnjOGHf9yqWoJIioslIdQCU7u9nxKs67WMmWUDJcDB/g7q7rXBzyPAA0SbplKaJrYbmgs+MIGDTWF++pJqCSKpLJaEsAqYY2YzzSwXuBxY1qPMMuDqYPlS4BnvZ5iqmWWb2fhgOQe4GNgw2OATTVNfD8208iLOnlvB3c9tpalNtQSRVDVgQgj6BG4EVgCbgIfdfaOZ3WZmlwTF7gHK
zawGuAl459ZUM9sO/DvwV2a2O7hDKQ9YYWbrgbVEaxg/it9pjYyuGoL6EAbvSxfM4VBTmPte2p7sUESkDzHdUO/uy4HlPdbd2m25Fbisj31n9HHYBbGFmDqagxqC7jIavPnTyvjIcRX86Lmt/OXp0ynOz0l2SCLSg4bcDoJqCMPzlcXHUd/czn/94a1khyIivVBCGISuu2Q0UnloTqgs4c/nV3LPC9vY09CS7HBEpAclhEFoCneQnxMiK6SZTofqy4vn4sC3f7852aGISA9KCIPQ3BZR7WCYppQV8tdnzuTXr9ayobYx2eGISDdKCIPQHO5Q/0EcXH/uLEoLcvinJzbpIToiKUQJYRCawx0UqYYwbMX5OXzh/Dm8tPUgy9btSXY4IhJQQhiEpnBENYQ4+ewZMzhlailfX7aRuiNtyQ5HRFBCGBQ1GcVPVsj410tPoqmtg1sfS/lB6iKjghJCjNo7OglHOinKU5NRvMyZOJYvXjiH323YxxPr9yY7HJFRTwkhRs2a2G5EXHvWsZw0pYRbHtvAgaNqOhJJJiWEGHUNSlOncnxlZ4X49mUnc7Qtws2Pvqa7jkSSSAkhRs2atmLEzJ04lr9f8gGe2vQ2D67aNfAOIjIilBBi1DVtc6H6EEbE5z40gw/PHs9tv32dbQeakh2OyKikhBCjrhpCkWoIIyIUMr592cnkZof44oOv0t7RmeyQREYdJYQYdfUhqFN55BxTks+3Pnki63Y38r2ntyQ7HJFRRwkhRs3hDvKyQ2SH9CsbSR87cRKXLpjC95+t4ZVth5Idjsioom+3GDWHOzQGIUG+fsk8po4r5EsPraWxpT3Z4YiMGkoIMWpq07QViTImL5vvfPoU9h1u1ShmkQRSQoiRpq1IrPnTyvji+XN4bO0efvHKzmSHIzIqxJQQzGyJmW02sxozu7mX7Xlm9lCwfaWZzQjWl5vZs2Z21My+32OfBWb2WrDPd80spZ860xyOaFBagl1/7mzOmVvBLb/ZwJ9qDiQ7HJGMN2BCMLMs4C7gIqAKuMLMqnoUuwaod/fZwJ3AHcH6VuAW4Cu9HPoHwN8Ac4LXkqGcQKI0qYaQcFkh4/tXzmdWxRiu+9lqavYfSXZIIhktlhrCIqDG3be6exh4EFjao8xS4L5g+RHgfDMzd29y9xeIJoZ3mNkkoNjdX/boXAU/BT4xnBMZSZFgYjsNSku8sfk53PNXC8nLDvHXP6nWfEciIyiWb7hKoPt8AruB0/oq4+4RM2sEyoG+6vmVwXG6H7MyloCTQdNWDN8DK4fXD3DZgqn8+IWtfPTO5/jcmTMZV5Tbb/krT5s2rM8TGY1SvlPZzK41s2ozq66rq0tKDE3BoDQ9Tzl5po6LPou5OdzBf/3xLWobWpIdkkjGiSUh1AJTu72fEqzrtYyZZQMlwMEBjjllgGMC4O53u/tCd19YUVERQ7jxp2krUsP08iI+f/axZIeMHz2/lddqG5MdkkhGiSUhrALmmNlMM8sFLgeW9SizDLg6WL4UeMb7mcfY3fcCh83s9ODuoquAxwYdfYK802SkPoSkm1Ccz3XnzKJiTB6/eGUnD6zcwZFWDV4TiYcBv+GCPoEbgRVAFnCvu280s9uAandfBtwD3G9mNcAhokkDADPbDhQDuWb2CWCxu78OXA/8BCgAfhe8UtI7M52qhpASigtyuO6cWbywpY6n39jPW3VbuLBqIgunl5GdlfKtoCIpK6Y/ed19ObC8x7pbuy23Apf1se+MPtZXAyfEGmgyqVM59WSFjHOOm8Dxk4v5zat7WLZuD398s46PHFfBgmllyQ5PJC2pDSQGzeGIJrZLURPG5vM3Z82kpu4oT2/az2Nr9/DClgNMH1/EOXOT0+ckkq70DRcDTVuR2syMORPG8vmzj+WvPjQDM7j63lf4Xz9bzR7djSQSM9UQYtAcjmim0zRgZsydOJa/O28OR9oifO+ZLbz41kHu/PTJnPeBickOTyTlqYYQg6Y21RDS
SXZWiBvOnc2TXzibytIC/von1Xx7xWY6Ovu88U1EUEKISXM4okFpaWjG+CJ+df2HuPyDU/n+szV89p6VHNTUFyJ9UkKIQXO4Q4PS0lR+Tha3f+ok/uXSk6jeUc/Hv/cC63c3JDsskZSkhDCASEcnbZrYLu39xcKpPHrdhzAzLv2vl3i4etfAO4mMMkoIA9AYhMxx4pQSfvu3H+aDM8r434+s59//5036GVAvMuooIQzg3YSgGkImGFeUy08+t4jLFkzhu09v4e8fXU97R2eywxJJCfqWG0DXTKfqQ8gcOVkh/uXSk5hUks93n6nhwNEw//WXC8jN1t9HMrrpf8AAVEPITGbGTYuP4x8/cQLPvLGfmx5eS6duS5VRTt9yA2juehZCnmoImeizp0+nuS3Ct373BuVFuXz9knmk+OO9RUaMEsIAjrRGMNSpnMk+f84sDjaFufu5rYwryuMLF8xJdkgiSaGEMICG5nbG5mdrYrs0M9hHdk4fV8ip08q486k32dvYwklTSge1vx7ZKZlA33IDaGgOU1rY//N7Jf2ZGZ+YP5lp4wr51Zpa9h1uTXZIIgmnhDCAhpZ2Sgtzkh2GJEB2KMSVi6aRlx3i5y/voCW4oUBktFBC6EenO43N7ZQWqIYwWhQX5HDladOobw7zcPUuOjVwTUYRJYR+HGmN0OGuGsIoM728iD87cRKb3z7Cy1sPJjsckYRRQuhHQ3MYgDIlhFHn9GPLmTtxDCs27uPAEc2QKqODEkI/GprbAdSpPAqZGZ+cP4XsUIhfrt6lZynIqBBTQjCzJWa22cxqzOzmXrbnmdlDwfaVZjaj27avBes3m9lHu63fbmavmdlaM6uOx8nEW1cNQU1Go1NxQQ6XnDKZXfUtPL+lLtnhiIy4AROCmWUBdwEXAVXAFWZW1aPYNUC9u88G7gTuCPatAi4H5gFLgP8MjtflXHc/xd0XDvtMRkB9SzsFOVnkZWtQ2mh18pRSTqws4elN+3lbt6JKhoulhrAIqHH3re4eBh4ElvYosxS4L1h+BDjfouP/lwIPunubu28DaoLjpYWG5rD6D4RLTp5MbnaIx9bu0XTZktFiSQiVQPeniewO1vVaxt0jQCNQPsC+DvzezFab2bWDD33kNTQyaYExAAAMY0lEQVS3q/9AKMrLZsm8Y9h+sIm1u/S0NclcyexU/rC7n0q0KeoGMzu7t0Jmdq2ZVZtZdV1d4tpx3Z2G5nbVEASABTPKmFpWwPIN+zRgTTJWLAmhFpja7f2UYF2vZcwsGygBDva3r7t3/dwP/Jo+mpLc/W53X+juCysqKmIINz4amtsJd3SqhiAAhMxYekolzW0Rfv/6vmSHIzIiYkkIq4A5ZjbTzHKJdhIv61FmGXB1sHwp8IxHG1uXAZcHdyHNBOYAr5hZkZmNBTCzImAxsGH4pxM/u+tbAN1hJO+aXFrA6ceW88q2Q9QG/z5EMsmACSHoE7gRWAFsAh52941mdpuZXRIUuwcoN7Ma4Cbg5mDfjcDDwOvAk8AN7t4BTAReMLN1wCvAE+7+ZHxPbXhqG5oBjUGQ97qwaiKFedn8dv0eTWshGSem6a/dfTmwvMe6W7sttwKX9bHvPwH/1GPdVuDkwQabSF01hLIC1RDkXfk5WSyZN5FH19SyblcD86eVJTskkbjRSOU+1Da0kJsVokAPxpEe5k8rY0pZAU9u2EdbuzqYJXMoIfShtr6F0sIcPU5R3idkxsdPmsyRtgjPbN6f7HBE4kYJoQ+1DS3qUJY+TR1XyIJpZbxYc5A6TX4nGUIJoQ/RhKAOZenb4nkTyc4yHl+vEcySGZQQetHUFokOSlOHsvRjbH4OFxw/kS37j/I/r7+d7HBEhk0JoRe1DV1jEFRDkP6dfmw5E8bmcdvjr9OqDmZJc0oIvajVoDSJUVbI+PjJk9ld38IP/7g12eGIDIsSQi92q4YggzCrYgx/dtIk/vMPNew61JzscESGTAmhF9sPNJGbFWJsfkzj
9kT4vx87nqyQ8f9+s0EdzJK2lBB68eJbBzl1eikhjUGQGE0uLeCrHz2OP75Zx2/W9pz7USQ9KCH0sP9IK5v2HubsuYmbWVUyw1VnzGD+tFJu++3rHDyqsQmSfpQQenj+zQMAnD1HCUEGJytk3PGpkzjaFuG2x19Pdjgig6aE0MNzW+oYPyaXqknFyQ5F0tDciWO54dzZPLZ2Dys26rkJkl6UELrp7HSe33KAs+ZUEAqp/0CG5vqPzObEyhK+/PA6Nu87kuxwRGKmhNDN63sPc6gpzFlzxic7FEljudkh7r5qAYW5WVxz3yr1J0jaUELo5o9vRp/ZfJb6D2SYJpUU8KOrFlJ3pI3rfraatohGMUvqU0Lo5rk366iaVEzF2LxkhyIZ4OSppXz7spNZtb2ev/npahqaw8kOSaRfSgiBo20RVu+o1+2mElcfP3kyt3/yRF5+6yAXf+8FNtQ2JjskkT4pIQRefusgkU7n7LnqP5D4unzRNH553Rl0djqf/MGL/OAPb9HUFkl2WCLvo7kZgI5O5+crd1CYm8WC6XpGrgzeAyt3Dljmc2fO5NE1u7njyTf47tNbOHN2OafPLKcwb/D/Da88bdpQwhTpV0w1BDNbYmabzazGzG7uZXuemT0UbF9pZjO6bftasH6zmX001mMmirvzf371Gs9uruOrHz2OvGw9Q1lGRlFeNledMYP/dc4sppcX8tSm/fzz7zZx7wvbeHnrQerVxyBJNuCfJmaWBdwFXAjsBlaZ2TJ37z4U8xqg3t1nm9nlwB3Ap82sCrgcmAdMBp4ys7nBPgMdMyFuf/INHqrexd+eN5vPnTkz0R8vo9DUcYVcdcYM9jW2sm53Axv3NLJs3R5YByUFOUwvL2TauEKOKclnUnEBBbn6I0USI5a66iKgxt23ApjZg8BSoPuX91Lg68HyI8D3Lfp0+qXAg+7eBmwzs5rgeMRwzLjr7HTCHZ1sP9jE2p0NvFBzgMfX7+Wzp0/npgvnDnwAkTg6piSfY0qOYXHVRPYfaWNr3VG2H2xm24Em1u9+t/O5OD+b0sJcSgtzKM7PoSA3i0hnJ2PzsynIySI/eOVmh8jNCpGXHSI7K0R2yMgKGSEzQgZmRtd8jUb0PURryZ0e/dnhTqTDiXQ6kY5OIp1OR6fTGczgakSPETIjO8vIDhnZoRA52cHPLMO6fV6nOx4c2x265oHt2h4yusUYfR8K4rRuk0t237/rmJ0DzCrb2W0fd3/ns7vO3Xj3s/rSda7df4eZLJaEUAns6vZ+N3BaX2XcPWJmjUB5sP7lHvtWBssDHTNuLv7e87yx9wiRzvf+AyotzOGvPjSDWy+uyvgLLanLzJhYnM/E4nzOmBX98jrSFmFfYyv7GlupO9JGfUuY2voWNrUepr3D9cjOJOlKEFlmYF3JJZoouzjdkp+/933PqdHfSZ70frzuX0trbrmQ/JyRrS2mfKeymV0LXBu8PWpmm+N17B3AOuAb7109HjgQr89IATqf1JZp5wOZd04pcT4F/zis3afHUiiWhFALTO32fkqwrrcyu80sGygBDg6w70DHBMDd7wbujiHOuDCzandfmKjPG2k6n9SWaecDmXdOmXY+/YnlLqNVwBwzm2lmuUQ7iZf1KLMMuDpYvhR4xqN1o2XA5cFdSDOBOcArMR5TREQSaMAaQtAncCOwAsgC7nX3jWZ2G1Dt7suAe4D7g07jQ0S/4AnKPUy0szgC3ODuHQC9HTP+pyciIrEyPf/1vczs2qCZKiPofFJbpp0PZN45Zdr59EcJQUREAM1lJCIiASWEQKpMpTEcZjbVzJ41s9fNbKOZfSFYP87M/sfMtgQ/02bCJjPLMrNXzezx4P3MYHqUmmC6lNxkxzgYZlZqZo+Y2RtmtsnMzkjz6/Ol4N/aBjP7hZnlp9M1MrN7zWy/mW3otq7X62FR3w3Oa72ZnZq8yEeGEgLvmZ7jIqAKuCKYdiPdRIAvu3sVcDpwQ3AeNwNPu/sc4Ong
fbr4ArCp2/s7gDvdfTZQT3TalHTyH8CT7v4B4GSi55aW18fMKoG/Axa6+wlEbxDpmromXa7RT4AlPdb1dT0uInqn5ByiY6N+kKAYE0YJIeqd6TncPQx0TaWRVtx9r7uvCZaPEP2yqSR6LvcFxe4DPpGcCAfHzKYAfwb8OHhvwHlEp0eBNDoXADMrAc4melce7h529wbS9PoEsoGCYPxRIbCXNLpG7v4c0Tsju+vreiwFfupRLwOlZjYpMZEmhhJCVG/Tc1T2UTYtWHTG2fnASmCiu+8NNu0DJiYprMH6DvC/gc7gfTnQ4O5dDxNIt+s0E6gD/jtoBvuxmRWRptfH3WuBbwM7iSaCRmA16X2NoO/rkXHfEz0pIWQgMxsDPAp80d0Pd98WDBhM+VvLzOxiYL+7r052LHGUDZwK/MDd5wNN9GgeSpfrAxC0rS8lmugmA0W8v/klraXT9YgHJYSoWKbnSAtmlkM0Gfzc3X8VrH67q2ob/NyfrPgG4UzgEjPbTrQJ7zyi7e+lQfMEpN912g3sdveVwftHiCaIdLw+ABcA29y9zt3bgV8RvW7pfI2g7+uRMd8TfVFCiMqIqTSCNvZ7gE3u/u/dNnWfWuRq4LFExzZY7v41d5/i7jOIXo9n3P0zwLNEp0eBNDmXLu6+D9hlZscFq84nOoo/7a5PYCdwupkVBv/2us4nba9RoK/rsQy4Krjb6HSgsVvTUmaIzjOuF/Ax4E3gLeD/JjueIZ7Dh4lWb9cDa4PXx4i2vT8NbAGeAsYlO9ZBntdHgMeD5WOJzodVA/wSyEt2fIM8l1OA6uAa/QYoS+frQ3Sy4DeADcD9QF46XSPgF0T7P9qJ1uCu6et6EJ2d+q7gO+I1ondXJf0c4vnSSGUREQHUZCQiIgElBBERAZQQREQkoIQgIiKAEoKIiASUEET6EcxOev0Q951hZlfGOyaRkaKEINK/UmBICQGYASghSNpQQhDp3+3ALDNba2b/amZfNbNVwXz43wAwsw8G7/PNrCh4PsAJwb5nBft+KalnIRIDDUwT6Ucwa+zj7n6CmS0mOiXD54mOWl0G/Iu7P2dm3wTygQKi8xV9y8w+AnzF3S9OSvAig5Q9cBERCSwOXq8G78cQfVjKc8BtROfEaiX60BiRtKOEIBI7A77l7j/sZVs50QSRQ7Sm0JTIwETiQX0IIv07AowNllcAfx08bwIzqzSzCcG2HwK3AD8n+gjJnvuKpDzVEET64e4HzexPwUPYfwc8ALwUne2Zo8BfmtkSoN3dHwiez/2imZ0HPA90mNk64CfufmeSTkMkJupUFhERQE1GIiISUEIQERFACUFERAJKCCIiAighiIhIQAlBREQAJQQREQkoIYiICAD/H27qECVGcYpoAAAAAElFTkSuQmCC\n",
262 | "text/plain": [
263 | ""
264 | ]
265 | },
266 | "metadata": {
267 | "needs_background": "light"
268 | },
269 | "output_type": "display_data"
270 | }
271 | ],
272 | "source": [
273 | "sns.distplot(dataset.text.apply(lambda text: len(text.split())), bins=10)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": []
282 | }
283 | ],
284 | "metadata": {
285 | "kernelspec": {
286 | "display_name": "Python 3",
287 | "language": "python",
288 | "name": "python3"
289 | },
290 | "language_info": {
291 | "codemirror_mode": {
292 | "name": "ipython",
293 | "version": 3
294 | },
295 | "file_extension": ".py",
296 | "mimetype": "text/x-python",
297 | "name": "python",
298 | "nbconvert_exporter": "python",
299 | "pygments_lexer": "ipython3",
300 | "version": "3.6.8"
301 | }
302 | },
303 | "nbformat": 4,
304 | "nbformat_minor": 2
305 | }
306 |
--------------------------------------------------------------------------------