├── datasets
└── .gitkeep
├── models
└── .gitkeep
├── nlp
├── __init__.py
├── dataset.py
└── utils.py
├── requirements.txt
├── requirements.gpu.txt
├── .env.sample
├── .gitignore
├── README.md
├── query_relations.json
└── notebooks
├── Fetch Tweets.ipynb
├── Validate API Data.ipynb
├── Predict Emotion.ipynb
├── Train Sentiment Analysis.ipynb
├── Train Emotion Recognition Model.ipynb
├── Sentiment Analysis Score.ipynb
├── Emotion Recognition Model Validation.ipynb
└── Check Emotion Labeled Dataset.ipynb
/datasets/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | nltk.download('stopwords')
3 |
4 | from .dataset import Dataset
5 | from .utils import preprocess
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dotenv==0.1.0
2 | tweepy==3.5.0
3 | jupyter==1.0.0
4 | tensorflow==1.14.0
5 | pandas==0.24.2
6 | python-dotenv==0.1.0
7 | nltk==3.4.3
8 | scikit-learn==0.21.2
9 | emoji==0.5.2
10 | seaborn==0.9.0
11 | tqdm==4.32.2
12 | matplotlib==3.0.3
--------------------------------------------------------------------------------
/requirements.gpu.txt:
--------------------------------------------------------------------------------
1 | python-dotenv==0.1.0
2 | tweepy==3.5.0
3 | jupyter==1.0.0
4 | tensorflow-gpu==1.14.0
5 | pandas==0.24.2
6 | python-dotenv==0.1.0
7 | nltk==3.4.3
8 | scikit-learn==0.21.2
9 | emoji==0.5.2
10 | seaborn==0.9.0
11 | tqdm==4.32.2
12 | matplotlib==3.0.3
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | CONSUMER_KEY=consumer-key.get-from-https://developer.twitter.com/
2 | CONSUMER_SECRET=consumer-secret.get-from-https://developer.twitter.com/
3 | ACCESS_TOKEN=access_token.get-from-https://developer.twitter.com/
4 | ACCESS_TOKEN_SECRET=access_token_secret.get-from-https://developer.twitter.com/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .DS_STORE
3 | .env
4 |
5 | # Python related
6 | __pycache__
7 | .ipynb_checkpoints
8 |
9 | # Dataset files
10 | datasets/**/*.csv
11 | datasets/**/*.pickle
12 |
13 | # Model files
14 | models/**/*.h5
15 | models/**/*.pickle
16 |
17 | # Tensorboard logs
18 | models/**/logs/**/*.*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Emotion From Tweet
2 |
3 | This repository contains the source code for the article
4 | **From Sentiment Analysis to Emotion Recognition: A NLP story**
5 |
6 | ## Setup
7 |
8 | 1. Install the dependencies (use ***requirements.gpu.txt*** instead of
9 | *requirements.txt* when using GPU processing):
10 |
11 | ```bash
12 | pip install -r requirements.txt
13 | ```
14 | 1. Create a `.env` file:
15 |
16 | ```bash
17 | cp .env.sample .env
18 | ```
19 | 1. Set the environment variables inside the created `.env` file
20 |
21 | ## Running
22 |
23 | 1. Start the jupyter notebook:
24 |
25 | ```bash
26 | jupyter notebook
27 | ```
28 | 1. Go to the `notebooks` folder
29 | 1. Open and run the notebook you want
30 |
31 | ***Note***: *Check the releases if you want the training step output files*
32 |
--------------------------------------------------------------------------------
/nlp/dataset.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 | from time import time
4 | from pathlib import Path
5 | from .utils import preprocess
6 |
class Dataset:
    """CSV-backed labeled text dataset.

    The frame is read by ``load()`` and stored on ``self.dataframe``;
    ``preprocess_texts()`` adds a ``'cleaned'`` column to it. The ``data``
    and ``cleaned_data`` properties expose two-column views of that frame
    with the columns normalized to ``['label', 'text']``.

    Args:
        filename: path of the CSV file to read in ``load()``.
        label_col: name of the column holding the labels.
        text_col: name of the column holding the raw texts.
    """

    def __init__(self, filename, label_col='label', text_col='text'):
        self.filename = filename
        self.label_col = label_col
        self.text_col = text_col

    @property
    def data(self):
        """Return a copy of the label and raw-text columns as ``label``/``text``.

        Requires ``load()`` to have been called (it sets ``self.dataframe``).
        """
        data = self.dataframe[[self.label_col, self.text_col]].copy()
        data.columns = ['label', 'text']
        return data

    @property
    def cleaned_data(self):
        """Return a copy of the label and cleaned-text columns as ``label``/``text``.

        Requires ``preprocess_texts()`` to have been called, since it is the
        method that creates the ``'cleaned'`` column.
        """
        # Copy before renaming, exactly like ``data`` does: the caller gets an
        # independent frame, and later modifications of the result neither
        # touch ``self.dataframe`` nor raise chained-assignment warnings.
        data = self.dataframe[[self.label_col, 'cleaned']].copy()
        data.columns = ['label', 'text']
        return data

    def load(self):
        """Read the CSV at ``self.filename`` into ``self.dataframe``."""
        df = pd.read_csv(Path(self.filename).resolve())
        self.dataframe = df

    def preprocess_texts(self, quiet=False):
        """Store the preprocessed raw-text column under ``'cleaned'``.

        Args:
            quiet: forwarded to ``preprocess``; when False it prints timing.
        """
        self.dataframe['cleaned'] = preprocess(self.dataframe[self.text_col], quiet)
31 |
--------------------------------------------------------------------------------
/query_relations.json:
--------------------------------------------------------------------------------
1 | {
2 | ":face_screaming_in_fear:": "fear",
3 | ":face_with_tears_of_joy:": "joy",
4 | ":grinning_face_with_smiling_eyes:": "joy",
5 | ":pouting_face:": "anger",
6 | ":crying_face:": "sadness",
7 | ":fearful_face:": "fear",
8 | ":face_with_steam_from_nose:": "anger",
9 | "#anxious": "fear",
10 | "#sad": "sadness",
11 | "#happiness": "joy",
12 | "#fear": "fear",
13 | "#joy": "joy",
14 | "#pissed": "anger",
15 | "#angry": "anger",
16 | "#mad": "anger",
17 | "#excited": "joy",
18 | "#furious": "anger",
19 | "#depressed": "sadness",
20 | ":pensive_face:": "sadness",
21 | "#afraid": "fear",
22 | "#scared": "fear",
23 | "#worried": "fear",
24 | "#scary": "fear",
25 | ":anxious_face_with_sweat:": "fear",
26 | "#hateyou": "anger",
27 | ":loudly_crying_face:": "sadness",
28 | ":broken_heart:": "sadness",
29 | ":red_heart:": "joy",
30 | ":face_with_symbols_on_mouth:": "anger",
31 | ":anger_face:": "anger",
32 | ":smiling_face_with_smiling_eyes:": "joy",
33 | "#depression": "sadness",
34 | "#pissedoff": "anger"
35 | }
--------------------------------------------------------------------------------
/nlp/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import nltk
3 | from time import time
4 | from emoji import demojize
5 |
def preprocess(texts, quiet=False):
    """Clean a pandas Series of texts and return the cleaned Series.

    Steps, in order: lowercase; remove ``http``/``@`` together with the
    following run of non-whitespace characters; turn emojis into their
    textual alias with ``demojize``; separate adjacent aliases
    (``::`` -> ``: :``); normalize the typographic apostrophe; replace every
    character outside ``a-z ' : _`` with a space; collapse any character
    repeated three or more times into one; expand ``can't``/``cannot`` and
    ``n't`` into ``not`` forms; drop English stop words except ``not``,
    ``nor`` and ``no``.

    Args:
        texts: Series of strings (it is accessed through ``.str``).
        quiet: when False, print the elapsed clean-up time.

    Returns:
        The Series of cleaned texts, words joined by single spaces.
    """
    start = time()
    # Lowercasing
    texts = texts.str.lower()

    # Remove special chars
    # ``regex`` is passed explicitly on every call: its default flipped from
    # True to False in pandas 2.0, which would silently turn the patterns
    # below into literal (never matching) strings.
    texts = texts.str.replace(r"(http|@)\S+", "", regex=True)
    texts = texts.apply(demojize)
    texts = texts.str.replace("::", ": :", regex=False)
    texts = texts.str.replace("’", "'", regex=False)
    texts = texts.str.replace(r"[^a-z\':_]", " ", regex=True)

    # Remove repetitions
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = texts.str.replace(pattern, r"\1", regex=True)

    # Transform short negation form
    texts = texts.str.replace(r"(can't|cannot)", 'can not', regex=True)
    texts = texts.str.replace(r"n't", ' not', regex=True)

    # Remove stop words
    # A set gives O(1) membership tests per word; the negation words are kept
    # in the text, so they are taken out of the stop-word set.
    stopwords = set(nltk.corpus.stopwords.words('english')) - {'not', 'nor', 'no'}
    texts = texts.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stopwords)
    )

    if not quiet:
        print("Time to clean up: {:.2f} sec".format(time() - start))

    return texts
39 |
--------------------------------------------------------------------------------
/notebooks/Fetch Tweets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Fetch Tweets\n",
8 | "\n",
9 | "Download and save tweets, using a **query** value"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/plain": [
20 | "True"
21 | ]
22 | },
23 | "execution_count": 1,
24 | "metadata": {},
25 | "output_type": "execute_result"
26 | }
27 | ],
28 | "source": [
29 | "from dotenv import load_dotenv\n",
30 | "from pathlib import Path\n",
31 | "\n",
32 | "env_path = Path('../.env').resolve()\n",
33 | "load_dotenv(dotenv_path=env_path)"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## API access\n",
41 | "\n",
42 | "First of all, we'll connect to the Twitter API"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "import os"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "consumer_key = os.getenv(\"CONSUMER_KEY\")\n",
61 | "consumer_secret = os.getenv(\"CONSUMER_SECRET\")\n",
62 | "access_token = os.getenv(\"ACCESS_TOKEN\")\n",
63 | "access_token_secret = os.getenv(\"ACCESS_TOKEN_SECRET\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from tweepy import OAuthHandler, API, TweepError"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "Successfully connected to the Twitter API.\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "auth = OAuthHandler(consumer_key, consumer_secret)\n",
90 | "auth.set_access_token(access_token, access_token_secret)\n",
91 | "api = API(auth)\n",
92 | "print('Successfully connected to the Twitter API.')"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## Search Tweets\n",
100 | "\n",
101 | "Now we can define our query and search for the tweets containing it.\n",
102 | "\n",
103 | "- **query**: *hashtag* or *emoji* that will be used to fetch the tweets\n",
104 | "- **max_requests**: Maximum number of requests to the API.\n",
105 | " - Restriction: 180 requests / 15 min window"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 6,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "query = '#angry'\n",
115 | "max_requests = 180"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 7,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# Converts aliases to the real emoji representation (e.g. :thumbs_up: => 👍)\n",
125 | "\n",
126 | "from emoji import emojize"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 8,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "q = emojize(query) + ' -filter:retweets'\n",
136 | "searched_tweets = []\n",
137 | "last_id = -1\n",
138 | "request_count = 0\n",
139 | "while request_count < max_requests:\n",
140 | " try:\n",
141 | " new_tweets = api.search(q=q,\n",
142 | " lang='en',\n",
143 | " count=100,\n",
144 | " max_id=str(last_id - 1),\n",
145 | " tweet_mode='extended')\n",
146 | " if not new_tweets:\n",
147 | " break\n",
148 | " searched_tweets.extend(new_tweets)\n",
149 | " last_id = new_tweets[-1].id\n",
150 | " request_count += 1\n",
151 | " except TweepError as e:\n",
152 | " print(e)\n",
153 | " break"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "## Format and save\n",
161 | "\n",
162 | "Format the API data to the desired structure and save a `.csv` file"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 9,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import pandas as pd"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 10,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "66 #angry tweets\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "data = []\n",
189 | "for tweet in searched_tweets:\n",
190 | " data.append([tweet.id, tweet.created_at, tweet.user.screen_name, tweet.full_text])\n",
191 | "df = pd.DataFrame(data=data, columns=['id', 'date', 'user', 'text'])\n",
192 | "print(str(len(data)) + ' ' + query + ' tweets')"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 11,
198 | "metadata": {
199 | "scrolled": true
200 | },
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/html": [
205 | "
\n",
206 | "\n",
219 | "
\n",
220 | " \n",
221 | " \n",
222 | " | \n",
223 | " id | \n",
224 | " date | \n",
225 | " user | \n",
226 | " text | \n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " \n",
231 | " | 0 | \n",
232 | " 1151133382627057664 | \n",
233 | " 2019-07-16 14:16:00 | \n",
234 | " DaradeAbhijeet | \n",
235 | " Don't promise when you are #Happy\\n&\\nDon'... | \n",
236 | "
\n",
237 | " \n",
238 | " | 1 | \n",
239 | " 1151124672496324608 | \n",
240 | " 2019-07-16 13:41:23 | \n",
241 | " TheRealFakeJack | \n",
242 | " @realDonaldTrump 4:20 am it is a sign u need t... | \n",
243 | "
\n",
244 | " \n",
245 | " | 2 | \n",
246 | " 1151118984793776129 | \n",
247 | " 2019-07-16 13:18:47 | \n",
248 | " masterofnaps | \n",
249 | " There's a special place in hell for people who... | \n",
250 | "
\n",
251 | " \n",
252 | " | 3 | \n",
253 | " 1151115966220328960 | \n",
254 | " 2019-07-16 13:06:47 | \n",
255 | " TiknisArts | \n",
256 | " We know #Trump needs #attention to survive. It... | \n",
257 | "
\n",
258 | " \n",
259 | " | 4 | \n",
260 | " 1151113082099232768 | \n",
261 | " 2019-07-16 12:55:20 | \n",
262 | " emilieraddish | \n",
263 | " Get your Instagram photo elsewhere not on top ... | \n",
264 | "
\n",
265 | " \n",
266 | "
\n",
267 | "
"
268 | ],
269 | "text/plain": [
270 | " id date user \\\n",
271 | "0 1151133382627057664 2019-07-16 14:16:00 DaradeAbhijeet \n",
272 | "1 1151124672496324608 2019-07-16 13:41:23 TheRealFakeJack \n",
273 | "2 1151118984793776129 2019-07-16 13:18:47 masterofnaps \n",
274 | "3 1151115966220328960 2019-07-16 13:06:47 TiknisArts \n",
275 | "4 1151113082099232768 2019-07-16 12:55:20 emilieraddish \n",
276 | "\n",
277 | " text \n",
278 | "0 Don't promise when you are #Happy\\n&\\nDon'... \n",
279 | "1 @realDonaldTrump 4:20 am it is a sign u need t... \n",
280 | "2 There's a special place in hell for people who... \n",
281 | "3 We know #Trump needs #attention to survive. It... \n",
282 | "4 Get your Instagram photo elsewhere not on top ... "
283 | ]
284 | },
285 | "execution_count": 11,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "df.head()"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 12,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "name": "stdout",
301 | "output_type": "stream",
302 | "text": [
303 | "Saved under: \"/home/rmohashi/Workspace/emotion-from-tweets/datasets/tweepy\"\n"
304 | ]
305 | }
306 | ],
307 | "source": [
308 | "PATH = Path('../datasets/tweepy').resolve()\n",
309 | "filename = query + '.csv'\n",
310 | "df.to_csv(os.path.join(PATH, filename), index=None)\n",
311 | "print('Saved under: \"' + PATH.as_posix() + '\"')"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": []
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 3",
325 | "language": "python",
326 | "name": "python3"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 3
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython3",
338 | "version": "3.6.8"
339 | }
340 | },
341 | "nbformat": 4,
342 | "nbformat_minor": 2
343 | }
344 |
--------------------------------------------------------------------------------
/notebooks/Validate API Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Validate API Data\n",
8 | "\n",
9 | "Validate and create an emotion-labeled dataset"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import json\n",
34 | "from pathlib import Path"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Load Relations\n",
42 | "\n",
43 | "Load the relations between queries and emotions"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "relations_path = Path('../query_relations.json').resolve()"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 4,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "with relations_path.open('rb') as file:\n",
62 | " relations = json.load(file)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Load Tokenizer\n",
70 | "\n",
71 | "Load the tokenizer created during the model training process"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 5,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "import pickle"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 6,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "tokenizer_path = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
90 | "with tokenizer_path.open('rb') as file:\n",
91 | " tokenizer = pickle.load(file)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## Load Model\n",
99 | "\n",
100 | "Load the model, using the saved weights"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 7,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "from tensorflow.keras.layers import Input, Embedding, GRU\n",
110 | "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
111 | "from tensorflow.keras.layers import Bidirectional, Dense\n",
112 | "from tensorflow.keras.models import Sequential"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 8,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
122 | "embedding_dim = 200\n",
123 | "input_length = 100\n",
124 | "gru_units = 128\n",
125 | "gru_dropout = 0.1\n",
126 | "recurrent_dropout = 0.1\n",
127 | "dropout = 0.1"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 9,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "name": "stderr",
137 | "output_type": "stream",
138 | "text": [
139 | "WARNING: Logging before flag parsing goes to stderr.\n",
140 | "W0719 09:43:55.179866 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
141 | "Instructions for updating:\n",
142 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
143 | "W0719 09:43:55.207387 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
144 | "Instructions for updating:\n",
145 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
146 | "W0719 09:43:55.215560 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
147 | "Instructions for updating:\n",
148 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
149 | "W0719 09:43:55.216914 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
150 | "Instructions for updating:\n",
151 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
152 | "W0719 09:43:55.219862 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
153 | "Instructions for updating:\n",
154 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "model = Sequential()\n",
160 | "model.add(Embedding(\n",
161 | " input_dim=input_dim,\n",
162 | " output_dim=embedding_dim,\n",
163 | " input_shape=(input_length,)\n",
164 | "))\n",
165 | "\n",
166 | "model.add(Bidirectional(GRU(\n",
167 | " gru_units,\n",
168 | " return_sequences=True,\n",
169 | " dropout=gru_dropout,\n",
170 | " recurrent_dropout=recurrent_dropout\n",
171 | ")))\n",
172 | "model.add(GlobalMaxPooling1D())\n",
173 | "model.add(Dense(32, activation='relu'))\n",
174 | "model.add(Dropout(dropout))\n",
175 | "\n",
176 | "model.add(Dense(1, activation='sigmoid'))"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 10,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "name": "stdout",
186 | "output_type": "stream",
187 | "text": [
188 | "Model: \"sequential\"\n",
189 | "_________________________________________________________________\n",
190 | "Layer (type) Output Shape Param # \n",
191 | "=================================================================\n",
192 | "embedding (Embedding) (None, 100, 200) 2000000 \n",
193 | "_________________________________________________________________\n",
194 | "bidirectional (Bidirectional (None, 100, 256) 252672 \n",
195 | "_________________________________________________________________\n",
196 | "global_max_pooling1d (Global (None, 256) 0 \n",
197 | "_________________________________________________________________\n",
198 | "dense (Dense) (None, 32) 8224 \n",
199 | "_________________________________________________________________\n",
200 | "dropout (Dropout) (None, 32) 0 \n",
201 | "_________________________________________________________________\n",
202 | "dense_1 (Dense) (None, 1) 33 \n",
203 | "=================================================================\n",
204 | "Total params: 2,260,929\n",
205 | "Trainable params: 2,260,929\n",
206 | "Non-trainable params: 0\n",
207 | "_________________________________________________________________\n",
208 | "None\n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "print(model.summary())"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 11,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "weights_path = Path('../models/sentiment_analysis/model_weights.h5').resolve()\n",
223 | "model.load_weights(weights_path.as_posix())"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "## Group data by emotion\n",
231 | "\n",
232 | "Use the emotions to group the data"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 12,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "import os\n",
242 | "import re\n",
243 | "import pandas as pd\n",
244 | "from tqdm import tqdm"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 13,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "files_dir = Path('../datasets/tweepy').resolve()"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 14,
259 | "metadata": {},
260 | "outputs": [
261 | {
262 | "name": "stderr",
263 | "output_type": "stream",
264 | "text": [
265 | "100%|██████████| 19/19 [00:00<00:00, 27.29it/s]\n"
266 | ]
267 | }
268 | ],
269 | "source": [
270 | "emotion_data_dict = {}\n",
271 | "\n",
272 | "filenames = os.listdir(files_dir)\n",
273 | "with tqdm(total=len(filenames)) as t:\n",
274 | " for filename in filenames:\n",
275 | " query = re.findall(r'(#[^.]+|:.+:)', filename)[0]\n",
276 | " emotion = relations[query]\n",
277 | "\n",
278 | " file_data = pd.read_csv(os.path.join(files_dir, filename))\n",
279 | " dict_data = emotion_data_dict[emotion] if emotion in emotion_data_dict else None\n",
280 | " emotion_data_dict[emotion] = pd.concat([dict_data, file_data])\n",
281 | " t.update()"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "## Predict emotion and filter data\n",
289 | "\n",
290 | "Predict emotion and filter rows for each group created in the step above"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 15,
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "name": "stderr",
300 | "output_type": "stream",
301 | "text": [
302 | "[nltk_data] Downloading package stopwords to\n",
303 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
304 | "[nltk_data] Package stopwords is already up-to-date!\n"
305 | ]
306 | }
307 | ],
308 | "source": [
309 | "import re\n",
310 | "import numpy as np\n",
311 | "from emoji import demojize\n",
312 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
313 | "from nlp import preprocess"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 16,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "def get_score_range(mean):\n",
323 | " if mean < 0.5:\n",
324 | " return (0.0, mean)\n",
325 | " return (mean, 1.0)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 17,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stderr",
335 | "output_type": "stream",
336 | "text": [
337 | "Processing \"joy\" data: 100%|██████████| 4/4 [03:28<00:00, 51.09s/it] "
338 | ]
339 | },
340 | {
341 | "name": "stdout",
342 | "output_type": "stream",
343 | "text": [
344 | "Fear: Score Range: 0.000000 - 0.434182\n",
345 | "Sadness: Score Range: 0.000000 - 0.220770\n",
346 | "Anger: Score Range: 0.000000 - 0.410283\n",
347 | "Joy: Score Range: 0.870705 - 1.000000\n"
348 | ]
349 | },
350 | {
351 | "name": "stderr",
352 | "output_type": "stream",
353 | "text": [
354 | "\n"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "result_data = []\n",
360 | "\n",
361 | "messages = []\n",
362 | "with tqdm(total=len(emotion_data_dict.items())) as t:\n",
363 | " for emotion, dataset in emotion_data_dict.items():\n",
364 | " t.set_description('Processing \"' + emotion + '\" data')\n",
365 | "\n",
366 | " cleaned_texts = preprocess(dataset.text, quiet=True)\n",
367 | " predict_sequences = [text.split() for text in cleaned_texts]\n",
368 | " list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)\n",
369 | " x_predict = pad_sequences(list_tokenized_predict, maxlen=100)\n",
370 | "\n",
371 | " result = model.predict(x_predict)\n",
372 | " mean = np.mean(result)\n",
373 | " std = np.std(result)\n",
374 | " low, high = get_score_range(mean)\n",
375 | " messages.append(emotion.capitalize() + \": Score Range: {:4f} - {:4f}\".format(low, high))\n",
376 | " dataset = dataset[np.all([(result >= low), (result <= high)], axis=0)]\n",
377 | " dataset.insert(0, 'label', emotion)\n",
378 | "\n",
379 | " result_data = result_data + [dataset]\n",
380 | " t.update()\n",
381 | "\n",
382 | "for message in messages:\n",
383 | " print(message)"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "## Save dataset\n",
391 | "\n",
392 | "Save the resulting data"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 18,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "Files saved under \"/Users/rmohashi/Workspace/emotion-from-tweet/datasets/sentiment_analysis/dataset.csv\"\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "if len(result_data) > 0:\n",
410 | " result_data = pd.concat(result_data)\n",
411 | "\n",
412 | " path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()\n",
413 | " result_data.to_csv(path, index=None)\n",
414 | "\n",
415 | " print('Files saved under \"' + path.as_posix() + '\"')"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": []
424 | }
425 | ],
426 | "metadata": {
427 | "kernelspec": {
428 | "display_name": "Python 3",
429 | "language": "python",
430 | "name": "python3"
431 | },
432 | "language_info": {
433 | "codemirror_mode": {
434 | "name": "ipython",
435 | "version": 3
436 | },
437 | "file_extension": ".py",
438 | "mimetype": "text/x-python",
439 | "name": "python",
440 | "nbconvert_exporter": "python",
441 | "pygments_lexer": "ipython3",
442 | "version": "3.6.8"
443 | }
444 | },
445 | "nbformat": 4,
446 | "nbformat_minor": 2
447 | }
448 |
--------------------------------------------------------------------------------
/notebooks/Predict Emotion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Predict Emotion\n",
8 | "\n",
9 | "The main objective of this notebook is to predict emotions from tweets"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pickle"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## Load Tokenizer\n",
41 | "\n",
42 | "Load `.pickle` file with the tokenizer"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
52 | "with tokenizer_path.open('rb') as file:\n",
53 | " tokenizer = pickle.load(file)"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Load Model\n",
61 | "\n",
62 | "Load the trained emotion recognition model"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 4,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
72 | "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
73 | "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
74 | "from tensorflow.keras.models import Model"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
84 | "num_classes = 4\n",
85 | "embedding_dim = 500\n",
86 | "input_length = 100\n",
87 | "lstm_units = 128\n",
88 | "lstm_dropout = 0.1\n",
89 | "recurrent_dropout = 0.1\n",
90 | "spatial_dropout=0.2\n",
91 | "filters=64\n",
92 | "kernel_size=3"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stderr",
102 | "output_type": "stream",
103 | "text": [
104 | "WARNING: Logging before flag parsing goes to stderr.\n",
105 | "W0719 10:47:51.968286 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
106 | "Instructions for updating:\n",
107 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
108 | "W0719 10:47:52.031774 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
109 | "Instructions for updating:\n",
110 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
111 | "W0719 10:47:52.039301 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
112 | "Instructions for updating:\n",
113 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
114 | "W0719 10:47:52.040482 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
115 | "Instructions for updating:\n",
116 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
117 | "W0719 10:47:52.041715 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
118 | "Instructions for updating:\n",
119 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "input_layer = Input(shape=(input_length,))\n",
125 | "output_layer = Embedding(\n",
126 | " input_dim=input_dim,\n",
127 | " output_dim=embedding_dim,\n",
128 | " input_shape=(input_length,)\n",
129 | ")(input_layer)\n",
130 | "\n",
131 | "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
132 | "\n",
133 | "output_layer = Bidirectional(\n",
134 | "LSTM(lstm_units, return_sequences=True,\n",
135 | " dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
136 | ")(output_layer)\n",
137 | "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
138 | " kernel_initializer='glorot_uniform')(output_layer)\n",
139 | "\n",
140 | "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
141 | "max_pool = GlobalMaxPooling1D()(output_layer)\n",
142 | "output_layer = concatenate([avg_pool, max_pool])\n",
143 | "\n",
144 | "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
145 | "\n",
146 | "model = Model(input_layer, output_layer)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 7,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
156 | "model.load_weights(model_weights_path.as_posix())"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## Load data\n",
164 | "\n",
165 | "Load the data whose labels will be predicted by the model\n",
166 | "\n",
167 | "**data_path**: Path to the `.csv` file that will be used"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 8,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "import pandas as pd"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 9,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/html": [
187 | "\n",
188 | "\n",
201 | "
\n",
202 | " \n",
203 | " \n",
204 | " | \n",
205 | " id | \n",
206 | " date | \n",
207 | " user | \n",
208 | " text | \n",
209 | "
\n",
210 | " \n",
211 | " \n",
212 | " \n",
213 | " | 0 | \n",
214 | " 1151893341782585349 | \n",
215 | " 2019-07-18 16:35:48 | \n",
216 | " Ozzzylot | \n",
217 | " ⚡️ Fans share what Kyoto Animation studio mean... | \n",
218 | "
\n",
219 | " \n",
220 | " | 1 | \n",
221 | " 1151893322908102657 | \n",
222 | " 2019-07-18 16:35:43 | \n",
223 | " rosyutori | \n",
224 | " Deep condolences to all who are passed away at... | \n",
225 | "
\n",
226 | " \n",
227 | " | 2 | \n",
228 | " 1151893318101377024 | \n",
229 | " 2019-07-18 16:35:42 | \n",
230 | " met_bit | \n",
231 | " Striking news... How on earth can someone be s... | \n",
232 | "
\n",
233 | " \n",
234 | " | 3 | \n",
235 | " 1151893304117813248 | \n",
236 | " 2019-07-18 16:35:39 | \n",
237 | " Destructo_Dan | \n",
238 | " I don’t know if I had any favorite anime from ... | \n",
239 | "
\n",
240 | " \n",
241 | " | 4 | \n",
242 | " 1151893302863650816 | \n",
243 | " 2019-07-18 16:35:39 | \n",
244 | " KDiscavage | \n",
245 | " The news about Kyoto Animation Studios hit me ... | \n",
246 | "
\n",
247 | " \n",
248 | "
\n",
249 | "
"
250 | ],
251 | "text/plain": [
252 | " id date user \\\n",
253 | "0 1151893341782585349 2019-07-18 16:35:48 Ozzzylot \n",
254 | "1 1151893322908102657 2019-07-18 16:35:43 rosyutori \n",
255 | "2 1151893318101377024 2019-07-18 16:35:42 met_bit \n",
256 | "3 1151893304117813248 2019-07-18 16:35:39 Destructo_Dan \n",
257 | "4 1151893302863650816 2019-07-18 16:35:39 KDiscavage \n",
258 | "\n",
259 | " text \n",
260 | "0 ⚡️ Fans share what Kyoto Animation studio mean... \n",
261 | "1 Deep condolences to all who are passed away at... \n",
262 | "2 Striking news... How on earth can someone be s... \n",
263 | "3 I don’t know if I had any favorite anime from ... \n",
264 | "4 The news about Kyoto Animation Studios hit me ... "
265 | ]
266 | },
267 | "execution_count": 9,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "data_path = Path('../datasets/predict/1151893341782585349-1151863653320159233_kyoto_animation.csv').resolve()\n",
274 | "data = pd.read_csv(data_path)\n",
275 | "data.head()"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "## Load Encoder\n",
283 | "\n",
284 | "Load the `.pickle` file that contains the encoder (its `classes_` give the emotion name for each predicted class index)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 10,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()\n",
294 | "with encoder_path.open('rb') as file:\n",
295 | " encoder = pickle.load(file)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "## Preprocess data\n",
303 | "\n",
304 | "Preprocess the data that will be used"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 11,
310 | "metadata": {},
311 | "outputs": [
312 | {
313 | "name": "stderr",
314 | "output_type": "stream",
315 | "text": [
316 | "[nltk_data] Downloading package stopwords to\n",
317 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
318 | "[nltk_data] Package stopwords is already up-to-date!\n"
319 | ]
320 | }
321 | ],
322 | "source": [
323 | "from nlp import preprocess\n",
324 | "from tensorflow.keras.preprocessing.sequence import pad_sequences"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 12,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | "Time to clean up: 1.41 sec\n"
337 | ]
338 | }
339 | ],
340 | "source": [
341 | "cleaned_data = preprocess(data.text)\n",
342 | "sequences = [text.split() for text in cleaned_data]\n",
343 | "list_tokenized = tokenizer.texts_to_sequences(sequences)\n",
344 | "x_data = pad_sequences(list_tokenized, maxlen=100)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "## Results\n",
352 | "\n",
353 | "Predict the labels and print the share of each emotion in the predictions"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 13,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "import numpy as np"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 14,
368 | "metadata": {},
369 | "outputs": [],
370 | "source": [
371 | "y_pred = model.predict(x_data)"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 15,
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "name": "stdout",
381 | "output_type": "stream",
382 | "text": [
383 | "angry: 0.0977998\n",
384 | "fear: 0.3991122\n",
385 | "joy: 0.03104621\n",
386 | "sadness: 0.4720413\n"
387 | ]
388 | }
389 | ],
390 | "source": [
391 | "for index, value in enumerate(np.sum(y_pred, axis=0) / len(y_pred)):\n",
392 | " print(encoder.classes_[index] + \": \" + str(value))"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 16,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "angry: 0.09889558232931726\n",
405 | "fear: 0.4011044176706827\n",
406 | "joy: 0.030622489959839357\n",
407 | "sadness: 0.46937751004016065\n"
408 | ]
409 | }
410 | ],
411 | "source": [
412 | "y_pred_argmax = y_pred.argmax(axis=1)\n",
413 | "data_len = len(y_pred_argmax)\n",
414 | "for index, value in enumerate(np.unique(y_pred_argmax)):\n",
415 | " print(encoder.classes_[index] + \": \" + str(len(y_pred_argmax[y_pred_argmax == value]) / data_len))"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 17,
421 | "metadata": {},
422 | "outputs": [
423 | {
424 | "data": {
425 | "text/plain": [
426 | "array([3, 3, 3, 3, 3])"
427 | ]
428 | },
429 | "execution_count": 17,
430 | "metadata": {},
431 | "output_type": "execute_result"
432 | }
433 | ],
434 | "source": [
435 | "y_pred[5:10].argmax(axis=1)"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": 18,
441 | "metadata": {},
442 | "outputs": [
443 | {
444 | "data": {
445 | "text/plain": [
446 | "'My heart goes out to the people who died in the fire at Kyoto Animation Studio. \\n\\n#PrayForKyoani https://t.co/Jvg9R8f6Oc'"
447 | ]
448 | },
449 | "execution_count": 18,
450 | "metadata": {},
451 | "output_type": "execute_result"
452 | }
453 | ],
454 | "source": [
455 | "data.text.iloc[6]"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {},
462 | "outputs": [],
463 | "source": []
464 | }
465 | ],
466 | "metadata": {
467 | "kernelspec": {
468 | "display_name": "Python 3",
469 | "language": "python",
470 | "name": "python3"
471 | },
472 | "language_info": {
473 | "codemirror_mode": {
474 | "name": "ipython",
475 | "version": 3
476 | },
477 | "file_extension": ".py",
478 | "mimetype": "text/x-python",
479 | "name": "python",
480 | "nbconvert_exporter": "python",
481 | "pygments_lexer": "ipython3",
482 | "version": "3.6.8"
483 | }
484 | },
485 | "nbformat": 4,
486 | "nbformat_minor": 2
487 | }
488 |
--------------------------------------------------------------------------------
/notebooks/Train Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Train Sentiment Analysis\n",
8 | "\n",
9 | "Here we'll train a sentiment analysis model to validate the data from the API."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import warnings\n",
19 | "warnings.filterwarnings('ignore')"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "from pathlib import Path\n",
29 | "import pandas as pd"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "sentiment140_path = Path('../datasets/sentiment140/sentiment140.csv')\n",
39 | "data = pd.read_csv(sentiment140_path)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/html": [
50 | "\n",
51 | "\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | " | \n",
68 | " label | \n",
69 | " tweet | \n",
70 | "
\n",
71 | " \n",
72 | " \n",
73 | " \n",
74 | " | 0 | \n",
75 | " 0 | \n",
76 | " @whiskey_kitten www.Pandora.com - plays music ... | \n",
77 | "
\n",
78 | " \n",
79 | " | 1 | \n",
80 | " 0 | \n",
81 | " studying for a test I hope not to fail....most... | \n",
82 | "
\n",
83 | " \n",
84 | " | 2 | \n",
85 | " 4 | \n",
86 | " @BlowhornOz Oh! Doesn't sound so good, I got t... | \n",
87 | "
\n",
88 | " \n",
89 | " | 3 | \n",
90 | " 0 | \n",
91 | " tomorrow is my last day at A&D HS fml and... | \n",
92 | "
\n",
93 | " \n",
94 | " | 4 | \n",
95 | " 0 | \n",
96 | " Journalism has no future? That sounds pretty m... | \n",
97 | "
\n",
98 | " \n",
99 | "
\n",
100 | "
"
101 | ],
102 | "text/plain": [
103 | " label tweet\n",
104 | "0 0 @whiskey_kitten www.Pandora.com - plays music ...\n",
105 | "1 0 studying for a test I hope not to fail....most...\n",
106 | "2 4 @BlowhornOz Oh! Doesn't sound so good, I got t...\n",
107 | "3 0 tomorrow is my last day at A&D HS fml and...\n",
108 | "4 0 Journalism has no future? That sounds pretty m..."
109 | ]
110 | },
111 | "execution_count": 4,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "data.head()"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "## Data preprocessing\n",
125 | "\n",
126 | "Preprocess the texts:\n",
127 | "- Convert to Lowercase: Convert all characters from the text to lowercase\n",
128 | "- Remove special characters: Remove links and usernames and transform emojis to text\n",
129 | "- Remove repetitions: Remove char repetitions (e.g. whaaaaaat => what)\n",
130 | "- Remove Stop words: Remove common stop words, keeping the negations `not`, `nor` and `no`"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 5,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stderr",
140 | "output_type": "stream",
141 | "text": [
142 | "[nltk_data] Downloading package stopwords to\n",
143 | "[nltk_data] /home/rmohashi/nltk_data...\n",
144 | "[nltk_data] Package stopwords is already up-to-date!\n"
145 | ]
146 | },
147 | {
148 | "data": {
149 | "text/plain": [
150 | "True"
151 | ]
152 | },
153 | "execution_count": 5,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "import re\n",
160 | "from time import time\n",
161 | "import nltk\n",
162 | "from emoji import demojize\n",
163 | "\n",
164 | "nltk.download('stopwords')"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 6,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "Time to clean up: 78.03 sec\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "texts = data.tweet\n",
182 | "\n",
183 | "start = time()\n",
184 | "# Lowercasing\n",
185 | "texts = texts.str.lower()\n",
186 | "\n",
187 | "# Remove special chars\n",
188 | "texts = texts.str.replace(r\"(http|@)\\S+\", \"\")\n",
189 | "texts = texts.apply(demojize)\n",
190 | "texts = texts.str.replace(r\"::\", \": :\")\n",
191 | "texts = texts.str.replace(r\"’\", \"'\")\n",
192 | "texts = texts.str.replace(r\"[^a-z\\':_]\", \" \")\n",
193 | "\n",
194 | "# Remove repetitions\n",
195 | "pattern = re.compile(r\"(.)\\1{2,}\", re.DOTALL)\n",
196 | "texts = texts.str.replace(pattern, r\"\\1\")\n",
197 | "\n",
198 | "# Transform short negation form\n",
199 | "texts = texts.str.replace(r\"(can't|cannot)\", 'can not')\n",
200 | "texts = texts.str.replace(r\"n't\", ' not')\n",
201 | "\n",
202 | "# Remove stop words\n",
203 | "stopwords = nltk.corpus.stopwords.words('english')\n",
204 | "stopwords.remove('not')\n",
205 | "stopwords.remove('nor')\n",
206 | "stopwords.remove('no')\n",
207 | "texts = texts.apply(\n",
208 | " lambda x: ' '.join([word for word in x.split() if word not in stopwords])\n",
209 | ")\n",
210 | "\n",
211 | "print(\"Time to clean up: {:.2f} sec\".format(time() - start))\n",
212 | "\n",
213 | "data.tweet = texts"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## Tokenize\n",
221 | "\n",
222 | "Transform the text corpus to a vector representation\n",
223 | "\n",
224 | "- **num_words**: Maximum number of words to keep, based on word frequency"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 7,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "num_words = 10000"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 8,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "import pickle\n",
243 | "from tensorflow.keras.preprocessing.text import Tokenizer"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 9,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "tokenizer = Tokenizer(num_words=num_words, lower=True)\n",
253 | "tokenizer.fit_on_texts(data.tweet)\n",
254 | "\n",
255 | "file_to_save = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
256 | "with file_to_save.open('wb') as file:\n",
257 | " pickle.dump(tokenizer, file)"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "## Split data\n",
265 | "\n",
266 | "Split the dataset into train (70%) and validation (30%) data, separately for each label"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 10,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "from sklearn.model_selection import train_test_split"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 11,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "train = pd.DataFrame(columns=['label', 'tweet'])\n",
285 | "validation = pd.DataFrame(columns=['label', 'tweet'])\n",
286 | "for label in data.label.unique():\n",
287 | " label_data = data[data.label == label]\n",
288 | " train_data, validation_data = train_test_split(label_data, test_size=0.3)\n",
289 | " train = pd.concat([train, train_data])\n",
290 | " validation = pd.concat([validation, validation_data])"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "## Model\n",
298 | "\n",
299 | "Define the Bidirectional GRU model"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 12,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "from tensorflow.keras.layers import Input, Embedding, GRU\n",
309 | "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
310 | "from tensorflow.keras.layers import Bidirectional, Dense\n",
311 | "from tensorflow.keras.models import Sequential"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 13,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
321 | "embedding_dim = 200\n",
322 | "input_length = 100\n",
323 | "gru_units = 128\n",
324 | "gru_dropout = 0.1\n",
325 | "recurrent_dropout = 0.1\n",
326 | "dropout = 0.1"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 14,
332 | "metadata": {},
333 | "outputs": [
334 | {
335 | "name": "stderr",
336 | "output_type": "stream",
337 | "text": [
338 | "WARNING: Logging before flag parsing goes to stderr.\n",
339 | "W0716 13:36:20.397812 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
340 | "Instructions for updating:\n",
341 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
342 | "W0716 13:36:20.410246 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
343 | "Instructions for updating:\n",
344 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
345 | "W0716 13:36:20.413324 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
346 | "Instructions for updating:\n",
347 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
348 | "W0716 13:36:20.413828 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
349 | "Instructions for updating:\n",
350 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
351 | "W0716 13:36:20.414215 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
352 | "Instructions for updating:\n",
353 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
354 | ]
355 | }
356 | ],
357 | "source": [
358 | "model = Sequential()\n",
359 | "model.add(Embedding(\n",
360 | " input_dim=input_dim,\n",
361 | " output_dim=embedding_dim,\n",
362 | " input_shape=(input_length,)\n",
363 | "))\n",
364 | "\n",
365 | "model.add(Bidirectional(GRU(\n",
366 | " gru_units,\n",
367 | " return_sequences=True,\n",
368 | " dropout=gru_dropout,\n",
369 | " recurrent_dropout=recurrent_dropout\n",
370 | ")))\n",
371 | "model.add(GlobalMaxPooling1D())\n",
372 | "model.add(Dense(32, activation='relu'))\n",
373 | "model.add(Dropout(dropout))\n",
374 | "\n",
375 | "model.add(Dense(1, activation='sigmoid'))"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 15,
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "name": "stderr",
385 | "output_type": "stream",
386 | "text": [
387 | "W0716 13:36:20.902724 140315330369344 deprecation.py:323] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
388 | "Instructions for updating:\n",
389 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n"
390 | ]
391 | },
392 | {
393 | "name": "stdout",
394 | "output_type": "stream",
395 | "text": [
396 | "Model: \"sequential\"\n",
397 | "_________________________________________________________________\n",
398 | "Layer (type) Output Shape Param # \n",
399 | "=================================================================\n",
400 | "embedding (Embedding) (None, 100, 200) 2000000 \n",
401 | "_________________________________________________________________\n",
402 | "bidirectional (Bidirectional (None, 100, 256) 252672 \n",
403 | "_________________________________________________________________\n",
404 | "global_max_pooling1d (Global (None, 256) 0 \n",
405 | "_________________________________________________________________\n",
406 | "dense (Dense) (None, 32) 8224 \n",
407 | "_________________________________________________________________\n",
408 | "dropout (Dropout) (None, 32) 0 \n",
409 | "_________________________________________________________________\n",
410 | "dense_1 (Dense) (None, 1) 33 \n",
411 | "=================================================================\n",
412 | "Total params: 2,260,929\n",
413 | "Trainable params: 2,260,929\n",
414 | "Non-trainable params: 0\n",
415 | "_________________________________________________________________\n",
416 | "None\n"
417 | ]
418 | }
419 | ],
420 | "source": [
421 | "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
422 | "print(model.summary())"
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | "## Prepare the data\n",
430 | "\n",
431 | "Prepare the model input data"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": 16,
437 | "metadata": {},
438 | "outputs": [],
439 | "source": [
440 | "from tensorflow.keras.preprocessing.sequence import pad_sequences"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 17,
446 | "metadata": {},
447 | "outputs": [],
448 | "source": [
449 | "train_sequences = [text.split() for text in train.tweet]\n",
450 | "validation_sequences = [text.split() for text in validation.tweet]\n",
451 | "list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)\n",
452 | "list_tokenized_validation = tokenizer.texts_to_sequences(validation_sequences)\n",
453 | "\n",
454 | "x_train = pad_sequences(list_tokenized_train, maxlen=input_length)\n",
455 | "x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)\n",
456 | "y_train = train.label.replace(4, 1)\n",
457 | "y_validation = validation.label.replace(4, 1)"
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "metadata": {},
463 | "source": [
464 | "## Train model\n",
465 | "\n",
466 | "Do the training process with the given data"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 18,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "batch_size = 128\n",
476 | "epochs = 1"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 19,
482 | "metadata": {},
483 | "outputs": [
484 | {
485 | "name": "stdout",
486 | "output_type": "stream",
487 | "text": [
488 | "Train on 280000 samples, validate on 120000 samples\n",
489 | "280000/280000 [==============================] - 374s 1ms/sample - loss: 0.4637 - acc: 0.7804 - val_loss: 0.4366 - val_acc: 0.7937\n"
490 | ]
491 | },
492 | {
493 | "data": {
494 | "text/plain": [
495 | ""
496 | ]
497 | },
498 | "execution_count": 19,
499 | "metadata": {},
500 | "output_type": "execute_result"
501 | }
502 | ],
503 | "source": [
504 | "model.fit(\n",
505 | " x_train,\n",
506 | " y=y_train,\n",
507 | " batch_size=batch_size,\n",
508 | " epochs=epochs,\n",
509 | " validation_data=(x_validation, y_validation),\n",
510 | ")"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 20,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": [
519 | "model_file = Path('../models/sentiment_analysis/gru_model.h5').resolve()\n",
520 | "model.save_weights(model_file.as_posix())"
521 | ]
522 | }
523 | ],
524 | "metadata": {
525 | "kernelspec": {
526 | "display_name": "Python 3",
527 | "language": "python",
528 | "name": "python3"
529 | },
530 | "language_info": {
531 | "codemirror_mode": {
532 | "name": "ipython",
533 | "version": 3
534 | },
535 | "file_extension": ".py",
536 | "mimetype": "text/x-python",
537 | "name": "python",
538 | "nbconvert_exporter": "python",
539 | "pygments_lexer": "ipython3",
540 | "version": "3.6.8"
541 | }
542 | },
543 | "nbformat": 4,
544 | "nbformat_minor": 2
545 | }
546 |
--------------------------------------------------------------------------------
/notebooks/Train Emotion Recognition Model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Train Emotion Recognition Model\n",
8 | "\n",
9 | "Here we'll train an emotion recognition model, using the output data from the sentiment analysis."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Load Dataset\n",
32 | "\n",
33 | "Load the emotion labeled dataset"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stderr",
43 | "output_type": "stream",
44 | "text": [
45 | "[nltk_data] Downloading package stopwords to\n",
46 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
47 | "[nltk_data] Package stopwords is already up-to-date!\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "from pathlib import Path\n",
53 | "import pandas as pd\n",
54 | "from nlp import Dataset"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "Time to clean up: 19.33 sec\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "dataset_path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()\n",
72 | "dataset = Dataset(dataset_path)\n",
73 | "dataset.load()\n",
74 | "dataset.preprocess_texts()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 4,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/html": [
85 | "\n",
86 | "\n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " | \n",
103 | " label | \n",
104 | " text | \n",
105 | "
\n",
106 | " \n",
107 | " \n",
108 | " \n",
109 | " | 0 | \n",
110 | " fear | \n",
111 | " sometimes afraid thing set free gt | \n",
112 | "
\n",
113 | " \n",
114 | " | 1 | \n",
115 | " fear | \n",
116 | " delayed post afraid | \n",
117 | "
\n",
118 | " \n",
119 | " | 2 | \n",
120 | " fear | \n",
121 | " eyeson seesomethingsaysomething cia clowns dee... | \n",
122 | "
\n",
123 | " \n",
124 | " | 3 | \n",
125 | " fear | \n",
126 | " happybirthdaystevenavery corruptiwoccounty afr... | \n",
127 | "
\n",
128 | " \n",
129 | " | 4 | \n",
130 | " fear | \n",
131 | " fight fire fire think reign fire comment check... | \n",
132 | "
\n",
133 | " \n",
134 | "
\n",
135 | "
"
136 | ],
137 | "text/plain": [
138 | " label text\n",
139 | "0 fear sometimes afraid thing set free gt\n",
140 | "1 fear delayed post afraid\n",
141 | "2 fear eyeson seesomethingsaysomething cia clowns dee...\n",
142 | "3 fear happybirthdaystevenavery corruptiwoccounty afr...\n",
143 | "4 fear fight fire fire think reign fire comment check..."
144 | ]
145 | },
146 | "execution_count": 4,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "dataset.cleaned_data.head()"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "## Tokenize\n",
160 | "\n",
161 | "Transform the text corpus to a vector representation\n",
162 | "\n",
163 | "- **num_words**: Number of words to use"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 5,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "import pickle\n",
173 | "from tensorflow.keras.preprocessing.text import Tokenizer"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 6,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "num_words = 10000\n",
183 | "\n",
184 | "tokenizer = Tokenizer(num_words=num_words, lower=True)\n",
185 | "tokenizer.fit_on_texts(dataset.cleaned_data.text)\n",
186 | "\n",
187 | "file_to_save = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
188 | "with file_to_save.open('wb') as file:\n",
189 | " pickle.dump(tokenizer, file)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "## Split data\n",
197 | "\n",
198 | "Split the dataset into training and validation sets"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 7,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "from sklearn.model_selection import train_test_split"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 8,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "data = dataset.cleaned_data.copy()\n",
217 | "\n",
218 | "train = pd.DataFrame(columns=['label', 'text'])\n",
219 | "validation = pd.DataFrame(columns=['label', 'text'])\n",
220 | "for label in data.label.unique():\n",
221 | " label_data = data[data.label == label]\n",
222 | " train_data, validation_data = train_test_split(label_data, test_size=0.3)\n",
223 | " train = pd.concat([train, train_data])\n",
224 | " validation = pd.concat([validation, validation_data])"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "## Model\n",
232 | "\n",
233 | "Define the **LSTM** + **CNN** model"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 9,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
243 | "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
244 | "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
245 | "from tensorflow.keras.models import Model"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 10,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
255 | "num_classes = len(data.label.unique())\n",
256 | "embedding_dim = 500\n",
257 | "input_length = 100\n",
258 | "lstm_units = 128\n",
259 | "lstm_dropout = 0.1\n",
260 | "recurrent_dropout = 0.1\n",
261 | "spatial_dropout=0.2\n",
262 | "filters=64\n",
263 | "kernel_size=3"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 11,
269 | "metadata": {
270 | "scrolled": false
271 | },
272 | "outputs": [
273 | {
274 | "name": "stderr",
275 | "output_type": "stream",
276 | "text": [
277 | "WARNING: Logging before flag parsing goes to stderr.\n",
278 | "W0719 10:32:00.331336 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
279 | "Instructions for updating:\n",
280 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
281 | "W0719 10:32:00.392153 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
282 | "Instructions for updating:\n",
283 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
284 | "W0719 10:32:00.397410 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
285 | "Instructions for updating:\n",
286 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
287 | "W0719 10:32:00.399722 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
288 | "Instructions for updating:\n",
289 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
290 | "W0719 10:32:00.403119 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
291 | "Instructions for updating:\n",
292 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
293 | ]
294 | }
295 | ],
296 | "source": [
297 | "input_layer = Input(shape=(input_length,))\n",
298 | "output_layer = Embedding(\n",
299 | " input_dim=input_dim,\n",
300 | " output_dim=embedding_dim,\n",
301 | " input_shape=(input_length,)\n",
302 | ")(input_layer)\n",
303 | "\n",
304 | "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
305 | "\n",
306 | "output_layer = Bidirectional(\n",
307 | "LSTM(lstm_units, return_sequences=True,\n",
308 | " dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
309 | ")(output_layer)\n",
310 | "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
311 | " kernel_initializer='glorot_uniform')(output_layer)\n",
312 | "\n",
313 | "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
314 | "max_pool = GlobalMaxPooling1D()(output_layer)\n",
315 | "output_layer = concatenate([avg_pool, max_pool])\n",
316 | "\n",
317 | "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
318 | "\n",
319 | "model = Model(input_layer, output_layer)"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 12,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "Model: \"model\"\n",
332 | "__________________________________________________________________________________________________\n",
333 | "Layer (type) Output Shape Param # Connected to \n",
334 | "==================================================================================================\n",
335 | "input_1 (InputLayer) [(None, 100)] 0 \n",
336 | "__________________________________________________________________________________________________\n",
337 | "embedding (Embedding) (None, 100, 500) 5000000 input_1[0][0] \n",
338 | "__________________________________________________________________________________________________\n",
339 | "spatial_dropout1d (SpatialDropo (None, 100, 500) 0 embedding[0][0] \n",
340 | "__________________________________________________________________________________________________\n",
341 | "bidirectional (Bidirectional) (None, 100, 256) 644096 spatial_dropout1d[0][0] \n",
342 | "__________________________________________________________________________________________________\n",
343 | "conv1d (Conv1D) (None, 98, 64) 49216 bidirectional[0][0] \n",
344 | "__________________________________________________________________________________________________\n",
345 | "global_average_pooling1d (Globa (None, 64) 0 conv1d[0][0] \n",
346 | "__________________________________________________________________________________________________\n",
347 | "global_max_pooling1d (GlobalMax (None, 64) 0 conv1d[0][0] \n",
348 | "__________________________________________________________________________________________________\n",
349 | "concatenate (Concatenate) (None, 128) 0 global_average_pooling1d[0][0] \n",
350 | " global_max_pooling1d[0][0] \n",
351 | "__________________________________________________________________________________________________\n",
352 | "dense (Dense) (None, 4) 516 concatenate[0][0] \n",
353 | "==================================================================================================\n",
354 | "Total params: 5,693,828\n",
355 | "Trainable params: 5,693,828\n",
356 | "Non-trainable params: 0\n",
357 | "__________________________________________________________________________________________________\n"
358 | ]
359 | }
360 | ],
361 | "source": [
362 | "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
363 | "model.summary()"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "## Prepare the data\n",
371 | "\n",
372 | "Prepare the model input data"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 13,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
382 | "from sklearn.preprocessing import LabelBinarizer"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 14,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "train_sequences = [text.split() for text in train.text]\n",
392 | "validation_sequences = [text.split() for text in validation.text]\n",
393 | "list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)\n",
394 | "list_tokenized_validation = tokenizer.texts_to_sequences(validation_sequences)\n",
395 | "x_train = pad_sequences(list_tokenized_train, maxlen=input_length)\n",
396 | "x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)\n",
397 | "\n",
398 | "encoder = LabelBinarizer()\n",
399 | "encoder.fit(data.label.unique())\n",
400 | "\n",
401 | "encoder_path = Path('../models/emotion_recognition', 'encoder.pickle')\n",
402 | "with encoder_path.open('wb') as file:\n",
403 | " pickle.dump(encoder, file)\n",
404 | "\n",
405 | "y_train = encoder.transform(train.label)\n",
406 | "y_validation = encoder.transform(validation.label)"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "## Train model\n",
414 | "\n",
415 | "Train the model on the prepared data, evaluating it on the validation set after each epoch"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 15,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "batch_size = 128\n",
425 | "epochs = 1"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 16,
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "name": "stdout",
435 | "output_type": "stream",
436 | "text": [
437 | "Train on 25454 samples, validate on 10911 samples\n"
438 | ]
439 | },
440 | {
441 | "name": "stderr",
442 | "output_type": "stream",
443 | "text": [
444 | "W0719 10:32:03.006144 4686337472 deprecation.py:323] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
445 | "Instructions for updating:\n",
446 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n"
447 | ]
448 | },
449 | {
450 | "name": "stdout",
451 | "output_type": "stream",
452 | "text": [
453 | "25454/25454 [==============================] - 570s 22ms/sample - loss: 0.5621 - acc: 0.7593 - val_loss: 0.3839 - val_acc: 0.8381\n"
454 | ]
455 | },
456 | {
457 | "data": {
458 | "text/plain": [
459 | ""
460 | ]
461 | },
462 | "execution_count": 16,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | }
466 | ],
467 | "source": [
468 | "model.fit(\n",
469 | " x_train,\n",
470 | " y=y_train,\n",
471 | " batch_size=batch_size,\n",
472 | " epochs=epochs,\n",
473 | " validation_data=(x_validation, y_validation)\n",
474 | ")"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": 17,
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "model_file = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
484 | "model.save_weights(model_file.as_posix())"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": []
493 | }
494 | ],
495 | "metadata": {
496 | "kernelspec": {
497 | "display_name": "Python 3",
498 | "language": "python",
499 | "name": "python3"
500 | },
501 | "language_info": {
502 | "codemirror_mode": {
503 | "name": "ipython",
504 | "version": 3
505 | },
506 | "file_extension": ".py",
507 | "mimetype": "text/x-python",
508 | "name": "python",
509 | "nbconvert_exporter": "python",
510 | "pygments_lexer": "ipython3",
511 | "version": "3.6.8"
512 | }
513 | },
514 | "nbformat": 4,
515 | "nbformat_minor": 2
516 | }
517 |
--------------------------------------------------------------------------------
/notebooks/Sentiment Analysis Score.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Sentiment Analysis Score\n",
8 | "\n",
9 | "Use a deep learning model to predict the sentiment polarity of the texts collected for each query/emotion listed in the relations file"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Load Tokenizer\n",
32 | "\n",
33 | "Import and load the tokenizer from a `.pickle` file"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import pickle\n",
43 | "from pathlib import Path"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "tokenizer_file = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
53 | "with tokenizer_file.open('rb') as file:\n",
54 | " tokenizer = pickle.load(file)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Load Model\n",
62 | "\n",
63 | "Load the sentiment analysis model, using the saved weights"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 6,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from tensorflow.keras.layers import Input, Embedding, GRU\n",
73 | "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
74 | "from tensorflow.keras.layers import Bidirectional, Dense\n",
75 | "from tensorflow.keras.models import Sequential"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 7,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
85 | "embedding_dim = 200\n",
86 | "input_length = 100\n",
87 | "gru_units = 128\n",
88 | "gru_dropout = 0.1\n",
89 | "recurrent_dropout = 0.1\n",
90 | "dropout = 0.1"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 8,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "name": "stderr",
100 | "output_type": "stream",
101 | "text": [
102 | "WARNING: Logging before flag parsing goes to stderr.\n",
103 | "W0719 09:56:43.758275 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
104 | "Instructions for updating:\n",
105 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
106 | "W0719 09:56:43.802737 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
107 | "Instructions for updating:\n",
108 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
109 | "W0719 09:56:43.809999 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
110 | "Instructions for updating:\n",
111 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
112 | "W0719 09:56:43.811434 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
113 | "Instructions for updating:\n",
114 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
115 | "W0719 09:56:43.813139 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
116 | "Instructions for updating:\n",
117 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "model = Sequential()\n",
123 | "model.add(Embedding(\n",
124 | " input_dim=input_dim,\n",
125 | " output_dim=embedding_dim,\n",
126 | " input_shape=(input_length,)\n",
127 | "))\n",
128 | "\n",
129 | "model.add(Bidirectional(GRU(\n",
130 | " gru_units,\n",
131 | " return_sequences=True,\n",
132 | " dropout=gru_dropout,\n",
133 | " recurrent_dropout=recurrent_dropout\n",
134 | ")))\n",
135 | "model.add(GlobalMaxPooling1D())\n",
136 | "model.add(Dense(32, activation='relu'))\n",
137 | "model.add(Dropout(dropout))\n",
138 | "\n",
139 | "model.add(Dense(1, activation='sigmoid'))"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "weights_path = Path('../models/sentiment_analysis/model_weights.h5').resolve()\n",
149 | "model.load_weights(weights_path.as_posix())"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "## Load Query Relations\n",
157 | "\n",
158 | "Load the relations between queries and emotions from a `.json` file"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 10,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "import json"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 12,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "relations_path = Path('../query_relations.json')\n",
177 | "with relations_path.open('r') as file:\n",
178 | " relations = json.load(file)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "## Predict polarity\n",
186 | "\n",
187 | "Predict the polarity of the texts, using the sentiment analysis model"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 13,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stderr",
197 | "output_type": "stream",
198 | "text": [
199 | "[nltk_data] Downloading package stopwords to\n",
200 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
201 | "[nltk_data] Package stopwords is already up-to-date!\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
207 | "from nlp import preprocess\n",
208 | "from tqdm import tqdm\n",
209 | "import pandas as pd\n",
210 | "import numpy as np\n",
211 | "import re"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 14,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "dataset_dir = Path('../datasets/tweepy').resolve()"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 15,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "name": "stderr",
230 | "output_type": "stream",
231 | "text": [
232 | "100%|██████████| 19/19 [02:59<00:00, 12.95s/it]\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "data_dict = {}\n",
238 | "\n",
239 | "query_dict = {\n",
240 | " 'query': [],\n",
241 | " 'mean': [],\n",
242 | " 'max': [],\n",
243 | " 'min': [],\n",
244 | " 'std': [],\n",
245 | " 'count': [],\n",
246 | " 'emotion': []\n",
247 | "}\n",
248 | "\n",
249 | "dir_files = os.listdir(dataset_dir)\n",
250 | "\n",
251 | "with tqdm(total=len(dir_files)) as t:\n",
252 | " for filename in dir_files:\n",
253 | " dataset = pd.read_csv(os.path.join(dataset_dir, filename))\n",
254 | " cleaned_texts = preprocess(dataset.text, quiet=True)\n",
255 | "\n",
256 | " query = re.findall(r'(#[^.]+|:.+:)', filename)[0]\n",
257 | "\n",
258 | " predict_sequences = [text.split() for text in cleaned_texts]\n",
259 | " list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)\n",
260 | " x_predict = pad_sequences(list_tokenized_predict, maxlen=100)\n",
261 | "\n",
262 | " result = model.predict(x_predict)\n",
263 | " \n",
264 | " emotion = relations[query]\n",
265 | " query_dict['query'].append(query)\n",
266 | " query_dict['mean'].append(np.mean(result))\n",
267 | " query_dict['max'].append(np.amax(result))\n",
268 | " query_dict['min'].append(np.amin(result))\n",
269 | " query_dict['count'].append(len(dataset))\n",
270 | " query_dict['std'].append(np.std(result))\n",
271 | " query_dict['emotion'].append(emotion)\n",
272 | "\n",
273 | " if emotion in data_dict:\n",
274 | " data_dict[emotion] = np.concatenate([data_dict[emotion], result])\n",
275 | " else:\n",
276 | " data_dict[emotion] = result\n",
277 | " \n",
278 | " t.update()"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "## Print Results\n",
286 | "\n",
287 | "Display the prediction statistics (mean, max, min, std and count) for each query, grouped by emotion, and then aggregated per emotion"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 10,
293 | "metadata": {
294 | "scrolled": false
295 | },
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/html": [
300 | "\n",
301 | "\n",
314 | "
\n",
315 | " \n",
316 | " \n",
317 | " | \n",
318 | " query | \n",
319 | " mean | \n",
320 | " max | \n",
321 | " min | \n",
322 | " std | \n",
323 | " count | \n",
324 | " emotion | \n",
325 | "
\n",
326 | " \n",
327 | " \n",
328 | " \n",
329 | " | 0 | \n",
330 | " :anxious_face_with_sweat: | \n",
331 | " 0.428562 | \n",
332 | " 0.983642 | \n",
333 | " 0.004371 | \n",
334 | " 0.274272 | \n",
335 | " 199 | \n",
336 | " fear | \n",
337 | "
\n",
338 | " \n",
339 | " | 6 | \n",
340 | " #worried | \n",
341 | " 0.205504 | \n",
342 | " 0.879476 | \n",
343 | " 0.004883 | \n",
344 | " 0.210547 | \n",
345 | " 196 | \n",
346 | " fear | \n",
347 | "
\n",
348 | " \n",
349 | "
\n",
350 | "
"
351 | ],
352 | "text/plain": [
353 | " query mean max min std count \\\n",
354 | "0 :anxious_face_with_sweat: 0.428562 0.983642 0.004371 0.274272 199 \n",
355 | "6 #worried 0.205504 0.879476 0.004883 0.210547 196 \n",
356 | "\n",
357 | " emotion \n",
358 | "0 fear \n",
359 | "6 fear "
360 | ]
361 | },
362 | "metadata": {},
363 | "output_type": "display_data"
364 | },
365 | {
366 | "data": {
367 | "text/html": [
368 | "\n",
369 | "\n",
382 | "
\n",
383 | " \n",
384 | " \n",
385 | " | \n",
386 | " query | \n",
387 | " mean | \n",
388 | " max | \n",
389 | " min | \n",
390 | " std | \n",
391 | " count | \n",
392 | " emotion | \n",
393 | "
\n",
394 | " \n",
395 | " \n",
396 | " \n",
397 | " | 1 | \n",
398 | " #sad | \n",
399 | " 0.073413 | \n",
400 | " 0.873629 | \n",
401 | " 0.002289 | \n",
402 | " 0.127914 | \n",
403 | " 200 | \n",
404 | " sadness | \n",
405 | "
\n",
406 | " \n",
407 | " | 2 | \n",
408 | " :crying_face: | \n",
409 | " 0.438269 | \n",
410 | " 0.996975 | \n",
411 | " 0.005851 | \n",
412 | " 0.296389 | \n",
413 | " 197 | \n",
414 | " sadness | \n",
415 | "
\n",
416 | " \n",
417 | "
\n",
418 | "
"
419 | ],
420 | "text/plain": [
421 | " query mean max min std count emotion\n",
422 | "1 #sad 0.073413 0.873629 0.002289 0.127914 200 sadness\n",
423 | "2 :crying_face: 0.438269 0.996975 0.005851 0.296389 197 sadness"
424 | ]
425 | },
426 | "metadata": {},
427 | "output_type": "display_data"
428 | },
429 | {
430 | "data": {
431 | "text/html": [
432 | "\n",
433 | "\n",
446 | "
\n",
447 | " \n",
448 | " \n",
449 | " | \n",
450 | " query | \n",
451 | " mean | \n",
452 | " max | \n",
453 | " min | \n",
454 | " std | \n",
455 | " count | \n",
456 | " emotion | \n",
457 | "
\n",
458 | " \n",
459 | " \n",
460 | " \n",
461 | " | 3 | \n",
462 | " :red_heart: | \n",
463 | " 0.770384 | \n",
464 | " 0.996633 | \n",
465 | " 0.042774 | \n",
466 | " 0.225747 | \n",
467 | " 200 | \n",
468 | " joy | \n",
469 | "
\n",
470 | " \n",
471 | " | 7 | \n",
472 | " #joy | \n",
473 | " 0.832007 | \n",
474 | " 0.997057 | \n",
475 | " 0.208914 | \n",
476 | " 0.152068 | \n",
477 | " 191 | \n",
478 | " joy | \n",
479 | "
\n",
480 | " \n",
481 | "
\n",
482 | "
"
483 | ],
484 | "text/plain": [
485 | " query mean max min std count emotion\n",
486 | "3 :red_heart: 0.770384 0.996633 0.042774 0.225747 200 joy\n",
487 | "7 #joy 0.832007 0.997057 0.208914 0.152068 191 joy"
488 | ]
489 | },
490 | "metadata": {},
491 | "output_type": "display_data"
492 | },
493 | {
494 | "data": {
495 | "text/html": [
496 | "\n",
497 | "\n",
510 | "
\n",
511 | " \n",
512 | " \n",
513 | " | \n",
514 | " query | \n",
515 | " mean | \n",
516 | " max | \n",
517 | " min | \n",
518 | " std | \n",
519 | " count | \n",
520 | " emotion | \n",
521 | "
\n",
522 | " \n",
523 | " \n",
524 | " \n",
525 | " | 4 | \n",
526 | " :face_with_symbols_on_mouth: | \n",
527 | " 0.403210 | \n",
528 | " 0.997371 | \n",
529 | " 0.010545 | \n",
530 | " 0.261377 | \n",
531 | " 194 | \n",
532 | " angry | \n",
533 | "
\n",
534 | " \n",
535 | " | 5 | \n",
536 | " #pissed | \n",
537 | " 0.230712 | \n",
538 | " 0.912333 | \n",
539 | " 0.008014 | \n",
540 | " 0.180684 | \n",
541 | " 200 | \n",
542 | " angry | \n",
543 | "
\n",
544 | " \n",
545 | "
\n",
546 | "
"
547 | ],
548 | "text/plain": [
549 | " query mean max min std \\\n",
550 | "4 :face_with_symbols_on_mouth: 0.403210 0.997371 0.010545 0.261377 \n",
551 | "5 #pissed 0.230712 0.912333 0.008014 0.180684 \n",
552 | "\n",
553 | " count emotion \n",
554 | "4 194 angry \n",
555 | "5 200 angry "
556 | ]
557 | },
558 | "metadata": {},
559 | "output_type": "display_data"
560 | }
561 | ],
562 | "source": [
563 | "df = pd.DataFrame(data=query_dict)\n",
564 | "for emotion in df.emotion.unique():\n",
565 | " display(df[df.emotion == emotion])"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 11,
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "data": {
575 | "text/html": [
576 | "\n",
577 | "\n",
590 | "
\n",
591 | " \n",
592 | " \n",
593 | " | \n",
594 | " emotion | \n",
595 | " mean | \n",
596 | " max | \n",
597 | " min | \n",
598 | " std | \n",
599 | " count | \n",
600 | "
\n",
601 | " \n",
602 | " \n",
603 | " \n",
604 | " | 0 | \n",
605 | " fear | \n",
606 | " 0.317880 | \n",
607 | " 0.983642 | \n",
608 | " 0.004371 | \n",
609 | " 0.268948 | \n",
610 | " 395 | \n",
611 | "
\n",
612 | " \n",
613 | " | 1 | \n",
614 | " sadness | \n",
615 | " 0.254463 | \n",
616 | " 0.996975 | \n",
617 | " 0.002289 | \n",
618 | " 0.291740 | \n",
619 | " 397 | \n",
620 | "
\n",
621 | " \n",
622 | " | 2 | \n",
623 | " joy | \n",
624 | " 0.800486 | \n",
625 | " 0.997057 | \n",
626 | " 0.042774 | \n",
627 | " 0.195736 | \n",
628 | " 391 | \n",
629 | "
\n",
630 | " \n",
631 | " | 3 | \n",
632 | " angry | \n",
633 | " 0.315648 | \n",
634 | " 0.997371 | \n",
635 | " 0.008014 | \n",
636 | " 0.240100 | \n",
637 | " 394 | \n",
638 | "
\n",
639 | " \n",
640 | "
\n",
641 | "
"
642 | ],
643 | "text/plain": [
644 | " emotion mean max min std count\n",
645 | "0 fear 0.317880 0.983642 0.004371 0.268948 395\n",
646 | "1 sadness 0.254463 0.996975 0.002289 0.291740 397\n",
647 | "2 joy 0.800486 0.997057 0.042774 0.195736 391\n",
648 | "3 angry 0.315648 0.997371 0.008014 0.240100 394"
649 | ]
650 | },
651 | "metadata": {},
652 | "output_type": "display_data"
653 | }
654 | ],
655 | "source": [
656 | "emotion_dict = {\n",
657 | " 'emotion': [],\n",
658 | " 'mean': [],\n",
659 | " 'max': [],\n",
660 | " 'min': [],\n",
661 | " 'std': [],\n",
662 | " 'count': []\n",
663 | "}\n",
664 | "\n",
665 | "for emotion, result in data_dict.items():\n",
666 | " emotion_dict['emotion'].append(emotion)\n",
667 | " emotion_dict['mean'].append(np.mean(result))\n",
668 | " emotion_dict['max'].append(np.amax(result))\n",
669 | " emotion_dict['min'].append(np.amin(result))\n",
670 | " emotion_dict['std'].append(np.std(result))\n",
671 | " emotion_dict['count'].append(len(result))\n",
672 | " \n",
673 | "emotion_df = pd.DataFrame(data=emotion_dict)\n",
674 | "display(emotion_df)"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": null,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": []
683 | }
684 | ],
685 | "metadata": {
686 | "kernelspec": {
687 | "display_name": "Python 3",
688 | "language": "python",
689 | "name": "python3"
690 | },
691 | "language_info": {
692 | "codemirror_mode": {
693 | "name": "ipython",
694 | "version": 3
695 | },
696 | "file_extension": ".py",
697 | "mimetype": "text/x-python",
698 | "name": "python",
699 | "nbconvert_exporter": "python",
700 | "pygments_lexer": "ipython3",
701 | "version": "3.6.8"
702 | }
703 | },
704 | "nbformat": 4,
705 | "nbformat_minor": 2
706 | }
707 |
--------------------------------------------------------------------------------
/notebooks/Emotion Recognition Model Validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Emotion Recognition Model Validation\n",
8 | "\n",
9 | "The main objective of this notebook is to validate the trained model for emotion recognition"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Add project path to the PYTHONPATH\n",
19 | "\n",
20 | "import os\n",
21 | "import sys\n",
22 | "from pathlib import Path\n",
23 | "\n",
24 | "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pickle"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## Load Tokenizer\n",
41 | "\n",
42 | "Load `.pickle` file with the tokenizer"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
52 | "with tokenizer_path.open('rb') as file:\n",
53 | " tokenizer = pickle.load(file)"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Load Model\n",
61 | "\n",
62 | "Load the trained emotion recognition model"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 4,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
72 | "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
73 | "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
74 | "from tensorflow.keras.models import Model"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
84 | "num_classes = 4\n",
85 | "embedding_dim = 500\n",
86 | "input_length = 100\n",
87 | "lstm_units = 128\n",
88 | "lstm_dropout = 0.1\n",
89 | "recurrent_dropout = 0.1\n",
90 | "spatial_dropout=0.2\n",
91 | "filters=64\n",
92 | "kernel_size=3"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stderr",
102 | "output_type": "stream",
103 | "text": [
104 | "WARNING: Logging before flag parsing goes to stderr.\n",
105 | "W0719 10:46:16.952994 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
106 | "Instructions for updating:\n",
107 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
108 | "W0719 10:46:17.039670 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
109 | "Instructions for updating:\n",
110 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
111 | "W0719 10:46:17.047888 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
112 | "Instructions for updating:\n",
113 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
114 | "W0719 10:46:17.049386 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
115 | "Instructions for updating:\n",
116 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
117 | "W0719 10:46:17.050548 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
118 | "Instructions for updating:\n",
119 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "input_layer = Input(shape=(input_length,))\n",
125 | "output_layer = Embedding(\n",
126 | " input_dim=input_dim,\n",
127 | " output_dim=embedding_dim,\n",
128 | " input_shape=(input_length,)\n",
129 | ")(input_layer)\n",
130 | "\n",
131 | "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
132 | "\n",
133 | "output_layer = Bidirectional(\n",
134 | "LSTM(lstm_units, return_sequences=True,\n",
135 | " dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
136 | ")(output_layer)\n",
137 | "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
138 | " kernel_initializer='glorot_uniform')(output_layer)\n",
139 | "\n",
140 | "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
141 | "max_pool = GlobalMaxPooling1D()(output_layer)\n",
142 | "output_layer = concatenate([avg_pool, max_pool])\n",
143 | "\n",
144 | "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
145 | "\n",
146 | "model = Model(input_layer, output_layer)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 7,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
156 | "model.load_weights(model_weights_path.as_posix())"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## Load test dataset\n",
164 | "\n",
165 | "Load the dataset that will be used to test the model"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 8,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "import pandas as pd"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 9,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "test_data_path = Path('../datasets/sentiment_analysis/test.csv').resolve()\n",
184 | "test_data = pd.read_csv(test_data_path)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 10,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/html": [
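197 | "<div>\n",
198 | "<style scoped>\n",
199 | "    .dataframe tbody tr th:only-of-type {\n",
200 | "        vertical-align: middle;\n",
201 | "    }\n",
202 | "\n",
203 | "    .dataframe tbody tr th {\n",
204 | "        vertical-align: top;\n",
205 | "    }\n",
206 | "\n",
207 | "    .dataframe thead th {\n",
208 | "        text-align: right;\n",
209 | "    }\n",
210 | "</style>\n",
211 | "<table border=\"1\" class=\"dataframe\">\n",
212 | "  <thead>\n",
213 | "    <tr style=\"text-align: right;\">\n",
214 | "      <th></th>\n",
215 | "      <th>label</th>\n",
216 | "      <th>id</th>\n",
217 | "      <th>date</th>\n",
218 | "      <th>user</th>\n",
219 | "      <th>text</th>\n",
220 | "    </tr>\n",
221 | "  </thead>\n",
222 | "  <tbody>\n",
223 | "    <tr>\n",
224 | "      <th>0</th>\n",
225 | "      <td>fear</td>\n",
226 | "      <td>1151474078131339264</td>\n",
227 | "      <td>2019-07-17 12:49:48</td>\n",
228 | "      <td>13thSnipers</td>\n",
229 | "      <td>It's so obvious Ashley Young @youngy18 is not ...</td>\n",
230 | "    </tr>\n",
231 | "    <tr>\n",
232 | "      <th>1</th>\n",
233 | "      <td>fear</td>\n",
234 | "      <td>1151474075723870208</td>\n",
235 | "      <td>2019-07-17 12:49:47</td>\n",
236 | "      <td>ShukrahFirdaus</td>\n",
237 | "      <td>Engaging in a staring competition with this wo...</td>\n",
238 | "    </tr>\n",
239 | "    <tr>\n",
240 | "      <th>2</th>\n",
241 | "      <td>fear</td>\n",
242 | "      <td>1151473913668313089</td>\n",
243 | "      <td>2019-07-17 12:49:09</td>\n",
244 | "      <td>EvinErvian</td>\n",
245 | "      <td>@savage2ooo yah me too. worst? can't stand wat...</td>\n",
246 | "    </tr>\n",
247 | "    <tr>\n",
248 | "      <th>3</th>\n",
249 | "      <td>fear</td>\n",
250 | "      <td>1151473830398976000</td>\n",
251 | "      <td>2019-07-17 12:48:49</td>\n",
252 | "      <td>oliviaakuhn</td>\n",
253 | "      <td>i was with @regiannoni EXACTLY 2 years and 3 y...</td>\n",
254 | "    </tr>\n",
255 | "    <tr>\n",
256 | "      <th>4</th>\n",
257 | "      <td>fear</td>\n",
258 | "      <td>1151473618318176257</td>\n",
259 | "      <td>2019-07-17 12:47:58</td>\n",
260 | "      <td>zaaboogie_</td>\n",
261 | "      <td>This heat different 😰</td>\n",
262 | "    </tr>\n",
263 | "  </tbody>\n",
264 | "</table>\n",
265 | "</div>"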
266 | ],
267 | "text/plain": [
268 | " label id date user \\\n",
269 | "0 fear 1151474078131339264 2019-07-17 12:49:48 13thSnipers \n",
270 | "1 fear 1151474075723870208 2019-07-17 12:49:47 ShukrahFirdaus \n",
271 | "2 fear 1151473913668313089 2019-07-17 12:49:09 EvinErvian \n",
272 | "3 fear 1151473830398976000 2019-07-17 12:48:49 oliviaakuhn \n",
273 | "4 fear 1151473618318176257 2019-07-17 12:47:58 zaaboogie_ \n",
274 | "\n",
275 | " text \n",
276 | "0 It's so obvious Ashley Young @youngy18 is not ... \n",
277 | "1 Engaging in a staring competition with this wo... \n",
278 | "2 @savage2ooo yah me too. worst? can't stand wat... \n",
279 | "3 i was with @regiannoni EXACTLY 2 years and 3 y... \n",
280 | "4 This heat different 😰 "
281 | ]
282 | },
283 | "execution_count": 10,
284 | "metadata": {},
285 | "output_type": "execute_result"
286 | }
287 | ],
288 | "source": [
289 | "test_data.head()"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "## Load Encoder\n",
297 | "\n",
298 | "Load `.pickle` file with the encoder"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 11,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()\n",
308 | "with encoder_path.open('rb') as file:\n",
309 | " encoder = pickle.load(file)"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "## Preprocess data\n",
317 | "\n",
318 | "Preprocess the data that will be used"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 12,
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "name": "stderr",
328 | "output_type": "stream",
329 | "text": [
330 | "[nltk_data] Downloading package stopwords to\n",
331 | "[nltk_data] /Users/rmohashi/nltk_data...\n",
332 | "[nltk_data] Package stopwords is already up-to-date!\n"
333 | ]
334 | }
335 | ],
336 | "source": [
337 | "from nlp.utils import preprocess\n",
338 | "from tensorflow.keras.preprocessing.sequence import pad_sequences"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 13,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "name": "stdout",
348 | "output_type": "stream",
349 | "text": [
350 | "Time to clean up: 0.71 sec\n"
351 | ]
352 | }
353 | ],
354 | "source": [
355 | "test_data['text'] = preprocess(test_data.text)\n",
356 | "sequences = [text.split() for text in test_data.text]\n",
357 | "list_tokenized = tokenizer.texts_to_sequences(sequences)\n",
358 | "x_test = pad_sequences(list_tokenized, maxlen=input_length)  # pad to the model's Input length instead of a duplicated literal\n",
359 | "y_test = encoder.transform(test_data.label)"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "## Results\n",
367 | "\n",
368 | "Predict the labels and generate a confusion matrix"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 14,
374 | "metadata": {},
375 | "outputs": [],
376 | "source": [
377 | "y_pred = model.predict(x_test)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 15,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "y_pred = y_pred.argmax(axis=1)\n",
387 | "y_test = y_test.argmax(axis=1)"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 16,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "import numpy as np\n",
397 | "import matplotlib.pyplot as plt\n",
398 | "from sklearn.metrics import confusion_matrix\n",
399 | "from sklearn.utils.multiclass import unique_labels"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 17,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "def plot_confusion_matrix(y_true, y_pred, classes,\n",
409 | "                          normalize=False,\n",
410 | "                          title=None,\n",
411 | "                          cmap=plt.cm.Blues):\n",
412 | "    \"\"\"\n",
413 | "    Print and plot the confusion matrix of `y_pred` against `y_true`.\n",
414 | "\n",
415 | "    `classes` must support NumPy fancy indexing (class names indexed by label id);\n",
416 | "    only the labels present in the data are kept. With `normalize=True` each row\n",
417 | "    (true label) is scaled to sum to 1; an empty row stays at 0 instead of NaN.\n",
418 | "    Returns the matplotlib `(fig, ax)` pair.\n",
419 | "    \"\"\"\n",
420 | "    heading = ('Normalized confusion matrix' if normalize\n",
421 | "               else 'Confusion matrix, without normalization')\n",
422 | "    title = title or heading\n",
423 | "\n",
424 | "    # Compute confusion matrix\n",
425 | "    cm = confusion_matrix(y_true, y_pred)\n",
426 | "    # Only use the labels that appear in the data\n",
427 | "    classes = classes[unique_labels(y_true, y_pred)]\n",
428 | "    if normalize:\n",
429 | "        # Clamp the divisor: a label seen only in y_pred has a zero row sum (0/0 -> NaN)\n",
430 | "        cm = cm.astype('float') / np.maximum(cm.sum(axis=1), 1)[:, np.newaxis]\n",
431 | "    print(heading)\n",
432 | "    print(cm)\n",
433 | "\n",
434 | "    fig, ax = plt.subplots()\n",
435 | "    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)\n",
436 | "    ax.figure.colorbar(im, ax=ax)\n",
437 | "    # We want to show all ticks...\n",
438 | "    ax.set(xticks=np.arange(cm.shape[1]),\n",
439 | "           yticks=np.arange(cm.shape[0]),\n",
440 | "           # ... and label them with the respective list entries\n",
441 | "           xticklabels=classes, yticklabels=classes,\n",
442 | "           title=title,\n",
443 | "           ylabel='True label',\n",
444 | "           xlabel='Predicted label')\n",
445 | "\n",
446 | "    # Rotate the tick labels and set their alignment.\n",
447 | "    plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\",\n",
448 | "             rotation_mode=\"anchor\")\n",
449 | "\n",
450 | "    # Loop over data dimensions and create text annotations.\n",
451 | "    fmt = '.2f' if normalize else 'd'\n",
452 | "    thresh = cm.max() / 2.\n",
453 | "    for i in range(cm.shape[0]):\n",
454 | "        for j in range(cm.shape[1]):\n",
455 | "            ax.text(j, i, format(cm[i, j], fmt),\n",
456 | "                    ha=\"center\", va=\"center\",\n",
457 | "                    color=\"white\" if cm[i, j] > thresh else \"black\")\n",
458 | "    fig.tight_layout()\n",
459 | "    return fig, ax"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 18,
465 | "metadata": {
466 | "scrolled": false
467 | },
468 | "outputs": [
469 | {
470 | "name": "stdout",
471 | "output_type": "stream",
472 | "text": [
473 | "Normalized confusion matrix\n",
474 | "[[0.83657588 0.07782101 0.01167315 0.07392996]\n",
475 | " [0.12653061 0.76326531 0.00408163 0.10612245]\n",
476 | " [0.06028369 0.02836879 0.90425532 0.0070922 ]\n",
477 | " [0.0929368 0.05947955 0.00371747 0.84386617]]\n"
478 | ]
479 | },
480 | {
481 | "data": {
482 | "image/png": "\n",
483 | "text/plain": [
484 | ""
485 | ]
486 | },
487 | "metadata": {
488 | "needs_background": "light"
489 | },
490 | "output_type": "display_data"
491 | }
492 | ],
493 | "source": [
494 | "fig, ax = plot_confusion_matrix(y_test, y_pred, encoder.classes_, normalize=True)\n",
495 | "fig.savefig('confusion_matrix.png')"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {},
502 | "outputs": [],
503 | "source": []
504 | }
505 | ],
506 | "metadata": {
507 | "kernelspec": {
508 | "display_name": "Python 3",
509 | "language": "python",
510 | "name": "python3"
511 | },
512 | "language_info": {
513 | "codemirror_mode": {
514 | "name": "ipython",
515 | "version": 3
516 | },
517 | "file_extension": ".py",
518 | "mimetype": "text/x-python",
519 | "name": "python",
520 | "nbconvert_exporter": "python",
521 | "pygments_lexer": "ipython3",
522 | "version": "3.6.8"
523 | }
524 | },
525 | "nbformat": 4,
526 | "nbformat_minor": 2
527 | }
528 |
--------------------------------------------------------------------------------
/notebooks/Check Emotion Labeled Dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Check Emotion Labeled Dataset\n",
8 | "\n",
9 | "The main objective of this notebook is to show the output dataset from the sentiment analysis model"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "import pandas as pd\n",
20 | "from pathlib import Path"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "dataset_path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "dataset = pd.read_csv(dataset_path)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 4,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
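49 | "<div>\n",
50 | "<style scoped>\n",
51 | "    .dataframe tbody tr th:only-of-type {\n",
52 | "        vertical-align: middle;\n",
53 | "    }\n",
54 | "\n",
55 | "    .dataframe tbody tr th {\n",
56 | "        vertical-align: top;\n",
57 | "    }\n",
58 | "\n",
59 | "    .dataframe thead th {\n",
60 | "        text-align: right;\n",
61 | "    }\n",
62 | "</style>\n",
63 | "<table border=\"1\" class=\"dataframe\">\n",
64 | "  <thead>\n",
65 | "    <tr style=\"text-align: right;\">\n",
66 | "      <th></th>\n",
67 | "      <th>label</th>\n",
68 | "      <th>id</th>\n",
69 | "      <th>date</th>\n",
70 | "      <th>user</th>\n",
71 | "      <th>text</th>\n",
72 | "    </tr>\n",
73 | "  </thead>\n",
74 | "  <tbody>\n",
75 | "    <tr>\n",
76 | "      <th>0</th>\n",
77 | "      <td>fear</td>\n",
78 | "      <td>1148914607152619520</td>\n",
79 | "      <td>2019-07-10 11:19:22</td>\n",
80 | "      <td>cheri_shapley</td>\n",
81 | "      <td>Sometimes what you're most #afraid of doing is...</td>\n",
82 | "    </tr>\n",
83 | "    <tr>\n",
84 | "      <th>1</th>\n",
85 | "      <td>fear</td>\n",
86 | "      <td>1148837283812073473</td>\n",
87 | "      <td>2019-07-10 06:12:07</td>\n",
88 | "      <td>Dronearl_RSA</td>\n",
89 | "      <td>Delayed post \\n#Afraid \\n@TallRacksRec https:/...</td>\n",
90 | "    </tr>\n",
91 | "    <tr>\n",
92 | "      <th>2</th>\n",
93 | "      <td>fear</td>\n",
94 | "      <td>1148719897788084224</td>\n",
95 | "      <td>2019-07-09 22:25:40</td>\n",
96 | "      <td>wavetossed</td>\n",
97 | "      <td>#EyesOn #SeeSomethingSaySomething #CIA #Clowns...</td>\n",
98 | "    </tr>\n",
99 | "    <tr>\n",
100 | "      <th>3</th>\n",
101 | "      <td>fear</td>\n",
102 | "      <td>1148653069003034630</td>\n",
103 | "      <td>2019-07-09 18:00:07</td>\n",
104 | "      <td>Misspiggychop</td>\n",
105 | "      <td>#HappyBirthdayStevenAvery\\n\\n#CorruptiwocCount...</td>\n",
106 | "    </tr>\n",
107 | "    <tr>\n",
108 | "      <th>4</th>\n",
109 | "      <td>fear</td>\n",
110 | "      <td>1148593210756947968</td>\n",
111 | "      <td>2019-07-09 14:02:15</td>\n",
112 | "      <td>HorrorBitsVids</td>\n",
113 | "      <td>\"Fight Fire With Fire\"\\n\\nWhat did you think o...</td>\n",
114 | "    </tr>\n",
115 | "  </tbody>\n",
116 | "</table>\n",
117 | "</div>"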
118 | ],
119 | "text/plain": [
120 | " label id date user \\\n",
121 | "0 fear 1148914607152619520 2019-07-10 11:19:22 cheri_shapley \n",
122 | "1 fear 1148837283812073473 2019-07-10 06:12:07 Dronearl_RSA \n",
123 | "2 fear 1148719897788084224 2019-07-09 22:25:40 wavetossed \n",
124 | "3 fear 1148653069003034630 2019-07-09 18:00:07 Misspiggychop \n",
125 | "4 fear 1148593210756947968 2019-07-09 14:02:15 HorrorBitsVids \n",
126 | "\n",
127 | " text \n",
128 | "0 Sometimes what you're most #afraid of doing is... \n",
129 | "1 Delayed post \\n#Afraid \\n@TallRacksRec https:/... \n",
130 | "2 #EyesOn #SeeSomethingSaySomething #CIA #Clowns... \n",
131 | "3 #HappyBirthdayStevenAvery\\n\\n#CorruptiwocCount... \n",
132 | "4 \"Fight Fire With Fire\"\\n\\nWhat did you think o... "
133 | ]
134 | },
135 | "execution_count": 4,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "dataset.head()"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 5,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "import seaborn as sns"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Label count\n",
158 | "\n",
159 | "Check the count of each label"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 6,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/plain": [
170 | ""
171 | ]
172 | },
173 | "execution_count": 6,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | },
177 | {
178 | "data": {
179 | "image/png": "\n",
180 | "text/plain": [
181 | ""
182 | ]
183 | },
184 | "metadata": {
185 | "needs_background": "light"
186 | },
187 | "output_type": "display_data"
188 | }
189 | ],
190 | "source": [
191 | "sns.countplot(x='label', data=dataset)"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "## Text length\n",
199 | "\n",
200 | "Check the length of the tweets"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 7,
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "data": {
210 | "text/plain": [
211 | ""
212 | ]
213 | },
214 | "execution_count": 7,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | },
218 | {
219 | "data": {
220 | "image/png": "\n",
221 | "text/plain": [
222 | ""
223 | ]
224 | },
225 | "metadata": {
226 | "needs_background": "light"
227 | },
228 | "output_type": "display_data"
229 | }
230 | ],
231 | "source": [
232 | "sns.distplot(dataset.text.apply(lambda text: len(text)), bins=30)"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "## Word count\n",
240 | "\n",
241 | "Check the word count"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 8,
247 | "metadata": {},
248 | "outputs": [
249 | {
250 | "data": {
251 | "text/plain": [
252 | ""
253 | ]
254 | },
255 | "execution_count": 8,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | },
259 | {
260 | "data": {
261 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xt8XVWd9/HP7+SetLk0TUubXukFSbmV1gIiILdaHKSOwgg4wji8BnmAGRX1GXyeB0YZZ4QZZ3BUxhGFEVEEBJUKlTpcFBAoTUtbWkpp6D1tadomaZvbyUl+zx9nB0LI5SQ5Obd836/XeWWfvdfe57ez2/PLWmuvtc3dERERCSU7ABERSQ1KCCIiAighiIhIQAlBREQAJQQREQkoIYiICKCEICIiASUEEREBlBBERCSQnewABmP8+PE+Y8aMZIchIpJWVq9efcDdKwYql1YJYcaMGVRXVyc7DBGRtGJmO2IppyYjEREBlBBERCSghCAiIoASgoiIBJQQREQEUEIQEZGAEoKIiABKCAmlx5WKSCpTQkiQF2sOsOifn+auZ2uSHYqISK+UEBLg/pe289l7X+FwSzv/9vvNVG8/lOyQRETeJ6aEYGZLzGyzmdWY2c29bM8zs4eC7SvNbEawfpGZrQ1e68zsz7vts93MXgu2ZeR8FO7OPzy2gVse28hH5lbwx6+ey5SyQr740FoOt7YnOzwRkfcYMCGYWRZwF3ARUAVcYWZVPYpdA9S7+2zgTuCOYP0GYKG7nwIsAX5oZt3nTzrX3U9x94XDPI+U9FbdUe57aQefOW0ad1+1kGNK8rnz06ewt7GVW3+zIdnhiYi8Ryw1hEVAjbtvdfcw8CCwtEeZpcB9wfIjwPlmZu7e7O6RYH0+MKp6VVfvqAfgc2fOJCtkACyYXsbfnTeH36zdw2/X7UlmeCIi7xFLQqgEdnV7vztY12uZIAE0AuUAZnaamW0EXgOu65YgHPi9ma02s2uHfgqpa82OBkoKcjh2fNF71t9w7ixOqCzm9t+9QTjSmaToRETea8Q7ld19pbvPAz4IfM3M8oNNH3b3U4k2Rd1gZmf3tr+ZXWtm1WZWXVdXN9LhxtWanfXMn1ZKKKgddMnOCvGVxcdR29DCw9W7+thbRCSxYkkItcDUbu+nBOt6LRP0EZQAB7sXcPdNwFHghOB9bfBzP/Brok1T7+Pud7v7QndfWFEx4PMdUkZjSztb9h/l1GllvW4/Z24FC6eX8b1nttDa3pHg6ERE3i+WhLAKmGNmM80sF7gcWNajzDLg6mD5UuAZd/dgn2wAM5sOfADYbmZFZjY2WF8ELCbaAZ0x1u1qAOgzIZgZX158HG8fbuPnK3cmMjQRkV4NmBCCNv8bgRXAJuBhd99oZreZ2SVBsXuAcjOrAW4Cum5N/TCwzszWEq0FXO/uB4CJwAtmtg54BXjC3Z+M54kl25qd9ZjByVNL+ixzxqxyPjSrnB/8oYamtkif5UREEiGmR2i6+3JgeY91t3ZbbgUu62W/+4H7e1m/FTh5sMGmkzU7G5g7YSxj83P6LfflxXP51A9e4icvbueGc2cnKDoRkffTSOUR0NnpvLqznlOnlw5YdsH0cVxw/AT+89ka9h9uTUB0IiK9U0IYAVsPHOVIa4T5ffQf9PT//qyK9g7n9iffGOHIRET6poQwAtbs6L9DuacZ44u45qyZ/GpNLWt21o9kaCIifVJCGAFrdtb3OiCtPzeeO5uJxXl8fdlGOjtH1YBuEUkRSggjoK8Baf0pysvmaxcdz/rdjfxytQariUjiKSHE2eHW/gek9WfpKZP54Iwy/nn5G7ytDmYRSTAlhDh7a/9R3KFqUvGg9zUz7vjUSbRFOvj7R9frCWsiklBKCHG2pyH6l31lWcGQ9j+2Ygw3L/kAf9hcx0Or1HQkIomjhBBnextbAJhcOrSEAHDVGTP40Kxy/vHx19l1
qDleoYmI9EsJIc5qG1ooys2iOD+mQeC9CoWMf7n0pOh8R79cp7uORCQhlBDibE9DC5NLCzCL/Q6j3kwpK+TWj1fxyrZD3PPCtjhFJyLSNyWEONvT0Dqs5qLuLlswhQurJvKvKzazed+RuBxTRKQvSghxtrexhcml+QMXjIGZ8a1PnkhxQTZffGgtbRE9N0FERo4SQhy1tndw4GiYySXxqSEAjB+Tx+2fPIlNew/znae2xO24IiI9KSHE0d7G6C2n8Woy6nJB1UT+YuEU7n5uK2++raYjERkZSghxtLchesvppDg1GXX3tYuOZ2x+Nrc+tkED1kRkRCghxFFtkBAq41xDACgryuUri4/j5a2HeHz93rgfX0RECSGOukYpH1MS/xoCwBWLpjFvcjH/9MQmPXJTROJOCSGO9ja2MH5MHnnZWSNy/KyQcdvSE9h3uJXvPVMzIp8hIqOXEkIc1Ta0UDkC/QfdLZhexidPreTeF7bpkZsiElcxJQQzW2Jmm82sxsxu7mV7npk9FGxfaWYzgvWLzGxt8FpnZn8e6zHTUdco5ZH2d+fNIdLZyb1/2j7inyUio8eACcHMsoC7gIuAKuAKM6vqUewaoN7dZwN3AncE6zcAC939FGAJ8EMzy47xmGnF3dnT0MqkOI5B6MuM8UV87MRJ/PzlHRxubR/xzxOR0SGWGsIioMbdt7p7GHgQWNqjzFLgvmD5EeB8MzN3b3b3rt7PfKDrfslYjplWGlvaaWnviNso5YFcd84sjrRF+PnLOxPyeSKS+WKZkrMS6D4x/27gtL7KuHvEzBqBcuCAmZ0G3AtMBz4bbI/lmGlluLecPrBy8F/scyaM4a5nayjMzSIna3DdQVeeNm3QnycimW3EO5XdfaW7zwM+CHzNzAb1J7SZXWtm1WZWXVdXNzJBxkHXLaeTEtCH0OXsuRUcbYuwZmd9wj5TRDJXLAmhFpja7f2UYF2vZcwsGygBDnYv4O6bgKPACTEes2u/u919obsvrKioiCHc5Hj3wTiJaTICOHZ8EVPKCnh+ywE6NXpZRIYploSwCphjZjPNLBe4HFjWo8wy4Opg+VLgGXf3YJ9sADObDnwA2B7jMdNKbUMLuVkhxhflJewzzYwzZ43nUFOYbQeaEva5IpKZBuxDCNr8bwRWAFnAve6+0cxuA6rdfRlwD3C/mdUAh4h+wQN8GLjZzNqBTuB6dz8A0Nsx43xuCbWnoZVjSvIJhYb3YJzBqppcTH5OiDU76plVMSahny0imSWm5zy6+3JgeY91t3ZbbgUu62W/+4H7Yz1mOtvbEL/nIAxGTlaIEytLWburnkvaJ5OXMzKjpEUk82mkcpwkalBabxZMK6W9w9mw53BSPl9EMoMSQhxEOjrZd7g1rg/GGYyp4wopL8rV3UYiMixKCHHw9pE2Oj3+D8aJlZmxYHoZ2w40cagpnJQYRCT9KSHEwb7GkXswTqxOmVqKAa+qliAiQ6SEEAcHjkb/Kq8Yk7hbTnsqLcxlVsUY1uys15gEERkSJYQ46GqmKSvKTWoc86eVUt/cTm19S1LjEJH0pIQQB10JYVxhchPC8ZOKyTLjtdrGpMYhIulJCSEODjWFKcjJoiA3uWMA8nOymD1hDBv2NOJqNhKRQVJCiIP6pjDjktxc1OWEyhIamtvfmX1VRCRWSghxcDCFEsLxk8YSMthQq0FqIjI4SghxUN+cOgmhMDebWRVqNhKRwVNCiINDKVRDgGiz0aGmMHsbW5MdioikESWEOEi1hFA1qTjabLRHdxuJSOyUEIaptb2D5nBHSiWEorxsZo4vYkOtmo1EJHZKCMP0zhiEFEoIEG02OnA0zNuH25IdioikCSWEYXpnlHKSB6X1VDWpGAPW1zYkOxQRSRNKCMOUqjWEsfk5HFtRxGu71WwkIrFRQhim+ubUTAgAJ1WWclB3G4lIjJQQhung0dRNCPMmR+82Wr9bdxuJyMCUEIapvjlMyKCkICfZobxPYV42syeM
4bXaBjUbiciAsmMpZGZLgP8AsoAfu/vtPbbnAT8FFgAHgU+7+3YzuxC4HcgFwsBX3f2ZYJ8/AJOArkl3Frv7/mGfURw9sHLngGVWbj1Efk4WD63alYCIBu/EyhIeXVPL7voWpo4rTHY4IpLCBqwhmFkWcBdwEVAFXGFmVT2KXQPUu/ts4E7gjmD9AeDj7n4icDVwf4/9PuPupwSvlEoGsWoKRyjKiymvJkXVpBJNiS0iMYmlyWgRUOPuW909DDwILO1RZilwX7D8CHC+mZm7v+rue4L1G4GCoDaRMZrDHRQledrr/hTkZjFn4hheq23Uk9REpF+xJIRKoHt7yO5gXa9l3D0CNALlPcp8Cljj7t1HSv23ma01s1vMzAYVeYpoakvtGgJEm40aW9rZfrAp2aGISApLSKeymc0j2oz0+W6rPxM0JZ0VvD7bx77Xmlm1mVXX1dWNfLCD1BTuoDA3tRPCvMkl5OeEeGXboWSHIiIpLJaEUAtM7fZ+SrCu1zJmlg2UEO1cxsymAL8GrnL3t7p2cPfa4OcR4AGiTVPv4+53u/tCd19YUVERyzklTKc7LeEIRXmp22QEkJsdYv7UMjbWHuZoWyTZ4YhIioolIawC5pjZTDPLBS4HlvUos4xopzHApcAz7u5mVgo8Adzs7n/qKmxm2WY2PljOAS4GNgzvVBKvtb2DToeiFK8hACyaOY4Od1bvqE92KCKSogZMCEGfwI3ACmAT8LC7bzSz28zskqDYPUC5mdUANwE3B+tvBGYDtwZ9BWvNbAKQB6wws/XAWqI1jB/F88QSoamtAyDlawgAE4vzmTm+iFe2HVTnsoj0KqY/bd19ObC8x7pbuy23Apf1st83gW/2cdgFsYeZmprD0eaXVO9D6HLazHE8uGoXW94+muxQRCQFaaTyMDQF7fGpfpdRl6rJxYzJy+aVbQeTHYqIpCAlhGFoCgdNRik8DqG77FCIhdPLeGPfEWobWgbeQURGFSWEYeiqIaRLkxFEO5fN4EfPbU12KCKSYpQQhqE53EFOlpGbnT6/xtLCXOZPK+MXr+xk/xFNiy0i70qfb7IUlA6jlHvzkbkVtHd0qpYgIu+hhDAMTeFIWoxB6Kl8TB6fOKWSn728k4NH9cxlEYlSQhiG5nAHhWnSodzTDefNpjXSwY9f2JbsUEQkRSghDEO6NhkBzKoYw8UnTeanL26nPngutIiMbkoIw9CU4lNfD+Rvz5tNc3sH33+2JtmhiEgKUEIYovaOTsKRzrStIQDMnTiWyz84jfte3E7N/iPJDkdEkkwJYYiag0Fp6TQGoTdfWTyXgtwsvvHb1/XcZZFRTglhiN6dtiJ9m4wgesfRTRfO5fktB/if199OdjgikkRKCEPUlGYT2/XnL0+fztyJY/jHJ16ntb0j2eGISJIoIQzRu01G6V1DAMjJCvEPH5/HrkMt/OuKzckOR0SSRAlhiFoyKCEAnDl7PFedMZ17XtjGo6t3JzscEUkCJYQhagmaVgpyMiMhANxycRVnHFvO1379Gmt3NSQ7HBFJMCWEIWoJJrbLzsqcX2FOVoi7PnMqE4vzuPan1bx9WJPfiYwmmfNtlmAt4Y6Mqh10GVeUy4+v+iBNbRGu+9lq2iLqZBYZLZQQhqilvSMj7jDqzXHHjOXf/uJkXt3ZwD88tlHjE0RGCSWEIWoOd5CfgTWELktOmMSN587mwVW7+PnKnckOR0QSIKaEYGZLzGyzmdWY2c29bM8zs4eC7SvNbEaw/kIzW21mrwU/z+u2z4JgfY2ZfdfMLF4nlQit7ek702msvnThXM49roJv/HYj1dsPJTscERlhAyYEM8sC7gIuAqqAK8ysqkexa4B6d58N3AncEaw/AHzc3U8Ergbu77bPD4C/AeYEryXDOI+Eaw5HMrIPobuskPGdy+czsTifWx7bSGenmo5EMlksNYRFQI27b3X3MPAgsLRHmaXAfcHyI8D5Zmbu/qq77wnWbwQKgtrEJKDY3V/2
aAP1T4FPDPtsEqilvYOCDK8hAJQU5PDlxXPZtPcwKzbuS3Y4IjKCYkkIlcCubu93B+t6LePuEaARKO9R5lPAGndvC8p3H/3U2zEBMLNrzazazKrr6upiCHfkRTo6ae/wUZEQAC45uZJjK4q486k3VUsQyWAJ6VQ2s3lEm5E+P9h93f1ud1/o7gsrKiriH9wQZOKgtP5khYwvXjCXN98+yhOv7U12OCIyQmJJCLXA1G7vpwTrei1jZtlACXAweD8F+DVwlbu/1a38lAGOmbK6pq0YLTUEgItPnMTciWP4zlNv0qFagkhGiuVG+lXAHDObSfRL+3Lgyh5llhHtNH4JuBR4xt3dzEqBJ4Cb3f1PXYXdfa+ZHTaz04GVwFXA94Z9NgmSCTWEB4ZwK+nC6eN44JWd3PzoeuZPKxvUvleeNm3QnyciiTVgDSHoE7gRWAFsAh52941mdpuZXRIUuwcoN7Ma4Cag69bUG4HZwK1mtjZ4TQi2XQ/8GKgB3gJ+F6+TGmmZNrFdrKomF3NMcT5/fLNOg9VEMlBMQ23dfTmwvMe6W7sttwKX9bLfN4Fv9nHMauCEwQSbKpozoIYwFCEzzpxdzqNratl6oIlZFWOSHZKIxJFGKg/BaOxD6HLSlFKKcrN4seZAskMRkThTQhiCrj6ETJ66oi85WSEWzRzHG/uOcKgpnOxwRCSOlBCGoCXcQX5OiFB6zbYRN6fNLMcMXnpLtQSRTKKEMAQt7Zk59XWsigtyOKGyhOod9ZoeWySDKCEMQUt4dExb0Z8PzRpPW6STNTv1ZDWRTKGEMAQt7R0U5mTmsxBiNW1cIVPLCvhTzQENVBPJEEoIQ9Ac7iB/lNcQAM49bgKHmsK8urM+2aGISBwoIQxBtIaghHDcMWOZUlbAs5v3E+nsTHY4IjJMSgiD5O60hCOjvg8BwMw4/wMTqG9u59Ud6ksQSXdKCIMU7uik00ffKOW+zJ04lqmqJYhkBCWEQRrNo5R7Y2acf/xEGlraWb1DfQki6UwJYZAyYabTeJszYQzTxhXy7Bv7CUdUSxBJV0oIg6QawvuZGRedcAyHWyM8tyU1nmonIoOnhDBIzWHVEHozvbyIEytLeH5LHY0t7ckOR0SGQAlhkFrbR+ezEGKxZN4xuMOKjfuSHYqIDIESwiCphtC3sqJczpw9nrW7Gth1qDnZ4YjIICkhDFJLewchg9xs/ep6c87cCsbkZbP8tb16qppImtG32iC1hKMzndoonfp6IPk5WVxw/ER2HGrmjX1Hkh2OiAyCEsIgtbR3UJA7uie2G8iC6WWUF+Xy+9f30alagkjaUEIYpGgNQb+2/mSFjAurJvL24TbW7dKUFiLpIqZvNjNbYmabzazGzG7uZXuemT0UbF9pZjOC9eVm9qyZHTWz7/fY5w/BMdcGrwnxOKGR1tLeQaFqCAM6obKEySX5PLXpbU1pIZImBkwIZpYF3AVcBFQBV5hZVY9i1wD17j4buBO4I1jfCtwCfKWPw3/G3U8JXvuHcgKJFm0y0h1GAwmZsXjeMdQ3t7Nq26FkhyMiMYilhrAIqHH3re4eBh4ElvYosxS4L1h+BDjfzMzdm9z9BaKJISM0hyPk65bTmMyZMIaZ44t4dnMdzeFIssMRkQHEkhAqgV3d3u8O1vVaxt0jQCNQHsOx/ztoLrrF+rhtx8yuNbNqM6uuq0vutAid7rS1d2pQWozMjMVVEznaFuFnL+9IdjgiMoBk9o5+xt1PBM4KXp/trZC73+3uC919YUVFRUID7Km1vQNHg9IGY3p5EbMnjOGHf9yqWoJIioslIdQCU7u9nxKs67WMmWUDJcDB/g7q7rXBzyPAA0SbplKaJrYbmgs+MIGDTWF++pJqCSKpLJaEsAqYY2YzzSwXuBxY1qPMMuDqYPlS4BnvZ5iqmWWb2fhgOQe4GNgw2OATTVNfD8208iLOnlvB3c9tpalNtQSRVDVgQgj6BG4EVgCbgIfdfaOZ3WZmlwTF7gHK
zawGuAl459ZUM9sO/DvwV2a2O7hDKQ9YYWbrgbVEaxg/it9pjYyuGoL6EAbvSxfM4VBTmPte2p7sUESkDzHdUO/uy4HlPdbd2m25Fbisj31n9HHYBbGFmDqagxqC7jIavPnTyvjIcRX86Lmt/OXp0ynOz0l2SCLSg4bcDoJqCMPzlcXHUd/czn/94a1khyIivVBCGISuu2Q0UnloTqgs4c/nV3LPC9vY09CS7HBEpAclhEFoCneQnxMiK6SZTofqy4vn4sC3f7852aGISA9KCIPQ3BZR7WCYppQV8tdnzuTXr9ayobYx2eGISDdKCIPQHO5Q/0EcXH/uLEoLcvinJzbpIToiKUQJYRCawx0UqYYwbMX5OXzh/Dm8tPUgy9btSXY4IhJQQhiEpnBENYQ4+ewZMzhlailfX7aRuiNtyQ5HRFBCGBQ1GcVPVsj410tPoqmtg1sfS/lB6iKjghJCjNo7OglHOinKU5NRvMyZOJYvXjiH323YxxPr9yY7HJFRTwkhRs2a2G5EXHvWsZw0pYRbHtvAgaNqOhJJJiWEGHUNSlOncnxlZ4X49mUnc7Qtws2Pvqa7jkSSSAkhRs2atmLEzJ04lr9f8gGe2vQ2D67aNfAOIjIilBBi1DVtc6H6EEbE5z40gw/PHs9tv32dbQeakh2OyKikhBCjrhpCkWoIIyIUMr592cnkZof44oOv0t7RmeyQREYdJYQYdfUhqFN55BxTks+3Pnki63Y38r2ntyQ7HJFRRwkhRs3hDvKyQ2SH9CsbSR87cRKXLpjC95+t4ZVth5Idjsioom+3GDWHOzQGIUG+fsk8po4r5EsPraWxpT3Z4YiMGkoIMWpq07QViTImL5vvfPoU9h1u1ShmkQRSQoiRpq1IrPnTyvji+XN4bO0efvHKzmSHIzIqxJQQzGyJmW02sxozu7mX7Xlm9lCwfaWZzQjWl5vZs2Z21My+32OfBWb2WrDPd80spZ860xyOaFBagl1/7mzOmVvBLb/ZwJ9qDiQ7HJGMN2BCMLMs4C7gIqAKuMLMqnoUuwaod/fZwJ3AHcH6VuAW4Cu9HPoHwN8Ac4LXkqGcQKI0qYaQcFkh4/tXzmdWxRiu+9lqavYfSXZIIhktlhrCIqDG3be6exh4EFjao8xS4L5g+RHgfDMzd29y9xeIJoZ3mNkkoNjdX/boXAU/BT4xnBMZSZFgYjsNSku8sfk53PNXC8nLDvHXP6nWfEciIyiWb7hKoPt8AruB0/oq4+4RM2sEyoG+6vmVwXG6H7MyloCTQdNWDN8DK4fXD3DZgqn8+IWtfPTO5/jcmTMZV5Tbb/krT5s2rM8TGY1SvlPZzK41s2ozq66rq0tKDE3BoDQ9Tzl5po6LPou5OdzBf/3xLWobWpIdkkjGiSUh1AJTu72fEqzrtYyZZQMlwMEBjjllgGMC4O53u/tCd19YUVERQ7jxp2krUsP08iI+f/axZIeMHz2/lddqG5MdkkhGiSUhrALmmNlMM8sFLgeW9SizDLg6WL4UeMb7mcfY3fcCh83s9ODuoquAxwYdfYK802SkPoSkm1Ccz3XnzKJiTB6/eGUnD6zcwZFWDV4TiYcBv+GCPoEbgRVAFnCvu280s9uAandfBtwD3G9mNcAhokkDADPbDhQDuWb2CWCxu78OXA/8BCgAfhe8UtI7M52qhpASigtyuO6cWbywpY6n39jPW3VbuLBqIgunl5GdlfKtoCIpK6Y/ed19ObC8x7pbuy23Apf1se+MPtZXAyfEGmgyqVM59WSFjHOOm8Dxk4v5zat7WLZuD398s46PHFfBgmllyQ5PJC2pDSQGzeGIJrZLURPG5vM3Z82kpu4oT2/az2Nr9/DClgNMH1/EOXOT0+ckkq70DRcDTVuR2syMORPG8vmzj+WvPjQDM7j63lf4Xz9bzR7djSQSM9UQYtAcjmim0zRgZsydOJa/O28OR9oifO+ZLbz41kHu/PTJnPeBickOTyTlqYYQg6Y21RDS
SXZWiBvOnc2TXzibytIC/von1Xx7xWY6Ovu88U1EUEKISXM4okFpaWjG+CJ+df2HuPyDU/n+szV89p6VHNTUFyJ9UkKIQXO4Q4PS0lR+Tha3f+ok/uXSk6jeUc/Hv/cC63c3JDsskZSkhDCASEcnbZrYLu39xcKpPHrdhzAzLv2vl3i4etfAO4mMMkoIA9AYhMxx4pQSfvu3H+aDM8r434+s59//5036GVAvMuooIQzg3YSgGkImGFeUy08+t4jLFkzhu09v4e8fXU97R2eywxJJCfqWG0DXTKfqQ8gcOVkh/uXSk5hUks93n6nhwNEw//WXC8jN1t9HMrrpf8AAVEPITGbGTYuP4x8/cQLPvLGfmx5eS6duS5VRTt9yA2juehZCnmoImeizp0+nuS3Ct373BuVFuXz9knmk+OO9RUaMEsIAjrRGMNSpnMk+f84sDjaFufu5rYwryuMLF8xJdkgiSaGEMICG5nbG5mdrYrs0M9hHdk4fV8ip08q486k32dvYwklTSge1vx7ZKZlA33IDaGgOU1rY//N7Jf2ZGZ+YP5lp4wr51Zpa9h1uTXZIIgmnhDCAhpZ2Sgtzkh2GJEB2KMSVi6aRlx3i5y/voCW4oUBktFBC6EenO43N7ZQWqIYwWhQX5HDladOobw7zcPUuOjVwTUYRJYR+HGmN0OGuGsIoM728iD87cRKb3z7Cy1sPJjsckYRRQuhHQ3MYgDIlhFHn9GPLmTtxDCs27uPAEc2QKqODEkI/GprbAdSpPAqZGZ+cP4XsUIhfrt6lZynIqBBTQjCzJWa22cxqzOzmXrbnmdlDwfaVZjaj27avBes3m9lHu63fbmavmdlaM6uOx8nEW1cNQU1Go1NxQQ6XnDKZXfUtPL+lLtnhiIy4AROCmWUBdwEXAVXAFWZW1aPYNUC9u88G7gTuCPatAi4H5gFLgP8MjtflXHc/xd0XDvtMRkB9SzsFOVnkZWtQ2mh18pRSTqws4elN+3lbt6JKhoulhrAIqHH3re4eBh4ElvYosxS4L1h+BDjfouP/lwIPunubu28DaoLjpYWG5rD6D4RLTp5MbnaIx9bu0XTZktFiSQiVQPeniewO1vVaxt0jQCNQPsC+DvzezFab2bWDD33kNTQyaYExAAAMY0lEQVS3q/9AKMrLZsm8Y9h+sIm1u/S0NclcyexU/rC7n0q0KeoGMzu7t0Jmdq2ZVZtZdV1d4tpx3Z2G5nbVEASABTPKmFpWwPIN+zRgTTJWLAmhFpja7f2UYF2vZcwsGygBDva3r7t3/dwP/Jo+mpLc/W53X+juCysqKmIINz4amtsJd3SqhiAAhMxYekolzW0Rfv/6vmSHIzIiYkkIq4A5ZjbTzHKJdhIv61FmGXB1sHwp8IxHG1uXAZcHdyHNBOYAr5hZkZmNBTCzImAxsGH4pxM/u+tbAN1hJO+aXFrA6ceW88q2Q9QG/z5EMsmACSHoE7gRWAFsAh52941mdpuZXRIUuwcoN7Ma4Cbg5mDfjcDDwOvAk8AN7t4BTAReMLN1wCvAE+7+ZHxPbXhqG5oBjUGQ97qwaiKFedn8dv0eTWshGSem6a/dfTmwvMe6W7sttwKX9bHvPwH/1GPdVuDkwQabSF01hLIC1RDkXfk5WSyZN5FH19SyblcD86eVJTskkbjRSOU+1Da0kJsVokAPxpEe5k8rY0pZAU9u2EdbuzqYJXMoIfShtr6F0sIcPU5R3idkxsdPmsyRtgjPbN6f7HBE4kYJoQ+1DS3qUJY+TR1XyIJpZbxYc5A6TX4nGUIJoQ/RhKAOZenb4nkTyc4yHl+vEcySGZQQetHUFokOSlOHsvRjbH4OFxw/kS37j/I/r7+d7HBEhk0JoRe1DV1jEFRDkP6dfmw5E8bmcdvjr9OqDmZJc0oIvajVoDSJUVbI+PjJk9ld38IP/7g12eGIDIsSQi92q4YggzCrYgx/dtIk/vMPNew61JzscESGTAmhF9sPNJGbFWJsfkzj
9kT4vx87nqyQ8f9+s0EdzJK2lBB68eJbBzl1eikhjUGQGE0uLeCrHz2OP75Zx2/W9pz7USQ9KCH0sP9IK5v2HubsuYmbWVUyw1VnzGD+tFJu++3rHDyqsQmSfpQQenj+zQMAnD1HCUEGJytk3PGpkzjaFuG2x19Pdjgig6aE0MNzW+oYPyaXqknFyQ5F0tDciWO54dzZPLZ2Dys26rkJkl6UELrp7HSe33KAs+ZUEAqp/0CG5vqPzObEyhK+/PA6Nu87kuxwRGKmhNDN63sPc6gpzFlzxic7FEljudkh7r5qAYW5WVxz3yr1J0jaUELo5o9vRp/ZfJb6D2SYJpUU8KOrFlJ3pI3rfraatohGMUvqU0Lo5rk366iaVEzF2LxkhyIZ4OSppXz7spNZtb2ev/npahqaw8kOSaRfSgiBo20RVu+o1+2mElcfP3kyt3/yRF5+6yAXf+8FNtQ2JjskkT4pIQRefusgkU7n7LnqP5D4unzRNH553Rl0djqf/MGL/OAPb9HUFkl2WCLvo7kZgI5O5+crd1CYm8WC6XpGrgzeAyt3Dljmc2fO5NE1u7njyTf47tNbOHN2OafPLKcwb/D/Da88bdpQwhTpV0w1BDNbYmabzazGzG7uZXuemT0UbF9pZjO6bftasH6zmX001mMmirvzf371Gs9uruOrHz2OvGw9Q1lGRlFeNledMYP/dc4sppcX8tSm/fzz7zZx7wvbeHnrQerVxyBJNuCfJmaWBdwFXAjsBlaZ2TJ37z4U8xqg3t1nm9nlwB3Ap82sCrgcmAdMBp4ys7nBPgMdMyFuf/INHqrexd+eN5vPnTkz0R8vo9DUcYVcdcYM9jW2sm53Axv3NLJs3R5YByUFOUwvL2TauEKOKclnUnEBBbn6I0USI5a66iKgxt23ApjZg8BSoPuX91Lg68HyI8D3Lfp0+qXAg+7eBmwzs5rgeMRwzLjr7HTCHZ1sP9jE2p0NvFBzgMfX7+Wzp0/npgvnDnwAkTg6piSfY0qOYXHVRPYfaWNr3VG2H2xm24Em1u9+t/O5OD+b0sJcSgtzKM7PoSA3i0hnJ2PzsynIySI/eOVmh8jNCpGXHSI7K0R2yMgKGSEzQgZmRtd8jUb0PURryZ0e/dnhTqTDiXQ6kY5OIp1OR6fTGczgakSPETIjO8vIDhnZoRA52cHPLMO6fV6nOx4c2x265oHt2h4yusUYfR8K4rRuk0t237/rmJ0DzCrb2W0fd3/ns7vO3Xj3s/rSda7df4eZLJaEUAns6vZ+N3BaX2XcPWJmjUB5sP7lHvtWBssDHTNuLv7e87yx9wiRzvf+AyotzOGvPjSDWy+uyvgLLanLzJhYnM/E4nzOmBX98jrSFmFfYyv7GlupO9JGfUuY2voWNrUepr3D9cjOJOlKEFlmYF3JJZoouzjdkp+/933PqdHfSZ70frzuX0trbrmQ/JyRrS2mfKeymV0LXBu8PWpmm+N17B3AOuAb7109HjgQr89IATqf1JZp5wOZd04pcT4F/zis3afHUiiWhFALTO32fkqwrrcyu80sGygBDg6w70DHBMDd7wbujiHOuDCzandfmKjPG2k6n9SWaecDmXdOmXY+/YnlLqNVwBwzm2lmuUQ7iZf1KLMMuDpYvhR4xqN1o2XA5cFdSDOBOcArMR5TREQSaMAaQtAncCOwAsgC7nX3jWZ2G1Dt7suAe4D7g07jQ0S/4AnKPUy0szgC3ODuHQC9HTP+pyciIrEyPf/1vczs2qCZKiPofFJbpp0PZN45Zdr59EcJQUREAM1lJCIiASWEQKpMpTEcZjbVzJ41s9fNbKOZfSFYP87M/sfMtgQ/02bCJjPLMrNXzezx4P3MYHqUmmC6lNxkxzgYZlZqZo+Y2RtmtsnMzkjz6/Ol4N/aBjP7hZnlp9M1MrN7zWy/mW3otq7X62FR3w3Oa72ZnZq8yEeGEgLvmZ7jIqAKuCKYdiPdRIAvu3sVcDpwQ3AeNwNPu/sc4Ong
fbr4ArCp2/s7gDvdfTZQT3TalHTyH8CT7v4B4GSi55aW18fMKoG/Axa6+wlEbxDpmromXa7RT4AlPdb1dT0uInqn5ByiY6N+kKAYE0YJIeqd6TncPQx0TaWRVtx9r7uvCZaPEP2yqSR6LvcFxe4DPpGcCAfHzKYAfwb8OHhvwHlEp0eBNDoXADMrAc4melce7h529wbS9PoEsoGCYPxRIbCXNLpG7v4c0Tsju+vreiwFfupRLwOlZjYpMZEmhhJCVG/Tc1T2UTYtWHTG2fnASmCiu+8NNu0DJiYprMH6DvC/gc7gfTnQ4O5dDxNIt+s0E6gD/jtoBvuxmRWRptfH3WuBbwM7iSaCRmA16X2NoO/rkXHfEz0pIWQgMxsDPAp80d0Pd98WDBhM+VvLzOxiYL+7r052LHGUDZwK/MDd5wNN9GgeSpfrAxC0rS8lmugmA0W8v/klraXT9YgHJYSoWKbnSAtmlkM0Gfzc3X8VrH67q2ob/NyfrPgG4UzgEjPbTrQJ7zyi7e+lQfMEpN912g3sdveVwftHiCaIdLw+ABcA29y9zt3bgV8RvW7pfI2g7+uRMd8TfVFCiMqIqTSCNvZ7gE3u/u/dNnWfWuRq4LFExzZY7v41d5/i7jOIXo9n3P0zwLNEp0eBNDmXLu6+D9hlZscFq84nOoo/7a5PYCdwupkVBv/2us4nba9RoK/rsQy4Krjb6HSgsVvTUmaIzjOuF/Ax4E3gLeD/JjueIZ7Dh4lWb9cDa4PXx4i2vT8NbAGeAsYlO9ZBntdHgMeD5WOJzodVA/wSyEt2fIM8l1OA6uAa/QYoS+frQ3Sy4DeADcD9QF46XSPgF0T7P9qJ1uCu6et6EJ2d+q7gO+I1ondXJf0c4vnSSGUREQHUZCQiIgElBBERAZQQREQkoIQgIiKAEoKIiASUEET6EcxOev0Q951hZlfGOyaRkaKEINK/UmBICQGYASghSNpQQhDp3+3ALDNba2b/amZfNbNVwXz43wAwsw8G7/PNrCh4PsAJwb5nBft+KalnIRIDDUwT6Ucwa+zj7n6CmS0mOiXD54mOWl0G/Iu7P2dm3wTygQKi8xV9y8w+AnzF3S9OSvAig5Q9cBERCSwOXq8G78cQfVjKc8BtROfEaiX60BiRtKOEIBI7A77l7j/sZVs50QSRQ7Sm0JTIwETiQX0IIv07AowNllcAfx08bwIzqzSzCcG2HwK3AD8n+gjJnvuKpDzVEET64e4HzexPwUPYfwc8ALwUne2Zo8BfmtkSoN3dHwiez/2imZ0HPA90mNk64CfufmeSTkMkJupUFhERQE1GIiISUEIQERFACUFERAJKCCIiAighiIhIQAlBREQAJQQREQkoIYiICAD/H27qECVGcYpoAAAAAElFTkSuQmCC\n",
262 | "text/plain": [
263 | ""
264 | ]
265 | },
266 | "metadata": {
267 | "needs_background": "light"
268 | },
269 | "output_type": "display_data"
270 | }
271 | ],
272 | "source": [
273 | "sns.distplot(dataset.text.apply(lambda text: len(text.split())), bins=10)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": []
282 | }
283 | ],
284 | "metadata": {
285 | "kernelspec": {
286 | "display_name": "Python 3",
287 | "language": "python",
288 | "name": "python3"
289 | },
290 | "language_info": {
291 | "codemirror_mode": {
292 | "name": "ipython",
293 | "version": 3
294 | },
295 | "file_extension": ".py",
296 | "mimetype": "text/x-python",
297 | "name": "python",
298 | "nbconvert_exporter": "python",
299 | "pygments_lexer": "ipython3",
300 | "version": "3.6.8"
301 | }
302 | },
303 | "nbformat": 4,
304 | "nbformat_minor": 2
305 | }
306 |
--------------------------------------------------------------------------------