├── datasets
    └── .gitkeep
├── models
    └── .gitkeep
├── nlp
    ├── __init__.py
    ├── dataset.py
    └── utils.py
├── requirements.txt
├── requirements.gpu.txt
├── .env.sample
├── .gitignore
├── README.md
├── query_relations.json
└── notebooks
    ├── Fetch Tweets.ipynb
    ├── Validate API Data.ipynb
    ├── Predict Emotion.ipynb
    ├── Train Sentiment Analysis.ipynb
    ├── Train Emotion Recognition Model.ipynb
    ├── Sentiment Analysis Score.ipynb
    ├── Emotion Recognition Model Validation.ipynb
    └── Check Emotion Labeled Dataset.ipynb


/datasets/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/nlp/__init__.py:
--------------------------------------------------------------------------------
"""NLP helpers: dataset loading and tweet text preprocessing."""
import nltk

# The English stop-word corpus is read by `preprocess`. Fetch it only when it
# is not installed yet, so importing the package does not contact the network
# (or fail offline) when the corpus is already available locally.
try:
  nltk.data.find('corpora/stopwords')
except LookupError:
  nltk.download('stopwords')

from .dataset import Dataset
from .utils import preprocess
6 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | python-dotenv==0.1.0
 2 | tweepy==3.5.0
 3 | jupyter==1.0.0
 4 | tensorflow==1.14.0
 5 | pandas==0.24.2
 6 | python-dotenv==0.1.0
 7 | nltk==3.4.3
 8 | scikit-learn==0.21.2
 9 | emoji==0.5.2
10 | seaborn==0.9.0
11 | tqdm==4.32.2
12 | matplotlib==3.0.3


--------------------------------------------------------------------------------
/requirements.gpu.txt:
--------------------------------------------------------------------------------
 1 | python-dotenv==0.1.0
 2 | tweepy==3.5.0
 3 | jupyter==1.0.0
 4 | tensorflow-gpu==1.14.0
 5 | pandas==0.24.2
 6 | python-dotenv==0.1.0
 7 | nltk==3.4.3
 8 | scikit-learn==0.21.2
 9 | emoji==0.5.2
10 | seaborn==0.9.0
11 | tqdm==4.32.2
12 | matplotlib==3.0.3


--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | CONSUMER_KEY=consumer-key.get-from-https://developer.twitter.com/
2 | CONSUMER_SECRET=consumer-secret.get-from-https://developer.twitter.com/
3 | ACCESS_TOKEN=access_token.get-from-https://developer.twitter.com/
4 | ACCESS_TOKEN_SECRET=access_token_secret.get-from-https://developer.twitter.com/


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode
 2 | .DS_STORE
 3 | .env
 4 | 
 5 | # Python related
 6 | __pycache__
 7 | .ipynb_checkpoints
 8 | 
 9 | # Dataset files
10 | datasets/**/*.csv
11 | datasets/**/*.pickle
12 | 
13 | # Model files
14 | models/**/*.h5
15 | models/**/*.pickle
16 | 
17 | # Tensorboard logs
18 | models/**/logs/**/*.*


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Emotion From Tweet
 2 | 
 3 | This repository contains the source code for the article
 4 | **From Sentiment Analysis to Emotion Recognition: A NLP story**
 5 | 
 6 | ## Setup
 7 | 
 8 | 1. Install the dependencies (use ***requirements.gpu.txt*** instead of
 9 | *requirements.txt* when using GPU processing):
10 | 
11 |     ```bash
12 |     pip install -r requirements.txt
13 |     ```
14 | 1. Create a `.env` file:
15 | 
16 |     ```bash
17 |     cp .env.sample .env
18 |     ```
19 | 1. Set the environment variables inside the created `.env` file
20 | 
21 | ## Running
22 | 
23 | 1. Start the jupyter notebook:
24 | 
25 |     ```bash
26 |     jupyter notebook
27 |     ```
28 | 1. Go to the `notebooks` folder
29 | 1. Open and run the notebook you want
30 | 
31 | ***Note***: *Check the releases if you want the training step output files*
32 | 


--------------------------------------------------------------------------------
/nlp/dataset.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import pandas as pd
 3 | from time import time
 4 | from pathlib import Path
 5 | from .utils import preprocess
 6 | 
 7 | class Dataset:
 8 |   def __init__(self, filename, label_col='label', text_col='text'):
 9 |     self.filename = filename
10 |     self.label_col = label_col
11 |     self.text_col = text_col
12 | 
13 |   @property
14 |   def data(self):
15 |     data = self.dataframe[[self.label_col, self.text_col]].copy()
16 |     data.columns = ['label', 'text']
17 |     return data
18 | 
19 |   @property
20 |   def cleaned_data(self):
21 |     data =  self.dataframe[[self.label_col, 'cleaned']]
22 |     data.columns = ['label', 'text']
23 |     return data
24 | 
25 |   def load(self):
26 |     df = pd.read_csv(Path(self.filename).resolve())
27 |     self.dataframe = df
28 | 
29 |   def preprocess_texts(self, quiet=False):
30 |     self.dataframe['cleaned'] = preprocess(self.dataframe[self.text_col], quiet)
31 | 


--------------------------------------------------------------------------------
/query_relations.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   ":face_screaming_in_fear:": "fear",
 3 |   ":face_with_tears_of_joy:": "joy",
 4 |   ":grinning_face_with_smiling_eyes:": "joy",
 5 |   ":pouting_face:": "anger",
 6 |   ":crying_face:": "sadness",
 7 |   ":fearful_face:": "fear",
 8 |   ":face_with_steam_from_nose:": "anger",
 9 |   "#anxious": "fear",
10 |   "#sad": "sadness",
11 |   "#happiness": "joy",
12 |   "#fear": "fear",
13 |   "#joy": "joy",
14 |   "#pissed": "anger",
15 |   "#angry": "anger",
16 |   "#mad": "anger",
17 |   "#excited": "joy",
18 |   "#furious": "anger",
19 |   "#depressed": "sadness",
20 |   ":pensive_face:": "sadness",
21 |   "#afraid": "fear",
22 |   "#scared": "fear",
23 |   "#worried": "fear",
24 |   "#scary": "fear",
25 |   ":anxious_face_with_sweat:": "fear",
26 |   "#hateyou": "anger",
27 |   ":loudly_crying_face:": "sadness",
28 |   ":broken_heart:": "sadness",
29 |   ":red_heart:": "joy",
30 |   ":face_with_symbols_on_mouth:": "anger",
31 |   ":anger_face:": "anger",
32 |   ":smiling_face_with_smiling_eyes:": "joy",
33 |   "#depression": "sadness",
34 |   "#pissedoff": "anger"
35 | }


--------------------------------------------------------------------------------
/nlp/utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import nltk
 3 | from time import time
 4 | from emoji import demojize
 5 | 
 6 | def preprocess(texts, quiet=False):
 7 |   start = time()
 8 |   # Lowercasing
 9 |   texts = texts.str.lower()
10 | 
11 |   # Remove special chars
12 |   texts = texts.str.replace(r"(http|@)\S+", "")
13 |   texts = texts.apply(demojize)
14 |   texts = texts.str.replace(r"::", ": :")
15 |   texts = texts.str.replace(r"’", "'")
16 |   texts = texts.str.replace(r"[^a-z\':_]", " ")
17 | 
18 |   # Remove repetitions
19 |   pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
20 |   texts = texts.str.replace(pattern, r"\1")
21 | 
22 |   # Transform short negation form
23 |   texts = texts.str.replace(r"(can't|cannot)", 'can not')
24 |   texts = texts.str.replace(r"n't", ' not')
25 | 
26 |   # Remove stop words
27 |   stopwords = nltk.corpus.stopwords.words('english')
28 |   stopwords.remove('not')
29 |   stopwords.remove('nor')
30 |   stopwords.remove('no')
31 |   texts = texts.apply(
32 |     lambda x: ' '.join([word for word in x.split() if word not in stopwords])
33 |   )
34 | 
35 |   if not quiet:
36 |     print("Time to clean up: {:.2f} sec".format(time() - start))
37 | 
38 |   return texts
39 | 


--------------------------------------------------------------------------------
/notebooks/Fetch Tweets.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Fetch Tweets\n",
  8 |     "\n",
  9 |     "Download and save tweets, using a **query** value"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "data": {
 19 |       "text/plain": [
 20 |        "True"
 21 |       ]
 22 |      },
 23 |      "execution_count": 1,
 24 |      "metadata": {},
 25 |      "output_type": "execute_result"
 26 |     }
 27 |    ],
 28 |    "source": [
 29 |     "from dotenv import load_dotenv\n",
 30 |     "from pathlib import Path\n",
 31 |     "\n",
 32 |     "env_path = Path('../.env').resolve()\n",
 33 |     "load_dotenv(dotenv_path=env_path)"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## API access\n",
 41 |     "\n",
 42 |     "First of all, we'll connect to the Twitter API"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "import os"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 3,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "consumer_key = os.getenv(\"CONSUMER_KEY\")\n",
 61 |     "consumer_secret = os.getenv(\"CONSUMER_SECRET\")\n",
 62 |     "access_token = os.getenv(\"ACCESS_TOKEN\")\n",
 63 |     "access_token_secret = os.getenv(\"ACCESS_TOKEN_SECRET\")"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 4,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "from tweepy import OAuthHandler, API, TweepError"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 5,
 78 |    "metadata": {},
 79 |    "outputs": [
 80 |     {
 81 |      "name": "stdout",
 82 |      "output_type": "stream",
 83 |      "text": [
 84 |       "Successfully connected to the Twitter API.\n"
 85 |      ]
 86 |     }
 87 |    ],
 88 |    "source": [
 89 |     "auth = OAuthHandler(consumer_key, consumer_secret)\n",
 90 |     "auth.set_access_token(access_token, access_token_secret)\n",
 91 |     "api = API(auth)\n",
 92 |     "print('Successfully connected to the Twitter API.')"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "## Search Tweets\n",
100 |     "\n",
101 |     "Now we can define our query and search for the tweets containing it.\n",
102 |     "\n",
103 |     "- **query**: *hashtag* or *emoji* that will be used to fetch the tweets\n",
104 |     "- **max_requests**: Maximum number of requests to the API.\n",
105 |     "    - Restriction: 180 requests / 15 min window"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 6,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "query = '#angry'\n",
115 |     "max_requests = 180"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 7,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "# Converts aliases to the real emoji representation (e.g. :thumbs_up: => 👍)\n",
125 |     "\n",
126 |     "from emoji import emojize"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 8,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "q = emojize(query) + ' -filter:retweets'\n",
136 |     "searched_tweets = []\n",
137 |     "last_id = -1\n",
138 |     "request_count = 0\n",
139 |     "while request_count < max_requests:\n",
140 |     "    try:\n",
141 |     "        new_tweets = api.search(q=q,\n",
142 |     "                                lang='en',\n",
143 |     "                                count=100,\n",
144 |     "                                max_id=str(last_id - 1),\n",
145 |     "                                tweet_mode='extended')\n",
146 |     "        if not new_tweets:\n",
147 |     "            break\n",
148 |     "        searched_tweets.extend(new_tweets)\n",
149 |     "        last_id = new_tweets[-1].id\n",
150 |     "        request_count += 1\n",
151 |     "    except TweepError as e:\n",
152 |     "        print(e)\n",
153 |     "        break"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "markdown",
158 |    "metadata": {},
159 |    "source": [
160 |     "## Format and save\n",
161 |     "\n",
162 |     "Format the API data to the desired structure and save a `.csv` file"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 9,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "import pandas as pd"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": 10,
177 |    "metadata": {},
178 |    "outputs": [
179 |     {
180 |      "name": "stdout",
181 |      "output_type": "stream",
182 |      "text": [
183 |       "66 #angry tweets\n"
184 |      ]
185 |     }
186 |    ],
187 |    "source": [
188 |     "data = []\n",
189 |     "for tweet in searched_tweets:\n",
190 |     "    data.append([tweet.id, tweet.created_at, tweet.user.screen_name, tweet.full_text])\n",
191 |     "df = pd.DataFrame(data=data, columns=['id', 'date', 'user', 'text'])\n",
192 |     "print(str(len(data)) + ' ' + query + ' tweets')"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 11,
198 |    "metadata": {
199 |     "scrolled": true
200 |    },
201 |    "outputs": [
202 |     {
203 |      "data": {
204 |       "text/html": [
205 |        "<div>\n",
206 |        "<style scoped>\n",
207 |        "    .dataframe tbody tr th:only-of-type {\n",
208 |        "        vertical-align: middle;\n",
209 |        "    }\n",
210 |        "\n",
211 |        "    .dataframe tbody tr th {\n",
212 |        "        vertical-align: top;\n",
213 |        "    }\n",
214 |        "\n",
215 |        "    .dataframe thead th {\n",
216 |        "        text-align: right;\n",
217 |        "    }\n",
218 |        "</style>\n",
219 |        "<table border=\"1\" class=\"dataframe\">\n",
220 |        "  <thead>\n",
221 |        "    <tr style=\"text-align: right;\">\n",
222 |        "      <th></th>\n",
223 |        "      <th>id</th>\n",
224 |        "      <th>date</th>\n",
225 |        "      <th>user</th>\n",
226 |        "      <th>text</th>\n",
227 |        "    </tr>\n",
228 |        "  </thead>\n",
229 |        "  <tbody>\n",
230 |        "    <tr>\n",
231 |        "      <th>0</th>\n",
232 |        "      <td>1151133382627057664</td>\n",
233 |        "      <td>2019-07-16 14:16:00</td>\n",
234 |        "      <td>DaradeAbhijeet</td>\n",
235 |        "      <td>Don't promise when you are #Happy\\n&amp;amp;\\nDon'...</td>\n",
236 |        "    </tr>\n",
237 |        "    <tr>\n",
238 |        "      <th>1</th>\n",
239 |        "      <td>1151124672496324608</td>\n",
240 |        "      <td>2019-07-16 13:41:23</td>\n",
241 |        "      <td>TheRealFakeJack</td>\n",
242 |        "      <td>@realDonaldTrump 4:20 am it is a sign u need t...</td>\n",
243 |        "    </tr>\n",
244 |        "    <tr>\n",
245 |        "      <th>2</th>\n",
246 |        "      <td>1151118984793776129</td>\n",
247 |        "      <td>2019-07-16 13:18:47</td>\n",
248 |        "      <td>masterofnaps</td>\n",
249 |        "      <td>There's a special place in hell for people who...</td>\n",
250 |        "    </tr>\n",
251 |        "    <tr>\n",
252 |        "      <th>3</th>\n",
253 |        "      <td>1151115966220328960</td>\n",
254 |        "      <td>2019-07-16 13:06:47</td>\n",
255 |        "      <td>TiknisArts</td>\n",
256 |        "      <td>We know #Trump needs #attention to survive. It...</td>\n",
257 |        "    </tr>\n",
258 |        "    <tr>\n",
259 |        "      <th>4</th>\n",
260 |        "      <td>1151113082099232768</td>\n",
261 |        "      <td>2019-07-16 12:55:20</td>\n",
262 |        "      <td>emilieraddish</td>\n",
263 |        "      <td>Get your Instagram photo elsewhere not on top ...</td>\n",
264 |        "    </tr>\n",
265 |        "  </tbody>\n",
266 |        "</table>\n",
267 |        "</div>"
268 |       ],
269 |       "text/plain": [
270 |        "                    id                date             user  \\\n",
271 |        "0  1151133382627057664 2019-07-16 14:16:00   DaradeAbhijeet   \n",
272 |        "1  1151124672496324608 2019-07-16 13:41:23  TheRealFakeJack   \n",
273 |        "2  1151118984793776129 2019-07-16 13:18:47     masterofnaps   \n",
274 |        "3  1151115966220328960 2019-07-16 13:06:47       TiknisArts   \n",
275 |        "4  1151113082099232768 2019-07-16 12:55:20    emilieraddish   \n",
276 |        "\n",
277 |        "                                                text  \n",
278 |        "0  Don't promise when you are #Happy\\n&amp;\\nDon'...  \n",
279 |        "1  @realDonaldTrump 4:20 am it is a sign u need t...  \n",
280 |        "2  There's a special place in hell for people who...  \n",
281 |        "3  We know #Trump needs #attention to survive. It...  \n",
282 |        "4  Get your Instagram photo elsewhere not on top ...  "
283 |       ]
284 |      },
285 |      "execution_count": 11,
286 |      "metadata": {},
287 |      "output_type": "execute_result"
288 |     }
289 |    ],
290 |    "source": [
291 |     "df.head()"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 12,
297 |    "metadata": {},
298 |    "outputs": [
299 |     {
300 |      "name": "stdout",
301 |      "output_type": "stream",
302 |      "text": [
303 |       "Saved under: \"/home/rmohashi/Workspace/emotion-from-tweets/datasets/tweepy\"\n"
304 |      ]
305 |     }
306 |    ],
307 |    "source": [
308 |     "PATH = Path('../datasets/tweepy').resolve()\n",
309 |     "filename = query + '.csv'\n",
310 |     "df.to_csv(os.path.join(PATH, filename), index=None)\n",
311 |     "print('Saved under: \"' + PATH.as_posix() + '\"')"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": []
320 |   }
321 |  ],
322 |  "metadata": {
323 |   "kernelspec": {
324 |    "display_name": "Python 3",
325 |    "language": "python",
326 |    "name": "python3"
327 |   },
328 |   "language_info": {
329 |    "codemirror_mode": {
330 |     "name": "ipython",
331 |     "version": 3
332 |    },
333 |    "file_extension": ".py",
334 |    "mimetype": "text/x-python",
335 |    "name": "python",
336 |    "nbconvert_exporter": "python",
337 |    "pygments_lexer": "ipython3",
338 |    "version": "3.6.8"
339 |   }
340 |  },
341 |  "nbformat": 4,
342 |  "nbformat_minor": 2
343 | }
344 | 


--------------------------------------------------------------------------------
/notebooks/Validate API Data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Validate API Data\n",
  8 |     "\n",
  9 |     "Validate and create an emotion-labeled dataset"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "# Add project path to the PYTHONPATH\n",
 19 |     "\n",
 20 |     "import os\n",
 21 |     "import sys\n",
 22 |     "from pathlib import Path\n",
 23 |     "\n",
 24 |     "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 2,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "import json\n",
 34 |     "from pathlib import Path"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "## Load Relations\n",
 42 |     "\n",
 43 |     "Load the relations between queries and emotions"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 3,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "relations_path = Path('../query_relations.json').resolve()"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 4,
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "with relations_path.open('rb') as file:\n",
 62 |     "    relations = json.load(file)"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "markdown",
 67 |    "metadata": {},
 68 |    "source": [
 69 |     "## Load Tokenizer\n",
 70 |     "\n",
 71 |     "Load the tokenizer created during the model training process"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 5,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "import pickle"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 6,
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "tokenizer_path = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
 90 |     "with tokenizer_path.open('rb') as file:\n",
 91 |     "    tokenizer = pickle.load(file)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "## Load Model\n",
 99 |     "\n",
100 |     "Load the model, using the saved weights"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 7,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "from tensorflow.keras.layers import Input, Embedding, GRU\n",
110 |     "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
111 |     "from tensorflow.keras.layers import Bidirectional, Dense\n",
112 |     "from tensorflow.keras.models import Sequential"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 8,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
122 |     "embedding_dim = 200\n",
123 |     "input_length = 100\n",
124 |     "gru_units = 128\n",
125 |     "gru_dropout = 0.1\n",
126 |     "recurrent_dropout = 0.1\n",
127 |     "dropout = 0.1"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 9,
133 |    "metadata": {},
134 |    "outputs": [
135 |     {
136 |      "name": "stderr",
137 |      "output_type": "stream",
138 |      "text": [
139 |       "WARNING: Logging before flag parsing goes to stderr.\n",
140 |       "W0719 09:43:55.179866 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
141 |       "Instructions for updating:\n",
142 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
143 |       "W0719 09:43:55.207387 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
144 |       "Instructions for updating:\n",
145 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
146 |       "W0719 09:43:55.215560 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
147 |       "Instructions for updating:\n",
148 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
149 |       "W0719 09:43:55.216914 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
150 |       "Instructions for updating:\n",
151 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
152 |       "W0719 09:43:55.219862 4476569024 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
153 |       "Instructions for updating:\n",
154 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
155 |      ]
156 |     }
157 |    ],
158 |    "source": [
159 |     "model = Sequential()\n",
160 |     "model.add(Embedding(\n",
161 |     "    input_dim=input_dim,\n",
162 |     "    output_dim=embedding_dim,\n",
163 |     "    input_shape=(input_length,)\n",
164 |     "))\n",
165 |     "\n",
166 |     "model.add(Bidirectional(GRU(\n",
167 |     "    gru_units,\n",
168 |     "    return_sequences=True,\n",
169 |     "    dropout=gru_dropout,\n",
170 |     "    recurrent_dropout=recurrent_dropout\n",
171 |     ")))\n",
172 |     "model.add(GlobalMaxPooling1D())\n",
173 |     "model.add(Dense(32, activation='relu'))\n",
174 |     "model.add(Dropout(dropout))\n",
175 |     "\n",
176 |     "model.add(Dense(1, activation='sigmoid'))"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 10,
182 |    "metadata": {},
183 |    "outputs": [
184 |     {
185 |      "name": "stdout",
186 |      "output_type": "stream",
187 |      "text": [
188 |       "Model: \"sequential\"\n",
189 |       "_________________________________________________________________\n",
190 |       "Layer (type)                 Output Shape              Param #   \n",
191 |       "=================================================================\n",
192 |       "embedding (Embedding)        (None, 100, 200)          2000000   \n",
193 |       "_________________________________________________________________\n",
194 |       "bidirectional (Bidirectional (None, 100, 256)          252672    \n",
195 |       "_________________________________________________________________\n",
196 |       "global_max_pooling1d (Global (None, 256)               0         \n",
197 |       "_________________________________________________________________\n",
198 |       "dense (Dense)                (None, 32)                8224      \n",
199 |       "_________________________________________________________________\n",
200 |       "dropout (Dropout)            (None, 32)                0         \n",
201 |       "_________________________________________________________________\n",
202 |       "dense_1 (Dense)              (None, 1)                 33        \n",
203 |       "=================================================================\n",
204 |       "Total params: 2,260,929\n",
205 |       "Trainable params: 2,260,929\n",
206 |       "Non-trainable params: 0\n",
207 |       "_________________________________________________________________\n",
208 |       "None\n"
209 |      ]
210 |     }
211 |    ],
212 |    "source": [
213 |     "print(model.summary())"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 11,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "weights_path = Path('../models/sentiment_analysis/model_weights.h5').resolve()\n",
223 |     "model.load_weights(weights_path.as_posix())"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "## Group data by emotion\n",
231 |     "\n",
232 |     "Use the emotions to group the data"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": 12,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "import os\n",
242 |     "import re\n",
243 |     "import pandas as pd\n",
244 |     "from tqdm import tqdm"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 13,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "files_dir = Path('../datasets/tweepy').resolve()"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 14,
259 |    "metadata": {},
260 |    "outputs": [
261 |     {
262 |      "name": "stderr",
263 |      "output_type": "stream",
264 |      "text": [
265 |       "100%|██████████| 19/19 [00:00<00:00, 27.29it/s]\n"
266 |      ]
267 |     }
268 |    ],
269 |    "source": [
270 |     "emotion_data_dict = {}\n",
271 |     "\n",
272 |     "filenames = os.listdir(files_dir)\n",
273 |     "with tqdm(total=len(filenames)) as t:\n",
274 |     "    for filename in filenames:\n",
275 |     "        query = re.findall(r'(#[^.]+|:.+:)', filename)[0]\n",
276 |     "        emotion = relations[query]\n",
277 |     "\n",
278 |     "        file_data = pd.read_csv(os.path.join(files_dir, filename))\n",
279 |     "        dict_data = emotion_data_dict[emotion] if emotion in emotion_data_dict else None\n",
280 |     "        emotion_data_dict[emotion] = pd.concat([dict_data, file_data])\n",
281 |     "        t.update()"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "## Predict emotion and filter data\n",
289 |     "\n",
290 |     "Predict emotion and filter rows for each group created in the step above"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": 15,
296 |    "metadata": {},
297 |    "outputs": [
298 |     {
299 |      "name": "stderr",
300 |      "output_type": "stream",
301 |      "text": [
302 |       "[nltk_data] Downloading package stopwords to\n",
303 |       "[nltk_data]     /Users/rmohashi/nltk_data...\n",
304 |       "[nltk_data]   Package stopwords is already up-to-date!\n"
305 |      ]
306 |     }
307 |    ],
308 |    "source": [
309 |     "import re\n",
310 |     "import numpy as np\n",
311 |     "from emoji import demojize\n",
312 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
313 |     "from nlp import preprocess"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": 16,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": [
322 |     "def get_score_range(mean):\n",
323 |     "  if mean < 0.5:\n",
324 |     "    return (0.0, mean)\n",
325 |     "  return (mean, 1.0)"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 17,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "name": "stderr",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "Processing \"joy\" data: 100%|██████████| 4/4 [03:28<00:00, 51.09s/it]    "
338 |      ]
339 |     },
340 |     {
341 |      "name": "stdout",
342 |      "output_type": "stream",
343 |      "text": [
344 |       "Fear: Score Range: 0.000000 - 0.434182\n",
345 |       "Sadness: Score Range: 0.000000 - 0.220770\n",
346 |       "Anger: Score Range: 0.000000 - 0.410283\n",
347 |       "Joy: Score Range: 0.870705 - 1.000000\n"
348 |      ]
349 |     },
350 |     {
351 |      "name": "stderr",
352 |      "output_type": "stream",
353 |      "text": [
354 |       "\n"
355 |      ]
356 |     }
357 |    ],
358 |    "source": [
359 |     "result_data = []\n",
360 |     "\n",
361 |     "messages = []\n",
362 |     "with tqdm(total=len(emotion_data_dict.items())) as t:\n",
363 |     "    for emotion, dataset in emotion_data_dict.items():\n",
364 |     "        t.set_description('Processing \"' + emotion + '\" data')\n",
365 |     "\n",
366 |     "        cleaned_texts = preprocess(dataset.text, quiet=True)\n",
367 |     "        predict_sequences = [text.split() for text in cleaned_texts]\n",
368 |     "        list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)\n",
369 |     "        x_predict = pad_sequences(list_tokenized_predict, maxlen=100)\n",
370 |     "\n",
371 |     "        result = model.predict(x_predict)\n",
372 |     "        mean = np.mean(result)\n",
373 |     "        std = np.std(result)\n",
374 |     "        low, high = get_score_range(mean)\n",
375 |     "        messages.append(emotion.capitalize() + \": Score Range: {:4f} - {:4f}\".format(low, high))\n",
376 |     "        dataset = dataset[np.all([(result >= low), (result <= high)], axis=0)]\n",
377 |     "        dataset.insert(0, 'label', emotion)\n",
378 |     "\n",
379 |     "        result_data = result_data + [dataset]\n",
380 |     "        t.update()\n",
381 |     "\n",
382 |     "for message in messages:\n",
383 |     "    print(message)"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "markdown",
388 |    "metadata": {},
389 |    "source": [
390 |     "## Save dataset\n",
391 |     "\n",
392 |     "Save the resulting data"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 18,
398 |    "metadata": {},
399 |    "outputs": [
400 |     {
401 |      "name": "stdout",
402 |      "output_type": "stream",
403 |      "text": [
404 |       "Files saved under \"/Users/rmohashi/Workspace/emotion-from-tweet/datasets/sentiment_analysis/dataset.csv\"\n"
405 |      ]
406 |     }
407 |    ],
408 |    "source": [
409 |     "if len(result_data) > 0:\n",
410 |     "    result_data = pd.concat(result_data)\n",
411 |     "\n",
412 |     "    path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()\n",
413 |     "    result_data.to_csv(path, index=None)\n",
414 |     "\n",
415 |     "    print('Files saved under \"' + path.as_posix() + '\"')"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": null,
421 |    "metadata": {},
422 |    "outputs": [],
423 |    "source": []
424 |   }
425 |  ],
426 |  "metadata": {
427 |   "kernelspec": {
428 |    "display_name": "Python 3",
429 |    "language": "python",
430 |    "name": "python3"
431 |   },
432 |   "language_info": {
433 |    "codemirror_mode": {
434 |     "name": "ipython",
435 |     "version": 3
436 |    },
437 |    "file_extension": ".py",
438 |    "mimetype": "text/x-python",
439 |    "name": "python",
440 |    "nbconvert_exporter": "python",
441 |    "pygments_lexer": "ipython3",
442 |    "version": "3.6.8"
443 |   }
444 |  },
445 |  "nbformat": 4,
446 |  "nbformat_minor": 2
447 | }
448 | 


--------------------------------------------------------------------------------
/notebooks/Predict Emotion.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Predict Emotion\n",
  8 |     "\n",
  9 |     "The main objective of this notebook is to predict emotions from tweets"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "# Add project path to the PYTHONPATH\n",
 19 |     "\n",
 20 |     "import os\n",
 21 |     "import sys\n",
 22 |     "from pathlib import Path\n",
 23 |     "\n",
 24 |     "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 2,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "import pickle"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Load Tokenizer\n",
 41 |     "\n",
 42 |     "Load `.pickle` file with the tokenizer"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 3,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
 52 |     "with tokenizer_path.open('rb') as file:\n",
 53 |     "    tokenizer = pickle.load(file)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "## Load Model\n",
 61 |     "\n",
 62 |     "Load the trained emotion recognition model"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 4,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
 72 |     "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
 73 |     "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
 74 |     "from tensorflow.keras.models import Model"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 5,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
 84 |     "num_classes = 4\n",
 85 |     "embedding_dim = 500\n",
 86 |     "input_length = 100\n",
 87 |     "lstm_units = 128\n",
 88 |     "lstm_dropout = 0.1\n",
 89 |     "recurrent_dropout = 0.1\n",
 90 |     "spatial_dropout=0.2\n",
 91 |     "filters=64\n",
 92 |     "kernel_size=3"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 6,
 98 |    "metadata": {},
 99 |    "outputs": [
100 |     {
101 |      "name": "stderr",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "WARNING: Logging before flag parsing goes to stderr.\n",
105 |       "W0719 10:47:51.968286 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
106 |       "Instructions for updating:\n",
107 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
108 |       "W0719 10:47:52.031774 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
109 |       "Instructions for updating:\n",
110 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
111 |       "W0719 10:47:52.039301 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
112 |       "Instructions for updating:\n",
113 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
114 |       "W0719 10:47:52.040482 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
115 |       "Instructions for updating:\n",
116 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
117 |       "W0719 10:47:52.041715 4693956032 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
118 |       "Instructions for updating:\n",
119 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
120 |      ]
121 |     }
122 |    ],
123 |    "source": [
124 |     "input_layer = Input(shape=(input_length,))\n",
125 |     "output_layer = Embedding(\n",
126 |     "  input_dim=input_dim,\n",
127 |     "  output_dim=embedding_dim,\n",
128 |     "  input_shape=(input_length,)\n",
129 |     ")(input_layer)\n",
130 |     "\n",
131 |     "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
132 |     "\n",
133 |     "output_layer = Bidirectional(\n",
134 |     "LSTM(lstm_units, return_sequences=True,\n",
135 |     "     dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
136 |     ")(output_layer)\n",
137 |     "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
138 |     "                    kernel_initializer='glorot_uniform')(output_layer)\n",
139 |     "\n",
140 |     "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
141 |     "max_pool = GlobalMaxPooling1D()(output_layer)\n",
142 |     "output_layer = concatenate([avg_pool, max_pool])\n",
143 |     "\n",
144 |     "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
145 |     "\n",
146 |     "model = Model(input_layer, output_layer)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 7,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
156 |     "model.load_weights(model_weights_path.as_posix())"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "## Load data\n",
164 |     "\n",
165 |     "Load the data that will have the labels predicted by the model\n",
166 |     "\n",
167 |     "**data_path**: Path to the `.csv` file that will be used"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 8,
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "import pandas as pd"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 9,
182 |    "metadata": {},
183 |    "outputs": [
184 |     {
185 |      "data": {
186 |       "text/html": [
187 |        "<div>\n",
188 |        "<style scoped>\n",
189 |        "    .dataframe tbody tr th:only-of-type {\n",
190 |        "        vertical-align: middle;\n",
191 |        "    }\n",
192 |        "\n",
193 |        "    .dataframe tbody tr th {\n",
194 |        "        vertical-align: top;\n",
195 |        "    }\n",
196 |        "\n",
197 |        "    .dataframe thead th {\n",
198 |        "        text-align: right;\n",
199 |        "    }\n",
200 |        "</style>\n",
201 |        "<table border=\"1\" class=\"dataframe\">\n",
202 |        "  <thead>\n",
203 |        "    <tr style=\"text-align: right;\">\n",
204 |        "      <th></th>\n",
205 |        "      <th>id</th>\n",
206 |        "      <th>date</th>\n",
207 |        "      <th>user</th>\n",
208 |        "      <th>text</th>\n",
209 |        "    </tr>\n",
210 |        "  </thead>\n",
211 |        "  <tbody>\n",
212 |        "    <tr>\n",
213 |        "      <th>0</th>\n",
214 |        "      <td>1151893341782585349</td>\n",
215 |        "      <td>2019-07-18 16:35:48</td>\n",
216 |        "      <td>Ozzzylot</td>\n",
217 |        "      <td>⚡️ Fans share what Kyoto Animation studio mean...</td>\n",
218 |        "    </tr>\n",
219 |        "    <tr>\n",
220 |        "      <th>1</th>\n",
221 |        "      <td>1151893322908102657</td>\n",
222 |        "      <td>2019-07-18 16:35:43</td>\n",
223 |        "      <td>rosyutori</td>\n",
224 |        "      <td>Deep condolences to all who are passed away at...</td>\n",
225 |        "    </tr>\n",
226 |        "    <tr>\n",
227 |        "      <th>2</th>\n",
228 |        "      <td>1151893318101377024</td>\n",
229 |        "      <td>2019-07-18 16:35:42</td>\n",
230 |        "      <td>met_bit</td>\n",
231 |        "      <td>Striking news... How on earth can someone be s...</td>\n",
232 |        "    </tr>\n",
233 |        "    <tr>\n",
234 |        "      <th>3</th>\n",
235 |        "      <td>1151893304117813248</td>\n",
236 |        "      <td>2019-07-18 16:35:39</td>\n",
237 |        "      <td>Destructo_Dan</td>\n",
238 |        "      <td>I don’t know if I had any favorite anime from ...</td>\n",
239 |        "    </tr>\n",
240 |        "    <tr>\n",
241 |        "      <th>4</th>\n",
242 |        "      <td>1151893302863650816</td>\n",
243 |        "      <td>2019-07-18 16:35:39</td>\n",
244 |        "      <td>KDiscavage</td>\n",
245 |        "      <td>The news about Kyoto Animation Studios hit me ...</td>\n",
246 |        "    </tr>\n",
247 |        "  </tbody>\n",
248 |        "</table>\n",
249 |        "</div>"
250 |       ],
251 |       "text/plain": [
252 |        "                    id                 date           user  \\\n",
253 |        "0  1151893341782585349  2019-07-18 16:35:48       Ozzzylot   \n",
254 |        "1  1151893322908102657  2019-07-18 16:35:43      rosyutori   \n",
255 |        "2  1151893318101377024  2019-07-18 16:35:42        met_bit   \n",
256 |        "3  1151893304117813248  2019-07-18 16:35:39  Destructo_Dan   \n",
257 |        "4  1151893302863650816  2019-07-18 16:35:39     KDiscavage   \n",
258 |        "\n",
259 |        "                                                text  \n",
260 |        "0  ⚡️ Fans share what Kyoto Animation studio mean...  \n",
261 |        "1  Deep condolences to all who are passed away at...  \n",
262 |        "2  Striking news... How on earth can someone be s...  \n",
263 |        "3  I don’t know if I had any favorite anime from ...  \n",
264 |        "4  The news about Kyoto Animation Studios hit me ...  "
265 |       ]
266 |      },
267 |      "execution_count": 9,
268 |      "metadata": {},
269 |      "output_type": "execute_result"
270 |     }
271 |    ],
272 |    "source": [
273 |     "data_path = Path('../datasets/predict/1151893341782585349-1151863653320159233_kyoto_animation.csv').resolve()\n",
274 |     "data = pd.read_csv(data_path)\n",
275 |     "data.head()"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "markdown",
280 |    "metadata": {},
281 |    "source": [
282 |     "## Load Encoder\n",
283 |     "\n",
284 |     "Load `.pickle` file with the encoder"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": 10,
290 |    "metadata": {},
291 |    "outputs": [],
292 |    "source": [
293 |     "encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()\n",
294 |     "with encoder_path.open('rb') as file:\n",
295 |     "    encoder = pickle.load(file)"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "markdown",
300 |    "metadata": {},
301 |    "source": [
302 |     "## Preprocess data\n",
303 |     "\n",
304 |     "Preprocess the data that will be used"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": 11,
310 |    "metadata": {},
311 |    "outputs": [
312 |     {
313 |      "name": "stderr",
314 |      "output_type": "stream",
315 |      "text": [
316 |       "[nltk_data] Downloading package stopwords to\n",
317 |       "[nltk_data]     /Users/rmohashi/nltk_data...\n",
318 |       "[nltk_data]   Package stopwords is already up-to-date!\n"
319 |      ]
320 |     }
321 |    ],
322 |    "source": [
323 |     "from nlp import preprocess\n",
324 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": 12,
330 |    "metadata": {},
331 |    "outputs": [
332 |     {
333 |      "name": "stdout",
334 |      "output_type": "stream",
335 |      "text": [
336 |       "Time to clean up: 1.41 sec\n"
337 |      ]
338 |     }
339 |    ],
340 |    "source": [
341 |     "cleaned_data = preprocess(data.text)\n",
342 |     "sequences = [text.split() for text in cleaned_data]\n",
343 |     "list_tokenized = tokenizer.texts_to_sequences(sequences)\n",
344 |     "x_data = pad_sequences(list_tokenized, maxlen=100)"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "markdown",
349 |    "metadata": {},
350 |    "source": [
351 |     "## Results\n",
352 |     "\n",
353 |     "Predict the labels and summarize the predicted emotion distribution"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 13,
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "import numpy as np"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 14,
368 |    "metadata": {},
369 |    "outputs": [],
370 |    "source": [
371 |     "y_pred = model.predict(x_data)"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": 15,
377 |    "metadata": {},
378 |    "outputs": [
379 |     {
380 |      "name": "stdout",
381 |      "output_type": "stream",
382 |      "text": [
383 |       "angry: 0.0977998\n",
384 |       "fear: 0.3991122\n",
385 |       "joy: 0.03104621\n",
386 |       "sadness: 0.4720413\n"
387 |      ]
388 |     }
389 |    ],
390 |    "source": [
391 |     "for index, value in enumerate(np.sum(y_pred, axis=0) / len(y_pred)):\n",
392 |     "    print(encoder.classes_[index] + \": \" + str(value))"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 16,
398 |    "metadata": {},
399 |    "outputs": [
400 |     {
401 |      "name": "stdout",
402 |      "output_type": "stream",
403 |      "text": [
404 |       "angry: 0.09889558232931726\n",
405 |       "fear: 0.4011044176706827\n",
406 |       "joy: 0.030622489959839357\n",
407 |       "sadness: 0.46937751004016065\n"
408 |      ]
409 |     }
410 |    ],
411 |    "source": [
412 |     "y_pred_argmax = y_pred.argmax(axis=1)\n",
413 |     "data_len = len(y_pred_argmax)\n",
414 |     "for value in np.unique(y_pred_argmax):  # value is the predicted class id, so it indexes classes_ directly\n",
415 |     "    print(encoder.classes_[value] + \": \" + str(len(y_pred_argmax[y_pred_argmax == value]) / data_len))"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": 17,
421 |    "metadata": {},
422 |    "outputs": [
423 |     {
424 |      "data": {
425 |       "text/plain": [
426 |        "array([3, 3, 3, 3, 3])"
427 |       ]
428 |      },
429 |      "execution_count": 17,
430 |      "metadata": {},
431 |      "output_type": "execute_result"
432 |     }
433 |    ],
434 |    "source": [
435 |     "y_pred[5:10].argmax(axis=1)"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": 18,
441 |    "metadata": {},
442 |    "outputs": [
443 |     {
444 |      "data": {
445 |       "text/plain": [
446 |        "'My heart goes out to the people who died in the fire at Kyoto Animation Studio. \\n\\n#PrayForKyoani https://t.co/Jvg9R8f6Oc'"
447 |       ]
448 |      },
449 |      "execution_count": 18,
450 |      "metadata": {},
451 |      "output_type": "execute_result"
452 |     }
453 |    ],
454 |    "source": [
455 |     "data.text.iloc[6]"
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": null,
461 |    "metadata": {},
462 |    "outputs": [],
463 |    "source": []
464 |   }
465 |  ],
466 |  "metadata": {
467 |   "kernelspec": {
468 |    "display_name": "Python 3",
469 |    "language": "python",
470 |    "name": "python3"
471 |   },
472 |   "language_info": {
473 |    "codemirror_mode": {
474 |     "name": "ipython",
475 |     "version": 3
476 |    },
477 |    "file_extension": ".py",
478 |    "mimetype": "text/x-python",
479 |    "name": "python",
480 |    "nbconvert_exporter": "python",
481 |    "pygments_lexer": "ipython3",
482 |    "version": "3.6.8"
483 |   }
484 |  },
485 |  "nbformat": 4,
486 |  "nbformat_minor": 2
487 | }
488 | 


--------------------------------------------------------------------------------
/notebooks/Train Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Train Sentiment Analysis\n",
  8 |     "\n",
  9 |     "Here we'll train a sentiment analysis model to validate the data from the API."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import warnings\n",
 19 |     "warnings.filterwarnings('ignore')"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "from pathlib import Path\n",
 29 |     "import pandas as pd"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 3,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "sentiment140_path = Path('../datasets/sentiment140/sentiment140.csv')\n",
 39 |     "data = pd.read_csv(sentiment140_path)"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 4,
 45 |    "metadata": {},
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/html": [
 50 |        "<div>\n",
 51 |        "<style scoped>\n",
 52 |        "    .dataframe tbody tr th:only-of-type {\n",
 53 |        "        vertical-align: middle;\n",
 54 |        "    }\n",
 55 |        "\n",
 56 |        "    .dataframe tbody tr th {\n",
 57 |        "        vertical-align: top;\n",
 58 |        "    }\n",
 59 |        "\n",
 60 |        "    .dataframe thead th {\n",
 61 |        "        text-align: right;\n",
 62 |        "    }\n",
 63 |        "</style>\n",
 64 |        "<table border=\"1\" class=\"dataframe\">\n",
 65 |        "  <thead>\n",
 66 |        "    <tr style=\"text-align: right;\">\n",
 67 |        "      <th></th>\n",
 68 |        "      <th>label</th>\n",
 69 |        "      <th>tweet</th>\n",
 70 |        "    </tr>\n",
 71 |        "  </thead>\n",
 72 |        "  <tbody>\n",
 73 |        "    <tr>\n",
 74 |        "      <th>0</th>\n",
 75 |        "      <td>0</td>\n",
 76 |        "      <td>@whiskey_kitten www.Pandora.com - plays music ...</td>\n",
 77 |        "    </tr>\n",
 78 |        "    <tr>\n",
 79 |        "      <th>1</th>\n",
 80 |        "      <td>0</td>\n",
 81 |        "      <td>studying for a test I hope not to fail....most...</td>\n",
 82 |        "    </tr>\n",
 83 |        "    <tr>\n",
 84 |        "      <th>2</th>\n",
 85 |        "      <td>4</td>\n",
 86 |        "      <td>@BlowhornOz Oh! Doesn't sound so good, I got t...</td>\n",
 87 |        "    </tr>\n",
 88 |        "    <tr>\n",
 89 |        "      <th>3</th>\n",
 90 |        "      <td>0</td>\n",
 91 |        "      <td>tomorrow is my last day at A&amp;amp;D HS  fml and...</td>\n",
 92 |        "    </tr>\n",
 93 |        "    <tr>\n",
 94 |        "      <th>4</th>\n",
 95 |        "      <td>0</td>\n",
 96 |        "      <td>Journalism has no future? That sounds pretty m...</td>\n",
 97 |        "    </tr>\n",
 98 |        "  </tbody>\n",
 99 |        "</table>\n",
100 |        "</div>"
101 |       ],
102 |       "text/plain": [
103 |        "   label                                              tweet\n",
104 |        "0      0  @whiskey_kitten www.Pandora.com - plays music ...\n",
105 |        "1      0  studying for a test I hope not to fail....most...\n",
106 |        "2      4  @BlowhornOz Oh! Doesn't sound so good, I got t...\n",
107 |        "3      0  tomorrow is my last day at A&amp;D HS  fml and...\n",
108 |        "4      0  Journalism has no future? That sounds pretty m..."
109 |       ]
110 |      },
111 |      "execution_count": 4,
112 |      "metadata": {},
113 |      "output_type": "execute_result"
114 |     }
115 |    ],
116 |    "source": [
117 |     "data.head()"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "## Data preprocessing\n",
125 |     "\n",
126 |     "Preprocess the texts:\n",
127 |     "- Convert to Lowercase: Convert all characters from the text to lowercase\n",
128 |     "- Remove special characters: Remove links and usernames and transform emojis to text\n",
129 |     "- Remove repetitions: Remove char repetitions (e.g. whaaaaaat => what)\n",
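130 |     "- Remove Stop words: Remove common stop words, keeping the negations not, nor and no (contractions such as can't are expanded to can not beforehand)"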
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 5,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "name": "stderr",
140 |      "output_type": "stream",
141 |      "text": [
142 |       "[nltk_data] Downloading package stopwords to\n",
143 |       "[nltk_data]     /home/rmohashi/nltk_data...\n",
144 |       "[nltk_data]   Package stopwords is already up-to-date!\n"
145 |      ]
146 |     },
147 |     {
148 |      "data": {
149 |       "text/plain": [
150 |        "True"
151 |       ]
152 |      },
153 |      "execution_count": 5,
154 |      "metadata": {},
155 |      "output_type": "execute_result"
156 |     }
157 |    ],
158 |    "source": [
159 |     "import re\n",
160 |     "from time import time\n",
161 |     "import nltk\n",
162 |     "from emoji import demojize\n",
163 |     "\n",
164 |     "nltk.download('stopwords')"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 6,
170 |    "metadata": {},
171 |    "outputs": [
172 |     {
173 |      "name": "stdout",
174 |      "output_type": "stream",
175 |      "text": [
176 |       "Time to clean up: 78.03 sec\n"
177 |      ]
178 |     }
179 |    ],
180 |    "source": [
181 |     "texts = data.tweet\n",
182 |     "\n",
183 |     "start = time()\n",
184 |     "# Lowercasing\n",
185 |     "texts = texts.str.lower()\n",
186 |     "\n",
187 |     "# Remove special chars\n",
188 |     "texts = texts.str.replace(r\"(http|@)\\S+\", \"\")\n",
189 |     "texts = texts.apply(demojize)\n",
190 |     "texts = texts.str.replace(r\"::\", \": :\")\n",
191 |     "texts = texts.str.replace(r\"’\", \"'\")\n",
192 |     "texts = texts.str.replace(r\"[^a-z\\':_]\", \" \")\n",
193 |     "\n",
194 |     "# Remove repetitions\n",
195 |     "pattern = re.compile(r\"(.)\\1{2,}\", re.DOTALL)\n",
196 |     "texts = texts.str.replace(pattern, r\"\\1\")\n",
197 |     "\n",
198 |     "# Transform short negation form\n",
199 |     "texts = texts.str.replace(r\"(can't|cannot)\", 'can not')\n",
200 |     "texts = texts.str.replace(r\"n't\", ' not')\n",
201 |     "\n",
202 |     "# Remove stop words, keeping the negations 'not', 'nor' and 'no'\n",
203 |     "stopwords = set(nltk.corpus.stopwords.words('english'))  # set: O(1) membership tests below\n",
204 |     "stopwords.remove('not')\n",
205 |     "stopwords.remove('nor')\n",
206 |     "stopwords.remove('no')\n",
207 |     "texts = texts.apply(\n",
208 |     "    lambda x: ' '.join([word for word in x.split() if word not in stopwords])\n",
209 |     ")\n",
210 |     "\n",
211 |     "print(\"Time to clean up: {:.2f} sec\".format(time() - start))\n",
212 |     "\n",
213 |     "data.tweet = texts"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "## Tokenize\n",
221 |     "\n",
222 |     "Transform the text corpus to a vector representation\n",
223 |     "\n",
224 |     "- **num_words**: Maximum number of words to keep in the vocabulary, based on word frequency"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 7,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "num_words = 10000"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 8,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "import pickle\n",
243 |     "from tensorflow.keras.preprocessing.text import Tokenizer"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": 9,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "tokenizer = Tokenizer(num_words=num_words, lower=True)\n",
253 |     "tokenizer.fit_on_texts(data.tweet)\n",
254 |     "\n",
255 |     "file_to_save = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
256 |     "with file_to_save.open('wb') as file:\n",
257 |     "    pickle.dump(tokenizer, file)"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "markdown",
262 |    "metadata": {},
263 |    "source": [
264 |     "## Split data\n",
265 |     "\n",
266 |     "Split the dataset into train (70%) and validation (30%) data, label by label"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 10,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "from sklearn.model_selection import train_test_split"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 11,
281 |    "metadata": {},
282 |    "outputs": [],
283 |    "source": [
284 |     "train_parts, validation_parts = [], []  # per-label splits, concatenated once at the end\n",
285 |     "for label in data.label.unique():\n",
286 |     "    label_data = data[data.label == label]\n",
287 |     "    train_data, validation_data = train_test_split(label_data, test_size=0.3)\n",
288 |     "    train_parts.append(train_data)\n",
289 |     "    validation_parts.append(validation_data)\n",
290 |     "train, validation = pd.concat(train_parts), pd.concat(validation_parts)"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "markdown",
295 |    "metadata": {},
296 |    "source": [
297 |     "## Model\n",
298 |     "\n",
299 |     "Define the Bidirectional GRU model"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": 12,
305 |    "metadata": {},
306 |    "outputs": [],
307 |    "source": [
308 |     "from tensorflow.keras.layers import Input, Embedding, GRU\n",
309 |     "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
310 |     "from tensorflow.keras.layers import Bidirectional, Dense\n",
311 |     "from tensorflow.keras.models import Sequential"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": 13,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
321 |     "embedding_dim = 200\n",
322 |     "input_length = 100\n",
323 |     "gru_units = 128\n",
324 |     "gru_dropout = 0.1\n",
325 |     "recurrent_dropout = 0.1\n",
326 |     "dropout = 0.1"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 14,
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "name": "stderr",
336 |      "output_type": "stream",
337 |      "text": [
338 |       "WARNING: Logging before flag parsing goes to stderr.\n",
339 |       "W0716 13:36:20.397812 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
340 |       "Instructions for updating:\n",
341 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
342 |       "W0716 13:36:20.410246 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
343 |       "Instructions for updating:\n",
344 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
345 |       "W0716 13:36:20.413324 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
346 |       "Instructions for updating:\n",
347 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
348 |       "W0716 13:36:20.413828 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
349 |       "Instructions for updating:\n",
350 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
351 |       "W0716 13:36:20.414215 140315330369344 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
352 |       "Instructions for updating:\n",
353 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
354 |      ]
355 |     }
356 |    ],
357 |    "source": [
358 |     "model = Sequential()\n",
359 |     "model.add(Embedding(\n",
360 |     "    input_dim=input_dim,\n",
361 |     "    output_dim=embedding_dim,\n",
362 |     "    input_shape=(input_length,)\n",
363 |     "))\n",
364 |     "\n",
365 |     "model.add(Bidirectional(GRU(\n",
366 |     "    gru_units,\n",
367 |     "    return_sequences=True,\n",
368 |     "    dropout=gru_dropout,\n",
369 |     "    recurrent_dropout=recurrent_dropout\n",
370 |     ")))\n",
371 |     "model.add(GlobalMaxPooling1D())\n",
372 |     "model.add(Dense(32, activation='relu'))\n",
373 |     "model.add(Dropout(dropout))\n",
374 |     "\n",
375 |     "model.add(Dense(1, activation='sigmoid'))"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": 15,
381 |    "metadata": {},
382 |    "outputs": [
383 |     {
384 |      "name": "stderr",
385 |      "output_type": "stream",
386 |      "text": [
387 |       "W0716 13:36:20.902724 140315330369344 deprecation.py:323] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
388 |       "Instructions for updating:\n",
389 |       "Use tf.where in 2.0, which has the same broadcast rule as np.where\n"
390 |      ]
391 |     },
392 |     {
393 |      "name": "stdout",
394 |      "output_type": "stream",
395 |      "text": [
396 |       "Model: \"sequential\"\n",
397 |       "_________________________________________________________________\n",
398 |       "Layer (type)                 Output Shape              Param #   \n",
399 |       "=================================================================\n",
400 |       "embedding (Embedding)        (None, 100, 200)          2000000   \n",
401 |       "_________________________________________________________________\n",
402 |       "bidirectional (Bidirectional (None, 100, 256)          252672    \n",
403 |       "_________________________________________________________________\n",
404 |       "global_max_pooling1d (Global (None, 256)               0         \n",
405 |       "_________________________________________________________________\n",
406 |       "dense (Dense)                (None, 32)                8224      \n",
407 |       "_________________________________________________________________\n",
408 |       "dropout (Dropout)            (None, 32)                0         \n",
409 |       "_________________________________________________________________\n",
410 |       "dense_1 (Dense)              (None, 1)                 33        \n",
411 |       "=================================================================\n",
412 |       "Total params: 2,260,929\n",
413 |       "Trainable params: 2,260,929\n",
414 |       "Non-trainable params: 0\n",
415 |       "_________________________________________________________________\n",
416 |       "None\n"
417 |      ]
418 |     }
419 |    ],
420 |    "source": [
421 |     "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
422 |     "print(model.summary())"
423 |    ]
424 |   },
425 |   {
426 |    "cell_type": "markdown",
427 |    "metadata": {},
428 |    "source": [
429 |     "## Prepare the data\n",
430 |     "\n",
431 |     "Prepare the model input data"
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "code",
436 |    "execution_count": 16,
437 |    "metadata": {},
438 |    "outputs": [],
439 |    "source": [
440 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": 17,
446 |    "metadata": {},
447 |    "outputs": [],
448 |    "source": [
449 |     "train_sequences = [text.split() for text in train.tweet]\n",
450 |     "validation_sequences = [text.split() for text in validation.tweet]\n",
451 |     "list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)\n",
452 |     "list_tokenized_validation = tokenizer.texts_to_sequences(validation_sequences)\n",
453 |     "\n",
454 |     "x_train = pad_sequences(list_tokenized_train, maxlen=input_length)\n",
455 |     "x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)\n",
456 |     "y_train = train.label.replace(4, 1)\n",
457 |     "y_validation = validation.label.replace(4, 1)"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "markdown",
462 |    "metadata": {},
463 |    "source": [
464 |     "## Train model\n",
465 |     "\n",
466 |     "Run the training process with the given data"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "code",
471 |    "execution_count": 18,
472 |    "metadata": {},
473 |    "outputs": [],
474 |    "source": [
475 |     "batch_size = 128\n",
476 |     "epochs = 1"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": 19,
482 |    "metadata": {},
483 |    "outputs": [
484 |     {
485 |      "name": "stdout",
486 |      "output_type": "stream",
487 |      "text": [
488 |       "Train on 280000 samples, validate on 120000 samples\n",
489 |       "280000/280000 [==============================] - 374s 1ms/sample - loss: 0.4637 - acc: 0.7804 - val_loss: 0.4366 - val_acc: 0.7937\n"
490 |      ]
491 |     },
492 |     {
493 |      "data": {
494 |       "text/plain": [
495 |        "<tensorflow.python.keras.callbacks.History at 0x7f9d0987c940>"
496 |       ]
497 |      },
498 |      "execution_count": 19,
499 |      "metadata": {},
500 |      "output_type": "execute_result"
501 |     }
502 |    ],
503 |    "source": [
504 |     "model.fit(\n",
505 |     "    x_train,\n",
506 |     "    y=y_train,\n",
507 |     "    batch_size=batch_size,\n",
508 |     "    epochs=epochs,\n",
509 |     "    validation_data=(x_validation, y_validation),\n",
510 |     ")"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": 20,
516 |    "metadata": {},
517 |    "outputs": [],
518 |    "source": [
519 |     "model_file = Path('../models/sentiment_analysis/gru_model.h5').resolve()\n",
520 |     "model.save_weights(model_file.as_posix())"
521 |    ]
522 |   }
523 |  ],
524 |  "metadata": {
525 |   "kernelspec": {
526 |    "display_name": "Python 3",
527 |    "language": "python",
528 |    "name": "python3"
529 |   },
530 |   "language_info": {
531 |    "codemirror_mode": {
532 |     "name": "ipython",
533 |     "version": 3
534 |    },
535 |    "file_extension": ".py",
536 |    "mimetype": "text/x-python",
537 |    "name": "python",
538 |    "nbconvert_exporter": "python",
539 |    "pygments_lexer": "ipython3",
540 |    "version": "3.6.8"
541 |   }
542 |  },
543 |  "nbformat": 4,
544 |  "nbformat_minor": 2
545 | }
546 | 


--------------------------------------------------------------------------------
/notebooks/Train Emotion Recognition Model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Train Emotion Recognition Model\n",
  8 |     "\n",
  9 |     "Here we'll train an emotion recognition model, using the output data from the sentiment analysis."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "# Add project path to the PYTHONPATH\n",
 19 |     "\n",
 20 |     "import os\n",
 21 |     "import sys\n",
 22 |     "from pathlib import Path\n",
 23 |     "\n",
 24 |     "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "## Load Dataset\n",
 32 |     "\n",
 33 |     "Load the emotion-labeled dataset"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 2,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "name": "stderr",
 43 |      "output_type": "stream",
 44 |      "text": [
 45 |       "[nltk_data] Downloading package stopwords to\n",
 46 |       "[nltk_data]     /Users/rmohashi/nltk_data...\n",
 47 |       "[nltk_data]   Package stopwords is already up-to-date!\n"
 48 |      ]
 49 |     }
 50 |    ],
 51 |    "source": [
 52 |     "from pathlib import Path\n",
 53 |     "import pandas as pd\n",
 54 |     "from nlp import Dataset"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 3,
 60 |    "metadata": {},
 61 |    "outputs": [
 62 |     {
 63 |      "name": "stdout",
 64 |      "output_type": "stream",
 65 |      "text": [
 66 |       "Time to clean up: 19.33 sec\n"
 67 |      ]
 68 |     }
 69 |    ],
 70 |    "source": [
 71 |     "dataset_path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()\n",
 72 |     "dataset = Dataset(dataset_path)\n",
 73 |     "dataset.load()\n",
 74 |     "dataset.preprocess_texts()"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 4,
 80 |    "metadata": {},
 81 |    "outputs": [
 82 |     {
 83 |      "data": {
 84 |       "text/html": [
 85 |        "<div>\n",
 86 |        "<style scoped>\n",
 87 |        "    .dataframe tbody tr th:only-of-type {\n",
 88 |        "        vertical-align: middle;\n",
 89 |        "    }\n",
 90 |        "\n",
 91 |        "    .dataframe tbody tr th {\n",
 92 |        "        vertical-align: top;\n",
 93 |        "    }\n",
 94 |        "\n",
 95 |        "    .dataframe thead th {\n",
 96 |        "        text-align: right;\n",
 97 |        "    }\n",
 98 |        "</style>\n",
 99 |        "<table border=\"1\" class=\"dataframe\">\n",
100 |        "  <thead>\n",
101 |        "    <tr style=\"text-align: right;\">\n",
102 |        "      <th></th>\n",
103 |        "      <th>label</th>\n",
104 |        "      <th>text</th>\n",
105 |        "    </tr>\n",
106 |        "  </thead>\n",
107 |        "  <tbody>\n",
108 |        "    <tr>\n",
109 |        "      <th>0</th>\n",
110 |        "      <td>fear</td>\n",
111 |        "      <td>sometimes afraid thing set free gt</td>\n",
112 |        "    </tr>\n",
113 |        "    <tr>\n",
114 |        "      <th>1</th>\n",
115 |        "      <td>fear</td>\n",
116 |        "      <td>delayed post afraid</td>\n",
117 |        "    </tr>\n",
118 |        "    <tr>\n",
119 |        "      <th>2</th>\n",
120 |        "      <td>fear</td>\n",
121 |        "      <td>eyeson seesomethingsaysomething cia clowns dee...</td>\n",
122 |        "    </tr>\n",
123 |        "    <tr>\n",
124 |        "      <th>3</th>\n",
125 |        "      <td>fear</td>\n",
126 |        "      <td>happybirthdaystevenavery corruptiwoccounty afr...</td>\n",
127 |        "    </tr>\n",
128 |        "    <tr>\n",
129 |        "      <th>4</th>\n",
130 |        "      <td>fear</td>\n",
131 |        "      <td>fight fire fire think reign fire comment check...</td>\n",
132 |        "    </tr>\n",
133 |        "  </tbody>\n",
134 |        "</table>\n",
135 |        "</div>"
136 |       ],
137 |       "text/plain": [
138 |        "  label                                               text\n",
139 |        "0  fear                 sometimes afraid thing set free gt\n",
140 |        "1  fear                                delayed post afraid\n",
141 |        "2  fear  eyeson seesomethingsaysomething cia clowns dee...\n",
142 |        "3  fear  happybirthdaystevenavery corruptiwoccounty afr...\n",
143 |        "4  fear  fight fire fire think reign fire comment check..."
144 |       ]
145 |      },
146 |      "execution_count": 4,
147 |      "metadata": {},
148 |      "output_type": "execute_result"
149 |     }
150 |    ],
151 |    "source": [
152 |     "dataset.cleaned_data.head()"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {},
158 |    "source": [
159 |     "## Tokenize\n",
160 |     "\n",
161 |     "Transform the text corpus to a vector representation\n",
162 |     "\n",
163 |     "- **num_words**: Maximum number of words to keep in the vocabulary, chosen by word frequency"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 5,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "import pickle\n",
173 |     "from tensorflow.keras.preprocessing.text import Tokenizer"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 6,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "num_words = 10000\n",
183 |     "\n",
184 |     "tokenizer = Tokenizer(num_words=num_words, lower=True)\n",
185 |     "tokenizer.fit_on_texts(dataset.cleaned_data.text)\n",
186 |     "\n",
187 |     "file_to_save = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
188 |     "with file_to_save.open('wb') as file:\n",
189 |     "    pickle.dump(tokenizer, file)"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "## Split data\n",
197 |     "\n",
198 |     "Split the dataset into train and validation data"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 7,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "from sklearn.model_selection import train_test_split"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 8,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "data = dataset.cleaned_data.copy()\n",
217 |     "\n",
218 |     "train = pd.DataFrame(columns=['label', 'text'])\n",
219 |     "validation = pd.DataFrame(columns=['label', 'text'])\n",
220 |     "for label in data.label.unique():\n",
221 |     "    label_data = data[data.label == label]\n",
222 |     "    train_data, validation_data = train_test_split(label_data, test_size=0.3)\n",
223 |     "    train = pd.concat([train, train_data])\n",
224 |     "    validation = pd.concat([validation, validation_data])"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "markdown",
229 |    "metadata": {},
230 |    "source": [
231 |     "## Model\n",
232 |     "\n",
233 |     "Define the **LSTM** + **CNN** model"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 9,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
243 |     "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
244 |     "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
245 |     "from tensorflow.keras.models import Model"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 10,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
255 |     "num_classes = len(data.label.unique())\n",
256 |     "embedding_dim = 500\n",
257 |     "input_length = 100\n",
258 |     "lstm_units = 128\n",
259 |     "lstm_dropout = 0.1\n",
260 |     "recurrent_dropout = 0.1\n",
261 |     "spatial_dropout=0.2\n",
262 |     "filters=64\n",
263 |     "kernel_size=3"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": 11,
269 |    "metadata": {
270 |     "scrolled": false
271 |    },
272 |    "outputs": [
273 |     {
274 |      "name": "stderr",
275 |      "output_type": "stream",
276 |      "text": [
277 |       "WARNING: Logging before flag parsing goes to stderr.\n",
278 |       "W0719 10:32:00.331336 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
279 |       "Instructions for updating:\n",
280 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
281 |       "W0719 10:32:00.392153 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
282 |       "Instructions for updating:\n",
283 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
284 |       "W0719 10:32:00.397410 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
285 |       "Instructions for updating:\n",
286 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
287 |       "W0719 10:32:00.399722 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
288 |       "Instructions for updating:\n",
289 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
290 |       "W0719 10:32:00.403119 4686337472 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
291 |       "Instructions for updating:\n",
292 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
293 |      ]
294 |     }
295 |    ],
296 |    "source": [
297 |     "input_layer = Input(shape=(input_length,))\n",
298 |     "output_layer = Embedding(\n",
299 |     "  input_dim=input_dim,\n",
300 |     "  output_dim=embedding_dim,\n",
301 |     "  input_shape=(input_length,)\n",
302 |     ")(input_layer)\n",
303 |     "\n",
304 |     "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
305 |     "\n",
306 |     "output_layer = Bidirectional(\n",
307 |     "LSTM(lstm_units, return_sequences=True,\n",
308 |     "     dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
309 |     ")(output_layer)\n",
310 |     "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
311 |     "                    kernel_initializer='glorot_uniform')(output_layer)\n",
312 |     "\n",
313 |     "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
314 |     "max_pool = GlobalMaxPooling1D()(output_layer)\n",
315 |     "output_layer = concatenate([avg_pool, max_pool])\n",
316 |     "\n",
317 |     "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
318 |     "\n",
319 |     "model = Model(input_layer, output_layer)"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": 12,
325 |    "metadata": {},
326 |    "outputs": [
327 |     {
328 |      "name": "stdout",
329 |      "output_type": "stream",
330 |      "text": [
331 |       "Model: \"model\"\n",
332 |       "__________________________________________________________________________________________________\n",
333 |       "Layer (type)                    Output Shape         Param #     Connected to                     \n",
334 |       "==================================================================================================\n",
335 |       "input_1 (InputLayer)            [(None, 100)]        0                                            \n",
336 |       "__________________________________________________________________________________________________\n",
337 |       "embedding (Embedding)           (None, 100, 500)     5000000     input_1[0][0]                    \n",
338 |       "__________________________________________________________________________________________________\n",
339 |       "spatial_dropout1d (SpatialDropo (None, 100, 500)     0           embedding[0][0]                  \n",
340 |       "__________________________________________________________________________________________________\n",
341 |       "bidirectional (Bidirectional)   (None, 100, 256)     644096      spatial_dropout1d[0][0]          \n",
342 |       "__________________________________________________________________________________________________\n",
343 |       "conv1d (Conv1D)                 (None, 98, 64)       49216       bidirectional[0][0]              \n",
344 |       "__________________________________________________________________________________________________\n",
345 |       "global_average_pooling1d (Globa (None, 64)           0           conv1d[0][0]                     \n",
346 |       "__________________________________________________________________________________________________\n",
347 |       "global_max_pooling1d (GlobalMax (None, 64)           0           conv1d[0][0]                     \n",
348 |       "__________________________________________________________________________________________________\n",
349 |       "concatenate (Concatenate)       (None, 128)          0           global_average_pooling1d[0][0]   \n",
350 |       "                                                                 global_max_pooling1d[0][0]       \n",
351 |       "__________________________________________________________________________________________________\n",
352 |       "dense (Dense)                   (None, 4)            516         concatenate[0][0]                \n",
353 |       "==================================================================================================\n",
354 |       "Total params: 5,693,828\n",
355 |       "Trainable params: 5,693,828\n",
356 |       "Non-trainable params: 0\n",
357 |       "__________________________________________________________________________________________________\n"
358 |      ]
359 |     }
360 |    ],
361 |    "source": [
362 |     "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
363 |     "model.summary()"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "markdown",
368 |    "metadata": {},
369 |    "source": [
370 |     "## Prepare the data\n",
371 |     "\n",
372 |     "Prepare the model input data"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 13,
378 |    "metadata": {},
379 |    "outputs": [],
380 |    "source": [
381 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
382 |     "from sklearn.preprocessing import LabelBinarizer"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": 14,
388 |    "metadata": {},
389 |    "outputs": [],
390 |    "source": [
391 |     "train_sequences = [text.split() for text in train.text]\n",
392 |     "validation_sequences = [text.split() for text in validation.text]\n",
393 |     "list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)\n",
394 |     "list_tokenized_validation = tokenizer.texts_to_sequences(validation_sequences)\n",
395 |     "x_train = pad_sequences(list_tokenized_train, maxlen=input_length)\n",
396 |     "x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)\n",
397 |     "\n",
398 |     "encoder = LabelBinarizer()\n",
399 |     "encoder.fit(data.label.unique())\n",
400 |     "\n",
401 |     "encoder_path = Path('../models/emotion_recognition', 'encoder.pickle')\n",
402 |     "with encoder_path.open('wb') as file:\n",
403 |     "    pickle.dump(encoder, file)\n",
404 |     "\n",
405 |     "y_train = encoder.transform(train.label)\n",
406 |     "y_validation = encoder.transform(validation.label)"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "markdown",
411 |    "metadata": {},
412 |    "source": [
413 |     "## Train model\n",
414 |     "\n",
415 |     "Run the training process with the given data"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": 15,
421 |    "metadata": {},
422 |    "outputs": [],
423 |    "source": [
424 |     "batch_size = 128\n",
425 |     "epochs = 1"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 16,
431 |    "metadata": {},
432 |    "outputs": [
433 |     {
434 |      "name": "stdout",
435 |      "output_type": "stream",
436 |      "text": [
437 |       "Train on 25454 samples, validate on 10911 samples\n"
438 |      ]
439 |     },
440 |     {
441 |      "name": "stderr",
442 |      "output_type": "stream",
443 |      "text": [
444 |       "W0719 10:32:03.006144 4686337472 deprecation.py:323] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
445 |       "Instructions for updating:\n",
446 |       "Use tf.where in 2.0, which has the same broadcast rule as np.where\n"
447 |      ]
448 |     },
449 |     {
450 |      "name": "stdout",
451 |      "output_type": "stream",
452 |      "text": [
453 |       "25454/25454 [==============================] - 570s 22ms/sample - loss: 0.5621 - acc: 0.7593 - val_loss: 0.3839 - val_acc: 0.8381\n"
454 |      ]
455 |     },
456 |     {
457 |      "data": {
458 |       "text/plain": [
459 |        "<tensorflow.python.keras.callbacks.History at 0x13c451668>"
460 |       ]
461 |      },
462 |      "execution_count": 16,
463 |      "metadata": {},
464 |      "output_type": "execute_result"
465 |     }
466 |    ],
467 |    "source": [
468 |     "model.fit(\n",
469 |     "    x_train,\n",
470 |     "    y=y_train,\n",
471 |     "    batch_size=batch_size,\n",
472 |     "    epochs=epochs,\n",
473 |     "    validation_data=(x_validation, y_validation)\n",
474 |     ")"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "code",
479 |    "execution_count": 17,
480 |    "metadata": {},
481 |    "outputs": [],
482 |    "source": [
483 |     "model_file = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
484 |     "model.save_weights(model_file.as_posix())"
485 |    ]
486 |   },
487 |   {
488 |    "cell_type": "code",
489 |    "execution_count": null,
490 |    "metadata": {},
491 |    "outputs": [],
492 |    "source": []
493 |   }
494 |  ],
495 |  "metadata": {
496 |   "kernelspec": {
497 |    "display_name": "Python 3",
498 |    "language": "python",
499 |    "name": "python3"
500 |   },
501 |   "language_info": {
502 |    "codemirror_mode": {
503 |     "name": "ipython",
504 |     "version": 3
505 |    },
506 |    "file_extension": ".py",
507 |    "mimetype": "text/x-python",
508 |    "name": "python",
509 |    "nbconvert_exporter": "python",
510 |    "pygments_lexer": "ipython3",
511 |    "version": "3.6.8"
512 |   }
513 |  },
514 |  "nbformat": 4,
515 |  "nbformat_minor": 2
516 | }
517 | 


--------------------------------------------------------------------------------
/notebooks/Sentiment Analysis Score.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Sentiment Analysis Score\n",
  8 |     "\n",
  9 |     "Use a deep learning model to predict the sentiment analysis label for each query/emotion inside the relations file"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "# Add project path to the PYTHONPATH\n",
 19 |     "\n",
 20 |     "import os\n",
 21 |     "import sys\n",
 22 |     "from pathlib import Path\n",
 23 |     "\n",
 24 |     "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "## Load Tokenizer\n",
 32 |     "\n",
 33 |     "Import and load the tokenizer from a `.pickle` file"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 2,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import pickle\n",
 43 |     "from pathlib import Path"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 3,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "tokenizer_file = Path('../datasets/sentiment140/tokenizer.pickle').resolve()\n",
 53 |     "with tokenizer_file.open('rb') as file:\n",
 54 |     "    tokenizer = pickle.load(file)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "## Load Model\n",
 62 |     "\n",
 63 |     "Load the sentiment analysis model, using the saved weights"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 6,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "from tensorflow.keras.layers import Input, Embedding, GRU\n",
 73 |     "from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D\n",
 74 |     "from tensorflow.keras.layers import Bidirectional, Dense\n",
 75 |     "from tensorflow.keras.models import Sequential"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 7,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
 85 |     "embedding_dim = 200\n",
 86 |     "input_length = 100\n",
 87 |     "gru_units = 128\n",
 88 |     "gru_dropout = 0.1\n",
 89 |     "recurrent_dropout = 0.1\n",
 90 |     "dropout = 0.1"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 8,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "name": "stderr",
100 |      "output_type": "stream",
101 |      "text": [
102 |       "WARNING: Logging before flag parsing goes to stderr.\n",
103 |       "W0719 09:56:43.758275 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
104 |       "Instructions for updating:\n",
105 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
106 |       "W0719 09:56:43.802737 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
107 |       "Instructions for updating:\n",
108 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
109 |       "W0719 09:56:43.809999 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
110 |       "Instructions for updating:\n",
111 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
112 |       "W0719 09:56:43.811434 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
113 |       "Instructions for updating:\n",
114 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
115 |       "W0719 09:56:43.813139 4692825536 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
116 |       "Instructions for updating:\n",
117 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
118 |      ]
119 |     }
120 |    ],
121 |    "source": [
122 |     "model = Sequential()\n",
123 |     "model.add(Embedding(\n",
124 |     "    input_dim=input_dim,\n",
125 |     "    output_dim=embedding_dim,\n",
126 |     "    input_shape=(input_length,)\n",
127 |     "))\n",
128 |     "\n",
129 |     "model.add(Bidirectional(GRU(\n",
130 |     "    gru_units,\n",
131 |     "    return_sequences=True,\n",
132 |     "    dropout=gru_dropout,\n",
133 |     "    recurrent_dropout=recurrent_dropout\n",
134 |     ")))\n",
135 |     "model.add(GlobalMaxPooling1D())\n",
136 |     "model.add(Dense(32, activation='relu'))\n",
137 |     "model.add(Dropout(dropout))\n",
138 |     "\n",
139 |     "model.add(Dense(1, activation='sigmoid'))"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 9,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "weights_path = Path('../models/sentiment_analysis/model_weights.h5').resolve()\n",
149 |     "model.load_weights(weights_path.as_posix())"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "metadata": {},
155 |    "source": [
156 |     "## Load Query Relations\n",
157 |     "\n",
158 |     "Load the relations between queries and emotions from a `.json` file"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 10,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "import json"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 12,
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "relations_path = Path('../query_relations.json')\n",
177 |     "with relations_path.open('r') as file:\n",
178 |     "    relations = json.load(file)"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "## Predict polarity\n",
186 |     "\n",
187 |     "Predict the polarity of the texts, using the sentiment analysis model"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 13,
193 |    "metadata": {},
194 |    "outputs": [
195 |     {
196 |      "name": "stderr",
197 |      "output_type": "stream",
198 |      "text": [
199 |       "[nltk_data] Downloading package stopwords to\n",
200 |       "[nltk_data]     /Users/rmohashi/nltk_data...\n",
201 |       "[nltk_data]   Package stopwords is already up-to-date!\n"
202 |      ]
203 |     }
204 |    ],
205 |    "source": [
206 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
207 |     "from nlp import preprocess\n",
208 |     "from tqdm import tqdm\n",
209 |     "import pandas as pd\n",
210 |     "import numpy as np\n",
211 |     "import re"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 14,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "dataset_dir = Path('../datasets/tweepy').resolve()"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 15,
226 |    "metadata": {},
227 |    "outputs": [
228 |     {
229 |      "name": "stderr",
230 |      "output_type": "stream",
231 |      "text": [
232 |       "100%|██████████| 19/19 [02:59<00:00, 12.95s/it]\n"
233 |      ]
234 |     }
235 |    ],
236 |    "source": [
237 |     "data_dict = {}\n",
238 |     "\n",
239 |     "query_dict = {\n",
240 |     "    'query': [],\n",
241 |     "    'mean': [],\n",
242 |     "    'max': [],\n",
243 |     "    'min': [],\n",
244 |     "    'std': [],\n",
245 |     "    'count': [],\n",
246 |     "    'emotion': []\n",
247 |     "}\n",
248 |     "\n",
249 |     "dir_files = os.listdir(dataset_dir)\n",
250 |     "\n",
251 |     "with tqdm(total=len(dir_files)) as t:\n",
252 |     "    for filename in dir_files:\n",
253 |     "        dataset = pd.read_csv(os.path.join(dataset_dir, filename))\n",
254 |     "        cleaned_texts = preprocess(dataset.text, quiet=True)\n",
255 |     "\n",
256 |     "        query = re.findall(r'(#[^.]+|:.+:)', filename)[0]\n",
257 |     "\n",
258 |     "        predict_sequences = [text.split() for text in cleaned_texts]\n",
259 |     "        list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)\n",
260 |     "        x_predict = pad_sequences(list_tokenized_predict, maxlen=100)\n",
261 |     "\n",
262 |     "        result = model.predict(x_predict)\n",
263 |     "        \n",
264 |     "        emotion = relations[query]\n",
265 |     "        query_dict['query'].append(query)\n",
266 |     "        query_dict['mean'].append(np.mean(result))\n",
267 |     "        query_dict['max'].append(np.amax(result))\n",
268 |     "        query_dict['min'].append(np.amin(result))\n",
269 |     "        query_dict['count'].append(len(dataset))\n",
270 |     "        query_dict['std'].append(np.std(result))\n",
271 |     "        query_dict['emotion'].append(emotion)\n",
272 |     "\n",
273 |     "        if emotion in data_dict:\n",
274 |     "            data_dict[emotion] = np.concatenate([data_dict[emotion], result])\n",
275 |     "        else:\n",
276 |     "            data_dict[emotion] = result\n",
277 |     "        \n",
278 |     "        t.update()"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "## Print Results\n",
286 |     "\n",
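287 |     "Display the polarity statistics (mean, max, min, std and count) of each query, grouped by emotion, followed by the same statistics aggregated per emotion"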
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 10,
293 |    "metadata": {
294 |     "scrolled": false
295 |    },
296 |    "outputs": [
297 |     {
298 |      "data": {
299 |       "text/html": [
300 |        "<div>\n",
301 |        "<style scoped>\n",
302 |        "    .dataframe tbody tr th:only-of-type {\n",
303 |        "        vertical-align: middle;\n",
304 |        "    }\n",
305 |        "\n",
306 |        "    .dataframe tbody tr th {\n",
307 |        "        vertical-align: top;\n",
308 |        "    }\n",
309 |        "\n",
310 |        "    .dataframe thead th {\n",
311 |        "        text-align: right;\n",
312 |        "    }\n",
313 |        "</style>\n",
314 |        "<table border=\"1\" class=\"dataframe\">\n",
315 |        "  <thead>\n",
316 |        "    <tr style=\"text-align: right;\">\n",
317 |        "      <th></th>\n",
318 |        "      <th>query</th>\n",
319 |        "      <th>mean</th>\n",
320 |        "      <th>max</th>\n",
321 |        "      <th>min</th>\n",
322 |        "      <th>std</th>\n",
323 |        "      <th>count</th>\n",
324 |        "      <th>emotion</th>\n",
325 |        "    </tr>\n",
326 |        "  </thead>\n",
327 |        "  <tbody>\n",
328 |        "    <tr>\n",
329 |        "      <th>0</th>\n",
330 |        "      <td>:anxious_face_with_sweat:</td>\n",
331 |        "      <td>0.428562</td>\n",
332 |        "      <td>0.983642</td>\n",
333 |        "      <td>0.004371</td>\n",
334 |        "      <td>0.274272</td>\n",
335 |        "      <td>199</td>\n",
336 |        "      <td>fear</td>\n",
337 |        "    </tr>\n",
338 |        "    <tr>\n",
339 |        "      <th>6</th>\n",
340 |        "      <td>#worried</td>\n",
341 |        "      <td>0.205504</td>\n",
342 |        "      <td>0.879476</td>\n",
343 |        "      <td>0.004883</td>\n",
344 |        "      <td>0.210547</td>\n",
345 |        "      <td>196</td>\n",
346 |        "      <td>fear</td>\n",
347 |        "    </tr>\n",
348 |        "  </tbody>\n",
349 |        "</table>\n",
350 |        "</div>"
351 |       ],
352 |       "text/plain": [
353 |        "                       query      mean       max       min       std  count  \\\n",
354 |        "0  :anxious_face_with_sweat:  0.428562  0.983642  0.004371  0.274272    199   \n",
355 |        "6                   #worried  0.205504  0.879476  0.004883  0.210547    196   \n",
356 |        "\n",
357 |        "  emotion  \n",
358 |        "0    fear  \n",
359 |        "6    fear  "
360 |       ]
361 |      },
362 |      "metadata": {},
363 |      "output_type": "display_data"
364 |     },
365 |     {
366 |      "data": {
367 |       "text/html": [
368 |        "<div>\n",
369 |        "<style scoped>\n",
370 |        "    .dataframe tbody tr th:only-of-type {\n",
371 |        "        vertical-align: middle;\n",
372 |        "    }\n",
373 |        "\n",
374 |        "    .dataframe tbody tr th {\n",
375 |        "        vertical-align: top;\n",
376 |        "    }\n",
377 |        "\n",
378 |        "    .dataframe thead th {\n",
379 |        "        text-align: right;\n",
380 |        "    }\n",
381 |        "</style>\n",
382 |        "<table border=\"1\" class=\"dataframe\">\n",
383 |        "  <thead>\n",
384 |        "    <tr style=\"text-align: right;\">\n",
385 |        "      <th></th>\n",
386 |        "      <th>query</th>\n",
387 |        "      <th>mean</th>\n",
388 |        "      <th>max</th>\n",
389 |        "      <th>min</th>\n",
390 |        "      <th>std</th>\n",
391 |        "      <th>count</th>\n",
392 |        "      <th>emotion</th>\n",
393 |        "    </tr>\n",
394 |        "  </thead>\n",
395 |        "  <tbody>\n",
396 |        "    <tr>\n",
397 |        "      <th>1</th>\n",
398 |        "      <td>#sad</td>\n",
399 |        "      <td>0.073413</td>\n",
400 |        "      <td>0.873629</td>\n",
401 |        "      <td>0.002289</td>\n",
402 |        "      <td>0.127914</td>\n",
403 |        "      <td>200</td>\n",
404 |        "      <td>sadness</td>\n",
405 |        "    </tr>\n",
406 |        "    <tr>\n",
407 |        "      <th>2</th>\n",
408 |        "      <td>:crying_face:</td>\n",
409 |        "      <td>0.438269</td>\n",
410 |        "      <td>0.996975</td>\n",
411 |        "      <td>0.005851</td>\n",
412 |        "      <td>0.296389</td>\n",
413 |        "      <td>197</td>\n",
414 |        "      <td>sadness</td>\n",
415 |        "    </tr>\n",
416 |        "  </tbody>\n",
417 |        "</table>\n",
418 |        "</div>"
419 |       ],
420 |       "text/plain": [
421 |        "           query      mean       max       min       std  count  emotion\n",
422 |        "1           #sad  0.073413  0.873629  0.002289  0.127914    200  sadness\n",
423 |        "2  :crying_face:  0.438269  0.996975  0.005851  0.296389    197  sadness"
424 |       ]
425 |      },
426 |      "metadata": {},
427 |      "output_type": "display_data"
428 |     },
429 |     {
430 |      "data": {
431 |       "text/html": [
432 |        "<div>\n",
433 |        "<style scoped>\n",
434 |        "    .dataframe tbody tr th:only-of-type {\n",
435 |        "        vertical-align: middle;\n",
436 |        "    }\n",
437 |        "\n",
438 |        "    .dataframe tbody tr th {\n",
439 |        "        vertical-align: top;\n",
440 |        "    }\n",
441 |        "\n",
442 |        "    .dataframe thead th {\n",
443 |        "        text-align: right;\n",
444 |        "    }\n",
445 |        "</style>\n",
446 |        "<table border=\"1\" class=\"dataframe\">\n",
447 |        "  <thead>\n",
448 |        "    <tr style=\"text-align: right;\">\n",
449 |        "      <th></th>\n",
450 |        "      <th>query</th>\n",
451 |        "      <th>mean</th>\n",
452 |        "      <th>max</th>\n",
453 |        "      <th>min</th>\n",
454 |        "      <th>std</th>\n",
455 |        "      <th>count</th>\n",
456 |        "      <th>emotion</th>\n",
457 |        "    </tr>\n",
458 |        "  </thead>\n",
459 |        "  <tbody>\n",
460 |        "    <tr>\n",
461 |        "      <th>3</th>\n",
462 |        "      <td>:red_heart:</td>\n",
463 |        "      <td>0.770384</td>\n",
464 |        "      <td>0.996633</td>\n",
465 |        "      <td>0.042774</td>\n",
466 |        "      <td>0.225747</td>\n",
467 |        "      <td>200</td>\n",
468 |        "      <td>joy</td>\n",
469 |        "    </tr>\n",
470 |        "    <tr>\n",
471 |        "      <th>7</th>\n",
472 |        "      <td>#joy</td>\n",
473 |        "      <td>0.832007</td>\n",
474 |        "      <td>0.997057</td>\n",
475 |        "      <td>0.208914</td>\n",
476 |        "      <td>0.152068</td>\n",
477 |        "      <td>191</td>\n",
478 |        "      <td>joy</td>\n",
479 |        "    </tr>\n",
480 |        "  </tbody>\n",
481 |        "</table>\n",
482 |        "</div>"
483 |       ],
484 |       "text/plain": [
485 |        "         query      mean       max       min       std  count emotion\n",
486 |        "3  :red_heart:  0.770384  0.996633  0.042774  0.225747    200     joy\n",
487 |        "7         #joy  0.832007  0.997057  0.208914  0.152068    191     joy"
488 |       ]
489 |      },
490 |      "metadata": {},
491 |      "output_type": "display_data"
492 |     },
493 |     {
494 |      "data": {
495 |       "text/html": [
496 |        "<div>\n",
497 |        "<style scoped>\n",
498 |        "    .dataframe tbody tr th:only-of-type {\n",
499 |        "        vertical-align: middle;\n",
500 |        "    }\n",
501 |        "\n",
502 |        "    .dataframe tbody tr th {\n",
503 |        "        vertical-align: top;\n",
504 |        "    }\n",
505 |        "\n",
506 |        "    .dataframe thead th {\n",
507 |        "        text-align: right;\n",
508 |        "    }\n",
509 |        "</style>\n",
510 |        "<table border=\"1\" class=\"dataframe\">\n",
511 |        "  <thead>\n",
512 |        "    <tr style=\"text-align: right;\">\n",
513 |        "      <th></th>\n",
514 |        "      <th>query</th>\n",
515 |        "      <th>mean</th>\n",
516 |        "      <th>max</th>\n",
517 |        "      <th>min</th>\n",
518 |        "      <th>std</th>\n",
519 |        "      <th>count</th>\n",
520 |        "      <th>emotion</th>\n",
521 |        "    </tr>\n",
522 |        "  </thead>\n",
523 |        "  <tbody>\n",
524 |        "    <tr>\n",
525 |        "      <th>4</th>\n",
526 |        "      <td>:face_with_symbols_on_mouth:</td>\n",
527 |        "      <td>0.403210</td>\n",
528 |        "      <td>0.997371</td>\n",
529 |        "      <td>0.010545</td>\n",
530 |        "      <td>0.261377</td>\n",
531 |        "      <td>194</td>\n",
532 |        "      <td>angry</td>\n",
533 |        "    </tr>\n",
534 |        "    <tr>\n",
535 |        "      <th>5</th>\n",
536 |        "      <td>#pissed</td>\n",
537 |        "      <td>0.230712</td>\n",
538 |        "      <td>0.912333</td>\n",
539 |        "      <td>0.008014</td>\n",
540 |        "      <td>0.180684</td>\n",
541 |        "      <td>200</td>\n",
542 |        "      <td>angry</td>\n",
543 |        "    </tr>\n",
544 |        "  </tbody>\n",
545 |        "</table>\n",
546 |        "</div>"
547 |       ],
548 |       "text/plain": [
549 |        "                          query      mean       max       min       std  \\\n",
550 |        "4  :face_with_symbols_on_mouth:  0.403210  0.997371  0.010545  0.261377   \n",
551 |        "5                       #pissed  0.230712  0.912333  0.008014  0.180684   \n",
552 |        "\n",
553 |        "   count emotion  \n",
554 |        "4    194   angry  \n",
555 |        "5    200   angry  "
556 |       ]
557 |      },
558 |      "metadata": {},
559 |      "output_type": "display_data"
560 |     }
561 |    ],
562 |    "source": [
563 |     "df = pd.DataFrame(data=query_dict)\n",
564 |     "for emotion in df.emotion.unique():\n",
565 |     "    display(df[df.emotion == emotion])"
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": 11,
571 |    "metadata": {},
572 |    "outputs": [
573 |     {
574 |      "data": {
575 |       "text/html": [
576 |        "<div>\n",
577 |        "<style scoped>\n",
578 |        "    .dataframe tbody tr th:only-of-type {\n",
579 |        "        vertical-align: middle;\n",
580 |        "    }\n",
581 |        "\n",
582 |        "    .dataframe tbody tr th {\n",
583 |        "        vertical-align: top;\n",
584 |        "    }\n",
585 |        "\n",
586 |        "    .dataframe thead th {\n",
587 |        "        text-align: right;\n",
588 |        "    }\n",
589 |        "</style>\n",
590 |        "<table border=\"1\" class=\"dataframe\">\n",
591 |        "  <thead>\n",
592 |        "    <tr style=\"text-align: right;\">\n",
593 |        "      <th></th>\n",
594 |        "      <th>emotion</th>\n",
595 |        "      <th>mean</th>\n",
596 |        "      <th>max</th>\n",
597 |        "      <th>min</th>\n",
598 |        "      <th>std</th>\n",
599 |        "      <th>count</th>\n",
600 |        "    </tr>\n",
601 |        "  </thead>\n",
602 |        "  <tbody>\n",
603 |        "    <tr>\n",
604 |        "      <th>0</th>\n",
605 |        "      <td>fear</td>\n",
606 |        "      <td>0.317880</td>\n",
607 |        "      <td>0.983642</td>\n",
608 |        "      <td>0.004371</td>\n",
609 |        "      <td>0.268948</td>\n",
610 |        "      <td>395</td>\n",
611 |        "    </tr>\n",
612 |        "    <tr>\n",
613 |        "      <th>1</th>\n",
614 |        "      <td>sadness</td>\n",
615 |        "      <td>0.254463</td>\n",
616 |        "      <td>0.996975</td>\n",
617 |        "      <td>0.002289</td>\n",
618 |        "      <td>0.291740</td>\n",
619 |        "      <td>397</td>\n",
620 |        "    </tr>\n",
621 |        "    <tr>\n",
622 |        "      <th>2</th>\n",
623 |        "      <td>joy</td>\n",
624 |        "      <td>0.800486</td>\n",
625 |        "      <td>0.997057</td>\n",
626 |        "      <td>0.042774</td>\n",
627 |        "      <td>0.195736</td>\n",
628 |        "      <td>391</td>\n",
629 |        "    </tr>\n",
630 |        "    <tr>\n",
631 |        "      <th>3</th>\n",
632 |        "      <td>angry</td>\n",
633 |        "      <td>0.315648</td>\n",
634 |        "      <td>0.997371</td>\n",
635 |        "      <td>0.008014</td>\n",
636 |        "      <td>0.240100</td>\n",
637 |        "      <td>394</td>\n",
638 |        "    </tr>\n",
639 |        "  </tbody>\n",
640 |        "</table>\n",
641 |        "</div>"
642 |       ],
643 |       "text/plain": [
644 |        "   emotion      mean       max       min       std  count\n",
645 |        "0     fear  0.317880  0.983642  0.004371  0.268948    395\n",
646 |        "1  sadness  0.254463  0.996975  0.002289  0.291740    397\n",
647 |        "2      joy  0.800486  0.997057  0.042774  0.195736    391\n",
648 |        "3    angry  0.315648  0.997371  0.008014  0.240100    394"
649 |       ]
650 |      },
651 |      "metadata": {},
652 |      "output_type": "display_data"
653 |     }
654 |    ],
655 |    "source": [
656 |     "emotion_dict = {\n",
657 |     "    'emotion': [],\n",
658 |     "    'mean': [],\n",
659 |     "    'max': [],\n",
660 |     "    'min': [],\n",
661 |     "    'std': [],\n",
662 |     "    'count': []\n",
663 |     "}\n",
664 |     "\n",
665 |     "for emotion, result in data_dict.items():\n",
666 |     "    emotion_dict['emotion'].append(emotion)\n",
667 |     "    emotion_dict['mean'].append(np.mean(result))\n",
668 |     "    emotion_dict['max'].append(np.amax(result))\n",
669 |     "    emotion_dict['min'].append(np.amin(result))\n",
670 |     "    emotion_dict['std'].append(np.std(result))\n",
671 |     "    emotion_dict['count'].append(len(result))\n",
672 |     "    \n",
673 |     "emotion_df = pd.DataFrame(data=emotion_dict)\n",
674 |     "display(emotion_df)"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": null,
680 |    "metadata": {},
681 |    "outputs": [],
682 |    "source": []
683 |   }
684 |  ],
685 |  "metadata": {
686 |   "kernelspec": {
687 |    "display_name": "Python 3",
688 |    "language": "python",
689 |    "name": "python3"
690 |   },
691 |   "language_info": {
692 |    "codemirror_mode": {
693 |     "name": "ipython",
694 |     "version": 3
695 |    },
696 |    "file_extension": ".py",
697 |    "mimetype": "text/x-python",
698 |    "name": "python",
699 |    "nbconvert_exporter": "python",
700 |    "pygments_lexer": "ipython3",
701 |    "version": "3.6.8"
702 |   }
703 |  },
704 |  "nbformat": 4,
705 |  "nbformat_minor": 2
706 | }
707 | 


--------------------------------------------------------------------------------
/notebooks/Emotion Recognition Model Validation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Emotion Recognition Model Validation\n",
  8 |     "\n",
  9 |     "The main objective of this notebook is to validate the trained model for emotion recognition"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "# Add project path to the PYTHONPATH\n",
 19 |     "\n",
 20 |     "import os\n",
 21 |     "import sys\n",
 22 |     "from pathlib import Path\n",
 23 |     "\n",
 24 |     "sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 2,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "import pickle"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Load Tokenizer\n",
 41 |     "\n",
 42 |     "Load the tokenizer from a `.pickle` file"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 3,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()\n",
 52 |     "with tokenizer_path.open('rb') as file:\n",
 53 |     "    tokenizer = pickle.load(file)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "## Load Model\n",
 61 |     "\n",
 62 |     "Rebuild the emotion recognition model architecture and load its trained weights"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 4,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM\n",
 72 |     "from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D\n",
 73 |     "from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate\n",
 74 |     "from tensorflow.keras.models import Model"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 5,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)\n",
 84 |     "num_classes = 4\n",
 85 |     "embedding_dim = 500\n",
 86 |     "input_length = 100\n",
 87 |     "lstm_units = 128\n",
 88 |     "lstm_dropout = 0.1\n",
 89 |     "recurrent_dropout = 0.1\n",
 90 |     "spatial_dropout=0.2\n",
 91 |     "filters=64\n",
 92 |     "kernel_size=3"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 6,
 98 |    "metadata": {},
 99 |    "outputs": [
100 |     {
101 |      "name": "stderr",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "WARNING: Logging before flag parsing goes to stderr.\n",
105 |       "W0719 10:46:16.952994 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
106 |       "Instructions for updating:\n",
107 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
108 |       "W0719 10:46:17.039670 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
109 |       "Instructions for updating:\n",
110 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
111 |       "W0719 10:46:17.047888 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
112 |       "Instructions for updating:\n",
113 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
114 |       "W0719 10:46:17.049386 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
115 |       "Instructions for updating:\n",
116 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n",
117 |       "W0719 10:46:17.050548 4735395264 deprecation.py:506] From /Users/rmohashi/miniconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
118 |       "Instructions for updating:\n",
119 |       "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
120 |      ]
121 |     }
122 |    ],
123 |    "source": [
124 |     "input_layer = Input(shape=(input_length,))\n",
125 |     "output_layer = Embedding(\n",
126 |     "  input_dim=input_dim,\n",
127 |     "  output_dim=embedding_dim,\n",
128 |     "  input_shape=(input_length,)\n",
129 |     ")(input_layer)\n",
130 |     "\n",
131 |     "output_layer = SpatialDropout1D(spatial_dropout)(output_layer)\n",
132 |     "\n",
133 |     "output_layer = Bidirectional(\n",
134 |     "LSTM(lstm_units, return_sequences=True,\n",
135 |     "     dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)\n",
136 |     ")(output_layer)\n",
137 |     "output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',\n",
138 |     "                    kernel_initializer='glorot_uniform')(output_layer)\n",
139 |     "\n",
140 |     "avg_pool = GlobalAveragePooling1D()(output_layer)\n",
141 |     "max_pool = GlobalMaxPooling1D()(output_layer)\n",
142 |     "output_layer = concatenate([avg_pool, max_pool])\n",
143 |     "\n",
144 |     "output_layer = Dense(num_classes, activation='softmax')(output_layer)\n",
145 |     "\n",
146 |     "model = Model(input_layer, output_layer)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 7,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()\n",
156 |     "model.load_weights(model_weights_path.as_posix())"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "## Load test dataset\n",
164 |     "\n",
165 |     "Load the dataset that will be used to test the model"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 8,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "import pandas as pd"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 9,
180 |    "metadata": {},
181 |    "outputs": [],
182 |    "source": [
183 |     "test_data_path = Path('../datasets/sentiment_analysis/test.csv').resolve()\n",
184 |     "test_data = pd.read_csv(test_data_path)"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 10,
190 |    "metadata": {
191 |     "scrolled": true
192 |    },
193 |    "outputs": [
194 |     {
195 |      "data": {
196 |       "text/html": [
197 |        "<div>\n",
198 |        "<style scoped>\n",
199 |        "    .dataframe tbody tr th:only-of-type {\n",
200 |        "        vertical-align: middle;\n",
201 |        "    }\n",
202 |        "\n",
203 |        "    .dataframe tbody tr th {\n",
204 |        "        vertical-align: top;\n",
205 |        "    }\n",
206 |        "\n",
207 |        "    .dataframe thead th {\n",
208 |        "        text-align: right;\n",
209 |        "    }\n",
210 |        "</style>\n",
211 |        "<table border=\"1\" class=\"dataframe\">\n",
212 |        "  <thead>\n",
213 |        "    <tr style=\"text-align: right;\">\n",
214 |        "      <th></th>\n",
215 |        "      <th>label</th>\n",
216 |        "      <th>id</th>\n",
217 |        "      <th>date</th>\n",
218 |        "      <th>user</th>\n",
219 |        "      <th>text</th>\n",
220 |        "    </tr>\n",
221 |        "  </thead>\n",
222 |        "  <tbody>\n",
223 |        "    <tr>\n",
224 |        "      <th>0</th>\n",
225 |        "      <td>fear</td>\n",
226 |        "      <td>1151474078131339264</td>\n",
227 |        "      <td>2019-07-17 12:49:48</td>\n",
228 |        "      <td>13thSnipers</td>\n",
229 |        "      <td>It's so obvious Ashley Young @youngy18 is not ...</td>\n",
230 |        "    </tr>\n",
231 |        "    <tr>\n",
232 |        "      <th>1</th>\n",
233 |        "      <td>fear</td>\n",
234 |        "      <td>1151474075723870208</td>\n",
235 |        "      <td>2019-07-17 12:49:47</td>\n",
236 |        "      <td>ShukrahFirdaus</td>\n",
237 |        "      <td>Engaging in a staring competition with this wo...</td>\n",
238 |        "    </tr>\n",
239 |        "    <tr>\n",
240 |        "      <th>2</th>\n",
241 |        "      <td>fear</td>\n",
242 |        "      <td>1151473913668313089</td>\n",
243 |        "      <td>2019-07-17 12:49:09</td>\n",
244 |        "      <td>EvinErvian</td>\n",
245 |        "      <td>@savage2ooo yah me too. worst? can't stand wat...</td>\n",
246 |        "    </tr>\n",
247 |        "    <tr>\n",
248 |        "      <th>3</th>\n",
249 |        "      <td>fear</td>\n",
250 |        "      <td>1151473830398976000</td>\n",
251 |        "      <td>2019-07-17 12:48:49</td>\n",
252 |        "      <td>oliviaakuhn</td>\n",
253 |        "      <td>i was with @regiannoni EXACTLY 2 years and 3 y...</td>\n",
254 |        "    </tr>\n",
255 |        "    <tr>\n",
256 |        "      <th>4</th>\n",
257 |        "      <td>fear</td>\n",
258 |        "      <td>1151473618318176257</td>\n",
259 |        "      <td>2019-07-17 12:47:58</td>\n",
260 |        "      <td>zaaboogie_</td>\n",
261 |        "      <td>This heat different 😰</td>\n",
262 |        "    </tr>\n",
263 |        "  </tbody>\n",
264 |        "</table>\n",
265 |        "</div>"
266 |       ],
267 |       "text/plain": [
268 |        "  label                   id                 date            user  \\\n",
269 |        "0  fear  1151474078131339264  2019-07-17 12:49:48     13thSnipers   \n",
270 |        "1  fear  1151474075723870208  2019-07-17 12:49:47  ShukrahFirdaus   \n",
271 |        "2  fear  1151473913668313089  2019-07-17 12:49:09      EvinErvian   \n",
272 |        "3  fear  1151473830398976000  2019-07-17 12:48:49     oliviaakuhn   \n",
273 |        "4  fear  1151473618318176257  2019-07-17 12:47:58      zaaboogie_   \n",
274 |        "\n",
275 |        "                                                text  \n",
276 |        "0  It's so obvious Ashley Young @youngy18 is not ...  \n",
277 |        "1  Engaging in a staring competition with this wo...  \n",
278 |        "2  @savage2ooo yah me too. worst? can't stand wat...  \n",
279 |        "3  i was with @regiannoni EXACTLY 2 years and 3 y...  \n",
280 |        "4                              This heat different 😰  "
281 |       ]
282 |      },
283 |      "execution_count": 10,
284 |      "metadata": {},
285 |      "output_type": "execute_result"
286 |     }
287 |    ],
288 |    "source": [
289 |     "test_data.head()"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "metadata": {},
295 |    "source": [
296 |     "## Load Encoder\n",
297 |     "\n",
298 |     "Load the fitted label encoder from its `.pickle` file"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": 11,
304 |    "metadata": {},
305 |    "outputs": [],
306 |    "source": [
307 |     "encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()\n",
308 |     "with encoder_path.open('rb') as file:\n",
309 |     "    encoder = pickle.load(file)"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "markdown",
314 |    "metadata": {},
315 |    "source": [
316 |     "## Preprocess data\n",
317 |     "\n",
318 |     "Clean and tokenize the tweet text, pad each sequence to 100 tokens, and encode the labels with the loaded encoder"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 12,
324 |    "metadata": {},
325 |    "outputs": [
326 |     {
327 |      "name": "stderr",
328 |      "output_type": "stream",
329 |      "text": [
330 |       "[nltk_data] Downloading package stopwords to\n",
331 |       "[nltk_data]     /Users/rmohashi/nltk_data...\n",
332 |       "[nltk_data]   Package stopwords is already up-to-date!\n"
333 |      ]
334 |     }
335 |    ],
336 |    "source": [
337 |     "from nlp.utils import preprocess\n",
338 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 13,
344 |    "metadata": {},
345 |    "outputs": [
346 |     {
347 |      "name": "stdout",
348 |      "output_type": "stream",
349 |      "text": [
350 |       "Time to clean up: 0.71 sec\n"
351 |      ]
352 |     }
353 |    ],
354 |    "source": [
355 |     "test_data['text'] = preprocess(test_data.text)\n",
356 |     "sequences = [text.split() for text in test_data.text]\n",
357 |     "list_tokenized = tokenizer.texts_to_sequences(sequences)\n",
358 |     "x_test = pad_sequences(list_tokenized, maxlen=100)\n",
359 |     "y_test = encoder.transform(test_data.label)"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "markdown",
364 |    "metadata": {},
365 |    "source": [
366 |     "## Results\n",
367 |     "\n",
368 |     "Predict the labels, then plot a normalized confusion matrix and save it to `confusion_matrix.png`"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": 14,
374 |    "metadata": {},
375 |    "outputs": [],
376 |    "source": [
377 |     "y_pred = model.predict(x_test)"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": 15,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": [
386 |     "y_pred = y_pred.argmax(axis=1)\n",
387 |     "y_test = y_test.argmax(axis=1)"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "code",
392 |    "execution_count": 16,
393 |    "metadata": {},
394 |    "outputs": [],
395 |    "source": [
396 |     "import numpy as np\n",
397 |     "import matplotlib.pyplot as plt\n",
398 |     "from sklearn.metrics import confusion_matrix\n",
399 |     "from sklearn.utils.multiclass import unique_labels"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": 17,
405 |    "metadata": {},
406 |    "outputs": [],
407 |    "source": [
408 |     "def plot_confusion_matrix(y_true, y_pred, classes,\n",
409 |     "                          normalize=False,\n",
410 |     "                          title=None,\n",
411 |     "                          cmap=plt.cm.Blues):\n",
412 |     "    \"\"\"\n",
413 |     "    This function prints and plots the confusion matrix.\n",
414 |     "    Normalization can be applied by setting `normalize=True`.\n",
415 |     "    \"\"\"\n",
416 |     "    if not title:\n",
417 |     "        if normalize:\n",
418 |     "            title = 'Normalized confusion matrix'\n",
419 |     "        else:\n",
420 |     "            title = 'Confusion matrix, without normalization'\n",
421 |     "\n",
422 |     "    # Compute confusion matrix\n",
423 |     "    cm = confusion_matrix(y_true, y_pred)\n",
424 |     "    # Only use the labels that appear in the data\n",
425 |     "    classes = classes[unique_labels(y_true, y_pred)]\n",
426 |     "    if normalize:\n",
427 |     "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
428 |     "        print(\"Normalized confusion matrix\")\n",
429 |     "    else:\n",
430 |     "        print('Confusion matrix, without normalization')\n",
431 |     "\n",
432 |     "    print(cm)\n",
433 |     "\n",
434 |     "    fig, ax = plt.subplots()\n",
435 |     "    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)\n",
436 |     "    ax.figure.colorbar(im, ax=ax)\n",
437 |     "    # We want to show all ticks...\n",
438 |     "    ax.set(xticks=np.arange(cm.shape[1]),\n",
439 |     "           yticks=np.arange(cm.shape[0]),\n",
440 |     "           # ... and label them with the respective list entries\n",
441 |     "           xticklabels=classes, yticklabels=classes,\n",
442 |     "           title=title,\n",
443 |     "           ylabel='True label',\n",
444 |     "           xlabel='Predicted label')\n",
445 |     "\n",
446 |     "    # Rotate the tick labels and set their alignment.\n",
447 |     "    plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\",\n",
448 |     "             rotation_mode=\"anchor\")\n",
449 |     "\n",
450 |     "    # Loop over data dimensions and create text annotations.\n",
451 |     "    fmt = '.2f' if normalize else 'd'\n",
452 |     "    thresh = cm.max() / 2.\n",
453 |     "    for i in range(cm.shape[0]):\n",
454 |     "        for j in range(cm.shape[1]):\n",
455 |     "            ax.text(j, i, format(cm[i, j], fmt),\n",
456 |     "                    ha=\"center\", va=\"center\",\n",
457 |     "                    color=\"white\" if cm[i, j] > thresh else \"black\")\n",
458 |     "    fig.tight_layout()\n",
459 |     "    return fig, ax"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "code",
464 |    "execution_count": 18,
465 |    "metadata": {
466 |     "scrolled": false
467 |    },
468 |    "outputs": [
469 |     {
470 |      "name": "stdout",
471 |      "output_type": "stream",
472 |      "text": [
473 |       "Normalized confusion matrix\n",
474 |       "[[0.83657588 0.07782101 0.01167315 0.07392996]\n",
475 |       " [0.12653061 0.76326531 0.00408163 0.10612245]\n",
476 |       " [0.06028369 0.02836879 0.90425532 0.0070922 ]\n",
477 |       " [0.0929368  0.05947955 0.00371747 0.84386617]]\n"
478 |      ]
479 |     },
480 |     {
481 |      "data": {
482 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUIAAAEYCAYAAAApuP8NAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3Xd4FWX2wPHvSUKRmhBUSIJ0CAm9gwgISA3ggnRQwL4WxNXVVVdd7GLD9nN1RbHSOwjYUBExNFFpEjAISSx0BQkmnN8fMwk3/QLJvTfkfJ5nntyZee/Mmdybk/d9Z+YdUVWMMaYkC/J3AMYY42+WCI0xJZ4lQmNMiWeJ0BhT4lkiNMaUeJYIjTElniXCc5yIPCgi77ivLxKRP0QkuJD3kSgiPQpzm17s80YR+cU9nvCz2M4fIlKnMGPzFxHZLCJd/R1HcWSJ8Cy5SeBXESnvsewaEVnpx7Bypao/qWoFVU33dyxnQ0RKAc8APd3j2X+m23Lfv6vwoit8IvKmiDxcUDlVjVXVlT4I6ZxjibBwBAMTznYj4rDPpGAXAmWBzf4OJBCISIi/Yyju7I+ucEwG7hCR0NxWikhHEVkrIofdnx091q0UkUdE5EvgGFDHXfawiKx2m26LRCRcRN4VkSPuNmp5bGOKiOxx160XkUvyiKOWiKiIhIhIB3fbGdNxEUl0ywWJyN0islNE9ovITBGp4rGdMSKy2113b36/GBE5T0SedssfFpFVInKeu26A25w75B5zI4/3JYrIHSLyrfu+GSJSVkQaANvdYodE5BPP48r2e73GfV1PRD5zt7NPRGZ4lFMRqee+riwib4nIb26892X8YxKRsW7sT4nIQRH5UUT65HPciSJypxv/URF5XUQuFJEPROR3EflIRMI8ys8SkZ/dGD8XkVh3+XXAKOCfGd8Fj+3fJSLfAkfdzzSzi0JElorI0x7bny4iU/P7rEo0VbXpLCYgEegBzAUedpddA6x0X1cBDgJjgBBghDsf7q5fCfwExLrrS7nLEoC6QGVgC/CDu58Q4C3gDY8YRgPh7rp/AD8DZd11DwLvuK9rAQqEZDuGUsBnwGPu/ARgDRAFlAH+C7zvrosB/gA6u+ueAdKAHnn8fl5yjycSp+bc0X1fA+AocJm7/3+6x1za4/caD0S4v8OtwA25HUdux+Xu8xr39fvAvTj/+MsCnTzKKVDPff0WsACo6G7zB+Bqd91Y4C/gWvc4bgSSAcnne7EGp/YaCfwKbABauDF8AjzgUX68u98ywHPANx7r3sT9bmXb/jdADeA8z++i+7qau89uOIl0F1DR338vgTr5PYDiPnEqETYGDgPnkzURjgHis73nK2Cs+3olMCnb+pXAvR7zTwMfeMz39/xDySWmg0Az9/WDFJwI/w9YDAS581uB7h7rq7tJIAS4H5jusa48cIJcEqGbeP7MiCXbun8DM7OVTQK6evxeR3usfxJ4JbfjyO24yJoI3wJeBaJyiUOBejjJ7QQQ47Hueo/PcSyQ4LGunPveavl8L0Z5zM8B/s9j/hZgfh7vDXW3Xdmdf5PcE+H43L6LHvODgT3APjySv005J2saFxJV/R4nmdydbVUEsDvbst04tYQMe3LZ5C8er//MZb5CxozbhNzqNqsO4dQiq3oTt4hcD3QFRqrqSXdxTWCe22Q9hJMY03FqNxGe8arqUSCvkxVVcWo/O3NZl+X34u57D1l/Lz97vD6GxzGfpn8CAsS7TfHxecRaiqyfVfbPKTMeVT3mvswvJq8+QxEJFpHH3a6IIzgJLSOm/OT2vfG0CCfBb1fVVQWULdEsERauB3CaTp5/PMk4icXTRTi1nwxnPASQ2x/4T2AoEKaqoTg1U/HyvQ8BA1X1iMeqPUAfVQ31mMqqahKQgtMcy9hGOZxmeW72AcdxmvjZZfm9iIi4203KpWxBjro/y3ksq5bxQlV/Vt
VrVTUCp5b3cka/YLZY/yLrZ5X9cyoqI4GBOC2Lyjg1XDj1Geb1/Sjoe/MIzj+x6iIy4ixjPKdZIixEqpoAzABu9Vi8FGggIiPdDu1hOP1siwtptxVx+uh+A0JE5H6gUkFvEpEawEzgSlX9IdvqV4BHRKSmW/Z8ERnorpsNxIlIJxEpDUwij++RW8ubCjwjIhFuzaeDiJRx991PRLqLcznMP4BUYPVpHb2zn99wEtZodx/j8Ui+IjJERKLc2YM4CeRktm2kuzE9IiIV3WO/HXjndOM5AxVxjn0/TjJ/NNv6X4DTutZRRDoD44ArgauAF0QkMv93lVyWCAvfJJx+MwDUucYtDucPfT9O7S1OVfcV0v6WA8twOvZ349TACmoyAXTHaerOllNnjjMuR5kCLARWiMjvOJ3+7dzj2QzcBLyHUzs8COzNZz93AN8Ba4EDwBM4fZHbcU7yvIBTG+sP9FfVE14ed3bXAnfi/I5jyZpQ2wBfi8gf7nFN0NyvHbwFp3a5C1jlHqMvzrS+hfPZJeGcGFuTbf3rQIzbVTG/oI2JSCV3mzerapKqfuFu4w235m2yEbdT1RhjSiyrERpjSjxLhMaYEs8SoTGmxLNEaIwp8exm7VxI6fIqZcMKLhigmtS90N8hnLXgoOJ9cjP9ZPE+Cbn3p93s37+v0D6E4Eo1VdP+LLCc/vnbclXtXVj79ZYlwlxI2TDKtL3F32GcsWVzJvo7hLNW+bxS/g7hrPx+PM3fIZyVnl3aF+r2NO1PyjQcWmC549+85NUdUYXNEqExpuiJQFChjgdcqCwRGmN8I4CH2rREaIzxjQC+qcUSoTHGB6xpbIwp6QRrGhtjSjqxprExxljT2BhTwok1jY0xJZxgTWNjTEknEBS46SZwIzPGnFsC+P5xS4TGmKJnl88YYwzWR2iMKenszhJjjLGmsTGmhBO7s8QYY6xpbIwp6QL7zpLAjayYuqxNbTa9cQ3fT7uOO4a3y7G+xgUVWfbUcL56ZSzxr46jV9s6Odb/tmgitw1p66uQs/j0o+V0at2Yji0a8cKzk3OsT01N5fpxo+jYohH9undiz+5EAP766y8m3HA13Tq2pHPbprzwzJM+jtyxYvkymjeOpkmj+jw1+fEc61NTU7ly1HCaNKpPl07t2Z2YCMD+/fvp07MbF1SpyO0TbvZx1Fl98tFyLm4VS/vmjXL9PaampnLd2JG0b96IPt0u5if3M5gz8z26d2qdOVUPLcP3337j4+jzkdE8zm/yajPSW0S2i0iCiNydy/qLRORTEdkoIt+KSN+CtmmJsBAFBQnP3XIZA++ZRYur/8eQS2OIvig8S5m7RnVkzmfb6HDDm1z58EKm3Nozy/onbujOivhdvgw7U3p6OvfcMYF3Zy9k5debWDB7Bj9s25qlzPtvv0FoaCirN27l2r/fysMP3gvAovlzSD2RyierN7Bs5RrefuN/mUnSl/HfPuFm5i1cyvpNm5k1Yzpbt27JUmbaG68TGhrKd1t3cPOtt/Hve52/o7Jly/LvBybx6OM5k78vpaen869/TOC92Yv4PH4T8+bMYPu2rMfw3ltvEBoaxppvtnL932/l4QfuAWDw0JF8vGodH69ax4v/fYOLatamcdPm/jiMnMS9s6SgqcDNSDDwEtAHiAFGiEhMtmL3ATNVtQUwHHi5oO2e04lQRHza9G/TsDo7kw+RmHKYv9JOMmvlVuIurp+ljCpUKlcGgMrly5Cy/4/Mdf071ifx50Ns2b3Pl2Fn2rh+LbXq1KVmrTqULl2agYOHsnzpoixlli9dxJARYwCIGziIVZ99iqoiIhw7epS0tDSOH/+T0qVLUaFSJZ/Gv25tPHXq1qN2HSf+K4YOY/GiBVnKLF60kFFjrgLgb4OuYOWnH6OqlC9fno4Xd6JM2bI+jTm7jevXUrtOXWrWdo7h8kFDWb4k52cwdKT7GVw+OPMz8DRv9gwuHzzEZ3F7pXBqhG2BBFXdpaongOnAwGxlFM
j48lUGkgvaaEAlQhGZLyLrRWSziFznLvtDRB4RkU0iskZELnSX13XnvxORh0XkD3d5VxH5QkQWAltEZJKI3Oaxj0dEZEJRxB9RtSJ7fz2SOZ/02+9EhlfIUuaRt1YxvEcsCe//nXmPDuH2Fz8EoHzZUvxjeDseeevLogjNKz+nJBMRWSNzvnpEJCkpSbmUiQIgJCSESpUqceDAfuIGDqJc+fI0b1iTNo3rccMtEwkLq+LT+JOTk4iqEZU5HxkZRUpSUs4yUTU84q/M/v37fRpnflKSkzJ/vwDVIyNJScn6d5ySkpTlM6hYqTIHDmQ9hgVzZ3P5FcOKPuDTIUEFT1BVRNZ5TNdl20oksMdjfq+7zNODwGgR2QssBQp8JGVAJUJgvKq2AloDt4pIOFAeWKOqzYDPgWvdslOAKaraBOeX4aklMEFVGwBTgSsBRCQIp6r8TpEfSR6GXhrDO8u/o96Il/nbPbN4/e44ROC+Kzvxwpx1HD3+l79COysb168lODiYjdsS+XrTdl558Tl2J/qniV/SbVgXz3nlzqNRTGN/h3JKxlPsCppgn6q29phePYO9jQDeVNUooC/wtvu3n6dAS4S3isgmYA1QA6gPnAAWu+vXA7Xc1x2AWe7r97JtJ15VfwRQ1URgv4i0AHoCG1U1RxVARK7L+C+kfx09o+CT9/1O1AWnmoOR51ckyaPpC3BVn6bM+WwbAF9vTaZs6RCqVi5Hm0bVeeTarmx75wZuHtSaO0e054aBLc8ojjNVrXoEyUmn/tmmJCdRvXpkLmWc/ztpaWkcOXKEKlXCmTd7Opd270mpUqWoev4FtGnXkU0bN/g0/oiISPbuOfU/MSlpL9UjI3OW2bvHI/7DhIdn7cf1p+oRkZm/X4CUpCSqV4/IWqZ6ZJbP4Pcjh6lS5dQxzJ8zk78NDrDaIBRW0zgJJzdkiHKXeboamAmgql8BZYF8n5ccMIlQRLoCPYAObu1vI84B/KWnOkDS8e6Sn+yZ7H/AWGAcTg0xB1V9NeO/kJQqf/oHAKzbnkK9yDBqVqtMqZAghnRtxJLVCVnK7Pn1CF1b1ASg4UXhlC0VzG+HjtFj4ntEj36F6NGv8OLcdUx+fw2vLPBtImnesjU/7kzgp8QfOXHiBAvmzKRnn7gsZXr2iWPW+28DsHjBXDp17oqIEBl1Eas+XwnAsaNH2bDua+rVb+jT+Fu1bsPOhB0k/ujEP3vmDPrFDchSpl9cf959exoA8+bOpkvXbkgAXejbvGVrdu1MYLf7GcyfO5OefbN9Bn3jmPme+xnMn8PF7mcAcPLkSRbOm83lgwt+mLqviUiBkxfWAvVFpLaIlMZp4S3MVuYnoLu7z0Y4eeS3/DYaSNcRVgYOquoxEYkG2hdQfg0wGJiB88vIzzxgElAKGHm2geYl/aQy8YUPWfT4UIKDhGnLvmPr7n38+6pObPjhZ5Z8lcDdr3zCy7f35pbBbVBVrp28tKjCOW0hISE8Mvk5Rg6OIz09neGjx9KwUQxPPvIfmrVoSa++/RkxZhy3Xj+Oji0aERpWhf+b6vxBjrvmBibedC1d2zdHVRk26kpiGjfxefxPP/cCA+N6k56ezpVjxxETE8tD/7mfli1b06//AK4adzXXjLuSJo3qE1alCtPefj/z/Y0a1Ob3I0c4ceIEixYtYOGS5TRqlP2EZNEfw6NPPceIQf1ITz/JiNFXEd0oliceeZDmLVrRq29/Ro4Zx83XjaV980aEhoXx36mnenq++vILIiKjqFm7Tj578T0RkEIYhktV00TkZmA5EAxMVdXNIjIJWKeqC4F/AK+JyEScEydjNfvZpOzxFbDeZ0SkDDAfp+m7HQjF6fRcrKoV3DJXAHGqOlZE6uP09Z0HLANGqWqkW7O8Q1Xjsm3/FeCQqua47ii7oEpRWqZtgf2rAWvXnIn+DuGsVT6vlL9DOCu/H0/zdwhnpWeX9mzauL7QqsrBVWrreT0eKLDc0Vnj1qtq68Lar7cCpk
aoqqk41wZlV8GjzGxgtjubBLRXVRWR4UBDt8xKYKXnBtyO0vZAgF1PYEzJEUhdENkFTCI8A62AF8X57R4CxudWyL3YcjEwT1V3+DA+Y4yHoKCAOSWRQ7FNhKr6BdDMi3JbgMDqMDGmpBF3ClDFNhEaY4oPweuzwn5hidAY4xPWNDbGlHhWIzTGlGzWR2iMKekEsaaxMcZY09gYYwI3D1oiNMb4gNhZY2OMsaaxMaZkswuqjTGmkIbhKiqWCI0xPmE1QmNMiWeJ0BhT4lnT2BhTop3GM0n8whKhMcYnLBEWM7F1LmD+e7f6O4wzFn3jDH+HcNZ+eWuMv0M4K8f/Svd3CGelKB5lZInQGFPiWR+hMaZkE6sRGmNKOMF5tnGgskRojPEBIciaxsaYks6axsaYkk2saWyMKeEECA4O3ExoidAY4xPWNDbGlGzWNDbGlHT2FDtjjMFqhMYYY32ExpiSTQS7oNoYYwK4Qkjg9l4aY84pGYOz5jd5uZ3eIrJdRBJE5O48ygwVkS0isllE3itom1YjNMYUvUJqGotIMPAScBmwF1grIgtVdYtHmfrAv4CLVfWgiFxQ0HatRmiMKXIZo88UNHmhLZCgqrtU9QQwHRiYrcy1wEuqehBAVX8taKOWCI0xPlBws9htGlcVkXUe03XZNhQJ7PGY3+su89QAaCAiX4rIGhHpXVB01jQ2xviEl03jfara+ix3FQLUB7oCUcDnItJEVQ/lGdtZ7tAYYwrmRbPYy6ZxElDDYz7KXeZpL7BQVf9S1R+BH3ASY54sERayzz5ZwWUdm9GtXWNeef6pHOvjv1rFgB4daBhRkQ8WzctcnrTnJwb06ED/bu3o3bkV7017zZdhZ+reNIJ1Tw1g4zMDmdg/Nsf6R0e35otH+/HFo/1Y//RAdr82LHNdVHg55t3dnfjJA/j6yf5cVLW8L0PPtGL5MprGNiQ2uh6Tn3w8x/rU1FRGjxxGbHQ9LunYjt2JiZnrJj/xGLHR9Wga25APVyz3YdSnrPx4Bd3aNaVLm1henjI5x/qvV6+i36UdqHthBZYunJtl3ZVDB9CkTjXGjxjkq3C94vQRFspZ47VAfRGpLSKlgeHAwmxl5uPUBhGRqjhN5V35bbRYNY1F5FbgRmCDqo7ydzzZpaen8+DdE5k2czHVIiIZ1OsSuvfqR/2GjTLLRETW4Mkpr/K//5uS5b3nX1iNWUtWUqZMGY4e/YO+XVrTvVc/LqwW4bP4g0R4elxbLn/sI5L2H+PTh/uwdMNeticdzixzzzvrMl9f17MhTWtVyZx/5caLeXr+93z6fQrly4RwsigehVaA9PR0brv1JpZ88CGRUVF0at+GuLgBNIqJySzz5tTXCQsNY/O2BGbOmM6999zFO+/NYOuWLcyaMZ0NmzaTkpxM3949+G7LDwQHB/s0/vvvuo13Zi+hWkQkAy7rxGW947J+h6Jq8NSLr/LaS8/leP/1N0/kz2PHeG/a6z6L2VuFcdZYVdNE5GZgORAMTFXVzSIyCVinqgvddT1FZAuQDtypqvvzje2sI/OtvwOXnU0SFJEiS/6bNqyjZu26XFSrNqVLl6bf5Vfw0bLFWcpEXVST6NgmOW5AL126NGXKlAHgRGoqJ0+eLKow89SqXji7fvmdxF//4K/0k8z9ajf9WtXIs/wVHWsxZ3UiAA0jKxMSHMSn36cAcDQ1jT9P+P6Rlmvj46lbtx6169ShdOnSDBk2nMWLFmQps3jRAkaNuQqAQYOvYOUnH6OqLF60gCHDhlOmTBlq1a5N3br1WBsf79P4v9mwNst3qP/fhrDig6zfoRoX1aRRbBMkl0EMLu58KeUrVPRVuKelsK4jVNWlqtpAVeuq6iPusvvdJIg6blfVGFVtoqrTC9pmsUmEIvIKUAf4QETuFZGpIhIvIhtFZKBbppaIfCEiG9ypo7u8q7t8IbAln92clV9+TqZ6xKkTWNUiIvnl52Sv35+ctJd+XdtyScsGXHfz7T6tDQ
JEhJUjaf/RzPmkA0epXuW8XMvWqFqemudX4LPNPwNQr3olDh89wTu3deGLR/vx0MiWBPnhVoLk5CSiok4l78jIKJKSknKWqeGUCQkJoVLlyuzfv5+kpJzvTU7O3v1UtH5JSSYiIipzvnpEJL+k+DaGIlF4fYRFotgkQlW9AUgGLgXKA5+oalt3frKIlAd+xakxtgSGAc97bKIlMEFVG+S2fRG5LuOU/YH9+4ryUPIUERnFkpXxfLzmO+bNeJd9v/7ilzi8MbhDLRbE/5TZ/A0JEjpEX8B9762n631LqXVBBUZ1qevnKE2gEPfhTQVN/pJnIhSRSvlNvgwyFz2Bu0XkG2AlUBa4CCgFvCYi3wGzgBiP98S7Z5BypaqvqmprVW1dJbzqGQV1YbUIUjxqED8nJ51Rre7CahE0iI5h7derzyiOM5V88BiR4adOcERWKU/KgT9zLTu4Qy1mu81igKQDx/hu90ESf/2D9JPK4nV7aObRf+grERGR7N176jKzpKS9REZG5iyzxymTlpbGkcOHCQ8PJzIy53sjIrJfola0LqweQXLy3sz5lOQkLqzu2xiKSpBIgZPfYstn3Wbge/fn5mzz3xd9aPkSYLCqNneni1R1KzAR+AVoBrQGSnu852gu2ylUTVu0YveuBPbsTuTEiRMsmT+b7r36efXelOS9HP/TSTqHDx1kXfxX1Kmb7xn/Qrdh537qVqtIzfMrUCo4iEEdarJ0/Z4c5epHVKJy+dLE7/gty3srlytFeEWnn7NzbDW2JeV52VaRad2mDQkJO0j88UdOnDjBrBnT6Rc3IEuZfnEDePftaQDMnTObLpd2Q0ToFzeAWTOmk5qaSuKPP5KQsIM2bdv6NP5mLVqT6PEdWjRvFpf19u47FOgCuWmc54kDVc27l9z/lgO3iMgtqqoi0kJVNwKVgb2qelJErsI5q+QzISEhPPDYM4wbPoD09HSGjLiSBtExPPfEJBo3a0mP3nF8u3EdN44bzpFDh/hkxVKmTH6YZZ+vZ+eO7Tz2wL8QEVSVa26cQMOYxr4Mn/STyh1vxjP37u4EBwnvrExgW9Jh7rmiGRt37eeDDU5NZXCHWsz9KjHLe0+q8u93N7Dw3ssQ4JsfDzDtkwSfxg/OZ/DslBfp368X6enpXDV2PDGxsUx68H5atmpNXP8BjB1/NePHjiE2uh5hYVV4+12nLz0mNpbBQ4bSomkMISEhPPf8Sz49Y5wR/6THn+XKIf1JP5nO0JFX0SA6hmcem0ST5i25rE8cmzas4/qrhnH48CE+Xr6UZ594mA+/3ADAkLju7NzxA0eP/kH7JnV5YsordOl2mU+PITciEBzAw3CJenGJg4gMB+qo6qMiEgVcqKrrizy6nHEk4tT0jgLPAR1xarU/qmqce7P1HECBZcBNqlpBRLoCd6hqnDf7adK8pc5f8WURHIFvtLxttr9DOGu/vDXG3yGclV8OH/d3CGelf/eL+fab9YWWuSrXbKQX/2tageU+uLHd+kK4s+S0FXgpiYi8iNP31hl4FDgGvAK0KdrQclLVWh6z1+eyfgfQ1GPRXe7ylTh9icYYPwnk8Qi9uaauo6q2FJGNAKp6wL2i2xhjvCI4Z44DlTeJ8C8RCcJpbiIi4YDvr/Y1xhRfIgHdR+jNdYQv4fS7nS8i/wFWAU8UaVTGmHNOsTxrnEFV3xKR9UAPd9EQVfX35TPGmGJEwK/XCRbE2/tug4G/cJrHxeZuFGNM4Ajkp9gVmNRE5F7gfSACZ+yv90TkX0UdmDHm3OFNszigm8bAlUALVT0GICKPABuBx4oyMGPMuaW4N41TspULcZcZY4zXimUiFJFncfoEDwCbRWS5O98TZ5RYY4zxinOyxN9R5C2/GmHGmeHNwBKP5WuKLhxjzDnpNAZe9Yf8Bl0IvLG+jTHFViCfNfbmXuO6wCM4Y/uVzVie1wCnxhiTXaA3jb25JvBN4A2cY+kDzARmFGFMxphzUGE9s6QoeJMIy6nqcg
BV3amq9+EkRGOM8YoIBIsUOPmLN5fPpLqDLuwUkRtwHqYcmI/JMsYErAA+V+JVIpyI87CkW3H6CisD44syKGPMuadYnjXOoKpfuy9/B4r3sMHGGL8QAnsYrvwuqJ6HOwZhblR1UJFEZIw59/j5XuKC5FcjfNFnUQSY4CCh0nneDswTePZMHeXvEM5aWJub/R3CWTm4tnj/+ZQKLvysVSybxqr6sS8DMcacuwT8ela4IMW32mOMKVYCuIvQEqExxjfOiUQoImVUNbUogzHGnJsC/QHv3oxQ3VZEvgN2uPPNROSFIo/MGHNOCeQRqr25xe55IA7YD6Cqm4BLizIoY8y5JePhTQVN/uJN0zhIVXdnO/WdXkTxGGPOUUVwRU6h8SYR7hGRtoCKSDBwC/BD0YZljDmXiJ9rfAXxpml8I3A7cBHwC9DeXWaMMV4rrD5CEektIttFJEFE7s6n3GARURFpXdA2vbnX+FdguHchGmNMTgKEFMJZY7dV+hJwGbAXWCsiC1V1S7ZyFYEJwNc5t5KTNyNUv0Yu9xyr6nXe7MAYY6DQzgq3BRJUdZezTZkODAS2ZCv3EPAEcKc3G/WmafwR8LE7fQlcANj1hMYY74lzQXVBE1BVRNZ5TNkrXJHAHo/5ve6yU7sSaQnUUFXPh87ly5umcZZh+UXkbWCVtzswxhhwhuLywj5VLbBPL899OINIPwOMPZ33ncktdrWBC8/gfcaYEsrpIyyUTSUBNTzmo9xlGSoCjYGV7iV/1YCFIjJAVdfltVFv+ggPcqqPMAjnge95nqkxxpjcFNIwXGuB+iJSGycBDgdGZqxU1cNAVY99rgTuyC8JQgGJUJzIm3Eq455U1TwHazXGmNwU1uM8VTVNRG4GlgPBwFRV3Swik4B1qrrwTLabbyJUVRWRpara+Ew2bowxABTioAuquhRYmm3Z/XmU7erNNr1ptX8jIi282ZgxxuQmo0boxVljv8jvmSUhqpoGtMC5aHEncBTnmFRVW/ooRmPMOSCA77DLt0YY7/4cADQE+gJDgCvcnyYXn3y4nA4tY2nbrBHPP/NkjvWpqalcO3YkbZs1ovelF/PT7sTMdZu//5Y+3S/hkrbN6NK+BcePH/dh5I6PViyjVdNGNI9twDOTn8jghDvdAAAgAElEQVSxPjU1lbGjh9M8tgHdLunAbjf+9Wvj6dSuJZ3ateTiti1YtGCejyN3XNaxEZvm/ZvvFzzAHeMuy7H+ouphLH3lFuJn/Ivlr00g8oLQzHWj+rfjuwX3892C+xnVv50vw85ixfJlNI1tSGx0PSY/+XiO9ampqYweOYzY6Hpc0rEduxMTAdi/fz+9elxK1dAK3HZrYD3zRSj44e7+HMo/v0QoAKq6M7fJR/F5RURW+zsGgPT0dO76xwTen7OIVWs3MXf2DLZvy3rB+7tvvUHl0DDiN23l+ptu5aEH7gEgLS2Nv187lsnPvcgX8ZuYt+QjSpUq5fP4/3HbLcxesIT4jd8zZ9Z0tm3NGv9bb04lNCyMbzb/wN9vmcAD9zoXEDSKbczKL+NZ9fUG5ixYym233EhaWppP4w8KEp67eygDb36ZFoMfZkjvVkTXqZalzGMT/8a7S+JpO+wxHn31AybdMgCAsErluPe6PnQe8xSXjJ7Mvdf1IbTieT6NH5zP4LZbb2LBog/Y+O0WZk1/n61bsn4Gb059nbDQMDZvS+CWCRO59567AChbtiz3P/gQjz3xlM/jLpD3F1T7RX6J8HwRuT2vyWcRekFVO/o7BoAN69ZSu05datWuQ+nSpfnb4KEsW7IoS5llSxYxbITzeOj+lw/mi5Wfoqqs/PhDYmKb0LhJMwCqhIcTHBzs0/jXr42nTt261HbjHzRkGEsWZz0Jt3TxAkaOuhKAywddwWcrP0FVKVeuHCEhTk/L8dTjfnliWZvGtdi5Zx+JSfv5Ky2dWcs3ENe1aZYy0XWq81n8dgA+W/sDcV2bAE5N8uM12zh45BiHfv+Tj9
dso+fFMT4/hrXx8dStW4/adZzPYMiw4SxetCBLmcWLFjBqzFUADBp8BSs/+RhVpXz58lzcqRNly5b1edzeCOTxCPNLhMFABZwLFHObAoaI/CGOySLyvYh8JyLD3HVvicjlHmXfFZGBRRHHzylJREZFZc5Xj4gkJTk5zzIhISFUrFSZAwf2szNhByLC0Mv70f2StrzwnO//qycnJxEZdepa1cjISFKSkrKUSUlOziwTEhJCpUqVObB/PwDr4r+mXcsmdGzdjGeffzkzMfpKxAWV2fvLwcz5pF8OEnl+5SxlvvshiYHdmgMwsFszKlU4jyqVyxNxfmjW9/56iIjzQ/G15OQkorJ8BlEkZfsMkpOTiKrh8RlUrsx+9zMIVIJz1rigyV/y+6amqOokn0Vy9gYBzXGue6yKc4Lnc+B1YCIwX0QqAx2Bq7K/2b2n8TqAqBoX+SrmTGnpacSvWc3ylas577xyDO7fi2bNW9K5azefx3KmWrdtx9cbvmP7tq3ccM04LuvVJ+BqJ/96dh7P3jWE0QPa8eWGBJJ+OUh6+kl/h1UiFNeTJQEcdq46Ae+rarqq/gJ8BrRR1c9wrkQ/HxgBzHHPhmehqq+qamtVbR1etWr21V6pVj2SpL17M+dTkpOoHhGRZ5m0tDR+P3KYKlXCiYiIpH3HToSHV6VcuXL06NmbbzdtPKM4zlRERCRJe0/dz56UlET1yCz3s1M9IiKzTFpaGkeOHKZKeHiWMg2jG1G+QgW2bP6+6IP2kPzrYaIuDMucj7wwjKTfDmcpk/LbYYbf8T86jHiCB150ui0O//Enyb8dyvreC0JJ/u2QbwL3EBERyd4sn8FeIrN9BhERkezd4/EZHD5MeLbPINAITrIpaPKX/Pbd3WdRFL23gNHAOGBqUe2kRavW7NqVwO7EHzlx4gTz5sykV9+4LGV69Y1jxvtvA7Bo/hw6demKiHBp955s3fI9x44dIy0tjdVffkHDho2KKtRctWzdhp0JCSS68c+dNYO+/fpnKdO33wDee/ctAObPnU3nLpciIiQm/ph5cuSn3bvZsX0bNWvW8mn86zbvpt5F51MzIpxSIcEM6dWSJSu/zVImPLR8Zv/lneN7MW3BGgA+XL2VHh2iCa14HqEVz6NHh2g+XL3Vp/EDtG7ThoSEHST+6HwGs2ZMp1/cgCxl+sUN4N23pwEwd85sulzazS99sqdFAruPMM+msaoe8GUgheAL4HoRmQZUATpzaiyyN3EuB/o5+wCOhSkkJITHJz/HsL/1Iz39JCPHXEV0o1gef/hBmrdsRe++/Rl15Thuum4sbZs1IiwsjP++8Q4AoWFh3HDTBHp17YCI0L1nby7r3beoQs0z/qeefZ5B/fuQnp7O6KvG0SgmlkcmPUCLlq3oGzeAMWPHc934K2ke24CwsCpMffs9ANasXsWzTz1JqVKlkKAgnp7yImdasz5T6eknmfjETBa9fBPBQcK0BWvYuutn/n1jPzZs+Ykln31H59b1mXTLAFRh1YYEbntsJgAHjxzjsdeWseqdfwLw6KvLOHjkmE/jB+czeHbKi/Tv14v09HSuGjuemNhYJj14Py1btSau/wDGjr+a8WPHEBtdj7CwKrz97vTM9zesV4vfjxzhxIkTLFo4n8VLV9AoxvcnfbLLeHhToJJz4dZhEfkdqAQ8CfTBGSTiYc8hxERkGTBfVV8paHvNW7bSDz9bU1ThFrkypXx7trkoXNjhVn+HcFYOrn3R3yGclYvbtWb9+nWFlrnqxDTVh95eWmC50a1rrD+bYbjOlG9P6xUBEQkHDriDQdxJLiPSikg5oD7wvo/DM8YAIAQV5we8BzIRiQC+AvK81kREegBbgRfcIXqMMT4W6CdLinWNUFWTgQYFlPkIqOmbiIwxeQnkEzrFOhEaY4oJCeyTJZYIjTFFLqNpHKgsERpjfMKaxsaYEi+ATxpbIjTGFD2naRy4mdASoTHGJwK4ZWyJ0BjjC/69l7gglgiNMU
XOmsbGGCPWNDbGGGsaG2NKtoznGgcqS4TGGJ8Q6yM0xpR01jQ2xpRo1jQ2xhjEmsbGmBLOLp8pfk4qpKYV32fdBvIoH94q7s/8COv5qL9DOCupO1IKdXsCBAfw99ISoTHGJwI3DVoiNMb4SgBnQkuExhifCOTLZwJ59GxjzDlEvJi82o5IbxHZLiIJInJ3LutvF5EtIvKtiHwsIgU+vM0SoTHGNwohE4pIMPAS0AeIAUaISEy2YhuB1qraFJgNPFnQdi0RGmOKnLhPsSto8kJbIEFVd6nqCWA6MNCzgKp+qqrH3Nk1QFRBG7VEaIzxCS8rhFVFZJ3HdF22zUQCezzm97rL8nI18EFBsdnJEmOMb3jXCbhPVVsXyu5ERgOtgS4FlbVEaIzxgUIbqj8JqOExH+Uuy7o3kR7AvUAXVU0taKPWNDbGFDlvmsVepsm1QH0RqS0ipYHhwMIs+xJpAfwXGKCqv3qzUUuExhjfKIRMqKppwM3AcmArMFNVN4vIJBEZ4BabDFQAZonINyKyMI/NZbKmsTHGJwrrgmpVXQoszbbsfo/XPU53m5YIjTE+Ebj3lVgiNMb4wuncOuIHlgiNMUXOGaE6cDOhJUJjjE8Ebhq0RGiM8ZUAzoSWCI0xPmFNY2NMiRe4adASoTHGVwI4E9qdJYXs049W0KVtEzq1iuGl5ybnWJ+amsqN40fTqVUM/Xtcwp6fEgE4ceIEt990LT0ubkXPS9rw1arPfBy54+MPl9O+RSxtmkUz5emcw7ilpqZyzVUjadMsml6XduSn3YmZ6zZ//y19unWiU5tmdG7XnOPHj/sw8lNWLF9G09iGxEbXY/KTj+dYn5qayuiRw4iNrsclHduxOzExc93kJx4jNroeTWMb8uGK5T6M+pTL2tRh07Tr+f7tG7hjRIcc62tcUIllT4/iq/+OJ/61a+jVrm6O9b8tuYPbhrbzVcgFKsRhuIpEwCVCEaklIt/7O44zkZ6ezn3/nMBbMxfwyVffsGDOTH7YtjVLmenvvEloaCir1m/hmhtv4dEH7wPgvbemAvDRl+t5b+4SHvr33Zw86dsn6aWnp3P3P25l+txFfLn2W+bNns72bVuylHn3ramEhoaydtM2brhpApPuvweAtLQ0/n7NVUye8hKr1m5i/tKPKVWqlE/jzziG2269iQWLPmDjt1uYNf19tm7JegxvTn2dsNAwNm9L4JYJE7n3nrsA2LplC7NmTGfDps0sXLyMCbf8nfT0dJ/GHxQkPDehFwPvnkGLca8ypFsM0TWrZilz1+iLmfPZVjpcP5UrH57PlAm9sqx/4sYerIjf6cuwvVJYI1QXhYBLhMXZN+vXUqt2XWrWqkPp0qUZMGgIKz5YlKXMiqWLuGL4aAD6DRzEl59/iqqyY/tWLu7cFYCq519ApcqV2bRxvU/j37Aunlp16lKrthP/5YOH8cHirPF/sGQRw0aOAaD/5YP5YuUnqCqffvwhMY2b0LhJMwCqhIcTHBzs0/gB1sbHU7duPWrXcY5hyLDhLF60IEuZxYsWMGrMVQAMGnwFKz/5GFVl8aIFDBk2nDJlylCrdm3q1q3H2vh4n8bfJjqCnUkHSUw5xF9pJ5n1yRbiOtbPUkYVKpUrDUDl8mVI2f9H5rr+Fzcg8edDbEnc59O4vRLAmbDIEqGIlBeRJSKySUS+F5FhInK/iKx1518V9wG8ItLKLbcJuMljG2NFZK6ILBORHSLypMe6niLylYhsEJFZIlLBXf64x/MKnnKXDXH3uUlEPi+qY/45JZmIyFOD4VaPiOTnlOQ8y4SEhFCxUiUOHthPTGwTPvxgCWlpafy0+0e++2YjKUl7iyrUXKWkJBPpEX9EZCQpKVlHOPo5OZnIqBqZ8VeqXJkD+/ezM+EHRIQhl/elW6c2vPDsUz6NPUNychJRUadGaYqMjCIpKSlnmRpZj2H//v0kJeV8b3JyjhGeilRE1Yrs/fVI5nzSvt+JPL9iljKPTPuc4T0akz
DjZuY9NpTbn18BQPmypfjH8PY8Mu0Ln8bsnYKbxf5sGhflyZLeQLKq9gMQkcrAh6o6yZ1/G4gDFgFvADer6ucikr1jrTnQAkgFtovIC8CfwH1AD1U9KiJ3AbeLyEvA34BoVVURCXW3cT/QS1WTPJZl4Y6Eex2Q+YfuS8NGj2XHD9vp160jkTUuolXb9gT5oUZ1ptLT0vn6q9WsWPkV55Urx+C4njRr0ZLOXbv5O7RzztBusbyz/FumzIqnXUwkr/9rAK2ufpX7xl7CC7PXcvT4X/4OMQd/N30LUpSJ8DvgaRF5Alisql+IyGAR+SdQDqgCbBaRL4BQVc2oqb2N82CWDB+r6mEAEdkC1ARCcR7c8qVbqSwNfAUcBo4Dr4vIYmCxu40vgTdFZCYwN7dgVfVV4FWApi1a6ZkccLXqESR71OJSkpOoVj0i1zLVI6NIS0vj9yNHCKsSjojw4KOn/gdc3qsrdepmbRIVterVI0jyiD85KYnq1bOOgl4tIoKkvXuIcOM/cvgwVcLDiYiMpH3HToRXdfqzevTqw7ffbPR5IoyIiGTv3lMjuScl7SUyMjJnmT17iIo6dQzh4eFERuZ8b0REfqPAF77kfb8TdUGlzPnIqhVJ+u33LGWu6tuMgXdNB+DrLUmULR1M1crlaBMdyd86R/PI9ZdSuUJZTp5Ujp9I45X5vu1iyVMAZ8Iiaxqr6g9AS5yE+LCI3A+8DFyhqk2A14CyXmzKc3TZdJzkLTi1y+buFKOqV7tjlbXFeXJVHLDMjeUGnBpkDWC9iIQXykFm06xlaxJ3JfDT7h85ceIEC+fO4rLecVnKXNYnjtnT3wFgyYK5XHxJV0SEP48d49jRowB8/ulHBIcE0yC6UVGEmacWrdrw484Edic68c+fM4Pe/bLG37tvHDPeexuARfPn0KnLpYgIl3bvydYt33Ps2DHS0tJYvepzn8cP0LpNGxISdpD4o3MMs2ZMp1/cgCxl+sUN4N23pwEwd85sulzaDRGhX9wAZs2YTmpqKok//khCwg7atG3r0/jXbUumXmQYNatVplRIEEO6xbDkqx1Zyuz55QhdW9YCoOFF4ZQtHcJvh47R47a3iR75MtEjX+bFOWuZ/N7qwEmCBPZZ4yKrEYpIBHBAVd8RkUPANe6qfW5/3hXAbFU9JCKHRKSTqq4CRnmx+TXASyJST1UTRKQ8zgNckoFyqrpURL4Edrmx1FXVr4GvRaQPTkLcX6gHjNPf9NCTzzH6iv6kp6czbNRVNGwUw1OP/oemLVrRs08cw0eP5bYbxtOpVQyhYVV46X9vOb+Ufb8y+or+BEkQ1SIimPLK1MIOz6v4H3tqCkMv78fJk+mMGDOW6EaxPP7wgzRv0Yre/foz6srx/P3asbRpFk1YWBivvvEuAKFhYdx482307NIBEaFHz9707N3XL8fw7JQX6d+vF+np6Vw1djwxsbFMevB+WrZqTVz/AYwdfzXjx44hNroeYWFVePtdp3YVExvL4CFDadE0hpCQEJ57/iWfn/BJP6lMfGEFi54YTnBwENM+2MTWxH38e2xnNvyQwpLVO7j7lY95+R99uOWKtqjCtU8uLnjDASCAK4SI6hm1AgvesEgvnJFiTwJ/ATcClwMjgJ+BH4DdqvqgiLQCpgIKrAD6qmpjERmL83zSm91tLgaeUtWVItINeAIo4+7yPpxhvBfg1DTFLTtNROYC9d1lHwO3aT4H3rRFK136yerC+2X4WNlSxadvMS8Vyhbva/3Dej7q7xDOSuq6lzh5JKnQcpe3f1M1qpRdX1gPbzodRfZtU9XlOMNpe1qHk7Cyl10PNPNY9E93+ZvAmx7l4jxefwK0yWXXOdoyqjrI+8iNMUUjcOuExfvfrjGmWHDGI/R3FHmzRGiM8YkAHnzGEqExxjfEmsbGmJLOaoTGmBJNxBKhMcZY09gYY6xGaIwp8SwRGmNKOLGmsTGmZBOsRmiMMZYIjTHGmsbGmBLNeYqdv6PImyVCY4
xvWCI0xpR01jQ2xpR41jQ2xhhLhMaYki6Qm8ZF9syS4kxEfgN2F+EuqgL7inD7vlDcj8Hiz19NVT2/sDYmIstwYi7IPlXtXVj79ZYlQj8QkXX+eEBNYSrux2DxG09F9lxjY4wpLiwRGmNKPEuE/vGqvwMoBMX9GCx+k8n6CI0xJZ7VCI0xJZ4lQmNMiWeJ0BhT4lki9AMRKeXvGM6EiDO0ZsbP4kpE6otIQ3/HcSay/+6L+2cRKCwR+piINADGuq+D/RuN90RE9NSZtcZ+DeYMiaMs8G+gl7/jOV2en4GIdARQO9tZKCwR+l4HYACAqqb7ORavefwBXgnMFJEKxa02oo7jwH+BEcWtVujxGdwEvCwiF/k5pHOGJUIfEZFyAKo6DQgSkZv9HNJpE5FuwE1Af1X9AyhONdrGItJDRKqr6pfA58AF7rridBx9gXFAT1X9SUQaiIgNnnKWLBH6gNscvlVExrmLXgPK+TEkr3jW+Nw/tnJALeBKAFVNK0a1wn44NfG5ItIOqADcLiLBgVwzz+X3WxZYArQXkYfc1wtFxJsBDUweLBEWMRGJA54HdgA3iMj9QHvg6ox+nkCUrT+qMlBGVRfj1EZaisiN4DTXAi0Zun2BGSd2GopILPCyqt4KPAMMBkKBLkDvjPf4K968ZPsMrhCRzsBqnO6VocAaoAnwB9DGb4GeA6xKXYREpA1wLTBJVVeLyEqgB1APCAOGiMh64ESgdXp7/AHeDlwClBOR/6rqXBFR4FoRKauqzwZw7HE4iW8bEC4iz6jqLBFZjDMk1F9AT2BJoB0DZDmOO4HLgetV9WcR6a+qqe66fkBDYLP/Ij0HqKpNRTDhNL2mAgl5rB8ELAfC/R1rPsdwI/ApUB6YAaQDY911A4H3gFB/x+kRbw3gNfd1FeBjoKU7Pw54PWPe4z1f4Yy95/f48zimGOAz93U54FLgRnd+OLAeaOLvOIv7ZE3jIiAiDdQ5mfAUsFdEpnisKw2gqnOBNJwaYsBxm4pHcf7YrgcUpxn5qoiMUdUFwLWqesiPYWahqnuAF0WkpqoeAPbg9Gmiqm8A+4F/ZpQXkVY4NcOjvo82d7k00Q/i1Man4tRuxwH/EpFbgKXA5ar6nY/DPOdYIixkIlIfWC8iU1R1C/B3IFREngRQ1RMiEiwiFwLhOP/R/crtUgvymC+tjrdwviN9gLtU9UNgBfC4iFRU1YBLIKq6CfifiGwGNgFVRKS5W2whsM/jLOuvQDdVDYiRqrP1CfYSkdY4tfGxOMn6ZVW9EieZl1fVI27yN2fJRp8pRCIyABgFJAJjgHmqepOINAImAXtVdaJH+QpuzdGvPOMQkduAujg1pXtxhoN/GicBng9EA4+rarKfwvWKiLwP9MVpDpfBSeiX4iT0Bf6MrSAiMhGnT/AjnBbDtar6g7vuBpxLmIarqvULFhZ/t83PlQnnP/dKYKA7Hwb8ADzjzjcGWvg7zlziHgC87r4eDXyC0xf1E07CA5gIvAB8R4D3RwFBHq/fxfmn1A6nid/ZXS7+jjOf+BsAS93XTwFzcZJ4RaAm8AHQ2N9xnmuT1QgLiXtR7mvAS6q63l3WB5gFTFbV//gzvtyISDjOSZCbgWPA7cA7OIkjDqf/KdWjfGVVPeyPWE+HiASp6kn39SKglLoPBPJcFwiy3bqIiEQDd+Ek8DbAFap63G1trAL+VNU//RLsOcz6CM+SiNQWkfLqXJS7GXgn4y4S4Hec27n6udeABZoTOCdsHgCew+mHehToDgxQ1VQRecC99hHgiH/CPD2qejKjz1NV+wPHPfpoAzIJZlwQrarbgPNwulaGuEnwGuBunNquJcEiYNcRngUR6YVTC/xMRHYBD+JctrFaRFYAI3GanunuFFBU9XcR+QS4H/gP8CbwBfAwUFVEugB/A0a45YtN8yEjGbqJbyHQRURCVDXN37FBjiR4CzBARJKB+3D+ef4KzBeRz3Eunh6lAXJS51xkTeMz5F4sfTlOnw
1Af6A0cAfQFudkw3bgQpz+tUGqussPoeZLRGoC9YEXcU7o7MFpKitQGbhTVb/3X4RnT0S6Ar+qcxY/IGQkaRG5HLgBuA3nH9IB4A2cpvEQ4DCwXt2TJaZoWCI8AyJSBueWuV9UtY27rBVwBc4lMfercwdALM5Zy+vVuawjYLnxz8AZomomTrdJueLQJ1icuLdVHlfVDSLSApiMc8H0Q24/87M4/1BfVtVv/RlrSWJ9hKdJROrhnMHrDFwkIncDuCdI5uNcABvuFt8L9Av0JAiZ8Q/Gqb1er6p/WRIsEq1xhjFrDOwC4oFuItJFVdPVuR+6FHCN+w/X+IDVCE+DiPTH6T/bjdPs/QynX+1JVX3SLVNJVYvFSYXcuH+gf6rqTn/Hci7Jdib7PzjXOI7AuUwp49rNd1X1c7fMhar6i7/iLWnsZImXRKQ9Th/OZe70KvAnzlX/s8UZzumx4pwEAYp7f2Cg8kiCN+OMg5gGTMc5O/wSzn3dN4hIuqp+aUnQt6xG6CURiQKq41wo/TDOGeH/Ask4ZyUPqXMLmjG5EpG2OANVdMFp/vbHuXd4KM5Z4nHATFVN8VuQJZT1EXpJVfeq6lqcL/G7qpoATMMZHWSNqn4YiGPaGf/J+D54fC/+Ar5W1SScJvH/gJ04Vx6cDzxvSdA/rGl8+r4DrhfnSXSDgFvVvfG9OF1nZ4pWtjtGyuJ0o+wAmonIvar6CPCniKzDufD+pH1//MeaxqdJRCrhXGQ8AJiqqkv8HJIJYCJyPdAJWIvThRIEzMEZXXo3zj3QfTXAB7E411kiPEMZdylkv1fUmAwici3OybSJOHcgbcLpV96OMzxbEDBHbTxBv7Om8ZlLB2sOm1Oy3TYXjTNaTD+codkOAwnABGCKqk7yW6AmB0uEZ8gSoPGULQlm1PZew7nFsr+qdhaRCJzHB8SJyCYNgLEojcPOGhtTCDyS4PU4l8EsUNXdOPdrX+SeXGuF8yCpZy0JBharERpTSETkPJzHGtwHHHNHk74QiMIZ8LYSMEZVf/VflCY3drLEmEIkItfh3CWyB6f2twsnGS4EkiwJBiZLhMYUIhEpi/PQ9Z2qekBERgHX4FwiY4OqBihLhMYUAXeE7HE4AyqMsHu4A5v1ERpTNMoCJ4GhqrrV38GY/FmN0JgiYhfbFx+WCI0xJZ5dR2iMKfEsERpjSjxLhMaYEs8SoTGmxLNEaBCRdBH5RkS+F5FZIlLuLLbVVUQWu68HZDzlL4+yoe4ABae7jwdF5A5vl2cr86aIXHEa+6olInYN4DnOEqEB56l1zVW1MXAC54HjmcRx2t8VVV2oqo/nUyQUZ1w+Y/zKEqHJ7gugnlsT2i4ibwHfAzVEpKeIfCUiG9yaYwUAEektIttEZAPO4wtwl48VkRfd1xeKyDwR2eROHYHHgbpubXSyW+5OEVkrIt+6j73M2Na9IvKDiKwCGhZ0ECJyrbudTSIyJ1stt4eIrHO3F+eWDxaRyR77vv5sf5Gm+LBEaDKJSAjO6CkZIybXB15W1VjgKM6oKj1UtSWwDrjdvbf2NZwnsrUCquWx+eeBz1S1GdAS2AzcjXNPbnNVvVNEerr7bAs0B1qJSGcRaYUzpH1znOcBt/HicOaqaht3f1uBqz3W1XL30Q94xT2Gq4HDqtrG3f61IlLbi/2Yc4DdYmcAzhORb9zXXwCvAxHAblVd4y5vj/PEvi/dh7KVBr4CooEfVXUHgIi8A1yXyz66AVcCqGo6cFhEwrKV6elOG935CjiJsSIwT1WPuftY6MUxNRaRh3Ga3xWA5R7rZrrPGd4hIrvcY+gJNPXoP6zs7vsHL/ZlijlLhAbcPkLPBW6yO+q5CPhQVUdkK5flfWdJgMdU9b/Z9nHbGWzrTeByVf3/9u4YJWIgCuP4/7MRWYKdjc2isuAZbDzCNhaihWjjFuIF9CJiLx5AZAsbQVYtFmwtbQTTitg9ixkhLMoGu3W+XxPITDIhxWNmSN57krQHbDbaJn
+nijz2UUQ0AyaSun8Y22aMl8bW1j2wIWkNQFJHUo+Uc68raTX32/7l+htSnr7v/bhFUhnLqtFnCOw39h6XJXPN3EsAAAC/SURBVC0Bt0Bf0oKkirQMn6YCXnNm6J2Jti1Jc/mZV0jFlIbAIPdHUk9Sp8U49g94RmitRESdZ1YXkubz6ZOIeM7JSK8kfZCW1tUPtzgGziQdkApfDSJiJOkuf55ynfcJ14FRnpG+A7sRMZZ0SaoC90YqjTnNKfAA1PnYfKYX4JGUMfowIj4lnZP2DsdKg9dAv93bsVnnpAtmVjwvjc2seA6EZlY8B0IzK54DoZkVz4HQzIrnQGhmxXMgNLPifQEkp62Mjgwh+QAAAABJRU5ErkJggg==\n",
483 |       "text/plain": [
484 |        "<Figure size 432x288 with 2 Axes>"
485 |       ]
486 |      },
487 |      "metadata": {
488 |       "needs_background": "light"
489 |      },
490 |      "output_type": "display_data"
491 |     }
492 |    ],
493 |    "source": [
494 |     "fig, ax = plot_confusion_matrix(y_test, y_pred, encoder.classes_, normalize=True)\n",
495 |     "fig.savefig('confusion_matrix.png')"
496 |    ]
497 |   },
498 |   {
499 |    "cell_type": "code",
500 |    "execution_count": null,
501 |    "metadata": {},
502 |    "outputs": [],
503 |    "source": []
504 |   }
505 |  ],
506 |  "metadata": {
507 |   "kernelspec": {
508 |    "display_name": "Python 3",
509 |    "language": "python",
510 |    "name": "python3"
511 |   },
512 |   "language_info": {
513 |    "codemirror_mode": {
514 |     "name": "ipython",
515 |     "version": 3
516 |    },
517 |    "file_extension": ".py",
518 |    "mimetype": "text/x-python",
519 |    "name": "python",
520 |    "nbconvert_exporter": "python",
521 |    "pygments_lexer": "ipython3",
522 |    "version": "3.6.8"
523 |   }
524 |  },
525 |  "nbformat": 4,
526 |  "nbformat_minor": 2
527 | }
528 | 


--------------------------------------------------------------------------------
/notebooks/Check Emotion Labeled Dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Check Emotion Labeled Dataset\n",
  8 |     "\n",
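  9 |     "The main objective of this notebook is to show the output dataset from the sentiment analysis model: a preview of its rows, the count of each label, and the distribution of tweet text length and word count."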
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "import pandas as pd\n",
 20 |     "from pathlib import Path"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "dataset_path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 3,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "dataset = pd.read_csv(dataset_path)"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 4,
 44 |    "metadata": {},
 45 |    "outputs": [
 46 |     {
 47 |      "data": {
 48 |       "text/html": [
 49 |        "<div>\n",
 50 |        "<style scoped>\n",
 51 |        "    .dataframe tbody tr th:only-of-type {\n",
 52 |        "        vertical-align: middle;\n",
 53 |        "    }\n",
 54 |        "\n",
 55 |        "    .dataframe tbody tr th {\n",
 56 |        "        vertical-align: top;\n",
 57 |        "    }\n",
 58 |        "\n",
 59 |        "    .dataframe thead th {\n",
 60 |        "        text-align: right;\n",
 61 |        "    }\n",
 62 |        "</style>\n",
 63 |        "<table border=\"1\" class=\"dataframe\">\n",
 64 |        "  <thead>\n",
 65 |        "    <tr style=\"text-align: right;\">\n",
 66 |        "      <th></th>\n",
 67 |        "      <th>label</th>\n",
 68 |        "      <th>id</th>\n",
 69 |        "      <th>date</th>\n",
 70 |        "      <th>user</th>\n",
 71 |        "      <th>text</th>\n",
 72 |        "    </tr>\n",
 73 |        "  </thead>\n",
 74 |        "  <tbody>\n",
 75 |        "    <tr>\n",
 76 |        "      <th>0</th>\n",
 77 |        "      <td>fear</td>\n",
 78 |        "      <td>1148914607152619520</td>\n",
 79 |        "      <td>2019-07-10 11:19:22</td>\n",
 80 |        "      <td>cheri_shapley</td>\n",
 81 |        "      <td>Sometimes what you're most #afraid of doing is...</td>\n",
 82 |        "    </tr>\n",
 83 |        "    <tr>\n",
 84 |        "      <th>1</th>\n",
 85 |        "      <td>fear</td>\n",
 86 |        "      <td>1148837283812073473</td>\n",
 87 |        "      <td>2019-07-10 06:12:07</td>\n",
 88 |        "      <td>Dronearl_RSA</td>\n",
 89 |        "      <td>Delayed post \\n#Afraid \\n@TallRacksRec https:/...</td>\n",
 90 |        "    </tr>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>2</th>\n",
 93 |        "      <td>fear</td>\n",
 94 |        "      <td>1148719897788084224</td>\n",
 95 |        "      <td>2019-07-09 22:25:40</td>\n",
 96 |        "      <td>wavetossed</td>\n",
 97 |        "      <td>#EyesOn #SeeSomethingSaySomething #CIA #Clowns...</td>\n",
 98 |        "    </tr>\n",
 99 |        "    <tr>\n",
100 |        "      <th>3</th>\n",
101 |        "      <td>fear</td>\n",
102 |        "      <td>1148653069003034630</td>\n",
103 |        "      <td>2019-07-09 18:00:07</td>\n",
104 |        "      <td>Misspiggychop</td>\n",
105 |        "      <td>#HappyBirthdayStevenAvery\\n\\n#CorruptiwocCount...</td>\n",
106 |        "    </tr>\n",
107 |        "    <tr>\n",
108 |        "      <th>4</th>\n",
109 |        "      <td>fear</td>\n",
110 |        "      <td>1148593210756947968</td>\n",
111 |        "      <td>2019-07-09 14:02:15</td>\n",
112 |        "      <td>HorrorBitsVids</td>\n",
113 |        "      <td>\"Fight Fire With Fire\"\\n\\nWhat did you think o...</td>\n",
114 |        "    </tr>\n",
115 |        "  </tbody>\n",
116 |        "</table>\n",
117 |        "</div>"
118 |       ],
119 |       "text/plain": [
120 |        "  label                   id                 date            user  \\\n",
121 |        "0  fear  1148914607152619520  2019-07-10 11:19:22   cheri_shapley   \n",
122 |        "1  fear  1148837283812073473  2019-07-10 06:12:07    Dronearl_RSA   \n",
123 |        "2  fear  1148719897788084224  2019-07-09 22:25:40      wavetossed   \n",
124 |        "3  fear  1148653069003034630  2019-07-09 18:00:07   Misspiggychop   \n",
125 |        "4  fear  1148593210756947968  2019-07-09 14:02:15  HorrorBitsVids   \n",
126 |        "\n",
127 |        "                                                text  \n",
128 |        "0  Sometimes what you're most #afraid of doing is...  \n",
129 |        "1  Delayed post \\n#Afraid \\n@TallRacksRec https:/...  \n",
130 |        "2  #EyesOn #SeeSomethingSaySomething #CIA #Clowns...  \n",
131 |        "3  #HappyBirthdayStevenAvery\\n\\n#CorruptiwocCount...  \n",
132 |        "4  \"Fight Fire With Fire\"\\n\\nWhat did you think o...  "
133 |       ]
134 |      },
135 |      "execution_count": 4,
136 |      "metadata": {},
137 |      "output_type": "execute_result"
138 |     }
139 |    ],
140 |    "source": [
141 |     "dataset.head()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 5,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "import seaborn as sns"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "## Label count\n",
158 |     "\n",
159 |     "Check the count of each label"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 6,
165 |    "metadata": {},
166 |    "outputs": [
167 |     {
168 |      "data": {
169 |       "text/plain": [
170 |        "<matplotlib.axes._subplots.AxesSubplot at 0x10d4731d0>"
171 |       ]
172 |      },
173 |      "execution_count": 6,
174 |      "metadata": {},
175 |      "output_type": "execute_result"
176 |     },
177 |     {
178 |      "data": {
179 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEKCAYAAADaa8itAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFs9JREFUeJzt3Xu0XnV95/H3x0REsRKEU0YTbBjNaNG2S8ggSgcvOIDWGsaiwmiJyGrGEbW2s6o4dkkr0lGxpV6qHVoiF6lI8ULGopgVb62VSxDkKpIFImFxORJAkaIGv/PH8zvwkJ7gMfmd5+Ek79dazzp7f/dv7/3bO1nP5+zrSVUhSVIPjxp3ByRJ2w5DRZLUjaEiSerGUJEkdWOoSJK6MVQkSd0YKpKkbgwVSVI3hookqZv54+7AqO222261ePHicXdDkuaUSy655AdVNfGL2m13obJ48WLWrl077m5I0pyS5MaZtPP0lySpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpm+3uiXqN1vff/Rvj7sIjxlPedcW4uyDNOo9UJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSupm1UEmyMsntSa4cqp2Y5DtJLk/y2SQLhqa9I8m6JNcmOXiofkirrUty7FB9zyQXtvqnkuwwW9siSZqZ2TxSORU4ZJPaauBZVfWbwHeBdwAk2Qs4HHhmm+ejSeYlmQf8DfASYC/giNYW4H3ASVX1NOBO4OhZ3BZJ0gzMWqhU1deBDZvUvlRVG9voBcCiNrwMOKuqflJVNwDrgH3bZ11VXV9VPwXOApYlCfAi4Jw2/2nAobO1LZKkmRnnNZXXA19owwuBm4amrW+1zdV3Be4aCqipuiRpjMYSKkneCWwEzhzR+lYkWZtk7eTk5ChWKUnbpZGHSpLXAS8DXlNV1co3A3sMNVvUapur3wEsSDJ/k/q0qurkqlpaVUsnJia6bIck6d8baagkOQR4G/Dyqrp3aNIq4PAkj0myJ7AEuAi4GFjS7vTagcHF/FUtjL4CHNbmXw6cO6rtkCRNbzZvKf4k8E3g6UnWJzka+AjwK8DqJJcl+VuAqroKOBu4GvgicExV3d+umbwJOB+4Bji7tQV4O/DHSdYxuMZyymxtiyRpZmbtj3RV1RHTlDf7xV9VJwAnTFM/Dzhvmvr1DO4OkyQ9QvhEvSSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuZi1UkqxMcnuSK4dqT0yyOsl17ecurZ4kH0qyLsnlSfYemmd5a39dkuVD9X2SXNHm+VCSzNa2SJJmZjaPVE4FDtmkdiywpqqWAGvaOMBLgCXtswL4GAxCCDgOeA6wL3DcVBC1Nn8wNN+m65IkjdishUpVfR3YsEl5GXBaGz4NOHSofnoNXAAsSPIk4GBgdVVtqKo7gdXAIW3aE6rqgqoq4PShZUmSxmTU11R2r6pb2vCtwO5teCFw01C79a32cPX109QlSWM0tgv17QijRrGuJCuSrE2ydnJychSrlKTt0qhD5bZ26or28/ZWvxnYY6jdolZ7uPqiaerTqqqTq2ppVS2dmJjY6o2QJE1v1KGyCpi6g2s5cO5Q/ch2F9h+wN3tNNn5wEFJdmkX6A8Czm/Tfphkv3bX15FDy5Ikjcn82Vpwkk8CLwB2S7KewV1c7wXOTnI0cCPwqtb8POClwDrgXuAogKrakOR44OLW7t1VNXXx/40M7jB7LPCF9pEkjdGshUpVHbGZSQdO07aAYzaznJXAymnqa4FnbU0fJUl9+US9JKkbQ0WS1I2hIknqxlCRJHVjqEiSuj
FUJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSujFUJEndzNqr7yX1tf+H9x93Fx4xvvHmb4y7C9oMj1QkSd0YKpKkbgwVSVI3hookqRtDRZLUjaEiSerGUJEkdWOoSJK6MVQkSd2MJVSS/FGSq5JcmeSTSXZMsmeSC5OsS/KpJDu0to9p4+va9MVDy3lHq1+b5OBxbIsk6UEjD5UkC4G3AEur6lnAPOBw4H3ASVX1NOBO4Og2y9HAna1+UmtHkr3afM8EDgE+mmTeKLdFkvRQ4zr9NR94bJL5wOOAW4AXAee06acBh7bhZW2cNv3AJGn1s6rqJ1V1A7AO2HdE/ZckTWPkoVJVNwMfAL7PIEzuBi4B7qqqja3ZemBhG14I3NTm3dja7zpcn2aeh0iyIsnaJGsnJyf7bpAk6QHjOP21C4OjjD2BJwM7MTh9NWuq6uSqWlpVSycmJmZzVZK0XRvH6a8XAzdU1WRV/Qz4DLA/sKCdDgNYBNzchm8G9gBo03cG7hiuTzOPJGkMxvH3VL4P7JfkccC/AQcCa4GvAIcBZwHLgXNb+1Vt/Jtt+perqpKsAv4hyV8xOOJZAly0tZ3b509O39pFbDMuOfHIcXdB0hwz8lCpqguTnAN8C9gIXAqcDPwTcFaS97TaKW2WU4AzkqwDNjC444uquirJ2cDVbTnHVNX9I90YSdJDjOUvP1bVccBxm5SvZ5q7t6rqPuCVm1nOCcAJ3TsoSdoiPlEvSerGUJEkdWOoSJK6mVGoJFkzk5okafv2sBfqk+zI4DUqu7WHFtMmPYHNPL0uSdp+/aK7v/4H8FYGz4FcwoOh8kPgI7PYL0nSHPSwoVJVHwQ+mOTNVfXhEfVJkjRHzeg5lar6cJLnAYuH56kqHz+XJD1gRqGS5AzgqcBlwNRT6wUYKpKkB8z0ifqlwF5VVbPZGUnS3DbT51SuBP7DbHZEkjT3zfRIZTfg6iQXAT+ZKlbVy2elV5KkOWmmofJns9kJSdK2YaZ3f31ttjsiSaP0tQOeP+4uPGI8/+v9vuJnevfXjxjc7QWwA/Bo4MdV9YRuPZEkzXkzPVL5lanhJGHwN+b3m61OSZLmpl/6LcU18Dng4FnojyRpDpvp6a9XDI0+isFzK/fNSo8kSXPWTO/++t2h4Y3A9xicApMk6QEzvaZy1Gx3RJI09830j3QtSvLZJLe3z6eTLJrtzkmS5paZXqj/OLCKwd9VeTLw/1pNkqQHzDRUJqrq41W1sX1OBSZmsV+SpDlopqFyR5LXJpnXPq8F7tjSlSZZkOScJN9Jck2S5yZ5YpLVSa5rP3dpbZPkQ0nWJbk8yd5Dy1ne2l+XZPmW9keS1MdMQ+X1wKuAW4FbgMOA123Fej8IfLGqngH8FnANcCywpqqWAGvaOMBLgCXtswL4GECSJwLHAc8B9gWOmwoiSdJ4zDRU3g0sr6qJqvpVBiHz51uywiQ7AwcApwBU1U+r6i4Gtyif1pqdBhzahpcBp7eHLi8AFiR5EoOHL1dX1YaquhNYDRyyJX2SJPUx01D5zfbFDUBVbQCevYXr3BOYBD6e5NIkf59kJ2D3qrqltbkV2L0NLwRuGpp/fattri5JGpOZhsqjhk8ttVNPM31wclPzgb2Bj1XVs4Ef8+CpLmDwKhgefIHlVkuyIsnaJGsnJyd7LVaStImZhspfAt9McnyS44F/Bd6/hetcD6yvqgvb+DkMQua2dlqL9vP2Nv1mYI+h+Re12ubq/05VnVxVS6tq6cSEN61J0myZUahU1enAK4Db2ucVVXXGlqywqm4Fbkry9FY6ELiawXMwU3dwLQfObcOrgCPbXWD7AXe302TnAwcl2aUdRR3UapKkMZnxKayquprBl38PbwbOTLIDcD1wFIOAOzvJ0cCNDO42AzgPeCmwDri3taWqNrSjpotbu3e3az2SpDHZ0usiW6WqLmPwpuNNHThN2wKO2cxyVgIr+/ZOkrSlfum/pyJJ0u
YYKpKkbgwVSVI3hookqRtDRZLUjaEiSerGUJEkdWOoSJK6MVQkSd0YKpKkbgwVSVI3hookqRtDRZLUjaEiSerGUJEkdWOoSJK6MVQkSd0YKpKkbgwVSVI3hookqRtDRZLUjaEiSerGUJEkdTO2UEkyL8mlST7fxvdMcmGSdUk+lWSHVn9MG1/Xpi8eWsY7Wv3aJAePZ0skSVPGeaTyh8A1Q+PvA06qqqcBdwJHt/rRwJ2tflJrR5K9gMOBZwKHAB9NMm9EfZckTWMsoZJkEfA7wN+38QAvAs5pTU4DDm3Dy9o4bfqBrf0y4Kyq+klV3QCsA/YdzRZIkqYzriOVvwbeBvy8je8K3FVVG9v4emBhG14I3ATQpt/d2j9Qn2YeSdIYjDxUkrwMuL2qLhnhOlckWZtk7eTk5KhWK0nbnXEcqewPvDzJ94CzGJz2+iCwIMn81mYRcHMbvhnYA6BN3xm4Y7g+zTwPUVUnV9XSqlo6MTHRd2skSQ8YeahU1TuqalFVLWZwof3LVfUa4CvAYa3ZcuDcNryqjdOmf7mqqtUPb3eH7QksAS4a0WZIkqYx/xc3GZm3A2cleQ9wKXBKq58CnJFkHbCBQRBRVVclORu4GtgIHFNV94++25KkKWMNlar6KvDVNnw909y9VVX3Aa/czPwnACfMXg8lSb8Mn6iXJHVjqEiSujFUJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSujFUJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqxlCRJHVjqEiSujFUJEndGCqSpG4MFUlSN4aKJKkbQ0WS1I2hIknqZuShkmSPJF9JcnWSq5L8Yas/McnqJNe1n7u0epJ8KMm6JJcn2XtoWctb++uSLB/1tkiSHmocRyobgf9VVXsB+wHHJNkLOBZYU1VLgDVtHOAlwJL2WQF8DAYhBBwHPAfYFzhuKogkSeMx8lCpqluq6ltt+EfANcBCYBlwWmt2GnBoG14GnF4DFwALkjwJOBhYXVUbqupOYDVwyAg3RZK0ibFeU0myGHg2cCGwe1Xd0ibdCuzehhcCNw3Ntr7VNlefbj0rkqxNsnZycrJb/yVJDzW2UEnyeODTwFur6ofD06qqgOq1rqo6uaqWVtXSiYmJXouVJG1iLKGS5NEMAuXMqvpMK9/WTmvRft7e6jcDewzNvqjVNleXJI3JOO7+CnAKcE1V/dXQpFXA1B1cy4Fzh+pHtrvA9gPubqfJzgcOSrJLu0B/UKtJksZk/hjWuT/w+8AVSS5rtf8NvBc4O8nRwI3Aq9q084CXAuuAe4GjAKpqQ5LjgYtbu3dX1YbRbIIkaTojD5Wq+hcgm5l84DTtCzhmM8taCazs1ztJ0tbwiXpJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpG0NFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3cz5UElySJJrk6xLcuy4+yNJ27M5HSpJ5gF/A7wE2As4Isle4+2VJG2/5nSoAPsC66rq+qr6KXAWsGzMfZKk7dZcD5WFwE1D4+tbTZI0BvPH3YFRSLICWNFG70ly7Tj7M0O7AT8YZwfygeXjXH1PY9+XAByXcfegl7Hvz7xlm9mX8AjYn2RG+/PXZtJorofKzcAeQ+OLWu0hqupk4ORRdaqHJGuraum4+7EtcF/25f7sa1vbn3P99NfFwJIkeybZATgcWDXmPknSdmtOH6lU1cYkbwLOB+YBK6vqqjF3S5K2W3M6VACq6jzgvHH3YxbMqdN1j3Duy77cn31tU/szVTXuPkiSthFz/ZqKJOkRxFAZgyRvSXJNkjPH3ZdtSZLFSa4cdz+k6ST513H3YRTm/DWVOeqNwIurav2WLiDJ/Kra2LFP0qxJEgan238+7r6MS1U9b9x9GAWPVEYsyd
8C/xH4QpJ3JlmZ5KIklyZZ1tosTvLPSb7VPs9r9Re0+irg6jFuxqxKslOSf0ry7SRXJnl1knclubiNn9y+pEiyT2v3beCYoWW8LslnknwxyXVJ3j807aAk32z79h+TPL7V35vk6iSXJ/lAq72yrfPbSb4+4l0x65J8LsklSa5qDwmT5J4kJ7RtviDJ7q3+1DZ+RZL3JLlnaDl/0v59Lk/y5622uL3s9XTgSh76TNl2p+3XJDmx/Z+6Ismr27TTkxw61PbMqe+DOaeq/Iz4A3yPwVO0fwG8ttUWAN8FdgIeB+zY6kuAtW34BcCPgT3HvQ2zvH9+D/i7ofGdgScOjZ8B/G4bvhw4oA2fCFzZhl8HXN/m3RG4kcGX2m7A14GdWru3A+8CdgWu5cGbVxa0n1cAC4dr29Jnar8Cj2Xwxb8rUEP79/3An7bhzwNHtOE3APe04YMY3MEUBr+ofh44AFgM/BzYb9zb+Uj4APe0/9urGTwCsTvwfeBJwPOBz7V2OwM3APPH3ect+XikMl4HAccmuQz4KoMvv6cAjwb+LskVwD8yeAPzlIuq6oZRd3TErgD+a5L3JfkvVXU38MIkF7Z98iLgmUkWMPiinzqCOGOT5aypqrur6j4GR3a/BuzHYH9+o+335a1+N3AfcEqSVwD3tmV8Azg1yR8w+CLY1rylHeVdwCB0lwA/ZRAMAJcwCAeA5zL4/wjwD0PLOKh9LgW+BTyjLQfgxqq6YLY6Pwf9NvDJqrq/qm4Dvgb856r6GoMHuSeAI4BP1xw9ve01lfEK8HtV9ZB3kSX5M+A24LcY/OZ339DkH4+sd2NSVd9NsjfwUuA9SdYwOLW1tKpuavtnxxks6idDw/cz+P8eYHVVHbFp4yT7AgcChwFvAl5UVW9I8hzgd4BLkuxTVXdsxeY9YiR5AfBi4LlVdW+SrzLYrz+r9iszD+63h10U8H+q6v9usvzFbAf/Xzs6HXgtgzeDHDXmvmwxj1TG63zgzUPXB57d6jsDt9Tgoubvs23+hrxZSZ4M3FtVn2BwSmvvNukH7frHYQBVdRdwV5LfbtNfM4PFXwDsn+RpbV07JflPbbk71+Bh2j9iEOgkeWpVXVhV7wIm2bauC+wM3NkC5RkMjuIezgUMTt/A4ItvyvnA64euTS1M8qvde7tt+Gfg1UnmtaOSA4CL2rRTgbcCVNWcvWbqkcp4HQ/8NXB5kkcxOI/6MuCjwKeTHAl8ke3vt73fAE5M8nPgZ8D/BA5lcM7/VgbvfJtyFLAySQFf+kULrqrJJK8DPpnkMa38p8CPgHOT7MjgN+8/btNOTLKk1dYA397KbXsk+SLwhiTXMLie9ItOU70V+ESSd7Z57waoqi8l+XXgm+33o3sY/MZ9/2x1fI4q4LMMTiN+u42/rapuBaiq29q/xefG18Wt5xP1kmYkyeOAf6uqSnI4g4v2c/MOpRFLsivwrara7Ovj2/69Ati7XUeckzxSkTRT+wAfaadr7wJeP+b+zAntdO5XgQ88TJsXA6cAJ83lQAGPVCRJHXmhXpLUjaEiSerGUJEkdWOoSLNo+P1Ym5n+S79ZOcmpSQ7bup5Js8NQkSR1Y6hII5Dk8UnWtDcjX7HJG2jnt7fSXpPknPa8wtQbmL/W3iJ8fpInjan70owZKtJo3Af8t6raG3gh8JdTr+cBng58tKp+Hfgh8MYkjwY+DBxWVfsAK4ETxtBv6Zfiw4/SaAT4iyQHMHgd/EIGrz4HuKmqvtGGPwG8hcFrUJ4FrG7ZMw+4ZaQ9lraAoSKNxmuACWCfqvpZku/x4JuWN30CuRiE0FVV9dzRdVHaep7+kkZjZ+D2FigvZPA3XKY8JclUePx34F8YvOBxYqqe5NFJnjnSHktbwFCRRuNMYGn7I2NHAt8ZmnYtcEx7Q+0uwMeq6qcMXvH/vvZHtC4Dtou/ca65zXd/SZK68UhFktSNoSJJ6sZQkSR1Y6hIkroxVCRJ3RgqkqRuDBVJUjeGiiSpm/
8P4cnvL2MVrpUAAAAASUVORK5CYII=\n",
180 |       "text/plain": [
181 |        "<Figure size 432x288 with 1 Axes>"
182 |       ]
183 |      },
184 |      "metadata": {
185 |       "needs_background": "light"
186 |      },
187 |      "output_type": "display_data"
188 |     }
189 |    ],
190 |    "source": [
191 |     "sns.countplot(x='label', data=dataset)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "metadata": {},
197 |    "source": [
198 |     "## Text length\n",
199 |     "\n",
200 |     "Check the length of the tweets (number of characters per tweet)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 7,
206 |    "metadata": {},
207 |    "outputs": [
208 |     {
209 |      "data": {
210 |       "text/plain": [
211 |        "<matplotlib.axes._subplots.AxesSubplot at 0x10cbac3c8>"
212 |       ]
213 |      },
214 |      "execution_count": 7,
215 |      "metadata": {},
216 |      "output_type": "execute_result"
217 |     },
218 |     {
219 |      "data": {
220 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xt0nPV95/H3d2Y0I2lkWTffb5KxCRgSDBgIkFtDEqDNwm6WbIBeOAln2Z7CbnrbLuyeTVtOOV22PUmzbdKEllxOUkIoSVovS0ISSAoJ1GAwNxuMhe/GF1mWdfVImpnv/jHPyGNZl5E0N0mf1zk6nnnmeZ75PXrg+ej3+z2/32PujoiISKjcBRARkcqgQBAREUCBICIiAQWCiIgACgQREQkoEEREBFAgiIhIQIEgIiKAAkFERAKRchdgKlpaWry1tbXcxRARmTVefPHF4+6+KJ91Z1UgtLa2snXr1nIXQ0Rk1jCzffmuqyYjEREBFAgiIhJQIIiICKBAEBGRgAJBREQABYKIiAQUCCIiAigQCiad1qNIRWR2m1UD0yrVM7s6+PTXX2BJfTXnLK5j48qF3PnhdcQi4XIXTUQkbwqEaXhoy/4z3j/26jsAtNRF2XW0l6ff6mB1c5ybLl1ZjuKJiEyLmowKYP+JAVY11fKpy1Zz16+soyke5QfbDpa7WCIiU6JAmKGhZJp3Tp5idVMtAGbGxlUNPPt2J4e7T5W5dCIi+VMgzNChk6dIO6wJAgFg46oG3GHzy++UsWQiIlOjQJih/Z39AKzKCYSWuhgXr27gB9sOlatYIiJTpkCYoX0nBmipixGPndk//4mLV/DmkV52vNNTppKJiEyNAmEG3J39JwbOaC7K+rX3LCcSMv7pZdUSRGR2UCDMQGffEANDKVY3nx0ITfEoH3rXYv755UOkNGhNRGYBBcIM7DsxADByh9Fon7hkBUd7Btmyp7OUxRIRmRYFwgzsP9FPdVWIRQtiY37+wXMXEQkZz+w6XuKSiYhMXV6BYGbXmdlOM2s3s7vH+DxmZt8NPt9iZq05n90TLN9pZtfmLN9rZq+Z2ctmNisflLyvc4DVTbWEzMb8PB6LZMYktCsQRKTyTRoIZhYGvgRcD2wAbjGzDaNWux3ocvd1wBeA+4NtNwA3AxcA1wFfDvaX9SvuvtHdN834SErs1FCKY72DrG6KT7jeVetaeO1QN92nhktUMhGR6cmnhnA50O7uu919CHgYuHHUOjcC3wxePwpcY2YWLH/Y3QfdfQ/QHuxv1jvQlek/WDNGh3Kuq89pJu3wr7vVjyAilS2fQFgBHMh5fzBYNuY67p4EuoHmSbZ14Mdm9qKZ3TH1opfXke4EACsaaiZc7+LVjdRUhdVsJCIVr5yznb7P3Q+Z2WLgJ2b2prs/PXqlICzuAFi9enWpyziu3sQw0UiI6qqJp7iORkJc1tbEL99WDUFEKls+NYRDwKqc9yuDZWOuY2YRYCHQOdG27p799xjwA8ZpSnL3B9x9k7tvWrRoUR7FLY3ewSQLYvnl6dXnNNN+rI+jPYkil0pEZPryuaK9AKw3szYyF/ObgVtHrbMZuA14DrgJeMrd3cw2Aw+Z2eeB5cB64HkziwMhd+8NXn8MuLcgR1QifYkkddXj//pyn5nQk0gC8JdP7OTi1Y1nrHfrFZVT6xGR+W3SQHD3pJndBTwBhIGvuft2M7sX2Orum4EHgW+ZWTtwgkxoEKz3CLADSAJ3unvKzJYAP8j0OxMBHnL3HxXh+IqmdzDJknHGH4y2bGE1NVVh3u7oPysQREQqRV5tHu7+OPD4qGWfy3mdAD45zrb3AfeNWrYbuGiqha0kfYkk5yyqy2vdkBnnLIrzdkcf7o6NM25BRKScNFJ5GpKpNKeGUyyYoMlotHMW19F9apjO/qEilkxEZPoUCNPQN5jpE8i3UxkYqU283dFXlDKJiM
yUAmEaeoNO4ok6lUdrjkepr46w53h/sYolIjIjCoRpOF1DqMp7GzNj7aI69nT0467psEWk8igQpmE6NQSAtpY4vYNJOvoGi1EsEZEZUSBMQ+9gZqK6uin0IQCsbclMhKdmIxGpRAqEaehLJKmNhgmHpnb7aFPQj7C7Q4EgIpVHgTANvYnklG45zRrpRziufgQRqTwKhGnoG0xOqUM519qWOH2DSTp61Y8gIpVFgTANvYnhKXcoZ7UF/Qi71Y8gIhVGgTBF7h7UEKYXCE3xKAtrqhQIIlJxFAhT1DeYZDjl064hmBlrW+LsCeY1EhGpFAqEKcq2/U+nUzmrrSVOf/BMZhGRSqFAmKJsINRNs1MZYK3mNRKRCqRAmKLsKOOZ1BCa4lGa41HeOtpbqGKJiMyYAmGKRpqMptmpnHXe0gXs7uhnYChZiGKJiMyYAmGKOnoHCZtRHQ3PaD/vWlpPMu38sr2zQCUTEZkZBcIUdfQOUlcdITTDp561ttQSi4R46s2jBSqZiMjMKBCmqKNvcMqT2o0lEgqxbnEdT715TLefikhFUCBMUUfv4Iw6lHOdt7Seoz2DbH+npyD7ExGZCQXCFHX0FqaGAHDukjrM4Kk3jxVkfyIiM6FAmIJU2unsH5r2KOXRFlRX8Z6VDTypQBCRCqBAmIKugSFSaZ/xLae5rjlvMa8cOKnZT0Wk7BQIUzAySrl6+qOUR/vweYsB+JlqCSJSZgqEKSjUoLRcFyyvp60lzre37NPdRiJSVgqEKSjExHajmRn/8f1refVgN8++rUFqIlI+CoQpyM5jVKhO5axPXLKCRQtifOVf3i7ofkVEpkKBMAVd/UNEIyFikZlNWzFadVWYz1zdxjO7jvPawe6C7ltEJF8KhCnoSQxTX8AO5Vy//t7VLIhF+MrTqiWISHkoEKagJ5GkvqawzUVZ9dVV/MaVa/jha4fZq8drikgZ5BUIZnadme00s3Yzu3uMz2Nm9t3g8y1m1prz2T3B8p1mdu2o7cJmts3MHpvpgZRCz6lhFhSphgDw6atbiYRD/N0zu4v2HSIi45k0EMwsDHwJuB7YANxiZhtGrXY70OXu64AvAPcH224AbgYuAK4DvhzsL+uzwBszPYhS6U0kqS9wh3KuxQuq+bcbl/O9lw5ycmCoaN8jIjKWfK5ulwPt7r4bwMweBm4EduSscyPwJ8HrR4G/MTMLlj/s7oPAHjNrD/b3nJmtBH4NuA/4/QIcS9H1JIZZ0VBT0H0+tGX/Ge+X1FeTGE5z9/de4wPnLgLg1itWF/Q7RUTGkk+T0QrgQM77g8GyMddx9yTQDTRPsu1fAX8EpCf6cjO7w8y2mtnWjo6OPIpbPL1F7EPIWrawhrUtcZ7b3UkqrYFqIlI6ZelUNrOPA8fc/cXJ1nX3B9x9k7tvWrRoUQlKN77eRHH7ELKuOqeF7lPD7DisabFFpHTyCYRDwKqc9yuDZWOuY2YRYCHQOcG2VwM3mNle4GHgw2b27WmUv2SGkmkSw+mi9iFknbdsAU3xKM+2Hy/6d4mIZOUTCC8A682szcyiZDqJN49aZzNwW/D6JuApz0zMsxm4ObgLqQ1YDzzv7ve4+0p3bw3295S7/0YBjqdoehPDACWpIYTMuHJtM/tODHCwa6Do3yciAnkEQtAncBfwBJk7gh5x9+1mdq+Z3RCs9iDQHHQa/z5wd7DtduARMh3QPwLudPdU4Q+j+HoSSaCw8xhN5NI1jUQjIbbsPlGS7xMRyevq5u6PA4+PWva5nNcJ4JPjbHsfmTuJxtv3z4Gf51OOcsrWEOqrq0gMF//ZBdVVYTYsq+eNIz2k0k44ZEX/ThGZ3zRSOU89p0pbQwA4b+kCBoZSvLS/q2TfKSLzlwIhTyM1hJri9yFknbtkAWEzfvrG0ZJ9p4jMXwqEPPWMdCqXroZQXRWmrSXOT3coEESk+BQIeeoNOpVLWUOAzC2ob3f0s0cT3olIkSkQ8tSTSGIGddHS1RAAzl9aD8
CTajYSkSJTIOSp59QwdbEIoRLf7dMYj3Le0gXqRxCRolMg5Ckz02lpm4uyrjl/MS/s7aJ7YLgs3y8i84MCIU89ieGSdijn+sj5S0ilnZ+/daws3y8i84MCIU+9RXx85mQuWtlAS12Mn76hQBCR4lEg5KnnVLJsNYRQyHjfuma27O4kM0WUiEjhKRDy1Ds4XPJbTnNd2trEsd5BDpw4VbYyiMjcpkDIUzlrCACb1jQCsHWfJrsTkeJQIOTB3ekbLN9dRpCZxmJBdYSt+zSvkYgUhwIhDwNDKVJpL2sNIRwyLlndyNa9qiGISHEoEPLQU4aJ7cayaU0jbx3t03gEESkKBUIeekv8cJzxbGptAtB02CJSFAqEPPScOv1wnHLauKqBSMh4Qc1GIlIECoQ8VEoNoSYa5oLl9epYFpGiUCDkoVL6EAAuXdPEKwdOMpRMl7soIjLHKBDy0FMhNQSAy1obGUymef2d7nIXRUTmGAVCHiqlDwHg0tbMALUX96rZSEQKq/x/8s4CvYkk0XCIWKQ8+fnQlv1nvG+KR/nBtkPEY2eevluvWF3KYonIHKMaQh56g6mvzUr7cJzxrGmqZV9nvya6E5GCUiDkoSeRrIgO5azWljj9QymO9w2VuygiMocoEPLQW8aH44xlTXMtAPs6+8tcEhGZSxQIeeg5Vb6H44xlUV2M2miYvQoEESkgBUIeehPlnfp6NDOjtTnO3s6BchdFROYQBUIeesr4+MzxtDbXcqJ/aGTQnIjITCkQ8lBpNQSANc1xAPapliAiBZJXIJjZdWa208zazezuMT6Pmdl3g8+3mFlrzmf3BMt3mtm1wbJqM3vezF4xs+1m9qeFOqBCS6bSDAylKuouI4DlDTVUhY29x9WPICKFMWkgmFkY+BJwPbABuMXMNoxa7Xagy93XAV8A7g+23QDcDFwAXAd8OdjfIPBhd78I2AhcZ2bvLcwhFValTGw3WjhkrGqqVceyiBRMPjWEy4F2d9/t7kPAw8CNo9a5Efhm8PpR4BrLjOK6EXjY3QfdfQ/QDlzuGX3B+lXBT0WOsjodCJVVQwBobY5zpDtBYjhV7qKIyByQTyCsAA7kvD8YLBtzHXdPAt1A80TbmlnYzF4GjgE/cfct0zmAYhuZ6bTCagiQCQQH9p9QP4KIzFzZOpXdPeXuG4GVwOVmduFY65nZHWa21cy2dnR0lLaQnA6ESqwhrGqqIWSo2UhECiKfQDgErMp5vzJYNuY6ZhYBFgKd+Wzr7ieBn5HpYziLuz/g7pvcfdOiRYvyKG5h9ZzKNBnV11ReDSEWCbNsYQ17j6uGICIzl08gvACsN7M2M4uS6STePGqdzcBtweubgKc8M/PaZuDm4C6kNmA98LyZLTKzBgAzqwE+Crw588MpvN5E5Ux9PZZzFtWx/0Q/A0PJchdFRGa5Sf/sdfekmd0FPAGEga+5+3YzuxfY6u6bgQeBb5lZO3CCTGgQrPcIsANIAne6e8rMlgHfDO44CgGPuPtjxTjAmco+HKdSA+HCFfU8vauDNw73lLsoIjLL5dUO4u6PA4+PWva5nNcJ4JPjbHsfcN+oZa8CF0+1sOWQrSHUVWCnMsCKhhoaa6t47ZCeoCYiM6ORypPoTSSpi0UIhyrjWQijmRkXrlhI+7E+Tg5oOmwRmT4FwiR6E8PUxSqzdpD17hULSTv8eMfRchdFRGYxBcIkKnEeo9GyzUaPv3a4YPv80etH6BtUR7XIfKJAmETfYOUHQrbZ6Be7jhek2aj9WB+//e0X+fyP3ypA6URktlAgTKInkaSuQu8wyvXuFQtJpr0gzUYv7e8C4JGtB0Y61UVk7lMgTKIvMcyCCu9DgEyz0crGmoI0G7184CRVYaNvMMkjWw8WoHQiMhsoECYxG/oQINNs9PH3LOeZXcc53H1qRvvatv8k713bzKY1jXzj2T2k0hU576CIFJgCYRJ9g8mKv8so69evWI27863n9k17HwNDSXYe6eHiVQ185n
1tHDhxip/o7iWReUGBMIHsw3EqcWK7saxqquUj5y/hO8/vn/aU2K8e7CbtsHF1Ax/bsIQVDTV87Zd7prSPxHCKft2hJDLrKBAm0D+YuahW6ijlsXzmfW10DQzzT9tGzz+Yn5cPnATgopUNRMIhPn11K8/vOcFrB/MbCf3awW4+9Bc/55NfeY60mppEZhUFwgROT309ewLhirYmzl9Wz9d/uZfM/IJT8/L+k6xprqW5LgbAf7hsFZGQ8cPXJ++sfuzVd/jkV5+lbzDJjsM9PJ7HNiJSORQIE8gOzJoNdxllmRmfvrqVnUd7efbtzilvv+1AFxtXNYy8r6+uYu2iOG8d7Z1wuwd/sYe7HtrGBcsX8tQffJB1i+v44k93qUNaZBaZPVe6MhgJhFnSh5B1w0XLuf+Hb/L1X+7h6nUteW93uPsUR3sGuXhVAw9t2T+yvLoqzIv7us5YBnDrFasBcHf+9udvc9U5zXz905cRi4T57DXr+c/f2cbjrx3m31y0vDAHJiJFpRrCBCp9ptPxVFeFufWK1Tz55jEOTOHxmtv2Z/oPNq5uPGP5kvpqugaGGRyno/pg1ymO9w1y/buXEYuEAfjVdy9j/eI6vvikagkis4UCYQK9iWwNYXYFAsDNl2f+ev/HrQcmWfO0lw+cJBoOcf6yBWcsX1pfDcDR3sExt9sWdERfnNPUFA4Zn/3IetqP9fH/CjjHkogUz+y70pXQSCDMkj6E0U066xfX8Y1n97JoQfXI9N3ZZp6xvLz/JBesqB/5Kz9rSTYQuhOsbqo96/v+76vvUBU2tu0/yas5dyOl3Vm8IMbfP7ObG9RsJFLxVEOYQLYPYbY1GWVtWtNETyLJrmMTdwgDDKfSvHro5BkdylkNtVVEwyGO9CbG3PbAiQFWNNSe9cyIkBnvWrqAN4/06hZUkVlAgTCB3sQw4ZBRUxWefOUKdP6yeuKxCC/s7Zp03R3v9JAYTnPxqP4DyFzYl9THONp9diAMp9IcPnlmzSFXSzzGUDLNOzOcTkNEik+BMIG+4GlpZpX5tLTJhEPGpasb2HmkZ2RMxXie2dUBwFXnNI/5+ZL6ao70JM4a23D45ClS7qxuqhlzu+YFUQD2HO+favFFpMQUCBOYLRPbTWRTaxNph5f2TVxLeHrXcS5YXk9LMCBttCX11QwMpc56aM7+4C6mlRPUEECBIDIbKBAm0DuLJrYbT0tdjLaWOC/sPUF6nJHLvYlhXtrXxQfOXTTufpYuDDqWe86802h/1ykaaquoH2esxoLqCLXRsAJBZBZQIEygNzE87oVuNrm8tYmugeFxRxs/93YnybTzgfXjB8LInUY9Z/YjHDgxMG7/AWRGTre1xBUIIrPA7P7zt8BG37a5/8QAC2JVZy2fbS5csZAfbT/CL3YdH/PzZ3YdpzYa5tI1Z3coZ9XFIsRjEY7kBEL3qWG6Tw2zqnH8QABobYmz/VB+k+OJSPmohjCBxHCa6qrZ/ysKh4yrzmlm9/F+Xh/jwvz0rg6uXNtMNDLxsS6tj51RQ8iOgp6ohgCwtiXOga5TDCXT0yi9iJTK7L/aFdHgcIrYLL3ldLTLWpuIRUL83TO7z1i+r7OffZ0DE/YfZC2pr+ZoT2KkL+LAiQHCIWNZ0L8wnraWOKm0c6Ar/2k0RKT0FAgTGEymqY7MjUCorgqzaU0jj716mHdOnh4T8HTQjJRPICytr2Y45XT1D5EYTrHrWB8rGmqIhCf+z6i1JQ7AXvUjiFQ0BcI4kqk0ybTPiSajrKuCmU+/8ezekWVPv9XBysYaWpsnbvaB0x3L//JWB5//yVsc7UlwyRgD2UZbGwSCOpZFKps6lceRCNq750qTEUBjbZTrL1zKQ1v2UxsNc97Sep57u5MbNi7Pa/Dd4vrMmIKt+7pY2VjDb125hpWTdCgDNNRGaaitUiCIVDgFwjiyUz1XT9LROtv87kfOZdfRPr745C6ywxI+mEdzEUAsEub6C5dSGw1z8epGQlMYwa
1bT0UqnwJhHCM1hDnSh5C1bnEdT/zeBxgYSvLW0T6OdCf46PlL8t7+/ROMVZhIW0uc56bxBDcRKZ28/vw1s+vMbKeZtZvZ3WN8HjOz7wafbzGz1pzP7gmW7zSza4Nlq8zsZ2a2w8y2m9lnC3VAhZLI1hDmUB9CrtpohI2rGrjuwqWEQsWfq6mtOc7h7gSnhsZ+yI6IlN+kNQQzCwNfAj4KHAReMLPN7r4jZ7XbgS53X2dmNwP3A58ysw3AzcAFwHLgp2Z2LpAE/sDdXzKzBcCLZvaTUfssq8HhudeHAGcPviuVtkXBnUad/Zy/rL4sZRCRieXz5+/lQLu773b3IeBh4MZR69wIfDN4/ShwjWV6KW8EHnb3QXffA7QDl7v7YXd/CcDde4E3gBUzP5zCSSTnZh9CubTp1lORipfP1W4FkPscxoOcffEeWcfdk0A30JzPtkHz0sXAlvyLXXzZTuW5VkMol9bmTCDsViCIVKyy/vlrZnXA94Dfdfeecda5w8y2mtnWjo6OkpUt26msGkJhxGMRltTHdKeRSAXL52p3CFiV835lsGzMdcwsAiwEOifa1syqyITBP7j798f7cnd/wN03ufumRYumd4fLdAwOp4iEbNJRuJK/1ua4moxEKlg+V7sXgPVm1mZmUTKdxJtHrbMZuC14fRPwlGcerbUZuDm4C6kNWA88H/QvPAi84e6fL8SBFFoimVZzUYG1tcTZ26lAEKlUk95l5O5JM7sLeAIIA19z9+1mdi+w1d03k7m4f8vM2oETZEKDYL1HgB1k7iy6091TZvY+4DeB18zs5eCr/ru7P17oA5yuxHBKzUUFtri+ms7+IVJpJ1yCW11FZGryGpgWXKgfH7XsczmvE8Anx9n2PuC+Uct+AVT0FWFwOE21aggF1VIXxR26BobGfVSniJSP/gQeRyKZIqYaQkE1xaMAnOgfKnNJRGQsuuKNY3BYfQiF1hzP1AqO9w1OsqaIlIMCYRyJpPoQCq2lLlND6OxTDUGkEumKNw7VEAqvOeg36FQNQaQiKRDG4O4MJlNzdmK7cmmoqSJk6kMQqVS64o1hOOWknTnz+MxKEQoZTfEoxxUIIhVJgTCGxMg8Rvr1FFpzPKYmI5EKpSveGE7PdKoaQqE1xaPqVBapUAqEMWSfhaA+hMJrrouqD0GkQumKN4ZsDWGuPT6zErTUxTQOQaRCKRDGcLqGoEAotOZ4lJ5EkqFgenERqRx5zWU036hTubByH9vZ3tEHwIO/2MPCmqqR5bdesbrk5RKRM+mKN4bBkYfjqIZQaHWxzN8g/YPJMpdEREZTIIxBNYTiUSCIVC5d8cYwmEwTDYcIWUXP0D0rxaOZQOhTIIhUHAXCGBLDmraiWOLZGsJQqswlEZHRdNUbQ2I4pYntiqS6KkTYTE1GIhVIgTCG/qEU8agCoRjMjHgsrCYjkQqkQBhD32BypGlDCi8ei6iGIFKBFAhj6FcgFFWdAkGkIikQRkm7c2ooNXI3jBRePBZRk5FIBVIgjDIwlMKBupj6EIolHg3TP6i7jEQqjQJhlGxThpqMiqcuFmEoldZ8RiIVRoEwigKh+E6PRVCzkUglUSCM0qdAKDpNXyFSmRQIo2RH0GocQvFkw1YdyyKVRYEwSvav1lrdZVQ0cdUQRCqSAmGU/sEktdEw4ZAmtiuW001GutNIpJIoEEbpH0xqDEKRRSMhqsKmJiORCqNAGKV/KEVcYxCKTtNXiFSevALBzK4zs51m1m5md4/xeczMvht8vsXMWnM+uydYvtPMrs1Z/jUzO2ZmrxfiQApF8xiVRp1GK4tUnEkDwczCwJeA64ENwC1mtmHUarcDXe6+DvgCcH+w7QbgZuAC4Drgy8H+AL4RLKsomseoNOLRiMYhiFSYfGoIlwPt7r7b3YeAh4EbR61zI/DN4PWjwDVmZsHyh9190N33AO3B/nD3p4ETBTiGgtE8RqWTaTJSp7JIJcknEFYAB3LeHwyWjbmOuyeBbq
A5z20nZGZ3mNlWM9va0dExlU2nLDuPkfoQiq8ueCaCu5e7KCISqPhOZXd/wN03ufumRYsWFfW7sp2cdWoyKrr6mipSaVc/gkgFyScQDgGrct6vDJaNuY6ZRYCFQGee21YMzWNUOk21UQC6BobLXBIRyconEF4A1ptZm5lFyXQSbx61zmbgtuD1TcBTnmkL2AzcHNyF1AasB54vTNEL7/S0FQqEYmuMB4HQP1TmkohI1qSBEPQJ3AU8AbwBPOLu283sXjO7IVjtQaDZzNqB3wfuDrbdDjwC7AB+BNzp7ikAM/sO8BzwLjM7aGa3F/bQpu70xHbqQyi2xpEaggJBpFLk9aewuz8OPD5q2edyXieAT46z7X3AfWMsv2VKJS0BzWNUOtFIiHgswgnVEEQqRsV3KpdS/2CSmirNY1QqTbVVqiGIVBAFQo7+waTuMCqhhtqoOpVFKogCIYfmMSqtpniU7oFh0hqLIFIRFAg5NG1FaTXWRkm503NKtQSRSqBAyNGnqa9LqjFeBcAJ9SOIVAQFQiCVDuYxUg2hZEYGp/WrhiBSCRQIgZMDQ5rHqMQW1lZhaCyCSKVQIAQ6g/vhVUMonUgoRH1NlUYri1QIBUKgsy9zUdJtp6XVqLEIIhVDgRDIjphVp3JpNWosgkjFUCAEOvsHAfUhlFpjPErPqWEGk3pYjki5KRAC2SYjzWNUWk21URx452Si3EURmfcUCIET/UOax6gMstNgH+waKHNJRESBEOjsH9QdRmXQWJsZnHbgxKkyl0REFAiBQycT1FcrEEqtvqaKsBkHVEMQKTsFAjCUTPPG4R6WN9SUuyjzTsiMhbVVHDihQBApNwUC8NbRXoaSaVY2KhDKoak2yoEuNRmJlJsCAXjl4EkAVjbWlrkk81NjvIqDqiGIlJ0CAXj1QDeNtVUjHZxSWo21UTr7h+jWNNgiZaVAIFNDeM/KBsx0y2k5rF1UB8ATrx8pc0lE5rd5HwgDQ0neOtrLRSsXlrso89aqxhrWtsR59KWD5S6KyLw27wNh+zs9pB3es7Kh3EWZt8yMT1yyguf3nNDdRiJlNO8D4ZUDmQ7l96xSDaGc/t0lKwH4/kuHylwSkflLgXCwm+ULq1m8oLrcRZnXVjTUcOXaZr6/7SBwRarkAAAJj0lEQVTuXu7iiMxL8z4QXg06lKX8/v2lK9nXOcCL+7rKXRSReWleB8LJgSH2dQ6ouahCXHfhUmqqwnxPncsiZTGvA+GVg90AbFQNoSLUxSJcf+FSHnvlMCf1FDWRkpvXgfBq0KF8oW45rRi/dVUrg8k0n/rqv3K0R89IECmleR0IL+3vYu2iOPXVGqFcKTauauDrn76Mg10D3PSVZ9l7vL/cRRKZN/IKBDO7zsx2mlm7md09xucxM/tu8PkWM2vN+eyeYPlOM7s2330W2zef3cvPdnbw0fOXlPqrZQwPbdk/8rOvc4Dbrmqls2+IX/vrX/A7336Rv396Nw9t2V/uYorMaZM+AMDMwsCXgI8CB4EXzGyzu+/IWe12oMvd15nZzcD9wKfMbANwM3ABsBz4qZmdG2wz2T6L5tEXD/LHm7fz0Q1L+MNr31WKr5QpWtlYyx3vX8v3tx3i8deP8OMdR7lgeT0A5y1bwHlLF+hxpyIFls//UZcD7e6+G8DMHgZuBHIv3jcCfxK8fhT4G8tMDHQj8LC7DwJ7zKw92B957LOguk8Nc6jrFFv3neBPNm/n6nXN/PUtF1MVntetZhVtcX01v/3BczjSneD5vZ28fOAk//0Hr418XhsNs6A6woLqKmKRENFIiGg4RHVVmOqqEDVVYWpjERZUR6ivriIeDROPRaiLRYhGQmSHO6TdSbuTSoMZVIVP7ysaMarCIcIhI2SGGRiGk9nYPbN97r/ZURQGhEOZbcIhIxIywqFQ8G/mvY3sM7Nddh/Z7QFCwbqhbBlgpBxpz5TEs/+mz/49hsOZ7bPfO96cXe5OMu2kgp9k2jP79dNlyy5PpZ
yUO6l0mrRzxu9yZJ20A5ntzSAWCY/8XqsiIaqC8njwexxz/ElwnGT2BH7m72nkdz5q0+whZn5nmeduhCz4vYeNqlDmnIaDzyf6naT99L+ZY0uTTDlDqTRDyTRDqTTpdOb3kU5DNGJEw2FiVZljjVWFiEVmx+N58wmEFcCBnPcHgSvGW8fdk2bWDTQHy/911LYrgteT7bMg0mnnkj/7CScHTs+kecnqBh74zU1UV4WL8ZVSYEsXVnPDRSv4+HuWc3JgmCPdCY71JhgYSpEYzvwMp5zBZJJkKk0y7Qyn0sGyNIPDKZJpDXbLCgUBlXuhTaUzF7z5KjcPij0ucqxcsCDoR/7oCIIwW66WuhhP/9GvFLdg5BcIZWVmdwB3BG/7zGznTPe5D6i786zFLcDxme57lphPxwrz63h1rHOU/bdpH++afFfMJxAOAaty3q8Mlo21zkEziwALgc5Jtp1snwC4+wPAA3mUc0bMbKu7byr291SC+XSsML+OV8c6d5XiePNpQH8BWG9mbWYWJdNJvHnUOpuB24LXNwFPeaZBcDNwc3AXUhuwHng+z32KiEgJTVpDCPoE7gKeAMLA19x9u5ndC2x1983Ag8C3gk7jE2Qu8ATrPUKmszgJ3OnuKYCx9ln4wxMRkXyZZpbMMLM7guapOW8+HSvMr+PVsc5dpTheBYKIiADzfOoKERE5bd4HQrmn0Cg0M1tlZj8zsx1mtt3MPhssbzKzn5jZruDfxmC5mdn/CY7/VTO7pLxHMD1mFjazbWb2WPC+LZhGpT2YViUaLB93mpXZwMwazOxRM3vTzN4wsyvn8rk1s98L/jt+3cy+Y2bVc+XcmtnXzOyYmb2es2zK59LMbgvW32Vmt431Xfma14Fgp6fluB7YANximek2ZrMk8AfuvgF4L3BncEx3A0+6+3rgyeA9ZI59ffBzB/C3pS9yQXwWeCPn/f3AF9x9HdBFZnoVyJlmBfhCsN5s8kXgR+5+HnARmWOek+fWzFYA/wXY5O4XkrkBJTs1zlw4t98Arhu1bErn0syagD8mM7D3cuCPsyEyLZmh6fPzB7gSeCLn/T3APeUuV4GP8Z/JzBm1E1gWLFsG7AxefxW4JWf9kfVmyw+ZcSxPAh8GHiMz68NxIDL6PJO5s+3K4HUkWM/KfQx5HudCYM/o8s7Vc8vpGRCagnP1GHDtXDq3QCvw+nTPJXAL8NWc5WesN9WfeV1DYOxpOVaMs+6sE1SZLwa2AEvc/XDw0REgO83rXPgd/BXwR0B2Jp9m4KS7J4P3ucd0xjQrQHaaldmgDegAvh40j/29mcWZo+fW3Q8BfwnsBw6TOVcvMjfPbdZUz2VBz/F8D4Q5y8zqgO8Bv+vuPbmfeeZPiTlxe5mZfRw45u4vlrssJRABLgH+1t0vBvo53aQAzLlz20hm0ss2MrMlxzm7iWXOKse5nO+BkM+0HLOOmVWRCYN/cPfvB4uPmtmy4PNlwLFg+Wz/HVwN3GBme4GHyTQbfRFosMw0KnDmMY0cr505zcpscBA46O5bgvePkgmIuXpuPwLscfcOdx8Gvk/mfM/Fc5s11XNZ0HM83wNhzk2hYWZGZuT4G+7++ZyPcqcXuY1M30J2+W8FdzG8F+jOqbJWPHe/x91XunsrmfP3lLv/OvAzMtOowNnHO9Y0KxXP3Y8AB8ws+xCPa8jMAjAnzy2ZpqL3mllt8N919njn3LnNMdVz+QTwMTNrDGpUHwuWTU+5O1XK/QP8KvAW8DbwP8pdngIcz/vIVDNfBV4Ofn6VTFvqk8Au4KdAU7C+kbnT6m3gNTJ3dJT9OKZ57B8CHgteryUzb1Y78I9ALFheHbxvDz5fW+5yT/EYNwJbg/P7T0DjXD63wJ8CbwKvA98CYnPl3ALfIdM3Mkym9nf7dM4l8JngmNuBT8+kTBqpLCIigJqMREQkoEAQERFAgSAiIgEFgoiIAAoEEREJKBBEJhDMLvo709y21cxuLXSZRIpFgSAysQZgWo
FAZuIyBYLMGgoEkYn9L+AcM3vZzP7CzP6rmb0QzEn/pwBmdlnwvtrM4sH8/RcG274/2Pb3ynoUInnQwDSRCQQzxj7m7hea2cfITInwn8iMHN0M/G93f9rM/ozMSNkaMvMN/bmZfQj4Q3f/eFkKLzJFkclXEZHAx4KfbcH7OjIPLHkauJfM3FgJMg91EZl1FAgi+TPgz939q2N81kwmIKrI1BT6S1kwkUJQH4LIxHqBBcHrJ4DPBM+awMxWmNni4LOvAv8T+AdOP7oxd1uRiqcagsgE3L3TzH4ZPAj9h8BDwHOZ2ZjpA37DzK4Dht39oeA53c+a2YeBZ4CUmb0CfMPdv1CmwxDJizqVRUQEUJORiIgEFAgiIgIoEEREJKBAEBERQIEgIiIBBYKIiAAKBBERCSgQREQEgP8PyfQIh8tMYAgAAAAASUVORK5CYII=\n",
221 |       "text/plain": [
222 |        "<Figure size 432x288 with 1 Axes>"
223 |       ]
224 |      },
225 |      "metadata": {
226 |       "needs_background": "light"
227 |      },
228 |      "output_type": "display_data"
229 |     }
230 |    ],
231 |    "source": [
232 |     "sns.distplot(dataset.text.apply(lambda text: len(text)), bins=30)"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "markdown",
237 |    "metadata": {},
238 |    "source": [
239 |     "## Word count\n",
240 |     "\n",
241 |     "Check the word count"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 8,
247 |    "metadata": {},
248 |    "outputs": [
249 |     {
250 |      "data": {
251 |       "text/plain": [
252 |        "<matplotlib.axes._subplots.AxesSubplot at 0x10cd5bef0>"
253 |       ]
254 |      },
255 |      "execution_count": 8,
256 |      "metadata": {},
257 |      "output_type": "execute_result"
258 |     },
259 |     {
260 |      "data": {
261 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xt8XVWd9/HP7+SetLk0TUubXukFSbmV1gIiILdaHKSOwgg4wji8BnmAGRX1GXyeB0YZZ4QZZ3BUxhGFEVEEBJUKlTpcFBAoTUtbWkpp6D1tadomaZvbyUl+zx9nB0LI5SQ5Obd836/XeWWfvdfe57ez2/PLWmuvtc3dERERCSU7ABERSQ1KCCIiAighiIhIQAlBREQAJQQREQkoIYiICKCEICIiASUEEREBlBBERCSQnewABmP8+PE+Y8aMZIchIpJWVq9efcDdKwYql1YJYcaMGVRXVyc7DBGRtGJmO2IppyYjEREBlBBERCSghCAiIoASgoiIBJQQREQEUEIQEZGAEoKIiABKCAmlx5WKSCpTQkiQF2sOsOifn+auZ2uSHYqISK+UEBLg/pe289l7X+FwSzv/9vvNVG8/lOyQRETeJ6aEYGZLzGyzmdWY2c29bM8zs4eC7SvNbEawfpGZrQ1e68zsz7vts93MXgu2ZeR8FO7OPzy2gVse28hH5lbwx6+ey5SyQr740FoOt7YnOzwRkfcYMCGYWRZwF3ARUAVcYWZVPYpdA9S7+2zgTuCOYP0GYKG7nwIsAX5oZt3nTzrX3U9x94XDPI+U9FbdUe57aQefOW0ad1+1kGNK8rnz06ewt7GVW3+zIdnhiYi8Ryw1hEVAjbtvdfcw8CCwtEeZpcB9wfIjwPlmZu7e7O6RYH0+MKp6VVfvqAfgc2fOJCtkACyYXsbfnTeH36zdw2/X7UlmeCIi7xFLQqgEdnV7vztY12uZIAE0AuUAZnaamW0EXgOu65YgHPi9ma02s2uHfgqpa82OBkoKcjh2fNF71t9w7ixOqCzm9t+9QTjSmaToRETea8Q7ld19pbvPAz4IfM3M8oNNH3b3U4k2Rd1gZmf3tr+ZXWtm1WZWXVdXN9LhxtWanfXMn1ZKKKgddMnOCvGVxcdR29DCw9W7+thbRCSxYkkItcDUbu+nBOt6LRP0EZQAB7sXcPdNwFHghOB9bfBzP/Brok1T7+Pud7v7QndfWFEx4PMdUkZjSztb9h/l1GllvW4/Z24FC6eX8b1nttDa3pHg6ERE3i+WhLAKmGNmM80sF7gcWNajzDLg6mD5UuAZd/dgn2wAM5sOfADYbmZFZjY2WF8ELCbaAZ0x1u1qAOgzIZgZX158HG8fbuPnK3cmMjQRkV4NmBCCNv8bgRXAJuBhd99oZreZ2SVBsXuAcjOrAW4Cum5N/TCwzszWEq0FXO/uB4CJwAtmtg54BXjC3Z+M54kl25qd9ZjByVNL+ixzxqxyPjSrnB/8oYamtkif5UREEiGmR2i6+3JgeY91t3ZbbgUu62W/+4H7e1m/FTh5sMGmkzU7G5g7YSxj83P6LfflxXP51A9e4icvbueGc2cnKDoRkffTSOUR0NnpvLqznlOnlw5YdsH0cVxw/AT+89ka9h9uTUB0IiK9U0IYAVsPHOVIa4T5ffQf9PT//qyK9g7n9iffGOHIRET6poQwAtbs6L9DuacZ44u45qyZ/GpNLWt21o9kaCIifVJCGAFrdtb3OiCtPzeeO5uJxXl8fdlGOjtH1YBuEUkRSggjoK8Baf0pysvmaxcdz/rdjfxytQariUjiKSHE2eHW/gek9WfpKZP54Iwy/nn5G7ytDmYRSTAlhDh7a/9R3KFqUvGg9zUz7vjUSbRFOvj7R9frCWsiklBKCHG2pyH6l31lWcGQ9j+2Ygw3L/kAf9hcx0Or1HQkIomjhBBnextbAJhcOrSEAHDVGTP40Kxy/v
Hx19l1qDleoYmI9EsJIc5qG1ooys2iOD+mQeC9CoWMf7n0pOh8R79cp7uORCQhlBDibE9DC5NLCzCL/Q6j3kwpK+TWj1fxyrZD3PPCtjhFJyLSNyWEONvT0Dqs5qLuLlswhQurJvKvKzazed+RuBxTRKQvSghxtrexhcml+QMXjIGZ8a1PnkhxQTZffGgtbRE9N0FERo4SQhy1tndw4GiYySXxqSEAjB+Tx+2fPIlNew/znae2xO24IiI9KSHE0d7G6C2n8Woy6nJB1UT+YuEU7n5uK2++raYjERkZSghxtLchesvppDg1GXX3tYuOZ2x+Nrc+tkED1kRkRCghxFFtkBAq41xDACgryuUri4/j5a2HeHz93rgfX0RECSGOukYpH1MS/xoCwBWLpjFvcjH/9MQmPXJTROJOCSGO9ja2MH5MHnnZWSNy/KyQcdvSE9h3uJXvPVMzIp8hIqOXEkIc1Ta0UDkC/QfdLZhexidPreTeF7bpkZsiElcxJQQzW2Jmm82sxsxu7mV7npk9FGxfaWYzgvWLzGxt8FpnZn8e6zHTUdco5ZH2d+fNIdLZyb1/2j7inyUio8eACcHMsoC7gIuAKuAKM6vqUewaoN7dZwN3AncE6zcAC939FGAJ8EMzy47xmGnF3dnT0MqkOI5B6MuM8UV87MRJ/PzlHRxubR/xzxOR0SGWGsIioMbdt7p7GHgQWNqjzFLgvmD5EeB8MzN3b3b3rt7PfKDrfslYjplWGlvaaWnviNso5YFcd84sjrRF+PnLOxPyeSKS+WKZkrMS6D4x/27gtL7KuHvEzBqBcuCAmZ0G3AtMBz4bbI/lmGlluLecPrBy8F/scyaM4a5nayjMzSIna3DdQVeeNm3QnycimW3EO5XdfaW7zwM+CHzNzAb1J7SZXWtm1WZWXVdXNzJBxkHXLaeTEtCH0OXsuRUcbYuwZmd9wj5TRDJXLAmhFpja7f2UYF2vZcwsGygBDnYv4O6bgKPACTEes2u/u919obsvrKioiCHc5Hj3wTiJaTICOHZ8EVPKCnh+ywE6NXpZRIYploSwCphjZjPNLBe4HFjWo8wy4Opg+VLgGXf3YJ9sADObDnwA2B7jMdNKbUMLuVkhxhflJewzzYwzZ43nUFOYbQeaEva5IpKZBuxDCNr8bwRWAFnAve6+0cxuA6rdfRlwD3C/mdUAh4h+wQN8GLjZzNqBTuB6dz8A0Nsx43xuCbWnoZVjSvIJhYb3YJzBqppcTH5OiDU76plVMSahny0imSWm5zy6+3JgeY91t3ZbbgUu62W/+4H7Yz1mOtvbEL/nIAxGTlaIEytLWburnkvaJ5OXMzKjpEUk82mkcpwkalBabxZMK6W9w9mw53BSPl9EMoMSQhxEOjrZd7g1rg/GGYyp4wopL8rV3UYiMixKCHHw9pE2Oj3+D8aJlZmxYHoZ2w40cagpnJQYRCT9KSHEwb7GkXswTqxOmVqKAa+qliAiQ6SEEAcHjkb/Kq8Yk7hbTnsqLcxlVsUY1uys15gEERkSJYQ46GqmKSvKTWoc86eVUt/cTm19S1LjEJH0pIQQB10JYVxhchPC8ZOKyTLjtdrGpMYhIulJCSEODjWFKcjJoiA3uWMA8nOymD1hDBv2NOJqNhKRQVJCiIP6pjDjktxc1OWEyhIamtvfmX1VRCRWSghxcDCFEsLxk8YSMthQq0FqIjI4SghxUN+cOgmhMDebWRVqNhKRwVNCiINDKVRDgGiz0aGmMHsbW5MdioikESWEOEi1hFA1qTjabLRHdxuJSOyUEIaptb2D5nBHSiWEorxsZo4vYkOtmo1EJHZKCMP0zhiEFEoIEG02OnA0zNuH25IdioikCSWEYXpnlHKSB6X1VDWpGAPW1zYkOxQRSRNKCMOUqjWEsfk5HFtRxGu71WwkIrFRQhim+ubUTAgAJ1WWclB3G4lIjJQQhung0dRNCPMmR+82Wr9bdxuJyMCUEIapvjlMyKCkICfZobxPYV
42syeM4bXaBjUbiciAsmMpZGZLgP8AsoAfu/vtPbbnAT8FFgAHgU+7+3YzuxC4HcgFwsBX3f2ZYJ8/AJOArkl3Frv7/mGfURw9sHLngGVWbj1Efk4WD63alYCIBu/EyhIeXVPL7voWpo4rTHY4IpLCBqwhmFkWcBdwEVAFXGFmVT2KXQPUu/ts4E7gjmD9AeDj7n4icDVwf4/9PuPupwSvlEoGsWoKRyjKiymvJkXVpBJNiS0iMYmlyWgRUOPuW909DDwILO1RZilwX7D8CHC+mZm7v+rue4L1G4GCoDaRMZrDHRQledrr/hTkZjFn4hheq23Uk9REpF+xJIRKoHt7yO5gXa9l3D0CNALlPcp8Cljj7t1HSv23ma01s1vMzAYVeYpoakvtGgJEm40aW9rZfrAp2aGISApLSKeymc0j2oz0+W6rPxM0JZ0VvD7bx77Xmlm1mVXX1dWNfLCD1BTuoDA3tRPCvMkl5OeEeGXboWSHIiIpLJaEUAtM7fZ+SrCu1zJmlg2UEO1cxsymAL8GrnL3t7p2cPfa4OcR4AGiTVPv4+53u/tCd19YUVERyzklTKc7LeEIRXmp22QEkJsdYv7UMjbWHuZoWyTZ4YhIioolIawC5pjZTDPLBS4HlvUos4xopzHApcAz7u5mVgo8Adzs7n/qKmxm2WY2PljOAS4GNgzvVBKvtb2DToeiFK8hACyaOY4Od1bvqE92KCKSogZMCEGfwI3ACmAT8LC7bzSz28zskqDYPUC5mdUANwE3B+tvBGYDtwZ9BWvNbAKQB6wws/XAWqI1jB/F88QSoamtAyDlawgAE4vzmTm+iFe2HVTnsoj0KqY/bd19ObC8x7pbuy23Apf1st83gW/2cdgFsYeZmprD0eaXVO9D6HLazHE8uGoXW94+muxQRCQFaaTyMDQF7fGpfpdRl6rJxYzJy+aVbQeTHYqIpCAlhGFoCgdNRik8DqG77FCIhdPLeGPfEWobWgbeQURGFSWEYeiqIaRLkxFEO5fN4EfPbU12KCKSYpQQhqE53EFOlpGbnT6/xtLCXOZPK+MXr+xk/xFNiy0i70qfb7IUlA6jlHvzkbkVtHd0qpYgIu+hhDAMTeFIWoxB6Kl8TB6fOKWSn728k4NH9cxlEYlSQhiG5nAHhWnSodzTDefNpjXSwY9f2JbsUEQkRSghDEO6NhkBzKoYw8UnTeanL26nPngutIiMbkoIw9CU4lNfD+Rvz5tNc3sH33+2JtmhiEgKUEIYovaOTsKRzrStIQDMnTiWyz84jfte3E7N/iPJDkdEkkwJYYiag0Fp6TQGoTdfWTyXgtwsvvHb1/XcZZFRTglhiN6dtiJ9m4wgesfRTRfO5fktB/if199OdjgikkRKCEPUlGYT2/XnL0+fztyJY/jHJ16ntb0j2eGISJIoIQzRu01G6V1DAMjJCvEPH5/HrkMt/OuKzckOR0SSRAlhiFoyKCEAnDl7PFedMZ17XtjGo6t3JzscEUkCJYQhagmaVgpyMiMhANxycRVnHFvO1379Gmt3NSQ7HBFJMCWEIWoJJrbLzsqcX2FOVoi7PnMqE4vzuPan1bx9WJPfiYwmmfNtlmAt4Y6Mqh10GVeUy4+v+iBNbRGu+9lq2iLqZBYZLZQQhqilvSMj7jDqzXHHjOXf/uJkXt3ZwD88tlHjE0RGCSWEIWoOd5CfgTWELktOmMSN587mwVW7+PnKnckOR0QSIKaEYGZLzGyzmdWY2c29bM8zs4eC7SvNbEaw/kIzW21mrwU/z+u2z4JgfY2ZfdfMLF4nlQit7ek702msvnThXM49roJv/HYj1dsPJTscERlhAyYEM8sC7gIuAqqAK8ysqkexa4B6d58N3AncEaw/AHzc3U8Ergbu77bPD4C/AeYEryXDOI+Eaw5HMrIPobuskPGdy+czsTifWx7bSGenmo5EMlksNYRFQI27b3X3MPAgsLRHmaXAfcHyI8D5Zmbu/qq77wnWbwQKgtrEJK
DY3V/2aAP1T4FPDPtsEqilvYOCDK8hAJQU5PDlxXPZtPcwKzbuS3Y4IjKCYkkIlcCubu93B+t6LePuEaARKO9R5lPAGndvC8p3H/3U2zEBMLNrzazazKrr6upiCHfkRTo6ae/wUZEQAC45uZJjK4q486k3VUsQyWAJ6VQ2s3lEm5E+P9h93f1ud1/o7gsrKiriH9wQZOKgtP5khYwvXjCXN98+yhOv7U12OCIyQmJJCLXA1G7vpwTrei1jZtlACXAweD8F+DVwlbu/1a38lAGOmbK6pq0YLTUEgItPnMTciWP4zlNv0qFagkhGiuVG+lXAHDObSfRL+3Lgyh5llhHtNH4JuBR4xt3dzEqBJ4Cb3f1PXYXdfa+ZHTaz04GVwFXA94Z9NgmSCTWEB4ZwK+nC6eN44JWd3PzoeuZPKxvUvleeNm3QnyciiTVgDSHoE7gRWAFsAh52941mdpuZXRIUuwcoN7Ma4Cag69bUG4HZwK1mtjZ4TQi2XQ/8GKgB3gJ+F6+TGmmZNrFdrKomF3NMcT5/fLNOg9VEMlBMQ23dfTmwvMe6W7sttwKX9bLfN4Fv9nHMauCEwQSbKpozoIYwFCEzzpxdzqNratl6oIlZFWOSHZKIxJFGKg/BaOxD6HLSlFKKcrN4seZAskMRkThTQhiCrj6ETJ66oi85WSEWzRzHG/uOcKgpnOxwRCSOlBCGoCXcQX5OiFB6zbYRN6fNLMcMXnpLtQSRTKKEMAQt7Zk59XWsigtyOKGyhOod9ZoeWySDKCEMQUt4dExb0Z8PzRpPW6STNTv1ZDWRTKGEMAQt7R0U5mTmsxBiNW1cIVPLCvhTzQENVBPJEEoIQ9Ac7iB/lNcQAM49bgKHmsK8urM+2aGISBwoIQxBtIaghHDcMWOZUlbAs5v3E+nsTHY4IjJMSgiD5O60hCOjvg8BwMw4/wMTqG9u59Ud6ksQSXdKCIMU7uik00ffKOW+zJ04lqmqJYhkBCWEQRrNo5R7Y2acf/xEGlraWb1DfQki6UwJYZAyYabTeJszYQzTxhXy7Bv7CUdUSxBJV0oIg6QawvuZGRedcAyHWyM8tyU1nmonIoOnhDBIzWHVEHozvbyIEytLeH5LHY0t7ckOR0SGQAlhkFrbR+ezEGKxZN4xuMOKjfuSHYqIDIESwiCphtC3sqJczpw9nrW7Gth1qDnZ4YjIICkhDFJLewchg9xs/ep6c87cCsbkZbP8tb16qppImtG32iC1hKMzndoonfp6IPk5WVxw/ER2HGrmjX1Hkh2OiAyCEsIgtbR3UJA7uie2G8iC6WWUF+Xy+9f30alagkjaUEIYpGgNQb+2/mSFjAurJvL24TbW7dKUFiLpIqZvNjNbYmabzazGzG7uZXuemT0UbF9pZjOC9eVm9qyZHTWz7/fY5w/BMdcGrwnxOKGR1tLeQaFqCAM6obKEySX5PLXpbU1pIZImBkwIZpYF3AVcBFQBV5hZVY9i1wD17j4buBO4I1jfCtwCfKWPw3/G3U8JXvuHcgKJFm0y0h1GAwmZsXjeMdQ3t7Nq26FkhyMiMYilhrAIqHH3re4eBh4ElvYosxS4L1h+BDjfzMzdm9z9BaKJISM0hyPk65bTmMyZMIaZ44t4dnMdzeFIssMRkQHEkhAqgV3d3u8O1vVaxt0jQCNQHsOx/ztoLrrF+rhtx8yuNbNqM6uuq0vutAid7rS1d2pQWozMjMVVEznaFuFnL+9IdjgiMoBk9o5+xt1PBM4KXp/trZC73+3uC919YUVFRUID7Km1vQNHg9IGY3p5EbMnjOGHf9yqWoJIioslIdQCU7u9nxKs67WMmWUDJcDB/g7q7rXBzyPAA0SbplKaJrYbmgs+MIGDTWF++pJqCSKpLJaEsAqYY2YzzSwXuBxY1qPMMuDqYPlS4BnvZ5iqmWWb2fhgOQe4GNgw2OATTVNfD8208iLOnlvB3c9tpalNtQSRVDVgQgj6BG4EVgCbgIfdfaOZ3WZmlw
TF7gHKzawGuAl459ZUM9sO/DvwV2a2O7hDKQ9YYWbrgbVEaxg/it9pjYyuGoL6EAbvSxfM4VBTmPte2p7sUESkDzHdUO/uy4HlPdbd2m25Fbisj31n9HHYBbGFmDqagxqC7jIavPnTyvjIcRX86Lmt/OXp0ynOz0l2SCLSg4bcDoJqCMPzlcXHUd/czn/94a1khyIivVBCGISuu2Q0UnloTqgs4c/nV3LPC9vY09CS7HBEpAclhEFoCneQnxMiK6SZTofqy4vn4sC3f7852aGISA9KCIPQ3BZR7WCYppQV8tdnzuTXr9ayobYx2eGISDdKCIPQHO5Q/0EcXH/uLEoLcvinJzbpIToiKUQJYRCawx0UqYYwbMX5OXzh/Dm8tPUgy9btSXY4IhJQQhiEpnBENYQ4+ewZMzhlailfX7aRuiNtyQ5HRFBCGBQ1GcVPVsj410tPoqmtg1sfS/lB6iKjghJCjNo7OglHOinKU5NRvMyZOJYvXjiH323YxxPr9yY7HJFRTwkhRs2a2G5EXHvWsZw0pYRbHtvAgaNqOhJJJiWEGHUNSlOncnxlZ4X49mUnc7Qtws2Pvqa7jkSSSAkhRs2atmLEzJ04lr9f8gGe2vQ2D67aNfAOIjIilBBi1DVtc6H6EEbE5z40gw/PHs9tv32dbQeakh2OyKikhBCjrhpCkWoIIyIUMr592cnkZof44oOv0t7RmeyQREYdJYQYdfUhqFN55BxTks+3Pnki63Y38r2ntyQ7HJFRRwkhRs3hDvKyQ2SH9CsbSR87cRKXLpjC95+t4ZVth5Idjsioom+3GDWHOzQGIUG+fsk8po4r5EsPraWxpT3Z4YiMGkoIMWpq07QViTImL5vvfPoU9h1u1ShmkQRSQoiRpq1IrPnTyvji+XN4bO0efvHKzmSHIzIqxJQQzGyJmW02sxozu7mX7Xlm9lCwfaWZzQjWl5vZs2Z21My+32OfBWb2WrDPd80spZ860xyOaFBagl1/7mzOmVvBLb/ZwJ9qDiQ7HJGMN2BCMLMs4C7gIqAKuMLMqnoUuwaod/fZwJ3AHcH6VuAW4Cu9HPoHwN8Ac4LXkqGcQKI0qYaQcFkh4/tXzmdWxRiu+9lqavYfSXZIIhktlhrCIqDG3be6exh4EFjao8xS4L5g+RHgfDMzd29y9xeIJoZ3mNkkoNjdX/boXAU/BT4xnBMZSZFgYjsNSku8sfk53PNXC8nLDvHXP6nWfEciIyiWb7hKoPt8AruB0/oq4+4RM2sEyoG+6vmVwXG6H7MyloCTQdNWDN8DK4fXD3DZgqn8+IWtfPTO5/jcmTMZV5Tbb/krT5s2rM8TGY1SvlPZzK41s2ozq66rq0tKDE3BoDQ9Tzl5po6LPou5OdzBf/3xLWobWpIdkkjGiSUh1AJTu72fEqzrtYyZZQMlwMEBjjllgGMC4O53u/tCd19YUVERQ7jxp2krUsP08iI+f/axZIeMHz2/lddqG5MdkkhGiSUhrALmmNlMM8sFLgeW9SizDLg6WL4UeMb7mcfY3fcCh83s9ODuoquAxwYdfYK802SkPoSkm1Ccz3XnzKJiTB6/eGUnD6zcwZFWDV4TiYcBv+GCPoEbgRVAFnCvu280s9uAandfBtwD3G9mNcAhokkDADPbDhQDuWb2CWCxu78OXA/8BCgAfhe8UtI7M52qhpASigtyuO6cWbywpY6n39jPW3VbuLBqIgunl5GdlfKtoCIpK6Y/ed19ObC8x7pbuy23Apf1se+MPtZXAyfEGmgyqVM59WSFjHOOm8Dxk4v5zat7WLZuD398s46PHFfBgmllyQ5PJC2pDSQGzeGIJrZLURPG5vM3Z82kpu4oT2/az2Nr9/DClgNMH1/EOXOT0+ckkq70DRcDTVuR2syMORPG8vmzj+WvPjQDM7j63lf4Xz9bzR7djSQSM9UQYtAcjmim0zRgZsydOJa/O28OR9oifO+ZLbz41kHu/PTJnPeBickOTyTlqYYQg6
Y21RDSSXZWiBvOnc2TXzibytIC/von1Xx7xWY6Ovu88U1EUEKISXM4okFpaWjG+CJ+df2HuPyDU/n+szV89p6VHNTUFyJ9UkKIQXO4Q4PS0lR+Tha3f+ok/uXSk6jeUc/Hv/cC63c3JDsskZSkhDCASEcnbZrYLu39xcKpPHrdhzAzLv2vl3i4etfAO4mMMkoIA9AYhMxx4pQSfvu3H+aDM8r434+s59//5036GVAvMuooIQzg3YSgGkImGFeUy08+t4jLFkzhu09v4e8fXU97R2eywxJJCfqWG0DXTKfqQ8gcOVkh/uXSk5hUks93n6nhwNEw//WXC8jN1t9HMrrpf8AAVEPITGbGTYuP4x8/cQLPvLGfmx5eS6duS5VRTt9yA2juehZCnmoImeizp0+nuS3Ct373BuVFuXz9knmk+OO9RUaMEsIAjrRGMNSpnMk+f84sDjaFufu5rYwryuMLF8xJdkgiSaGEMICG5nbG5mdrYrs0M9hHdk4fV8ip08q486k32dvYwklTSge1vx7ZKZlA33IDaGgOU1rY//N7Jf2ZGZ+YP5lp4wr51Zpa9h1uTXZIIgmnhDCAhpZ2Sgtzkh2GJEB2KMSVi6aRlx3i5y/voCW4oUBktFBC6EenO43N7ZQWqIYwWhQX5HDladOobw7zcPUuOjVwTUYRJYR+HGmN0OGuGsIoM728iD87cRKb3z7Cy1sPJjsckYRRQuhHQ3MYgDIlhFHn9GPLmTtxDCs27uPAEc2QKqODEkI/GprbAdSpPAqZGZ+cP4XsUIhfrt6lZynIqBBTQjCzJWa22cxqzOzmXrbnmdlDwfaVZjaj27avBes3m9lHu63fbmavmdlaM6uOx8nEW1cNQU1Go1NxQQ6XnDKZXfUtPL+lLtnhiIy4AROCmWUBdwEXAVXAFWZW1aPYNUC9u88G7gTuCPatAi4H5gFLgP8MjtflXHc/xd0XDvtMRkB9SzsFOVnkZWtQ2mh18pRSTqws4elN+3lbt6JKhoulhrAIqHH3re4eBh4ElvYosxS4L1h+BDjfouP/lwIPunubu28DaoLjpYWG5rD6D4RLTp5MbnaIx9bu0XTZktFiSQiVQPeniewO1vVaxt0jQCNQPsC+DvzezFab2bWDD33kNTQyaYExAAAMY0lEQVS3q/9AKMrLZsm8Y9h+sIm1u/S0NclcyexU/rC7n0q0KeoGMzu7t0Jmdq2ZVZtZdV1d4tpx3Z2G5nbVEASABTPKmFpWwPIN+zRgTTJWLAmhFpja7f2UYF2vZcwsGygBDva3r7t3/dwP/Jo+mpLc/W53X+juCysqKmIINz4amtsJd3SqhiAAhMxYekolzW0Rfv/6vmSHIzIiYkkIq4A5ZjbTzHKJdhIv61FmGXB1sHwp8IxHG1uXAZcHdyHNBOYAr5hZkZmNBTCzImAxsGH4pxM/u+tbAN1hJO+aXFrA6ceW88q2Q9QG/z5EMsmACSHoE7gRWAFsAh52941mdpuZXRIUuwcoN7Ma4Cbg5mDfjcDDwOvAk8AN7t4BTAReMLN1wCvAE+7+ZHxPbXhqG5oBjUGQ97qwaiKFedn8dv0eTWshGSem6a/dfTmwvMe6W7sttwKX9bHvPwH/1GPdVuDkwQabSF01hLIC1RDkXfk5WSyZN5FH19SyblcD86eVJTskkbjRSOU+1Da0kJsVokAPxpEe5k8rY0pZAU9u2EdbuzqYJXMoIfShtr6F0sIcPU5R3idkxsdPmsyRtgjPbN6f7HBE4kYJoQ+1DS3qUJY+TR1XyIJpZbxYc5A6TX4nGUIJoQ/RhKAOZenb4nkTyc4yHl+vEcySGZQQetHUFokOSlOHsvRjbH4OFxw/kS37j/I/r7+d7HBEhk0JoRe1DV1jEFRDkP6dfmw5E8bmcdvjr9OqDmZJc0oIvajVoDSJUVbI+PjJk9ld38IP/7g12eGIDIsSQi92q4YggzCrYgx/dtIk/vMPNew61JzscESGTAmhF9sPNJGbFW
Jsfkzj9kT4vx87nqyQ8f9+s0EdzJK2lBB68eJbBzl1eikhjUGQGE0uLeCrHz2OP75Zx2/W9pz7USQ9KCH0sP9IK5v2HubsuYmbWVUyw1VnzGD+tFJu++3rHDyqsQmSfpQQenj+zQMAnD1HCUEGJytk3PGpkzjaFuG2x19Pdjgig6aE0MNzW+oYPyaXqknFyQ5F0tDciWO54dzZPLZ2Dys26rkJkl6UELrp7HSe33KAs+ZUEAqp/0CG5vqPzObEyhK+/PA6Nu87kuxwRGKmhNDN63sPc6gpzFlzxic7FEljudkh7r5qAYW5WVxz3yr1J0jaUELo5o9vRp/ZfJb6D2SYJpUU8KOrFlJ3pI3rfraatohGMUvqU0Lo5rk366iaVEzF2LxkhyIZ4OSppXz7spNZtb2ev/npahqaw8kOSaRfSgiBo20RVu+o1+2mElcfP3kyt3/yRF5+6yAXf+8FNtQ2JjskkT4pIQRefusgkU7n7LnqP5D4unzRNH553Rl0djqf/MGL/OAPb9HUFkl2WCLvo7kZgI5O5+crd1CYm8WC6XpGrgzeAyt3Dljmc2fO5NE1u7njyTf47tNbOHN2OafPLKcwb/D/Da88bdpQwhTpV0w1BDNbYmabzazGzG7uZXuemT0UbF9pZjO6bftasH6zmX001mMmirvzf371Gs9uruOrHz2OvGw9Q1lGRlFeNledMYP/dc4sppcX8tSm/fzz7zZx7wvbeHnrQerVxyBJNuCfJmaWBdwFXAjsBlaZ2TJ37z4U8xqg3t1nm9nlwB3Ap82sCrgcmAdMBp4ys7nBPgMdMyFuf/INHqrexd+eN5vPnTkz0R8vo9DUcYVcdcYM9jW2sm53Axv3NLJs3R5YByUFOUwvL2TauEKOKclnUnEBBbn6I0USI5a66iKgxt23ApjZg8BSoPuX91Lg68HyI8D3Lfp0+qXAg+7eBmwzs5rgeMRwzLjr7HTCHZ1sP9jE2p0NvFBzgMfX7+Wzp0/npgvnDnwAkTg6piSfY0qOYXHVRPYfaWNr3VG2H2xm24Em1u9+t/O5OD+b0sJcSgtzKM7PoSA3i0hnJ2PzsynIySI/eOVmh8jNCpGXHSI7K0R2yMgKGSEzQgZmRtd8jUb0PURryZ0e/dnhTqTDiXQ6kY5OIp1OR6fTGczgakSPETIjO8vIDhnZoRA52cHPLMO6fV6nOx4c2x265oHt2h4yusUYfR8K4rRuk0t237/rmJ0DzCrb2W0fd3/ns7vO3Xj3s/rSda7df4eZLJaEUAns6vZ+N3BaX2XcPWJmjUB5sP7lHvtWBssDHTNuLv7e87yx9wiRzvf+AyotzOGvPjSDWy+uyvgLLanLzJhYnM/E4nzOmBX98jrSFmFfYyv7GlupO9JGfUuY2voWNrUepr3D9cjOJOlKEFlmYF3JJZoouzjdkp+/933PqdHfSZ70frzuX0trbrmQ/JyRrS2mfKeymV0LXBu8PWpmm+N17B3AOuAb7109HjgQr89IATqf1JZp5wOZd04pcT4F/zis3afHUiiWhFALTO32fkqwrrcyu80sGygBDg6w70DHBMDd7wbujiHOuDCzandfmKjPG2k6n9SWaecDmXdOmXY+/YnlLqNVwBwzm2lmuUQ7iZf1KLMMuDpYvhR4xqN1o2XA5cFdSDOBOcArMR5TREQSaMAaQtAncCOwAsgC7nX3jWZ2G1Dt7suAe4D7g07jQ0S/4AnKPUy0szgC3ODuHQC9HTP+pyciIrEyPf/1vczs2qCZKiPofFJbpp0PZN45Zdr59EcJQUREAM1lJCIiASWEQKpMpTEcZjbVzJ41s9fNbKOZfSFYP87M/sfMtgQ/02bCJjPLMrNXzezx4P3MYHqUmmC6lNxkxzgYZlZqZo+Y2RtmtsnMzkjz6/Ol4N/aBjP7hZnlp9M1MrN7zWy/mW3otq7X62FR3w3Oa72ZnZq8yEeGEgLvmZ7jIqAKuCKYdiPdRIAvu3sVcDpwQ3AeNwNPu/
sc4Ongfbr4ArCp2/s7gDvdfTZQT3TalHTyH8CT7v4B4GSi55aW18fMKoG/Axa6+wlEbxDpmromXa7RT4AlPdb1dT0uInqn5ByiY6N+kKAYE0YJIeqd6TncPQx0TaWRVtx9r7uvCZaPEP2yqSR6LvcFxe4DPpGcCAfHzKYAfwb8OHhvwHlEp0eBNDoXADMrAc4melce7h529wbS9PoEsoGCYPxRIbCXNLpG7v4c0Tsju+vreiwFfupRLwOlZjYpMZEmhhJCVG/Tc1T2UTYtWHTG2fnASmCiu+8NNu0DJiYprMH6DvC/gc7gfTnQ4O5dDxNIt+s0E6gD/jtoBvuxmRWRptfH3WuBbwM7iSaCRmA16X2NoO/rkXHfEz0pIWQgMxsDPAp80d0Pd98WDBhM+VvLzOxiYL+7r052LHGUDZwK/MDd5wNN9GgeSpfrAxC0rS8lmugmA0W8v/klraXT9YgHJYSoWKbnSAtmlkM0Gfzc3X8VrH67q2ob/NyfrPgG4UzgEjPbTrQJ7zyi7e+lQfMEpN912g3sdveVwftHiCaIdLw+ABcA29y9zt3bgV8RvW7pfI2g7+uRMd8TfVFCiMqIqTSCNvZ7gE3u/u/dNnWfWuRq4LFExzZY7v41d5/i7jOIXo9n3P0zwLNEp0eBNDmXLu6+D9hlZscFq84nOoo/7a5PYCdwupkVBv/2us4nba9RoK/rsQy4Krjb6HSgsVvTUmaIzjOuF/Ax4E3gLeD/JjueIZ7Dh4lWb9cDa4PXx4i2vT8NbAGeAsYlO9ZBntdHgMeD5WOJzodVA/wSyEt2fIM8l1OA6uAa/QYoS+frQ3Sy4DeADcD9QF46XSPgF0T7P9qJ1uCu6et6EJ2d+q7gO+I1ondXJf0c4vnSSGUREQHUZCQiIgElBBERAZQQREQkoIQgIiKAEoKIiASUEET6EcxOev0Q951hZlfGOyaRkaKEINK/UmBICQGYASghSNpQQhDp3+3ALDNba2b/amZfNbNVwXz43wAwsw8G7/PNrCh4PsAJwb5nBft+KalnIRIDDUwT6Ucwa+zj7n6CmS0mOiXD54mOWl0G/Iu7P2dm3wTygQKi8xV9y8w+AnzF3S9OSvAig5Q9cBERCSwOXq8G78cQfVjKc8BtROfEaiX60BiRtKOEIBI7A77l7j/sZVs50QSRQ7Sm0JTIwETiQX0IIv07AowNllcAfx08bwIzqzSzCcG2HwK3AD8n+gjJnvuKpDzVEET64e4HzexPwUPYfwc8ALwUne2Zo8BfmtkSoN3dHwiez/2imZ0HPA90mNk64CfufmeSTkMkJupUFhERQE1GIiISUEIQERFACUFERAJKCCIiAighiIhIQAlBREQAJQQREQkoIYiICAD/H27qECVGcYpoAAAAAElFTkSuQmCC\n",
262 |       "text/plain": [
263 |        "<Figure size 432x288 with 1 Axes>"
264 |       ]
265 |      },
266 |      "metadata": {
267 |       "needs_background": "light"
268 |      },
269 |      "output_type": "display_data"
270 |     }
271 |    ],
272 |    "source": [
273 |     "sns.distplot(dataset.text.apply(lambda text: len(text.split())), bins=10)"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": null,
279 |    "metadata": {},
280 |    "outputs": [],
281 |    "source": []
282 |   }
283 |  ],
284 |  "metadata": {
285 |   "kernelspec": {
286 |    "display_name": "Python 3",
287 |    "language": "python",
288 |    "name": "python3"
289 |   },
290 |   "language_info": {
291 |    "codemirror_mode": {
292 |     "name": "ipython",
293 |     "version": 3
294 |    },
295 |    "file_extension": ".py",
296 |    "mimetype": "text/x-python",
297 |    "name": "python",
298 |    "nbconvert_exporter": "python",
299 |    "pygments_lexer": "ipython3",
300 |    "version": "3.6.8"
301 |   }
302 |  },
303 |  "nbformat": 4,
304 |  "nbformat_minor": 2
305 | }
306 | 


--------------------------------------------------------------------------------