├── .ipynb_checkpoints
├── data_exploration-checkpoint.ipynb
└── model_evaluation-checkpoint.ipynb
├── README.md
├── assets
├── class_distribution.png
├── loss.svg
├── lr.svg
└── nlp_report.pdf
├── cache
├── cached_bert_dev_multi_label_512_nlp_valid.csv
└── cached_bert_train_multi_label_512_nlp_train.csv
├── data_exploration.ipynb
├── data_generator.py
├── find_threshold.py
├── inference.py
├── labels.csv
├── nlp_test.csv
├── nlp_train.csv
├── nlp_valid.csv
├── requirements.txt
└── train_bert.py
/.ipynb_checkpoints/data_exploration-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import matplotlib\n",
11 | "from matplotlib import pyplot as plt\n",
12 | "%matplotlib inline"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "
\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " | \n",
41 | " id | \n",
42 | " text | \n",
43 | " anger | \n",
44 | " anticipation | \n",
45 | " disgust | \n",
46 | " fear | \n",
47 | " joy | \n",
48 | " love | \n",
49 | " optimism | \n",
50 | " pessimism | \n",
51 | " sadness | \n",
52 | " surprise | \n",
53 | " trust | \n",
54 | " neutral | \n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " \n",
59 | " 0 | \n",
60 | " 0 | \n",
61 | " He was answering a question about the criticis... | \n",
62 | " 1 | \n",
63 | " 0 | \n",
64 | " 1 | \n",
65 | " 0 | \n",
66 | " 0 | \n",
67 | " 0 | \n",
68 | " 0 | \n",
69 | " 1 | \n",
70 | " 0 | \n",
71 | " 0 | \n",
72 | " 0 | \n",
73 | " 0 | \n",
74 | "
\n",
75 | " \n",
76 | " 1 | \n",
77 | " 1 | \n",
78 | " I'm going to start today's discussion thread w... | \n",
79 | " 1 | \n",
80 | " 1 | \n",
81 | " 1 | \n",
82 | " 1 | \n",
83 | " 0 | \n",
84 | " 0 | \n",
85 | " 0 | \n",
86 | " 1 | \n",
87 | " 0 | \n",
88 | " 0 | \n",
89 | " 0 | \n",
90 | " 0 | \n",
91 | "
\n",
92 | " \n",
93 | " 2 | \n",
94 | " 2 | \n",
95 | " By announcing the 395 self-quarantined, it pai... | \n",
96 | " 1 | \n",
97 | " 1 | \n",
98 | " 1 | \n",
99 | " 1 | \n",
100 | " 0 | \n",
101 | " 0 | \n",
102 | " 0 | \n",
103 | " 1 | \n",
104 | " 0 | \n",
105 | " 0 | \n",
106 | " 0 | \n",
107 | " 0 | \n",
108 | "
\n",
109 | " \n",
110 | " 3 | \n",
111 | " 3 | \n",
112 | " Likewise, sorry if I offended you. I’m not act... | \n",
113 | " 1 | \n",
114 | " 0 | \n",
115 | " 1 | \n",
116 | " 1 | \n",
117 | " 0 | \n",
118 | " 0 | \n",
119 | " 0 | \n",
120 | " 1 | \n",
121 | " 0 | \n",
122 | " 0 | \n",
123 | " 0 | \n",
124 | " 0 | \n",
125 | "
\n",
126 | " \n",
127 | " 4 | \n",
128 | " 4 | \n",
129 | " People infected by experience high fever, coug... | \n",
130 | " 0 | \n",
131 | " 0 | \n",
132 | " 0 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 0 | \n",
137 | " 0 | \n",
138 | " 0 | \n",
139 | " 0 | \n",
140 | " 0 | \n",
141 | " 1 | \n",
142 | "
\n",
143 | " \n",
144 | "
\n",
145 | "
"
146 | ],
147 | "text/plain": [
148 | " id text anger anticipation \\\n",
149 | "0 0 He was answering a question about the criticis... 1 0 \n",
150 | "1 1 I'm going to start today's discussion thread w... 1 1 \n",
151 | "2 2 By announcing the 395 self-quarantined, it pai... 1 1 \n",
152 | "3 3 Likewise, sorry if I offended you. I’m not act... 1 0 \n",
153 | "4 4 People infected by experience high fever, coug... 0 0 \n",
154 | "\n",
155 | " disgust fear joy love optimism pessimism sadness surprise trust \\\n",
156 | "0 1 0 0 0 0 1 0 0 0 \n",
157 | "1 1 1 0 0 0 1 0 0 0 \n",
158 | "2 1 1 0 0 0 1 0 0 0 \n",
159 | "3 1 1 0 0 0 1 0 0 0 \n",
160 | "4 0 0 0 0 0 0 0 0 0 \n",
161 | "\n",
162 | " neutral \n",
163 | "0 0 \n",
164 | "1 0 \n",
165 | "2 0 \n",
166 | "3 0 \n",
167 | "4 1 "
168 | ]
169 | },
170 | "execution_count": 2,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "# read dataset\n",
177 | "# we will be using trained dataset to understand how people are reacting\n",
178 | "data = pd.read_csv(\"nlp_train.csv\")\n",
179 | "data.head()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 3,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/html": [
190 | "\n",
191 | "\n",
204 | "
\n",
205 | " \n",
206 | " \n",
207 | " | \n",
208 | " id | \n",
209 | " anger | \n",
210 | " anticipation | \n",
211 | " disgust | \n",
212 | " fear | \n",
213 | " joy | \n",
214 | " love | \n",
215 | " optimism | \n",
216 | " pessimism | \n",
217 | " sadness | \n",
218 | " surprise | \n",
219 | " trust | \n",
220 | " neutral | \n",
221 | "
\n",
222 | " \n",
223 | " \n",
224 | " \n",
225 | " count | \n",
226 | " 1493.000000 | \n",
227 | " 1493.000000 | \n",
228 | " 1493.000000 | \n",
229 | " 1493.000000 | \n",
230 | " 1493.000000 | \n",
231 | " 1493.000000 | \n",
232 | " 1493.000000 | \n",
233 | " 1493.000000 | \n",
234 | " 1493.000000 | \n",
235 | " 1493.000000 | \n",
236 | " 1493.000000 | \n",
237 | " 1493.000000 | \n",
238 | " 1493.000000 | \n",
239 | "
\n",
240 | " \n",
241 | " mean | \n",
242 | " 746.000000 | \n",
243 | " 0.364367 | \n",
244 | " 0.503014 | \n",
245 | " 0.454119 | \n",
246 | " 0.454119 | \n",
247 | " 0.123912 | \n",
248 | " 0.092431 | \n",
249 | " 0.328198 | \n",
250 | " 0.432686 | \n",
251 | " 0.277294 | \n",
252 | " 0.108506 | \n",
253 | " 0.168118 | \n",
254 | " 0.113195 | \n",
255 | "
\n",
256 | " \n",
257 | " std | \n",
258 | " 431.136289 | \n",
259 | " 0.481413 | \n",
260 | " 0.500158 | \n",
261 | " 0.498057 | \n",
262 | " 0.498057 | \n",
263 | " 0.329591 | \n",
264 | " 0.289731 | \n",
265 | " 0.469715 | \n",
266 | " 0.495614 | \n",
267 | " 0.447813 | \n",
268 | " 0.311123 | \n",
269 | " 0.374096 | \n",
270 | " 0.316937 | \n",
271 | "
\n",
272 | " \n",
273 | " min | \n",
274 | " 0.000000 | \n",
275 | " 0.000000 | \n",
276 | " 0.000000 | \n",
277 | " 0.000000 | \n",
278 | " 0.000000 | \n",
279 | " 0.000000 | \n",
280 | " 0.000000 | \n",
281 | " 0.000000 | \n",
282 | " 0.000000 | \n",
283 | " 0.000000 | \n",
284 | " 0.000000 | \n",
285 | " 0.000000 | \n",
286 | " 0.000000 | \n",
287 | "
\n",
288 | " \n",
289 | " 25% | \n",
290 | " 373.000000 | \n",
291 | " 0.000000 | \n",
292 | " 0.000000 | \n",
293 | " 0.000000 | \n",
294 | " 0.000000 | \n",
295 | " 0.000000 | \n",
296 | " 0.000000 | \n",
297 | " 0.000000 | \n",
298 | " 0.000000 | \n",
299 | " 0.000000 | \n",
300 | " 0.000000 | \n",
301 | " 0.000000 | \n",
302 | " 0.000000 | \n",
303 | "
\n",
304 | " \n",
305 | " 50% | \n",
306 | " 746.000000 | \n",
307 | " 0.000000 | \n",
308 | " 1.000000 | \n",
309 | " 0.000000 | \n",
310 | " 0.000000 | \n",
311 | " 0.000000 | \n",
312 | " 0.000000 | \n",
313 | " 0.000000 | \n",
314 | " 0.000000 | \n",
315 | " 0.000000 | \n",
316 | " 0.000000 | \n",
317 | " 0.000000 | \n",
318 | " 0.000000 | \n",
319 | "
\n",
320 | " \n",
321 | " 75% | \n",
322 | " 1119.000000 | \n",
323 | " 1.000000 | \n",
324 | " 1.000000 | \n",
325 | " 1.000000 | \n",
326 | " 1.000000 | \n",
327 | " 0.000000 | \n",
328 | " 0.000000 | \n",
329 | " 1.000000 | \n",
330 | " 1.000000 | \n",
331 | " 1.000000 | \n",
332 | " 0.000000 | \n",
333 | " 0.000000 | \n",
334 | " 0.000000 | \n",
335 | "
\n",
336 | " \n",
337 | " max | \n",
338 | " 1492.000000 | \n",
339 | " 1.000000 | \n",
340 | " 1.000000 | \n",
341 | " 1.000000 | \n",
342 | " 1.000000 | \n",
343 | " 1.000000 | \n",
344 | " 1.000000 | \n",
345 | " 1.000000 | \n",
346 | " 1.000000 | \n",
347 | " 1.000000 | \n",
348 | " 1.000000 | \n",
349 | " 1.000000 | \n",
350 | " 1.000000 | \n",
351 | "
\n",
352 | " \n",
353 | "
\n",
354 | "
"
355 | ],
356 | "text/plain": [
357 | " id anger anticipation disgust fear \\\n",
358 | "count 1493.000000 1493.000000 1493.000000 1493.000000 1493.000000 \n",
359 | "mean 746.000000 0.364367 0.503014 0.454119 0.454119 \n",
360 | "std 431.136289 0.481413 0.500158 0.498057 0.498057 \n",
361 | "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
362 | "25% 373.000000 0.000000 0.000000 0.000000 0.000000 \n",
363 | "50% 746.000000 0.000000 1.000000 0.000000 0.000000 \n",
364 | "75% 1119.000000 1.000000 1.000000 1.000000 1.000000 \n",
365 | "max 1492.000000 1.000000 1.000000 1.000000 1.000000 \n",
366 | "\n",
367 | " joy love optimism pessimism sadness \\\n",
368 | "count 1493.000000 1493.000000 1493.000000 1493.000000 1493.000000 \n",
369 | "mean 0.123912 0.092431 0.328198 0.432686 0.277294 \n",
370 | "std 0.329591 0.289731 0.469715 0.495614 0.447813 \n",
371 | "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
372 | "25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
373 | "50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
374 | "75% 0.000000 0.000000 1.000000 1.000000 1.000000 \n",
375 | "max 1.000000 1.000000 1.000000 1.000000 1.000000 \n",
376 | "\n",
377 | " surprise trust neutral \n",
378 | "count 1493.000000 1493.000000 1493.000000 \n",
379 | "mean 0.108506 0.168118 0.113195 \n",
380 | "std 0.311123 0.374096 0.316937 \n",
381 | "min 0.000000 0.000000 0.000000 \n",
382 | "25% 0.000000 0.000000 0.000000 \n",
383 | "50% 0.000000 0.000000 0.000000 \n",
384 | "75% 0.000000 0.000000 0.000000 \n",
385 | "max 1.000000 1.000000 1.000000 "
386 | ]
387 | },
388 | "execution_count": 3,
389 | "metadata": {},
390 | "output_type": "execute_result"
391 | }
392 | ],
393 | "source": [
394 | "#basic stats\n",
395 | "data.describe()"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 11,
401 | "metadata": {},
402 | "outputs": [
403 | {
404 | "data": {
405 | "text/plain": [
406 | "0 949\n",
407 | "1 544\n",
408 | "Name: anger, dtype: int64"
409 | ]
410 | },
411 | "execution_count": 11,
412 | "metadata": {},
413 | "output_type": "execute_result"
414 | }
415 | ],
416 | "source": [
417 | "data['anger'].value_counts()"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 15,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": []
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 16,
430 | "metadata": {},
431 | "outputs": [
432 | {
433 | "data": {
434 | "text/plain": [
435 | "0 1003\n",
436 | "1 490\n",
437 | "Name: optimism, dtype: int64"
438 | ]
439 | },
440 | "execution_count": 16,
441 | "metadata": {},
442 | "output_type": "execute_result"
443 | }
444 | ],
445 | "source": [
446 | "freqs = {\"anger\":data['anger'].value_counts()[1]\n",
447 | "anticipation = data['anticipation'].value_counts()[1]\n",
448 | "data['disgust'].value_counts()[1]\n",
449 | "data['fear'].value_counts()[1]\n",
450 | "data['joy'].value_counts()[1]\n",
451 | "data['love'].value_counts()[1]\n",
452 | "data['optimism'].value_counts()[1]\n",
453 | "data['pessimism'].value_counts()[1]\n",
454 | "data['sadness'].value_counts()[1]\n",
455 | "data['surprise'].value_counts()[1]\n",
456 | "data['trust'].value_counts()[1]\n",
457 | "data['neutral'].value_counts()[1]"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {},
464 | "outputs": [],
465 | "source": []
466 | }
467 | ],
468 | "metadata": {
469 | "kernelspec": {
470 | "display_name": "Python 3",
471 | "language": "python",
472 | "name": "python3"
473 | },
474 | "language_info": {
475 | "codemirror_mode": {
476 | "name": "ipython",
477 | "version": 3
478 | },
479 | "file_extension": ".py",
480 | "mimetype": "text/x-python",
481 | "name": "python",
482 | "nbconvert_exporter": "python",
483 | "pygments_lexer": "ipython3",
484 | "version": "3.7.4"
485 | }
486 | },
487 | "nbformat": 4,
488 | "nbformat_minor": 2
489 | }
490 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/model_evaluation-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Multi Emotion Detection from COVID-19 Text using BERT ##
2 |
3 | ### Requirements ###
4 |
5 | The code was tested with Python 3.8 and PyTorch 1.5.0. Requirements can be installed with the following command.
6 | ```
7 | pip install -r requirements.txt
8 | ```
9 | You may have to change the package name (append +cpu) for the non-CUDA version.
10 |
11 | ### Data Preparation ###
12 | The model expects the data to be in CSV files, so you first need to convert the JSON files into CSV files. If your data is in any other format, you may need to modify the code.
13 |
14 | To generate the CSV files, run the following command.
15 |
16 | ```
17 | python .\data_generator.py --file=D:\UTD\Assignment\NLP\project\nlp_test.json --csvfile=D:\UTD\Assignment\NLP\project\nlp_test.csv
18 | ```
19 |
20 | Here, ```--csvfile``` specifies where to store the converted file.
21 |
22 |
23 | ### Training ###
24 | Once you have the files in the required format, you can start training. You may want to change the parameters; I tried multiple parameter settings, and the file contains the ones that gave the best results.
25 | ```
26 | python train_bert.py --epochs=15
27 | ```
28 |
29 | You can find all available options by running the following command.
30 |
31 | ```
32 | python train_bert.py --help
33 | ```
34 | The following graphs show the loss (1) and the learning rate (2) over time.
35 |
36 |
37 |
38 |
39 |
40 |
41 | ### Inference ###
42 | Once you have the trained model, you can run the inference on test csv files. Note that as of now, this script requires annotated data to compute the metrics. But it can easily be modified to generate output only.
43 |
44 | Pretrained model can be found [here](https://utdallas.box.com/s/sqqb0n9qe7txb6j3725aiz76gwlmszuw)
45 |
46 | ```
47 | python inference.py --test_csv=D:\\UTD\\Assignment\\NLP\\project\\nlp_valid.csv --model_dir=D:\\UTD\\Assignment\\NLP\\project\\model_output\\3_finetune_e20
48 | ```
49 |
50 | If ```--evaluation``` is set to true, it will output various metrics.
51 |
52 | **Threshold**
53 | One important factor here is finding the optimal threshold for the confidence score. I tested various thresholds and found that 0.0017 gives the best results for the model specified above. If you train your own model, you may want to run ```find_threshold.py``` to find the best threshold.
54 |
55 | ### Model Evaluation ###
56 |
57 | If you want to evaluate your model on the test or train set, you can do so by running the following command. Note that the file must be in CSV format.
58 |
59 | ```
60 | python inference.py --test_csv=D:\\UTD\\Assignment\\NLP\\project\\nlp_test.csv --evaluation=True
61 | ```
62 |
63 | I ran the evaluation on the train set and obtained the following results.
64 |
65 |
66 | Emotion | Precision | Recall | f1-score
67 | ---------------|---------------|------------|---------------
68 | Anger | 0.97 | 0.95 | 0.96
69 | Anticipation | 0.98 | 1.00 | 0.99
70 | Disgust | 0.97 | 0.96 | 0.97
71 | Fear | 1.00 | 1.00 | 1.00
72 | Joy | 0.93 | 0.93 | 0.93
73 | Love | 1.00 | 0.73 | 0.85
74 | Optimism | 0.91 | 1.00 | 0.95
75 | Pessimism | 0.98 | 0.94 | 0.96
76 | Sadness | 0.99 | 0.92 | 0.95
77 | Surprise | 0.98 | 0.92 | 0.95
78 | Trust | 0.95 | 0.88 | 0.91
79 | Neutral | 0.92 | 1.00 | 0.96
80 | Average | 0.98 | 0.97 | 0.97
81 |
82 | The following table shows the results on the test set.
83 |
84 | Emotion | Precision | Recall | f1-score
85 | ---------------|---------------|------------|---------------
86 | Anger | 0.53 | 0.67 | 0.59
87 | Anticipation | 0.67 | 0.68 | 0.68
88 | Disgust | 0.62 | 0.78 | 0.69
89 | Fear | 0.69 | 0.72 | 0.71
90 | Joy | 0.50 | 0.27 | 0.35
91 | Love | 0.32 | 0.37 | 0.34
92 | Optimism | 0.37 | 0.59 | 0.45
93 | Pessimism | 0.44 | 0.73 | 0.55
94 | Sadness | 0.42 | 0.53 | 0.47
95 | Surprise | 0.55 | 0.38 | 0.45
96 | Trust | 0.12 | 0.12 | 0.12
97 | Neutral | 0.37 | 0.37 | 0.37
98 | Average | 0.60 | 0.62 | 0.55
99 |
100 | If we set the threshold to 0.02, the average accuracy is 0.66.
101 |
102 | ### Possible Improvements
103 |
104 | 
105 |
106 | The biggest caveat here is the class imbalance. It is well established that class imbalance can negatively affect a model, so it is always a good idea to balance the data before training. Due to limited time, I did not do that here. Ideally, we would oversample the minority classes or pass class weights to the loss function. I implemented both approaches for image classification [here](https://github.com/savan77/Transfer-Learning).
107 |
108 | As we can see above, negative emotions such as anger or pessimism have a bigger representation in the data compared to happy emotions. This makes sense, but in order to train a good model we should even it out.
109 |
110 | Moreover, we can experiment with the network. Here, the standard off-the-shelf text classification network was used. Adding more fully connected layers on top of BERT may help.
--------------------------------------------------------------------------------
/assets/class_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/savan77/EmotionDetectionBERT/39f859d48250d84e0cef7d1fb9163c37afe6dbfa/assets/class_distribution.png
--------------------------------------------------------------------------------
/assets/loss.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/lr.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/nlp_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/savan77/EmotionDetectionBERT/39f859d48250d84e0cef7d1fb9163c37afe6dbfa/assets/nlp_report.pdf
--------------------------------------------------------------------------------
/cache/cached_bert_dev_multi_label_512_nlp_valid.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/savan77/EmotionDetectionBERT/39f859d48250d84e0cef7d1fb9163c37afe6dbfa/cache/cached_bert_dev_multi_label_512_nlp_valid.csv
--------------------------------------------------------------------------------
/cache/cached_bert_train_multi_label_512_nlp_train.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/savan77/EmotionDetectionBERT/39f859d48250d84e0cef7d1fb9163c37afe6dbfa/cache/cached_bert_train_multi_label_512_nlp_train.csv
--------------------------------------------------------------------------------
/data_exploration.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import matplotlib\n",
11 | "from matplotlib import pyplot as plt\n",
12 | "%matplotlib inline"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " | \n",
41 | " id | \n",
42 | " text | \n",
43 | " anger | \n",
44 | " anticipation | \n",
45 | " disgust | \n",
46 | " fear | \n",
47 | " joy | \n",
48 | " love | \n",
49 | " optimism | \n",
50 | " pessimism | \n",
51 | " sadness | \n",
52 | " surprise | \n",
53 | " trust | \n",
54 | " neutral | \n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " \n",
59 | " 0 | \n",
60 | " 0 | \n",
61 | " He was answering a question about the criticis... | \n",
62 | " 1 | \n",
63 | " 0 | \n",
64 | " 1 | \n",
65 | " 0 | \n",
66 | " 0 | \n",
67 | " 0 | \n",
68 | " 0 | \n",
69 | " 1 | \n",
70 | " 0 | \n",
71 | " 0 | \n",
72 | " 0 | \n",
73 | " 0 | \n",
74 | "
\n",
75 | " \n",
76 | " 1 | \n",
77 | " 1 | \n",
78 | " I'm going to start today's discussion thread w... | \n",
79 | " 1 | \n",
80 | " 1 | \n",
81 | " 1 | \n",
82 | " 1 | \n",
83 | " 0 | \n",
84 | " 0 | \n",
85 | " 0 | \n",
86 | " 1 | \n",
87 | " 0 | \n",
88 | " 0 | \n",
89 | " 0 | \n",
90 | " 0 | \n",
91 | "
\n",
92 | " \n",
93 | " 2 | \n",
94 | " 2 | \n",
95 | " By announcing the 395 self-quarantined, it pai... | \n",
96 | " 1 | \n",
97 | " 1 | \n",
98 | " 1 | \n",
99 | " 1 | \n",
100 | " 0 | \n",
101 | " 0 | \n",
102 | " 0 | \n",
103 | " 1 | \n",
104 | " 0 | \n",
105 | " 0 | \n",
106 | " 0 | \n",
107 | " 0 | \n",
108 | "
\n",
109 | " \n",
110 | " 3 | \n",
111 | " 3 | \n",
112 | " Likewise, sorry if I offended you. I’m not act... | \n",
113 | " 1 | \n",
114 | " 0 | \n",
115 | " 1 | \n",
116 | " 1 | \n",
117 | " 0 | \n",
118 | " 0 | \n",
119 | " 0 | \n",
120 | " 1 | \n",
121 | " 0 | \n",
122 | " 0 | \n",
123 | " 0 | \n",
124 | " 0 | \n",
125 | "
\n",
126 | " \n",
127 | " 4 | \n",
128 | " 4 | \n",
129 | " People infected by experience high fever, coug... | \n",
130 | " 0 | \n",
131 | " 0 | \n",
132 | " 0 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 0 | \n",
137 | " 0 | \n",
138 | " 0 | \n",
139 | " 0 | \n",
140 | " 0 | \n",
141 | " 1 | \n",
142 | "
\n",
143 | " \n",
144 | "
\n",
145 | "
"
146 | ],
147 | "text/plain": [
148 | " id text anger anticipation \\\n",
149 | "0 0 He was answering a question about the criticis... 1 0 \n",
150 | "1 1 I'm going to start today's discussion thread w... 1 1 \n",
151 | "2 2 By announcing the 395 self-quarantined, it pai... 1 1 \n",
152 | "3 3 Likewise, sorry if I offended you. I’m not act... 1 0 \n",
153 | "4 4 People infected by experience high fever, coug... 0 0 \n",
154 | "\n",
155 | " disgust fear joy love optimism pessimism sadness surprise trust \\\n",
156 | "0 1 0 0 0 0 1 0 0 0 \n",
157 | "1 1 1 0 0 0 1 0 0 0 \n",
158 | "2 1 1 0 0 0 1 0 0 0 \n",
159 | "3 1 1 0 0 0 1 0 0 0 \n",
160 | "4 0 0 0 0 0 0 0 0 0 \n",
161 | "\n",
162 | " neutral \n",
163 | "0 0 \n",
164 | "1 0 \n",
165 | "2 0 \n",
166 | "3 0 \n",
167 | "4 1 "
168 | ]
169 | },
170 | "execution_count": 2,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "# read dataset\n",
177 | "# we will be using the training dataset to understand how people are reacting\n",
178 | "data = pd.read_csv(\"nlp_train.csv\")\n",
179 | "data.head()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 3,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/html": [
190 | "\n",
191 | "\n",
204 | "
\n",
205 | " \n",
206 | " \n",
207 | " | \n",
208 | " id | \n",
209 | " anger | \n",
210 | " anticipation | \n",
211 | " disgust | \n",
212 | " fear | \n",
213 | " joy | \n",
214 | " love | \n",
215 | " optimism | \n",
216 | " pessimism | \n",
217 | " sadness | \n",
218 | " surprise | \n",
219 | " trust | \n",
220 | " neutral | \n",
221 | "
\n",
222 | " \n",
223 | " \n",
224 | " \n",
225 | " count | \n",
226 | " 1493.000000 | \n",
227 | " 1493.000000 | \n",
228 | " 1493.000000 | \n",
229 | " 1493.000000 | \n",
230 | " 1493.000000 | \n",
231 | " 1493.000000 | \n",
232 | " 1493.000000 | \n",
233 | " 1493.000000 | \n",
234 | " 1493.000000 | \n",
235 | " 1493.000000 | \n",
236 | " 1493.000000 | \n",
237 | " 1493.000000 | \n",
238 | " 1493.000000 | \n",
239 | "
\n",
240 | " \n",
241 | " mean | \n",
242 | " 746.000000 | \n",
243 | " 0.364367 | \n",
244 | " 0.503014 | \n",
245 | " 0.454119 | \n",
246 | " 0.454119 | \n",
247 | " 0.123912 | \n",
248 | " 0.092431 | \n",
249 | " 0.328198 | \n",
250 | " 0.432686 | \n",
251 | " 0.277294 | \n",
252 | " 0.108506 | \n",
253 | " 0.168118 | \n",
254 | " 0.113195 | \n",
255 | "
\n",
256 | " \n",
257 | " std | \n",
258 | " 431.136289 | \n",
259 | " 0.481413 | \n",
260 | " 0.500158 | \n",
261 | " 0.498057 | \n",
262 | " 0.498057 | \n",
263 | " 0.329591 | \n",
264 | " 0.289731 | \n",
265 | " 0.469715 | \n",
266 | " 0.495614 | \n",
267 | " 0.447813 | \n",
268 | " 0.311123 | \n",
269 | " 0.374096 | \n",
270 | " 0.316937 | \n",
271 | "
\n",
272 | " \n",
273 | " min | \n",
274 | " 0.000000 | \n",
275 | " 0.000000 | \n",
276 | " 0.000000 | \n",
277 | " 0.000000 | \n",
278 | " 0.000000 | \n",
279 | " 0.000000 | \n",
280 | " 0.000000 | \n",
281 | " 0.000000 | \n",
282 | " 0.000000 | \n",
283 | " 0.000000 | \n",
284 | " 0.000000 | \n",
285 | " 0.000000 | \n",
286 | " 0.000000 | \n",
287 | "
\n",
288 | " \n",
289 | " 25% | \n",
290 | " 373.000000 | \n",
291 | " 0.000000 | \n",
292 | " 0.000000 | \n",
293 | " 0.000000 | \n",
294 | " 0.000000 | \n",
295 | " 0.000000 | \n",
296 | " 0.000000 | \n",
297 | " 0.000000 | \n",
298 | " 0.000000 | \n",
299 | " 0.000000 | \n",
300 | " 0.000000 | \n",
301 | " 0.000000 | \n",
302 | " 0.000000 | \n",
303 | "
\n",
304 | " \n",
305 | " 50% | \n",
306 | " 746.000000 | \n",
307 | " 0.000000 | \n",
308 | " 1.000000 | \n",
309 | " 0.000000 | \n",
310 | " 0.000000 | \n",
311 | " 0.000000 | \n",
312 | " 0.000000 | \n",
313 | " 0.000000 | \n",
314 | " 0.000000 | \n",
315 | " 0.000000 | \n",
316 | " 0.000000 | \n",
317 | " 0.000000 | \n",
318 | " 0.000000 | \n",
319 | "
\n",
320 | " \n",
321 | " 75% | \n",
322 | " 1119.000000 | \n",
323 | " 1.000000 | \n",
324 | " 1.000000 | \n",
325 | " 1.000000 | \n",
326 | " 1.000000 | \n",
327 | " 0.000000 | \n",
328 | " 0.000000 | \n",
329 | " 1.000000 | \n",
330 | " 1.000000 | \n",
331 | " 1.000000 | \n",
332 | " 0.000000 | \n",
333 | " 0.000000 | \n",
334 | " 0.000000 | \n",
335 | "
\n",
336 | " \n",
337 | " max | \n",
338 | " 1492.000000 | \n",
339 | " 1.000000 | \n",
340 | " 1.000000 | \n",
341 | " 1.000000 | \n",
342 | " 1.000000 | \n",
343 | " 1.000000 | \n",
344 | " 1.000000 | \n",
345 | " 1.000000 | \n",
346 | " 1.000000 | \n",
347 | " 1.000000 | \n",
348 | " 1.000000 | \n",
349 | " 1.000000 | \n",
350 | " 1.000000 | \n",
351 | "
\n",
352 | " \n",
353 | "
\n",
354 | "
"
355 | ],
356 | "text/plain": [
357 | " id anger anticipation disgust fear \\\n",
358 | "count 1493.000000 1493.000000 1493.000000 1493.000000 1493.000000 \n",
359 | "mean 746.000000 0.364367 0.503014 0.454119 0.454119 \n",
360 | "std 431.136289 0.481413 0.500158 0.498057 0.498057 \n",
361 | "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
362 | "25% 373.000000 0.000000 0.000000 0.000000 0.000000 \n",
363 | "50% 746.000000 0.000000 1.000000 0.000000 0.000000 \n",
364 | "75% 1119.000000 1.000000 1.000000 1.000000 1.000000 \n",
365 | "max 1492.000000 1.000000 1.000000 1.000000 1.000000 \n",
366 | "\n",
367 | " joy love optimism pessimism sadness \\\n",
368 | "count 1493.000000 1493.000000 1493.000000 1493.000000 1493.000000 \n",
369 | "mean 0.123912 0.092431 0.328198 0.432686 0.277294 \n",
370 | "std 0.329591 0.289731 0.469715 0.495614 0.447813 \n",
371 | "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
372 | "25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
373 | "50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
374 | "75% 0.000000 0.000000 1.000000 1.000000 1.000000 \n",
375 | "max 1.000000 1.000000 1.000000 1.000000 1.000000 \n",
376 | "\n",
377 | " surprise trust neutral \n",
378 | "count 1493.000000 1493.000000 1493.000000 \n",
379 | "mean 0.108506 0.168118 0.113195 \n",
380 | "std 0.311123 0.374096 0.316937 \n",
381 | "min 0.000000 0.000000 0.000000 \n",
382 | "25% 0.000000 0.000000 0.000000 \n",
383 | "50% 0.000000 0.000000 0.000000 \n",
384 | "75% 0.000000 0.000000 0.000000 \n",
385 | "max 1.000000 1.000000 1.000000 "
386 | ]
387 | },
388 | "execution_count": 3,
389 | "metadata": {},
390 | "output_type": "execute_result"
391 | }
392 | ],
393 | "source": [
394 | "#basic stats\n",
395 | "data.describe()"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 11,
401 | "metadata": {},
402 | "outputs": [
403 | {
404 | "data": {
405 | "text/plain": [
406 | "0 949\n",
407 | "1 544\n",
408 | "Name: anger, dtype: int64"
409 | ]
410 | },
411 | "execution_count": 11,
412 | "metadata": {},
413 | "output_type": "execute_result"
414 | }
415 | ],
416 | "source": [
417 | "data['anger'].value_counts()"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 15,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": []
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 17,
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "freqs = {\"anger\":data['anger'].value_counts()[1],\n",
434 | "\"anticipation\": data['anticipation'].value_counts()[1],\n",
435 | "\"disgust\":data['disgust'].value_counts()[1],\n",
436 | "\"fear\":data['fear'].value_counts()[1],\n",
437 | "\"joy\":data['joy'].value_counts()[1],\n",
438 | "\"love\":data['love'].value_counts()[1],\n",
439 | "\"optimism\":data['optimism'].value_counts()[1],\n",
440 | "\"pessimism\":data['pessimism'].value_counts()[1],\n",
441 | "\"sadness\":data['sadness'].value_counts()[1],\n",
442 | "\"surprise\":data['surprise'].value_counts()[1],\n",
443 | "\"trust\":data['trust'].value_counts()[1],\n",
444 | "\"neutral\":data['neutral'].value_counts()[1]}"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 18,
450 | "metadata": {},
451 | "outputs": [
452 | {
453 | "data": {
454 | "text/plain": [
455 | "{'anger': 544,\n",
456 | " 'anticipation': 751,\n",
457 | " 'disgust': 678,\n",
458 | " 'fear': 678,\n",
459 | " 'joy': 185,\n",
460 | " 'love': 138,\n",
461 | " 'optimism': 490,\n",
462 | " 'pessimism': 646,\n",
463 | " 'sadness': 414,\n",
464 | " 'surprise': 162,\n",
465 | " 'trust': 251,\n",
466 | " 'neutral': 169}"
467 | ]
468 | },
469 | "execution_count": 18,
470 | "metadata": {},
471 | "output_type": "execute_result"
472 | }
473 | ],
474 | "source": [
475 | "freqs"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 85,
481 | "metadata": {},
482 | "outputs": [
483 | {
484 | "data": {
485 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAw4AAAFUCAYAAACNyGv1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nOzdebwd8/348ddbdmIJWaS2WG6pWILU7ieklpbal9RS0bTRfoNqvy2huLkUUY1W+y0VS4MGDbVTa4tWEYmoIpaUkFQksbS2CJHP74+Ze5177rk599zcTfJ6Ph7ncc75zGdm3nPOnDnznvnMZyKlhCRJkiQtyQrtHYAkSZKkjs/EQZIkSVJZJg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iCp3UTEmIhIETGgvWMpJyJWyON9OSIWRYR9WatdRcTMiHiwveNoTEQMyH/fY9o7Fkktw8RB+pyLiCH5n3Njj0XtHN8By8iOwzFANfAXYARwdPuG0zwRMTQiJkXErIhYGBHvRcTUiDgnItYuUb9vRPwsIp6NiA/y+tMi4syIWKWo7g35OjdoCfOPiHglIv4TET3yspkR8UxRvQlF6/FHETE3Ih7OY92gGct+ah7jy/k0Z5apf2hE/L1guf8aEV+rYH5jipZhcUS8HREPRMR+lcbf0iJieESc1M4xDIglb7/qPVo5lsMi4rTWnIf0ede5vQOQ1GKuA+4qUb64rQMpcgDZTveYEsN+CowFFrZlQM20B/Bf4Nvpc3jnzIhYAbgU+DbwKnAt8BLQFdgGGAV8B+hbMM4OwO3AKsBE4FdAJ2A3su/z2IjYK6X0Yj7KFcAhwLHA9xsJZTdgAHBpSmlBE0L/HvA+2f9Vb2Bb4H+BH0XEqSmlC5swjVrnAm8DTwKrLaliRJxCtm5OA84EEnAUcEdEHJ1SmljBfM8EXsmXYUPgOODWiDiqwum0tOFk38Uv2zGG+TRMwg8CDiT7vqa3YSyHAXvn85VUgomDtOx4MqX0+/YOohIppUVAu54RqcCawH+akjRExMoppffaIKZKjCFLGq4DhqeUPi4cGBH/S3ZGpfb9msCtZP8TO6WUniiofnF+5P0W4LaI2CpPAu4FZgFHRsSPi+eROzZ/vqKJcd+YUnqzKNZ1gTuAcRHx75TSH5o4rQ1TSi/n03gG6FmqUkT0A84CngG2Syl9kpf/mizp+HVE3J5SereJ8/1TSmlKwfRvBJ4CRpMlZMutlNIHQL3tVkRsRJY43JdSerA94pJUmk2VpOVIYZvj/LT8UxGxICJmRMSxeZ11I+LGvEnFexHx+4hYucS0toiImyPirbwZyXMRcXJEdCqo8yDZ2QaKmhwMz8tKXuOQx3lN3jRlYUT8KyLOjYgVi+rVjr9xPnx2Xv8fpZqURMQ3I2Jy3kzmg7zJysSI6LOEz2xI3kRiN2C9gmWYULuMeVObDWo/N+DdgvFXiojz8mVYGBFvRMTVEbFeqfnkzUf+JyJeyD/Xf0bEPnmdzSPi7oh4N//cfxURXRqLvWDafYEfk51p+FapHfqU0n9SSj8oKPox0Ac4tShpqK1/F9mR6o3Jmm6RUloMTADWABo0xYmsadNBwDOlptlUKaXXyM5sLAbOqWC8l5tYdUeyMzETa5OGfPxPyM7U9AL2b3LADeP4B/AmUFVqeEQcHhF/y39/H0bE4xFxSFOmHRF7RsQf8nV7Qb6u3xsRuxbVmwnsSv11OkXEkII6VfnvcE5EfJyv5xdExEol5rtzRDySz3NuRPwfjSRmLSEiVsx//9Pz38nb+fZoYEGdiIg789i3LRr/oHx5L87fTwEOBlYq+jya9LlLywvPOEjLjhUjoneJ8o9LHBndF/gucDFZ040RwJUR8THZafo/A6cBXwa+BXxEdrQagIgYDDwEfAL8BngD+DpwPrAlcGRe9RyyAxS7UL85wt8bW4h8h3oysCpw
CfAiMAQ4FdgpIobmZyoKXZXH8nOyHb6TgFsi4osppZn5dI/K6/2VrOnIAmBd4KtkzXPmNxLS9Dz2n5A1landuf5XQZ2e+efxSF6vbz7PzsA9wE7AjcA4sp3F7wF7RsTglNLsovmNItsxvZzscz8xX5ZDgcvIzhjcAuwJnADMI2vytST7AN2Bq1NKH5WpW+tg4GOyz6wxl5ElGAcD/5eX/Q44nezMwo1F9YcBK9L0sw2NSim9GBF/BXaNiI1TSi8s7TQLdMufPywxrLZse+Ca5kw8InqRfcfzSgz7Kdk6dDdwBllydCBwQ0Qcn1L6TZnJDwdWB64GZgNrkf12H4iI3VJKf83rnQScR/11GvKmQRGxDdl24D9kTdz+TfbbPpHsd7hrwZmY7YD7gffItgH/Ifuur27aJ1KZiOiexzaILFG9iCxZPQ54LCJ2SCk9k1JKEXEM8A/g+ogYlFJ6N7IzVpeTnVH6YT7ZM4AaYAsKtnVk2yJJtVJKPnz4+Bw/yHaq0xIedxTUHZCXfQCsV1Deh2wndTHww6Lp30S2A9mzoOwRsiZGWxSUBTApn/7QgvIJ2aamZOxj8voDCsom5mVfK6p7QV4+osT4dwBRUP7lvPy8ouV4F+jczM/5QWBmI+UJ+GmJYd/Jh/2sqHyfvPyaEt/jv4FVC8q3yMsXAwcVTWcqMKcJsY/Lp3FQubp5/ZXz+k83oe67wJtFZQ/k68cXisofJbuepXdR+UyysxCFZRPyGHovYd6/yut8vRnf5zOlvs982Ob5dG8pMeyWfNhtTZhH7fo5lGwHfU2yJPIvjawXW+fl5zYy33eBlYs+tweL6q1UYtx+ZGc47mrKOp0P+wfwfOH88vID8xiHF5T9nWwb8cWCsq5kO90JGFPhd1P7uQ1pZPgZ+fq1S1F5b2AuBdu8vHwI8ClwPdkB00fIEsBNi+rdCLxf6brkw8fy9LCpkrTsGE92AW/x4ycl6t6SUnq19k1KaT7wAtnOafERzb8CXciSjtpmLzuS7Tg9XTCNxGcXFR7YnAWI7ALe/YBpKWsKU+g8Pjv6WuyifP61sTxBdvSzsCnIf8mOdu8TEdGc+Mr4eYmyA8liPq+wMKV0J1kb9/3zZS40IaX034K6T5PtML6eUrqpqO7fgDUjolyTkNrej5raJr+2/n+XWOuzaa5aVHYF2UXUdWeZImITsqP0t6WiaxaWQu3yrLLEWhVKKf0TuI/s+/lZRHwpf5xPdoYKsnWpqe4nO6M1h+w724HsyHxxDz5Hku0wXxURvQsfwG1kCd0OZWL/oPZ1RPSMiDXIdpofB7ZrSrARsTlZwnot0K0ojr+RHXjYM6/bN4/p1vTZRfKkrDncL5oyv2Y4iuz3M70oNsiSsqGFv6uUXSdxDnA4WbK0I/D9lNJzrRSftMyyqZK07HgppXR/E+uWauv9DtnR6+Iejt7Jn9fIn9fPn58tMY3nyHaUK+4qM9eHrNlPg2mnlN6OiDmNTLvU8rzNZzFDltT8P7Ijt29FxEPAn4A/pKW/kHl+Suk/JcrXJ9vhf6fEsGfJmlr0pn6Tlca+m1mNlEO2nO8vIb7aHewG16qUqV+cEJSyCg0TjJvImqscS7aDDFmTN4ArmxhDU9RLiCJiVaBHUZ35KaVPmzHtw8mas/yIrDkWZEf4R5E10WpqEkY+zotkycZuZM19eqWGTe6+RHbm7vklTKvfkmYUERuS7STvRcOeo5raG9iX8uea/LGkOGp/j6VibvEd88iuoaoi+5waa14I2bpb+LurAb5Gdsbn5pTSZS0dm7Q8MHGQlk+N7UgtaQcrip5bQ3On3VjcddNLKb0UEZuSNRsZSnZh6GVATUT8v5TSvxqZRlOUagtfb/4VWJrvpjG190jYCri5XAAppfci4jVg44hYMaVUcvki6/1mZbKjuIXjfxQR1wL/ExE7kh3tPpqszf295eZfgS3y59rrGy4ivxi/wPpkO/wVyZO9gyPrYemLZInZP8i664Ql79wX
m5w+61XptoiYC5wXEdNSSr8tqBdkO/dfpfHvu1TCno2cnXl6GFiJ7ML1f5KdeVtMdo3Q7k2Mt3Z9Gkd2rUUp7xTVLZWUtMa2IvLHEzQ8Y1OoOJGu4rOEaOOI6JGa1h2wpAImDpIqVXtEfGCJYZuQXQxdeNS8knsezCPb0Wkw7fyC0v5kTRSaJT+bclf+ILKel+4ku0ByVHOnuwT/AvaOiNVKnJHYlPz6gFaYb7E7ya5hOToizilxVqmUm8guoP0m8NtG6ny7oG6xK4D/ITvrsDpZ+/5zmnn0v4GI+CLZRfcvFTSR+RlFXXuSXbjfbCmluWTt5mvnW9tbV6l7pjTVOLIOCX4aEdemzzoveIksMXktpdSc+xcMBb5A1nPW7woH5BddF2vst/lS/vxpE85i1ibcXyoxrFTZUkkpLYqIV4A1mnqGNSK6AX8g60Dhh8CFZInVccWTb8lYpWWR1zhIqkhKaR7ZxZBfj4jNasvz6wZOzd8WHtV+Px++ehOmvZjshmNbRcTeRYNHk22zyh4xL6WRHqeezJ/LxtZMt5DFPLoolq+SHf2/LV/mVpV/ZxeQXadyeUR0La4TEatERGGb9AuAt8iOjG9dov5eZDthL1Kil6SU0pNkSd7hwPFkO2W/K67XHHmvODeQfbZ11/CklJ5LKd1f9GhqL1JNme9gsmTpoZTS35o7nZT1RnQuWROzEwsG1fbSdG4UdGtcMP++xWVFapOyekf6I2JPSl/f8D7Qq8Q1P9PIzlJ9N0rcoTsiOtf+nvN16zGy60G+WFCnK/V7a2pJVwMbRMR3Sw3MzxIVGkd2duo7KaVfkCXCIyPi4KJ675P1TtcNSSV5xkFadmyddzlayi0ppSW1ga/U98m6H/1rRNR2x7ovWbvqa1NKDxTUfYxsx/HiiLiT7Kjf4ymlVxqZ9mlkF3XfkvexPoPs2oTDyZphLKl70CW5NyL+m09jFln77+HkvRs1c5rlTCBrOnNKZPeqeBjYiOxI/FyW3NSipY0hO2PzbWDniLie7LPtSnatxaFkPeP8ACCl9HpEHEB2E7hHI2Ii2XfZiayXmkOA14D9GmvKRJZQ/JpsvXiwmc3BDomI2jtHr0F25+j9yJKGk1JKNzR1QhFxNFB7/4w+QNeIOD1//2pK6ZqCumeTNW+ZTHYNx9Zk12n8m4Z3Om6Oa8i6Bf5hRPw6pfTflNITEVFN1h7/qYi4AXid7HvbhqyNfoOkr8DfyH6L4/L1bTbZd3s0WbOlzYvqP0b2u/2/iPg7WeLx55TSvPyz+jPwdERcSdZEakWy9fcgsoMEE/Lp/JCsudoj+fagtjvW1trHGEvW7OqSPAl/mOyC7XXJth213UMTEfuTnU0cX7Cu/ADYGbgsIp5I2X1BIPs8hgPjI+J+sm3V31LDLpOl5Vd7d+vkw4ePpXtQvjvWBGyU1x1AI90j0nh3o8Mp0TUiWZ/ut5BdhLyQrP/3k4FORfVWIOtxaDbZjkldV46U6I41L1+fbMdqHtnO7MtkR2hXLKpXcvx82EwKuqok6xr1PrKdio/Jeri5C9itiZ9zY59PyfKC4SuR9ar0cj7fefmyrdfI9zi83LI0ZfmXEM9XyI7Wz87jeY+sW9ezgf4l6q+Zf3/Tya7leJ/sTEI1Bd3GNjKvXmT3y0jA0UuoN5PGu2OtfSzMP7u/kt23YoNm/FYepPHfyINFdQ8i6z72bbJmXi+SXei9WgXzq/1+Bjcy/Lh8eHVR+T5k9/+o/W3NIruQ/3vl1guyI+t3k12D8F6+zLtQolvkfN28giyJrf1tDikYvh7Z0fmZ+bryVr6unAesUzSt/0d2JvKj/Hv6DbAZrdAda16nK9mF608WrJcvkh1Y2C2vs3Ye8zNAj6LxB+bj/ZV8m0XWe9yvyZK12s/jkErXMx8+luVHpGSTPkmSJElL5jUOkiRJksoycZAkSZJUlomDJEmSpLJMHCRJkiSVZeIgSZIkqaxl5j4OvXv3TgMGDGjvMCRJkqTP
talTp76ZUupTXL7MJA4DBgxgypQp7R2GJEmS9LkWEa+WKrepkiRJkqSyTBwkSZIklWXiIEmSJKksEwdJkiRJZZk4SJIkSSrLxEGSJElSWSYOkiRJksoycZAkSZJUlomDJEmSpLJMHCRJkiSVZeIgSZIkqazO7R2APp/GTntzqacxeqveLRCJJEmS2oJnHCRJkiSVZeIgSZIkqSwTB0mSJEllmThIkiRJKsvEQZIkSVJZJg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iBJkpSLiLKPBx98sL3DrPPhhx8yZswYnnnmmSaPM2XKFA455BD69etH165dWXvttfn2t7/NCy+8UFdnzTXX5PTTT2+NkCuy/fbb133uXbt2pV+/fnzlK1/h0ksv5eOPP65X97e//S0RwaJFi5o07eeee44xY8bw/vvvN6n+888/T0Rw//3315W15Od08cUXc8cddzQo7yjfBUDn9g5AkiSpo3j00UfrXi9YsIDdd9+d008/nX322aeufNNNN22P0Er68MMPqampYZNNNmGzzTYrW/+6667jm9/8JkOHDuVXv/oV/fv3Z9asWfz+979nyJAhzJkzpw2irsxee+3FmDFj+PTTT5k/fz5//vOf+cEPfsCVV17J/fffz8orrwzAQQcdxKBBg+jcuWm7t8899xw1NTV897vfpWfPnmXrDxgwgEcffbTVvv+LL76YnXfemX333bde+V133UXfvn1bZZ6VMnGQJEnKbb/99nWva49Eb7jhhvXKl8aCBQvo0aNHi0yrUq+++iojRozg2GOPZfz48fWGHXnkkSWPdncEvXv3rvf5H3DAARxzzDHsvPPOnHzyyVxyySUA9O3bt9V2sD/66CO6d+/eYutBJbbeeus2n2djbKokSZJUoVmzZnHMMcew/vrr06NHDzbeeGNqamr45JNP6urUNm2ZNGkSRxxxBKuuuiqHHnookCUQ3/nOd1hllVXo3bs3p512Gueffz7du3evN5/58+czYsQI+vbtS48ePdhll12YOnUqkO3M9unTB4BvfOMbdU163njjjZIxX3rppQBccMEFJYcXH+ku9PDDD7PvvvvSv39/evbsydZbb80NN9xQr85bb73F8OHD6d+/P927d2e99dZj1KhRdcNnzpzJQQcdRJ8+fejRowdVVVWcffbZjc5zSbbZZhuOO+44JkyYwIIFC4CGTZVSSpx11llssMEGdO/enTXXXJOvfe1rvPXWW9x9991130X//v2JCDbZZJN603nyySfZZZdd6NGjB7/+9a9LNlWqnc+ZZ55Jv379WHnllTnmmGPqNX9qrAlVYROk7bffnmeffZZLL7207nu8/vrrG9SrNXHiRAYOHEi3bt1Yd911687IFM9z+vTp7L777qy44opsuummS50ctskZh4jYGPhDQdEGwJnA1Xn5AGAmcFhK6Z18nFOBEcCnwIkppXvaIlZJkqRy5s2bx5prrskvf/lLVlttNaZPn86YMWN4++23ueiii+rVPemkkzjssMP44x//WNeM5qSTTuLaa6/lvPPOo6qqissuu4wnnnii3ngLFixgt912Y+HChVx44YWsscYa/PrXv2bo0KHMmDGDNdZYg7vvvpu9996bs88+m6985SsArLHGGiVjfuihh9hhhx1YddVVK17eV199lSFDhjBq1Ci6du3Kww8/zDe+8Q06d+7MgQceCMAJJ5zA008/za9+9Sv69u3La6+9Vq/p1xFHHEHnzp25/PLLWWWVVfjXv/7Fyy+/XHEstfbYYw8uuugi/vGPf5Q8E3DZZZcxbtw4fvazn/GlL32J+fPnc//997NgwQJ22GEHzj33XE477TTuvPNOVl999QZngg4//HBGjRrFWWedxeqrr95oHBMmTOBLX/oSV155Ja+99hqnnHIKixcv5pprrmnyslxxxRXst99+bLnllpx88skAVFVVlax7++23c9RRRzFixAguvPBCpk6dSnV1Nf/5z3/4
5S9/Wa/usGHDOO644xg9ejTjxo3j0EMPZebMmfTr16/JsRVqk8QhpfQCMAggIjoB/wZuBkYDD6SUxkbE6Pz9KRGxKTAMGAh8Abg/Ir6YUvq05AwkSZLa0DbbbMM222wDZEecd9ppJ7p27cr3v/99LrzwQjp16lRXd9ddd623Q/fGG2/wu9/9jnHjxnHCCScAWTv+jTfeuN48rrzySv71r38xffp0BgwYAMDuu+/ORhttxEUXXcTZZ59dF8NGG21UthnNv//9b3bbbbdmLe/RRx9d93rx4sXsuuuuzJw5k8suu6wucZg8eTKnnHJK3ZH8wvFSSjzxxBPcdddd7LHHHgDNjqXW2muvDcDcuXNLDp88eTL77rsvxx13XF3ZwQcfXPe6dsd86623Zs0112ww/o9+9KN64z7//PMl57No0SLuuOOOurNFXbt2ZeTIkYwZM4YNN9ywScsycOBAevToQd++fct+j2eccQZ77703l19+OZCtO4sWLeLss8/mtNNOq9dc65RTTuGII44AYPPNN+cLX/gCf/rTnxg+fHiT4irWHk2VhgL/Sim9CuwPXJWXXwUckL/eH7g+pbQwpfQKMAPYts0jlSRJKmHx4sVccMEFbLLJJvTo0YMuXbowYsQI3n///QYXGBdeWA3w1FNP8cknn7DffvvVla2wwgoN6t1///1st912rL322ixatIhFixbRqVMndtllF6ZMmdKsuCOiWeO99dZbjBo1inXXXZeuXbvSpUsXrr76al588cW6OoMGDeK8887jt7/9LTNmzGgw3y233JIf//jHXH311cyePbtZcRRKKS1x+KBBg7jllls466yzmDJlCosXL65o+sXfR2P23nvvek3MDjroIBYvXlzXpKwlLVy4kKeffrpecgbZ2ZFFixbx+OOP1yvfc889617379+fXr16LdVn3x6JwzDguvx1v5TSHID8uTZFWguYVTDO7LxMkiSp3Z1//vmcdtppHH744dx+++1MnjyZX/ziF0B27UGh4mYhtdcg1F6fUKv4/ZtvvslDDz1Ely5d6j2uu+46Zs2aRaXWWmstXnvttYrHg6yZ0S233MKpp57KfffdxxNPPMFRRx1Vb1nHjx/P3nvvzZlnnklVVRWbbLIJN910U93wm266ic0335wTTzyRddZZh2222YaHH364WfFAdgYFGn6+tb73ve9RXV3NxIkT+fKXv8yaa65JTU1NkxOIpjbnKb4gu1evXnTp0qVVeqh64403SCk1iK32/dtvv12vfLXVVqv3vmvXrg3Wz0q0aeIQEV2B/YAbylUtUdYgrYyIkRExJSKmzJ8/vyVClCRJKuuGG27gyCOPpKamhj322IMvf/nLjfaWVHyUv7ZZTPG+S/H71VdfnZ122oknnniiweMPf/gDlRoyZAiPPvoo7777bkXjvfvuu9x7772cc845fO9732O33XZj8ODBDS72XX311bn44ouZO3cu06ZNY8stt+Swww6rO/uw7rrrcs011/DWW2/xyCOPsNpqq7HvvvtWHE+te++9l+7duzNo0KCSwzt16sTJJ5/MCy+8wMyZMznhhBOoqanh6quvbtL0m3p2Zt68efXev/POO3zyySf0798foO5sROF9J1JK/Pe//23S9AutueaaRESDedY211rStRgtoa3POHwVeDKlVNsYbW5E9AfIn2s/hdnAOgXjrQ28XjyxlNL4lNLglNLg4ixdkiSptSxYsIBu3brVK5s4cWKTxt1yyy3p0qULt956a13Z4sWLG/R4M3ToUF544QU22GADBg8eXO8xcOBAIDuCDA3PcpQycuRIUkqccsopJYffeeedJctrey0qXN533nmHu+66q2T9iGDQoEGMHTuWTz/9tF5zJsh26HfccUdOP/103nvvvWY1nZk6dSrjx4/n2GOPbdATVSnrrbceZ5xxBuussw7PPfccUNlntyR33313vWncdNNNrLDCCnXXn9ReizF9+vS6Og8//HCD+TblbEC3
bt3YcsstG/RoNWnSJDp37sx22223VMtSTlvfx+EbfNZMCeA24BhgbP58a0H5tRFxIdnF0VXA5DaMU5IkqVF77LEHV1xxBVtvvTXrrbceV111VZN3gPv378/w4cM59dRTiQiqqqoYP348CxcuZIUVPjum++1vf5vLLruMIUOG8MMf/pD111+fN998k0cffZT111+fUaNGscoqq9C/f3+uv/56qqqq6NatW6M3QVtvvfW44oor+OY3v8mrr75a13Xq7Nmzufbaa5k6dSqvv97gOC39+vVj880358wzz6R79+4sXryYc889lzXWWKPeUfTtttuOYcOGMXDgQFJKXHLJJayyyipss802zJ07l4MPPpijjjqKqqoqPvzwQy644ALWXnvtRnsPqvXmm2/y2GOPsXjx4robwF1++eVsscUWnH/++Y2Od+yxx7LWWmux7bbbssoqq3Dvvfcya9asuouya7tfvfjiizn44IPp2bNnXUJWic6dO7Pvvvvygx/8gFmzZnHyySczbNiwugujd955Z/r27cuoUaOorq5m3rx5jBs3jpVWWqnedDbZZBP+8pe/cN9999GrVy823HBDevXq1WB+Z511Fvvttx8jR47kkEMO4cknn+Tss89m1KhRrX6juDZLHCJiRWAP4LiC4rHApIgYAbwGHAqQUno2IiYBzwGLgFH2qCRJkjqKn/70p7zzzjuMHj2aFVZYgUMPPZSf//zn9XrtWZJf/vKXLF68mJ/85Cd07tyZ4cOHs/HGG/O73/2urs6KK67IQw89xBlnnMFPfvIT5s+fT79+/dh+++057LDD6upddtllnHLKKQwdOpSFCxcyZ86ckr0EQXa/h4022oixY8dy/PHH884779CvXz9233137r777kbjnTRpEscddxxHHnkkffr04aSTTmLu3Ln8/ve/r6uzww47cMUVVzBz5ky6dOnC1ltvzT333EO/fv344IMP2HjjjbnwwguZNWsWPXv2ZMcdd+TSSy+lS5cuS/ys7rnnHu655x46d+5Mr1692GKLLRg3bhzf+ta36s4alLLjjjty5ZVX8pvf/IaPP/6YqqoqJkyYwFe/+lUAvvjFL3LuuedyySWXMG7cOKqqqhrtOWlJansoGj58OAsWLPcbmdEAACAASURBVODAAw/kN7/5Td3w7t27c/PNN3P88cdz8MEHs+mmm3L55ZfXuzgeYMyYMRx33HEcfPDBvPfee1x33XUMGzaswfy+/vWvc80113DuuecyYcIE+vXrx2mnncYZZ5xRceyVinJXpH9eDB48ODW3hwFVbuy0N5d6GqO36t0CkUiStGzYeeedWWmllbjnHm9dpfYVEVNTSoOLy9u6qZIkSdJy79577+Wpp55iq622YuHChUycOJFHHnmE22+/vb1Dkxpl4iBJktTGevbsyY033shPf/pTFi5cyCabbMK1117Lvvvu296hSY0ycZCWoCWaZEHHb5a1vCynJHUUO+64I5Mn2++LPl/a4wZwkiRJkj5nTBwkSZIklWXiIEmSJKksEwdJkiRJZZk4SJIkSSrLxEGSJElSWSYOkiRJksoycZAkSZJUlomDJEmSpLJMHCRJkiSV1bm9A5AkSeWNnfbmUk9j9Fa9WyASScsrzzhIkiRJKsvEQZIkSVJZJg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iBJkiSpLBMHSZIkSWWZOEiSJEkqy8RBkiRJUlkmDpIkSZLKarPEISJWi4gbI+L5iJgeETtExOoRcV9EvJQ/9yqof2pEzIiIFyJir7aKU5IkSVJDbXnG4SLg7pTSJsCWwHRgNPBASqkKeCB/T0RsCgwDBgJ7AxdHRKc2jFWSJElSgTZJHCJiFeD/AVcApJQ+Tin9B9gfuCqvdhVwQP56f+D6lNLClNIrwAxg27aIVZIkSVJDbXXGYQNgPvC7iJgWEZdHxEpAv5TSHID8uW9efy1gVsH4s/MySZIkSe2grRKHzsDWwCUp
pa2AD8ibJTUiSpSlBpUiRkbElIiYMn/+/JaJVJIkSVIDbZU4zAZmp5Qez9/fSJZIzI2I/gD587yC+usUjL828HrxRFNK41NKg1NKg/v06dNqwUuSJEnLuzZJHFJKbwCzImLjvGgo8BxwG3BMXnYMcGv++jZgWER0i4j1gSpgclvEKkmSJKmhzm04rxOAiRHRFXgZOJYscZkUESOA14BDAVJKz0bEJLLkYhEwKqX0aRvGKkmSJKlAmyUOKaWngMElBg1tpP45wDmtGpQkSZKkJvHO0ZIkSZLKMnGQJEmSVJaJgyRJkqSyTBwkSZIklWXiIEmSJKksEwdJkiRJZZk4SJIkSSrLxEGSJElSWSYOkiRJksoycZAkSZJUlomDJEmSpLJMHCRJkiSVZeIgSZIkqSwTB0mSJElldW7vAJYlY6e92SLTGb1V7xaZjiRJktRSPOMgSZIkqSwTB0mSJEllmThIkiRJKsvEQZIkSVJZJg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iBJkiSpLBMHSZIkSWWZOEiSJEkqq80Sh4iYGRH/jIinImJKXrZ6RNwXES/lz70K6p8aETMi4oWI2Kut4pQkSZLUUFufcdgtpTQopTQ4fz8aeCClVAU8kL8nIjYFhgEDgb2BiyOiUxvHKkmSJCnX3k2V9geuyl9fBRxQUH59SmlhSukVYAawbTvEJ0mSJIm2TRwScG9ETI2IkXlZv5TSHID8uW9evhYwq2Dc2XlZPRExMiKmRMSU+fPnt2LokiRJ0vKtcxvOa6eU0usR0Re4LyKeX0LdKFGWGhSkNB4YDzB48OAGwyVJkiS1jDY745BSej1/ngfcTNb0aG5E9AfIn+fl1WcD6xSMvjbwelvFKkmSJKm+NkkcImKliFi59jWwJ/AMcBtwTF7tGODW/PVtwLCI6BYR6wNVwOS2iFWSJElSQ23VVKkfcHNE1M7z2pTS3RHxBDApIkYArwGHAqSUno2IScBzwCJgVErp0zaKVZL0OTF22pstMp3RW/VukelI0rKsyYlD1ES3VJ0WNmcmKaWXgS1LlL8FDG1knHOAc5ozP0mSJEktq5KmSnOiJi6Kmtis1aKRJEmS1CFVkjgMA9YEpkRNPBY18a2oiRVbKS5JkiRJHUiTE4dUne5N1elwsvsp/AH4AdlZiN9GTWzTWgFKkiRJan8V96qUqtNbqTr9AjiO7I7OI4GHoyb+HjXR4DoGSZIkSZ9/FfWqFDXRG/gmMALoDVxF1oRpDvC/wI1kXadKkiRJWoZU0qvSjcA+wKPA2cAfU3X6pGD4WcCPWjxCSZIkSe2ukjMOM4EtU3V6sdTAVJ1S1MQXWyQqSZIkSR1KJYnDWWQ3Y6uT96rUOVWndwFSdZrTgrFJkiRJ6iAquTj6DmCLorItgNtaLhxJkiRJHVElicNmwOSisidomExIkiRJWsZUkjgsBIpv+LYiRc2XJEmSJC17KkkcHgHOipqIgrJq4O8tG5IkSZKkjqaSi6N/DPwZOCRq4l/ABkACdm+NwCRJkiR1HE1OHFJ1eiVqYjNgf2A9su5Zb0vV6YNWik2SJElSB1HRnaPzJOHaVopFkiRJUgdVUeIQNXEoMBhYubA8Vaf/acmgJEmSJHUsTU4coiZ+AxwB/AWweZIkSZK0HKnkjMPhwLapOr3UWsFIkiRJ6pgq6Y71I7ILoiVJkiQtZypJHH4O/KS1ApEkSZLUcVXSVOk4YMOoiROAuYUDUnXatEWjkiRJktShVJI4/LzVopAkSZLUoVVyA7grWjMQSZIkSR1Xpfdx2AE4GuifqtOBURNbASul6vS3VolOkiRJUodQyX0chgGXANcBQ/PiFYCzgN1bPjRJkrS8GTvtzRaZzuiterfIdCR9ppJelU4H9srvEv1pXvZPYLOmTiAiOkXEtIi4I3+/ekTcFxEv5c+9CuqeGhEzIuKFiNirgjglSZIktbBKEoe1UnWanL9O+fMioFMF0/g+ML3g
/WjggZRSFfBA/p6I2BQYBgwE9gYujohK5iNJkiSpBVWSOLwcNbF9Udn2QJPuJB0RawP7AJcXFO8PXJW/vgo4oKD8+pTSwpTSK8AMYNsKYpUkSZLUgipJHM4Fbo2aGA10iZr4Ptn1Dj9t4vi/BE4GFheU9UspzQHIn/vm5WsBswrqzc7LJEmSJLWDJicOqTr9ERgO7Ar8G/ga8N1Une4oN25E7AvMSylNbeLsolQIJaY7MiKmRMSU+fPnN3HSkiRJkipVUXesqTr9CfhTM+azE7BfRHwN6A6sEhG/B+ZGRP+U0pyI6A/My+vPBtYpGH9t4PUG8aQ0HhgPMHjw4AaJhSRJkqSWUUl3rI1eY1Bw0XTp4SmdCpwKEBFDgB+llI6KiAuAY4Cx+fOt+Si3AddGxIXAF4AqYInzkCRJktR6Kjnj8FiJstqj/M3t8WgsMCkiRgCvAYcCpJSejYhJwHNkPTeNSil92vhkJEmSJLWmShKHLkXv1wLOJjs70GQppQeBB/PXb/HZzeSK650DnFPJtCVJkiS1jiYnDqm6wRH/16ImTgCmAn9s0agkSZIkdSiVdMdaShc+60JVkiRJ0jKqkoujTy4qWgk4kOyOz5IkSZKWYZVc4/D1ovfvk/WCNK7lwpEkSZLUEVVyjcMurRmIJEmSpI5raa9xkCRJkrQcqOQah0/47L4NjUrVqetSRSRJkiSpw6nkGocfA98DfgG8AmwAnAj8Fniq5UOTJEmS1FFUkjgcDeyTqtOM2oKoifuB61N1uqjFI5MkSZLUYVRyjUMV8FpR2WvARi0XjiRJkqSOqJLEYRowNmqiG0DURFfgXOAfrRGYJEmSpI6jkqZK3wHuAEZGTcwF+gFzaHh/B0mSJEnLmEru4/Bi1MSmwE7AWsC/gUdSdVrUWsFJkiRJ6hgqOeNAniQ8FDXRJ1Wn+a0UkyRJkqQOppL7OKxI1hXr0WT3c1gpamJ/YLNUnc5ppfgkSZIkdQCVXBz9c2AdYDfgk7xsKnBESwclSZIkqWOpJHHYDzgiVafHgcUAqTrNJrveQZIkSdIyrJLEoRPwYWFB1MRKwPstGpEkSZKkDqeSxOHvwMlFZaOAh1ouHEmSJEkdUSW9Kv0v8OeoiaOAnlET04CeZNc8SJIkSVqGVXIfh5lREwPJrnVYH3gVuC1Vpw9aKzhJkiRJHUOTEoeoic7AH4HDU3X6Q+uGJEmSJKmjadI1DvmN37YHvEu0JEmStByq5OLoicB3WysQSZIkSR1XJRdHDwSOj5o4HphJfi8HgFSdvtbCcUmSJEnqQCpJHCbnD0mSJEnLmbKJQ9TE+FSdRqbqdEb+fttUnSpKICKiO/Aw0C2f540ppeqIWB34AzCA7CzGYSmld/JxTgVGAJ8CJ6aU7qlknpIkSZJaTlOucRhW9P7uZsxnIbB7SmlLYBCwd0RsD4wGHkgpVQEP5O+JiE3z+Q4E9gYujohOzZivJEmSpBbQlMQhyrwvK2Xez992yR8J2B+4Ki+/Cjggf70/cH1KaWFK6RVgBrBtpfOVJEmS1DKakjikMu+bJCI6RcRTwDzgvpTS40C/lNIcgPy5b159LWBWweiz87LiaY6MiCkRMWX+/PnNCUuSJElSEzTl4uiuUROnFbzvXvSeVJ3OLTeRlNKnwKCIWA24OSI2W0L1Umc1GiQsKaXxwHiAwYMHNyuhkSRJklReUxKHx4A9Ct4/XvQ+AWUTh7rKKf0nIh4ku3ZhbkT0TynNiYj+ZGcjIDvDsE7BaGsDrzd1HpIkSZJaVtnEIVWnIUs7k4joA3ySJw09gK8A5wO3AccAY/PnW/NRbgOujYgLgS8AVdgVrCRJktRuKrmPw9LoD1yV94y0AjAppXRHRDwKTIqIEcBrwKEAKaVnI2IS8BywCBiVN3WSJEmS1A7aJHFIKT0NbFWi/C1gaCPjnAOc08qhSZIkSWqCpvSqJEmSJGk5Z+IgSZIkqSwTB0mSJEllmThIkiRJKsvEQZIkSVJZ
Jg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iBJkiSpLBMHSZIkSWV1bu8AJEmStOwaO+3NpZ7G6K16t0AkWlqecZAkSZJUlomDJEmSpLJMHCRJkiSVZeIgSZIkqSwTB0mSJEllmThIkiRJKsvEQZIkSVJZJg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iBJkiSpLBMHSZIkSWW1SeIQEetExF8iYnpEPBsR38/LV4+I+yLipfy5V8E4p0bEjIh4ISL2aos4JUmSJJXWVmccFgH/m1L6ErA9MCoiNgVGAw+klKqAB/L35MOGAQOBvYGLI6JTG8UqSZIkqUibJA4ppTkppSfz1+8B04G1gP2Bq/JqVwEH5K/3B65PKS1MKb0CzAC2bYtYJUmSJDXU5tc4RMQAYCvgcaBfSmkOZMkF0DevthYwq2C02XmZJEmSpHbQpolDRPQE/giclFJ6d0lVS5SlEtMbGRFTImLK/PnzWypMSZIkSUXaLHGIiC5kScPElNJNefHciOifD+8PzMvLZwPrFIy+NvB68TRTSuNTSoNTSoP79OnTesFLkiRJy7m26lUpgCuA6SmlCwsG3QYck78+Bri1oHxYRHSLiPWBKmByW8QqSZIkqaHObTSfnYCjgX9GxFN52WnAWGBSRIwAXgMOBUgpPRsRk4DnyHpkGpVS+rSNYpUkSZJUpE0Sh5TS3yh93QLA0EbGOQc4p9WCkiRJktRkbXXGQZI6hLHT3lzqaYzeqncLRCJJ0udLm3fHKkmSJOnzx8RBkiRJUlkmDpIkSZLKMnGQJEmSVJaJgyRJkqSy7FVJkiRJWkot0WsfdOye+zzjIEmSJKkszzhIkiS1A+8ro88bzzhIkiRJKsvEQZIkSVJZJg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iBJkiSpLO/jIEnLoOXhDqaSpLblGQdJkiRJZZk4SJIkSSrLxEGSJElSWSYOkiRJksoycZAkSZJUlomDJEmSpLJMHCRJkiSVZeIgSZIkqSwTB0mSJElltUniEBFXRsS8iHimoGz1iLgvIl7Kn3sVDDs1ImZExAsRsVdbxChJkiSpcW11xmECsHdR2WjggZRSFfBA/p6I2BQYBgzMx7k4Ijq1UZySJEmSSmiTxCGl9DDwdlHx/sBV+eurgAMKyq9PKS1MKb0CzAC2bYs4JUmSJJXWntc49EspzQHIn/vm5WsBswrqzc7LGoiIkRExJSKmzJ8/v1WDlSRJkpZnHfHi6ChRlkpVTCmNTykNTikN7tOnTyuHJUmSJC2/2jNxmBsR/QHy53l5+WxgnYJ6awOvt3FskiRJkgq0Z+JwG3BM/voY4NaC8mER0S0i1geqgMntEJ8kSZKkXOe2mElEXAcMAXpHxGygGhgLTIqIEcBrwKEAKaVnI2IS8BywCBiVUvq0LeKUJEmSVFqbJA4ppW80MmhoI/XPAc5pvYgkSZIkVaIjXhwtSZIkqYMxcZAkSZJUlomDJEmSpLJMHCRJkiSVZeIgSZIkqSwTB0mSJEllmThIkiRJKsvEQZIkSVJZJg6SJEmSyjJxkCRJklSWiYMkSZKkskwcJEmSJJVl4iBJkiSpLBMHSZIkSWWZOEiSJEkqy8RBkiRJUlkmDpIkSZLKMnGQJEmSVJaJgyRJkqSyTBwkSZIklWXiIEmSJKksEwdJkiRJZZk4SJIkSSrLxEGSJElSWSYOkiRJksrq0IlDROwdES9ExIyIGN3e8UiSJEnLqw6bOEREJ+A3wFeBTYFvRMSm7RuVJEmStHzqsIkDsC0wI6X0ckrpY+B6YP92jkmSJElaLnXkxGEtYFbB+9l5mSRJkqQ2Fiml9o6hpIg4FNgrpfTt/P3RwLYppRMK6owERuZvNwZeaPNAK9cbeLO9g2gDy8tywvKzrC7nsmd5WdblZTlh+VlWl3PZs7ws6+dlOddLKfUpLuzcHpE00WxgnYL3awOvF1ZIKY0Hxrdl
UEsrIqaklAa3dxytbXlZTlh+ltXlXPYsL8u6vCwnLD/L6nIue5aXZf28L2dHbqr0BFAVEetHRFdgGHBbO8ckSZIkLZc67BmHlNKiiDgeuAfoBFyZUnq2ncOSJEmSlksdNnEASCndBdzV3nG0sM9V06qlsLwsJyw/y+pyLnuWl2VdXpYTlp9ldTmXPcvLsn6ul7PDXhwtSZIkqePoyNc4SJIkSeogTBzUQEQcUHiX7og4KyK+soT6gyPiV82c12oR8T8F778QETc2Z1pLKyLGRMSPyi1vC86v3ufcniLixIiYHhET2zuWthQRf2/vGFpDRLzf3jG0hYg4KSJWLHh/V0SsVsH4+0XE6NaJrm00Z5vZVtu4lhARAyLimfaOoyOqdH1va8X/70s5rSERsWNLTKsjyNfrI5o5brtu322q9DkQEUH2XS1uo/lNAO5IKbX6DnxEDMjntVlrz6uciBgDvJ9S+nkbzW8CbfQ5NyGW54GvppReWYppdEopfdqCYamZIuL9lFLP9o6jtUXETGBwSunz0Ce6mqEj/Ue0tojonFJa1IR6bbpP0FyNfXfN+a9o6//n1hYRQ4AfpZT2LTFsietBu2/fU0o+mvkAbgGmAs8CI/Oy94FzgH8AjwH98vIN8/dPAGeR/QBqp/PjvPxpoCYvGwBMBy4GppHdiKPVYwV2BN4GXgGeyuOeABySj/Nl4O/5OJOBlYEhZBsHgDHANcCfgZeA7+TlPYEHgCeBfwL75+XXAwvyeV2QL/cz+bDuwO/y+tOA3fLy4cBNwN35PH62FJ/LT8huHHg/cB3wo6LlHQs8l383P1/Sd1n4OeTv/w8YXmo6pT7ndlyPfwt8nH/OPwGuzJdtWsH3NAD4a/79PQnsWLDMfwGuBZ5r799kM5b9fSDyde+Z/DM4PB92Te3y5+8nAvu1d8xNXa78ubFl+wPwtYL6E4CDyXqwu4DPtkfHtUPsP8zjfQY4KV/3ngeuymO6EVgROLFgvf1LPu5Mspsr1Y5zeT6dicBXgEfybca2ef3hwP/lrw/N6/4DeLhg+C3A7flv9fg8vmn5NmD1CparseXYBniIbPt8D9A/r39iwTbj+rxs13x78VQew8rU32Y2KV7Kb+MmAJeQ/bZfzud7Jdl/0oRmfq8rAXfmn+8zwOHAmfm69gzZBaO1BzO3yes9mq+PhctXctsP7JnXfxK4Aei5hOVr8F238DpcallnAr3z4YOBB/PXY/Jlv5dsOzocuDVfxheA6oL1p94+AZ+t7w3mV/A5Nli32vC3XPj//gQF/xWF621e90fAmFLrfl73DeDf+bR2aevtUtHveDpwGdn+1L1AD7L9grvzz/qvwCbFv7X8fe22+THgv/ny/CD/3m8g++3+mUb2mQqn0W6fQXvO/PP+4LONcI/8x7oGkICv5+U/A07PX98BfCN//d2ClWfPfKMRZE3H7gD+X75yLga2b4dYi1f0CcAhQFeyP5Ev5+WrkPXMNYT6icM/8vn0BmYBX8jrrZLX6Q3MyJe5eONR9x74X+B3+etNgNfIkonheRyr5u9fBdZpxmeyTf6DXDFflhkUJA7A6mQb7to/s9XKfJd1n0P+/v/yWBubTr3PuZ3X5Zn593IucFRtnMCLZH9KKwLd8/IqYErBMn8ArN/ey9DM5X6fbIf5PrKd5n75etafbGfplrzeqmQ7Yp3bO+amLlf+3NiyHQhcldfpmv9OewAj+Ww70A2Y0pbfbcFvciWyP85nga3ItlU75XWuJDtSV7felliPBwCLgM3JtqtT8/EC2L/gex3OZ4nDP4G1atf9guEzyHbQ+5D90X83H/YL4KQKlm1AieX4MdmBmD552eFkXY9DdsPTbkXx3F4wfk+y7eoA6u9Yl42X8tu4CWQ7bLWf17tFn+WgZny3BwOXFbxflYLEiyxRr/0/ehrYNX9dnDg02Pbn3/nDwEp5vVPIkpLGlq/Bd93C63GpZZ1J44nDVKBHwTLOIfuPrv2/HkyJfQI+
W99Lza9LY+tWG/6eC9fNIRT8V7DkxKHUuj+G/Hffng8+27YMyt9PAo4i28mvysu2A/5c+FsrGL+x/YXhZDc+rt1XK7nPVDiN9np4jcPSOTEiao/Wr0O2Q/Ux2Y4lZBuDAfnrHciyScgy7lp75o9pZJnlJvl0AF5NKT3WDrE2ZmNgTkrpCYCU0rup9Om0W1NKC1LWfOAvwLZkf0DnRsTTZEf31yLbkVmSncn+TEgpPU/2J/HFfNgDKaX/ppQ+IjsysV6ZaZWyC3BzSunDlNK7NLzB4LvAR8DlEXEQ8GFe3th32ZjGptMR7QmMjoingAfJ/pzXJfsTuiwi/km27IXXZkxOS9HEqQPYGbgupfRpSmku2RG6L6eUHgI2ioi+wDeAPzayvndkJZcN+BOwe0R0A75KdtR1Adn3/838+3+cbOelqvSkWy3em1NKH6SU3ic7urwLMCul9Ehe5/d5vXJeSSn9M2XNOZ4l22Yksp3GASXqPwJMiIjvkCVatf6SUnovpTSfbEf89ry8seksSfFy7AVsBtyXf+anA2vnw58GJkbEUWQ7KrUxXhgRJ5LtUJVaHyuJd0nbptsLPq+5RZ9lpctdO/+vRMT5EbFLSum/wG4R8Xi+XdkdGBgRq+bL9lA+3jVF0ym17d+ebJv0SP45HpOXN7Z8jX3XLaXUsi7Jbfnvr9Z9KaW38rKb+Gx9b2yfoNT8Nqbxdau9NPW/otS635G8klJ6Kn9du++0I3BD/llfSnaAplL3pZTezl83Z5+pTXTo+zh0ZHn7tK8AO6SUPoyIB8l2sj7JN7YAn1L+Mw7gvJTSpUXTH0CWnXe0WFOZOpSok4AjyY6AbZNS+iRvm9y9CfNrzMKC102JvTGNLk/KbkK4LTCU7M7lx5P9uTVmEfU7HOjezOm0pwAOTim9UK8wa186F9iSbBk/KhjcIutpO1rSenYN2bo7DPhW24TTokouW0rpo3w7sBfZkcjrCuqfkFK6p23Ca6Cx76LUNqWcwm3E4oL3iymxvUgpfTcitgP2AZ6KiEHNmU4ZxXG/BzybUtqhRN19yM4+7wecEREDU0pjI+JO4GvAY/kFzh8VjdfkeMtsmxaWmEbJ6TRFSunFiNgmj/28iLgXGEV2jcqsfBvTnfL/M6W2/UG20/WN4sqllq/Ud51SeqvSZWpMI8ta+P9Q/N9XvA1tbH0vua1tZH430/i61V4K4y/5f5lrsO63QWyVKF4H+wH/SSn9//buN0auqozj+PenoLS2qRFKIir4B7RgjMofIyExISqvSgoRg6YihRAiUQyKIVGqN+PiC1L/EIpQtepGGy1EXphqak0IhICWhfQFfSFVCYVGKaXSYmkrtPTxxXPGHceZudvZ2Znu+vskm+zeO3vuOXPvPfee85xz7wc6fPY/5SxzU17XI93W76efe6ahcMShf4uAPeVGfAnZ49HLZjKcCFmBNW0Crpa0AEDSW0oP5yjzuo8Mdbd7AjhF0nkAkhZK6nQBWSbpBEknkuG4R0sedpUT4EImIwTdtgUZel5etvVusud7W5fP9uNB4FJJ8yQtBC5uXVn2yaLIFxHeADQrhW778mngLEmvL71mH61Jp1fZR2UTcH2p4JD0wbJ8ERltOgJcwcz00o3Kg8Dlkl4raTF5wZoo68bJfUbMzjfX9yrbeuAqske/2VDYBFwn6XjI807SG4ac30skzS/bvZQcL3yqpOYN0KeBh8rvAzuHJL0rIh6JiG8Au8nI7KC1l2MzsLi5TNLxkt4r6TXk8Mv7gZvIYYMLSh63RsSt5DCyJdPJTI+6aeAknQIciIh15Dyvs8uq3SUflwFExF7gRUnNXvblU0h+M3CBpNPLtuaXY7dj+WZ6X3cp63ZyKB5MXj+6+bikN0maB1xCRkiOdnvb6HBs9VmkfvU6P58DTpZ0Yol8LgXoduzXpDVq/wSekvRJyAaCpPeXdduZ3O/LyOg91Jen2z3TyDni0L/fAZ8rYaRtZMXVyw3AOkk3kpOY
XgSIiN9LOhP4Y7lXe4kcLzfIp9McbV7Xk8NSvkipzEteX5F0ObC6VGgHyUhGuwmyjKcCYxHxd+VjiWCMoQAABHJJREFUPjdIeoycDPRESfMfkh5WPm5vI/D9lnTuBNaUMPZhcqLxy+V7mraI2CLp7pKfp8kblFYLgV9LavaCfaks77Yvd0i6hwyz/oUcftYrnf/6niPiyYEUbHrGgNuAx0vjYTtZod8J3FsqxvuZ/VGGpiB75s4n5+YEcFNE7ASIiOck/YmccDobdS0bOanvZ+QwiVfKsrVk2H1L2f/PkzcuQ1HOyXEmGzdrgT3kZMQrJf2APLfuKut/CGyU9GxEXDjNza+SdAZ5jt5HfmeDvpFuL8dqsrF2e+lsOI48//5M1jGLSn6+FxF7JY2Vm4hXyWE6G+lvSERTt7ppJryP/I6PAIeA68hjaytZzzza8tmrgJ9IOsBko7ariHhe0grgl+UmFHJozj46l6/Tvh6kTmWdB/xY0tfIYYC9PERGO08HfhERjylHIUx5e+V6fRn/e2wNrQOk7fp+kGwsNNcdkvRN8rt4inJPQHZKdTr2NwC/krSMjIq2X69HbTlwl6SVZONgPXlc/Yg8BifIY6157XwcOKwcQj5O1nOtOt4zHQv8ONYhUT5r/GBEhKRPkZNrl406X4OmOfbItE7+X/blXFciYlsiomtPTtnXW4GzpzBO2WaA5sjjOOdKOWxmlQbQuRHxhVHnxawTRxyG5xzgjtKLt5fZOV7akvflLFdC+w+QYf1un/kY+eSb77rRYGZm5oiDmZmZmZlNgSdHm5mZmZlZLTcczMzMzMyslhsOZmZmZmZWyw0HMzM7JqihNWrojlHnw8zMOvPkaDMzQw09QL7z4VDbqvOjiq0zsL3twMqoYt2g0zYzs5nhx7GamVnTWFRxy6gzYWZmxyY3HMzMrKcSjdgCvIN8W/wu4Fryza63kW+Jvw/4bFSxr/zPacDtwAXkW2PvBb4aVRxUQxvK/6xVQ2uAP0QVF6mhceBwVHFNXRplfQCfJ980vIR8K+6KqOKYecuqmdlc4jkOZmY2FVcAtwJvBO4Gfk42Hj4CvB14D3A9gBo6DvgtsBM4DfgwefP/bYCo4mLgGeCaqGJBVHFR+8bq0mixAvgEcBKwA1g9mOKamVk7NxzMzKzpZjW0t/WnZd09UcXmqOJVYB3wZmBVVPFCVPEC8BvgvPLZDwFnAF+OKvZHFX8DVgJXqyFNMS9TTWNVVPFMVPEyMA6c21/RzcysjocqmZlZ07d6zHF4tuX3A12WLSy/vw3YFVXsb1n/JHACsJgc6lRnqmm05mF/Sx7MzGzAHHEwM7NB2wGcrIbmtyx7J/AvYHf5+8gA0jAzsyFyxMHMzAZtAvgr8B01dCM5L2IM+GlU0Www7CSHIk0nDTMzGyJHHMzMrOnrauiltp+lR5tIVHEYWAq8lZwEPQE8Anyl5WO3AJ9RQ3vU0MY+0zAzsyHyC+DMzMzMzKyWIw5mZmZmZlbLDQczMzMzM6vlhoOZmZmZmdVyw8HMzMzMzGq54WBmZmZmZrXccDAzMzMzs1puOJiZmZmZWS03HMzMzMzMrJYbDmZmZmZmVuvfkNjc6aBV5hkAAAAASUVORK5CYII=\n",
486 | "text/plain": [
487 | ""
488 | ]
489 | },
490 | "metadata": {
491 | "needs_background": "light"
492 | },
493 | "output_type": "display_data"
494 | }
495 | ],
496 | "source": [
497 | "#plot class distribution\n",
498 | "plt.rcParams['figure.figsize'] = [13, 5]\n",
499 | "plt.bar(freqs.keys(), freqs.values(), width=0.3,color='skyblue')\n",
500 | "plt.text(10,700,\"Target Class Distribution\", fontsize=15, ha='center', va='center')\n",
501 | "t = plt.title(\"Emotions from COVID-19 Related Text\", fontsize=18)\n",
502 | "# t.set_color(\"m\")\n",
503 | "x = plt.xlabel(\"Emotion\", fontsize=13)\n",
504 | "x.set_color('g')\n",
505 | "y = plt.ylabel(\"Frequency\", fontsize=13)\n",
506 | "y.set_color('g')\n",
507 | "plt.savefig('class_distribution.png')\n",
508 | "# [i.set_color(\"c\") for i in plt.gca().get_xticklabels()]\n",
509 | "plt.show()"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 86,
515 | "metadata": {},
516 | "outputs": [],
517 | "source": [
518 | "# there is a clear class imbalance"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": []
527 | }
528 | ],
529 | "metadata": {
530 | "kernelspec": {
531 | "display_name": "Python 3",
532 | "language": "python",
533 | "name": "python3"
534 | },
535 | "language_info": {
536 | "codemirror_mode": {
537 | "name": "ipython",
538 | "version": 3
539 | },
540 | "file_extension": ".py",
541 | "mimetype": "text/x-python",
542 | "name": "python",
543 | "nbconvert_exporter": "python",
544 | "pygments_lexer": "ipython3",
545 | "version": "3.7.4"
546 | }
547 | },
548 | "nbformat": 4,
549 | "nbformat_minor": 2
550 | }
551 |
--------------------------------------------------------------------------------
/data_generator.py:
--------------------------------------------------------------------------------
1 | ###
2 | # Generate data (in csv format) to train the BERT model
3 |
4 | import csv
5 | import json
6 | import argparse
7 | import os
8 |
9 | # store emotions as a binary (multi-hot) indicator vector
def create_model(llabels):
    """Encode an emotion mapping as a fixed-length binary vector.

    Args:
        llabels: Mapping whose values are truthy when the corresponding
            emotion applies. Values are consumed in the mapping's own
            iteration order.

    Returns:
        A list of 12 ints. Slot ``k`` is 1 when the ``k``-th value of
        ``llabels`` is truthy and 0 otherwise; slots beyond the number of
        values stay 0.

    Raises:
        IndexError: If ``llabels`` holds more than 12 values.
    """
    # NOTE(review): positions follow the mapping's iteration order, not the
    # emotion names -- confirm the input lists emotions in the same order as
    # the CSV header columns.
    encoded = [0] * 12
    for position, flag in enumerate(llabels.values()):
        encoded[position] = int(bool(flag))
    return encoded
15 |
16 | # generate and write to csv file
def generate_csv(file, csvfile):
    """Convert a JSON dataset into a CSV file with one row per record.

    The JSON document is loaded as a mapping; each of its values must expose
    a ``'body'`` entry (written to the ``text`` column) and an ``'emotion'``
    mapping (encoded by ``create_model`` into the twelve emotion columns).
    Rows receive a sequential ``id`` starting at 0.

    Args:
        file: Path of the JSON file to read.
        csvfile: Path of the CSV file to create (overwritten if it exists).
    """
    header = ["id", "text", "anger", "anticipation", "disgust", "fear",
              "joy", "love", "optimism", "pessimism", "sadness", "surprise",
              "trust", "neutral"]

    # Parse the input first so a malformed JSON file does not leave behind a
    # truncated output file. JSON text is UTF-8, so do not depend on the
    # platform's default encoding when reading it.
    with open(file, "r", encoding="utf-8") as source:
        data = json.load(source)

    # The context manager guarantees the CSV is flushed and closed.
    with open(csvfile, "w", encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        for idd, record in enumerate(data.values()):
            bin_vector = create_model(record['emotion'])
            writer.writerow([idd, record['body']] + bin_vector)
28 |
29 |
if __name__ == "__main__":
    # Command-line entry point: read the input/output paths and convert.
    cli = argparse.ArgumentParser()
    cli.add_argument("--file", default="nlp_train.json", type=str)
    cli.add_argument("--csvfile", default="nlp_train.csv", type=str)
    options = cli.parse_args()
    generate_csv(options.file, options.csvfile)
--------------------------------------------------------------------------------
/find_threshold.py:
--------------------------------------------------------------------------------
1 | ######
2 | # Script to find best possible threshold for the confidence.
3 | #
4 |
5 | from fast_bert.prediction import BertClassificationPredictor
6 | import argparse
7 | import csv
8 | import pandas as pd
9 |
def threshold(model, csvs, label_path="D:\\UTD\\Assignment\\NLP\\project\\"):
    """Print and return exact-match accuracy for a sweep of score cut-offs.

    Every distinct text in ``csvs`` is scored once by the model. For each
    candidate threshold, a text counts as correct when the set of labels
    whose score is ``>=`` the threshold equals the set of labels marked ``1``
    in the CSV.

    Args:
        model: Directory of the trained model handed to the predictor.
        csvs: Path of a CSV with a ``text`` column and one 0/1 column per
            emotion label.
        label_path: Directory containing ``labels.csv``. Defaults to the
            location that was previously hard-coded.

    Returns:
        Dict mapping ``str(threshold)`` to the fraction of texts matched
        exactly; empty when the CSV has no rows.
    """
    labels = ["anger", "anticipation", "disgust", "fear", "joy", "love",
              "optimism", "pessimism", "sadness", "surprise", "trust",
              "neutral"]

    # NOTE(review): train_bert.py builds its learner with multi_label=True
    # while this predictor uses multi_label=False -- confirm this is intended.
    predictor = BertClassificationPredictor(
        model_path=model,  # use the argument instead of the global ``args``
        label_path=label_path,  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    # NOTE(review): 0.25 sits between 0.02 and 0.03 in an otherwise roughly
    # increasing list -- possibly meant to be 0.025; left unchanged.
    thresholds = [0.0005,0.00077,0.00079,0.00083,0.00087,0.0009,0.00093,0.00095,0.00099,0.001,0.0012,0.0015,0.00155,0.0016,0.00166,0.0017,0.0019,0.002,0.0021,0.0023,0.0025,0.0028,0.003,0.0035,0.0032,0.0037,0.004,0.0045,0.0047,0.0041,0.005,0.0053,0.0055,0.0062,0.009, 0.007, 0.01, 0.011,0.013,0.014,0.012, 0.015, 0.02, 0.25, 0.03,0.035,0.039]

    data = pd.read_csv(csvs)
    # Keyed by text: rows sharing the same text collapse to a single entry
    # and the labels of the last such row win.
    inputs = {}
    for _, row in data.iterrows():
        inputs[row['text']] = [label for label in labels if row[label] == 1]

    threshold_accs = {}
    if not inputs:
        # Nothing to score; avoid dividing by zero below.
        print(threshold_accs)
        return threshold_accs

    # Materialise texts and expected label sets once rather than rebuilding
    # list(inputs.values()) for every comparison.
    texts = list(inputs)
    expected = [set(target) for target in inputs.values()]
    multiple_predictions = predictor.predict_batch(texts)

    for th in thresholds:
        correct = 0
        for out, target in zip(multiple_predictions, expected):
            # Each prediction is treated as an iterable of (label, score) pairs.
            predicted = {emotion[0] for emotion in out if emotion[1] >= th}
            if predicted == target:
                correct += 1
        print("Threshold: ", th, "Correct: ", correct)
        threshold_accs[str(th)] = correct / len(inputs)
    print(threshold_accs)
    return threshold_accs
51 |
if __name__ == "__main__":
    # Command-line entry point for the threshold sweep. The parsed namespace
    # keeps the module-level name ``args``.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model_dir",
        default="D:\\UTD\\Assignment\\NLP\\project\\model_output\\3_finetune_e20",
        help="path to output dir",
    )
    cli.add_argument(
        "--test_csv",
        default="D:\\UTD\\Assignment\\NLP\\project\\nlp_test.csv",
    )
    args = cli.parse_args()
    threshold(args.model_dir, args.test_csv)
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | # Script to generate inference for a given csv file
2 |
3 | from fast_bert.prediction import BertClassificationPredictor
4 | import argparse
5 | import csv
6 | import pandas as pd
7 | import os
8 | from sklearn.metrics import classification_report
9 | from sklearn.preprocessing import MultiLabelBinarizer
10 | from pprint import pprint
11 |
12 | # run inference on the csv file provided using the trained model
def run(model, csvs, threshold, evaluation,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\"):
    """Predict emotions for every row of a CSV and optionally evaluate them.

    Predictions are written to ``model_output.csv`` in the same directory as
    ``csvs`` with the columns ``id``, ``text``, ``emotions`` (predicted
    labels) and ``target`` (labels marked ``1`` in the input).

    Args:
        model: Directory of the trained model handed to the predictor.
        csvs: Path of a CSV with ``id`` and ``text`` columns and one 0/1
            column per emotion label.
        threshold: A label is predicted when its score is strictly greater
            than this value.
        evaluation: When truthy, print scikit-learn classification reports.
        label_path: Directory containing ``labels.csv``. Defaults to the
            location that was previously hard-coded.
    """
    labels = ["anger", "anticipation", "disgust", "fear", "joy", "love",
              "optimism", "pessimism", "sadness", "surprise", "trust",
              "neutral"]

    # NOTE(review): train_bert.py builds its learner with multi_label=True
    # while this predictor uses multi_label=False -- confirm this is intended.
    predictor = BertClassificationPredictor(
        model_path=model,  # use the argument instead of the global ``args``
        label_path=label_path,  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    data = pd.read_csv(csvs)
    # Parallel lists keep id, text and target aligned row by row. Keying a
    # dict by text dropped duplicate texts while ``ids`` kept every row, so
    # ids could be paired with the wrong text.
    ids, texts, targets = [], [], []
    for _, row in data.iterrows():
        ids.append(row['id'])
        texts.append(row['text'])
        targets.append([label for label in labels if row[label] == 1])

    multiple_predictions = predictor.predict_batch(texts)
    out_path = os.path.join(os.path.dirname(csvs), "model_output.csv")
    outputs = []
    # The context manager flushes and closes the file before it is announced.
    with open(out_path, "w", encoding="utf-8", newline="") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(["id", "text", "emotions", "target"])
        for row_id, text, target, out in zip(ids, texts, targets,
                                             multiple_predictions):
            # Each prediction is treated as an iterable of (label, score) pairs.
            predicted = [emotion[0] for emotion in out
                         if emotion[1] > threshold]  # greater than threshold
            csv_writer.writerow([row_id, text, predicted, target])
            outputs.append(predicted)

    print("****************\n")
    print("Predictions saved in a file: ", out_path)
    if evaluation:
        print("\n\n Running Model Evaluation\n")
        # One binarizer with an explicit class list gives y_true and y_pred
        # the same columns, in the order of ``labels`` used for target_names.
        binarizer = MultiLabelBinarizer(classes=labels)
        y_true_encoded = binarizer.fit_transform(targets)
        y_pred_encoded = binarizer.transform(outputs)
        # classification_report returns pre-formatted text; print it as-is
        # (pprint would show an escaped repr).
        print(classification_report(y_true_encoded, y_pred_encoded))
        print(classification_report(y_true_encoded, y_pred_encoded,
                                    target_names=labels))
59 |
60 | if __name__ == "__main__":
61 | parser = argparse.ArgumentParser()
62 | parser.add_argument("--model_dir",default="D:\\UTD\\Assignment\\NLP\\project\\model_output\\3_finetune_e20", help="path to output dir")
63 | parser.add_argument("--test_csv", default="D:\\UTD\\Assignment\\NLP\\project\\nlp_test.csv")
64 | parser.add_argument("--threshold", default=0.0017, type=float)
65 | parser.add_argument("--writeto_file", default=True)
66 | parser.add_argument("--evaluation", default=True)
67 | args = parser.parse_args()
68 | run(args.model_dir, args.test_csv, args.threshold, args.evaluation)
--------------------------------------------------------------------------------
/labels.csv:
--------------------------------------------------------------------------------
1 | anger
2 | anticipation
3 | disgust
4 | fear
5 | joy
6 | love
7 | optimism
8 | pessimism
9 | sadness
10 | surprise
11 | trust
12 | neutral
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch===1.5.0
2 | torchvision===0.6.0
3 | fast-bert
--------------------------------------------------------------------------------
/train_bert.py:
--------------------------------------------------------------------------------
1 | # Training script for bert
2 |
3 | from fast_bert.data_cls import BertDataBunch
4 | from fast_bert.learner_cls import BertLearner
5 | from fast_bert.metrics import accuracy
6 | import logging
7 | import torch
8 | import os
9 | import argparse
10 |
11 | OUTPUT_DIR = "model_output/"
12 |
def train(args):
    """Fine-tune a BERT multi-label emotion classifier and save the model.

    Training data is read from ``nlp_train.csv``, ``nlp_valid.csv`` and
    ``labels.csv`` in the current working directory.

    Args:
        args: Namespace providing ``is_onepanel``, ``out_dir``,
            ``pretrained_model``, ``batch_size`` and ``epochs``. When
            ``is_onepanel`` is truthy, ``args.out_dir`` is rewritten to live
            under ``/onepanel/output/``.
    """
    if args.is_onepanel:
        args.out_dir = os.path.join("/onepanel/output/", args.out_dir)
    # makedirs also creates missing parent directories and, with
    # exist_ok=True, does not fail when the directory already exists.
    os.makedirs(args.out_dir, exist_ok=True)

    logger = logging.getLogger()
    labels = ["anger", "anticipation", "disgust", "fear", "joy", "love",
              "optimism", "pessimism", "sadness", "surprise", "trust",
              "neutral"]
    databunch = BertDataBunch(".", ".",
                              tokenizer=args.pretrained_model,
                              train_file='nlp_train.csv',
                              label_file='labels.csv',
                              val_file="nlp_valid.csv",
                              text_col='text',
                              label_col=labels,
                              batch_size_per_gpu=args.batch_size,
                              max_seq_length=512,
                              multi_gpu=False,
                              multi_label=True,
                              model_type='bert')

    # Use the GPU when one is available, otherwise fall back to the CPU so
    # the script can still run on a machine without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    metrics = [{'name': 'accuracy', 'function': accuracy}]

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=args.pretrained_model,
        metrics=metrics,
        device=device,
        logger=logger,
        output_dir=args.out_dir,
        finetuned_wgts_path=None,
        warmup_steps=200,
        multi_gpu=False,
        is_fp16=False,
        multi_label=True,
        logging_steps=10)

    learner.fit(epochs=args.epochs,
                lr=2e-3,
                schedule_type="warmup_cosine_hard_restarts",
                optimizer_type="lamb")
    learner.save_model()
57 |
58 |
if __name__ == "__main__":
    def _flag(value):
        """Parse a command-line string into a bool.

        ``type=bool`` calls ``bool()`` on the raw string, and every non-empty
        string, "False" included, is truthy, so an explicit converter is
        needed for the option to be switchable.
        """
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y"):
            return True
        if text in ("", "0", "false", "f", "no", "n"):
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument("--pretrained_model", default="bert-base-uncased", help="path to a pretrained model")
    parser.add_argument("--out_dir",default="model_output/", help="path to output dir")
    parser.add_argument("--is_onepanel", default=False, type=_flag, help="train on onepanel cloud")
    parser.add_argument("--epochs", default=15, type=int)
    parser.add_argument("--batch_size", default=10, type=int)
    args = parser.parse_args()
    train(args)
68 |
--------------------------------------------------------------------------------