├── App
├── app.py
├── data.db
├── models
│ └── emotion_classifier_pipe_lr_03_june_2021.pkl
└── track_utils.py
├── README.md
├── data
├── EmotionClf-End2End.png
├── EmotionDetectionNLP-End2End.pdf
└── emotion_dataset_2.csv
├── models
└── emotion_classifier_pipe_lr_03_june_2021.pkl
└── notebooks
├── .ipynb_checkpoints
└── End2End-NLP-Project-checkpoint.ipynb
├── End2End-NLP-Project.ipynb
├── data
├── emotion_dataset_2.csv
└── emotion_dataset_raw.csv
└── emotion_classifier_pipe_lr_03_june_2021.pkl
/App/app.py:
--------------------------------------------------------------------------------
1 | # Core Pkgs
2 | import streamlit as st
3 | import altair as alt
4 | import plotly.express as px
5 |
6 | # EDA Pkgs
7 | import pandas as pd
8 | import numpy as np
9 | from datetime import datetime
10 |
11 | # Utils
12 | import joblib
# Load the trained emotion-classification pipeline once, at import time.
# The ``with`` block guarantees the file handle is closed as soon as the model
# has been deserialized instead of staying open for the life of the process.
# The path is relative, so it is resolved against the current working
# directory of the process that runs this script.
# NOTE(review): joblib files are pickles; only load model files that come from
# a trusted source.
with open("models/emotion_classifier_pipe_lr_03_june_2021.pkl", "rb") as model_file:
    pipe_lr = joblib.load(model_file)
14 |
15 |
16 | # Track Utils
17 | from track_utils import create_page_visited_table,add_page_visited_details,view_all_page_visited_details,add_prediction_details,view_all_prediction_details,create_emotionclf_table
18 |
19 | # Fxn
def predict_emotions(docx):
    """Return the predicted label for a single piece of text.

    ``docx`` is wrapped in a one-element list before being handed to
    ``pipe_lr.predict``; the first (and only) entry of the result is returned.
    """
    return pipe_lr.predict([docx])[0]
23 |
def get_prediction_proba(docx):
    """Return ``pipe_lr.predict_proba`` for a single piece of text.

    ``docx`` is wrapped in a one-element list, and the result of
    ``predict_proba`` is returned unchanged (it is not unwrapped).
    """
    single_sample = [docx]
    return pipe_lr.predict_proba(single_sample)
27 |
# Emoji shown next to each predicted emotion label in the UI.
emotions_emoji_dict = {
    "anger": "😠",
    "disgust": "🤮",
    "fear": "😨😱",
    "happy": "🤗",
    "joy": "😂",
    "neutral": "😐",
    "sad": "😔",
    "sadness": "😔",
    "shame": "😳",
    "surprise": "😮",
}
29 |
30 |
31 | # Main Application
def main():
    """Render the Emotion Classifier Streamlit app.

    The sidebar menu selects one of three pages:

    * "Home"    - a form that classifies the submitted text, stores the
      prediction via ``add_prediction_details`` and shows the predicted label,
      its confidence and a bar chart of the per-class probabilities.
    * "Monitor" - tables and charts of the recorded page visits and
      predictions.
    * "About"   - a sub-header only.

    Both tracking tables are created (if missing) on every call, and every
    call records a visit for the page that is being rendered.
    """
    # Newer Streamlit releases expose ``st.columns`` / ``st.expander`` and no
    # longer ship the ``beta_`` names; fall back to the beta names so the app
    # keeps working on installs that only have those.
    make_columns = st.columns if hasattr(st, "columns") else st.beta_columns
    make_expander = st.expander if hasattr(st, "expander") else st.beta_expander

    st.title("Emotion Classifier App")
    menu = ["Home", "Monitor", "About"]
    choice = st.sidebar.selectbox("Menu", menu)
    create_page_visited_table()
    create_emotionclf_table()

    if choice == "Home":
        add_page_visited_details("Home", datetime.now())
        st.subheader("Home-Emotion In Text")

        with st.form(key='emotion_clf_form'):
            raw_text = st.text_area("Type Here")
            submit_text = st.form_submit_button(label='Submit')

        if submit_text:
            col1, col2 = make_columns(2)

            prediction = predict_emotions(raw_text)
            probability = get_prediction_proba(raw_text)
            # Highest class probability; computed once and reused for both
            # the stored record and the on-screen confidence.
            confidence = np.max(probability)

            add_prediction_details(raw_text, prediction, confidence, datetime.now())

            with col1:
                st.success("Original Text")
                st.write(raw_text)

                st.success("Prediction")
                emoji_icon = emotions_emoji_dict[prediction]
                st.write("{}:{}".format(prediction, emoji_icon))
                st.write("Confidence:{}".format(confidence))

            with col2:
                st.success("Prediction Probability")
                # One column per class -> transpose into (emotion, probability) rows.
                proba_df = pd.DataFrame(probability, columns=pipe_lr.classes_)
                proba_df_clean = proba_df.T.reset_index()
                proba_df_clean.columns = ["emotions", "probability"]

                fig = alt.Chart(proba_df_clean).mark_bar().encode(x='emotions', y='probability', color='emotions')
                st.altair_chart(fig, use_container_width=True)

    elif choice == "Monitor":
        add_page_visited_details("Monitor", datetime.now())
        st.subheader("Monitor App")

        with make_expander("Page Metrics"):
            page_visited_details = pd.DataFrame(view_all_page_visited_details(), columns=['Pagename', 'Time_of_Visit'])
            st.dataframe(page_visited_details)

            pg_count = page_visited_details['Pagename'].value_counts().rename_axis('Pagename').reset_index(name='Counts')
            c = alt.Chart(pg_count).mark_bar().encode(x='Pagename', y='Counts', color='Pagename')
            st.altair_chart(c, use_container_width=True)

            p = px.pie(pg_count, values='Counts', names='Pagename')
            st.plotly_chart(p, use_container_width=True)

        with make_expander('Emotion Classifier Metrics'):
            df_emotions = pd.DataFrame(view_all_prediction_details(), columns=['Rawtext', 'Prediction', 'Probability', 'Time_of_Visit'])
            st.dataframe(df_emotions)

            prediction_count = df_emotions['Prediction'].value_counts().rename_axis('Prediction').reset_index(name='Counts')
            pc = alt.Chart(prediction_count).mark_bar().encode(x='Prediction', y='Counts', color='Prediction')
            st.altair_chart(pc, use_container_width=True)

    else:
        st.subheader("About")
        add_page_visited_details("About", datetime.now())


# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/App/data.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/end2end-nlp-project/f82d87ed6d2608ad2c9b43e29e06da53429c1915/App/data.db
--------------------------------------------------------------------------------
/App/models/emotion_classifier_pipe_lr_03_june_2021.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/end2end-nlp-project/f82d87ed6d2608ad2c9b43e29e06da53429c1915/App/models/emotion_classifier_pipe_lr_03_june_2021.pkl
--------------------------------------------------------------------------------
/App/track_utils.py:
--------------------------------------------------------------------------------
1 | # Load Database Pkg
2 | import sqlite3
# One module-level connection and cursor shared by every helper below.
# ``check_same_thread=False`` lets the connection be used from threads other
# than the one that imported this module; with the default (True) sqlite3
# raises ProgrammingError on such use.
# NOTE(review): Streamlit is understood to re-run the app script in worker
# threads while this module stays imported - confirm against the deployed
# version. With the thread check disabled, concurrent writes are not
# serialized by sqlite3 for you.
conn = sqlite3.connect('data.db', check_same_thread=False)
c = conn.cursor()
5 |
6 |
7 | # Fxn
def create_page_visited_table():
    """Create ``pageTrackTable`` (pagename, timeOfvisit) if it does not exist."""
    ddl = 'CREATE TABLE IF NOT EXISTS pageTrackTable(pagename TEXT,timeOfvisit TIMESTAMP)'
    c.execute(ddl)
10 |
def add_page_visited_details(pagename, timeOfvisit):
    """Insert one row into ``pageTrackTable`` and commit it.

    Args:
        pagename: Value stored in the ``pagename`` column.
        timeOfvisit: Value stored in the ``timeOfvisit`` column.

    Raises:
        sqlite3.Error: If the INSERT fails; the transaction is rolled back
            before the exception propagates.
    """
    # ``with conn`` commits on success and rolls back when the INSERT raises,
    # so a failed write cannot leave a transaction open on the shared
    # connection.
    with conn:
        c.execute(
            'INSERT INTO pageTrackTable(pagename,timeOfvisit) VALUES(?,?)',
            (pagename, timeOfvisit),
        )
14 |
def view_all_page_visited_details():
    """Return every row of ``pageTrackTable`` as fetched by ``fetchall()``."""
    # ``Cursor.execute`` returns the cursor itself, so the call can be chained.
    return c.execute('SELECT * FROM pageTrackTable').fetchall()
19 |
20 |
21 | # Fxn To Track Input & Prediction
def create_emotionclf_table():
    """Create ``emotionclfTable`` (rawtext, prediction, probability, timeOfvisit) if missing."""
    ddl = 'CREATE TABLE IF NOT EXISTS emotionclfTable(rawtext TEXT,prediction TEXT,probability NUMBER,timeOfvisit TIMESTAMP)'
    c.execute(ddl)
24 |
def add_prediction_details(rawtext, prediction, probability, timeOfvisit):
    """Insert one row into ``emotionclfTable`` and commit it.

    Args:
        rawtext: Value stored in the ``rawtext`` column.
        prediction: Value stored in the ``prediction`` column.
        probability: Value stored in the ``probability`` column.
        timeOfvisit: Value stored in the ``timeOfvisit`` column.

    Raises:
        sqlite3.Error: If the INSERT fails; the transaction is rolled back
            before the exception propagates.
    """
    # ``with conn`` commits on success and rolls back when the INSERT raises,
    # so a failed write cannot leave a transaction open on the shared
    # connection.
    with conn:
        c.execute(
            'INSERT INTO emotionclfTable(rawtext,prediction,probability,timeOfvisit) VALUES(?,?,?,?)',
            (rawtext, prediction, probability, timeOfvisit),
        )
28 |
def view_all_prediction_details():
    """Return every row of ``emotionclfTable`` as fetched by ``fetchall()``."""
    # ``Cursor.execute`` returns the cursor itself, so the call can be chained.
    return c.execute('SELECT * FROM emotionclfTable').fetchall()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # end2end-nlp-project
2 | End 2 End NLP Project with Python
3 |
--------------------------------------------------------------------------------
/data/EmotionClf-End2End.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/end2end-nlp-project/f82d87ed6d2608ad2c9b43e29e06da53429c1915/data/EmotionClf-End2End.png
--------------------------------------------------------------------------------
/data/EmotionDetectionNLP-End2End.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/end2end-nlp-project/f82d87ed6d2608ad2c9b43e29e06da53429c1915/data/EmotionDetectionNLP-End2End.pdf
--------------------------------------------------------------------------------
/models/emotion_classifier_pipe_lr_03_june_2021.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/end2end-nlp-project/f82d87ed6d2608ad2c9b43e29e06da53429c1915/models/emotion_classifier_pipe_lr_03_june_2021.pkl
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/End2End-NLP-Project-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "36f38e2a-9998-4c73-bfc3-21b74d64a5ee",
6 | "metadata": {},
7 | "source": [
8 | "### End 2 End NLP Project\n",
9 | "+ Emotion Detection In Text \n",
10 | "+ Text Classifier"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "f0814628-3d83-4fd6-a511-2eccf79f9f1e",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# Load EDA Pkgs\n",
21 | "import pandas as pd\n",
22 | "import numpy as np"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "ea0d580d-c31c-44b7-b09b-10225857eebe",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# Load Data Viz Pkgs\n",
33 | "import seaborn as sns"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "id": "91eccfbf-d4d0-4e16-b0f7-2d7941efddb0",
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Load Text Cleaning Pkgs\n",
44 | "import neattext.functions as nfx"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 5,
50 | "id": "21e7e868-35fb-483f-82b6-842a29ef1342",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# Load ML Pkgs\n",
55 | "# Estimators\n",
56 | "from sklearn.linear_model import LogisticRegression\n",
57 | "from sklearn.naive_bayes import MultinomialNB\n",
58 | "\n",
59 | "# Transformers\n",
60 | "from sklearn.feature_extraction.text import CountVectorizer\n",
61 | "from sklearn.model_selection import train_test_split\n",
62 | "from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 10,
68 | "id": "b209e004-ab77-4407-8689-b4318944d47f",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Load Dataset\n",
73 | "df = pd.read_csv(\"data/emotion_dataset_raw.csv\")"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 11,
79 | "id": "fea2d4c0-3bdd-405e-ab69-507ceaac36cb",
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/html": [
85 | "
\n",
86 | "\n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " | \n",
103 | " Emotion | \n",
104 | " Text | \n",
105 | "
\n",
106 | " \n",
107 | " \n",
108 | " \n",
109 | " 0 | \n",
110 | " neutral | \n",
111 | " Why ? | \n",
112 | "
\n",
113 | " \n",
114 | " 1 | \n",
115 | " joy | \n",
116 | " Sage Act upgrade on my to do list for tommorow. | \n",
117 | "
\n",
118 | " \n",
119 | " 2 | \n",
120 | " sadness | \n",
121 | " ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... | \n",
122 | "
\n",
123 | " \n",
124 | " 3 | \n",
125 | " joy | \n",
126 | " Such an eye ! The true hazel eye-and so brill... | \n",
127 | "
\n",
128 | " \n",
129 | " 4 | \n",
130 | " joy | \n",
131 | " @Iluvmiasantos ugh babe.. hugggzzz for u .! b... | \n",
132 | "
\n",
133 | " \n",
134 | "
\n",
135 | "
"
136 | ],
137 | "text/plain": [
138 | " Emotion Text\n",
139 | "0 neutral Why ? \n",
140 | "1 joy Sage Act upgrade on my to do list for tommorow.\n",
141 | "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...\n",
142 | "3 joy Such an eye ! The true hazel eye-and so brill...\n",
143 | "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b..."
144 | ]
145 | },
146 | "execution_count": 11,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "df.head()"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 12,
158 | "id": "430565a3-cf3b-4c6f-afa5-bafd084f5676",
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "data": {
163 | "text/plain": [
164 | "joy 11045\n",
165 | "sadness 6722\n",
166 | "fear 5410\n",
167 | "anger 4297\n",
168 | "surprise 4062\n",
169 | "neutral 2254\n",
170 | "disgust 856\n",
171 | "shame 146\n",
172 | "Name: Emotion, dtype: int64"
173 | ]
174 | },
175 | "execution_count": 12,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "# Value Counts\n",
182 | "df['Emotion'].value_counts()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 13,
188 | "id": "531d3449-a959-4a19-bff0-3ffed551e619",
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | ""
195 | ]
196 | },
197 | "execution_count": 13,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | },
201 | {
202 | "data": {
203 | "image/png": "\n",
204 | "text/plain": [
205 | ""
206 | ]
207 | },
208 | "metadata": {
209 | "needs_background": "light"
210 | },
211 | "output_type": "display_data"
212 | }
213 | ],
214 | "source": [
215 | "# Plot\n",
216 | "sns.countplot(x='Emotion',data=df)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 14,
222 | "id": "40f991d0-952f-40c1-bf00-f3476ce0436d",
223 | "metadata": {
224 | "collapsed": true,
225 | "jupyter": {
226 | "outputs_hidden": true
227 | },
228 | "tags": []
229 | },
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "['BTC_ADDRESS_REGEX',\n",
235 | " 'CURRENCY_REGEX',\n",
236 | " 'CURRENCY_SYMB_REGEX',\n",
237 | " 'Counter',\n",
238 | " 'DATE_REGEX',\n",
239 | " 'EMAIL_REGEX',\n",
240 | " 'EMOJI_REGEX',\n",
241 | " 'HASTAG_REGEX',\n",
242 | " 'MASTERCard_REGEX',\n",
243 | " 'MD5_SHA_REGEX',\n",
244 | " 'MOST_COMMON_PUNCT_REGEX',\n",
245 | " 'NUMBERS_REGEX',\n",
246 | " 'PHONE_REGEX',\n",
247 | " 'PoBOX_REGEX',\n",
248 | " 'SPECIAL_CHARACTERS_REGEX',\n",
249 | " 'STOPWORDS',\n",
250 | " 'STOPWORDS_de',\n",
251 | " 'STOPWORDS_en',\n",
252 | " 'STOPWORDS_es',\n",
253 | " 'STOPWORDS_fr',\n",
254 | " 'STOPWORDS_ru',\n",
255 | " 'STOPWORDS_yo',\n",
256 | " 'STREET_ADDRESS_REGEX',\n",
257 | " 'TextFrame',\n",
258 | " 'URL_PATTERN',\n",
259 | " 'USER_HANDLES_REGEX',\n",
260 | " 'VISACard_REGEX',\n",
261 | " '__builtins__',\n",
262 | " '__cached__',\n",
263 | " '__doc__',\n",
264 | " '__file__',\n",
265 | " '__generate_text',\n",
266 | " '__loader__',\n",
267 | " '__name__',\n",
268 | " '__numbers_dict',\n",
269 | " '__package__',\n",
270 | " '__spec__',\n",
271 | " '_lex_richness_herdan',\n",
272 | " '_lex_richness_maas_ttr',\n",
273 | " 'clean_text',\n",
274 | " 'defaultdict',\n",
275 | " 'digit2words',\n",
276 | " 'extract_btc_address',\n",
277 | " 'extract_currencies',\n",
278 | " 'extract_currency_symbols',\n",
279 | " 'extract_dates',\n",
280 | " 'extract_emails',\n",
281 | " 'extract_emojis',\n",
282 | " 'extract_hashtags',\n",
283 | " 'extract_html_tags',\n",
284 | " 'extract_mastercard_addr',\n",
285 | " 'extract_md5sha',\n",
286 | " 'extract_numbers',\n",
287 | " 'extract_pattern',\n",
288 | " 'extract_phone_numbers',\n",
289 | " 'extract_postoffice_box',\n",
290 | " 'extract_shortwords',\n",
291 | " 'extract_special_characters',\n",
292 | " 'extract_stopwords',\n",
293 | " 'extract_street_address',\n",
294 | " 'extract_urls',\n",
295 | " 'extract_userhandles',\n",
296 | " 'extract_visacard_addr',\n",
297 | " 'fix_contractions',\n",
298 | " 'generate_sentence',\n",
299 | " 'hamming_distance',\n",
300 | " 'inverse_df',\n",
301 | " 'lexical_richness',\n",
302 | " 'markov_chain',\n",
303 | " 'math',\n",
304 | " 'nlargest',\n",
305 | " 'normalize',\n",
306 | " 'num2words',\n",
307 | " 'random',\n",
308 | " 're',\n",
309 | " 'read_txt',\n",
310 | " 'remove_bad_quotes',\n",
311 | " 'remove_btc_address',\n",
312 | " 'remove_currencies',\n",
313 | " 'remove_currency_symbols',\n",
314 | " 'remove_custom_pattern',\n",
315 | " 'remove_custom_words',\n",
316 | " 'remove_dates',\n",
317 | " 'remove_emails',\n",
318 | " 'remove_emojis',\n",
319 | " 'remove_hashtags',\n",
320 | " 'remove_html_tags',\n",
321 | " 'remove_mastercard_addr',\n",
322 | " 'remove_md5sha',\n",
323 | " 'remove_multiple_spaces',\n",
324 | " 'remove_non_ascii',\n",
325 | " 'remove_numbers',\n",
326 | " 'remove_phone_numbers',\n",
327 | " 'remove_postoffice_box',\n",
328 | " 'remove_puncts',\n",
329 | " 'remove_punctuations',\n",
330 | " 'remove_shortwords',\n",
331 | " 'remove_special_characters',\n",
332 | " 'remove_stopwords',\n",
333 | " 'remove_street_address',\n",
334 | " 'remove_urls',\n",
335 | " 'remove_userhandles',\n",
336 | " 'remove_visacard_addr',\n",
337 | " 'replace_bad_quotes',\n",
338 | " 'replace_currencies',\n",
339 | " 'replace_currency_symbols',\n",
340 | " 'replace_dates',\n",
341 | " 'replace_emails',\n",
342 | " 'replace_emojis',\n",
343 | " 'replace_numbers',\n",
344 | " 'replace_phone_numbers',\n",
345 | " 'replace_special_characters',\n",
346 | " 'replace_term',\n",
347 | " 'replace_urls',\n",
348 | " 'string',\n",
349 | " 'term_freq',\n",
350 | " 'to_txt',\n",
351 | " 'word_freq',\n",
352 | " 'word_length_freq']"
353 | ]
354 | },
355 | "execution_count": 14,
356 | "metadata": {},
357 | "output_type": "execute_result"
358 | }
359 | ],
360 | "source": [
361 | "# Data Cleaning\n",
362 | "dir(nfx)"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 17,
368 | "id": "b1f87847-a91c-4bd6-a307-d746eb5aa9a0",
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "# User handles\n",
373 | "df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 18,
379 | "id": "03886bc3-1ac4-4f1b-842b-e5d2d770ff81",
380 | "metadata": {},
381 | "outputs": [],
382 | "source": [
383 | "# Stopwords\n",
384 | "df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 19,
390 | "id": "0a0fcc0c-4adf-4f0b-b226-164659ad70ba",
391 | "metadata": {
392 | "collapsed": true,
393 | "jupyter": {
394 | "outputs_hidden": true
395 | },
396 | "tags": []
397 | },
398 | "outputs": [
399 | {
400 | "data": {
401 | "text/html": [
402 | "\n",
403 | "\n",
416 | "
\n",
417 | " \n",
418 | " \n",
419 | " | \n",
420 | " Emotion | \n",
421 | " Text | \n",
422 | " Clean_Text | \n",
423 | "
\n",
424 | " \n",
425 | " \n",
426 | " \n",
427 | " 0 | \n",
428 | " neutral | \n",
429 | " Why ? | \n",
430 | " ? | \n",
431 | "
\n",
432 | " \n",
433 | " 1 | \n",
434 | " joy | \n",
435 | " Sage Act upgrade on my to do list for tommorow. | \n",
436 | " Sage Act upgrade list tommorow. | \n",
437 | "
\n",
438 | " \n",
439 | " 2 | \n",
440 | " sadness | \n",
441 | " ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... | \n",
442 | " WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS... | \n",
443 | "
\n",
444 | " \n",
445 | " 3 | \n",
446 | " joy | \n",
447 | " Such an eye ! The true hazel eye-and so brill... | \n",
448 | " eye ! true hazel eye-and brilliant ! Regular f... | \n",
449 | "
\n",
450 | " \n",
451 | " 4 | \n",
452 | " joy | \n",
453 | " @Iluvmiasantos ugh babe.. hugggzzz for u .! b... | \n",
454 | " ugh babe.. hugggzzz u .! babe naamazed nga ako... | \n",
455 | "
\n",
456 | " \n",
457 | " ... | \n",
458 | " ... | \n",
459 | " ... | \n",
460 | " ... | \n",
461 | "
\n",
462 | " \n",
463 | " 34787 | \n",
464 | " surprise | \n",
465 | " @MichelGW have you gift! Hope you like it! It'... | \n",
466 | " gift! Hope like it! hand wear ! It'll warm! Lol | \n",
467 | "
\n",
468 | " \n",
469 | " 34788 | \n",
470 | " joy | \n",
471 | " The world didnt give it to me..so the world MO... | \n",
472 | " world didnt me..so world DEFINITELY cnt away!!! | \n",
473 | "
\n",
474 | " \n",
475 | " 34789 | \n",
476 | " anger | \n",
477 | " A man robbed me today . | \n",
478 | " man robbed today . | \n",
479 | "
\n",
480 | " \n",
481 | " 34790 | \n",
482 | " fear | \n",
483 | " Youu call it JEALOUSY, I call it of #Losing YO... | \n",
484 | " Youu JEALOUSY, #Losing YOU... | \n",
485 | "
\n",
486 | " \n",
487 | " 34791 | \n",
488 | " sadness | \n",
489 | " I think about you baby, and I dream about you ... | \n",
490 | " think baby, dream time | \n",
491 | "
\n",
492 | " \n",
493 | "
\n",
494 | "
34792 rows × 3 columns
\n",
495 | "
"
496 | ],
497 | "text/plain": [
498 | " Emotion Text \\\n",
499 | "0 neutral Why ? \n",
500 | "1 joy Sage Act upgrade on my to do list for tommorow. \n",
501 | "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... \n",
502 | "3 joy Such an eye ! The true hazel eye-and so brill... \n",
503 | "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b... \n",
504 | "... ... ... \n",
505 | "34787 surprise @MichelGW have you gift! Hope you like it! It'... \n",
506 | "34788 joy The world didnt give it to me..so the world MO... \n",
507 | "34789 anger A man robbed me today . \n",
508 | "34790 fear Youu call it JEALOUSY, I call it of #Losing YO... \n",
509 | "34791 sadness I think about you baby, and I dream about you ... \n",
510 | "\n",
511 | " Clean_Text \n",
512 | "0 ? \n",
513 | "1 Sage Act upgrade list tommorow. \n",
514 | "2 WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS... \n",
515 | "3 eye ! true hazel eye-and brilliant ! Regular f... \n",
516 | "4 ugh babe.. hugggzzz u .! babe naamazed nga ako... \n",
517 | "... ... \n",
518 | "34787 gift! Hope like it! hand wear ! It'll warm! Lol \n",
519 | "34788 world didnt me..so world DEFINITELY cnt away!!! \n",
520 | "34789 man robbed today . \n",
521 | "34790 Youu JEALOUSY, #Losing YOU... \n",
522 | "34791 think baby, dream time \n",
523 | "\n",
524 | "[34792 rows x 3 columns]"
525 | ]
526 | },
527 | "execution_count": 19,
528 | "metadata": {},
529 | "output_type": "execute_result"
530 | }
531 | ],
532 | "source": [
533 | "df"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 20,
539 | "id": "450c39c0-79dd-4eaf-85fe-57e344eb81bd",
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "# Features & Labels\n",
544 | "Xfeatures = df['Clean_Text']\n",
545 | "ylabels = df['Emotion']"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 21,
551 | "id": "27d7f976-c28f-449e-ae1a-53a42bbda4e8",
552 | "metadata": {},
553 | "outputs": [],
554 | "source": [
555 | "# Split Data\n",
556 | "x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.3,random_state=42)"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 22,
562 | "id": "2f086f29-dba9-40d2-a9dd-f06a6cca3a4c",
563 | "metadata": {},
564 | "outputs": [],
565 | "source": [
566 | "# Build Pipeline\n",
567 | "from sklearn.pipeline import Pipeline"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 23,
573 | "id": "6b81cc86-2bef-40c2-b9a3-668caaadaff0",
574 | "metadata": {},
575 | "outputs": [],
576 | "source": [
577 | "# LogisticRegression Pipeline\n",
578 | "pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": 24,
584 | "id": "dc64b9a7-efe2-4bc4-a0e7-46dff1d52b31",
585 | "metadata": {
586 | "collapsed": true,
587 | "jupyter": {
588 | "outputs_hidden": true
589 | },
590 | "tags": []
591 | },
592 | "outputs": [
593 | {
594 | "name": "stderr",
595 | "output_type": "stream",
596 | "text": [
597 | "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
598 | "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
599 | "\n",
600 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
601 | " https://scikit-learn.org/stable/modules/preprocessing.html\n",
602 | "Please also refer to the documentation for alternative solver options:\n",
603 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
604 | " n_iter_i = _check_optimize_result(\n"
605 | ]
606 | },
607 | {
608 | "data": {
609 | "text/plain": [
610 | "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
611 | ]
612 | },
613 | "execution_count": 24,
614 | "metadata": {},
615 | "output_type": "execute_result"
616 | }
617 | ],
618 | "source": [
619 | "# Train and Fit Data\n",
620 | "pipe_lr.fit(x_train,y_train)"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 25,
626 | "id": "135ed6f8-56ff-4d53-85e3-541e3a7ae2d7",
627 | "metadata": {},
628 | "outputs": [
629 | {
630 | "data": {
631 | "text/plain": [
632 | "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
633 | ]
634 | },
635 | "execution_count": 25,
636 | "metadata": {},
637 | "output_type": "execute_result"
638 | }
639 | ],
640 | "source": [
641 | "pipe_lr"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 26,
647 | "id": "28396371-5f5c-4a3b-b974-164e047764f3",
648 | "metadata": {},
649 | "outputs": [
650 | {
651 | "data": {
652 | "text/plain": [
653 | "0.6200421536692853"
654 | ]
655 | },
656 | "execution_count": 26,
657 | "metadata": {},
658 | "output_type": "execute_result"
659 | }
660 | ],
661 | "source": [
662 | "# Check Accuracy\n",
663 | "pipe_lr.score(x_test,y_test)"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 27,
669 | "id": "eb3a26b6-d09e-422f-991b-b08c48f55b75",
670 | "metadata": {},
671 | "outputs": [],
672 | "source": [
673 | "# Make A Prediction\n",
674 | "ex1 = \"This book was so interesting it made me happy\""
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": 28,
680 | "id": "b08597d9-6f59-45cb-a648-95b0da1ce313",
681 | "metadata": {},
682 | "outputs": [
683 | {
684 | "data": {
685 | "text/plain": [
686 | "array(['joy'], dtype=object)"
687 | ]
688 | },
689 | "execution_count": 28,
690 | "metadata": {},
691 | "output_type": "execute_result"
692 | }
693 | ],
694 | "source": [
695 | "pipe_lr.predict([ex1])"
696 | ]
697 | },
698 | {
699 | "cell_type": "code",
700 | "execution_count": 29,
701 | "id": "5b3822ac-17fc-43dd-9bb7-8dad07a4d32c",
702 | "metadata": {},
703 | "outputs": [
704 | {
705 | "data": {
706 | "text/plain": [
707 | "array([[1.60353503e-03, 7.05960421e-03, 6.95963589e-03, 9.43781635e-01,\n",
708 | " 1.00430913e-04, 2.63557471e-02, 6.65377751e-05, 1.40728742e-02]])"
709 | ]
710 | },
711 | "execution_count": 29,
712 | "metadata": {},
713 | "output_type": "execute_result"
714 | }
715 | ],
716 | "source": [
717 | "# Prediction Prob\n",
718 | "pipe_lr.predict_proba([ex1])"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 30,
724 | "id": "5b7c4596-d643-48e5-a777-79a6f55c49da",
725 | "metadata": {},
726 | "outputs": [
727 | {
728 | "data": {
729 | "text/plain": [
730 | "array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'shame',\n",
731 | " 'surprise'], dtype=object)"
732 | ]
733 | },
734 | "execution_count": 30,
735 | "metadata": {},
736 | "output_type": "execute_result"
737 | }
738 | ],
739 | "source": [
740 | "# To Know the classes\n",
741 | "pipe_lr.classes_"
742 | ]
743 | },
744 | {
745 | "cell_type": "code",
746 | "execution_count": 31,
747 | "id": "c0d40f62-b1fd-4748-a279-c8f50c748f26",
748 | "metadata": {},
749 | "outputs": [],
750 | "source": [
751 | "# Save Model & Pipeline\n",
752 | "import joblib\n",
753 | "pipeline_file = open(\"emotion_classifier_pipe_lr_03_june_2021.pkl\",\"wb\")\n",
754 | "joblib.dump(pipe_lr,pipeline_file)\n",
755 | "pipeline_file.close()"
756 | ]
757 | },
758 | {
759 | "cell_type": "code",
760 | "execution_count": null,
761 | "id": "377c4e98-67f0-45e5-8dd5-0417585754f0",
762 | "metadata": {},
763 | "outputs": [],
764 | "source": []
765 | }
766 | ],
767 | "metadata": {
768 | "kernelspec": {
769 | "display_name": "Python 3",
770 | "language": "python",
771 | "name": "python3"
772 | },
773 | "language_info": {
774 | "codemirror_mode": {
775 | "name": "ipython",
776 | "version": 3
777 | },
778 | "file_extension": ".py",
779 | "mimetype": "text/x-python",
780 | "name": "python",
781 | "nbconvert_exporter": "python",
782 | "pygments_lexer": "ipython3",
783 | "version": "3.9.1+"
784 | }
785 | },
786 | "nbformat": 4,
787 | "nbformat_minor": 5
788 | }
789 |
--------------------------------------------------------------------------------
/notebooks/End2End-NLP-Project.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "36f38e2a-9998-4c73-bfc3-21b74d64a5ee",
6 | "metadata": {},
7 | "source": [
8 | "### End 2 End NLP Project\n",
9 | "+ Emotion Detection In Text \n",
10 | "+ Text Classifier"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "f0814628-3d83-4fd6-a511-2eccf79f9f1e",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# Load EDA Pkgs\n",
21 | "import pandas as pd\n",
22 | "import numpy as np"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "ea0d580d-c31c-44b7-b09b-10225857eebe",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# Load Data Viz Pkgs\n",
33 | "import seaborn as sns"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "id": "91eccfbf-d4d0-4e16-b0f7-2d7941efddb0",
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Load Text Cleaning Pkgs\n",
44 | "import neattext.functions as nfx"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 5,
50 | "id": "21e7e868-35fb-483f-82b6-842a29ef1342",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# Load ML Pkgs\n",
55 | "# Estimators\n",
56 | "from sklearn.linear_model import LogisticRegression\n",
57 | "from sklearn.naive_bayes import MultinomialNB\n",
58 | "\n",
59 | "# Transformers\n",
60 | "from sklearn.feature_extraction.text import CountVectorizer\n",
61 | "from sklearn.model_selection import train_test_split\n",
62 | "from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 10,
68 | "id": "b209e004-ab77-4407-8689-b4318944d47f",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Load Dataset\n",
73 | "df = pd.read_csv(\"data/emotion_dataset_raw.csv\")"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 11,
79 | "id": "fea2d4c0-3bdd-405e-ab69-507ceaac36cb",
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/html": [
85 | "\n",
86 | "\n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " | \n",
103 | " Emotion | \n",
104 | " Text | \n",
105 | "
\n",
106 | " \n",
107 | " \n",
108 | " \n",
109 | " 0 | \n",
110 | " neutral | \n",
111 | " Why ? | \n",
112 | "
\n",
113 | " \n",
114 | " 1 | \n",
115 | " joy | \n",
116 | " Sage Act upgrade on my to do list for tommorow. | \n",
117 | "
\n",
118 | " \n",
119 | " 2 | \n",
120 | " sadness | \n",
121 | " ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... | \n",
122 | "
\n",
123 | " \n",
124 | " 3 | \n",
125 | " joy | \n",
126 | " Such an eye ! The true hazel eye-and so brill... | \n",
127 | "
\n",
128 | " \n",
129 | " 4 | \n",
130 | " joy | \n",
131 | " @Iluvmiasantos ugh babe.. hugggzzz for u .! b... | \n",
132 | "
\n",
133 | " \n",
134 | "
\n",
135 | "
"
136 | ],
137 | "text/plain": [
138 | " Emotion Text\n",
139 | "0 neutral Why ? \n",
140 | "1 joy Sage Act upgrade on my to do list for tommorow.\n",
141 | "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...\n",
142 | "3 joy Such an eye ! The true hazel eye-and so brill...\n",
143 | "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b..."
144 | ]
145 | },
146 | "execution_count": 11,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "df.head()"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 12,
158 | "id": "430565a3-cf3b-4c6f-afa5-bafd084f5676",
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "data": {
163 | "text/plain": [
164 | "joy 11045\n",
165 | "sadness 6722\n",
166 | "fear 5410\n",
167 | "anger 4297\n",
168 | "surprise 4062\n",
169 | "neutral 2254\n",
170 | "disgust 856\n",
171 | "shame 146\n",
172 | "Name: Emotion, dtype: int64"
173 | ]
174 | },
175 | "execution_count": 12,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "# Value Counts\n",
182 | "df['Emotion'].value_counts()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 13,
188 | "id": "531d3449-a959-4a19-bff0-3ffed551e619",
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | ""
195 | ]
196 | },
197 | "execution_count": 13,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | },
201 | {
202 | "data": {
203 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEGCAYAAABPdROvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAZ1ElEQVR4nO3debgkdX3v8fdHcAERBBm5OEMconNVIHFhRHBFMco1GohKHCMC6r1zNajBJRGjj6I+KFGjEQ3c4Aa4IeICmiiSQSRRFoclDEsIE0GYOMK4IW7g4Pf+Ub8jzZk+h2bqnNMc5v16nn66+tdVv/pVnTr16Vr616kqJEnaWPcYdwMkSfObQSJJ6sUgkST1YpBIknoxSCRJvWw+7gbMte23374WL1487mZI0rxywQUX/LCqFgx7b5MLksWLF7Ny5cpxN0OS5pUk35vqPU9tSZJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ62eS+2T5fXfv2Pxh3EwD4vbesGncTJN3FeEQiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqZdZC5IkH0tyQ5JLB8q2S3JGkqva87YD770xyeokVyZ55kD57klWtfeOTpJWfu8kn23l5yVZPFvLIkma2mwekRwP7Dup7HBgRVUtAVa01yTZBVgG7NqmOSbJZm2aY4HlwJL2mKjzZcBPquqhwPuBv521JZEkTWnWgqSqzgZ+PKl4P+CENnwCsP9A+UlVdXNVXQ2sBvZIsiOwdVWdU1UFnDhpmom6TgH2mThakSTNnbm+RrJDVa0FaM8PbOULgesGxlvTyha24cnlt5umqtYDNwIPGDbTJMuTrEyyct26dTO0KJIkuOtcbB92JFHTlE83zYaFVcdV1dKqWrpgwYKNbKIkaZi5DpLr2+kq2vMNrXwNsNPAeIuA77fyRUPKbzdNks2BbdjwVJokaZbNdZCcBhzchg8GTh0oX9buxNqZ7qL6+e30101J9mzXPw6aNM1EXc8HzmzXUSRJc2jz2ao4yWeAvYHtk6wB3gocBZyc5GXAtcABAFV1WZKTgcuB9cChVXVrq+oVdHeAbQF8tT0APgp8IslquiORZbO1LJKkqc1akFTVC6d4a58pxj8SOHJI+UpgtyHlv6YFkSRpfO4qF9slSfOUQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvYwlSJK8JsllSS5N8pkk90myXZIzklzVnrcdGP+NSVYnuTLJMwfKd0+yqr13dJKMY3kkaVM250GSZCHwamBpVe0GbAYsAw4HVlTVEmBFe02SXdr7uwL7Asck2axVdyywHFjSHvvO4aJIkhjfqa3NgS2SbA5sCXwf2A84ob1/ArB/G94POKmqbq6qq4HVwB5JdgS2rqpzqqqAEwemkSTNkTkPkqr6b+C9wLXAWuDGqvo6sENVrW3jrAUe2CZZCFw3UMWaVrawDU8u30CS5UlWJlm5bt26mVwcSdrkjePU1rZ0Rxk7Aw8C7pvkwOkmGVJW05RvWFh1XFUtraqlCxYsuLNNliRNYxyntp4OXF1V66rqN8AXgMcD17fTVbTnG9r4a4CdBqZfRHcqbE0bnlwuSZpD4wiSa4E9k2zZ7rLaB7gCOA04uI1zMHBqGz4NWJbk3kl2pruofn47/XVTkj1bPQcNTCNJmiObz/UMq+q8JKcAFwLrgYuA44CtgJOTvIwubA5o41+W5GTg8jb+oVV1a6vuFcDxwBbAV9tDkjSH5jxIAKrqrcBb
JxXfTHd0Mmz8I4Ejh5SvBHab8QZKkkbmN9slSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXsbyU7u6+3rCB58w7ibwrVd9a9xNkDYpHpFIknoxSCRJvRgkkqReDBJJUi8GiSSpl5GCJMmKUcokSZueaW//TXIfYEtg+yTbAmlvbQ08aJbbJkmaB+7oeyT/FziMLjQu4LYg+RnwD7PXLEnSfDFtkFTVB4APJHlVVX1wjtokSZpHRrpGUlUfTPL4JH+e5KCJx8bONMn9k5yS5D+SXJFkryTbJTkjyVXteduB8d+YZHWSK5M8c6B89ySr2ntHJ8nwOUqSZsuoF9s/AbwXeCLw2PZY2mO+HwC+VlUPBx4JXAEcDqyoqiXAivaaJLsAy4BdgX2BY5Js1uo5FlgOLGmPfXu0SZK0EUbta2spsEtVVd8ZJtkaeDJwCEBV3QLckmQ/YO822gnAWcAbgP2Ak6rqZuDqJKuBPZJcA2xdVee0ek8E9ge+2reNkqTRjfo9kkuB/zFD8/x9YB3w8SQXJflIkvsCO1TVWoD2/MA2/kLguoHp17SyhW14cvkGkixPsjLJynXr1s3QYkiSYPQg2R64PMnpSU6beGzkPDcHHgMcW1WPBn5BO401hWHXPWqa8g0Lq46rqqVVtXTBggV3tr2SpGmMemrriBmc5xpgTVWd116fQhck1yfZsarWJtkRuGFg/J0Gpl8EfL+VLxpSLkmaQyMFSVV9c6ZmWFU/SHJdkodV1ZXAPsDl7XEwcFR7PrVNchrw6STvo/s+yxLg/Kq6NclNSfYEzgMOArxFWZLm2EhBkuQmbjttdC/gnsAvqmrrjZzvq4BPJbkX8F3gJXSn2U5O8jLgWuAAgKq6LMnJdEGzHji0qm5t9bwCOB7Ygu4iuxfaJWmOjXpEcr/B10n2B/bY2JlW1cUMv314nynGPxI4ckj5SmC3jW2HJKm/jer9t6q+BDxtZpsiSZqPRj219dyBl/egO5ro/Z0SSdL8N+pdW88ZGF4PXEP3RUFJ0iZu1GskL5nthkiS5qdR+9palOSLSW5Icn2SzydZdMdTSpLu7kY9tfVx4NO0W3KBA1vZH81Go6TZ9s0nP2XcTeApZ8/Y17OksRr1rq0FVfXxqlrfHscD9jUiSRo5SH6Y5MAkm7XHgcCPZrNhkqT5YdQgeSnwZ8APgLXA8+m+jS5J2sSNeo3kHcDBVfUTgCTb0f3Q1Utnq2GSpPlh1COSP5wIEYCq+jHw6NlpkiRpPhk1SO4x6TfUt2P0oxlJ0t3YqGHwd8C3k5xC1zXKnzGkE0VJ0qZn1G+2n5hkJV1HjQGeW1WXz2rLJEnzwsinp1pwGB6SpNvZqG7kJUmaYJBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvdgVvKTejjzw+eNuAm/65CnjbsImyyMSSVIvBokkqZexBUmSzZJclOQr7fV2Sc5IclV7HvxFxjcmWZ3kyiTPHCjfPcmq9t7RSTKOZZGkTdk4j0j+Erhi4PXhwIqqWgKsaK9JsguwDNgV2Bc4JslmbZpjgeXAkvbYd26aLkmaMJaL7UkWAX9M93O9r23F+wF7t+ETgLOAN7Tyk6rqZuDqJKuBPZJcA2xdVee0Ok8E9ge+OicLIc2BD73uy+NuAq/8u+eMuwm6ixvXEcnfA38N/HagbIeqWgvQnh/YyhcC1w2Mt6aVLWzDk8s3kGR5kpVJVq5bt25GFkCS1JnzIEnybOCGqrpg1EmGlNU05RsWVh1XVUuraumCBQtGnK0kaRTjOLX1BOBPkjwLuA+wdZJPAtcn2bGq1ibZEbihjb8G2Glg+kXA91v5oiHlkqQ5NOdHJFX1xqpaVFWL6S6in1lVBwKnAQe30Q4GTm3DpwHLktw7yc50
F9XPb6e/bkqyZ7tb66CBaSRJc+Su9M32o4CTk7wMuBY4AKCqLktyMnA5sB44tKpubdO8Ajge2ILuIrsX2iVpjo01SKrqLLq7s6iqHwH7TDHekXR3eE0uXwnsNnstlCTdEb/ZLknqxSCRJPVikEiSejFIJEm9GCSSpF4MEklSLwaJJKkXg0SS1ItBIknqxSCRJPVikEiSejFIJEm9GCSSpF4MEklSLwaJJKkXg0SS1ItBIknqxSCRJPVikEiSejFIJEm9GCSSpF4MEklSLwaJJKkXg0SS1ItBIknqxSCRJPVikEiSetl83A0Yt93/6sRxNwGAC95z0LibIEkbZc6PSJLslOQbSa5IclmSv2zl2yU5I8lV7XnbgWnemGR1kiuTPHOgfPckq9p7RyfJXC+PJG3qxnFqaz3wuqp6BLAncGiSXYDDgRVVtQRY0V7T3lsG7ArsCxyTZLNW17HAcmBJe+w7lwsiSRpDkFTV2qq6sA3fBFwBLAT2A05oo50A7N+G9wNOqqqbq+pqYDWwR5Idga2r6pyqKuDEgWkkSXNkrBfbkywGHg2cB+xQVWuhCxvggW20hcB1A5OtaWUL2/Dk8mHzWZ5kZZKV69atm9FlkKRN3diCJMlWwOeBw6rqZ9ONOqSspinfsLDquKpaWlVLFyxYcOcbK0ma0liCJMk96ULkU1X1hVZ8fTtdRXu+oZWvAXYamHwR8P1WvmhIuSRpDo3jrq0AHwWuqKr3Dbx1GnBwGz4YOHWgfFmSeyfZme6i+vnt9NdNSfZsdR40MI0kaY6M43skTwBeDKxKcnEr+xvgKODkJC8DrgUOAKiqy5KcDFxOd8fXoVV1a5vuFcDxwBbAV9tDkjSH5jxIqurfGH59A2CfKaY5EjhySPlKYLeZa50k6c6yixRJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqRexvELiZKkKRxxxBHjbgJw59rhEYkkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9TLvO21Msi/wAWAz4CNVddSYmyTpLuiKI88cdxMAeMSbnjbuJsy4eX1EkmQz4B+A/wXsArwwyS7jbZUkbVrmdZAAewCrq+q7VXULcBKw35jbJEmblFTVuNuw0ZI8H9i3qv53e/1i4HFV9cpJ4y0HlreXDwOunOGmbA/8cIbrnA22c2bNh3bOhzaC7Zxps9HOB1fVgmFvzPdrJBlStkEyVtVxwHGz1ohkZVUtna36Z4rtnFnzoZ3zoY1gO2faXLdzvp/aWgPsNPB6EfD9MbVFkjZJ8z1IvgMsSbJzknsBy4DTxtwmSdqkzOtTW1W1PskrgdPpbv/9WFVdNoamzNppsxlmO2fWfGjnfGgj2M6ZNqftnNcX2yVJ4zffT21JksbMIJEk9WKQzJAki5P8+UZO+/MZbsu3Z7K+2dLW2aXjbscwSV6d5Ioknxp3W2ZLkn9Ocv9xt2PcklyTZPtxt2OYJEckeX2Styd5+hzMb/+N6R3EIJk5i4GhQZJkTm9qqKrHz+X87qb+AnhWVb1oYytoXfjMmVG3s3TuUVXPqqqfznKzZs3Ecoy7HXOhqt5SVf8yB7Pan667qTunqjbpB10AXAF8GLgM+DqwBfAQ4GvABcC/Ag9v4x8PPH9g+p+353OBG4GLgdcAhwCfA74MnAlsBawALgRWAftNrmMGl+nndF/WfA9waZvfC9p7n5g0708Bf9JzfvcF/gn49za/FwBvobs9+1K6O0gmbuzYvY13zkT7WvkhwBfaOr8KePdA/c9o41/Y1ulWrfwo4HLgEuC9reyANs9/B87eyOX5f8Atbb29CfhYW5aLJtZd227+tbXpQuDxrXxv4BvAp4HLZ3B9XgNs395fCpzVho9o6/frbZ6HAKe29Xgl8NZJ2/kx
bTkePFHnsPkN/K2+Sfc/cDqw44jt/1Kb5jJg+cA2eWSbx7nADq38Ie31d4C3M/C/APxVK78EeNtUyzFD6/Zt3Pa/OfG/vgfw7TafbwMPG9hWv0T3v3018ErgtW28c4HtBpZtg33ICG18U/vb/QvwGeD1DOx3GL7dD12PdNvjVwbq/hBwyLB6gMcDP27LdDHwkJHXa58dyN3h0TbM9cCj2uuTgQPpdvpLWtnjgDPb8O/+oBP/IFP8wQ6h+8LkxEa1ObB1G94eWM1tO9fZCJLnAWfQ3Ra9A3AtsCPwFOBLbbxt2kazec/5PQ/48MDrbSaWu73+BPCcNnwJ8JQ2PDlIvtumvQ/wPbovm24PnA3ct433BrqQ2q79s02sw/u351XAwsGyjVyma9q83wkcOFEf8J90O6Mtgfu08iXAyoHt4BfAzjO8Pq9h6iC5ANhiYD2uBR5A94Ho0jb+YuC3wJ5DlnHY/O5Jt/Nc0MpeQHd7/Sjtn9jmJ+b/ALoeJya2gXcDb27DXwFe2IZfzm3/T8+gfQChO3PyFeDJw5Zjhtbtq9rrv6DrRRxga9r/BvB04PMD63g1cD9gAd0HyJe3994PHNaGh+5D7qB9u9Ntw1u2+a9mIEiYerufaj3uzZAgmaae4xnYv4362CQOC0dwdVVd3IYvoNtYHw98LsnFwD/S7YTvrDOq6sdtOMA7k1xC90ljId0OfrY8EfhMVd1aVdfTfbJ8bFV9E3hokgcCL6T751jfc16rgKcn+dskT6qqG4GnJjkvySrgacCuSbah22C/2ab7xKR6VlTVjVX1a7pPSg8G9qQ71P5W+1sc3Mp/Bvwa+EiS5wK/bHV8Czg+yf+hC9G+ngEc3uZ9Fl3I/R7djvbDbfk+x+1PB5xfVVf3mOew9Tmd06rqVwOvz6iqH7WyL9BtCwDfq6pzR5zfw4DdgDPasr+ZrueIUbw6ycSRx050QXsL3c4ObvsfA9iLbv1Bd0Q14RntcRHdkcLDWz3TLccoplq3XxjStm3o9gGX0gXErgP1fKOqbqqqdXRB8uWB+hcn2YqN24c8CfhiVf2yqn7Ghl+wnmq7n2o9TmWqejbKvP5C4gy6eWD4Vrod/E+r6lFDxl1Pu7aUJMC9pqn3FwPDL6L79LJ7Vf0myTV0O6XZMqwfsgmfaO1ZBry074yq6j+T7A48C3hXkq8DhwJLq+q6JEfQLWsY0hfagMl/h83bNGdU1Qsnj5xkD2CfthyvBJ5WVS9P8jjgj4GLkzyqqn7UY/ECPK+qbtfRZ1um64FH0m0Pvx54e/DvfqdNsT5/t92x4XYzeX6T13FNMd508/sicFlV7XVn2p5kb7pP73tV1S+TnNXa+5tqH3m57W87bVXAu6rqHyfVv3iq5RjFFMsKt217g217B11g/Gmb71kDVQ1uq78deP3bNv09mHofcofNnKb964dt99PUNbjdQNt2NqKeaXlEMtzPgKuTHAC/u6j3yPbeNXSHn9B1WX/PNnwT3aHuVLYBbmgh8lS6T9Wz6WzgBUk2S7KA7rTA+e2944HDAGoGegJI8iDgl1X1SbpzrY9pb/2wfTJ7fpvXT4Ebk0x8Qh7lQva5wBOSPLTNa8sk/7PVu01V/XNblke19x9SVedV1Vvoej/daXi1IzsdeFX70ECSR7fybYC1VfVb4MXMzNEPbR7D1uc13LbdPe8OqvijJNsl2YLu4um3NmJ+VwILkuzVxrlnkl2nqWbCNsBPWog8nO6IcjrnctvyLBsoPx14afs7k2RhO4ruZZptdZhtgP9uw4fcmfm0o4mp9iHTORv40yRbJLkf8JxJ7R+63TP1evwesEuSe7czAvvcQT13tB8byiOSqb0IODbJm+nC4iS6C3QfBk5Ncj7dOdCJT0eXAOvbIf3xwE8m1fcp4MtJVtJdyPqPWWx70X2i3Ku1uYC/rqofAFTV9UmuoLtgOBP+AHhPkt8CvwFeQbcDW0W3A/zOwLgvAT6W5Jd0O4vpF6RqXZJDgM8k
uXcrfjPdBn9qkokjnde0996TZEkrW0G3/H28A/h74JIWJtcAz6a72Pv5tqP4Bj2PQiYZtj63AD6a5G+A8+5g+n+jO+p8KPDpqlrZPlGPPL+quiXdzzQc3XZAm9Othzv64PE14OXtFO6VdDu46RwGfDLJ6+gugt8IUFVfT/II4JyW4T+nu3Z56x3Ud0eGrdtTphj33cAJSV5Ld8PMnTXVPmRKVXVhks/S7SO+R3eRftD9GL7dH8bw9XhdkpPp9k9X0Z0qnK6ek+hO2b6a7lrJf42yoHaRcjeT5AHAhVU15RFPki3pdvKPGeH8u+aRFrpLa9Jv8txVtW3xV1VVSZbRXTDeb9ztmm/GvR49IrkbaYftZ9Edsk81ztPpbmd9nyGiu4DdgQ+1o72fMgPX7DZRY12PHpFIknrxYrskqReDRJLUi0EiSerFIJF6SHJrkosHHofPQJ2360k6ydIkR/etV5otXmyXekjy86raaobr3Bt4fVU9eybrlWaLRyTSLEj3GxfvTHJOkpVJHpPk9CT/leTlbZwkeU+SS5OsSvKCNvlRwJPaEc5rkuyd5Cttmu2SfCnJJUnOTfKHrfyIJB9LclaS77YvlElzwu+RSP1ska5TvgnvqqrPtuHrqmqvJO+n6+3gCXR9HV1G11X9c+m6pngkXS+830lyNnA4A0ck7QhlwtuAi6pq/yRPA07ktu4tHg48le5by1cmObaqfjOTCysNY5BI/fxqmo75JnpuXUX3Gyo3ATcl+XW6Xyb8XQ/NwPVJvgk8lq6vt6k8kdanUlWdmeQBrQsTgH+qqpuBm5PcQNf56JoeyyaNxFNb0uwZ7BF2cm+xEz0b31nDppm40Dms92Rp1hkk0vhM1UPzdD2wnk3rNbmd8vph62lWGhs/sUj9TL5G8rWqGvUW4KE9NCf5EbfvSfqigWmOAD7eetf9Jd0PfUlj5e2/kqRePLUlSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqZf/D8jGbTFp2Y+OAAAAAElFTkSuQmCC\n",
204 | "text/plain": [
205 | ""
206 | ]
207 | },
208 | "metadata": {
209 | "needs_background": "light"
210 | },
211 | "output_type": "display_data"
212 | }
213 | ],
214 | "source": [
215 | "# Plot\n",
216 | "sns.countplot(x='Emotion',data=df)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 14,
222 | "id": "40f991d0-952f-40c1-bf00-f3476ce0436d",
223 | "metadata": {
224 | "collapsed": true,
225 | "jupyter": {
226 | "outputs_hidden": true
227 | },
228 | "tags": []
229 | },
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "['BTC_ADDRESS_REGEX',\n",
235 | " 'CURRENCY_REGEX',\n",
236 | " 'CURRENCY_SYMB_REGEX',\n",
237 | " 'Counter',\n",
238 | " 'DATE_REGEX',\n",
239 | " 'EMAIL_REGEX',\n",
240 | " 'EMOJI_REGEX',\n",
241 | " 'HASTAG_REGEX',\n",
242 | " 'MASTERCard_REGEX',\n",
243 | " 'MD5_SHA_REGEX',\n",
244 | " 'MOST_COMMON_PUNCT_REGEX',\n",
245 | " 'NUMBERS_REGEX',\n",
246 | " 'PHONE_REGEX',\n",
247 | " 'PoBOX_REGEX',\n",
248 | " 'SPECIAL_CHARACTERS_REGEX',\n",
249 | " 'STOPWORDS',\n",
250 | " 'STOPWORDS_de',\n",
251 | " 'STOPWORDS_en',\n",
252 | " 'STOPWORDS_es',\n",
253 | " 'STOPWORDS_fr',\n",
254 | " 'STOPWORDS_ru',\n",
255 | " 'STOPWORDS_yo',\n",
256 | " 'STREET_ADDRESS_REGEX',\n",
257 | " 'TextFrame',\n",
258 | " 'URL_PATTERN',\n",
259 | " 'USER_HANDLES_REGEX',\n",
260 | " 'VISACard_REGEX',\n",
261 | " '__builtins__',\n",
262 | " '__cached__',\n",
263 | " '__doc__',\n",
264 | " '__file__',\n",
265 | " '__generate_text',\n",
266 | " '__loader__',\n",
267 | " '__name__',\n",
268 | " '__numbers_dict',\n",
269 | " '__package__',\n",
270 | " '__spec__',\n",
271 | " '_lex_richness_herdan',\n",
272 | " '_lex_richness_maas_ttr',\n",
273 | " 'clean_text',\n",
274 | " 'defaultdict',\n",
275 | " 'digit2words',\n",
276 | " 'extract_btc_address',\n",
277 | " 'extract_currencies',\n",
278 | " 'extract_currency_symbols',\n",
279 | " 'extract_dates',\n",
280 | " 'extract_emails',\n",
281 | " 'extract_emojis',\n",
282 | " 'extract_hashtags',\n",
283 | " 'extract_html_tags',\n",
284 | " 'extract_mastercard_addr',\n",
285 | " 'extract_md5sha',\n",
286 | " 'extract_numbers',\n",
287 | " 'extract_pattern',\n",
288 | " 'extract_phone_numbers',\n",
289 | " 'extract_postoffice_box',\n",
290 | " 'extract_shortwords',\n",
291 | " 'extract_special_characters',\n",
292 | " 'extract_stopwords',\n",
293 | " 'extract_street_address',\n",
294 | " 'extract_urls',\n",
295 | " 'extract_userhandles',\n",
296 | " 'extract_visacard_addr',\n",
297 | " 'fix_contractions',\n",
298 | " 'generate_sentence',\n",
299 | " 'hamming_distance',\n",
300 | " 'inverse_df',\n",
301 | " 'lexical_richness',\n",
302 | " 'markov_chain',\n",
303 | " 'math',\n",
304 | " 'nlargest',\n",
305 | " 'normalize',\n",
306 | " 'num2words',\n",
307 | " 'random',\n",
308 | " 're',\n",
309 | " 'read_txt',\n",
310 | " 'remove_bad_quotes',\n",
311 | " 'remove_btc_address',\n",
312 | " 'remove_currencies',\n",
313 | " 'remove_currency_symbols',\n",
314 | " 'remove_custom_pattern',\n",
315 | " 'remove_custom_words',\n",
316 | " 'remove_dates',\n",
317 | " 'remove_emails',\n",
318 | " 'remove_emojis',\n",
319 | " 'remove_hashtags',\n",
320 | " 'remove_html_tags',\n",
321 | " 'remove_mastercard_addr',\n",
322 | " 'remove_md5sha',\n",
323 | " 'remove_multiple_spaces',\n",
324 | " 'remove_non_ascii',\n",
325 | " 'remove_numbers',\n",
326 | " 'remove_phone_numbers',\n",
327 | " 'remove_postoffice_box',\n",
328 | " 'remove_puncts',\n",
329 | " 'remove_punctuations',\n",
330 | " 'remove_shortwords',\n",
331 | " 'remove_special_characters',\n",
332 | " 'remove_stopwords',\n",
333 | " 'remove_street_address',\n",
334 | " 'remove_urls',\n",
335 | " 'remove_userhandles',\n",
336 | " 'remove_visacard_addr',\n",
337 | " 'replace_bad_quotes',\n",
338 | " 'replace_currencies',\n",
339 | " 'replace_currency_symbols',\n",
340 | " 'replace_dates',\n",
341 | " 'replace_emails',\n",
342 | " 'replace_emojis',\n",
343 | " 'replace_numbers',\n",
344 | " 'replace_phone_numbers',\n",
345 | " 'replace_special_characters',\n",
346 | " 'replace_term',\n",
347 | " 'replace_urls',\n",
348 | " 'string',\n",
349 | " 'term_freq',\n",
350 | " 'to_txt',\n",
351 | " 'word_freq',\n",
352 | " 'word_length_freq']"
353 | ]
354 | },
355 | "execution_count": 14,
356 | "metadata": {},
357 | "output_type": "execute_result"
358 | }
359 | ],
360 | "source": [
361 | "# Data Cleaning: list the names available in nfx\n",
362 | "dir(nfx)"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 17,
368 | "id": "b1f87847-a91c-4bd6-a307-d746eb5aa9a0",
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "# Remove user handles (@mentions) from Text\n",
373 | "df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 18,
379 | "id": "03886bc3-1ac4-4f1b-842b-e5d2d770ff81",
380 | "metadata": {},
381 | "outputs": [],
382 | "source": [
383 | "# Remove stopwords from Clean_Text\n",
384 | "df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 19,
390 | "id": "0a0fcc0c-4adf-4f0b-b226-164659ad70ba",
391 | "metadata": {
392 | "collapsed": true,
393 | "jupyter": {
394 | "outputs_hidden": true
395 | },
396 | "tags": []
397 | },
398 | "outputs": [
399 | {
400 | "data": {
401 | "text/html": [
402 | "\n",
403 | "\n",
416 | "
\n",
417 | " \n",
418 | " \n",
419 | " | \n",
420 | " Emotion | \n",
421 | " Text | \n",
422 | " Clean_Text | \n",
423 | "
\n",
424 | " \n",
425 | " \n",
426 | " \n",
427 | " 0 | \n",
428 | " neutral | \n",
429 | " Why ? | \n",
430 | " ? | \n",
431 | "
\n",
432 | " \n",
433 | " 1 | \n",
434 | " joy | \n",
435 | " Sage Act upgrade on my to do list for tommorow. | \n",
436 | " Sage Act upgrade list tommorow. | \n",
437 | "
\n",
438 | " \n",
439 | " 2 | \n",
440 | " sadness | \n",
441 | " ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... | \n",
442 | " WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS... | \n",
443 | "
\n",
444 | " \n",
445 | " 3 | \n",
446 | " joy | \n",
447 | " Such an eye ! The true hazel eye-and so brill... | \n",
448 | " eye ! true hazel eye-and brilliant ! Regular f... | \n",
449 | "
\n",
450 | " \n",
451 | " 4 | \n",
452 | " joy | \n",
453 | " @Iluvmiasantos ugh babe.. hugggzzz for u .! b... | \n",
454 | " ugh babe.. hugggzzz u .! babe naamazed nga ako... | \n",
455 | "
\n",
456 | " \n",
457 | " ... | \n",
458 | " ... | \n",
459 | " ... | \n",
460 | " ... | \n",
461 | "
\n",
462 | " \n",
463 | " 34787 | \n",
464 | " surprise | \n",
465 | " @MichelGW have you gift! Hope you like it! It'... | \n",
466 | " gift! Hope like it! hand wear ! It'll warm! Lol | \n",
467 | "
\n",
468 | " \n",
469 | " 34788 | \n",
470 | " joy | \n",
471 | " The world didnt give it to me..so the world MO... | \n",
472 | " world didnt me..so world DEFINITELY cnt away!!! | \n",
473 | "
\n",
474 | " \n",
475 | " 34789 | \n",
476 | " anger | \n",
477 | " A man robbed me today . | \n",
478 | " man robbed today . | \n",
479 | "
\n",
480 | " \n",
481 | " 34790 | \n",
482 | " fear | \n",
483 | " Youu call it JEALOUSY, I call it of #Losing YO... | \n",
484 | " Youu JEALOUSY, #Losing YOU... | \n",
485 | "
\n",
486 | " \n",
487 | " 34791 | \n",
488 | " sadness | \n",
489 | " I think about you baby, and I dream about you ... | \n",
490 | " think baby, dream time | \n",
491 | "
\n",
492 | " \n",
493 | "
\n",
494 | "
34792 rows × 3 columns
\n",
495 | "
"
496 | ],
497 | "text/plain": [
498 | " Emotion Text \\\n",
499 | "0 neutral Why ? \n",
500 | "1 joy Sage Act upgrade on my to do list for tommorow. \n",
501 | "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... \n",
502 | "3 joy Such an eye ! The true hazel eye-and so brill... \n",
503 | "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b... \n",
504 | "... ... ... \n",
505 | "34787 surprise @MichelGW have you gift! Hope you like it! It'... \n",
506 | "34788 joy The world didnt give it to me..so the world MO... \n",
507 | "34789 anger A man robbed me today . \n",
508 | "34790 fear Youu call it JEALOUSY, I call it of #Losing YO... \n",
509 | "34791 sadness I think about you baby, and I dream about you ... \n",
510 | "\n",
511 | " Clean_Text \n",
512 | "0 ? \n",
513 | "1 Sage Act upgrade list tommorow. \n",
514 | "2 WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS... \n",
515 | "3 eye ! true hazel eye-and brilliant ! Regular f... \n",
516 | "4 ugh babe.. hugggzzz u .! babe naamazed nga ako... \n",
517 | "... ... \n",
518 | "34787 gift! Hope like it! hand wear ! It'll warm! Lol \n",
519 | "34788 world didnt me..so world DEFINITELY cnt away!!! \n",
520 | "34789 man robbed today . \n",
521 | "34790 Youu JEALOUSY, #Losing YOU... \n",
522 | "34791 think baby, dream time \n",
523 | "\n",
524 | "[34792 rows x 3 columns]"
525 | ]
526 | },
527 | "execution_count": 19,
528 | "metadata": {},
529 | "output_type": "execute_result"
530 | }
531 | ],
532 | "source": [
533 | "df"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 20,
539 | "id": "450c39c0-79dd-4eaf-85fe-57e344eb81bd",
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "# Features & Labels\n",
544 | "Xfeatures = df['Clean_Text']\n",
545 | "ylabels = df['Emotion']"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 21,
551 | "id": "27d7f976-c28f-449e-ae1a-53a42bbda4e8",
552 | "metadata": {},
553 | "outputs": [],
554 | "source": [
555 | "# Split Data\n",
556 | "x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.3,random_state=42)"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 22,
562 | "id": "2f086f29-dba9-40d2-a9dd-f06a6cca3a4c",
563 | "metadata": {},
564 | "outputs": [],
565 | "source": [
566 | "# Build Pipeline\n",
567 | "from sklearn.pipeline import Pipeline"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 23,
573 | "id": "6b81cc86-2bef-40c2-b9a3-668caaadaff0",
574 | "metadata": {},
575 | "outputs": [],
576 | "source": [
577 | "# LogisticRegression Pipeline (NOTE: with the default max_iter, fit() emits a ConvergenceWarning; consider raising max_iter)\n",
578 | "pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": 24,
584 | "id": "dc64b9a7-efe2-4bc4-a0e7-46dff1d52b31",
585 | "metadata": {
586 | "collapsed": true,
587 | "jupyter": {
588 | "outputs_hidden": true
589 | },
590 | "tags": []
591 | },
592 | "outputs": [
593 | {
594 | "name": "stderr",
595 | "output_type": "stream",
596 | "text": [
597 | "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
598 | "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
599 | "\n",
600 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
601 | " https://scikit-learn.org/stable/modules/preprocessing.html\n",
602 | "Please also refer to the documentation for alternative solver options:\n",
603 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
604 | " n_iter_i = _check_optimize_result(\n"
605 | ]
606 | },
607 | {
608 | "data": {
609 | "text/plain": [
610 | "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
611 | ]
612 | },
613 | "execution_count": 24,
614 | "metadata": {},
615 | "output_type": "execute_result"
616 | }
617 | ],
618 | "source": [
619 | "# Train and Fit Data\n",
620 | "pipe_lr.fit(x_train,y_train)"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 25,
626 | "id": "135ed6f8-56ff-4d53-85e3-541e3a7ae2d7",
627 | "metadata": {},
628 | "outputs": [
629 | {
630 | "data": {
631 | "text/plain": [
632 | "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
633 | ]
634 | },
635 | "execution_count": 25,
636 | "metadata": {},
637 | "output_type": "execute_result"
638 | }
639 | ],
640 | "source": [
641 | "pipe_lr"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 26,
647 | "id": "28396371-5f5c-4a3b-b974-164e047764f3",
648 | "metadata": {},
649 | "outputs": [
650 | {
651 | "data": {
652 | "text/plain": [
653 | "0.6200421536692853"
654 | ]
655 | },
656 | "execution_count": 26,
657 | "metadata": {},
658 | "output_type": "execute_result"
659 | }
660 | ],
661 | "source": [
662 | "# Check Accuracy\n",
663 | "pipe_lr.score(x_test,y_test)"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 27,
669 | "id": "eb3a26b6-d09e-422f-991b-b08c48f55b75",
670 | "metadata": {},
671 | "outputs": [],
672 | "source": [
673 | "# Make A Prediction\n",
674 | "ex1 = \"This book was so interesting it made me happy\""
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": 28,
680 | "id": "b08597d9-6f59-45cb-a648-95b0da1ce313",
681 | "metadata": {},
682 | "outputs": [
683 | {
684 | "data": {
685 | "text/plain": [
686 | "array(['joy'], dtype=object)"
687 | ]
688 | },
689 | "execution_count": 28,
690 | "metadata": {},
691 | "output_type": "execute_result"
692 | }
693 | ],
694 | "source": [
695 | "pipe_lr.predict([ex1])"
696 | ]
697 | },
698 | {
699 | "cell_type": "code",
700 | "execution_count": 29,
701 | "id": "5b3822ac-17fc-43dd-9bb7-8dad07a4d32c",
702 | "metadata": {},
703 | "outputs": [
704 | {
705 | "data": {
706 | "text/plain": [
707 | "array([[1.60353503e-03, 7.05960421e-03, 6.95963589e-03, 9.43781635e-01,\n",
708 | " 1.00430913e-04, 2.63557471e-02, 6.65377751e-05, 1.40728742e-02]])"
709 | ]
710 | },
711 | "execution_count": 29,
712 | "metadata": {},
713 | "output_type": "execute_result"
714 | }
715 | ],
716 | "source": [
717 | "# Prediction probabilities for each class\n",
718 | "pipe_lr.predict_proba([ex1])"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 30,
724 | "id": "5b7c4596-d643-48e5-a777-79a6f55c49da",
725 | "metadata": {},
726 | "outputs": [
727 | {
728 | "data": {
729 | "text/plain": [
730 | "array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'shame',\n",
731 | " 'surprise'], dtype=object)"
732 | ]
733 | },
734 | "execution_count": 30,
735 | "metadata": {},
736 | "output_type": "execute_result"
737 | }
738 | ],
739 | "source": [
740 | "# Class labels known to the fitted pipeline\n",
741 | "pipe_lr.classes_"
742 | ]
743 | },
744 | {
745 | "cell_type": "code",
746 | "execution_count": 31,
747 | "id": "c0d40f62-b1fd-4748-a279-c8f50c748f26",
748 | "metadata": {},
749 | "outputs": [],
750 | "source": [
751 | "# Save Model & Pipeline\n",
752 | "import joblib\n",
753 | "pipeline_file = open(\"emotion_classifier_pipe_lr_03_june_2021.pkl\",\"wb\")\n",
754 | "joblib.dump(pipe_lr,pipeline_file)\n",
755 | "pipeline_file.close()"
756 | ]
757 | },
758 | {
759 | "cell_type": "code",
760 | "execution_count": null,
761 | "id": "377c4e98-67f0-45e5-8dd5-0417585754f0",
762 | "metadata": {},
763 | "outputs": [],
764 | "source": []
765 | }
766 | ],
767 | "metadata": {
768 | "kernelspec": {
769 | "display_name": "Python 3",
770 | "language": "python",
771 | "name": "python3"
772 | },
773 | "language_info": {
774 | "codemirror_mode": {
775 | "name": "ipython",
776 | "version": 3
777 | },
778 | "file_extension": ".py",
779 | "mimetype": "text/x-python",
780 | "name": "python",
781 | "nbconvert_exporter": "python",
782 | "pygments_lexer": "ipython3",
783 | "version": "3.9.1+"
784 | }
785 | },
786 | "nbformat": 4,
787 | "nbformat_minor": 5
788 | }
789 |
--------------------------------------------------------------------------------
/notebooks/emotion_classifier_pipe_lr_03_june_2021.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/end2end-nlp-project/f82d87ed6d2608ad2c9b43e29e06da53429c1915/notebooks/emotion_classifier_pipe_lr_03_june_2021.pkl
--------------------------------------------------------------------------------