├── LICENSE ├── README.md ├── app.py ├── home.jpg ├── result.jpg ├── speech_classification1.pkl ├── speech_classification2.pkl ├── speech_classification3.pkl ├── speech_classification4.pkl ├── speech_classification5.pkl ├── speech_classification6.pkl ├── static └── styles.css └── templates ├── home.html └── result.html /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Aniket Gupta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Toxic Speech Classification 2 | It is a full-fetched web application.Based on sentiment Classification, by using nltk library it predicts that a speech is how much toxic, sever toxic, insult, obscene, threat. 
3 | 4 | ### Where Front-End looks like this 5 | [Home Page](https://github.com/anik8gupta/Toxic_Speech_Classification/blob/master/templates/home.html) | [Result Page](https://github.com/anik8gupta/Toxic_Speech_Classification/blob/master/templates/result.html) 6 | :-------------------------:|:-------------------------: 7 | ![](https://github.com/anik8gupta/Toxic_Speech_Classification/blob/master/home.jpg) | ![](https://github.com/anik8gupta/Toxic_Speech_Classification/blob/master/result.jpg) 8 | 9 | Here simple CSS & HTML is used with POST method to send the data. 10 | 11 | ### For Back-End [app.py](https://github.com/anik8gupta/Toxic_Speech_Classification/blob/master/app.py), I use FLASK framework of Python 12 | 13 | ### Main Libraries Used 14 | * [Pandas](https://pandas.pydata.org/pandas-docs/version/0.22/) 15 | * [Matplotlib.pyplot](https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.plot.html) 16 | * [NLTK](https://www.nltk.org/) 17 | * [Pickle](https://docs.python.org/3/library/pickle.html) 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # Import Libraries 2 | from flask import Flask,render_template,url_for,request 3 | import pandas as pd 4 | 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | 7 | from sklearn.externals import joblib 8 | 9 | 10 | app = Flask(__name__) 11 | 12 | @app.route('/') 13 | def home(): 14 | return render_template('home.html') 15 | 16 | @app.route('/predict',methods=['POST']) 17 | def predict(): 18 | 19 | import pickle as p 20 | # un-serializing model 21 | clf1 = p.load(open('speech_classification1.pkl', 'rb')) 22 | clf2 = p.load(open('speech_classification2.pkl', 'rb')) 23 | clf3 = p.load(open('speech_classification3.pkl', 'rb')) 24 | clf4 = p.load(open('speech_classification4.pkl', 'rb')) 25 | clf5 = p.load(open('speech_classification5.pkl', 'rb')) 26 | clf6 
= p.load(open('speech_classification6.pkl', 'rb')) 27 | 28 | message = request.form['message'] 29 | data = message 30 | 31 | import re 32 | import nltk 33 | from nltk.corpus import stopwords 34 | from nltk.stem.porter import PorterStemmer 35 | 36 | ps = PorterStemmer() 37 | #getting setences from speech# 38 | from nltk.tokenize import sent_tokenize 39 | tokenize=sent_tokenize(data) 40 | 41 | corpus3=[] 42 | 43 | for i in range(0, len(tokenize)): 44 | review3 = re.sub('[^a-zA-Z]', ' ', tokenize[i]) 45 | review3 = review3.lower() 46 | review3 = review3.split() 47 | #review = [word for word in review if not word in set(stopwords.words('english'))] 48 | review3 = [ps.stem(word) for word in review3 if not word in set(stopwords.words('english'))] 49 | review3 = ' '.join(review3) 50 | corpus3.append(review3) 51 | 52 | #getting best 100 words 53 | cv3 = CountVectorizer(max_features = 100) 54 | X3 = cv3.fit_transform(corpus3).toarray() 55 | 56 | #predicting 57 | y_pred1 = clf1.predict(X3) 58 | y_pred2 = clf2.predict(X3) 59 | y_pred3 = clf3.predict(X3) 60 | y_pred4 = clf4.predict(X3) 61 | y_pred5 = clf5.predict(X3) 62 | y_pred6 = clf6.predict(X3) 63 | 64 | #conveting them in Data Frame 65 | y_pred1_df=pd.DataFrame(y_pred1) 66 | y_pred2_df=pd.DataFrame(y_pred2) 67 | y_pred3_df=pd.DataFrame(y_pred3) 68 | y_pred4_df=pd.DataFrame(y_pred4) 69 | y_pred5_df=pd.DataFrame(y_pred5) 70 | y_pred6_df=pd.DataFrame(y_pred6) 71 | 72 | 73 | f=y_pred6_df.iloc[:,0].values 74 | f2=y_pred5_df.iloc[:,0].values 75 | f3=y_pred4_df.iloc[:,0].values 76 | f4=y_pred3_df.iloc[:,0].values 77 | f5=y_pred2_df.iloc[:,0].values 78 | f6=y_pred1_df.iloc[:,0].values 79 | 80 | #making a final Submission Data frame 81 | submission = pd.DataFrame({'id':corpus3,'toxic':f,'severe_toxic':f2, 82 | 'obscene':f3, 83 | 'threat':f4, 84 | 'insult':f5, 85 | 'identity_hate':f6}) 86 | 87 | #getting total of all rows# 88 | submission['total']=submission.sum(axis=1) 89 | 90 | #creating a normal column# 91 | a=[] 92 | for row in 
submission['total']: 93 | if row==0: 94 | a.append(1) 95 | else: 96 | a.append(0) 97 | submission['normal']=pd.DataFrame(a) 98 | 99 | #getting total of column# 100 | total=submission[['toxic','severe_toxic','obscene','threat','insult','identity_hate','normal']].sum() 101 | 102 | 103 | import matplotlib.pyplot as plt 104 | import io 105 | import base64 106 | import urllib 107 | #making and saving pie-chart 108 | img = io.BytesIO() 109 | plt.pie(total) 110 | plt.title("pie chart distribution") 111 | plt.savefig(img, format='png') 112 | img.seek(0) 113 | 114 | plot_data = urllib.parse.quote(base64.b64encode(img.read()).decode()) 115 | 116 | #returning results with requested html page 117 | return render_template('result.html',normal=(total[6]/total.sum())*100, 118 | toxic=(total[0]/total.sum())*100, 119 | severe_toxic=(total[1]/total.sum())*100, 120 | obscene=(total[2]/total.sum())*100, 121 | threat=(total[3]/total.sum())*100, 122 | insult=(total[4]/total.sum())*100, 123 | identity_hate=(total[5]/total.sum())*100,plot_url=plot_data) 124 | 125 | 126 | 127 | if __name__ == '__main__': 128 | app.run(debug=True) 129 | -------------------------------------------------------------------------------- /home.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/home.jpg -------------------------------------------------------------------------------- /result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/result.jpg -------------------------------------------------------------------------------- /speech_classification1.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/speech_classification1.pkl -------------------------------------------------------------------------------- /speech_classification2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/speech_classification2.pkl -------------------------------------------------------------------------------- /speech_classification3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/speech_classification3.pkl -------------------------------------------------------------------------------- /speech_classification4.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/speech_classification4.pkl -------------------------------------------------------------------------------- /speech_classification5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/speech_classification5.pkl -------------------------------------------------------------------------------- /speech_classification6.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anik8gupta/Toxic_Speech_Classification/8e09edfd72ac05fd59f080dcc0ed6c0d77ab75eb/speech_classification6.pkl -------------------------------------------------------------------------------- /static/styles.css: -------------------------------------------------------------------------------- 1 | body{ 2 |
font:15px/1.5 Arial, Helvetica,sans-serif; 3 | padding: 0px; 4 | background-color:#ffffff; 5 | } 6 | 7 | .container{ 8 | width:100%; 9 | margin: auto; 10 | overflow: hidden; 11 | } 12 | 13 | header{ 14 | background-image: linear-gradient(to right,#17ba3a,#178244); 15 | border-bottom:rgb(74, 255, 68) 3px solid; 16 | height:120px; 17 | width:100%; 18 | padding-top:30px; 19 | 20 | } 21 | 22 | .main-header{ 23 | text-align:center; 24 | background-color: blue; 25 | height:100px; 26 | width:100%; 27 | margin:0px; 28 | } 29 | .brandname{ 30 | 31 | font-size:10px; 32 | color: #fff; 33 | margin: 10px; 34 | text-align:center; 35 | } 36 | 37 | header h2{ 38 | 39 | text-align:center; 40 | color:#fff; 41 | font-size:30px; 42 | 43 | } 44 | 45 | 46 | 47 | .btn-info { 48 | background-image: linear-gradient(to right,#17ba3a,#178244); 49 | height:40px; 50 | width:100px; 51 | border-radius: 20px; 52 | } /* Green gradient button, same colours as the header (not blue) */ 53 | .btn-info:hover {background: #17ba3a;} 54 | 55 | 56 | .resultss{ 57 | border-radius: 15px 50px; 58 | background: #345fe4; 59 | padding: 20px; 60 | width: 200px; 61 | height: 150px; 62 | } -------------------------------------------------------------------------------- /templates/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Home 5 | 6 | 7 | 8 | 9 | 10 |
11 |
12 |

Sentiment Analysis in Speech

13 |

Machine Learning App with Flask

14 | 15 | 16 | 17 |
18 |
19 | 20 |
21 | 22 |
23 |

Enter/Paste Your Speech Here

24 | 25 | 26 |
27 | 28 | 29 | 30 |
31 | 32 |
33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 |
11 | 12 |

Sentiment Analysis in Speech

13 | 14 |

ML App

15 | 16 |
17 |
18 |

Results for Speech

19 | 20 |
21 | 22 | 23 | 24 | 25 |

Normal: {{normal}}%

26 |

Toxic: {{toxic}}%

27 |

Severe Toxic: {{severe_toxic}}%

28 |

Obscene: {{obscene}}%

29 |

Threat: {{threat}}%

30 |

Insult: {{insult}}%

31 |

Identity Hate: {{identity_hate}}%

32 | 33 |
34 | 35 | 36 | 37 | --------------------------------------------------------------------------------