# Repository layout (from the dump header that shared this physical line):
# ├── Dataset
#     └── suspicious tweets.csv
# ├── FYP.ipynb
# ├── FYP_Deployment.py
# ├── README.md
# ├── config.toml
# └── model_for_deployment
#     ├── NaiveBayes_Model.sav
#     └── Vectorize_Save
#
# /FYP_Deployment.py:
# --------------------------------------------------------------------------------
"""Streamlit front end for the cyberbully-tweet Naive Bayes classifier.

The script renders three informational expanders, a text area and an
"Analyse" button.  When the button is pressed the tweet is lower-cased,
demojized, tokenized, stripped of punctuation / stop words / noise tokens and
lemmatized.  Every remaining token is vectorized and classified on its own by
the pickled model, and the per-token predictions are averaged.  An average
above 0.5 is reported as "No Cyberbully intent detected"; anything else is
reported as "Possible Cyberbully intent!".
"""
import logging
import pickle
import re
import string
import warnings

import emoji
import nltk
import numpy as np
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Resources needed by word_tokenize, the stop-word list, the lemmatizer and
# pos_tag.  NOTE(review): Streamlit re-executes this script on every user
# interaction, so these calls run on each rerun - consider caching.
for _resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

warnings.filterwarnings("ignore")

# Imported after the warnings filter, exactly as in the original script, so
# that import-time warnings from these packages stay suppressed.
# NOTE(review): pandas, numpy, re, CountVectorizer and the keras names are not
# referenced below; they are kept untouched rather than removed.
from sklearn.feature_extraction.text import CountVectorizer  # noqa: E402
from keras.models import load_model  # noqa: E402
from keras.preprocessing import sequence  # noqa: E402
from keras.utils import np_utils  # noqa: E402
from keras.preprocessing.text import Tokenizer  # noqa: E402

logger = logging.getLogger(__name__)

VECTORIZER_PATH = "model_for_deployment/Vectorize_Save"
MODEL_PATH = "model_for_deployment/NaiveBayes_Model.sav"

# Built once instead of once per token inside the filtering comprehension.
_PUNCTUATION = frozenset(string.punctuation)
_NOISE_TOKENS = frozenset(
    ["''", '``', "rt", "https", "’", "“", "”", "\u200b", "--", "n't", "'s", "...", "//t.c"]
)


def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the NLTK POS tag of *word*.

    The first letter of the tag selects adjective, noun, verb or adverb;
    any other tag falls back to noun.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)


def eval_avg(values):
    """Return the arithmetic mean of *values*.

    Raises:
        ZeroDivisionError: if *values* is empty; callers must check first.
    """
    return sum(values) / len(values)


def _load_pickle(path):
    """Unpickle the object stored at *path*, closing the file afterwards."""
    # SECURITY: pickle executes arbitrary code on load; only ever point this
    # at artefacts shipped with the project, never at user-supplied files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _preprocess(text):
    """Turn raw tweet *text* into a list of cleaned, lemmatized tokens.

    The result may be empty, e.g. for blank input or a tweet consisting only
    of punctuation and stop words.
    """
    tokens = nltk.word_tokenize(emoji.demojize(text.lower()))
    english_stops = set(stopwords.words('english'))
    kept = [
        token
        for token in tokens
        if token not in _PUNCTUATION
        and token not in english_stops
        and token not in _NOISE_TOKENS
    ]
    # Lemmatize with POS tagging.
    lemmatizer = nltk.WordNetLemmatizer()
    return [str(lemmatizer.lemmatize(token, get_wordnet_pos(token))) for token in kept]


def _average_prediction(tokens):
    """Classify each token separately and return the mean predicted label.

    *tokens* must be non-empty: the mean of zero predictions is undefined.
    """
    vectorizer = _load_pickle(VECTORIZER_PATH)
    features = vectorizer.transform(tokens)
    model = _load_pickle(MODEL_PATH)
    predictions = model.predict(features)
    logger.debug("per-token predictions: %s", predictions)
    return eval_avg([int(label) for label in predictions])


st.title("Expose Cyberbully tweets on Twitter using Machine Learning")

# The original guarded each expander with `if <expander>:`; the guards are
# dropped as redundant (assumes expander objects are always truthy).
abstract = st.expander("Abstract")
abstract.write("The advancements of technology along with the digitization of the relationships made a great impact among the centennials to mandatorily maintain a social media account. Despite the entertainment that social media provides, cyberbullying has been identified as a real issue in Malaysia where these centennials are victims. However, a smaller number of studies have been reported in this regard in terms of detecting the attempt of cyberbullying on social media. On this background, a solution using suitable data science techniques which can help to detect the attempt of cyberbullying on social media would be ideal. This research proposed to use suspicious tweets dataset from Kaggle to train three classifiers for supervised learning using Naïve Bayes, SVM, and LSTM. Model tuning was performed using Random Grid Search and Keras tuner. Overall, the model had an accuracy rate of 88% indicating that the optimization tuning functioned properly. Out of the three, Naïve Bayes performed the best in terms of both accuracy and area under the curve (AUC) values with 88.4% and 0.81 respectively.")

about = st.expander("About")
about.write("The application below demonstrates a machine learning learning model (Naïve Bayes) that has been trained to detect cyberbullying in tweets from Twitter.")
about.markdown("**Information on the Classifier**")
if about.checkbox("About Classifer"):
    about.markdown('**Model:** Naïve Bayes')
    about.markdown('**Vectorizer:** Count')
    about.markdown('**Test-Train splitting:** 20% - 80%')
    about.markdown('**Lemmetization/Stemmer:** Wordnet with POS tagging')

if about.checkbox("Evaluation Results"):
    about.markdown('**Accuracy:** 88%')
    about.markdown('**Precision:** 91%')
    about.markdown('**Recall:** 88%')
    about.markdown('**F1 Score:** 89%')
    about.markdown('**AUC Score:** 0.81')

related = st.expander("Related Links")
related.write("[Dataset](https://www.kaggle.com/syedabbasraza/suspicious-tweets)")

tweet_text = st.text_area("Insert Tweet:")
btn_analyse = st.button('Analyse')

# NOTE(review): indentation was lost in the dump; the final `else` is taken to
# pair with the button check, so the prompt shows until a tweet is analysed.
if btn_analyse and tweet_text.strip():
    tokens = _preprocess(tweet_text)
    logger.debug("tokens: %s", tokens)
    if tokens:
        result_nb = _average_prediction(tokens)
        output_nb = result_nb > 0.5
        logger.debug("mean label: %s, above threshold: %s", result_nb, output_nb)

        st.markdown("Naive Bayes Prediction:")
        if output_nb:
            st.success("No Cyberbully intent detected")
        else:
            st.error("Possible Cyberbully intent!")
            # NOTE(review): helplines assumed to belong to the "detected"
            # branch; confirm against the original indentation.
            help_box = st.expander("Need Help?")
            help_box.write("[R.AGE Website](https://www.rage.com.my/helplines-and-counselling/)")
            help_box.write("Call the national 24-hour hotline **15999** to call to report abuse, bullying, neglect, etc.")
    else:
        # Nothing survived cleaning (only punctuation / stop words): without
        # this guard an empty list would reach the model and eval_avg would
        # divide by zero.
        st.warning("The tweet contains no words that can be analysed.")
else:
    st.warning("Please insert the tweet!")

# Dump separator and README opening that shared the original physical line,
# preserved verbatim:
# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# 1 | # Expose Cyberbully Tweets Using Machine Learning: A Data Science Approach
# 2 |
# 3 | ## Abstract
# 4 | The advancements of technology along with the digitization of the relationships made a great impact among the centennials to mandatorily maintain a social media account.
Despite the entertainment that social media provides, cyberbullying has been identified as a real issue in Malaysia, where many centennials are victims. However, only a few studies have been reported on detecting attempts at cyberbullying on social media. Therefore, a solution using suitable data science techniques that can detect attempts at cyberbullying on social media would be ideal. This research used the suspicious tweets dataset from Kaggle to build three supervised learning predictive models, namely Naïve Bayes, SVM, and LSTM, which were tuned using Random Grid Search and Keras tuner to identify a suitable solution. In summary, the Naïve Bayes model performed the best in terms of both accuracy and area under the curve (AUC), with values of 88.4% and 0.81 respectively. The LSTM model was second-best, with an accuracy of 90.6% and an AUC value of 0.58. Hence, with a greater number of records, both the accuracy 5 | and AUC values of the LSTM model can be improved. 6 | 7 | ## Files 8 | The main EDA and model training are performed in the "FYP.ipynb" file. 9 | 10 | The main deployment file is the "FYP_Deployment.py" file. 11 | 12 | The "config.toml" file is the customized theme for the deployment website. 13 | 14 | The "model_for_deployment" folder contains the Naive Bayes model used for the deployment and the Count Vectorizer save file. 15 | 16 | The "all_model" file contains all the models that have been trained and saved into a pickle file. 17 | 18 | 19 | ## Steps to run the deployment 20 | 1. Download Anaconda Navigator and set up an environment with Python 3.8. 21 | 2. Once set up, click on Home and launch VS Code. Ensure that the appropriate environment is selected in the "Applications on" dropdown menu. 22 | 3. To download all the dependencies needed, run the following command in the terminal: 23 | pip install -r requirements.txt 24 | 4. To run the deployment, open a new terminal and change to the deployment directory using "cd (deployment file path)". 25 | 5. 
Then, type in the terminal "streamlit run FYP_Deployment.py" 26 | 27 | This final year project has been officially published under Young Investors Journals (YIJ). Link: https://lnkd.in/gTNkCV8G 28 | -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base="light" 3 | primaryColor="#000000" 4 | backgroundColor="#f8f5ec" 5 | -------------------------------------------------------------------------------- /model_for_deployment/NaiveBayes_Model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/notrichbish/cyberbully-detection-using-ML/bb81fb85f09d113a412fceed95521dc8a9a63093/model_for_deployment/NaiveBayes_Model.sav -------------------------------------------------------------------------------- /model_for_deployment/Vectorize_Save: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/notrichbish/cyberbully-detection-using-ML/bb81fb85f09d113a412fceed95521dc8a9a63093/model_for_deployment/Vectorize_Save --------------------------------------------------------------------------------