├── .gitattributes
├── models
│   └── spam-detection-model.pkl
├── preprocessing
│   └── count-vectorizer.pkl
├── requirements.txt
├── README.md
└── src
    └── model_trainingt.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/models/spam-detection-model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Davisy/SMS-Spam-Text-Classification/HEAD/models/spam-detection-model.pkl
--------------------------------------------------------------------------------
/preprocessing/count-vectorizer.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Davisy/SMS-Spam-Text-Classification/HEAD/preprocessing/count-vectorizer.pkl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.19.1
pandas==1.0.5
joblib==0.16.0
scikit-learn==0.22.2.post1
unidecode
nltk==3.5
wordcloud==1.6.0
matplotlib==3.2.2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SMS Spam Text Classification using NLP

How to classify spam SMS messages using NLP.

- Model Name: MultinomialNB
- Performance (F1 score): 0.905
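## Usage

A minimal sketch of how the saved artifacts could be loaded for inference (assumed to run from the repository root; a real input message should first be cleaned with the same `text_cleaning` function used in `src/model_trainingt.py`):

```python
import joblib

# load the trained classifier and the fitted CountVectorizer
model = joblib.load("models/spam-detection-model.pkl")
vectorizer = joblib.load("preprocessing/count-vectorizer.pkl")

# a hypothetical, already-cleaned example message
message = ["congratulations you won a free ticket call now"]
features = vectorizer.transform(message)
print(model.predict(features))  # 1 = spam, 0 = ham
```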
--------------------------------------------------------------------------------
/src/model_trainingt.py:
--------------------------------------------------------------------------------
# import important modules
import numpy as np
import pandas as pd
from string import punctuation
import re  # regular expressions
import joblib
import warnings

# sklearn modules
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
)
from sklearn.feature_extraction.text import CountVectorizer

# text preprocessing modules
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download the NLTK resources used below (no-op if already present)
nltk.download("stopwords")
nltk.download("wordnet")

warnings.filterwarnings("ignore")

# seeding
np.random.seed(123)


# load data
data = pd.read_csv("../data/spam.tsv", sep="\t")


# replace ham with 0 and spam with 1
new_data = data.replace({"ham": 0, "spam": 1})

stop_words = stopwords.words("english")


def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with options to remove stop words and lemmatize words

    # Expand contractions first, while apostrophes are still present
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'s", " ", text)
    # Replace URLs with a placeholder before stripping the characters they contain
    text = re.sub(r"http\S+", " link ", text)
    # Keep letters and digits only; punctuation becomes whitespace
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    # Expand common SMS abbreviations (now reliably space-delimited);
    # "ur" is word-anchored so words like "your" are left intact
    text = re.sub(r"\bur\b", " your ", text)
    text = re.sub(r" nd ", " and ", text)
    text = re.sub(r" tkts ", " tickets ", text)
    text = re.sub(r" c ", " can ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" u ", " you ", text)
    text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text)  # remove numbers
    text = text.lower()  # set in lowercase

    # Remove any remaining punctuation from the text
    text = "".join([c for c in text if c not in punctuation])

    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if w not in stop_words]
        text = " ".join(text)

    # Optionally, reduce words to their lemmas
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)

    # Return the cleaned text as a single string
    return text


# clean the dataset
new_data["clean_message"] = new_data["message"].apply(text_cleaning)


# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    new_data["clean_message"].values,
    new_data["label"].values,
    test_size=0.15,
    random_state=0,
    shuffle=True,
    stratify=new_data["label"],
)


print("x_train type: {}".format(type(X_train)))

# transform the cleaned text into count features
vectorizer = CountVectorizer(lowercase=False)
vectorizer.fit(X_train)
X_train_trans = vectorizer.transform(X_train)
X_test_trans = vectorizer.transform(X_test)


# create the estimator (the vectorizer and classifier are kept as separate steps)
spam_classifier = MultinomialNB()


# evaluate the model with 10-fold cross validation
scores = cross_val_score(spam_classifier, X_train_trans, y_train, cv=10, verbose=3, n_jobs=-1)


# find the mean of all scores
print("score mean: {}".format(scores.mean()))


# fine-tune the model parameters
distribution = {"alpha": [1, 0.1, 0.01, 0.001, 0.0001, 0, 0.2, 0.3]}

grid = RandomizedSearchCV(
    spam_classifier,
    param_distributions=distribution,
    n_jobs=-1,
    cv=5,
    n_iter=8,  # the grid has only 8 candidate values
    random_state=42,
    return_train_score=True,
    verbose=2,
)


# training with randomized search
grid.fit(X_train_trans, y_train)

# summarize the results of the random parameter search
print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)


# train the model with the best parameters
best_classifier = MultinomialNB(alpha=1)  # best alpha reported by the search above


# cross validation
scores = cross_val_score(best_classifier, X_train_trans, y_train, cv=10, verbose=2, n_jobs=-1)

print(scores)
print(scores.mean())


# train the best_classifier on the full training set
best_classifier.fit(X_train_trans, y_train)
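
# Illustrative sketch (not part of the original flow): inspect the tokens the
# fitted model treats as most spam-indicative, by comparing the
# class-conditional log probabilities; get_feature_names() matches the
# scikit-learn 0.22 pin in requirements.txt
feature_names = np.array(vectorizer.get_feature_names())
log_odds = best_classifier.feature_log_prob_[1] - best_classifier.feature_log_prob_[0]
print("top spam tokens:", feature_names[np.argsort(log_odds)[-10:]])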

# predict on the test data
y_pred = best_classifier.predict(X_test_trans)


# check the classification report
print(classification_report(y_test, y_pred))


# check the accuracy score
print(accuracy_score(y_test, y_pred))


# check the f1 score
print("f1 score: {}".format(f1_score(y_test, y_pred)))


# save the model
joblib.dump(best_classifier, "../models/spam-detection-model.pkl")


# save the transformer
joblib.dump(vectorizer, "../preprocessing/count-vectorizer.pkl")
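
# Sanity check (a sketch, not part of the original script): reload the saved
# artifacts and confirm they reproduce the test-set predictions; the relative
# paths assume the script runs from the src/ directory, like the dumps above
loaded_model = joblib.load("../models/spam-detection-model.pkl")
loaded_vectorizer = joblib.load("../preprocessing/count-vectorizer.pkl")
assert (loaded_model.predict(loaded_vectorizer.transform(X_test)) == y_pred).all()
--------------------------------------------------------------------------------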