├── .gitattributes
├── models
│   └── spam-detection-model.pkl
├── preprocessing
│   └── count-vectorizer.pkl
├── requirements.txt
├── README.md
└── src
    └── model_trainingt.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/models/spam-detection-model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Davisy/SMS-Spam-Text-Classification/HEAD/models/spam-detection-model.pkl
--------------------------------------------------------------------------------
/preprocessing/count-vectorizer.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Davisy/SMS-Spam-Text-Classification/HEAD/preprocessing/count-vectorizer.pkl
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.19.1
pandas==1.0.5
joblib==0.16.0
scikit-learn==0.22.2.post1
unidecode
nltk==3.5
wordcloud==1.6.0
matplotlib==3.2.2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SMS Spam Text Classification using NLP

How to classify spam SMS messages using NLP.

- Model Name: MultinomialNB
- Performance (F1 score): 0.905
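## Usage

A minimal sketch of how the saved artifacts could be loaded for inference (assumed to run from the repository root; a real input message should first be cleaned with the same `text_cleaning` function used in `src/model_trainingt.py`):

```python
import joblib

# load the trained classifier and the fitted CountVectorizer
model = joblib.load("models/spam-detection-model.pkl")
vectorizer = joblib.load("preprocessing/count-vectorizer.pkl")

# a hypothetical, already-cleaned example message
message = ["congratulations you won a free ticket call now"]
features = vectorizer.transform(message)
print(model.predict(features))  # 1 = spam, 0 = ham
```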
--------------------------------------------------------------------------------
/src/model_trainingt.py:
--------------------------------------------------------------------------------
# import important modules
import numpy as np
import pandas as pd
from string import punctuation
import re  # regular expressions
import joblib
import warnings

# sklearn modules
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
)
from sklearn.feature_extraction.text import CountVectorizer

# text preprocessing modules
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download the NLTK resources used below (no-op if already present)
nltk.download("stopwords")
nltk.download("wordnet")

warnings.filterwarnings("ignore")

# seeding
np.random.seed(123)


# load data
data = pd.read_csv("../data/spam.tsv", sep="\t")


# replace ham with 0 and spam with 1
new_data = data.replace({"ham": 0, "spam": 1})

stop_words = stopwords.words("english")


def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with options to remove stop words and lemmatize words

    # Expand contractions first, while apostrophes are still present
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'s", " ", text)
    # Replace URLs with a placeholder before stripping the characters they contain
    text = re.sub(r"http\S+", " link ", text)
    # Keep letters and digits only; punctuation becomes whitespace
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    # Expand common SMS abbreviations (now reliably space-delimited);
    # "ur" is word-anchored so words like "your" are left intact
    text = re.sub(r"\bur\b", " your ", text)
    text = re.sub(r" nd ", " and ", text)
    text = re.sub(r" tkts ", " tickets ", text)
    text = re.sub(r" c ", " can ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" u ", " you ", text)
    text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text)  # remove numbers
    text = text.lower()  # set in lowercase

    # Remove any remaining punctuation from the text
    text = "".join([c for c in text if c not in punctuation])

    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if w not in stop_words]
        text = " ".join(text)

    # Optionally, reduce words to their lemmas
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)

    # Return the cleaned text as a single string
    return text


# clean the dataset
new_data["clean_message"] = new_data["message"].apply(text_cleaning)


# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    new_data["clean_message"].values,
    new_data["label"].values,
    test_size=0.15,
    random_state=0,
    shuffle=True,
    stratify=new_data["label"],
)


print("x_train type: {}".format(type(X_train)))

# transform the cleaned text into count features
vectorizer = CountVectorizer(lowercase=False)
vectorizer.fit(X_train)
X_train_trans = vectorizer.transform(X_train)
X_test_trans = vectorizer.transform(X_test)


# create the estimator (the vectorizer and classifier are kept as separate steps)
spam_classifier = MultinomialNB()


# evaluate the model with 10-fold cross validation
scores = cross_val_score(spam_classifier, X_train_trans, y_train, cv=10, verbose=3, n_jobs=-1)


# find the mean of all scores
print("score mean: {}".format(scores.mean()))


# fine-tune the model parameters
distribution = {"alpha": [1, 0.1, 0.01, 0.001, 0.0001, 0, 0.2, 0.3]}

grid = RandomizedSearchCV(
    spam_classifier,
    param_distributions=distribution,
    n_jobs=-1,
    cv=5,
    n_iter=8,  # the grid has only 8 candidate values
    random_state=42,
    return_train_score=True,
    verbose=2,
)


# training with randomized search
grid.fit(X_train_trans, y_train)

# summarize the results of the random parameter search
print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)


# train the model with the best parameters
best_classifier = MultinomialNB(alpha=1)  # best alpha reported by the search above


# cross validation
scores = cross_val_score(best_classifier, X_train_trans, y_train, cv=10, verbose=2, n_jobs=-1)

print(scores)
print(scores.mean())


# train the best_classifier on the full training set
best_classifier.fit(X_train_trans, y_train)
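
# Illustrative sketch (not part of the original flow): inspect the tokens the
# fitted model treats as most spam-indicative, by comparing the
# class-conditional log probabilities; get_feature_names() matches the
# scikit-learn 0.22 pin in requirements.txt
feature_names = np.array(vectorizer.get_feature_names())
log_odds = best_classifier.feature_log_prob_[1] - best_classifier.feature_log_prob_[0]
print("top spam tokens:", feature_names[np.argsort(log_odds)[-10:]])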

# predict on the test data
y_pred = best_classifier.predict(X_test_trans)


# check the classification report
print(classification_report(y_test, y_pred))


# check the accuracy score
print(accuracy_score(y_test, y_pred))


# check the f1 score
print("f1 score: {}".format(f1_score(y_test, y_pred)))


# save the model
joblib.dump(best_classifier, "../models/spam-detection-model.pkl")


# save the transformer
joblib.dump(vectorizer, "../preprocessing/count-vectorizer.pkl")
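
# Sanity check (a sketch, not part of the original script): reload the saved
# artifacts and confirm they reproduce the test-set predictions; the relative
# paths assume the script runs from the src/ directory, like the dumps above
loaded_model = joblib.load("../models/spam-detection-model.pkl")
loaded_vectorizer = joblib.load("../preprocessing/count-vectorizer.pkl")
assert (loaded_model.predict(loaded_vectorizer.transform(X_test)) == y_pred).all()
--------------------------------------------------------------------------------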