"""Text classification with a bag-of-words model and Gaussian Naive Bayes.

Pipeline, in execution order:

1. Load ``data.csv`` (column 0 is used as the label, column 1 as the text).
2. Clean every text: replace non-letters with spaces, lowercase, tokenise,
   drop English stop words, stem each remaining token and run it through
   the spell corrector.
3. Build a count matrix limited to the 1000 most frequent terms.
4. Split into train and test sets, fit ``GaussianNB`` and predict.
5. Compute the confusion matrix, classification report and accuracy
   (stored in ``cm``, ``cr`` and ``accuracy``; nothing is printed).

Accompanying article:
    LinkedIn: https://www.linkedin.com/pulse/text-classification-using-bag-words-approach-nltk-scikit-rajendran
    Medium:   https://medium.com/@charlesrajendran44/text-classification-using-the-bag-of-words-approach-with-nltk-and-scikit-learn-9a731e5c4e2f

Data file (``data.csv``):
    https://raw.githubusercontent.com/CharlesRajendran/TextClassification/95adc408e37b250d41f3f16ad27c4898d36c4f8b/data.csv
"""

import re

import nltk
import pandas as pd
from autocorrect import spell  # spell correction
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize as wt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

dataset = pd.read_csv('data.csv', encoding='ISO-8859-1')

# Resources needed by word_tokenize and the stop-word list respectively.
nltk.download('punkt')
nltk.download('stopwords')

stemmer = PorterStemmer()

# Loop invariants: build the stop-word set and compile the pattern once
# instead of once per token / once per message.
english_stopwords = frozenset(stopwords.words('english'))
non_letters = re.compile('[^A-Za-z]')

data = []

for sms in dataset.iloc[:, 1]:
    # Remove non-alphabetic characters, then lowercase so that "Go" and
    # "go" are counted as the same word.
    sms = non_letters.sub(' ', sms).lower()

    # Tokenise.
    tokenized_sms = wt(sms)

    # Drop stop words; stem and spell-correct what remains.
    # NOTE(review): the corrector runs on the *stemmed* token, as in the
    # original script — confirm that this order is intended.
    sms_processed = [
        spell(stemmer.stem(word))
        for word in tokenized_sms
        if word not in english_stopwords
    ]

    data.append(" ".join(sms_processed))

# Create the feature matrix (dense array of term counts).
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(data).toarray()
y = dataset.iloc[:, 0]

# Split train and test data.
# NOTE(review): no random_state is passed, so the split (and therefore
# every metric below) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Naive Bayes
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict class
y_pred = classifier.predict(X_test)

# Evaluation: confusion matrix, per-class report and overall accuracy.
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)