├── README.md
├── classifier.py
└── data.csv


/README.md:
--------------------------------------------------------------------------------
1 | ### You can find the article here
2 | #### [LinkedIn](https://www.linkedin.com/pulse/text-classification-using-bag-words-approach-nltk-scikit-rajendran)
3 | 
4 | #### [Medium](https://medium.com/@charlesrajendran44/text-classification-using-the-bag-of-words-approach-with-nltk-and-scikit-learn-9a731e5c4e2f)
5 | 


--------------------------------------------------------------------------------
/classifier.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | dataset = pd.read_csv('data.csv', encoding='ISO-8859-1');
 3 | 
 4 | import re
 5 | import nltk
 6 | 
 7 | nltk.download('punkt')
 8 | from nltk.tokenize import word_tokenize as wt 
 9 | 
10 | nltk.download('stopwords')
11 | from nltk.corpus import stopwords
12 | 
13 | from nltk.stem.porter import PorterStemmer
14 | stemmer = PorterStemmer()
15 | 
16 | #spell correction
17 | from autocorrect import spell
18 | 
19 | data = []
20 | 
21 | for i in range(dataset.shape[0]):
22 |     sms = dataset.iloc[i, 1]
23 | 
24 |     # remove non alphabatic characters
25 |     sms = re.sub('[^A-Za-z]', ' ', sms)
26 | 
27 |     # make words lowercase, because Go and go will be considered as two words
28 |     sms = sms.lower()
29 | 
30 |     # tokenising
31 |     tokenized_sms = wt(sms)
32 | 
33 |     # remove stop words and stemming
34 |  
35 |     sms_processed = []
36 |     for word in tokenized_sms:
37 |         if word not in set(stopwords.words('english')):
38 |             sms_processed.append(spell(stemmer.stem(word)))
39 | 
40 |     sms_text = " ".join(sms_processed)
41 |     data.append(sms_text)
42 | 
43 | # creating the feature matrix 
44 | from sklearn.feature_extraction.text import CountVectorizer
45 | matrix = CountVectorizer(max_features=1000)
46 | X = matrix.fit_transform(data).toarray()
47 | y = dataset.iloc[:, 0]
48 | 
49 | # split train and test data
50 | from sklearn.model_selection import train_test_split
51 | X_train, X_test, y_train, y_test = train_test_split(X, y)
52 | 
53 | # Naive Bayes 
54 | from sklearn.naive_bayes import GaussianNB
55 | classifier = GaussianNB()
56 | classifier.fit(X_train, y_train)
57 | 
58 | # predict class
59 | y_pred = classifier.predict(X_test)
60 | 
61 | # Confusion matrix
62 | from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
63 | cm = confusion_matrix(y_test, y_pred)
64 | cr = classification_report(y_test, y_pred)
65 | 
66 | accuracy = accuracy_score(y_test, y_pred)
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharlesRajendran/TextClassification/95adc408e37b250d41f3f16ad27c4898d36c4f8b/data.csv


--------------------------------------------------------------------------------