├── README.md
├── Topic Classification.py
├── corpus.csv
└── version2
    ├── inference.py
    └── training.py

/README.md:
--------------------------------------------------------------------------------
# Text Classification

This is an easy-to-understand script for text classification using SVM and Naive Bayes.

The input file is also uploaded - corpus.csv

Refer to the Medium article for a detailed explanation:
https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

# Version 2:

For people facing challenges in using the models for inference on unseen data, I have adapted the code into two modules - training.py and inference.py.

Below is a short description of them.

training.py - Trains a model on your data and saves the fitted LabelEncoder, the TF-IDF vectorizer and the trained models to disk.
inference.py - Loads these saved files from disk and predicts on unseen text.

Feel free to play with them and adapt them further as per your requirements.
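As a minimal sketch of how the two modules fit together (assuming training.py has already been run from the version2 folder, so its pickle files are on disk), a single prediction can be wrapped in a small helper. `predict_topic` below is only an illustrative name, not a function shipped in this repo:

```python
import pickle

# Reuse the cleaning/lemmatizing helper defined in version2/inference.py
# (importing inference also runs its own demo prediction once).
from inference import text_preprocessing

# Files written to disk by training.py
labelencode = pickle.load(open('labelencoder_fitted.pkl', 'rb'))
Tfidf_vect = pickle.load(open('Tfidf_vect_fitted.pkl', 'rb'))
SVM = pickle.load(open('svm_trained_model.sav', 'rb'))

def predict_topic(text):
    # Clean and lemmatize the raw text, vectorize it with the fitted TF-IDF
    # vectorizer, then map the numeric prediction back to the label string.
    vector = Tfidf_vect.transform([text_preprocessing(text)])
    return labelencode.inverse_transform(SVM.predict(vector))[0]

print(predict_topic("The best soundtrack ever to anything"))
```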
--------------------------------------------------------------------------------
/Topic Classification.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Set random seed
np.random.seed(500)

# Load the data using pandas (adjust this path to point at the corpus.csv shipped with the repo)
Corpus = pd.read_csv(r"C:\Users\gunjit.bedi\Desktop\NLP Project\corpus_small.csv", encoding='latin-1')

# Step - 1: Data pre-processing - this helps the classification algorithms produce better results

# Step - 1a : Remove blank rows, if any, and reset the index so it stays aligned with enumerate() below
Corpus = Corpus.dropna(subset=['text']).reset_index(drop=True)

# Step - 1b : Change all the text to lower case. This is required as Python interprets 'dog' and 'DOG' differently.
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

# Step - 1c : Tokenization : each entry in the corpus is broken into a list of words
Corpus['text'] = [word_tokenize(entry) for entry in Corpus['text']]

# Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization

# WordNetLemmatizer requires POS tags to understand whether a word is a noun, verb, adjective, etc. By default it treats every word as a noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


for index, entry in enumerate(Corpus['text']):
    # Declare an empty list to store the words that pass the rules for this step
    Final_words = []
    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # The pos_tag function below provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else.
    for word, tag in pos_tag(entry):
        # The condition below skips stop words and keeps alphabetic tokens only
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each row is stored in 'text_final'
    Corpus.loc[index, 'text_final'] = str(Final_words)

# print(Corpus['text_final'].head())

# Step - 2: Split the corpus into train and test data sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['label'], test_size=0.3)

# Step - 3: Label encode the target variable - this transforms categorical string labels into numerical values
# Fit on the training labels only, then apply the same mapping to the test labels
Encoder = LabelEncoder()
Encoder.fit(Train_Y)
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# Step - 4: Vectorize the words using the TF-IDF vectorizer - this measures how important a word in a document is in comparison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


# Step - 5: Now we can run different algorithms to classify our data and check their accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, Test_Y) * 100)


# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y) * 100)
--------------------------------------------------------------------------------
/corpus.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gunjitbedi/Text-Classification/9a24f5d61ee4b5068e2692ef499cfb5769c9404d/corpus.csv
--------------------------------------------------------------------------------
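The raw corpus.csv is linked above rather than reproduced. The scripts only assume it has a 'text' column and a 'label' column and read it with latin-1 encoding, so a compatible file can be produced as sketched below (the two rows and label values are invented for illustration; the real dataset's contents differ):

```python
import pandas as pd

# Hypothetical two-row example with the same column layout the scripts expect
sample = pd.DataFrame({
    'text': ["The best soundtrack ever to anything",
             "This was a complete waste of time"],
    'label': ["positive", "negative"],
})
sample.to_csv('corpus.csv', index=False, encoding='latin-1')
```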
/version2/inference.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
import pickle
import ast

# WordNetLemmatizer requires POS tags to understand whether a word is a noun, verb, adjective, etc. By default it treats every word as a noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


def text_preprocessing(text):
    # Step - 1b : Change all the text to lower case. This is required as Python interprets 'dog' and 'DOG' differently.
    text = text.lower()

    # Step - 1c : Tokenization : the entry is broken into a list of words
    text_words_list = word_tokenize(text)

    # Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization
    # Declare an empty list to store the words that pass the rules for this step
    Final_words = []
    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # The pos_tag function below provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else.
    for word, tag in pos_tag(text_words_list):
        # The condition below skips stop words and keeps alphabetic tokens only
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # Return the final processed set of words, in the same string format used during training
    return str(Final_words)


# Loading the label encoder
labelencode = pickle.load(open('labelencoder_fitted.pkl', 'rb'))

# Loading the TF-IDF vectorizer
Tfidf_vect = pickle.load(open('Tfidf_vect_fitted.pkl', 'rb'))

# Loading the models
SVM = pickle.load(open('svm_trained_model.sav', 'rb'))
Naive = pickle.load(open('nb_trained_model.sav', 'rb'))


# Inference
sample_text = "The best soundtrack ever to anything"
sample_text_processed = text_preprocessing(sample_text)
sample_text_processed_vectorized = Tfidf_vect.transform([sample_text_processed])

prediction_SVM = SVM.predict(sample_text_processed_vectorized)
prediction_Naive = Naive.predict(sample_text_processed_vectorized)

print("Prediction from SVM Model:", labelencode.inverse_transform(prediction_SVM)[0])
print("Prediction from NB Model:", labelencode.inverse_transform(prediction_Naive)[0])
--------------------------------------------------------------------------------
/version2/training.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import pickle

# Set random seed
np.random.seed(500)

# Load the data using pandas (adjust this path to point at the corpus.csv shipped with the repo)
Corpus = pd.read_csv(r"D:\Python\Projects\Thailand\corpus.csv", encoding='latin-1')

# Step - 1: Data pre-processing - this helps the classification algorithms produce better results

# Step - 1a : Remove blank rows, if any
Corpus.dropna(subset=['text'], inplace=True)

# WordNetLemmatizer requires POS tags to understand whether a word is a noun, verb, adjective, etc. By default it treats every word as a noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


def text_preprocessing(text):
    # Step - 1b : Change all the text to lower case. This is required as Python interprets 'dog' and 'DOG' differently.
    text = text.lower()

    # Step - 1c : Tokenization : each entry in the corpus is broken into a list of words
    text_words_list = word_tokenize(text)

    # Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization
    # Declare an empty list to store the words that pass the rules for this step
    Final_words = []
    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # The pos_tag function below provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else.
    for word, tag in pos_tag(text_words_list):
        # The condition below skips stop words and keeps alphabetic tokens only
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # Return the final processed set of words, stored in 'text_final'
    return str(Final_words)


Corpus['text_final'] = Corpus['text'].map(text_preprocessing)

# Step - 2: Split the corpus into train and test data sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['label'],
                                                                    test_size=0.3)

# Step - 3: Label encode the target variable - this transforms categorical string labels into numerical values
Encoder = LabelEncoder()
Encoder.fit(Train_Y)
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# Step - 4: Vectorize the words using the TF-IDF vectorizer - this measures how important a word in a document is in comparison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Step - 5: Now we can run different algorithms to classify our data and check their accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, Test_Y) * 100)

# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y) * 100)


# Saving the encoder, the TF-IDF vectorizer and the trained models for future inference/prediction

# saving the encoder to disk
filename = 'labelencoder_fitted.pkl'
pickle.dump(Encoder, open(filename, 'wb'))

# saving the TF-IDF vectorizer to disk
filename = 'Tfidf_vect_fitted.pkl'
pickle.dump(Tfidf_vect, open(filename, 'wb'))

# saving both models to disk
filename = 'svm_trained_model.sav'
pickle.dump(SVM, open(filename, 'wb'))

filename = 'nb_trained_model.sav'
pickle.dump(Naive, open(filename, 'wb'))

print("Files saved to disk! Proceed to inference.py")
--------------------------------------------------------------------------------