├── README.md
├── Topic Classification.py
├── corpus.csv
└── version2
    ├── inference.py
    └── training.py

/README.md:
--------------------------------------------------------------------------------
# Text Classification

This is an easy-to-understand script for text classification using SVM and Naive Bayes.

The input file is also uploaded - corpus.csv

Refer to the Medium article for a detailed explanation:
https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

# Version 2:

For people facing challenges in using the models for inference on unseen data, I have adapted the code into two modules - training.py and inference.py.

Below is a short description of them.

training.py - Trains a model on your data and saves the fitted LabelEncoder, the TF-IDF vectorizer and the trained models to disk.
inference.py - Loads these saved files from disk and predicts on unseen text.

Feel free to play with them and adapt them further as per your requirements.
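As a minimal sketch of how the two modules fit together (assuming training.py has already been run from the version2 folder, so its pickle files are on disk), a single prediction can be wrapped in a small helper. `predict_topic` below is only an illustrative name, not a function shipped in this repo:

```python
import pickle

# Reuse the cleaning/lemmatizing helper defined in version2/inference.py
# (importing inference also runs its own demo prediction once).
from inference import text_preprocessing

# Files written to disk by training.py
labelencode = pickle.load(open('labelencoder_fitted.pkl', 'rb'))
Tfidf_vect = pickle.load(open('Tfidf_vect_fitted.pkl', 'rb'))
SVM = pickle.load(open('svm_trained_model.sav', 'rb'))

def predict_topic(text):
    # Clean and lemmatize the raw text, vectorize it with the fitted TF-IDF
    # vectorizer, then map the numeric prediction back to the label string.
    vector = Tfidf_vect.transform([text_preprocessing(text)])
    return labelencode.inverse_transform(SVM.predict(vector))[0]

print(predict_topic("The best soundtrack ever to anything"))
```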
--------------------------------------------------------------------------------
/Topic Classification.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Set random seed
np.random.seed(500)

# Load the data using pandas (adjust this path to point at the corpus.csv shipped with the repo)
Corpus = pd.read_csv(r"C:\Users\gunjit.bedi\Desktop\NLP Project\corpus_small.csv", encoding='latin-1')

# Step - 1: Data pre-processing - this helps the classification algorithms produce better results

# Step - 1a : Remove blank rows, if any, and reset the index so it stays aligned with enumerate() below
Corpus = Corpus.dropna(subset=['text']).reset_index(drop=True)

# Step - 1b : Change all the text to lower case. This is required as Python interprets 'dog' and 'DOG' differently.
Corpus['text'] = [entry.lower() for entry in Corpus['text']]

# Step - 1c : Tokenization : each entry in the corpus is broken into a list of words
Corpus['text'] = [word_tokenize(entry) for entry in Corpus['text']]

# Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization

# WordNetLemmatizer requires POS tags to understand whether a word is a noun, verb, adjective, etc. By default it treats every word as a noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


for index, entry in enumerate(Corpus['text']):
    # Declare an empty list to store the words that pass the rules for this step
    Final_words = []
    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # The pos_tag function below provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else.
    for word, tag in pos_tag(entry):
        # The condition below skips stop words and keeps alphabetic tokens only
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each row is stored in 'text_final'
    Corpus.loc[index, 'text_final'] = str(Final_words)

# print(Corpus['text_final'].head())

# Step - 2: Split the corpus into train and test data sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['label'], test_size=0.3)

# Step - 3: Label encode the target variable - this transforms categorical string labels into numerical values
# Fit on the training labels only, then apply the same mapping to the test labels
Encoder = LabelEncoder()
Encoder.fit(Train_Y)
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# Step - 4: Vectorize the words using the TF-IDF vectorizer - this measures how important a word in a document is in comparison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


# Step - 5: Now we can run different algorithms to classify our data and check their accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, Test_Y) * 100)


# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y) * 100)
--------------------------------------------------------------------------------
/corpus.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gunjitbedi/Text-Classification/9a24f5d61ee4b5068e2692ef499cfb5769c9404d/corpus.csv
--------------------------------------------------------------------------------
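The raw corpus.csv is linked above rather than reproduced. The scripts only assume it has a 'text' column and a 'label' column and read it with latin-1 encoding, so a compatible file can be produced as sketched below (the two rows and label values are invented for illustration; the real dataset's contents differ):

```python
import pandas as pd

# Hypothetical two-row example with the same column layout the scripts expect
sample = pd.DataFrame({
    'text': ["The best soundtrack ever to anything",
             "This was a complete waste of time"],
    'label': ["positive", "negative"],
})
sample.to_csv('corpus.csv', index=False, encoding='latin-1')
```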
/version2/inference.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
import pickle
import ast

# WordNetLemmatizer requires POS tags to understand whether a word is a noun, verb, adjective, etc. By default it treats every word as a noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


def text_preprocessing(text):
    # Step - 1b : Change all the text to lower case. This is required as Python interprets 'dog' and 'DOG' differently.
    text = text.lower()

    # Step - 1c : Tokenization : the entry is broken into a list of words
    text_words_list = word_tokenize(text)

    # Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization
    # Declare an empty list to store the words that pass the rules for this step
    Final_words = []
    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # The pos_tag function below provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else.
    for word, tag in pos_tag(text_words_list):
        # The condition below skips stop words and keeps alphabetic tokens only
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # Return the final processed set of words, in the same string format used during training
    return str(Final_words)


# Loading the label encoder
labelencode = pickle.load(open('labelencoder_fitted.pkl', 'rb'))

# Loading the TF-IDF vectorizer
Tfidf_vect = pickle.load(open('Tfidf_vect_fitted.pkl', 'rb'))

# Loading the models
SVM = pickle.load(open('svm_trained_model.sav', 'rb'))
Naive = pickle.load(open('nb_trained_model.sav', 'rb'))


# Inference
sample_text = "The best soundtrack ever to anything"
sample_text_processed = text_preprocessing(sample_text)
sample_text_processed_vectorized = Tfidf_vect.transform([sample_text_processed])

prediction_SVM = SVM.predict(sample_text_processed_vectorized)
prediction_Naive = Naive.predict(sample_text_processed_vectorized)

print("Prediction from SVM Model:", labelencode.inverse_transform(prediction_SVM)[0])
print("Prediction from NB Model:", labelencode.inverse_transform(prediction_Naive)[0])
--------------------------------------------------------------------------------
/version2/training.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import pickle

# Set random seed
np.random.seed(500)

# Load the data using pandas (adjust this path to point at the corpus.csv shipped with the repo)
Corpus = pd.read_csv(r"D:\Python\Projects\Thailand\corpus.csv", encoding='latin-1')

# Step - 1: Data pre-processing - this helps the classification algorithms produce better results

# Step - 1a : Remove blank rows, if any
Corpus.dropna(subset=['text'], inplace=True)

# WordNetLemmatizer requires POS tags to understand whether a word is a noun, verb, adjective, etc. By default it treats every word as a noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


def text_preprocessing(text):
    # Step - 1b : Change all the text to lower case. This is required as Python interprets 'dog' and 'DOG' differently.
    text = text.lower()

    # Step - 1c : Tokenization : each entry in the corpus is broken into a list of words
    text_words_list = word_tokenize(text)

    # Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization
    # Declare an empty list to store the words that pass the rules for this step
    Final_words = []
    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # The pos_tag function below provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else.
    for word, tag in pos_tag(text_words_list):
        # The condition below skips stop words and keeps alphabetic tokens only
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # Return the final processed set of words, stored in 'text_final'
    return str(Final_words)


Corpus['text_final'] = Corpus['text'].map(text_preprocessing)

# Step - 2: Split the corpus into train and test data sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['label'],
                                                                    test_size=0.3)

# Step - 3: Label encode the target variable - this transforms categorical string labels into numerical values
Encoder = LabelEncoder()
Encoder.fit(Train_Y)
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# Step - 4: Vectorize the words using the TF-IDF vectorizer - this measures how important a word in a document is in comparison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Step - 5: Now we can run different algorithms to classify our data and check their accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, Test_Y) * 100)

# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# predict the labels on the validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use the accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y) * 100)


# Saving the encoder, the TF-IDF vectorizer and the trained models for future inference/prediction

# saving the encoder to disk
filename = 'labelencoder_fitted.pkl'
pickle.dump(Encoder, open(filename, 'wb'))

# saving the TF-IDF vectorizer to disk
filename = 'Tfidf_vect_fitted.pkl'
pickle.dump(Tfidf_vect, open(filename, 'wb'))

# saving both models to disk
filename = 'svm_trained_model.sav'
pickle.dump(SVM, open(filename, 'wb'))

filename = 'nb_trained_model.sav'
pickle.dump(Naive, open(filename, 'wb'))

print("Files saved to disk! Proceed to inference.py")
--------------------------------------------------------------------------------