├── .gitignore ├── LICENCE.txt ├── README.md ├── examples ├── 20newsgroup │ └── __main__.py ├── __init__.py ├── imdb_reviews │ ├── README.md │ └── __main__.py └── requirements.txt ├── naivebayes └── __init__.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build/* 3 | *.egg-info/ 4 | dist/* 5 | MANIFEST 6 | .DS_Store 7 | data 8 | -------------------------------------------------------------------------------- /LICENCE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 naivebayes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Naive Bayes Text Classifier 2 | 3 | Text classifier based on Naive Bayes. 4 | 5 | ## Instalation 6 | 7 | ```bash 8 | $ pip install naive-bayes 9 | ``` 10 | 11 | ## Usage example 12 | 13 | ```python 14 | from naivebayes import NaiveBayesTextClassifier 15 | 16 | classifier = NaiveBayesTextClassifier( 17 | categories=categories_list, 18 | stop_words=stopwords_list 19 | ) 20 | classifier.train(train_docs, train_classes) 21 | predicted_classes = classifier.classify(test_docs) 22 | ``` 23 | 24 | `NaiveBayesTextClassifier` is a simple wrapper around `scikit-learn` class `CountVectorizer`. You can put all arguments which support this class. For more information please check `scikit-learn` official documentation. 25 | 26 | ## More examples 27 | 28 | Check examples at `examples` folder. Before run them, install requirements in this folder. 29 | 30 | Clone repository from github 31 | 32 | ```bash 33 | $ git clone git@github.com:itdxer/naive-bayes.git 34 | $ cd naive-bayes/examples 35 | $ pip install -r requirements.txt 36 | ``` 37 | 38 | And run some example 39 | 40 | ### Usenet 20 newsgroup 41 | 42 | ```bash 43 | $ python 20newsgroup 44 | ``` 45 | 46 | ### Kaggle IMDB reviews competition 47 | 48 | ```bash 49 | $ python imdb_reviews 50 | ``` -------------------------------------------------------------------------------- /examples/20newsgroup/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import urllib 4 | import tarfile 5 | from functools import partial 6 | 7 | from skll.metrics import kappa 8 | from nltk.corpus import stopwords 9 | from sklearn.metrics import classification_report, accuracy_score 10 | from sklearn.cross_validation import train_test_split 11 | from naivebayes import NaiveBayesTextClassifier 12 | 13 | 14 | print("> Start donwload 20 NewsGroup data") 15 | DATASET_URL = "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz" 16 | BASEDIR = os.path.dirname(os.path.abspath(__file__)) 17 | DATA_DIR = os.path.join(BASEDIR, "data") 18 | 19 | archive_name = DATASET_URL.split('/')[-1] 20 | archive_path = os.path.join(DATA_DIR, archive_name) 21 | path_to_data = os.path.join(DATA_DIR, "20news-18828") 22 | 23 | if not os.path.exists(DATA_DIR): 24 | os.mkdir(DATA_DIR) 25 | 26 | if not os.path.exists(archive_path): 27 | data_archive = urllib.request.URLopener() 28 | data_archive.retrieve(DATASET_URL, archive_path) 29 | 30 | if not os.path.exists(path_to_data): 31 | with tarfile.open(archive_path, "r:gz") as tar: 32 | tar.extractall(path=DATA_DIR) 33 | 34 | 35 | def prepare_file(filename, datafolder=path_to_data): 36 | filepath = os.path.join(datafolder, filename) 37 | with open(filepath, 'r', encoding='ISO-8859-1') as f: 38 | return f.read() 39 | 40 | 41 | def get_texts(categories): 42 | documents = [] 43 | classes = [] 44 | 45 | for i, category in enumerate(categories): 46 | category_files_path = os.path.join(path_to_data, category) 47 | text_ids = os.listdir(category_files_path) 48 | prepare_category_file = partial( 49 | prepare_file, datafolder=category_files_path 50 | ) 51 | texts = [prepare_category_file(x) for x in text_ids] 52 | documents += texts 53 | classes += [category] * len(texts) 54 | 55 | return documents, classes 56 | 57 | print("> Read files...") 58 | start_time = time.time() 59 | categories = os.listdir(path_to_data) 60 | 61 | # Get data 62 | print("> Split data to test and train") 63 | documents, classes = get_texts(categories) 64 | train_docs, test_docs, train_classes, test_classes = train_test_split( 65 | documents, classes, train_size=0.7 66 | ) 67 | 68 | print("> Train classifier") 69 | classifier = NaiveBayesTextClassifier( 70 | categories=categories, 71 | min_df=1, 72 | lowercase=True, 73 | # 127 English stop words 74 | stop_words=stopwords.words('english') 75 | ) 76 | classifier.train(train_docs, train_classes) 77 | 78 | print("-" * 42) 79 | print("{:<25}: {:>6} articles".format("Total", len(train_docs))) 80 | print("{:<25}: {:>6} words".format( 81 | "Number of words", classifier.bag.shape[1] 82 | )) 83 | print("{:<25}: {:>6.2f} seconds".format( 84 | "Parse time", time.time() - start_time 85 | )) 86 | print("-" * 42) 87 | 88 | start_time = time.time() 89 | print("> Start classify test data") 90 | predicted_classes = classifier.classify(test_docs) 91 | end_time = time.time() 92 | 93 | 94 | def category_to_number(classes, category_type): 95 | return list(map(category_type.index, classes)) 96 | 97 | 98 | print(classification_report(test_classes, predicted_classes)) 99 | print('-' * 42) 100 | print("{:<25}: {:>6.2f} seconds".format( 101 | "Computation time", end_time - start_time 102 | )) 103 | print("{:<25}: {:>6} articles".format("Test data size", len(test_classes))) 104 | print("{:<25}: {:>6.2f} %".format( 105 | "Accuracy", 100 * accuracy_score(test_classes, predicted_classes)) 106 | ) 107 | print("{:<25}: {:>6.2f} %".format( 108 | "Kappa statistics", 100 * kappa( 109 | category_to_number(test_classes, categories), 110 | category_to_number(predicted_classes, categories) 111 | ) 112 | )) 113 | print('-' * 42) 114 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itdxer/naive-bayes/6b41ca98359df5d2f503145aad0763a2ac65c317/examples/__init__.py -------------------------------------------------------------------------------- /examples/imdb_reviews/README.md: -------------------------------------------------------------------------------- 1 | # Kaggle competition "Bag of Words Meets Bags of Popcorn" 2 | 3 | Simple solution for [Kaggle competition](http://www.kaggle.com/c/word2vec-nlp-tutorial) using Naive Bayes calssifier. 4 | 5 | ## Usage 6 | 7 | Split train data in ratio 70/30 and check how algorithm train and test your sample result. 8 | 9 | ```bash 10 | $ python imdb_reviews --test 11 | ``` 12 | 13 | Predict result for Kaggle competition tests anda save them in `data/predictedData.csv` file 14 | 15 | ```bash 16 | $ python imdb_reviews --predict 17 | ``` -------------------------------------------------------------------------------- /examples/imdb_reviews/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import ntpath 4 | from optparse import OptionParser 5 | 6 | import numpy as np 7 | from skll.metrics import kappa 8 | from nltk.corpus import stopwords 9 | from pandas.io.parsers import read_csv 10 | from sklearn.cross_validation import train_test_split 11 | from sklearn.metrics import classification_report, accuracy_score 12 | from naivebayes import NaiveBayesTextClassifier 13 | 14 | 15 | # -------------- Init options --------------- # 16 | 17 | parser = OptionParser() 18 | parser.add_option("-t", "--test", 19 | action="store_true", dest="test", default=False, 20 | help=("Split labeled data 70/30 ration and test " 21 | "classification")) 22 | parser.add_option("-p", "--predict", 23 | action="store_true", dest="predict", default=False, 24 | help="Predict test data") 25 | 26 | (options, args) = parser.parse_args() 27 | 28 | if options.test and options.predict: 29 | raise EnvironmentError( 30 | "You can run with `--test` or `--predict` option, not both" 31 | ) 32 | 33 | # -------------- Check data --------------- # 34 | 35 | BASEDIR = os.path.dirname(os.path.abspath(__file__)) 36 | DATADIR = os.path.join(BASEDIR, "data") 37 | 38 | TEST_DATA_FILE = os.path.join(DATADIR, 'testData.tsv') 39 | LABELED_TRAIN_DATA_FILE = os.path.join(DATADIR, 'labeledTrainData.tsv') 40 | PREDICTED_DATA_FILE = os.path.join(DATADIR, 'predictedData.csv') 41 | 42 | if not os.path.exists(DATADIR): 43 | os.mkdir(DATADIR) 44 | raise EnvironmentError( 45 | "Download data from " 46 | "https://www.kaggle.com/c/word2vec-nlp-tutorial/data " 47 | "and put it in {}.".format(DATADIR) 48 | ) 49 | 50 | important_files = (TEST_DATA_FILE, LABELED_TRAIN_DATA_FILE) 51 | for tsv_file in important_files: 52 | if not os.path.exists(tsv_file): 53 | raise EnvironmentError("File {} doesn't exist at {}.".format( 54 | ntpath.basename(tsv_file), DATADIR 55 | )) 56 | 57 | print("> Read train data") 58 | train_data = read_csv(LABELED_TRAIN_DATA_FILE, sep='\t') 59 | 60 | print("> Init classifier") 61 | start_time = time.time() 62 | classifier = NaiveBayesTextClassifier( 63 | categories=[0, 1], 64 | min_df=1, 65 | lowercase=True, 66 | # 127 English stop words 67 | stop_words=stopwords.words('english') 68 | ) 69 | 70 | if options.test: 71 | print("> Split data to test and train") 72 | train_docs, test_docs, train_classes, test_classes = train_test_split( 73 | train_data.review, train_data.sentiment, train_size=0.7 74 | ) 75 | 76 | print("> Train classifier") 77 | classifier.train(train_docs, train_classes) 78 | total_docs = len(train_docs) 79 | 80 | elif options.predict: 81 | print("> Read test data") 82 | test_data = read_csv(TEST_DATA_FILE, sep='\t') 83 | 84 | print("> Train classifier") 85 | classifier.train(train_data.review, train_data.sentiment) 86 | total_docs = len(train_data) 87 | 88 | print("-" * 42) 89 | print("{:<25}: {:>6} articles".format("Total", total_docs)) 90 | print("{:<25}: {:>6} words".format( 91 | "Number of words", classifier.bag.shape[1] 92 | )) 93 | print("{:<25}: {:>6.2f} seconds".format( 94 | "Parse time", time.time() - start_time 95 | )) 96 | print("-" * 42) 97 | 98 | # -------------- Classify --------------- # 99 | 100 | print("> Start classify data") 101 | start_time = time.time() 102 | 103 | if options.test: 104 | predicted_classes = classifier.classify(test_docs) 105 | 106 | print(classification_report(test_classes, predicted_classes)) 107 | print('-' * 42) 108 | print("{:<25}: {:>6} articles".format("Test data size", len(test_classes))) 109 | print("{:<25}: {:>6.2f} %".format( 110 | "Accuracy", 100 * accuracy_score(test_classes, predicted_classes)) 111 | ) 112 | print("{:<25}: {:>6.2f} %".format( 113 | "Kappa statistics", 100 * kappa(test_classes, predicted_classes) 114 | )) 115 | 116 | elif options.predict: 117 | predicted_classes = classifier.classify(test_data.review) 118 | 119 | print("> Save predicted results") 120 | print("> {}".format(PREDICTED_DATA_FILE)) 121 | np.savetxt( 122 | PREDICTED_DATA_FILE, 123 | np.concatenate( 124 | (test_data.values[:, 0:1], np.matrix(predicted_classes).T), 125 | axis=1 126 | ), 127 | delimiter=',', header='id,sentiment', comments='', fmt="%s" 128 | ) 129 | print('-' * 42) 130 | 131 | 132 | end_time = time.time() 133 | print("{:<25}: {:>6.2f} seconds".format( 134 | "Computation time", end_time - start_time 135 | )) 136 | print('-' * 42) 137 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | nltk==3.0.2 2 | scikit-learn==0.15.2 3 | skll==1.0.1 4 | naive-bayes==0.1.0 5 | pandas==0.16.0 6 | numpy==1.9.0 7 | -------------------------------------------------------------------------------- /naivebayes/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | 4 | 5 | class NaiveBayesTextClassifier(object): 6 | def __init__(self, categories, **kwargs): 7 | self.categories = categories 8 | self.vectorizer = CountVectorizer(**kwargs) 9 | 10 | # Will populate this variables in `train` method 11 | self.bag = None 12 | self.min_category_prob = None 13 | 14 | def train(self, documents, classes): 15 | total_docs = len(documents) 16 | categories = self.categories 17 | total_categories = len(categories) 18 | classes = np.array(list(map(categories.index, classes))) 19 | 20 | data = self.vectorizer.fit_transform(documents).toarray() 21 | 22 | row_combination_matrix = np.zeros((total_categories, total_docs)) 23 | for i, category in enumerate(categories): 24 | row_combination_matrix[i, (classes == i)] = 1 25 | 26 | # Combine all words from one class 27 | data = np.dot(row_combination_matrix, data) 28 | number_of_words = data.shape[1] 29 | 30 | # Compute logarithmic probabilities 31 | words_in_categories = np.reshape((data != 0).sum(axis=1), 32 | (total_categories, 1)) 33 | data = np.log((data + 1) / (words_in_categories + number_of_words)) 34 | min_category_prob = np.log( 35 | 1 / (words_in_categories + number_of_words) 36 | ) 37 | 38 | self.bag = data 39 | self.min_category_prob = min_category_prob 40 | 41 | def classify(self, documents): 42 | if self.bag is None: 43 | raise AttributeError( 44 | "Your bag is empty. Train it before classify." 45 | ) 46 | 47 | total_docs = len(documents) 48 | categories = self.categories 49 | vectorizer = self.vectorizer 50 | analyze = vectorizer.build_analyzer() 51 | 52 | data = vectorizer.transform(documents).toarray() 53 | counted_words_number = np.reshape(data.sum(axis=1), (total_docs, 1)) 54 | probabilities = np.dot(self.bag, data.T) 55 | 56 | # `scikit-learn` ignore all words which we didn't use in train 57 | # examples. for this reason we must compute count of words again and 58 | # store them, we will balance probabilities with this information. 59 | total_words_number = np.zeros((total_docs, 1)) 60 | for i, doc in enumerate(documents): 61 | total_words_number[i, :] = len(analyze(doc)) 62 | 63 | ignored_words_number = total_words_number - counted_words_number 64 | probabilities += (ignored_words_number.T * self.min_category_prob) 65 | 66 | return list(map(categories.__getitem__, probabilities.argmax(axis=0))) 67 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="naive-bayes", 5 | version="0.1.1", 6 | author="Yurii Shevchuk", 7 | author_email="mail@itdxer.com", 8 | keywords="naive bayes text classification classifier", 9 | packages=["naivebayes"], 10 | description="Naive Bayes Text Classification", 11 | install_requires=[ 12 | "scikit-learn>=0.15.2", 13 | "numpy>=1.9.0", 14 | ], 15 | ) 16 | --------------------------------------------------------------------------------