├── .gitignore
├── LICENCE.txt
├── README.md
├── examples
    ├── 20newsgroup
    │   └── __main__.py
    ├── __init__.py
    ├── imdb_reviews
    │   ├── README.md
    │   └── __main__.py
    └── requirements.txt
├── naivebayes
    └── __init__.py
├── setup.cfg
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | build/*
3 | *.egg-info/
4 | dist/*
5 | MANIFEST
6 | .DS_Store
7 | data
8 | 


--------------------------------------------------------------------------------
/LICENCE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 naivebayes
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Naive Bayes Text Classifier
 2 | 
 3 | Text classifier based on Naive Bayes.
 4 | 
 5 | ## Installation
 6 | 
 7 | ```bash
 8 | $ pip install naive-bayes
 9 | ```
10 | 
11 | ## Usage example
12 | 
13 | ```python
14 | from naivebayes import NaiveBayesTextClassifier
15 | 
16 | classifier = NaiveBayesTextClassifier(
17 |     categories=categories_list,
18 |     stop_words=stopwords_list
19 | )
20 | classifier.train(train_docs, train_classes)
21 | predicted_classes = classifier.classify(test_docs)
22 | ```
23 | 
24 | `NaiveBayesTextClassifier` is a simple wrapper around the `scikit-learn` class `CountVectorizer`. Any keyword argument that `CountVectorizer` supports can be passed to the constructor. For more information, please check the official `scikit-learn` documentation.
25 | 
26 | ## More examples
27 | 
28 | See the examples in the `examples` folder. Before running them, install the requirements listed in that folder.
29 | 
30 | Clone the repository from GitHub:
31 | 
32 | ```bash
33 | $ git clone git@github.com:itdxer/naive-bayes.git
34 | $ cd naive-bayes/examples
35 | $ pip install -r requirements.txt
36 | ```
37 | 
38 | Then run one of the examples:
39 | 
40 | ### Usenet 20 newsgroup
41 | 
42 | ```bash
43 | $ python 20newsgroup
44 | ```
45 | 
46 | ### Kaggle IMDB reviews competition
47 | 
48 | ```bash
49 | $ python imdb_reviews
50 | ```


--------------------------------------------------------------------------------
/examples/20newsgroup/__main__.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import urllib
  4 | import tarfile
  5 | from functools import partial
  6 | 
  7 | from skll.metrics import kappa
  8 | from nltk.corpus import stopwords
  9 | from sklearn.metrics import classification_report, accuracy_score
 10 | from sklearn.cross_validation import train_test_split
 11 | from naivebayes import NaiveBayesTextClassifier
 12 | 
 13 | 
 14 | print("> Start donwload 20 NewsGroup data")
 15 | DATASET_URL = "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz"
 16 | BASEDIR = os.path.dirname(os.path.abspath(__file__))
 17 | DATA_DIR = os.path.join(BASEDIR, "data")
 18 | 
 19 | archive_name = DATASET_URL.split('/')[-1]
 20 | archive_path = os.path.join(DATA_DIR, archive_name)
 21 | path_to_data = os.path.join(DATA_DIR, "20news-18828")
 22 | 
 23 | if not os.path.exists(DATA_DIR):
 24 |     os.mkdir(DATA_DIR)
 25 | 
 26 | if not os.path.exists(archive_path):
 27 |     data_archive = urllib.request.URLopener()
 28 |     data_archive.retrieve(DATASET_URL, archive_path)
 29 | 
 30 | if not os.path.exists(path_to_data):
 31 |     with tarfile.open(archive_path, "r:gz") as tar:
 32 |         tar.extractall(path=DATA_DIR)
 33 | 
 34 | 
 35 | def prepare_file(filename, datafolder=path_to_data):
 36 |     filepath = os.path.join(datafolder, filename)
 37 |     with open(filepath, 'r', encoding='ISO-8859-1') as f:
 38 |         return f.read()
 39 | 
 40 | 
 41 | def get_texts(categories):
 42 |     documents = []
 43 |     classes = []
 44 | 
 45 |     for i, category in enumerate(categories):
 46 |         category_files_path = os.path.join(path_to_data, category)
 47 |         text_ids = os.listdir(category_files_path)
 48 |         prepare_category_file = partial(
 49 |             prepare_file, datafolder=category_files_path
 50 |         )
 51 |         texts = [prepare_category_file(x) for x in text_ids]
 52 |         documents += texts
 53 |         classes += [category] * len(texts)
 54 | 
 55 |     return documents, classes
 56 | 
 57 | print("> Read files...")
 58 | start_time = time.time()
 59 | categories = os.listdir(path_to_data)
 60 | 
 61 | # Get data
 62 | print("> Split data to test and train")
 63 | documents, classes = get_texts(categories)
 64 | train_docs, test_docs, train_classes, test_classes = train_test_split(
 65 |     documents, classes, train_size=0.7
 66 | )
 67 | 
 68 | print("> Train classifier")
 69 | classifier = NaiveBayesTextClassifier(
 70 |     categories=categories,
 71 |     min_df=1,
 72 |     lowercase=True,
 73 |     # 127 English stop words
 74 |     stop_words=stopwords.words('english')
 75 | )
 76 | classifier.train(train_docs, train_classes)
 77 | 
 78 | print("-" * 42)
 79 | print("{:<25}: {:>6} articles".format("Total", len(train_docs)))
 80 | print("{:<25}: {:>6} words".format(
 81 |     "Number of words", classifier.bag.shape[1]
 82 | ))
 83 | print("{:<25}: {:>6.2f} seconds".format(
 84 |     "Parse time", time.time() - start_time
 85 | ))
 86 | print("-" * 42)
 87 | 
 88 | start_time = time.time()
 89 | print("> Start classify test data")
 90 | predicted_classes = classifier.classify(test_docs)
 91 | end_time = time.time()
 92 | 
 93 | 
 94 | def category_to_number(classes, category_type):
 95 |     return list(map(category_type.index, classes))
 96 | 
 97 | 
 98 | print(classification_report(test_classes, predicted_classes))
 99 | print('-' * 42)
100 | print("{:<25}: {:>6.2f} seconds".format(
101 |     "Computation time", end_time - start_time
102 | ))
103 | print("{:<25}: {:>6} articles".format("Test data size", len(test_classes)))
104 | print("{:<25}: {:>6.2f} %".format(
105 |     "Accuracy", 100 * accuracy_score(test_classes, predicted_classes))
106 | )
107 | print("{:<25}: {:>6.2f} %".format(
108 |     "Kappa statistics", 100 * kappa(
109 |         category_to_number(test_classes, categories),
110 |         category_to_number(predicted_classes, categories)
111 |     )
112 | ))
113 | print('-' * 42)
114 | 


--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itdxer/naive-bayes/6b41ca98359df5d2f503145aad0763a2ac65c317/examples/__init__.py


--------------------------------------------------------------------------------
/examples/imdb_reviews/README.md:
--------------------------------------------------------------------------------
 1 | # Kaggle competition "Bag of Words Meets Bags of Popcorn"
 2 | 
 3 | Simple solution for the [Kaggle competition](http://www.kaggle.com/c/word2vec-nlp-tutorial) using a Naive Bayes classifier.
 4 | 
 5 | ## Usage
 6 | 
 7 | Split the labeled training data 70/30, train the classifier on the larger part and evaluate it on the remaining part.
 8 | 
 9 | ```bash
10 | $ python imdb_reviews --test
11 | ```
12 | 
13 | Predict results for the Kaggle competition test data and save them in the `data/predictedData.csv` file.
14 | 
15 | ```bash
16 | $ python imdb_reviews --predict
17 | ```


--------------------------------------------------------------------------------
/examples/imdb_reviews/__main__.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import ntpath
  4 | from optparse import OptionParser
  5 | 
  6 | import numpy as np
  7 | from skll.metrics import kappa
  8 | from nltk.corpus import stopwords
  9 | from pandas.io.parsers import read_csv
 10 | from sklearn.cross_validation import train_test_split
 11 | from sklearn.metrics import classification_report, accuracy_score
 12 | from naivebayes import NaiveBayesTextClassifier
 13 | 
 14 | 
 15 | # -------------- Init options --------------- #
 16 | 
 17 | parser = OptionParser()
 18 | parser.add_option("-t", "--test",
 19 |                   action="store_true", dest="test", default=False,
 20 |                   help=("Split labeled data 70/30 ration and test "
 21 |                         "classification"))
 22 | parser.add_option("-p", "--predict",
 23 |                   action="store_true", dest="predict", default=False,
 24 |                   help="Predict test data")
 25 | 
 26 | (options, args) = parser.parse_args()
 27 | 
 28 | if options.test and options.predict:
 29 |     raise EnvironmentError(
 30 |         "You can run with `--test` or `--predict` option, not both"
 31 |     )
 32 | 
 33 | # -------------- Check data --------------- #
 34 | 
 35 | BASEDIR = os.path.dirname(os.path.abspath(__file__))
 36 | DATADIR = os.path.join(BASEDIR, "data")
 37 | 
 38 | TEST_DATA_FILE = os.path.join(DATADIR, 'testData.tsv')
 39 | LABELED_TRAIN_DATA_FILE = os.path.join(DATADIR, 'labeledTrainData.tsv')
 40 | PREDICTED_DATA_FILE = os.path.join(DATADIR, 'predictedData.csv')
 41 | 
 42 | if not os.path.exists(DATADIR):
 43 |     os.mkdir(DATADIR)
 44 |     raise EnvironmentError(
 45 |         "Download data from "
 46 |         "https://www.kaggle.com/c/word2vec-nlp-tutorial/data "
 47 |         "and put it in {}.".format(DATADIR)
 48 |     )
 49 | 
 50 | important_files = (TEST_DATA_FILE, LABELED_TRAIN_DATA_FILE)
 51 | for tsv_file in important_files:
 52 |     if not os.path.exists(tsv_file):
 53 |         raise EnvironmentError("File {} doesn't exist at {}.".format(
 54 |             ntpath.basename(tsv_file), DATADIR
 55 |         ))
 56 | 
 57 | print("> Read train data")
 58 | train_data = read_csv(LABELED_TRAIN_DATA_FILE, sep='\t')
 59 | 
 60 | print("> Init classifier")
 61 | start_time = time.time()
 62 | classifier = NaiveBayesTextClassifier(
 63 |     categories=[0, 1],
 64 |     min_df=1,
 65 |     lowercase=True,
 66 |     # 127 English stop words
 67 |     stop_words=stopwords.words('english')
 68 | )
 69 | 
 70 | if options.test:
 71 |     print("> Split data to test and train")
 72 |     train_docs, test_docs, train_classes, test_classes = train_test_split(
 73 |         train_data.review, train_data.sentiment, train_size=0.7
 74 |     )
 75 | 
 76 |     print("> Train classifier")
 77 |     classifier.train(train_docs, train_classes)
 78 |     total_docs = len(train_docs)
 79 | 
 80 | elif options.predict:
 81 |     print("> Read test data")
 82 |     test_data = read_csv(TEST_DATA_FILE, sep='\t')
 83 | 
 84 |     print("> Train classifier")
 85 |     classifier.train(train_data.review, train_data.sentiment)
 86 |     total_docs = len(train_data)
 87 | 
 88 | print("-" * 42)
 89 | print("{:<25}: {:>6} articles".format("Total", total_docs))
 90 | print("{:<25}: {:>6} words".format(
 91 |     "Number of words", classifier.bag.shape[1]
 92 | ))
 93 | print("{:<25}: {:>6.2f} seconds".format(
 94 |     "Parse time", time.time() - start_time
 95 | ))
 96 | print("-" * 42)
 97 | 
 98 | # -------------- Classify --------------- #
 99 | 
print("> Start classify data")
start_time = time.time()

# Classify first and stop the clock right away, so that "Computation time"
# measures classification only and not the reporting or file writing below.
if options.test:
    predicted_classes = classifier.classify(test_docs)
elif options.predict:
    predicted_classes = classifier.classify(test_data.review)
end_time = time.time()

if options.test:
    # Compare the predictions with the held-out labels.
    print(classification_report(test_classes, predicted_classes))
    print('-' * 42)
    print("{:<25}: {:>6} articles".format("Test data size", len(test_classes)))
    print("{:<25}: {:>6.2f} %".format(
        "Accuracy", 100 * accuracy_score(test_classes, predicted_classes))
    )
    print("{:<25}: {:>6.2f} %".format(
        "Kappa statistics", 100 * kappa(test_classes, predicted_classes)
    ))

elif options.predict:
    print("> Save predicted results")
    print("> {}".format(PREDICTED_DATA_FILE))
    # Two-column table: the first column of the test data next to the
    # predicted class. `np.column_stack` replaces the discouraged
    # `np.matrix` class and yields the same rows.
    submission = np.column_stack(
        (test_data.values[:, 0], predicted_classes)
    )
    np.savetxt(
        PREDICTED_DATA_FILE,
        submission,
        delimiter=',', header='id,sentiment', comments='', fmt="%s"
    )
    print('-' * 42)


print("{:<25}: {:>6.2f} seconds".format(
    "Computation time", end_time - start_time
))
print('-' * 42)
137 | 


--------------------------------------------------------------------------------
/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | nltk==3.0.2
2 | scikit-learn==0.15.2
3 | skll==1.0.1
4 | naive-bayes==0.1.0
5 | pandas==0.16.0
6 | numpy==1.9.0
7 | 


--------------------------------------------------------------------------------
/naivebayes/__init__.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.feature_extraction.text import CountVectorizer
 3 | 
 4 | 
 5 | class NaiveBayesTextClassifier(object):
 6 |     def __init__(self, categories, **kwargs):
 7 |         self.categories = categories
 8 |         self.vectorizer = CountVectorizer(**kwargs)
 9 | 
10 |         # Will populate this variables in `train` method
11 |         self.bag = None
12 |         self.min_category_prob = None
13 | 
14 |     def train(self, documents, classes):
15 |         total_docs = len(documents)
16 |         categories = self.categories
17 |         total_categories = len(categories)
18 |         classes = np.array(list(map(categories.index, classes)))
19 | 
20 |         data = self.vectorizer.fit_transform(documents).toarray()
21 | 
22 |         row_combination_matrix = np.zeros((total_categories, total_docs))
23 |         for i, category in enumerate(categories):
24 |             row_combination_matrix[i, (classes == i)] = 1
25 | 
26 |         # Combine all words from one class
27 |         data = np.dot(row_combination_matrix, data)
28 |         number_of_words = data.shape[1]
29 | 
30 |         # Compute logarithmic probabilities
31 |         words_in_categories = np.reshape((data != 0).sum(axis=1),
32 |                                          (total_categories, 1))
33 |         data = np.log((data + 1) / (words_in_categories + number_of_words))
34 |         min_category_prob = np.log(
35 |             1 / (words_in_categories + number_of_words)
36 |         )
37 | 
38 |         self.bag = data
39 |         self.min_category_prob = min_category_prob
40 | 
41 |     def classify(self, documents):
42 |         if self.bag is None:
43 |             raise AttributeError(
44 |                 "Your bag is empty. Train it before classify."
45 |             )
46 | 
47 |         total_docs = len(documents)
48 |         categories = self.categories
49 |         vectorizer = self.vectorizer
50 |         analyze = vectorizer.build_analyzer()
51 | 
52 |         data = vectorizer.transform(documents).toarray()
53 |         counted_words_number = np.reshape(data.sum(axis=1), (total_docs, 1))
54 |         probabilities = np.dot(self.bag, data.T)
55 | 
56 |         # `scikit-learn` ignore all words which we didn't use in train
57 |         # examples. for this reason we must compute count of words again and
58 |         # store them, we will balance probabilities with this information.
59 |         total_words_number = np.zeros((total_docs, 1))
60 |         for i, doc in enumerate(documents):
61 |             total_words_number[i, :] = len(analyze(doc))
62 | 
63 |         ignored_words_number = total_words_number - counted_words_number
64 |         probabilities += (ignored_words_number.T * self.min_category_prob)
65 | 
66 |         return list(map(categories.__getitem__, probabilities.argmax(axis=0)))
67 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
# Package metadata for the `naive-bayes` distribution.
setup(
    # Distribution name, as used in `pip install naive-bayes`.
    name="naive-bayes",
    version="0.1.1",
    author="Yurii Shevchuk",
    author_email="mail@itdxer.com",
    keywords="naive bayes text classification classifier",
    # Only the `naivebayes` package is listed; `examples` is not.
    packages=["naivebayes"],
    description="Naive Bayes Text Classification",
    # Libraries imported by `naivebayes/__init__.py`.
    install_requires=[
        "scikit-learn>=0.15.2",
        "numpy>=1.9.0",
    ],
)
16 | 


--------------------------------------------------------------------------------