├── requirements.txt
├── .gitignore
├── MANIFEST.in
├── naiveBayesClassifier
│   ├── __init__.py
│   ├── ExceptionNotSeen.py
│   ├── tokenizer.py
│   ├── trainer.py
│   ├── trainedData.py
│   └── classifier.py
├── setup.py
├── LICENSE
├── examples
│   └── newsClassifier.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.egg-info
*.pyc
*~
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.txt
include *.md
--------------------------------------------------------------------------------
/naiveBayesClassifier/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/naiveBayesClassifier/ExceptionNotSeen.py:
--------------------------------------------------------------------------------
class NotSeen(Exception):
    """
    Exception raised for tokens which are not indexed
    because they were never seen in the training data.
    """
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return "Token '{}' is never seen in the training set.".format(self.value)
--------------------------------------------------------------------------------
/naiveBayesClassifier/tokenizer.py:
--------------------------------------------------------------------------------
import re


class Tokenizer(object):
    def __init__(self, stop_words=[], signs_to_remove=["?!#%&"]):
        self.stop_words = stop_words
        self.signs_to_remove = signs_to_remove

    def tokenize(self, text):
        # split on any whitespace after lower-casing
        return text.lower().split()

    def remove_stop_words(self, token):
        if token in self.stop_words:
            return "stop_word"
        else:
            return token

    def remove_punctuation(self, token):
        # build a character class from the configured signs; substituting the
        # list's string representation (the previous behaviour) also stripped
        # apostrophes and commas by accident
        pattern = "[" + re.escape("".join(self.signs_to_remove)) + "]"
        return re.sub(pattern, "", token)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

try:
    # there is no "setuptools.core" module; import setup from setuptools directly
    from setuptools import setup
except ImportError:
    from distutils.core import setup


setup(name='naiveBayesClassifier',
      version='0.1.3.1',
      license='MIT',
      description='yet another general purpose naive bayesian classifier',
      long_description=open('README.md').read(),
      url='https://github.com/muatik/naive-bayes-classifier',
      author='Mustafa Atik',
      author_email='muatik@gmail.com',
      maintainer='Nejdet Yucesoy',
      maintainer_email='nejdetyucesoy@gmail.com',
      packages=['naiveBayesClassifier'],
      platforms='any')
--------------------------------------------------------------------------------
/naiveBayesClassifier/trainer.py:
--------------------------------------------------------------------------------
from naiveBayesClassifier.trainedData import TrainedData


class Trainer(object):
    """Feeds pre-classified texts into a TrainedData instance."""

    def __init__(self, tokenizer):
        super(Trainer, self).__init__()
        self.tokenizer = tokenizer
        self.data = TrainedData()

    def train(self, text, className):
        """
        enhances the trained data using the given text and class
        """
        self.data.increaseClass(className)

        tokens = self.tokenizer.tokenize(text)
        for token in tokens:
            # stop words are collapsed into the placeholder token "stop_word"
            token = self.tokenizer.remove_stop_words(token)
            token = self.tokenizer.remove_punctuation(token)
            self.data.increaseToken(token, className)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Mustafa Atik

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/examples/newsClassifier.py:
--------------------------------------------------------------------------------
"""
Suppose you have some news texts and know their categories.
You want to train a system with these pre-categorized/pre-classified
texts, so you had better call this data your training set.
"""
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

# the Trainer expects a Tokenizer instance, not the tokenizer module itself
newsTrainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

# You need to train the system by passing each text one by one to the trainer module.
newsSet = [
    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
    {'text': 'Russia is trying to invade Ukraine', 'category': 'politics'},
    {'text': 'do not neglect exercise', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
    {'text': 'eat to lose weight', 'category': 'health'},
    {'text': 'you should not eat much', 'category': 'health'}
]
for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

# Now you have a classifier that can try to classify news texts whose
# category is not yet known.
classification = newsClassifier.classify("Obama is")

# the classification variable holds the detected categories sorted by score
print(classification)
--------------------------------------------------------------------------------
/naiveBayesClassifier/trainedData.py:
--------------------------------------------------------------------------------
import sys
from naiveBayesClassifier.ExceptionNotSeen import NotSeen


class TrainedData(object):
    def __init__(self):
        self.docCountOfClasses = {}
        self.frequencies = {}

    def increaseClass(self, className, byAmount=1):
        self.docCountOfClasses[className] = self.docCountOfClasses.get(className, 0) + byAmount

    def increaseToken(self, token, className, byAmount=1):
        if token not in self.frequencies:
            self.frequencies[token] = {}

        self.frequencies[token][className] = self.frequencies[token].get(className, 0) + byAmount

    def decreaseToken(self, token, className, byAmount=1):
        if token not in self.frequencies:
            raise NotSeen(token)
        foundToken = self.frequencies[token]
        if className not in foundToken:
            sys.stderr.write("Warning: token %s has no entry for class %s. Not decreasing.\n" % (token, className))
            return
        if foundToken[className] < byAmount:
            raise ArithmeticError("Could not decrease %s/%s count (%i) by %i, "
                                  "as that would result in a negative number." % (
                                      token, className, foundToken[className], byAmount))
        foundToken[className] -= byAmount

    def getDocCount(self):
        """
        returns the total number of documents
        """
        return sum(self.docCountOfClasses.values())

    def getClasses(self):
        """
        returns the names of the available classes as a list
        """
        return list(self.docCountOfClasses.keys())

    def getClassDocCount(self, className):
        """
        returns the document count of the class.
        If the class is not available, it returns None
        """
        return self.docCountOfClasses.get(className, None)

    def getFrequency(self, token, className):
        if token in self.frequencies:
            foundToken = self.frequencies[token]
            return foundToken.get(className)
        else:
            raise NotSeen(token)
--------------------------------------------------------------------------------
/naiveBayesClassifier/classifier.py:
--------------------------------------------------------------------------------
from __future__ import division
import operator
from functools import reduce

from naiveBayesClassifier.ExceptionNotSeen import NotSeen


class Classifier(object):
    """Scores texts against the classes recorded in a TrainedData instance."""

    def __init__(self, trainedData, tokenizer):
        super(Classifier, self).__init__()
        self.data = trainedData
        self.tokenizer = tokenizer
        self.defaultProb = 0.000000001

    def classify(self, text):

        documentCount = self.data.getDocCount()
        classes = self.data.getClasses()

        # only unique tokens
        tokens = list(set(self.tokenizer.tokenize(text)))

        probsOfClasses = {}

        for className in classes:

            # we are calculating the probability of seeing each token
            # in the text of this class
            # P(Token_1|Class_i)
            tokensProbs = [self.getTokenProb(token, className) for token in tokens]

            # calculating the probability of seeing the set of tokens
            # in the text of this class
            # P(Token_1|Class_i) * P(Token_2|Class_i) * ... * P(Token_n|Class_i)
            try:
                tokenSetProb = reduce(lambda a, b: a * b, (i for i in tokensProbs if i))
            except TypeError:
                # reduce() raises TypeError when none of the tokens has a known
                # probability, i.e. the generator above is empty
                tokenSetProb = 0

            probsOfClasses[className] = tokenSetProb * self.getPrior(className)

        return sorted(probsOfClasses.items(),
                      key=operator.itemgetter(1),
                      reverse=True)

    def getPrior(self, className):
        return self.data.getClassDocCount(className) / self.data.getDocCount()

    def getTokenProb(self, token, className):
        # P(token|Class_i)
        classDocumentCount = self.data.getClassDocCount(className)

        # if the token was not seen in the training set (so it is not indexed),
        # we return None so that it is excluded from the calculation.
        try:
            tokenFrequency = self.data.getFrequency(token, className)
        except NotSeen:
            return None

        # the token was seen in other classes but not in this one.
        if tokenFrequency is None:
            return self.defaultProb

        probability = tokenFrequency / classDocumentCount
        return probability
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Naive Bayesian Classifier
======================

Yet another general purpose Naive Bayesian classifier.

## Installation
You can install this package using the following `pip` command:

```sh
$ sudo pip install naiveBayesClassifier
```


## Example

```python
"""
Suppose you have some news texts and know their categories.
You want to train a system with these pre-categorized/pre-classified
texts, so you had better call this data your training set.
"""
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

# You need to train the system by passing each text one by one to the trainer module.
newsSet = [
    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
    {'text': 'Russia is trying to invade Ukraine', 'category': 'politics'},
    {'text': 'do not neglect exercise', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
    {'text': 'eat to lose weight', 'category': 'health'},
    {'text': 'you should not eat much', 'category': 'health'}
]

for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

# Now you have a classifier that can try to classify news texts whose
# category is not yet known.
unknownInstance = "Even if I eat too much, isn't it possible to lose some weight"
classification = newsClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by
# their probability value
print(classification)
```
***Note***: You will definitely need much more training data than the amount in the above example. A few lines of text, as in the example, are nowhere near enough to make a sufficient training set.
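
For reference, `classify()` returns a plain list of `(category, score)` tuples sorted by score in descending order, so picking the winning category is just a matter of indexing. Below is a minimal sketch continuing the example above; the numbers in the comment are made up for illustration:

```python
# classification is a list of (category, score) tuples, most likely first,
# e.g. something like [('health', 1.5e-05), ('politics', 3.2e-09)]
best_category, best_score = classification[0]
print("predicted category: {}".format(best_category))

# the raw scores are not normalized probabilities; divide by their sum
# if you want values that are easier to compare at a glance
total = sum(score for _, score in classification)
if total > 0:
    for category, score in classification:
        print("{}: {:.2%}".format(category, score / total))
```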


## What is the Naive Bayes Theorem and Classifier
There is no need to explain everything once again here; instead, one of the most eloquent explanations is quoted below.

The following explanation is quoted from [another Bayes classifier][1], which is written in Go.

> BAYESIAN CLASSIFICATION REFRESHER: suppose you have a set of classes
> (e.g. categories) C := {C_1, ..., C_n}, and a document D consisting
> of words D := {W_1, ..., W_k}. We wish to ascertain the probability
> that the document belongs to some class C_j given some set of
> training data associating documents and classes.
>
> By Bayes' Theorem, we have that
>
> P(C_j|D) = P(D|C_j)*P(C_j)/P(D).
>
> The LHS is the probability that the document belongs to class C_j
> given the document itself (by which is meant, in practice, the word
> frequencies occurring in this document), and our program will
> calculate this probability for each j and spit out the most likely
> class for this document.
>
> P(C_j) is referred to as the "prior" probability, or the probability
> that a document belongs to C_j in general, without seeing the
> document first. P(D|C_j) is the probability of seeing such a
> document, given that it belongs to C_j. Here, by assuming that words
> appear independently in documents (this being the "naive"
> assumption), we can estimate
>
> P(D|C_j) ~= P(W_1|C_j)*...*P(W_k|C_j)
>
> where P(W_i|C_j) is the probability of seeing the given word in a
> document of the given class. Finally, P(D) can be seen as merely a
> scaling factor and is not strictly relevant to classification,
> unless you want to normalize the resulting scores and actually see
> probabilities. In this case, note that
>
> P(D) = SUM_j(P(D|C_j)*P(C_j))
>
> One practical issue with performing these calculations is the
> possibility of float64 underflow when calculating P(D|C_j), as
> individual word probabilities can be arbitrarily small, and a
> document can have an arbitrarily large number of them. A typical
> method for dealing with this case is to transform the probability to
> the log domain and perform additions instead of multiplications:
>
> log P(C_j|D) ~ log(P(C_j)) + SUM_i(log P(W_i|C_j))
>
> where i = 1, ..., k. Note that by doing this, we are discarding the
> scaling factor P(D) and our scores are no longer probabilities;
> however, the monotonic relationship of the scores is preserved by the
> log function.

If you are very curious about the Naive Bayes theorem, you may find the following list helpful:

* [Insect Examples][2]
* [Stanford NLP - Bayes Classifier][3]

## Improvements
This classifier ships with a very simple tokenizer that merely splits sentences into lower-cased words. If your training set is large, you can rely on this simple tokenizer; otherwise you should plug in a better tokenizer specialized for the language of your training texts, as sketched below.
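
`Trainer.train()` only calls `tokenize`, `remove_stop_words` and `remove_punctuation` on the tokenizer you pass in, and `Classifier.classify()` only calls `tokenize`, so any object providing those three methods can be dropped in. The sketch below is illustrative only; the `RegexTokenizer` class, its regular expression and the stop-word list are assumptions, not part of this package:

```python
import re

from naiveBayesClassifier.trainer import Trainer


class RegexTokenizer(object):
    """Hypothetical tokenizer that extracts word characters instead of splitting on spaces."""

    def __init__(self, stop_words=None):
        self.stop_words = set(stop_words or [])

    def tokenize(self, text):
        # \w+ keeps word characters and drops punctuation in a single pass
        return re.findall(r"\w+", text.lower())

    def remove_stop_words(self, token):
        # keep the package's convention of mapping stop words to a placeholder token
        return "stop_word" if token in self.stop_words else token

    def remove_punctuation(self, token):
        # tokenize() has already removed punctuation, so there is nothing left to strip
        return token


newsTrainer = Trainer(RegexTokenizer(stop_words=["the", "is", "to"]))
newsTrainer.train("Syria is the main issue, Obama says", "politics")
```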

## TODO
* inline docs
* unit-tests

## AUTHORS
* Mustafa Atik @muatik
* Nejdet Yucesoy @nejdetckenobi


[1]: https://github.com/jbrukh/bayesian/blob/master/bayesian.go
[2]: http://www.cs.ucr.edu/~eamonn/CE/Bayesian%20Classification%20withInsect_examples.pdf
[3]: http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
--------------------------------------------------------------------------------