├── .gitignore ├── LICENSE ├── README.md ├── bayes.db ├── bayes.py ├── classify.py ├── db.py ├── learn.py ├── mode.py ├── reset.py ├── status.py ├── testharness.py └── words.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Rob Dawson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Naive Bayesian Classifier 2 | ========================= 3 | This is an implementation of a Naive Bayesian Classifier written in Python. The utility uses statistical methods to classify documents, based on the words that appear within them. A common application for this type of software is in email spam filters. 4 | 5 | The utility must first be 'trained' using large numbers of pre-classified documents; during the training phase a database is populated with information about how often certain words appear in each type of document. Once training is complete, unclassified documents can be submitted to the classifier, which will return a value between 0 and 1, indicating the probability that the document belongs to one class of document rather than another. 
6 | 7 | Training 8 | -------- 9 | 10 | To train the utility, use the following command: 11 | 12 | python bayes.py learn <doctype> <file> <count> 13 | 14 | + The *doctype* argument can be any non-empty value - this is just the name you have chosen for the type of document that you are showing to the classifier 15 | + The *file* argument indicates the location of the file containing the training data that you wish to use 16 | + The *count* argument is a numeric value indicating the number of separate documents contained in the training data file 17 | 18 | For example: 19 | 20 | python bayes.py learn spam all_my_spam.txt 10000 21 | python bayes.py learn ham inbox.txt 10000 22 | 23 | Classification 24 | -------------- 25 | 26 | Once training is complete, classification is performed using this command: 27 | 28 | python bayes.py classify <file> <doctype> <doctype> 29 | 30 | + The *file* argument indicates the location of the file containing the document to be classified 31 | + The two *doctype* arguments are the names of the document types against which the input file will be compared 32 | 33 | For example: 34 | 35 | python bayes.py classify nigerian_finance_email.txt spam ham 36 | > Probability that document is spam rather than ham is 0.98 37 | -------------------------------------------------------------------------------- /bayes.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codebox/bayesian-classifier/a491230f141b5a0be287dd7298b150f944ff63f7/bayes.db -------------------------------------------------------------------------------- /bayes.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from learn import Learn 3 | from classify import Classify 4 | from reset import Reset 5 | from status import Status 6 | 7 | modes = {} 8 | 9 | def register_mode(mode_class): 10 | modes[mode_class.__name__.lower()] = mode_class 11 | 12 | if __name__ == '__main__': 13 | try: 14 | register_mode(Learn) 15 | 
from __future__ import division

import math

from mode import Mode
from db import Db
from words import text_to_list


class Classify(Mode):
    """Mode that scores a document against two previously learned doctypes.

    The result of ``execute`` is a probability between 0 and 1 that the
    document belongs to ``doctype1`` rather than ``doctype2``.
    """

    # Words seen fewer than this many times (both doctypes combined) are
    # treated as uninformative.
    MIN_WORD_COUNT = 5
    # Probability assigned to such rare words.
    RARE_WORD_PROB = 0.5
    # Probability assigned to a word that was only ever seen in doctype1
    # (and 1 minus this value when it was only ever seen in doctype2).
    EXCLUSIVE_WORD_PROB = 0.99

    def set_text(self, text):
        """Tokenise *text* and store the resulting words on the instance.

        Raises ValueError when no usable word is left after cleaning.
        Returns self so that calls can be chained.
        """
        # list() keeps this working even if text_to_list yields a lazy iterable.
        words = list(text_to_list(text))

        if not words:
            raise ValueError('Text did not contain any valid words')

        self.words = words
        return self

    def set_file_name(self, file_name):
        """Read *file_name* and use its contents as the text to classify.

        Raises ValueError when the file cannot be read, or (from set_text)
        when it contains no valid words.
        """
        try:
            # 'with' guarantees the handle is closed, also on read errors.
            with open(file_name, 'r') as f:
                file_contents = f.read()
        except (IOError, OSError, UnicodeError) as e:
            raise ValueError('Unable to read specified file "%s", the error message was: %s' % (file_name, e))

        # Kept outside the try block so that a "no valid words" error is not
        # misreported as a file-reading problem.
        return self.set_text(file_contents)

    def set_doctypes(self, doctype1, doctype2):
        """Select the two doctypes to compare.

        Raises ValueError when both names are equal or when either of them
        is not present in the doctype_count table. Returns self.
        """
        if doctype1 == doctype2:
            raise ValueError('Please enter two different doctypes')

        known = Db().get_doctype_counts()
        if doctype1 not in known:
            raise ValueError('Unknown doctype: ' + doctype1)

        if doctype2 not in known:
            raise ValueError('Unknown doctype: ' + doctype2)

        self.doctype1 = doctype1
        self.doctype2 = doctype2
        return self

    def validate(self, args):
        """Check the command line (program, 'classify', file, doctype, doctype)."""
        if len(args) != 5:
            raise ValueError('Usage: %s classify <file> <doctype> <doctype>' % args[0])

        self.set_file_name(args[2])
        self.set_doctypes(args[3], args[4])

    def p_for_word(self, db, word):
        """Return the probability that *word* indicates doctype1.

        Reads self.doctype1_word_count / self.doctype2_word_count, which
        execute() sets before calling this method.
        """
        word_count_doctype1 = db.get_word_count(self.doctype1, word)
        word_count_doctype2 = db.get_word_count(self.doctype2, word)

        if word_count_doctype1 + word_count_doctype2 < self.MIN_WORD_COUNT:
            return self.RARE_WORD_PROB

        if word_count_doctype1 == 0:
            return 1 - self.EXCLUSIVE_WORD_PROB
        elif word_count_doctype2 == 0:
            return self.EXCLUSIVE_WORD_PROB

        # P(S|W) = P(W|S) / ( P(W|S) + P(W|H) )
        # Both word counts are non-zero here, so both totals are non-zero too.
        p_ws = word_count_doctype1 / self.doctype1_word_count
        p_wh = word_count_doctype2 / self.doctype2_word_count

        return p_ws / (p_ws + p_wh)

    def p_from_list(self, l):
        """Combine per-word probabilities into one overall probability.

        Computes prod(p) / (prod(p) + prod(1 - p)). The computation is done
        on log-odds because the plain products underflow to 0.0 for long
        documents, which made the division fail with 0 / 0.

        Raises ValueError for an empty list and ZeroDivisionError when the
        list contains both a probability of 0 and a probability of 1.
        """
        if not l:
            raise ValueError('Cannot combine an empty list of probabilities')

        has_zero = any(p <= 0 for p in l)
        has_one = any(p >= 1 for p in l)
        if has_zero and has_one:
            # Both products are zero: the quotient is undefined.
            raise ZeroDivisionError('Contradictory probabilities of 0 and 1')
        if has_zero:
            return 0.0
        if has_one:
            return 1.0

        log_odds = sum(math.log(p) - math.log(1.0 - p) for p in l)

        # Evaluate the logistic function on the side that cannot overflow.
        if log_odds >= 0:
            return 1.0 / (1.0 + math.exp(-log_odds))
        odds = math.exp(log_odds)
        return odds / (1.0 + odds)

    def execute(self):
        """Classify the stored words and return the combined probability."""
        db = Db()

        d = db.get_doctype_counts()
        self.doctype1_count = d.get(self.doctype1)
        self.doctype2_count = d.get(self.doctype2)

        self.doctype1_word_count = db.get_words_count(self.doctype1)
        self.doctype2_word_count = db.get_words_count(self.doctype2)

        # A repeated word still contributes once per occurrence, but its
        # probability is looked up in the database only once.
        p_by_word = {}
        pl = []
        for word in self.words:
            if word not in p_by_word:
                p_by_word[word] = self.p_for_word(db, word)
            pl.append(p_by_word[word])

        return self.p_from_list(pl)

    def output(self, result):
        """Print the classification result."""
        print('Probability that document is %s rather than %s is %1.2f' % (self.doctype1, self.doctype2, result))
import sqlite3
from contextlib import closing

# Schema expected in the database file:
#
#   create table word(word, doctype, count);
#   create table doctype_count(doctype, count);
#
#   create index i1 on word(word, doctype);
#
# Statements that clear all learned data (this is what Db.reset runs):
#
#   delete from word;
#   delete from doctype_count;


class Db(object):
    """Access to the SQLite database that stores the learned word counts."""

    def __init__(self, path='./bayes.db'):
        """Open the database at *path* (defaults to ./bayes.db)."""
        self.conn = sqlite3.connect(path)

    def reset(self):
        """Delete every row from the word and doctype_count tables."""
        # The connection context manager commits on success and rolls back
        # if a statement raises.
        with self.conn:
            self.conn.execute('delete from word')
            self.conn.execute('delete from doctype_count')

    def update_word_count(self, c, doctype, word, num_to_add_to_count):
        """Add *num_to_add_to_count* to the stored count of (doctype, word).

        Uses the caller's cursor *c* and does not commit; the row is
        inserted when it does not exist yet.
        """
        c.execute('update word set count = count + ? where doctype=? and word=?', (num_to_add_to_count, doctype, word))
        if c.rowcount == 0:
            # Nothing was updated, so this word is new for this doctype.
            c.execute('insert into word (doctype, word, count) values (?,?,?)', (doctype, word, num_to_add_to_count))

    def update_word_counts(self, d, doctype):
        """Add every word -> count pair of the mapping *d* to *doctype*.

        All pairs are written in one transaction: either all of them are
        committed or, if an error occurs, none.
        """
        with self.conn:
            with closing(self.conn.cursor()) as c:
                for word, count in d.items():
                    self.update_word_count(c, doctype, word, count)

    def get_doctype_counts(self):
        """Return a dict mapping each doctype to its stored count."""
        with closing(self.conn.cursor()) as c:
            c.execute('select doctype, count from doctype_count')
            return dict((row[0], row[1]) for row in c.fetchall())

    def get_word_count(self, doctype, word):
        """Return the stored count of *word* for *doctype*, 0 if absent."""
        with closing(self.conn.cursor()) as c:
            c.execute('select count from word where doctype=? and word=?', (doctype, word))
            row = c.fetchone()
        return row[0] if row else 0

    def get_words_count(self, doctype):
        """Return the sum of all word counts stored for *doctype*.

        Returns 0 when nothing is stored for the doctype.
        """
        with closing(self.conn.cursor()) as c:
            c.execute('select sum(count) from word where doctype=?', (doctype, ))
            row = c.fetchone()
        # sum() over zero rows still produces one row, holding NULL (None).
        total = row[0] if row else None
        return total or 0

    def update_doctype_count(self, num_new_ads, doctype):
        """Add *num_new_ads* to the count of *doctype*, creating its row if needed."""
        counts = self.get_doctype_counts()
        with self.conn:
            with closing(self.conn.cursor()) as c:
                # Test for the presence of the row, not for a non-zero count:
                # a stored count of 0 must be updated, not inserted again.
                if doctype in counts:
                    c.execute('update doctype_count set count=? where doctype=?', (counts[doctype] + num_new_ads, doctype))
                else:
                    c.execute('insert into doctype_count (doctype, count) values (?, ?)', (doctype, num_new_ads))
# --------------------------------------------------------------------------------
# /learn.py
# --------------------------------------------------------------------------------
from db import Db
from mode import Mode
from words import list_to_dict
from words import text_to_list


class Learn(Mode):
    """Mode that adds the words of a training file to the database."""

    def validate(self, args):
        """Check the command line (program, 'learn', doctype, file, count).

        Stores the file contents, the document count and the doctype on the
        instance. Raises ValueError, with the usage text, when the argument
        count is wrong, the file cannot be read, or count is not an integer.
        """
        usage = 'Usage: %s learn <doctype> <file> <count>' % args[0]

        if len(args) != 5:
            raise ValueError(usage)

        doc_type, file_name, count_text = args[2], args[3], args[4]

        try:
            # 'with' guarantees the handle is closed, also on read errors.
            with open(file_name, 'r') as f:
                file_contents = f.read()
        except (IOError, OSError, UnicodeError) as e:
            raise ValueError(usage + '\nUnable to read specified file "%s", the error message was: %s' % (file_name, e))

        try:
            count = int(count_text)
        except ValueError:
            raise ValueError(usage + '\nEnter an integer value for the "count" parameter')

        self.file_contents = file_contents
        self.count = count
        self.doc_type = doc_type

    def execute(self):
        """Store the word frequencies of the file and the document count."""
        db = Db()
        word_counts = list_to_dict(text_to_list(self.file_contents))
        db.update_word_counts(word_counts, self.doc_type)
        db.update_doctype_count(self.count, self.doc_type)
        return self.count

    def output(self, _):
        """Print a confirmation; the argument (execute's result) is unused."""
        print("Processed %s documents of type '%s'" % (self.count, self.doc_type))


# --------------------------------------------------------------------------------
# /mode.py
# --------------------------------------------------------------------------------
class Mode(object):
    """Base class of the command-line modes.

    A mode is driven as validate(args), then output(execute()); the base
    signatures accept those arguments so that an unimplemented method
    raises NotImplementedError rather than a TypeError about arguments.
    """

    def validate(self, args=None):
        """Check the command-line arguments; must be overridden."""
        raise NotImplementedError()

    def execute(self):
        """Perform the mode's work and return its result; must be overridden."""
        raise NotImplementedError()

    def output(self, result=None):
        """Report the value returned by execute(); must be overridden."""
        raise NotImplementedError()


# --------------------------------------------------------------------------------
# /reset.py
# --------------------------------------------------------------------------------
from mode import Mode
from status import Status
from db import Db


class Reset(Mode):
    """Mode that deletes all learned data."""

    def validate(self, args):
        """Accept exactly (program, 'reset'); raise ValueError otherwise."""
        if len(args) != 2:
            raise ValueError('Usage: %s reset' % args[0])

    def execute(self):
        """Clear the database."""
        Db().reset()
        # NOTE(review): the status query result is discarded; kept as in the
        # original in case it is meant as a check that the tables are readable.
        Status().execute()

    def output(self, _):
        """Print a confirmation; the argument (execute's result) is unused."""
        print('Reset Complete')


# --------------------------------------------------------------------------------
# /status.py
# --------------------------------------------------------------------------------
from db import Db
from mode import Mode


class Status(Mode):
    """Mode that lists the document count stored for each doctype."""

    def validate(self, args):
        """Accept exactly (program, 'status'); raise ValueError otherwise."""
        if len(args) != 2:
            raise ValueError('Usage: %s status' % args[0])

    def execute(self):
        """Return a list of (doctype, count) pairs, sorted by doctype."""
        db = Db()
        # A sorted list gives a stable listing and is a real list on both
        # Python 2 and Python 3.
        return sorted(db.get_doctype_counts().items())

    def output(self, results):
        """Print the pairs returned by execute(), or 'No data' if there are none."""
        bar = '=' * 40
        print('%s\nStatus:\n%s\n' % (bar, bar))

        if results:
            for doctype, count in results:
                print('%s: %s' % (doctype, count))
        else:
            print('No data')

        print('\n%s' % bar)
# --------------------------------------------------------------------------------
# /testharness.py
# --------------------------------------------------------------------------------
from __future__ import division
import sys
import os
from classify import Classify
from db import Db

classifier = Classify()

# Set from the command line in the __main__ section below; show_results
# falls back to these when it is called without explicit doctype names.
doctype_expected = None
doctype_other = None


def is_doctype_valid(doctype):
    """Return True when the database holds at least one word for *doctype*."""
    # 'or 0' guards against a None total for a doctype without any words.
    return (Db().get_words_count(doctype) or 0) > 0


def check_file(f):
    """Classify each line of file *f* as a separate document.

    Returns the list of probabilities; lines for which the classifier
    raises ValueError (for example no valid words) are skipped.
    """
    results = []
    # 'with' closes the file; iterating the handle avoids loading all lines.
    with open(f, 'r') as handle:
        for line in handle:
            try:
                classifier.set_text(line)
                results.append(classifier.execute())
            except ValueError:
                # Deliberate best effort: unusable lines are not counted.
                pass

    return results


def check_dir(d, extension=".js"):
    """Run check_file on every file in directory *d* whose name ends with *extension*."""
    results = []
    for f in os.listdir(d):
        if f.endswith(extension):
            results += check_file(os.path.join(d, f))

    return results


def show_results(results, expected=None, other=None):
    """Print the number of documents tested and their mean probability.

    *expected* and *other* name the two doctypes in the legend; when they
    are omitted the module-level doctype_expected / doctype_other are used.
    """
    if expected is None:
        expected = doctype_expected
    if other is None:
        other = doctype_other

    result_count = len(results)
    if result_count:
        print('Tested with %s document%s' % (result_count, '' if result_count == 1 else 's'))
        print('Result was %1.2f (0 = %s, 1 = %s)' % (sum(results) / result_count, other, expected))
    else:
        print('No documents found')


if __name__ == '__main__':
    usage = 'Usage: %s <file|directory> <expected doctype> <other doctype>' % sys.argv[0]

    if len(sys.argv) != 4:
        raise ValueError(usage)

    input_file = sys.argv[1]
    doctype_expected = sys.argv[2]
    doctype_other = sys.argv[3]

    classifier.set_doctypes(doctype_expected, doctype_other)

    results = None
    if os.path.isfile(input_file):
        results = check_file(input_file)
    elif os.path.isdir(input_file):
        results = check_dir(input_file)
    else:
        raise ValueError("Unable to find file/directory '%s'\n%s" % (input_file, usage))

    show_results(results, doctype_expected, doctype_other)
import re
from collections import defaultdict

# Very frequent words that cleanUpWord discards.
commonWords = (
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'it', 'is',
    'im', 'are', 'was', 'for', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
    'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she',
    'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their',
    'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go',
    'me', 'when', 'make', 'can', 'like', 'time', 'just', 'him', 'know',
    'take', 'person', 'into', 'year', 'your', 'some', 'could', 'them', 'see',
    'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over',
    'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'way',
    'even', 'because', 'any', 'these', 'us',
)

# Same words as a set: membership is tested once per token, and a set
# lookup avoids scanning the whole tuple each time.
_COMMON_WORD_SET = frozenset(commonWords)

# Tokens are separated by runs of non-word characters; compiled once.
_WORD_SEPARATOR = re.compile(r'\W+')


def cleanUpWord(word):
    """Return *word* lower-cased, or None if it should be ignored.

    A word is ignored when it is shorter than two characters, consists
    only of digits, or is listed in commonWords.
    """
    word = word.lower()
    if len(word) < 2 or word.isdigit() or word in _COMMON_WORD_SET:
        return None

    return word


def list_to_dict(l):
    """Return a defaultdict(int) mapping each word in *l* to its number of occurrences."""
    d = defaultdict(int)
    add_list_to_dict(l, d)
    return d


def add_list_to_dict(l, d):
    """Increment d[word] once for every word in *l* (modifies *d* in place)."""
    for word in l:
        d[word] += 1


def text_to_list(text):
    """Split *text* into cleaned words, in order, keeping duplicates.

    Always returns a real list (also on Python 3, where map/filter are
    lazy), so callers can take its length and iterate it more than once.
    """
    cleaned_words = (cleanUpWord(token) for token in _WORD_SEPARATOR.split(text.strip()))
    return [word for word in cleaned_words if word]