├── .gitignore ├── README.markdown ├── config └── redis.conf ├── python ├── README.markdown ├── bayes_on_redis │ ├── __init__.py │ └── bayes_on_redis.py ├── datasets │ └── stopwords.txt ├── setup.py └── test │ └── test.py └── ruby ├── README.markdown ├── bayes_on_redis.gemspec ├── datasets └── stopwords.txt ├── lib └── bayes_on_redis.rb └── test └── test.rb /.gitignore: -------------------------------------------------------------------------------- 1 | python/bayes_on_redis/*.pyc 2 | python/build 3 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # What is BayesOnRedis? 2 | 3 | Bayesian classifier on top of Redis 4 | 5 | ## Why on Redis? 6 | 7 | [Redis](http://redis.io/) is a persistent, in-memory, key-value store with support for various data structures such as lists, sets, and ordered sets. 8 | All these data types can be manipulated with atomic operations to push/pop elements, add/remove elements, perform server-side union, intersection, difference between sets, and so forth. 9 | 10 | Because of Redis' properties: 11 | 12 | * It is extremely easy to implement a simple algorithm such as a Bayesian filter. 13 | 14 | * The persistence of Redis means that the Bayesian implementation can be used in a real production environment. 15 | 16 | * Even though I don't particularly care about performance at the moment, Redis benchmarks give me confidence that the implementation can scale to relatively large training data. 17 | 18 | ## How to install? (Ruby version) 19 | 20 | gem install bayes_on_redis 21 | 22 | ## Getting started 23 | 24 | # Require BayesOnRedis and RubyGems 25 | require "rubygems" 26 | require "bayes_on_redis" 27 | 28 | # Create an instance of BayesOnRedis and pass your Redis information. 29 | # Of course, use real sentences for much better accuracy. 30 | # Unless you want to train it on spam-related things.
31 | bor = BayesOnRedis.new(:redis_host => '127.0.0.1', :redis_port => 6379, :redis_db => 0) 32 | 33 | # Teach it 34 | bor.train "good", "sweet awesome kick-ass cool pretty smart" 35 | bor.train "bad", "sucks lame boo death bankrupt loser sad" 36 | 37 | # Then ask it to classify text. 38 | bor.classify("awesome kick-ass ninja can still be lame.") 39 | 40 | ## For Pythonistas 41 | 42 | BayesOnRedis is also available in Python, with the same API. 43 | 44 | easy_install bayes_on_redis 45 | 46 | 47 | 48 | ## Contributing 49 | 50 | [Fork http://github.com/didip/bayes_on_redis](http://github.com/didip/bayes_on_redis) and send pull requests. 51 | -------------------------------------------------------------------------------- /config/redis.conf: -------------------------------------------------------------------------------- 1 | # Redis configuration file example 2 | 3 | # Note on units: when memory size is needed, it is possible to specify 4 | # it in the usual form of 1k 5GB 4M and so forth: 5 | # 6 | # 1k => 1000 bytes 7 | # 1kb => 1024 bytes 8 | # 1m => 1000000 bytes 9 | # 1mb => 1024*1024 bytes 10 | # 1g => 1000000000 bytes 11 | # 1gb => 1024*1024*1024 bytes 12 | # 13 | # units are case insensitive so 1GB 1Gb 1gB are all the same. 14 | 15 | # By default Redis does not run as a daemon. Use 'yes' if you need it. 16 | # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. 17 | daemonize yes 18 | 19 | # When running daemonized, Redis writes a pid file in /var/run/redis.pid by 20 | # default. You can specify a custom pid file location here. 21 | pidfile /var/run/redis.pid 22 | 23 | # Accept connections on the specified port, default is 6379 24 | port 6379 25 | 26 | # If you want you can bind a single interface, if the bind option is not 27 | # specified all the interfaces will listen for incoming connections.
28 | # 29 | # bind 127.0.0.1 30 | 31 | # Close the connection after a client is idle for N seconds (0 to disable) 32 | timeout 300 33 | 34 | # Set server verbosity to 'debug' 35 | # it can be one of: 36 | # debug (a lot of information, useful for development/testing) 37 | # verbose (many rarely useful info, but not a mess like the debug level) 38 | # notice (moderately verbose, what you want in production probably) 39 | # warning (only very important / critical messages are logged) 40 | loglevel warning 41 | 42 | # Specify the log file name. Also 'stdout' can be used to force 43 | # Redis to log on the standard output. Note that if you use standard 44 | # output for logging but daemonize, logs will be sent to /dev/null 45 | logfile stdout 46 | 47 | # Set the number of databases. The default database is DB 0, you can select 48 | # a different one on a per-connection basis using SELECT <dbid> where 49 | # dbid is a number between 0 and 'databases'-1 50 | databases 16 51 | 52 | ################################ SNAPSHOTTING ################################# 53 | # 54 | # Save the DB on disk: 55 | # 56 | # save <seconds> <changes> 57 | # 58 | # Will save the DB if both the given number of seconds and the given 59 | # number of write operations against the DB occurred. 60 | # 61 | # In the example below the behaviour will be to save: 62 | # after 900 sec (15 min) if at least 1 key changed 63 | # after 300 sec (5 min) if at least 10 keys changed 64 | # after 60 sec if at least 10000 keys changed 65 | # 66 | # Note: you can disable saving entirely by commenting out all the "save" lines. 67 | 68 | save 900 1 69 | save 300 10 70 | save 60 10000 71 | 72 | # Compress string objects using LZF when dumping .rdb databases? 73 | # By default that's set to 'yes' as it's almost always a win. 74 | # If you want to save some CPU in the saving child set it to 'no' but 75 | # the dataset will likely be bigger if you have compressible values or keys.
76 | rdbcompression yes 77 | 78 | # The filename where to dump the DB 79 | dbfilename dump.rdb 80 | 81 | # The working directory. 82 | # 83 | # The DB will be written inside this directory, with the filename specified 84 | # above using the 'dbfilename' configuration directive. 85 | # 86 | # Also the Append Only File will be created inside this directory. 87 | # 88 | # Note that you must specify a directory here, not a file name. 89 | dir /tmp/ 90 | 91 | ################################# REPLICATION ################################# 92 | 93 | # Master-Slave replication. Use slaveof to make a Redis instance a copy of 94 | # another Redis server. Note that the configuration is local to the slave 95 | # so for example it is possible to configure the slave to save the DB with a 96 | # different interval, or to listen to another port, and so on. 97 | # 98 | # slaveof <masterip> <masterport> 99 | 100 | # If the master is password protected (using the "requirepass" configuration 101 | # directive below) it is possible to tell the slave to authenticate before 102 | # starting the replication synchronization process, otherwise the master will 103 | # refuse the slave request. 104 | # 105 | # masterauth <master-password> 106 | 107 | ################################## SECURITY ################################### 108 | 109 | # Require clients to issue AUTH before processing any other 110 | # commands. This might be useful in environments in which you do not trust 111 | # others with access to the host running redis-server. 112 | # 113 | # This should stay commented out for backward compatibility and because most 114 | # people do not need auth (e.g. they run their own servers). 115 | # 116 | # Warning: since Redis is pretty fast an outside user can try up to 117 | # 150k passwords per second against a good box. This means that you should 118 | # use a very strong password otherwise it will be very easy to break.
119 | # 120 | # requirepass foobared 121 | 122 | ################################### LIMITS #################################### 123 | 124 | # Set the max number of connected clients at the same time. By default there 125 | # is no limit, and it's up to the number of file descriptors the Redis process 126 | # is able to open. The special value '0' means no limits. 127 | # Once the limit is reached Redis will close all the new connections sending 128 | # an error 'max number of clients reached'. 129 | # 130 | # maxclients 128 131 | 132 | # Don't use more memory than the specified amount of bytes. 133 | # When the memory limit is reached Redis will try to remove keys with an 134 | # EXPIRE set. It will try to start freeing keys that are going to expire 135 | # in little time and preserve keys with a longer time to live. 136 | # Redis will also try to remove objects from free lists if possible. 137 | # 138 | # If all this fails, Redis will start to reply with errors to commands 139 | # that will use more memory, like SET, LPUSH, and so on, and will continue 140 | # to reply to most read-only commands like GET. 141 | # 142 | # WARNING: maxmemory can be a good idea mainly if you want to use Redis as a 143 | # 'state' server or cache, not as a real DB. When Redis is used as a real 144 | # database the memory usage will grow over the weeks, it will be obvious if 145 | # it is going to use too much memory in the long run, and you'll have the time 146 | # to upgrade. With maxmemory after the limit is reached you'll start to get 147 | # errors for write operations, and this may even lead to DB inconsistency. 148 | # 149 | # maxmemory <bytes> 150 | 151 | ############################## APPEND ONLY MODE ############################### 152 | 153 | # By default Redis asynchronously dumps the dataset on disk. If you can live 154 | # with the idea that the latest records will be lost if something like a crash 155 | # happens this is the preferred way to run Redis.
If instead you care a lot 156 | # about your data and don't want a single record to get lost you should 157 | # enable the append only mode: when this mode is enabled Redis will append 158 | # every write operation received in the file appendonly.aof. This file will 159 | # be read on startup in order to rebuild the full dataset in memory. 160 | # 161 | # Note that you can have both the async dumps and the append only file if you 162 | # like (you have to comment the "save" statements above to disable the dumps). 163 | # Still if append only mode is enabled Redis will load the data from the 164 | # log file at startup ignoring the dump.rdb file. 165 | # 166 | # IMPORTANT: Check the BGREWRITEAOF command to see how to rewrite the append 167 | # log file in background when it gets too big. 168 | 169 | appendonly no 170 | 171 | # The name of the append only file (default: "appendonly.aof") 172 | # appendfilename appendonly.aof 173 | 174 | # The fsync() call tells the Operating System to actually write data on disk 175 | # instead of waiting for more data in the output buffer. Some OS will really flush 176 | # data on disk, some other OS will just try to do it ASAP. 177 | # 178 | # Redis supports three different modes: 179 | # 180 | # no: don't fsync, just let the OS flush the data when it wants. Faster. 181 | # always: fsync after every write to the append only log. Slow, Safest. 182 | # everysec: fsync only if one second passed since the last fsync. Compromise. 183 | # 184 | # The default is "everysec" that's usually the right compromise between 185 | # speed and data safety. It's up to you to understand if you can relax this to 186 | # "no" that will let the operating system flush the output buffer when 187 | # it wants, for better performances (but if you can live with the idea of 188 | # some data loss consider the default persistence mode that's snapshotting), 189 | # or on the contrary, use "always" that's very slow but a bit safer than 190 | # everysec.
191 | # 192 | # If unsure, use "everysec". 193 | 194 | # appendfsync always 195 | appendfsync everysec 196 | # appendfsync no 197 | 198 | ################################ VIRTUAL MEMORY ############################### 199 | 200 | # Virtual Memory allows Redis to work with datasets bigger than the actual 201 | # amount of RAM needed to hold the whole dataset in memory. 202 | # In order to do so frequently used keys are kept in memory while the other keys 203 | # are swapped into a swap file, similarly to what operating systems do 204 | # with memory pages. 205 | # 206 | # To enable VM just set 'vm-enabled' to yes, and set the following three 207 | # VM parameters according to your needs. 208 | 209 | vm-enabled yes 210 | # vm-enabled yes 211 | 212 | # This is the path of the Redis swap file. As you can guess, swap files 213 | # can't be shared by different Redis instances, so make sure to use a swap 214 | # file for every redis process you are running. Redis will complain if the 215 | # swap file is already in use. 216 | # 217 | # The best kind of storage for the Redis swap file (that's accessed at random) 218 | # is a Solid State Disk (SSD). 219 | # 220 | # *** WARNING *** if you are using a shared hosting the default of putting 221 | # the swap file under /tmp is not secure. Create a dir with access granted 222 | # only to Redis user and configure Redis to create the swap file there. 223 | vm-swap-file /tmp/redis.swap 224 | 225 | # vm-max-memory configures the VM to use at max the specified amount of 226 | # RAM. Everything that does not fit will be swapped on disk *if* possible, that 227 | # is, if there is still enough contiguous space in the swap file. 228 | # 229 | # With vm-max-memory 0 the system will swap everything it can. Not a good 230 | # default, just specify the max amount of RAM you can in bytes, but it's 231 | # better to leave some margin. For instance specify an amount of RAM 232 | # that's more or less between 60 and 80% of your free RAM.
233 | vm-max-memory 0 234 | 235 | # The Redis swap file is split into pages. An object can be saved using multiple 236 | # contiguous pages, but pages can't be shared between different objects. 237 | # So if your page is too big, small objects swapped out on disk will waste 238 | # a lot of space. If your page is too small, there is less space in the swap 239 | # file (assuming you configured the same number of total swap file pages). 240 | # 241 | # If you use a lot of small objects, use a page size of 64 or 32 bytes. 242 | # If you use a lot of big objects, use a bigger page size. 243 | # If unsure, use the default :) 244 | vm-page-size 32 245 | 246 | # Number of total memory pages in the swap file. 247 | # Given that the page table (a bitmap of free/used pages) is taken in memory, 248 | # every 8 pages on disk will consume 1 byte of RAM. 249 | # 250 | # The total swap size is vm-page-size * vm-pages 251 | # 252 | # With the default of 32-bytes memory pages and 134217728 pages Redis will 253 | # use a 4 GB swap file, that will use 16 MB of RAM for the page table. 254 | # 255 | # It's better to use the smallest acceptable value for your application, 256 | # but the default is large in order to work in most conditions. 257 | vm-pages 134217728 258 | 259 | # Max number of VM I/O threads running at the same time. 260 | # These threads are used to read/write data from/to swap file, since they 261 | # also encode and decode objects from disk to memory or the reverse, a bigger 262 | # number of threads can help with big objects even if they can't help with 263 | # I/O itself as the physical device may not be able to cope with many 264 | # reads/writes operations at the same time. 265 | # 266 | # The special value of 0 turns off threaded I/O and enables the blocking 267 | # Virtual Memory implementation.
268 | vm-max-threads 4 269 | 270 | ############################### ADVANCED CONFIG ############################### 271 | 272 | # Glue small output buffers together in order to send small replies in a 273 | # single TCP packet. Uses a bit more CPU but most of the times it is a win 274 | # in terms of number of queries per second. Use 'yes' if unsure. 275 | glueoutputbuf yes 276 | 277 | # Hashes are encoded in a special way (much more memory efficient) when they 278 | # have at max a given number of elements, and the biggest element does not 279 | # exceed a given threshold. You can configure these limits with the following 280 | # configuration directives. 281 | hash-max-zipmap-entries 64 282 | hash-max-zipmap-value 512 283 | 284 | # Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in 285 | # order to help rehashing the main Redis hash table (the one mapping top-level 286 | # keys to values). The hash table implementation redis uses (see dict.c) 287 | # performs a lazy rehashing: the more operations you run on a hash table 288 | # that is rehashing, the more rehashing "steps" are performed, so if the 289 | # server is idle the rehashing is never complete and some more memory is used 290 | # by the hash table. 291 | # 292 | # The default is to use this millisecond 10 times every second in order to 293 | # actively rehash the main dictionaries, freeing memory when possible. 294 | # 295 | # If unsure: 296 | # use "activerehashing no" if you have hard latency requirements and it is 297 | # not a good thing in your environment that Redis can reply from time to time 298 | # to queries with 2 milliseconds delay. 299 | # 300 | # use "activerehashing yes" if you don't have such hard requirements but 301 | # want to free memory asap when possible. 302 | activerehashing yes 303 | 304 | ################################## INCLUDES ################################### 305 | 306 | # Include one or more other config files here.
This is useful if you 307 | # have a standard template that goes to all redis server but also need 308 | # to customize a few per-server settings. Include files can include 309 | # other files, so use this wisely. 310 | # 311 | # include /path/to/local.conf 312 | # include /path/to/other.conf 313 | -------------------------------------------------------------------------------- /python/README.markdown: -------------------------------------------------------------------------------- 1 | ../README.markdown -------------------------------------------------------------------------------- /python/bayes_on_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # this is obviously directory for python project -------------------------------------------------------------------------------- /python/bayes_on_redis/bayes_on_redis.py: -------------------------------------------------------------------------------- 1 | import operator, math, os.path, re 2 | from redis import Redis 3 | 4 | class BayesOnRedis: 5 | categories_key = "BayesOnRedis:categories" 6 | one_or_two_words_re = re.compile(r"\b[^\s]{1,2}\b", re.IGNORECASE) 7 | non_alphanumeric_and_non_dot_re = re.compile(r"[^\w\.]", re.IGNORECASE) 8 | 9 | def __init__(self, **kwargs): 10 | self.redis = Redis(host=kwargs['redis_host'], port=int(kwargs['redis_port']), db=int(kwargs['redis_db'])) 11 | self.stopwords = Stopword() 12 | 13 | def flushdb(self): 14 | self.redis.flushdb() 15 | 16 | 17 | def train(self, category, text): 18 | category = category.lower() 19 | self.redis.sadd(self.__class__.categories_key, category) 20 | 21 | for word, count in self.count_occurance(text).iteritems(): 22 | self.redis.hincrby(self.redis_category_key(category), word, count) 23 | 24 | def learn(self, category, text): 25 | self.train(category, text) 26 | 27 | 28 | def untrain(self, category, text): 29 | category = category.lower() 30 | 31 | for word, count in 
self.count_occurance(text).iteritems(): 32 | word_count_atm = self.redis.hget(self.redis_category_key(category), word) 33 | new_count = (word_count_atm - count) if (word_count_atm >= count) else 0 34 | 35 | self.redis.hset(self.redis_category_key(category), word, new_count) 36 | 37 | 38 | def unlearn(self, category, text): 39 | self.untrain(category, text) 40 | 41 | 42 | def score(self, text): 43 | scores = {} 44 | 45 | for category in self.redis.smembers(self.__class__.categories_key): 46 | words_count_per_category = reduce(lambda x, y: x + y, map(float, self.redis.hvals(self.redis_category_key(category)))) 47 | 48 | if words_count_per_category <= 0: 49 | self.redis.srem(self.__class__.categories_key, category) 50 | 51 | scores[category] = 0 52 | 53 | for word, count in self.count_occurance(text).iteritems(): 54 | tmp_score = self.redis.hget(self.redis_category_key(category), word) 55 | if tmp_score and float(tmp_score) > 0.0: 56 | tmp_score = float(tmp_score) 57 | else: 58 | tmp_score = 0.1 59 | 60 | scores[category] += math.log(tmp_score / words_count_per_category) 61 | 62 | return scores 63 | 64 | 65 | def classify(self, text): 66 | return sorted(self.score(text).iteritems(), key=operator.itemgetter(1))[-1][0] 67 | 68 | 69 | def redis_category_key(self, category): 70 | return "BayesOnRedis:cat:%s" % category 71 | 72 | 73 | # Incoming text is always downcased 74 | def count_occurance(self, text=''): 75 | if not isinstance(text, basestring): 76 | raise Exception("input must be instance of String") 77 | 78 | separated_by_non_alphanumerics = self.__class__.non_alphanumeric_and_non_dot_re.sub(' ', text.lower()) 79 | without_one_or_two_words = self.__class__.one_or_two_words_re.sub('', separated_by_non_alphanumerics) 80 | without_dots = without_one_or_two_words.replace(".", "") 81 | text_chunks = self.stopwords.to_re().sub('', without_dots).split() 82 | 83 | frequencies = {} 84 | for word in text_chunks: 85 | frequencies[word] = (frequencies[word] if 
frequencies.has_key(word) else 0) + 1 86 | 87 | return frequencies 88 | 89 | 90 | class Stopword: 91 | def __init__(self): 92 | self.stopwords = open(os.path.abspath(os.path.join(__file__, "..", "..", "datasets", "stopwords.txt")), 'r').read() 93 | self.stopwords_re = None 94 | 95 | def to_list(self): 96 | return self.stopwords.split() 97 | 98 | def to_re(self): 99 | if not self.stopwords_re: 100 | self.stopwords_re = re.compile(r"\b(%s)\b" % '|'.join(self.to_list()), re.IGNORECASE) 101 | return self.stopwords_re 102 | 103 | -------------------------------------------------------------------------------- /python/datasets/stopwords.txt: -------------------------------------------------------------------------------- 1 | a able about above abroad according accordingly across actually adj after afterwards again against ago ahead 2 | ain't all allow allows almost alone along alongside already also although always am amid amidst among amongst 3 | an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate 4 | are aren't around as a's aside ask asking associated at available away awfully b back backward backwards be 5 | became because become becomes becoming been before beforehand begin behind being believe below beside besides 6 | best better between beyond both brief but by c came can cannot cant can't caption cause causes certain certainly 7 | changes clearly c'mon co co. 
com come comes concerning consequently consider considering contain containing 8 | contains corresponding could couldn't course c's currently d dare daren't definitely described despite did didn't 9 | different directly do does doesn't doing done don't down downwards during e each edu eg eight eighty either else 10 | elsewhere end ending enough entirely especially et etc even ever evermore every everybody everyone everything 11 | everywhere ex exactly example except f fairly far farther few fewer fifth first five followed following follows 12 | for forever former formerly forth forward found four from further furthermore g get gets getting given gives go 13 | goes going gone got gotten greetings h had hadn't half happens hardly has hasn't have haven't having he he'd he'll 14 | hello help hence her here hereafter hereby herein here's hereupon hers herself he's hi him himself his hither 15 | hopefully how howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc inc. indeed indicate 16 | indicated indicates inner inside insofar instead into inward is isn't it it'd it'll its it's itself i've j just k 17 | keep keeps kept know known knows l last lately later latter latterly least less lest let let's like liked likely 18 | likewise little look looking looks low lower ltd m made mainly make makes many may maybe mayn't me mean meantime 19 | meanwhile merely might mightn't mine minus miss more moreover most mostly mr mrs much must mustn't my myself n name 20 | namely nd near nearly necessary need needn't needs neither never neverf neverless nevertheless new next nine ninety 21 | no nobody non none nonetheless noone no-one nor normally not nothing notwithstanding novel now nowhere o obviously 22 | of off often oh ok okay old on once one ones one's only onto opposite or other others otherwise ought oughtn't our 23 | ours ourselves out outside over overall own p particular particularly past per perhaps placed please plus possible 24 | presumably probably 
provided provides q que quite qv r rather rd re really reasonably recent recently regarding 25 | regardless regards relatively respectively right round s said same saw say saying says second secondly see seeing 26 | seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall shan't she she'd 27 | she'll she's should shouldn't since six so some somebody someday somehow someone something sometime sometimes 28 | somewhat somewhere soon sorry specified specify specifying still sub such sup sure t take taken taking tell tends 29 | th than thank thanks thanx that that'll thats that's that've the their theirs them themselves then thence there 30 | thereafter thereby there'd therefore therein there'll there're theres there's thereupon there've these they they'd 31 | they'll they're they've thing things think third thirty this thorough thoroughly those though three through 32 | throughout thru thus till to together too took toward towards tried tries truly try trying t's twice two u un under 33 | underneath undoing unfortunately unless unlike unlikely until unto up upon upwards us use used useful uses using 34 | usually v value various versus very via viz vs w want wants was wasn't way we we'd welcome well we'll went were we're 35 | weren't we've what whatever what'll what's what've when whence whenever where whereafter whereas whereby wherein 36 | where's whereupon wherever whether which whichever while whilst whither who who'd whoever whole who'll whom whomever 37 | who's whose why will willing wish with within without wonder won't would wouldn't x y yes yet you you'd you'll your 38 | you're yours yourself yourselves you've z zero successful greatest began including being all for close but -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | # Utility 
def read(fname):
    """Return the text of ``fname``, resolved relative to this file's directory.

    Used to feed the README into ``long_description``. The file is opened in
    a ``with`` block so the handle is closed as soon as the content has been
    read, instead of being left for the garbage collector.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path) as handle:
        return handle.read()
17 | expected = 'good' 18 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text)) 19 | 20 | text = "super lame pirate" 21 | expected = 'bad' 22 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text)) 23 | 24 | # ----------------------- 25 | 26 | bor.train("programming", "opera awesome web browser javascript lua c++ python www internet firefox") 27 | text = "Opera (the web browser) 11 beta, featuring extensions and tab stacking - now available for download." 28 | expected = 'programming' 29 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text)) 30 | 31 | # ----------------------- 32 | 33 | bor.train("programming", "ruby git programming language") 34 | text = "Erik Andrejko shows us some of the common workflows and best features of git, making Ruby and git a powerful combination." 35 | expected = 'programming' 36 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text)) 37 | 38 | # ----------------------- 39 | 40 | bor.train("programming", "python is the best programming language") 41 | text = "Always having fun with ruby and python" 42 | expected = 'programming' 43 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text)) 44 | 45 | # ----------------------- 46 | # Stopwords tests 47 | print "Expected: Stopwords length should be > 0 --- Result: %s" % len(bor.stopwords.to_list()) 48 | 49 | # ----------------------- 50 | # occurance tests 51 | print bor.count_occurance("one or two cows did not scare me. 
# Gem specification for bayes_on_redis.
Gem::Specification.new do |gem|
  gem.name    = 'bayes_on_redis'
  gem.version = "0.2.2"
  # Time is part of core Ruby, so this works without `require "date"`;
  # the original Date.today raised NameError when Date was not loaded.
  gem.date    = Time.now.strftime("%Y-%m-%d")

  gem.summary = "Bayesian filter on top of Redis"
  gem.description = "bayes_on_redis library provides bayesian classification on a given text similar to many SPAM/HAM filtering technique."

  gem.authors  = ['Didip Kerabat']
  gem.email    = 'didipk@gmail.com'
  gem.homepage = 'https://github.com/didip/bayes_on_redis'

  # NOTE(review): the deprecated, ignored has_rdoc= / rubyforge_project=
  # setters were dropped here; confirm no supported RubyGems still needs them.

  gem.files = [
    "README.markdown",
    File.join("lib", "bayes_on_redis.rb"),
    File.join("datasets", "stopwords.txt")
  ]
end
changes clearly c'mon co co. com come comes concerning consequently consider considering contain containing 8 | contains corresponding could couldn't course c's currently d dare daren't definitely described despite did didn't 9 | different directly do does doesn't doing done don't down downwards during e each edu eg eight eighty either else 10 | elsewhere end ending enough entirely especially et etc even ever evermore every everybody everyone everything 11 | everywhere ex exactly example except f fairly far farther few fewer fifth first five followed following follows 12 | for forever former formerly forth forward found four from further furthermore g get gets getting given gives go 13 | goes going gone got gotten greetings h had hadn't half happens hardly has hasn't have haven't having he he'd he'll 14 | hello help hence her here hereafter hereby herein here's hereupon hers herself he's hi him himself his hither 15 | hopefully how howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc inc. 
indeed indicate 16 | indicated indicates inner inside insofar instead into inward is isn't it it'd it'll its it's itself i've j just k 17 | keep keeps kept know known knows l last lately later latter latterly least less lest let let's like liked likely 18 | likewise little look looking looks low lower ltd m made mainly make makes many may maybe mayn't me mean meantime 19 | meanwhile merely might mightn't mine minus miss more moreover most mostly mr mrs much must mustn't my myself n name 20 | namely nd near nearly necessary need needn't needs neither never neverf neverless nevertheless new next nine ninety 21 | no nobody non none nonetheless noone no-one nor normally not nothing notwithstanding novel now nowhere o obviously 22 | of off often oh ok okay old on once one ones one's only onto opposite or other others otherwise ought oughtn't our 23 | ours ourselves out outside over overall own p particular particularly past per perhaps placed please plus possible 24 | presumably probably provided provides q que quite qv r rather rd re really reasonably recent recently regarding 25 | regardless regards relatively respectively right round s said same saw say saying says second secondly see seeing 26 | seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall shan't she she'd 27 | she'll she's should shouldn't since six so some somebody someday somehow someone something sometime sometimes 28 | somewhat somewhere soon sorry specified specify specifying still sub such sup sure t take taken taking tell tends 29 | th than thank thanks thanx that that'll thats that's that've the their theirs them themselves then thence there 30 | thereafter thereby there'd therefore therein there'll there're theres there's thereupon there've these they they'd 31 | they'll they're they've thing things think third thirty this thorough thoroughly those though three through 32 | throughout thru thus till to together too took toward towards tried tries truly try 
# Naive Bayes text classifier whose training data lives entirely in Redis.
#
# Storage layout:
#   * CATEGORIES_KEY            - Redis set holding every trained category.
#   * "BayesOnRedis:cat:<name>" - Redis hash mapping word => occurrence count.
class BayesOnRedis
  CATEGORIES_KEY = "BayesOnRedis:categories"
  ONE_OR_TWO_WORDS_RE = /\b\w{1,2}\b/mi
  NON_ALPHANUMERIC_AND_NON_DOT_RE = /[^\w\.]/mi

  attr_reader :redis, :stopwords

  # options - Hash with the :redis_host, :redis_port and :redis_db to use.
  def initialize(options)
    @redis = Redis.new(:host => options[:redis_host], :port => options[:redis_port], :db => options[:redis_db])
    @stopwords = Stopword.new
  end

  # Wipes the currently selected Redis database.
  def flushdb
    @redis.flushdb
  end

  # Adds the words of +text+ to the counts of +category+ (case-insensitive).
  def train(category, text)
    category = category.downcase
    @redis.sadd(CATEGORIES_KEY, category)

    key = redis_category_key(category)
    count_occurance(text).each do |word, count|
      @redis.hincrby(key, word, count)
    end
  end
  alias_method :learn, :train

  # Subtracts the words of +text+ from the counts of +category+.
  # Counts are clamped at zero and never go negative.
  def untrain(category, text)
    key = redis_category_key(category.downcase)

    count_occurance(text).each do |word, count|
      # hget yields a String, or nil for a field that was never trained;
      # to_i normalises both so the arithmetic below cannot raise.
      current = @redis.hget(key, word).to_i
      @redis.hset(key, word, [current - count, 0].max)
    end
  end
  alias_method :unlearn, :untrain

  # Returns a Hash of category => log-likelihood score for +text+.
  #
  # Each distinct word of +text+ contributes once per category. Categories
  # whose total word count has dropped to zero are removed from the category
  # set and left out of the result: dividing by a zero total would otherwise
  # give them an infinite score.
  def score(text)
    occurrences = count_occurance(text) # identical for every category
    scores = {}

    @redis.smembers(CATEGORIES_KEY).each do |category|
      key = redis_category_key(category)
      total = @redis.hvals(key).inject(0) { |sum, value| sum + value.to_i }

      if total <= 0
        @redis.srem(CATEGORIES_KEY, category)
        next
      end

      scores[category] = occurrences.keys.inject(0) do |sum, word|
        hits = @redis.hget(key, word).to_i
        hits = 0.1 if hits <= 0 # small pseudo-count for unseen words
        sum + Math.log(hits / total.to_f)
      end
    end

    scores
  end

  # Returns the best-scoring category for +text+, or nil when nothing has
  # been trained yet.
  def classify(text)
    best = score(text).max_by { |_category, value| value }
    best && best.first
  end

  private

  def redis_category_key(category)
    "BayesOnRedis:cat:#{category}"
  end

  # Tokenises +text+ and returns a Hash of word => number of occurrences.
  # The text is downcased, one/two letter words and stopwords are removed,
  # and punctuation is stripped.
  #
  # Raises RuntimeError when +text+ is not a String.
  def count_occurance(text='')
    raise "input must be instance of String" unless text.is_a?(String)

    words = text.downcase.
      gsub(ONE_OR_TWO_WORDS_RE, '').
      gsub(NON_ALPHANUMERIC_AND_NON_DOT_RE, ' ').
      gsub(@stopwords.to_re, '').
      gsub(/\./, '').
      split

    words.inject(Hash.new(0)) do |counts, word|
      counts[word] += 1
      counts
    end
  end

  # Deletes every stopword from every category hash.
  def remove_stopwords
    @redis.smembers(CATEGORIES_KEY).each do |category|
      key = redis_category_key(category)
      @stopwords.to_a.each { |stopword| @redis.hdel(key, stopword) }
    end
  end
end


# Whitespace-separated stopword list loaded from a text file.
class Stopword
  DEFAULT_PATH = File.expand_path(File.join(__FILE__, "..", "..", "datasets", "stopwords.txt"))

  # path - file to load; defaults to the bundled datasets/stopwords.txt.
  def initialize(path = DEFAULT_PATH)
    @stopwords = File.read(path).split
  end

  # Returns the stopwords as an Array of Strings.
  def to_a
    @stopwords
  end

  # Returns a memoized, case-insensitive Regexp matching any whole stopword.
  # Every word is escaped so entries such as "co." or "inc." match literally;
  # unescaped, the dot matched any character and swallowed "cow" or "inch".
  def to_re
    @to_re ||= /\b(#{@stopwords.map { |word| Regexp.escape(word) }.join('|')})\b/mi
  end
end
#!/usr/bin/ruby
# Smoke test for BayesOnRedis.
#
# Needs a Redis server on 127.0.0.1:6379. WARNING: flushes Redis database 5.
#
# The library is located relative to this file, so the script can be started
# from any working directory.
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'bayes_on_redis'))

bor = BayesOnRedis.new(:redis_host => '127.0.0.1', :redis_port => 6379, :redis_db => 5)
bor.flushdb

# Prints the expected category next to what the classifier returned.
report = lambda do |expected, text|
  puts "Expected: #{expected} --- Result: #{bor.classify(text)}"
end


# Classification tests

bor.learn "good", "sweet awesome kick-ass cool pretty smart"
bor.learn "bad", "sucks lame boo death bankrupt loser sad"

report.call 'good', "even though you are sweet and awesome ninja, you still sucks."
report.call 'bad', "super lame pirate"

# -----------------------

bor.train "programming", "opera awesome web browser javascript lua c++ python www internet firefox"
report.call 'programming', "Opera (the web browser) 11 beta, featuring extensions and tab stacking - now available for download."

# -----------------------

bor.train "programming", "ruby git programming language"
report.call 'programming', "Erik Andrejko shows us some of the common workflows and best features of git, making Ruby and git a powerful combination."

# -----------------------

bor.train "programming", "python is the best programming language"
report.call 'programming', "Always having fun with ruby and python"

# -----------------------
# Stopwords tests
puts "Expected: Stopwords length should be > 0 --- Result: #{bor.stopwords.to_a.size}"


# -----------------------
# occurance tests (count_occurance is private, hence the send)
puts bor.send(:count_occurance, "one or two cows did not scare me. It is chicken that does.").inspect