├── .gitignore
├── README.markdown
├── config
    └── redis.conf
├── python
    ├── README.markdown
    ├── bayes_on_redis
    │   ├── __init__.py
    │   └── bayes_on_redis.py
    ├── datasets
    │   └── stopwords.txt
    ├── setup.py
    └── test
    │   └── test.py
└── ruby
    ├── README.markdown
    ├── bayes_on_redis.gemspec
    ├── datasets
        └── stopwords.txt
    ├── lib
        └── bayes_on_redis.rb
    └── test
        └── test.rb


/.gitignore:
--------------------------------------------------------------------------------
1 | python/bayes_on_redis/*.pyc
2 | python/build
3 | 


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
 1 | # What is BayesOnRedis?
 2 | 
 3 | Bayesian classifier on top of Redis
 4 | 
 5 | ## Why on Redis?
 6 | 
 7 | [Redis](http://redis.io/) is a persistent, in-memory, key-value store with support for various data structures such as lists, sets, and ordered sets.
 8 | All these data types can be manipulated with atomic operations to push/pop elements, add/remove elements, perform server-side union, intersection, difference between sets, and so forth.
 9 | 
10 | Because of Redis' properties:
11 | 
12 |  * It is extremely easy to implement simple algorithms such as a Bayesian filter.
13 | 
14 |  * The persistence of Redis means that the Bayesian implementation can be used in a real production environment.
15 | 
16 |  * Even though I don't particularly care about performance at the moment, Redis benchmarks give me confidence that the implementation can scale to relatively large training data.
17 | 
18 | ## How to install? (Ruby version)
19 | 
20 |     gem install bayes_on_redis
21 | 
22 | ## Getting started
23 | 
24 |     # Require BayesOnRedis and RubyGems
25 |     require "rubygems"
26 |     require "bayes_on_redis"
27 |     
28 |     # Create instance of BayesOnRedis and pass your Redis information.
29 |     # Of course, use real sentences for much better accuracy.
30 |     # Unless you want to train spam-related things.
31 |     bor = BayesOnRedis.new(:redis_host => '127.0.0.1', :redis_port => 6379, :redis_db => 0)
32 | 
33 |     # Teach it
34 |     bor.train "good", "sweet awesome kick-ass cool pretty smart"
35 |     bor.train "bad", "sucks lame boo death bankrupt loser sad"
36 | 
37 |     # Then ask it to classify text.
38 |     bor.classify("awesome kick-ass ninja can still be lame.")
39 | 
40 | ## for Pythonistas
41 | 
42 | BayesOnRedis is also available in Python, with the same API.
43 | 
44 |     easy_install bayes_on_redis
45 | 
46 | 
47 | 
48 | ## Contributing
49 | 
50 | [Fork http://github.com/didip/bayes_on_redis](http://github.com/didip/bayes_on_redis) and send pull requests.
51 | 


--------------------------------------------------------------------------------
/config/redis.conf:
--------------------------------------------------------------------------------
  1 | # Redis configuration file example
  2 | 
  3 | # Note on units: when memory size is needed, it is possible to specify
  4 | # it in the usual form of 1k 5GB 4M and so forth:
  5 | #
  6 | # 1k => 1000 bytes
  7 | # 1kb => 1024 bytes
  8 | # 1m => 1000000 bytes
  9 | # 1mb => 1024*1024 bytes
 10 | # 1g => 1000000000 bytes
 11 | # 1gb => 1024*1024*1024 bytes
 12 | #
 13 | # units are case insensitive so 1GB 1Gb 1gB are all the same.
 14 | 
 15 | # By default Redis does not run as a daemon. Use 'yes' if you need it.
 16 | # Note that Redis will write a pid file in /var/run/redis.pid when daemonized.
 17 | daemonize yes
 18 | 
 19 | # When running daemonized, Redis writes a pid file in /var/run/redis.pid by
 20 | # default. You can specify a custom pid file location here.
 21 | pidfile /var/run/redis.pid
 22 | 
 23 | # Accept connections on the specified port, default is 6379
 24 | port 6379
 25 | 
 26 | # If you want you can bind a single interface, if the bind option is not
 27 | # specified all the interfaces will listen for incoming connections.
 28 | #
 29 | # bind 127.0.0.1
 30 | 
 31 | # Close the connection after a client is idle for N seconds (0 to disable)
 32 | timeout 300
 33 | 
 34 | # Set server verbosity to 'debug'
 35 | # it can be one of:
 36 | # debug (a lot of information, useful for development/testing)
 37 | # verbose (many rarely useful info, but not a mess like the debug level)
 38 | # notice (moderately verbose, what you want in production probably)
 39 | # warning (only very important / critical messages are logged)
 40 | loglevel warning
 41 | 
 42 | # Specify the log file name. Also 'stdout' can be used to force
 43 | # Redis to log on the standard output. Note that if you use standard
 44 | # output for logging but daemonize, logs will be sent to /dev/null
 45 | logfile stdout
 46 | 
 47 | # Set the number of databases. The default database is DB 0, you can select
 48 | # a different one on a per-connection basis using SELECT <dbid> where
 49 | # dbid is a number between 0 and 'databases'-1
 50 | databases 16
 51 | 
 52 | ################################ SNAPSHOTTING  #################################
 53 | #
 54 | # Save the DB on disk:
 55 | #
 56 | #   save <seconds> <changes>
 57 | #
 58 | #   Will save the DB if both the given number of seconds and the given
 59 | #   number of write operations against the DB occurred.
 60 | #
 61 | #   In the example below the behaviour will be to save:
 62 | #   after 900 sec (15 min) if at least 1 key changed
 63 | #   after 300 sec (5 min) if at least 10 keys changed
 64 | #   after 60 sec if at least 10000 keys changed
 65 | #
 66 | #   Note: you can disable saving entirely by commenting out all the "save" lines.
 67 | 
 68 | save 900 1
 69 | save 300 10
 70 | save 60 10000
 71 | 
 72 | # Compress string objects using LZF when dump .rdb databases?
 73 | # For default that's set to 'yes' as it's almost always a win.
 74 | # If you want to save some CPU in the saving child set it to 'no' but
 75 | # the dataset will likely be bigger if you have compressible values or keys.
 76 | rdbcompression yes
 77 | 
 78 | # The filename where to dump the DB
 79 | dbfilename dump.rdb
 80 | 
 81 | # The working directory.
 82 | #
 83 | # The DB will be written inside this directory, with the filename specified
 84 | # above using the 'dbfilename' configuration directive.
 85 | # 
 86 | # Also the Append Only File will be created inside this directory.
 87 | # 
 88 | # Note that you must specify a directory here, not a file name.
 89 | dir /tmp/
 90 | 
 91 | ################################# REPLICATION #################################
 92 | 
 93 | # Master-Slave replication. Use slaveof to make a Redis instance a copy of
 94 | # another Redis server. Note that the configuration is local to the slave
 95 | # so for example it is possible to configure the slave to save the DB with a
 96 | # different interval, or to listen to another port, and so on.
 97 | #
 98 | # slaveof <masterip> <masterport>
 99 | 
100 | # If the master is password protected (using the "requirepass" configuration
101 | # directive below) it is possible to tell the slave to authenticate before
102 | # starting the replication synchronization process, otherwise the master will
103 | # refuse the slave request.
104 | #
105 | # masterauth <master-password>
106 | 
107 | ################################## SECURITY ###################################
108 | 
109 | # Require clients to issue AUTH <PASSWORD> before processing any other
110 | # commands.  This might be useful in environments in which you do not trust
111 | # others with access to the host running redis-server.
112 | #
113 | # This should stay commented out for backward compatibility and because most
114 | # people do not need auth (e.g. they run their own servers).
115 | # 
116 | # Warning: since Redis is pretty fast an outside user can try up to
117 | # 150k passwords per second against a good box. This means that you should
118 | # use a very strong password otherwise it will be very easy to break.
119 | #
120 | # requirepass foobared
121 | 
122 | ################################### LIMITS ####################################
123 | 
124 | # Set the max number of connected clients at the same time. By default there
125 | # is no limit, and it's up to the number of file descriptors the Redis process
126 | # is able to open. The special value '0' means no limits.
127 | # Once the limit is reached Redis will close all the new connections sending
128 | # an error 'max number of clients reached'.
129 | #
130 | # maxclients 128
131 | 
132 | # Don't use more memory than the specified amount of bytes.
133 | # When the memory limit is reached Redis will try to remove keys with an
134 | # EXPIRE set. It will try to start freeing keys that are going to expire
135 | # in little time and preserve keys with a longer time to live.
136 | # Redis will also try to remove objects from free lists if possible.
137 | #
138 | # If all this fails, Redis will start to reply with errors to commands
139 | # that will use more memory, like SET, LPUSH, and so on, and will continue
140 | # to reply to most read-only commands like GET.
141 | #
142 | # WARNING: maxmemory can be a good idea mainly if you want to use Redis as a
143 | # 'state' server or cache, not as a real DB. When Redis is used as a real
144 | # database the memory usage will grow over the weeks, it will be obvious if
145 | # it is going to use too much memory in the long run, and you'll have the time
146 | # to upgrade. With maxmemory after the limit is reached you'll start to get
147 | # errors for write operations, and this may even lead to DB inconsistency.
148 | #
149 | # maxmemory <bytes>
150 | 
151 | ############################## APPEND ONLY MODE ###############################
152 | 
153 | # By default Redis asynchronously dumps the dataset on disk. If you can live
154 | # with the idea that the latest records will be lost if something like a crash
155 | # happens this is the preferred way to run Redis. If instead you care a lot
156 | # about your data and don't want a single record to get lost you should
157 | # enable the append only mode: when this mode is enabled Redis will append
158 | # every write operation received in the file appendonly.aof. This file will
159 | # be read on startup in order to rebuild the full dataset in memory.
160 | #
161 | # Note that you can have both the async dumps and the append only file if you
162 | # like (you have to comment the "save" statements above to disable the dumps).
163 | # Still if append only mode is enabled Redis will load the data from the
164 | # log file at startup ignoring the dump.rdb file.
165 | #
166 | # IMPORTANT: Check the BGREWRITEAOF to check how to rewrite the append
167 | # log file in background when it gets too big.
168 | 
169 | appendonly no
170 | 
171 | # The name of the append only file (default: "appendonly.aof")
172 | # appendfilename appendonly.aof
173 | 
174 | # The fsync() call tells the Operating System to actually write data on disk
175 | # instead to wait for more data in the output buffer. Some OS will really flush 
176 | # data on disk, some other OS will just try to do it ASAP.
177 | #
178 | # Redis supports three different modes:
179 | #
180 | # no: don't fsync, just let the OS flush the data when it wants. Faster.
181 | # always: fsync after every write to the append only log . Slow, Safest.
182 | # everysec: fsync only if one second passed since the last fsync. Compromise.
183 | #
184 | # The default is "everysec" that's usually the right compromise between
185 | # speed and data safety. It's up to you to understand if you can relax this to
186 | # "no" that will let the operating system flush the output buffer when
187 | # it wants, for better performances (but if you can live with the idea of
188 | # some data loss consider the default persistence mode that's snapshotting),
189 | # or on the contrary, use "always" that's very slow but a bit safer than
190 | # everysec.
191 | #
192 | # If unsure, use "everysec".
193 | 
194 | # appendfsync always
195 | appendfsync everysec
196 | # appendfsync no
197 | 
198 | ################################ VIRTUAL MEMORY ###############################
199 | 
200 | # Virtual Memory allows Redis to work with datasets bigger than the actual
201 | # amount of RAM needed to hold the whole dataset in memory.
202 | # In order to do so very used keys are taken in memory while the other keys
203 | # are swapped into a swap file, similarly to what operating systems do
204 | # with memory pages.
205 | #
206 | # To enable VM just set 'vm-enabled' to yes, and set the following three
207 | # VM parameters accordingly to your needs.
208 | 
209 | vm-enabled yes
210 | # vm-enabled yes
211 | 
212 | # This is the path of the Redis swap file. As you can guess, swap files
213 | # can't be shared by different Redis instances, so make sure to use a swap
214 | # file for every redis process you are running. Redis will complain if the
215 | # swap file is already in use.
216 | #
217 | # The best kind of storage for the Redis swap file (that's accessed at random) 
218 | # is a Solid State Disk (SSD).
219 | #
220 | # *** WARNING *** if you are using a shared hosting the default of putting
221 | # the swap file under /tmp is not secure. Create a dir with access granted
222 | # only to Redis user and configure Redis to create the swap file there.
223 | vm-swap-file /tmp/redis.swap
224 | 
225 | # vm-max-memory configures the VM to use at max the specified amount of
226 | # RAM. Everything that does not fit will be swapped on disk *if* possible, that
227 | # is, if there is still enough contiguous space in the swap file.
228 | #
229 | # With vm-max-memory 0 the system will swap everything it can. Not a good
230 | # default, just specify the max amount of RAM you can in bytes, but it's
231 | # better to leave some margin. For instance specify an amount of RAM
232 | # that's more or less between 60 and 80% of your free RAM.
233 | vm-max-memory 0
234 | 
235 | # The Redis swap file is split into pages. An object can be saved using multiple
236 | # contiguous pages, but pages can't be shared between different objects.
237 | # So if your page is too big, small objects swapped out on disk will waste
238 | # a lot of space. If your page is too small, there is less space in the swap
239 | # file (assuming you configured the same number of total swap file pages).
240 | #
241 | # If you use a lot of small objects, use a page size of 64 or 32 bytes.
242 | # If you use a lot of big objects, use a bigger page size.
243 | # If unsure, use the default :)
244 | vm-page-size 32
245 | 
246 | # Number of total memory pages in the swap file.
247 | # Given that the page table (a bitmap of free/used pages) is taken in memory,
248 | # every 8 pages on disk will consume 1 byte of RAM.
249 | #
250 | # The total swap size is vm-page-size * vm-pages
251 | #
252 | # With the default of 32-bytes memory pages and 134217728 pages Redis will
253 | # use a 4 GB swap file, that will use 16 MB of RAM for the page table.
254 | #
255 | # It's better to use the smallest acceptable value for your application,
256 | # but the default is large in order to work in most conditions.
257 | vm-pages 134217728
258 | 
259 | # Max number of VM I/O threads running at the same time.
260 | # These threads are used to read/write data from/to swap file, since they
261 | # also encode and decode objects from disk to memory or the reverse, a bigger
262 | # number of threads can help with big objects even if they can't help with
263 | # I/O itself as the physical device may not be able to cope with many
264 | # reads/writes operations at the same time.
265 | #
266 | # The special value of 0 turns off threaded I/O and enables the blocking
267 | # Virtual Memory implementation.
268 | vm-max-threads 4
269 | 
270 | ############################### ADVANCED CONFIG ###############################
271 | 
272 | # Glue small output buffers together in order to send small replies in a
273 | # single TCP packet. Uses a bit more CPU but most of the times it is a win
274 | # in terms of number of queries per second. Use 'yes' if unsure.
275 | glueoutputbuf yes
276 | 
277 | # Hashes are encoded in a special way (much more memory efficient) when they
278 | # have at max a given number of elements, and the biggest element does not
279 | # exceed a given threshold. You can configure these limits with the following
280 | # configuration directives.
281 | hash-max-zipmap-entries 64
282 | hash-max-zipmap-value 512
283 | 
284 | # Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in
285 | # order to help rehashing the main Redis hash table (the one mapping top-level
286 | # keys to values). The hash table implementation redis uses (see dict.c)
287 | # performs a lazy rehashing: the more operation you run into an hash table
288 | # that is rehashing, the more rehashing "steps" are performed, so if the
289 | # server is idle the rehashing is never complete and some more memory is used
290 | # by the hash table.
291 | # 
292 | # The default is to use this millisecond 10 times every second in order to
293 | # active rehashing the main dictionaries, freeing memory when possible.
294 | #
295 | # If unsure:
296 | # use "activerehashing no" if you have hard latency requirements and it is
297 | # not a good thing in your environment that Redis can reply from time to time
298 | # to queries with 2 milliseconds delay.
299 | #
300 | # use "activerehashing yes" if you don't have such hard requirements but
301 | # want to free memory asap when possible.
302 | activerehashing yes
303 | 
304 | ################################## INCLUDES ###################################
305 | 
306 | # Include one or more other config files here.  This is useful if you
307 | # have a standard template that goes to all redis server but also need
308 | # to customize a few per-server settings.  Include files can include
309 | # other files, so use this wisely.
310 | #
311 | # include /path/to/local.conf
312 | # include /path/to/other.conf
313 | 


--------------------------------------------------------------------------------
/python/README.markdown:
--------------------------------------------------------------------------------
1 | ../README.markdown


--------------------------------------------------------------------------------
/python/bayes_on_redis/__init__.py:
--------------------------------------------------------------------------------
1 | # this is obviously directory for python project


--------------------------------------------------------------------------------
/python/bayes_on_redis/bayes_on_redis.py:
--------------------------------------------------------------------------------
  1 | import operator, math, os.path, re
  2 | from redis import Redis
  3 | 
  4 | class BayesOnRedis:
  5 |     categories_key = "BayesOnRedis:categories"
  6 |     one_or_two_words_re = re.compile(r"\b[^\s]{1,2}\b", re.IGNORECASE)
  7 |     non_alphanumeric_and_non_dot_re = re.compile(r"[^\w\.]", re.IGNORECASE)
  8 | 
  9 |     def __init__(self, **kwargs):
 10 |         self.redis = Redis(host=kwargs['redis_host'], port=int(kwargs['redis_port']), db=int(kwargs['redis_db']))
 11 |         self.stopwords = Stopword()
 12 | 
 13 |     def flushdb(self):
 14 |         self.redis.flushdb()
 15 | 
 16 | 
 17 |     def train(self, category, text):
 18 |         category = category.lower()
 19 |         self.redis.sadd(self.__class__.categories_key, category)
 20 | 
 21 |         for word, count in self.count_occurance(text).iteritems():
 22 |             self.redis.hincrby(self.redis_category_key(category), word, count)
 23 | 
 24 |     def learn(self, category, text):
 25 |         self.train(category, text)
 26 | 
 27 | 
 28 |     def untrain(self, category, text):
 29 |         category = category.lower()
 30 | 
 31 |         for word, count in self.count_occurance(text).iteritems():
 32 |             word_count_atm = self.redis.hget(self.redis_category_key(category), word)
 33 |             new_count = (word_count_atm - count) if (word_count_atm >= count) else 0
 34 | 
 35 |             self.redis.hset(self.redis_category_key(category), word, new_count)
 36 | 
 37 | 
 38 |     def unlearn(self, category, text):
 39 |         self.untrain(category, text)
 40 | 
 41 | 
 42 |     def score(self, text):
 43 |         scores = {}
 44 | 
 45 |         for category in self.redis.smembers(self.__class__.categories_key):
 46 |             words_count_per_category = reduce(lambda x, y: x + y, map(float, self.redis.hvals(self.redis_category_key(category))))
 47 | 
 48 |             if words_count_per_category <= 0:
 49 |                 self.redis.srem(self.__class__.categories_key, category)
 50 | 
 51 |             scores[category] = 0
 52 | 
 53 |             for word, count in self.count_occurance(text).iteritems():
 54 |                 tmp_score = self.redis.hget(self.redis_category_key(category), word)
 55 |                 if tmp_score and float(tmp_score) > 0.0:
 56 |                     tmp_score = float(tmp_score)
 57 |                 else:
 58 |                     tmp_score = 0.1
 59 | 
 60 |                 scores[category] += math.log(tmp_score / words_count_per_category)
 61 | 
 62 |         return scores
 63 | 
 64 | 
 65 |     def classify(self, text):
 66 |         return sorted(self.score(text).iteritems(), key=operator.itemgetter(1))[-1][0]
 67 | 
 68 | 
 69 |     def redis_category_key(self, category):
 70 |         return "BayesOnRedis:cat:%s" % category
 71 | 
 72 | 
 73 |     # Incoming text is always downcased
 74 |     def count_occurance(self, text=''):
 75 |         if not isinstance(text, basestring):
 76 |             raise Exception("input must be instance of String")
 77 | 
 78 |         separated_by_non_alphanumerics = self.__class__.non_alphanumeric_and_non_dot_re.sub(' ', text.lower())
 79 |         without_one_or_two_words = self.__class__.one_or_two_words_re.sub('', separated_by_non_alphanumerics)
 80 |         without_dots = without_one_or_two_words.replace(".", "")
 81 |         text_chunks = self.stopwords.to_re().sub('', without_dots).split()
 82 | 
 83 |         frequencies = {}
 84 |         for word in text_chunks:
 85 |             frequencies[word] = (frequencies[word] if frequencies.has_key(word) else 0) + 1
 86 | 
 87 |         return frequencies
 88 | 
 89 | 
 90 | class Stopword:
 91 |     def __init__(self):
 92 |         self.stopwords = open(os.path.abspath(os.path.join(__file__, "..", "..", "datasets", "stopwords.txt")), 'r').read()
 93 |         self.stopwords_re = None
 94 | 
 95 |     def to_list(self):
 96 |         return self.stopwords.split()
 97 | 
 98 |     def to_re(self):
 99 |         if not self.stopwords_re:
100 |             self.stopwords_re = re.compile(r"\b(%s)\b" % '|'.join(self.to_list()), re.IGNORECASE)
101 |         return self.stopwords_re
102 | 
103 | 


--------------------------------------------------------------------------------
/python/datasets/stopwords.txt:
--------------------------------------------------------------------------------
 1 | a able about above abroad according accordingly across actually adj after afterwards again against ago ahead
 2 | ain't all allow allows almost alone along alongside already also although always am amid amidst among amongst
 3 | an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate
 4 | are aren't around as a's aside ask asking associated at available away awfully b back backward backwards be
 5 | became because become becomes becoming been before beforehand begin behind being believe below beside besides
 6 | best better between beyond both brief but by c came can cannot cant can't caption cause causes certain certainly
 7 | changes clearly c'mon co co. com come comes concerning consequently consider considering contain containing
 8 | contains corresponding could couldn't course c's currently d dare daren't definitely described despite did didn't
 9 | different directly do does doesn't doing done don't down downwards during e each edu eg eight eighty either else
10 | elsewhere end ending enough entirely especially et etc even ever evermore every everybody everyone everything
11 | everywhere ex exactly example except f fairly far farther few fewer fifth first five followed following follows
12 | for forever former formerly forth forward found four from further furthermore g get gets getting given gives go
13 | goes going gone got gotten greetings h had hadn't half happens hardly has hasn't have haven't having he he'd he'll
14 | hello help hence her here hereafter hereby herein here's hereupon hers herself he's hi him himself his hither
15 | hopefully how howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc inc. indeed indicate
16 | indicated indicates inner inside insofar instead into inward is isn't it it'd it'll its it's itself i've j just k
17 | keep keeps kept know known knows l last lately later latter latterly least less lest let let's like liked likely
18 | likewise little look looking looks low lower ltd m made mainly make makes many may maybe mayn't me mean meantime
19 | meanwhile merely might mightn't mine minus miss more moreover most mostly mr mrs much must mustn't my myself n name
20 | namely nd near nearly necessary need needn't needs neither never neverf neverless nevertheless new next nine ninety
21 | no nobody non none nonetheless noone no-one nor normally not nothing notwithstanding novel now nowhere o obviously
22 | of off often oh ok okay old on once one ones one's only onto opposite or other others otherwise ought oughtn't our
23 | ours ourselves out outside over overall own p particular particularly past per perhaps placed please plus possible
24 | presumably probably provided provides q que quite qv r rather rd re really reasonably recent recently regarding
25 | regardless regards relatively respectively right round s said same saw say saying says second secondly see seeing
26 | seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall shan't she she'd
27 | she'll she's should shouldn't since six so some somebody someday somehow someone something sometime sometimes
28 | somewhat somewhere soon sorry specified specify specifying still sub such sup sure t take taken taking tell tends
29 | th than thank thanks thanx that that'll thats that's that've the their theirs them themselves then thence there
30 | thereafter thereby there'd therefore therein there'll there're theres there's thereupon there've these they they'd
31 | they'll they're they've thing things think third thirty this thorough thoroughly those though three through
32 | throughout thru thus till to together too took toward towards tried tries truly try trying t's twice two u un under
33 | underneath undoing unfortunately unless unlike unlikely until unto up upon upwards us use used useful uses using
34 | usually v value various versus very via viz vs w want wants was wasn't way we we'd welcome well we'll went were we're
35 | weren't we've what whatever what'll what's what've when whence whenever where whereafter whereas whereby wherein
36 | where's whereupon wherever whether which whichever while whilst whither who who'd whoever whole who'll whom whomever
37 | who's whose why will willing wish with within without wonder won't would wouldn't x y yes yet you you'd you'll your
38 | you're yours yourself yourselves you've z zero successful greatest began including being all for close but


--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from setuptools import setup
 3 | 
 4 | # Utility function to read the README file.
 5 | # Used for the long_description.  It's nice, because now 1) we have a top level
 6 | # README file and 2) it's easier to type in the README file than to put a raw
 7 | # string in below ...
 8 | def read(fname):
 9 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
10 | 
# Distribution metadata for the bayes_on_redis package.
setup(
    # Identity
    name="bayes_on_redis",
    version="0.1.9",
    url="https://github.com/didip/bayes_on_redis",
    license="haven't decided",

    # Author
    author="Didip Kerabat",
    author_email="didipk@gmail.com",

    # Descriptions; the long one is read from the README next to this file.
    description="bayes_on_redis library provides bayesian classification on a given text similar to many SPAM/HAM filtering technique.",
    long_description=read('README.markdown'),
    keywords="bayesian filter redis",
    classifiers=["Development Status :: 3 - Alpha"],

    # Contents; the '' key applies the *.txt / *.rst patterns to every package.
    packages=['bayes_on_redis', 'datasets'],
    package_data={'': ['*.txt', '*.rst']},
)
30 | 


--------------------------------------------------------------------------------
/python/test/test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import sys, os.path
 3 | sys.path.append(os.path.abspath(os.path.join(__file__, '..', '..', 'bayes_on_redis')))
 4 | 
 5 | from bayes_on_redis import BayesOnRedis
 6 | 
 7 | bor = BayesOnRedis(redis_host='127.0.0.1', redis_port=6379, redis_db=5)
 8 | bor.flushdb()
 9 | 
10 | 
11 | # Classification tests
12 | 
13 | bor.learn( "good", "sweet awesome kick-ass cool pretty smart" )
14 | bor.learn( "bad", "sucks lame boo death bankrupt loser sad" )
15 | 
16 | text = "even though you are sweet and awesome ninja, you still sucks."
17 | expected = 'good'
18 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text))
19 | 
20 | text = "super lame pirate"
21 | expected = 'bad'
22 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text))
23 | 
24 | # -----------------------
25 | 
26 | bor.train("programming", "opera awesome web browser javascript lua c++ python www internet firefox")
27 | text = "Opera (the web browser) 11 beta, featuring extensions and tab stacking - now available for download."
28 | expected = 'programming'
29 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text))
30 | 
31 | # -----------------------
32 | 
33 | bor.train("programming", "ruby git programming language")
34 | text = "Erik Andrejko shows us some of the common workflows and best features of git, making Ruby and git a powerful combination."
35 | expected = 'programming'
36 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text))
37 | 
38 | # -----------------------
39 | 
40 | bor.train("programming", "python is the best programming language")
41 | text = "Always having fun with ruby and python"
42 | expected = 'programming'
43 | print "Expected: %s --- Result: %s" % (expected, bor.classify(text))
44 | 
45 | # -----------------------
46 | # Stopwords tests
47 | print "Expected: Stopwords length should be > 0 --- Result: %s" % len(bor.stopwords.to_list())
48 | 
49 | # -----------------------
50 | # occurance tests
51 | print bor.count_occurance("one or two cows did not scare me. It is chicken that does.")
52 | 


--------------------------------------------------------------------------------
/ruby/README.markdown:
--------------------------------------------------------------------------------
1 | ../README.markdown


--------------------------------------------------------------------------------
/ruby/bayes_on_redis.gemspec:
--------------------------------------------------------------------------------
 1 | Gem::Specification.new do |gem|
 2 |   gem.name    = 'bayes_on_redis'
 3 |   gem.version = "0.2.2"
 4 |   gem.date    = Date.today.to_s
 5 | 
 6 |   gem.summary = "Bayesian filter on top of Redis"
 7 |   gem.description = "bayes_on_redis library provides bayesian classification on a given text similar to many SPAM/HAM filtering technique."
 8 | 
 9 |   gem.authors  = ['Didip Kerabat']
10 |   gem.email    = 'didipk@gmail.com'
11 |   gem.homepage = 'https://github.com/didip/bayes_on_redis'
12 | 
13 |   gem.rubyforge_project = nil
14 |   gem.has_rdoc = false
15 | 
16 |   gem.files = [
17 |     "README.markdown",
18 |     File.join("lib", "bayes_on_redis.rb"),
19 |     File.join("datasets", "stopwords.txt")
20 |   ]
21 | end
22 | 


--------------------------------------------------------------------------------
/ruby/datasets/stopwords.txt:
--------------------------------------------------------------------------------
 1 | a able about above abroad according accordingly across actually adj after afterwards again against ago ahead
 2 | ain't all allow allows almost alone along alongside already also although always am amid amidst among amongst
 3 | an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate
 4 | are aren't around as a's aside ask asking associated at available away awfully b back backward backwards be
 5 | became because become becomes becoming been before beforehand begin behind being believe below beside besides
 6 | best better between beyond both brief but by c came can cannot cant can't caption cause causes certain certainly
 7 | changes clearly c'mon co co. com come comes concerning consequently consider considering contain containing
 8 | contains corresponding could couldn't course c's currently d dare daren't definitely described despite did didn't
 9 | different directly do does doesn't doing done don't down downwards during e each edu eg eight eighty either else
10 | elsewhere end ending enough entirely especially et etc even ever evermore every everybody everyone everything
11 | everywhere ex exactly example except f fairly far farther few fewer fifth first five followed following follows
12 | for forever former formerly forth forward found four from further furthermore g get gets getting given gives go
13 | goes going gone got gotten greetings h had hadn't half happens hardly has hasn't have haven't having he he'd he'll
14 | hello help hence her here hereafter hereby herein here's hereupon hers herself he's hi him himself his hither
15 | hopefully how howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc inc. indeed indicate
16 | indicated indicates inner inside insofar instead into inward is isn't it it'd it'll its it's itself i've j just k
17 | keep keeps kept know known knows l last lately later latter latterly least less lest let let's like liked likely
18 | likewise little look looking looks low lower ltd m made mainly make makes many may maybe mayn't me mean meantime
19 | meanwhile merely might mightn't mine minus miss more moreover most mostly mr mrs much must mustn't my myself n name
20 | namely nd near nearly necessary need needn't needs neither never neverf neverless nevertheless new next nine ninety
21 | no nobody non none nonetheless noone no-one nor normally not nothing notwithstanding novel now nowhere o obviously
22 | of off often oh ok okay old on once one ones one's only onto opposite or other others otherwise ought oughtn't our
23 | ours ourselves out outside over overall own p particular particularly past per perhaps placed please plus possible
24 | presumably probably provided provides q que quite qv r rather rd re really reasonably recent recently regarding
25 | regardless regards relatively respectively right round s said same saw say saying says second secondly see seeing
26 | seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall shan't she she'd
27 | she'll she's should shouldn't since six so some somebody someday somehow someone something sometime sometimes
28 | somewhat somewhere soon sorry specified specify specifying still sub such sup sure t take taken taking tell tends
29 | th than thank thanks thanx that that'll thats that's that've the their theirs them themselves then thence there
30 | thereafter thereby there'd therefore therein there'll there're theres there's thereupon there've these they they'd
31 | they'll they're they've thing things think third thirty this thorough thoroughly those though three through
32 | throughout thru thus till to together too took toward towards tried tries truly try trying t's twice two u un under
33 | underneath undoing unfortunately unless unlike unlikely until unto up upon upwards us use used useful uses using
34 | usually v value various versus very via viz vs w want wants was wasn't way we we'd welcome well we'll went were we're
35 | weren't we've what whatever what'll what's what've when whence whenever where whereafter whereas whereby wherein
36 | where's whereupon wherever whether which whichever while whilst whither who who'd whoever whole who'll whom whomever
37 | who's whose why will willing wish with within without wonder won't would wouldn't x y yes yet you you'd you'll your
38 | you're yours yourself yourselves you've z zero successful greatest began including being all for close but


--------------------------------------------------------------------------------
/ruby/lib/bayes_on_redis.rb:
--------------------------------------------------------------------------------
  1 | require "rubygems"
  2 | require "redis"
  3 | 
  4 | class BayesOnRedis
  5 |   CATEGORIES_KEY = "BayesOnRedis:categories"
  6 |   ONE_OR_TWO_WORDS_RE = /\b\w{1,2}\b/mi
  7 |   NON_ALPHANUMERIC_AND_NON_DOT_RE = /[^\w\.]/mi
  8 | 
  9 |   attr_reader :redis, :stopwords
 10 | 
 11 |   def initialize(options)
 12 |     @redis = Redis.new(:host => options[:redis_host], :port => options[:redis_port], :db => options[:redis_db])
 13 |     @stopwords = Stopword.new
 14 |   end
 15 | 
 16 |   def flushdb
 17 |     @redis.flushdb
 18 |   end
 19 | 
 20 |   # training for a category
 21 |   def train(category, text)
 22 |     category = category.downcase
 23 |     @redis.sadd(CATEGORIES_KEY, category)
 24 | 
 25 |     count_occurance(text).each do |word, count|
 26 |       @redis.hincrby(redis_category_key(category), word, count)
 27 |     end
 28 |   end
 29 |   alias_method :learn, :train
 30 | 
 31 |   def untrain(category, text)
 32 |     category = category.downcase
 33 | 
 34 |     count_occurance(text).each do |word, count|
 35 |       word_count_atm = @redis.hget(redis_category_key(category), word)
 36 |       if (word_count_atm >= count)
 37 |         new_count = (word_count_atm - count)
 38 |       else
 39 |         new_count = 0
 40 |       end
 41 |       @redis.hset(redis_category_key(category), word, new_count)
 42 |     end
 43 |   end
 44 |   alias_method :unlearn, :untrain
 45 | 
 46 |   def score(text)
 47 |     scores = {}
 48 | 
 49 |     @redis.smembers(CATEGORIES_KEY).each do |category|
 50 |       words_count_per_category = @redis.hvals(redis_category_key(category)).inject(0) {|sum, score| sum + score.to_i}
 51 |       @redis.srem(CATEGORIES_KEY, category) if words_count_per_category <= 0
 52 | 
 53 |       scores[category] = 0
 54 | 
 55 |       count_occurance(text).each do |word, count|
 56 |         tmp_score = @redis.hget(redis_category_key(category), word).to_i
 57 |         tmp_score = 0.1 if tmp_score <= 0
 58 | 
 59 |         scores[category] += Math.log(tmp_score / words_count_per_category.to_f)
 60 |       end
 61 |     end
 62 | 
 63 |     return scores
 64 |   end
 65 | 
 66 |   def classify(text)
 67 |     (score(text).sort_by { |score| -score[1] })[0][0]    # [0][0] -> first score, get the key
 68 |   end
 69 | 
 70 |   private
 71 |   def redis_category_key(category)
 72 |     "BayesOnRedis:cat:#{category}"
 73 |   end
 74 | 
 75 |   # Incoming text is always downcased
 76 |   def count_occurance(text='')
 77 |     raise "input must be instance of String" unless text.is_a?(String)
 78 | 
 79 |     text_chunks = text.downcase.gsub(ONE_OR_TWO_WORDS_RE, '').gsub(NON_ALPHANUMERIC_AND_NON_DOT_RE, ' ').gsub(@stopwords.to_re, '').gsub(/\./, '').split
 80 |     text_chunks.inject(Hash.new(0)) do |container, word|
 81 |       container[word] += 1; container
 82 |     end
 83 |   end
 84 | 
 85 |   def remove_stopwords
 86 |     @redis.smembers(CATEGORIES_KEY).each do |category|
 87 |       @stopwords.to_a.each do |stopword|
 88 |         @redis.hdel(redis_category_key(category), stopword)
 89 |       end
 90 |     end
 91 |   end
 92 | end
 93 | 
 94 | 
 95 | class Stopword
 96 |   def initialize
 97 |     @stopwords = File.read(File.expand_path(File.join(__FILE__, "..", "..", "datasets", "stopwords.txt"))).split
 98 |   end
 99 | 
100 |   def to_a
101 |     @stopwords
102 |   end
103 | 
104 |   def to_re
105 |     @to_re ||= /\b(#{@stopwords.join('|')})\b/mi
106 |   end
107 | end


--------------------------------------------------------------------------------
/ruby/test/test.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | require 'lib/bayes_on_redis'
 3 | 
 4 | bor = BayesOnRedis.new(:redis_host => '127.0.0.1', :redis_port => 6379, :redis_db => 5)
 5 | bor.flushdb
 6 | 
 7 | 
 8 | # Classification tests
 9 | 
10 | bor.learn "good", "sweet awesome kick-ass cool pretty smart"
11 | bor.learn "bad", "sucks lame boo death bankrupt loser sad"
12 | 
13 | text = "even though you are sweet and awesome ninja, you still sucks."
14 | expected = 'good'
15 | puts "Expected: #{expected} --- Result: #{bor.classify(text)}"
16 | 
17 | text = "super lame pirate"
18 | expected = 'bad'
19 | puts "Expected: #{expected} --- Result: #{bor.classify(text)}"
20 | 
21 | # -----------------------
22 | 
23 | bor.train "programming", "opera awesome web browser javascript lua c++ python www internet firefox"
24 | text = "Opera (the web browser) 11 beta, featuring extensions and tab stacking - now available for download."
25 | expected = 'programming'
26 | puts "Expected: #{expected} --- Result: #{bor.classify(text)}"
27 | 
28 | # -----------------------
29 | 
30 | bor.train "programming", "ruby git programming language"
31 | text = "Erik Andrejko shows us some of the common workflows and best features of git, making Ruby and git a powerful combination."
32 | expected = 'programming'
33 | puts "Expected: #{expected} --- Result: #{bor.classify(text)}"
34 | 
35 | # -----------------------
36 | 
37 | bor.train "programming", "python is the best programming language"
38 | text = "Always having fun with ruby and python"
39 | expected = 'programming'
40 | puts "Expected: #{expected} --- Result: #{bor.classify(text)}"
41 | 
42 | # -----------------------
43 | # Stopwords tests
44 | puts "Expected: Stopwords length should be > 0 --- Result: #{bor.stopwords.to_a.size}"
45 | 
46 | 
47 | # -----------------------
48 | # occurance tests
49 | print bor.send(:count_occurance, "one or two cows did not scare me. It is chicken that does.").inspect
50 | 


--------------------------------------------------------------------------------