├── .document
├── .gitignore
├── .rspec
├── .ruby-version
├── .travis.yml
├── Gemfile
├── Gemfile.lock
├── LICENSE.txt
├── README.md
├── Rakefile
├── VERSION
├── images
    └── linnaeus.jpg
├── lib
    ├── linnaeus.rb
    └── linnaeus
    │   ├── classifier.rb
    │   ├── persistence.rb
    │   ├── stopwords.rb
    │   └── trainer.rb
├── linnaeus.gemspec
└── spec
    ├── linnaeus_classifier_spec.rb
    ├── linnaeus_persistence_spec.rb
    ├── linnaeus_spec.rb
    ├── linnaeus_stopwords_spec.rb
    ├── linnaeus_trainer_spec.rb
    └── spec_helper.rb


/.document:
--------------------------------------------------------------------------------
1 | lib/**/*.rb
2 | bin/*
3 | - 
4 | features/**/*.feature
5 | LICENSE.txt
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # rcov generated
 2 | coverage
 3 | coverage.data
 4 | 
 5 | # rdoc generated
 6 | rdoc
 7 | 
 8 | # yard generated
 9 | doc
10 | .yardoc
11 | 
12 | # bundler
13 | .bundle
14 | 
15 | # jeweler generated
16 | pkg
17 | 
18 | # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore: 
19 | #
20 | # * Create a file at ~/.gitignore
21 | # * Include files you want ignored
22 | # * Run: git config --global core.excludesfile ~/.gitignore
23 | #
24 | # After doing this, these files will be ignored in all your git projects,
25 | # saving you from having to 'pollute' every project you touch with them
26 | #
27 | # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line.)
28 | #
29 | # For MacOS:
30 | #
31 | #.DS_Store
32 | 
33 | # For TextMate
34 | #*.tmproj
35 | #tmtags
36 | 
37 | # For emacs:
38 | #*~
39 | #\#*
40 | #.\#*
41 | 
42 | # For vim:
43 | #*.swp
44 | 
45 | # For redcar:
46 | #.redcar
47 | 
48 | # For rubinius:
49 | #*.rbc
50 | 


--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --colour
2 | --order rand
3 | -f d
4 | 


--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
1 | 2.0.0-p353
2 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 |   - 2.0.0
4 |   - 1.9.3
5 |   - 1.9.2
6 | services:
7 |   - redis
8 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
 1 | source 'http://rubygems.org'
 2 | # Runtime dependencies: the redis client and the Porter stemmer.
 3 | gem 'redis', '~> 3.0.x'
 4 | gem 'stemmer', '~> 1.0.x'
 5 | 
 6 | # Development-only dependencies.
 7 | # Include everything needed to run rake, tests, features, etc.
 8 | group :development do
 9 |   gem 'rspec', '~> 2.11.0'
10 |   gem 'yard', '~> 0.7'
11 |   gem 'rdoc', '~> 3.12'
12 |   gem 'bundler'
13 |   gem 'jeweler'
14 |   gem 'simplecov'
15 |   gem 'redcarpet'
16 | end
17 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
 1 | GEM
 2 |   remote: http://rubygems.org/
 3 |   specs:
 4 |     addressable (2.3.5)
 5 |     builder (3.2.2)
 6 |     descendants_tracker (0.0.3)
 7 |     diff-lcs (1.1.3)
 8 |     docile (1.1.3)
 9 |     faraday (0.9.0)
10 |       multipart-post (>= 1.2, < 3)
11 |     git (1.2.6)
12 |     github_api (0.11.3)
13 |       addressable (~> 2.3)
14 |       descendants_tracker (~> 0.0.1)
15 |       faraday (~> 0.8, < 0.10)
16 |       hashie (>= 1.2)
17 |       multi_json (>= 1.7.5, < 2.0)
18 |       nokogiri (~> 1.6.0)
19 |       oauth2
20 |     hashie (2.0.5)
21 |     highline (1.6.21)
22 |     jeweler (2.0.1)
23 |       builder
24 |       bundler (>= 1.0)
25 |       git (>= 1.2.5)
26 |       github_api
27 |       highline (>= 1.6.15)
28 |       nokogiri (>= 1.5.10)
29 |       rake
30 |       rdoc
31 |     json (1.8.1)
32 |     jwt (0.1.11)
33 |       multi_json (>= 1.5)
34 |     mini_portile (0.5.2)
35 |     multi_json (1.9.0)
36 |     multi_xml (0.5.5)
37 |     multipart-post (2.0.0)
38 |     nokogiri (1.6.1)
39 |       mini_portile (~> 0.5.0)
40 |     oauth2 (0.9.3)
41 |       faraday (>= 0.8, < 0.10)
42 |       jwt (~> 0.1.8)
43 |       multi_json (~> 1.3)
44 |       multi_xml (~> 0.5)
45 |       rack (~> 1.2)
46 |     rack (1.5.2)
47 |     rake (10.1.1)
48 |     rdoc (3.12.2)
49 |       json (~> 1.4)
50 |     redcarpet (3.1.1)
51 |     redis (3.0.7)
52 |     rspec (2.11.0)
53 |       rspec-core (~> 2.11.0)
54 |       rspec-expectations (~> 2.11.0)
55 |       rspec-mocks (~> 2.11.0)
56 |     rspec-core (2.11.1)
57 |     rspec-expectations (2.11.3)
58 |       diff-lcs (~> 1.1.3)
59 |     rspec-mocks (2.11.3)
60 |     simplecov (0.8.2)
61 |       docile (~> 1.1.0)
62 |       multi_json
63 |       simplecov-html (~> 0.8.0)
64 |     simplecov-html (0.8.0)
65 |     stemmer (1.0.1)
66 |     yard (0.8.7.3)
67 | 
68 | PLATFORMS
69 |   ruby
70 | 
71 | DEPENDENCIES
72 |   bundler
73 |   jeweler
74 |   rdoc (~> 3.12)
75 |   redcarpet
76 |   redis (~> 3.0.x)
77 |   rspec (~> 2.11.0)
78 |   simplecov
79 |   stemmer (~> 1.0.x)
80 |   yard (~> 0.7)
81 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012 Dan Collis-Puro
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Linnaeus [![Build Status](https://secure.travis-ci.org/djcp/linnaeus.png?branch=master)](http://travis-ci.org/djcp/linnaeus)
 2 | 
 3 | ![Carl Linnaeus](https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg)
 4 | 
 5 | Linnaeus is a redis-backed naive Bayesian classification system. Please see the [rdoc](http://rubydoc.info/gems/linnaeus/) for more information. Ruby 1.9 or later is required.
 6 | 
 7 | Examples
 8 | --------
 9 | 
10 |     lt = Linnaeus::Trainer.new      # Used to train documents
11 |     lc = Linnaeus::Classifier.new   # Used to classify documents
12 | 
13 |     lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.'
14 |     lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.'
15 | 
16 |     lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language"
17 | 
18 | 
19 | Contributing to linnaeus
20 | ------------------------
21 | 
22 | * Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues
23 | * If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss.
24 | * If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done.
25 | 
26 | The Future
27 | ----------
28 | 
29 | * Create additional storage backends - sqlite, postgresql, mongodb, etc.
30 | * Allow for weighting tweaks.
31 | 
32 | Copyright
33 | ---------
34 | 
35 | Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details.
36 | 
37 | Credits
38 | -------
39 | 
40 | * Image courtesy of Wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus
41 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | 
 3 | require 'rubygems'
 4 | require 'bundler'
 5 | begin
 6 |   Bundler.setup(:default, :development)
 7 | rescue Bundler::BundlerError => e
 8 |   $stderr.puts e.message
 9 |   $stderr.puts "Run `bundle install` to install missing gems"
10 |   exit e.status_code
11 | end
12 | require 'rake'
13 | # Gem packaging and release tasks; linnaeus.gemspec is generated from this block.
14 | require 'jeweler'
15 | Jeweler::Tasks.new do |gem|
16 |   # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17 |   gem.name = "linnaeus"
18 |   gem.homepage = "http://github.com/djcp/linnaeus"
19 |   gem.license = "MIT"
20 |   gem.summary = %Q{Another redis-backed Bayesian classifier}
21 |   gem.description = %Q{Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification.}
22 |   gem.email = "dan@collispuro.net"
23 |   gem.authors = ["djcp"]
24 |   # dependencies defined in Gemfile
25 | end
26 | Jeweler::RubygemsDotOrgTasks.new
27 | # The spec task runs every file matching spec/**/*_spec.rb.
28 | require 'rspec/core'
29 | require 'rspec/core/rake_task'
30 | RSpec::Core::RakeTask.new(:spec) do |spec|
31 |   spec.pattern = FileList['spec/**/*_spec.rb']
32 | end
33 | # Running plain `rake` runs the specs.
34 | task :default => :spec
35 | # YARD documentation task.
36 | require 'yard'
37 | YARD::Rake::YardocTask.new
38 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 1.2.0


--------------------------------------------------------------------------------
/images/linnaeus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djcp/linnaeus/463951f9091b3a2469bfc5a9653803dd6d8c6722/images/linnaeus.jpg


--------------------------------------------------------------------------------
/lib/linnaeus.rb:
--------------------------------------------------------------------------------
 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
 2 | 
 3 | require 'redis'
 4 | require 'stemmer'
 5 | 
 6 | # The base class. You won't use this directly - use one of the subclasses.
 7 | class Linnaeus
 8 |   # Builds the persistence backend and stopword generator; the merged options hash is passed on to the persistence class.
 9 |   def initialize(opts = {})
10 |     options = {
11 |       persistence_class: Persistence,
12 |       stopwords_class: Stopwords,
13 |       skip_stemming: false,
14 |       encoding: 'UTF-8'
15 |     }.merge(opts)
16 | 
17 |     @db = options[:persistence_class].new(options)
18 |     @stopword_generator = options[:stopwords_class].new
19 |     @skip_stemming = options[:skip_stemming]
20 |     @encoding = options[:encoding]
21 |   end
22 | 
23 |   # Count occurrences of words in a text corpus. Returns a hash of word => count.
24 |   # NOTE(review): the stopword check runs on the stemmed word, so a stopword whose stem differs from its listed spelling may slip through - confirm.
25 |   # == Parameters
26 |   # text::
27 |   #   A string representing a document.  It is downcased and split on whitespace; words are stemmed using the "Stemmer" gem (unless skip_stemming is set) and stopwords are removed.
28 |   def count_word_occurrences(text = '')
29 |     count = {}
30 |     text.encode(@encoding).downcase.split.each do |word|
31 |       stemmed_word = (@skip_stemming) ? word : word.stem_porter
32 |       unless stopwords.include? stemmed_word
33 |         count[stemmed_word] = count[stemmed_word] ? count[stemmed_word] + 1 : 1
34 |       end
35 |     end
36 |     count
37 |   end
38 | 
39 |   private
40 |   # Format categories for training or untraining: downcased, stripped of every character other than a-z, digits, '.', '-', '_' and space, with empty results dropped.
41 |   #
42 |   # == Parameters
43 |   # categories::
44 |   #   A string or array of categories
45 |   def normalize_categories(categories = [])
46 |     [categories].flatten.collect do |cat|
47 |       cat.to_s.encode(@encoding).downcase.gsub(/[^a-z\d\.\-_ ]/,'')
48 |     end.reject{|cat| cat == ''}.compact
49 |   end
50 | 
51 |   # Get a Set of stopwords to remove from documents for training / classifying. Built once per instance and then reused.
52 |   def stopwords
53 |     @stopwords ||= @stopword_generator.to_set
54 |   end
55 | 
56 | end
57 | 
58 | require 'set'
59 | require 'linnaeus/stopwords'
60 | require 'linnaeus/persistence'
61 | require 'linnaeus/trainer'
62 | require 'linnaeus/classifier'
63 | 


--------------------------------------------------------------------------------
/lib/linnaeus/classifier.rb:
--------------------------------------------------------------------------------
 1 | # Classify documents against the Bayesian corpus.
 2 | #
 3 | #  lc = Linnaeus::Classifier.new(<options hash>)
 4 | #  lc.classify 'a string of text' #a wild category appears
 5 | #  lc.classification_scores 'a different string of text' #a hash of categories and scores
 6 | #
 7 | # == Constructor Options
 8 | # persistence_class::
 9 | #   A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
10 | # stopwords_class::
11 | #   A class that emits a set of stopwords. The default is Linnaeus::Stopwords
12 | # skip_stemming::
13 | #   Set to true to skip porter stemming.
14 | # encoding::
15 | #   Force text to use this character set. UTF-8 by default.
16 | # redis_connection::
17 | #   An instantiated Redis connection, allowing you to reuse an existing Redis connection.
18 | # redis_host::
19 | #   Passed to persistence class constructor. Defaults to "127.0.0.1"
20 | # redis_port::
21 | #   Passed to persistence class constructor. Defaults to "6379".
22 | # redis_db::
23 | #   Passed to persistence class constructor. Defaults to "0".
24 | # redis_*::
25 | #   Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection.
26 | class Linnaeus::Classifier < Linnaeus
27 | 
28 |   # Returns a hash of scores for each category in the Bayesian corpus.
29 |   # The closer a score is to 0, the more likely a match it is.
30 |   #
31 |   # == Parameters
32 |   # text::
33 |   #   a string of text to classify.
34 |   #
35 |   # == Returns
36 |   # a hash of categories with a score as the values.
37 |   def classification_scores(text)
38 |     scores = {}
39 | 
40 |     @db.get_categories.each do |category|
41 |       words_with_count_for_category = @db.get_words_with_count_for_category category
42 |       total_word_count_sum_for_category = words_with_count_for_category.values.reduce(0){|sum, count| sum += count.to_i}
43 | 
44 |       scores[category] = 0
45 |       count_word_occurrences(text).each do |word, count|
46 |         tmp_score = (words_with_count_for_category[word].nil?) ? 0.1 : words_with_count_for_category[word].to_i
47 |         scores[category] += Math.log(tmp_score / total_word_count_sum_for_category.to_f)
48 |       end
49 |     end
50 |     scores
51 |   end
52 | 
53 |   # The most likely category for a document.
54 |   #
55 |   # == Parameters
56 |   # text::
57 |   #   a string of text to classify.
58 |   #
59 |   # == Returns
60 |   # A string representing the most likely category.
61 |   def classify(text)
62 |     scores = classification_scores(text)
63 |     if scores.any?
64 |       (scores.sort_by { |a| -a[1] })[0][0]
65 |     else
66 |       ''
67 |     end
68 |   end
69 | 
70 | end
71 | 


--------------------------------------------------------------------------------
/lib/linnaeus/persistence.rb:
--------------------------------------------------------------------------------
  1 | # The redis persistence layer.
  2 | class Linnaeus::Persistence < Linnaeus
  3 |   attr_accessor :redis
  4 | 
  5 |   def initialize(opts = {})
  6 |     options = {
  7 |       redis_host: '127.0.0.1',
  8 |       redis_port: '6379',
  9 |       redis_db: 0,
 10 |       redis_scheme: "redis",
 11 |       redis_path: nil,
 12 |       redis_timeout: 5.0,
 13 |       redis_password: nil,
 14 |       redis_id: nil,
 15 |       redis_tcp_keepalive: 0,
 16 |       scope: nil
 17 |     }.merge(opts)
 18 | 
 19 |     @scope = options[:scope]
 20 | 
 21 |     if options[:redis_connection]
 22 |       @redis = options[:redis_connection]
 23 |     else
 24 |       @redis = Redis.new(
 25 |         host: options[:redis_host],
 26 |         port: options[:redis_port],
 27 |         db: options[:redis_db],
 28 |         scheme: options[:redis_scheme],
 29 |         path: options[:redis_path],
 30 |         timeout: options[:redis_timeout],
 31 |         password: options[:redis_password],
 32 |         id: options[:redis_id],
 33 |         tcp_keepalive: options[:redis_tcp_keepalive]
 34 |       )
 35 |     end
 36 | 
 37 |     self
 38 |   end
 39 | 
 40 |   # Add categories to the bayesian corpus.
 41 |   #
 42 |   # == Parameters
 43 |   # categories::
 44 |   #   A string or array of categories.
 45 |   def add_categories(categories)
 46 |     @redis.sadd category_collection_key, categories
 47 |   end
 48 | 
 49 |   # Remove categories from the bayesian corpus
 50 |   #
 51 |   # == Parameters
 52 |   # categories::
 53 |   #   A string or array of categories.
 54 |   def remove_category(category)
 55 |     @redis.srem category_collection_key, category
 56 |   end
 57 | 
 58 |   # Get categories from the bayesian corpus
 59 |   #
 60 |   # == Parameters
 61 |   # categories::
 62 |   #   A string or array of categories.
 63 |   def get_categories
 64 |     @redis.smembers category_collection_key
 65 |   end
 66 | 
 67 |   # Get a list of words with their number of occurrences.
 68 |   # 
 69 |   # == Parameters
 70 |   # category::
 71 |   #   A string representing a category.
 72 |   #
 73 |   # == Returns
 74 |   # A hash with the word counts for this category.
 75 |   def get_words_with_count_for_category(category)
 76 |     @redis.hgetall base_category_key + category
 77 |   end
 78 | 
 79 |   # Clear all training data from the backend.
 80 |   def clear_all_training_data
 81 |     @redis.flushdb
 82 |   end
 83 | 
 84 |   # Clear training data for the scope associated with this instance.
 85 |   def clear_training_data
 86 |      keys = @redis.keys(base_key.join(':') + '*')
 87 | 
 88 |      keys.each do |key|
 89 |        @redis.del key
 90 |      end
 91 |   end
 92 | 
 93 |   # Increment word counts within a category
 94 |   #
 95 |   # == Parameters
 96 |   # category::
 97 |   #   A string representing a category.
 98 |   # word_occurrences::
 99 |   #   A hash containing a count of the number of word occurences in a document
100 |   def increment_word_counts_for_category(category, word_occurrences)
101 |     word_occurrences.each do|word,count|
102 |       @redis.hincrby base_category_key + category, word, count
103 |     end
104 |   end
105 | 
106 |   # Decrement word counts within a category. This is used when removing a document from the corpus.
107 |   #
108 |   # == Parameters
109 |   # category::
110 |   #   A string representing a category.
111 |   # word_occurrences::
112 |   #   A hash containing a count of the number of word occurences in a document
113 |   def decrement_word_counts_for_category(category, word_occurrences)
114 |     word_occurrences.each do|word,count|
115 |       @redis.hincrby base_category_key + category, word, - count
116 |     end
117 |   end
118 | 
119 |   # Clean out words with a count of zero in a category. Used during untraining.
120 |   #
121 |   # == Parameters
122 |   # category::
123 |   #   A string representing a category.
124 |   def cleanup_empty_words_in_category(category)
125 |     word_counts = @redis.hgetall base_category_key + category
126 |     empty_words = word_counts.select{|word, count| count.to_i <= 0}
127 |     if empty_words == word_counts
128 |       @redis.del base_category_key + category
129 |     else
130 |       if empty_words.any?
131 |         @redis.hdel base_category_key + category, empty_words.keys
132 |       end
133 |     end
134 |   end
135 | 
136 |   private
137 | 
138 |   # The Set (in the redis sense) of categories are stored in this key.
139 |   def category_collection_key
140 |     [ base_key, 'category' ].compact.join(':')
141 |   end
142 | 
143 |   # The base key for a category within a scope in the redis corpus. Word
144 |   # occurrence counts for a category appear under here.
145 |   def base_category_key
146 |     [ base_key, 'cat:' ].flatten.join(':')
147 |   end
148 | 
149 |   def base_key
150 |     [ 'Linnaeus', @scope ].compact
151 |   end
152 | 
153 | end
154 | 


--------------------------------------------------------------------------------
/lib/linnaeus/stopwords.rb:
--------------------------------------------------------------------------------
 1 | # The stopword list - you can override this list by creating a stopword generator and registering it (via the stopwords_class option) in the Linnaeus::Trainer or Linnaeus::Classifier constructors.
 2 | class Linnaeus::Stopwords
 3 |   # The default stopword list, used whenever no custom list has been assigned.
 4 |   DEFAULT_STOPWORDS = %w(a able about across after all almost also am among an and any are as at be because been but by can cannot could dear did do does either else ever every for from get got had has have he her hers him his how however i if in into is it its just least let like likely may me might most must my neither no nor not of off often on only or other our own rather said say says she should since so some than that the their them then there these they this tis to too twas us wants was we were what when where which while who whom why will with would yet you your)
 5 | 
 6 |   attr_accessor :stopwords
 7 | 
 8 |   # The list of stopwords as an array: the assigned stopwords if any, otherwise DEFAULT_STOPWORDS
 9 |   def to_a
10 |     @stopwords || DEFAULT_STOPWORDS
11 |   end
12 | 
13 |   # The list of stopwords as a ruby Set
14 |   def to_set
15 |     to_a.to_set
16 |   end
17 | end
18 | 


--------------------------------------------------------------------------------
/lib/linnaeus/trainer.rb:
--------------------------------------------------------------------------------
 1 | # Train or untrain documents from the Bayesian corpus.
 2 | #
 3 | #  lt = Linnaeus::Trainer.new(<options hash>)
 4 | #  lt.train 'category', 'a string of text' 
 5 | #  lt.train 'differentcategory', 'another string of text' 
 6 | #  lt.untrain 'category', 'a document we just removed'
 7 | #
 8 | # == Constructor Options
 9 | # persistence_class::
10 | #   A class implementing persistence - the default (Linnaeus::Persistence) uses redis.
11 | # stopwords_class::
12 | #   A class that emits a set of stopwords. The default is Linnaeus::Stopwords
13 | # skip_stemming::
14 | #   Set to true to skip porter stemming.
15 | # encoding::
16 | #   Force text to use this character set. UTF-8 by default.
17 | # redis_host::
18 | #   Passed to persistence class constructor. Defaults to "127.0.0.1"
19 | # redis_port::
20 | #   Passed to persistence class constructor. Defaults to "6379".
21 | # redis_db::
22 | #   Passed to persistence class constructor. Defaults to "0".
23 | # redis_*::
24 | #   Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection.
25 | class Linnaeus::Trainer < Linnaeus
26 | 
27 |   # Add a document to the training corpus. Its word counts are added to every given category.
28 |   #
29 |   # == Parameters
30 |   # categories::
31 |   #   A string or array of categories
32 |   # text::
33 |   #   A string of text in this document.
34 |   def train(categories, text)
35 |     categories = normalize_categories categories
36 |     @db.add_categories(categories)
37 | 
38 |     word_occurrences = count_word_occurrences text
39 |     categories.each do|cat|
40 |       @db.increment_word_counts_for_category cat, word_occurrences
41 |     end
42 |   end
43 | 
44 |   # Remove a document from the training corpus.
45 |   #
46 |   # == Parameters
47 |   # categories::
48 |   #   A string or array of categories
49 |   # text::
50 |   #   A string of text in this document.
51 |   def untrain(categories, text)
52 |     categories = normalize_categories categories
53 |     # Only word counts are changed here; the categories themselves stay registered.
54 |     word_occurrences = count_word_occurrences text
55 |     categories.each do|cat|
56 |       @db.decrement_word_counts_for_category cat, word_occurrences
57 |       @db.cleanup_empty_words_in_category cat
58 |     end
59 |   end
60 | 
61 | end
62 | 


--------------------------------------------------------------------------------
/linnaeus.gemspec:
--------------------------------------------------------------------------------
 1 | # Generated by jeweler
 2 | # DO NOT EDIT THIS FILE DIRECTLY
 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 4 | # -*- encoding: utf-8 -*-
 5 | # stub: linnaeus 1.2.0 ruby lib
 6 | 
 7 | Gem::Specification.new do |s|
 8 |   s.name = "linnaeus"
 9 |   s.version = "1.2.0"
10 | 
11 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12 |   s.require_paths = ["lib"]
13 |   s.authors = ["djcp"]
14 |   s.date = "2014-03-18"
15 |   s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification."
16 |   s.email = "dan@collispuro.net"
17 |   s.extra_rdoc_files = [
18 |     "LICENSE.txt",
19 |     "README.md"
20 |   ]
21 |   s.files = [
22 |     ".document",
23 |     ".rspec",
24 |     ".ruby-version",
25 |     ".travis.yml",
26 |     "Gemfile",
27 |     "Gemfile.lock",
28 |     "LICENSE.txt",
29 |     "README.md",
30 |     "Rakefile",
31 |     "VERSION",
32 |     "images/linnaeus.jpg",
33 |     "lib/linnaeus.rb",
34 |     "lib/linnaeus/classifier.rb",
35 |     "lib/linnaeus/persistence.rb",
36 |     "lib/linnaeus/stopwords.rb",
37 |     "lib/linnaeus/trainer.rb",
38 |     "linnaeus.gemspec",
39 |     "spec/linnaeus_classifier_spec.rb",
40 |     "spec/linnaeus_persistence_spec.rb",
41 |     "spec/linnaeus_spec.rb",
42 |     "spec/linnaeus_stopwords_spec.rb",
43 |     "spec/linnaeus_trainer_spec.rb",
44 |     "spec/spec_helper.rb"
45 |   ]
46 |   s.homepage = "http://github.com/djcp/linnaeus"
47 |   s.licenses = ["MIT"]
48 |   s.rubygems_version = "2.2.1"
49 |   s.summary = "Another redis-backed Bayesian classifier"
50 | 
51 |   if s.respond_to? :specification_version then
52 |     s.specification_version = 4
53 | 
54 |     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
55 |       s.add_runtime_dependency(%q<redis>, ["~> 3.0"])
56 |       s.add_runtime_dependency(%q<stemmer>, ["~> 1.0.0"])
57 |       s.add_development_dependency(%q<rspec>, ["~> 2.11.0"])
58 |       s.add_development_dependency(%q<yard>, ["~> 0.7"])
59 |       s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
60 |       s.add_development_dependency(%q<bundler>, [">= 0"])
61 |       s.add_development_dependency(%q<jeweler>, [">= 0"])
62 |       s.add_development_dependency(%q<simplecov>, [">= 0"])
63 |       s.add_development_dependency(%q<redcarpet>, [">= 0"])
64 |     else
65 |       s.add_dependency(%q<redis>, ["~> 3.0"])
66 |       s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
67 |       s.add_dependency(%q<rspec>, ["~> 2.11.0"])
68 |       s.add_dependency(%q<yard>, ["~> 0.7"])
69 |       s.add_dependency(%q<rdoc>, ["~> 3.12"])
70 |       s.add_dependency(%q<bundler>, [">= 0"])
71 |       s.add_dependency(%q<jeweler>, [">= 0"])
72 |       s.add_dependency(%q<simplecov>, [">= 0"])
73 |       s.add_dependency(%q<redcarpet>, [">= 0"])
74 |     end
75 |   else
76 |     s.add_dependency(%q<redis>, ["~> 3.0"])
77 |     s.add_dependency(%q<stemmer>, ["~> 1.0.0"])
78 |     s.add_dependency(%q<rspec>, ["~> 2.11.0"])
79 |     s.add_dependency(%q<yard>, ["~> 0.7"])
80 |     s.add_dependency(%q<rdoc>, ["~> 3.12"])
81 |     s.add_dependency(%q<bundler>, [">= 0"])
82 |     s.add_dependency(%q<jeweler>, [">= 0"])
83 |     s.add_dependency(%q<simplecov>, [">= 0"])
84 |     s.add_dependency(%q<redcarpet>, [">= 0"])
85 |   end
86 | end
87 | 
88 | 


--------------------------------------------------------------------------------
/spec/linnaeus_classifier_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
 2 | 
 3 | describe Linnaeus::Classifier do
 4 |   context 'with no training data' do
 5 |     it 'should return empty values when attempting to classify' do
 6 |       Linnaeus::Persistence.new.clear_all_training_data
 7 |       subject.classify("foo bar baz").should be_empty
 8 |       subject.classification_scores("foo bar baz").should be_empty
 9 |     end
10 |   end
11 | 
12 |   context 'with a very small dataset' do
13 |     before do
14 |       create_small_dataset
15 |     end
16 | 
17 |     it 'should classify easy things well' do
18 |       subject.classify('A bird that migrates').should eq('bird')
19 |       subject.classify('This was directed by Gus Van Sant').should eq('movie')
20 |     end
21 |     # The exact floats below depend on tokenizing, stemming and the stopword list.
22 |     it 'should return correct classification scores' do
23 |       subject.classification_scores('a bird').should eq(
24 |         { "movie"=>-6.272877006546167, "bird"=>-4.2626798770413155 }
25 |       )
26 |       subject.classification_scores('a directorial bird').should eq(
27 |         { "movie"=>-10.24316892009829, "bird"=>-10.827944847076676 }
28 |       )
29 |     end
30 |   end
31 |   # Flushes the redis db, then trains three 'movie' and three 'bird' documents.
32 |   def create_small_dataset
33 |     Linnaeus::Persistence.new.clear_all_training_data
34 |     lt = Linnaeus::Trainer.new
35 |     lt.train 'movie', "Gone with the Wind is a 1939 American historical epic film adapted from Margaret Mitchell's Pulitzer-winning 1936 novel of the same name."
36 |     lt.train 'movie', "THX 1138 is a 1971 science fiction film directed by George Lucas in his feature directorial debut. The film was written by Lucas and Walter Murch."
37 |     lt.train 'movie', "Top Gun is a 1986 American action drama film directed by Tony Scott, and produced by Don Simpson and Jerry Bruckheimer, in association with the Paramount Pictures company."
38 | 
39 |     lt.train 'bird', "The Yellow-throated Warbler (Setophaga dominica) is a small migratory songbird species breeding in temperate North America. It belongs to the New World warbler family (Parulidae)."
40 |     lt.train 'bird', "The Blue Jay (Cyanocitta cristata) is a passerine bird in the family Corvidae, native to North America. It is resident through most of eastern and central United States and southern Canada, although western populations may be migratory."
41 |     lt.train 'bird', "The Mallard or Wild Duck (Anas platyrhynchos) is a dabbling duck which breeds throughout the temperate and subtropical Americas, Europe, Asia, and North Africa, and has been introduced to New Zealand and Australia. This duck belongs to the subfamily Anatinae of the waterfowl family Anatidae"
42 |   end
43 | end
44 | 


--------------------------------------------------------------------------------
/spec/linnaeus_persistence_spec.rb:
--------------------------------------------------------------------------------
  1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
  2 | 
  3 | describe Linnaeus::Persistence do
  4 |   before do
  5 |     lp = get_linnaeus_persistence
  6 |     lp.clear_training_data
  7 |   end
  8 | 
  9 |   it "should accept an existing redis connection" do
 10 |     lp = Linnaeus::Persistence.new(redis_connection: Redis.new)
 11 |     lp.redis.should_not be_nil
 12 |   end
 13 | 
 14 |   it 'sets keys properly with defaults' do
 15 |     lp2 = get_linnaeus_persistence
 16 |     train_a_document_in('foobar')
 17 |     lp2.redis.keys('*').should match_array ['Linnaeus:category', 'Linnaeus:cat:foobar']
 18 |   end
 19 | 
 20 |   context "custom scopes" do
 21 |     it 'sets keys properly' do
 22 |       lp2 = get_linnaeus_persistence(scope: 'new-scope')
 23 |       lp2.clear_all_training_data
 24 | 
 25 |       train_a_document_in('foobar', scope: 'new-scope')
 26 | 
 27 |       lp2.redis.keys('*').should match_array [
 28 |         'Linnaeus:new-scope:cat:foobar', 'Linnaeus:new-scope:category'
 29 |       ]
 30 |     end
 31 | 
 32 |     it 'can clear scoped training data separately' do
 33 |       lp = get_linnaeus_persistence
 34 | 
 35 |       train_a_document_in('foobar')
 36 | 
 37 |       lp2 = get_linnaeus_persistence(scope: 'new-scope')
 38 | 
 39 |       train_a_document_in('foobar', scope: 'new-scope')
 40 | 
 41 |       lp.redis.keys.should match_array [
 42 |         "Linnaeus:cat:foobar", "Linnaeus:category",
 43 |         "Linnaeus:new-scope:cat:foobar", "Linnaeus:new-scope:category"
 44 |       ]
 45 | 
 46 |       lp2.clear_training_data
 47 | 
 48 |       lp.redis.keys.should match_array [
 49 |         "Linnaeus:cat:foobar", "Linnaeus:category"
 50 |       ]
 51 |     end
 52 | 
 53 |     it 'stores categories successfully into different scopes' do
 54 |       lp = get_linnaeus_persistence
 55 |       add_categories lp
 56 | 
 57 |       lp2 = get_linnaeus_persistence(scope: 'new-scope')
 58 |       add_categories lp2, ['slack' , 'frop']
 59 | 
 60 |       lp2.get_categories.should match_array ['frop', 'slack']
 61 |       lp.get_categories.should match_array ['bar','baz','foo']
 62 |     end
 63 |   end
 64 | 
 65 |   it '#clear_all_training_data' do
 66 |     lp = get_linnaeus_persistence
 67 |     train_a_document_in 'testcategory'
 68 |     lp.get_words_with_count_for_category('testcategory').should_not be_empty
 69 |     lp.clear_all_training_data
 70 |     lp.get_words_with_count_for_category('testcategory').should be_empty
 71 |   end
 72 | 
 73 |   it 'stores categories successfully' do
 74 |     lp = get_linnaeus_persistence
 75 |     add_categories lp
 76 |     lp.get_categories.should match_array ['bar','baz','foo']
 77 |   end
 78 | 
 79 |   it 'can remove categories' do
 80 |     lp = get_linnaeus_persistence
 81 |     add_categories lp
 82 |     lp.remove_category 'bar'
 83 |     lp.get_categories.should match_array ['baz','foo']
 84 |   end
 85 | 
 86 |   it '#get_words_with_count_for_category' do
 87 |     lp = get_linnaeus_persistence
 88 |     train_a_document_in 'testcategory'
 89 |     lp.get_words_with_count_for_category('testcategory').should eq({
 90 |       "test"=>"1", "document"=>"1", "stuff"=>"1",
 91 |       "bayesian"=>"1", "corpu"=>"1"
 92 |     })
 93 |   end
 94 | 
 95 |   it '#increment_word_counts_for_category' do
 96 |     lp = get_linnaeus_persistence
 97 |     train_a_document_in 'testcategory'
 98 |     train_a_document_in 'testcategory'
 99 |     lp.get_words_with_count_for_category('testcategory').should eq({
100 |       "test"=>"2", "document"=>"2", "stuff"=>"2",
101 |       "bayesian"=>"2", "corpu"=>"2"
102 |     })
103 |   end
104 | 
105 |   it '#decrement_word_counts_for_category' do
106 |     lp = get_linnaeus_persistence
107 |     train_a_document_in 'testcategory'
108 |     train_a_document_in 'testcategory'
109 |     untrain_a_document_in 'testcategory'
110 |     lp.get_words_with_count_for_category('testcategory').should eq({
111 |       "test"=>"1", "document"=>"1", "stuff"=>"1",
112 |       "bayesian"=>"1", "corpu"=>"1"
113 |     })
114 |   end
115 | 
116 |   it '#cleanup_empty_words_in_category' do
117 |     lp = get_linnaeus_persistence
118 |     train_a_document_in 'testcategory'
119 |     untrain_a_document_in 'testcategory'
120 |     lp.get_words_with_count_for_category('testcategory').should eq ({})
121 |   end
122 | 
123 |   def add_categories(lp, categories = ['foo','bar','baz','foo', 'bar'])
124 |     lp.add_categories(categories)
125 |   end
126 | 
127 |   def get_linnaeus_persistence(options = {})
128 |     Linnaeus::Persistence.new(options)
129 |   end
130 | 
131 |   def train_a_document_in(category, options = {})
132 |     lt = Linnaeus::Trainer.new(options)
133 |     lt.train category, document
134 |   end
135 | 
136 |   def untrain_a_document_in(category, options = {})
137 |     lt = Linnaeus::Trainer.new(options)
138 |     lt.untrain category, document
139 |   end
140 | 
141 |   def document
142 |     'I am a test document and I will have stuff in the bayesian corpus'
143 |   end
144 | end
145 | 


--------------------------------------------------------------------------------
/spec/linnaeus_spec.rb:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2 | 
# Placeholder suite for the top-level Linnaeus constant; it declares no
# examples.
describe(Linnaeus) {}
5 | 


--------------------------------------------------------------------------------
/spec/linnaeus_stopwords_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
 2 | 
 3 | describe Linnaeus::Stopwords do
 4 |   subject { Linnaeus::Stopwords.new }
 5 |   it '.to_a' do
 6 |     subject.should respond_to :to_a
 7 |     subject.to_a.should be_an_instance_of Array
 8 |     subject.to_a.should include 'the'
 9 |   end
10 |   it '.to_set' do
11 |     subject.should respond_to :to_set
12 |     subject.to_set.should be_an_instance_of Set
13 |     subject.to_set.should include 'the'
14 |   end
15 |   it 'can have stopwords overridden' do
16 |     subject.stopwords = ['foo','bar']
17 |     subject.to_a.should match_array ['foo','bar']
18 |     subject.to_set.should eq ['foo','bar'].to_set
19 |   end
20 | end
21 | 


--------------------------------------------------------------------------------
/spec/linnaeus_trainer_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
 2 | 
 3 | describe Linnaeus::Trainer do
 4 |   context 'with default options' do
 5 |     subject { Linnaeus::Trainer.new }
 6 | 
 7 |     it 'should count word occurrencs properly' do
 8 |       subject.count_word_occurrences('foo bar foo baz').should == 
 9 |         { 'foo' => 2, 'bar' => 1, 'baz' => 1 }
10 |     end
11 | 
12 |     it 'should not count stopwords' do
13 |       subject.count_word_occurrences('foo the you').should == { 'foo' => 1 }
14 |     end
15 | 
16 |     it 'returns an empty hash when given an empty string' do
17 |       subject.count_word_occurrences.should == { }
18 |     end
19 | 
20 |     it 'should train documents properly' do
21 |       train_documents_properly
22 |     end
23 | 
24 |     it 'should partially untrain properly' do
25 |       lp = Linnaeus::Persistence.new
26 |       lp.clear_all_training_data
27 |       subject.train 'fruit', grape
28 |       subject.train 'fruit', orange
29 | 
30 |       subject.untrain 'fruit', grape
31 |       lp.get_words_with_count_for_category('fruit').should eq({"fruit"=>"1", "sweet"=>"1", "orang"=>"1", "round"=>"1", "citru"=>"1"})
32 |     end
33 | 
34 |     it 'should fully untrain properly' do
35 |       lp = Linnaeus::Persistence.new
36 |       lp.clear_all_training_data
37 |       subject.train 'fruit', grape
38 |       subject.untrain 'fruit', grape
39 |       lp.get_words_with_count_for_category('fruit').should eq({})
40 |     end
41 | 
42 |   end
43 | 
44 |   context 'with non-default stopwords' do
45 |     subject { Linnaeus::Trainer.new(stopwords_class: FooStop) }
46 |     it 'should count word occurrences properly' do
47 |       subject.count_word_occurrences('foo bar foo baz').should == { 'baz' => 1 }
48 |     end
49 |   end
50 | 
51 |   context 'with a custom scope' do
52 |     it 'should train on documents properly' do
53 |       train_documents_properly(scope: 'new-scope')
54 |     end
55 |   end
56 | 
57 |   def train_documents_properly(options = {})
58 |     lp = Linnaeus::Persistence.new(options)
59 |     lp.clear_all_training_data
60 |     subject = described_class.new(options)
61 |     subject.train 'fruit', grape
62 |     subject.train 'fruit', orange
63 |     lp.get_words_with_count_for_category('fruit').should eq(
64 |       {
65 |       "grape"=>"1", "purpl"=>"1", "blue"=>"1", "green"=>"1",
66 |       "fruit"=>"2", "sweet"=>"2", "wine"=>"1", "oval"=>"1",
67 |       "orang"=>"1", "round"=>"1", "citru"=>"1"
68 |     })
69 |   end
70 | 
71 |   def grape
72 |     'grape purple blue green fruit sweet wine oval'
73 |   end
74 | 
75 |   def orange
76 |     'orange round citrus fruit sweet'
77 |   end
78 | end
79 | 
# Minimal stopword provider passed to the trainer as `stopwords_class` in the
# 'with non-default stopwords' examples.
class FooStop
  # @return [Set<String>] a newly built set containing 'foo' and 'bar'
  def to_set
    Set['foo', 'bar']
  end
end
85 | 


--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
 1 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 2 | $LOAD_PATH.unshift(File.dirname(__FILE__))
 3 | require 'simplecov'
 4 | SimpleCov.start
 5 | require 'rspec'
 6 | require 'linnaeus'
 7 | 
 8 | # Requires supporting files with custom matchers and macros, etc,
 9 | # in ./support/ and its subdirectories.
10 | Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
11 | 
12 | RSpec.configure do |config|
13 |   
14 | end
15 | 


--------------------------------------------------------------------------------