├── .document ├── .gitignore ├── .rspec ├── .ruby-version ├── .travis.yml ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── Rakefile ├── VERSION ├── images └── linnaeus.jpg ├── lib ├── linnaeus.rb └── linnaeus │ ├── classifier.rb │ ├── persistence.rb │ ├── stopwords.rb │ └── trainer.rb ├── linnaeus.gemspec └── spec ├── linnaeus_classifier_spec.rb ├── linnaeus_persistence_spec.rb ├── linnaeus_spec.rb ├── linnaeus_stopwords_spec.rb ├── linnaeus_trainer_spec.rb └── spec_helper.rb /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | features/**/*.feature 5 | LICENSE.txt 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # rcov generated 2 | coverage 3 | coverage.data 4 | 5 | # rdoc generated 6 | rdoc 7 | 8 | # yard generated 9 | doc 10 | .yardoc 11 | 12 | # bundler 13 | .bundle 14 | 15 | # jeweler generated 16 | pkg 17 | 18 | # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore: 19 | # 20 | # * Create a file at ~/.gitignore 21 | # * Include files you want ignored 22 | # * Run: git config --global core.excludesfile ~/.gitignore 23 | # 24 | # After doing this, these files will be ignored in all your git projects, 25 | # saving you from having to 'pollute' every project you touch with them 26 | # 27 | # Not sure what to needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line) 28 | # 29 | # For MacOS: 30 | # 31 | #.DS_Store 32 | 33 | # For TextMate 34 | #*.tmproj 35 | #tmtags 36 | 37 | # For emacs: 38 | #*~ 39 | #\#* 40 | #.\#* 41 | 42 | # For vim: 43 | #*.swp 44 | 45 | # For redcar: 46 | #.redcar 47 | 48 | # For rubinius: 49 | #*.rbc 50 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --colour 2 | --order rand 3 | -f d 4 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.0.0-p353 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.0.0 4 | - 1.9.3 5 | - 1.9.2 6 | services: 7 | - redis 8 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'http://rubygems.org' 2 | 3 | gem 'redis', '~> 3.0.x' 4 | gem 'stemmer', '~> 1.0.x' 5 | 6 | # Add dependencies to develop your gem here. 7 | # Include everything needed to run rake, tests, features, etc. 8 | group :development do 9 | gem 'rspec', '~> 2.11.0' 10 | gem 'yard', '~> 0.7' 11 | gem 'rdoc', '~> 3.12' 12 | gem 'bundler' 13 | gem 'jeweler' 14 | gem 'simplecov' 15 | gem 'redcarpet' 16 | end 17 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: http://rubygems.org/ 3 | specs: 4 | addressable (2.3.5) 5 | builder (3.2.2) 6 | descendants_tracker (0.0.3) 7 | diff-lcs (1.1.3) 8 | docile (1.1.3) 9 | faraday (0.9.0) 10 | multipart-post (>= 1.2, < 3) 11 | git (1.2.6) 12 | github_api (0.11.3) 13 | addressable (~> 2.3) 14 | descendants_tracker (~> 0.0.1) 15 | faraday (~> 0.8, < 0.10) 16 | hashie (>= 1.2) 17 | multi_json (>= 1.7.5, < 2.0) 18 | nokogiri (~> 1.6.0) 19 | oauth2 20 | hashie (2.0.5) 21 | highline (1.6.21) 22 | jeweler (2.0.1) 23 | builder 24 | bundler (>= 1.0) 25 | git (>= 1.2.5) 26 | github_api 27 | highline (>= 1.6.15) 28 | nokogiri (>= 1.5.10) 29 | rake 30 | rdoc 31 | json (1.8.1) 32 | jwt (0.1.11) 33 | multi_json (>= 1.5) 34 | mini_portile (0.5.2) 35 | multi_json (1.9.0) 36 | multi_xml (0.5.5) 37 | multipart-post (2.0.0) 38 | nokogiri (1.6.1) 39 | mini_portile (~> 0.5.0) 40 | oauth2 (0.9.3) 41 | faraday (>= 0.8, < 0.10) 42 | jwt (~> 0.1.8) 43 | multi_json (~> 1.3) 44 | multi_xml (~> 0.5) 45 | rack (~> 1.2) 46 | rack (1.5.2) 47 | rake (10.1.1) 48 | rdoc (3.12.2) 49 | json (~> 1.4) 50 | redcarpet (3.1.1) 51 | redis (3.0.7) 52 | rspec (2.11.0) 53 | rspec-core (~> 2.11.0) 54 | rspec-expectations (~> 2.11.0) 55 | rspec-mocks (~> 2.11.0) 56 | rspec-core (2.11.1) 57 | rspec-expectations (2.11.3) 58 | diff-lcs (~> 1.1.3) 59 | rspec-mocks (2.11.3) 60 | simplecov (0.8.2) 61 | docile (~> 1.1.0) 62 | multi_json 63 | simplecov-html (~> 0.8.0) 64 | simplecov-html (0.8.0) 65 | stemmer (1.0.1) 66 | yard (0.8.7.3) 67 | 68 | PLATFORMS 69 | ruby 70 | 71 | DEPENDENCIES 72 | bundler 73 | jeweler 74 | rdoc (~> 3.12) 75 | redcarpet 76 | redis (~> 3.0.x) 77 | rspec (~> 2.11.0) 78 | simplecov 79 | stemmer (~> 1.0.x) 80 | yard (~> 0.7) 81 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Dan Collis-Puro 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Linnaeus [![Build Status](https://secure.travis-ci.org/djcp/linnaeus.png?branch=master)](http://travis-ci.org/djcp/linnaeus) 2 | 3 | ![Carl Linnaeus](https://raw.github.com/djcp/linnaeus/master/images/linnaeus.jpg) 4 | 5 | Linnaeus is a redis-backed naive Bayesian classification system. Please see the [rdoc](http://rubydoc.info/gems/linnaeus/) for more information. Ruby 1.9 is required. 6 | 7 | Examples 8 | -------- 9 | 10 | lt = Linnaeus::Trainer.new # Used to train documents 11 | lc = Linnaeus::Classifier.new # Used to classify documents 12 | 13 | lt.train 'language', 'Ruby is a dynamic, reflective, general-purpose object-oriented programming language that combines syntax inspired by Perl with Smalltalk-like features.' 14 | lt.train 'database', 'PostgreSQL, often simply Postgres, is an object-relational database management system (ORDBMS) available for many platforms including Linux, FreeBSD, Solaris, Microsoft Windows and Mac OS X.' 15 | 16 | lc.classify 'Perl is a high-level, general-purpose, interpreted, dynamic programming language.' # returns "language" 17 | 18 | 19 | Contributing to linnaeus 20 | ------------------------ 21 | 22 | * Submit bugs to the github issue tracker: https://github.com/djcp/linnaeus/issues 23 | * If you'd like to add a feature, please submit a description of it to the issue tracker so we can discuss. 24 | * If the feature makes sense, fork the github repository. Write rspec tests and issue a pull request when your change is done. 25 | 26 | The Future 27 | ---------- 28 | 29 | * Create additional storage backends - sqlite, postgresql, mongodb, etc. 30 | * Allow for weighting tweaks. 31 | 32 | Copyright 33 | --------- 34 | 35 | Copyright (c) 2012 Dan Collis-Puro. See LICENSE.txt for further details. 36 | 37 | Credits 38 | ------- 39 | 40 | * Image courtesy wikipedia. About Carl Linnaeus: http://en.wikipedia.org/wiki/Linnaeus 41 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'rubygems' 4 | require 'bundler' 5 | begin 6 | Bundler.setup(:default, :development) 7 | rescue Bundler::BundlerError => e 8 | $stderr.puts e.message 9 | $stderr.puts "Run `bundle install` to install missing gems" 10 | exit e.status_code 11 | end 12 | require 'rake' 13 | 14 | require 'jeweler' 15 | Jeweler::Tasks.new do |gem| 16 | # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options 17 | gem.name = "linnaeus" 18 | gem.homepage = "http://github.com/djcp/linnaeus" 19 | gem.license = "MIT" 20 | gem.summary = %Q{Another redis-backed Bayesian classifier} 21 | gem.description = %Q{Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification.} 22 | gem.email = "dan@collispuro.net" 23 | gem.authors = ["djcp"] 24 | # dependencies defined in Gemfile 25 | end 26 | Jeweler::RubygemsDotOrgTasks.new 27 | 28 | require 'rspec/core' 29 | require 'rspec/core/rake_task' 30 | RSpec::Core::RakeTask.new(:spec) do |spec| 31 | spec.pattern = FileList['spec/**/*_spec.rb'] 32 | end 33 | 34 | task :default => :spec 35 | 36 | require 'yard' 37 | YARD::Rake::YardocTask.new 38 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 1.2.0 -------------------------------------------------------------------------------- /images/linnaeus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djcp/linnaeus/463951f9091b3a2469bfc5a9653803dd6d8c6722/images/linnaeus.jpg -------------------------------------------------------------------------------- /lib/linnaeus.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__)) 2 | 3 | require 'redis' 4 | require 'stemmer' 5 | 6 | # The base class. You won't use this directly - use one of the subclasses. 7 | class Linnaeus 8 | 9 | def initialize(opts = {}) 10 | options = { 11 | persistence_class: Persistence, 12 | stopwords_class: Stopwords, 13 | skip_stemming: false, 14 | encoding: 'UTF-8' 15 | }.merge(opts) 16 | 17 | @db = options[:persistence_class].new(options) 18 | @stopword_generator = options[:stopwords_class].new 19 | @skip_stemming = options[:skip_stemming] 20 | @encoding = options[:encoding] 21 | end 22 | 23 | # Count occurences of words in a text corpus. 24 | # 25 | # == Parameters 26 | # text:: 27 | # A string representing a document. Stopwords are removed and words are stemmed using the "Stemmer" gem. 28 | def count_word_occurrences(text = '') 29 | count = {} 30 | text.encode(@encoding).downcase.split.each do |word| 31 | stemmed_word = (@skip_stemming) ? word : word.stem_porter 32 | unless stopwords.include? stemmed_word 33 | count[stemmed_word] = count[stemmed_word] ? count[stemmed_word] + 1 : 1 34 | end 35 | end 36 | count 37 | end 38 | 39 | private 40 | # Format categories for training or untraining. 41 | # 42 | # == Parameters 43 | # categories:: 44 | # A string or array of categories 45 | def normalize_categories(categories = []) 46 | [categories].flatten.collect do |cat| 47 | cat.to_s.encode(@encoding).downcase.gsub(/[^a-z\d\.\-_ ]/,'') 48 | end.reject{|cat| cat == ''}.compact 49 | end 50 | 51 | # Get a Set of stopwords to remove from documents for training / classifying. 52 | def stopwords 53 | @stopwords ||= @stopword_generator.to_set 54 | end 55 | 56 | end 57 | 58 | require 'set' 59 | require 'linnaeus/stopwords' 60 | require 'linnaeus/persistence' 61 | require 'linnaeus/trainer' 62 | require 'linnaeus/classifier' 63 | -------------------------------------------------------------------------------- /lib/linnaeus/classifier.rb: -------------------------------------------------------------------------------- 1 | # Classify documents against the Bayesian corpus. 2 | # 3 | # lc = Linnaeus::Classifier.new() 4 | # lc.classify 'a string of text' #a wild category appears 5 | # lc.classification_scores 'a different string of text' #a hash of categories and scores 6 | # 7 | # == Constructor Options 8 | # persistence_class:: 9 | # A class implementing persistence - the default (Linnaeus::Persistence) uses redis. 10 | # stopwords_class:: 11 | # A class that emits a set of stopwords. The default is Linnaeus::Stopwords 12 | # skip_stemming:: 13 | # Set to true to skip porter stemming. 14 | # encoding:: 15 | # Force text to use this character set. UTF-8 by default. 16 | # redis_connection:: 17 | # An instantiated Redis connection, allowing you to reuse an existing Redis connection. 18 | # redis_host:: 19 | # Passed to persistence class constructor. Defaults to "127.0.0.1" 20 | # redis_port:: 21 | # Passed to persistence class constructor. Defaults to "6379". 22 | # redis_db:: 23 | # Passed to persistence class constructor. Defaults to "0". 24 | # redis_*:: 25 | # Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection. 26 | class Linnaeus::Classifier < Linnaeus 27 | 28 | # Returns a hash of scores for each category in the Bayesian corpus. 29 | # The closer a score is to 0, the more likely a match it is. 30 | # 31 | # == Parameters 32 | # text:: 33 | # a string of text to classify. 34 | # 35 | # == Returns 36 | # a hash of categories with a score as the values. 37 | def classification_scores(text) 38 | scores = {} 39 | 40 | @db.get_categories.each do |category| 41 | words_with_count_for_category = @db.get_words_with_count_for_category category 42 | total_word_count_sum_for_category = words_with_count_for_category.values.reduce(0){|sum, count| sum += count.to_i} 43 | 44 | scores[category] = 0 45 | count_word_occurrences(text).each do |word, count| 46 | tmp_score = (words_with_count_for_category[word].nil?) ? 0.1 : words_with_count_for_category[word].to_i 47 | scores[category] += Math.log(tmp_score / total_word_count_sum_for_category.to_f) 48 | end 49 | end 50 | scores 51 | end 52 | 53 | # The most likely category for a document. 54 | # 55 | # == Parameters 56 | # text:: 57 | # a string of text to classify. 58 | # 59 | # == Returns 60 | # A string representing the most likely category. 61 | def classify(text) 62 | scores = classification_scores(text) 63 | if scores.any? 64 | (scores.sort_by { |a| -a[1] })[0][0] 65 | else 66 | '' 67 | end 68 | end 69 | 70 | end 71 | -------------------------------------------------------------------------------- /lib/linnaeus/persistence.rb: -------------------------------------------------------------------------------- 1 | # The redis persistence layer. 2 | class Linnaeus::Persistence < Linnaeus 3 | attr_accessor :redis 4 | 5 | def initialize(opts = {}) 6 | options = { 7 | redis_host: '127.0.0.1', 8 | redis_port: '6379', 9 | redis_db: 0, 10 | redis_scheme: "redis", 11 | redis_path: nil, 12 | redis_timeout: 5.0, 13 | redis_password: nil, 14 | redis_id: nil, 15 | redis_tcp_keepalive: 0, 16 | scope: nil 17 | }.merge(opts) 18 | 19 | @scope = options[:scope] 20 | 21 | if options[:redis_connection] 22 | @redis = options[:redis_connection] 23 | else 24 | @redis = Redis.new( 25 | host: options[:redis_host], 26 | port: options[:redis_port], 27 | db: options[:redis_db], 28 | scheme: options[:redis_scheme], 29 | path: options[:redis_path], 30 | timeout: options[:redis_timeout], 31 | password: options[:redis_password], 32 | id: options[:redis_id], 33 | tcp_keepalive: options[:redis_tcp_keepalive] 34 | ) 35 | end 36 | 37 | self 38 | end 39 | 40 | # Add categories to the bayesian corpus. 41 | # 42 | # == Parameters 43 | # categories:: 44 | # A string or array of categories. 45 | def add_categories(categories) 46 | @redis.sadd category_collection_key, categories 47 | end 48 | 49 | # Remove categories from the bayesian corpus 50 | # 51 | # == Parameters 52 | # categories:: 53 | # A string or array of categories. 54 | def remove_category(category) 55 | @redis.srem category_collection_key, category 56 | end 57 | 58 | # Get categories from the bayesian corpus 59 | # 60 | # == Parameters 61 | # categories:: 62 | # A string or array of categories. 63 | def get_categories 64 | @redis.smembers category_collection_key 65 | end 66 | 67 | # Get a list of words with their number of occurrences. 68 | # 69 | # == Parameters 70 | # category:: 71 | # A string representing a category. 72 | # 73 | # == Returns 74 | # A hash with the word counts for this category. 75 | def get_words_with_count_for_category(category) 76 | @redis.hgetall base_category_key + category 77 | end 78 | 79 | # Clear all training data from the backend. 80 | def clear_all_training_data 81 | @redis.flushdb 82 | end 83 | 84 | # Clear training data for the scope associated with this instance. 85 | def clear_training_data 86 | keys = @redis.keys(base_key.join(':') + '*') 87 | 88 | keys.each do |key| 89 | @redis.del key 90 | end 91 | end 92 | 93 | # Increment word counts within a category 94 | # 95 | # == Parameters 96 | # category:: 97 | # A string representing a category. 98 | # word_occurrences:: 99 | # A hash containing a count of the number of word occurences in a document 100 | def increment_word_counts_for_category(category, word_occurrences) 101 | word_occurrences.each do|word,count| 102 | @redis.hincrby base_category_key + category, word, count 103 | end 104 | end 105 | 106 | # Decrement word counts within a category. This is used when removing a document from the corpus. 107 | # 108 | # == Parameters 109 | # category:: 110 | # A string representing a category. 111 | # word_occurrences:: 112 | # A hash containing a count of the number of word occurences in a document 113 | def decrement_word_counts_for_category(category, word_occurrences) 114 | word_occurrences.each do|word,count| 115 | @redis.hincrby base_category_key + category, word, - count 116 | end 117 | end 118 | 119 | # Clean out words with a count of zero in a category. Used during untraining. 120 | # 121 | # == Parameters 122 | # category:: 123 | # A string representing a category. 124 | def cleanup_empty_words_in_category(category) 125 | word_counts = @redis.hgetall base_category_key + category 126 | empty_words = word_counts.select{|word, count| count.to_i <= 0} 127 | if empty_words == word_counts 128 | @redis.del base_category_key + category 129 | else 130 | if empty_words.any? 131 | @redis.hdel base_category_key + category, empty_words.keys 132 | end 133 | end 134 | end 135 | 136 | private 137 | 138 | # The Set (in the redis sense) of categories are stored in this key. 139 | def category_collection_key 140 | [ base_key, 'category' ].compact.join(':') 141 | end 142 | 143 | # The base key for a category within a scope in the redis corpus. Word 144 | # occurrence counts for a category appear under here. 145 | def base_category_key 146 | [ base_key, 'cat:' ].flatten.join(':') 147 | end 148 | 149 | def base_key 150 | [ 'Linnaeus', @scope ].compact 151 | end 152 | 153 | end 154 | -------------------------------------------------------------------------------- /lib/linnaeus/stopwords.rb: -------------------------------------------------------------------------------- 1 | # The stopword list - you can override this list by creating a stopword generator and registering it in the Linnaeus::Trainer or Linnaeus::Classifier constructors. 2 | class Linnaeus::Stopwords 3 | # The default stopword list. 4 | DEFAULT_STOPWORDS = %w(a able about across after all almost also am among an and any are as at be because been but by can cannot could dear did do does either else ever every for from get got had has have he her hers him his how however i if in into is it its just least let like likely may me might most must my neither no nor not of off often on only or other our own rather said say says she should since so some than that the their them then there these they this tis to too twas us wants was we were what when where which while who whom why will with would yet you your) 5 | 6 | attr_accessor :stopwords 7 | 8 | # The list of stopwords as an array 9 | def to_a 10 | @stopwords || DEFAULT_STOPWORDS 11 | end 12 | 13 | # The list of stopwords as a ruby Set 14 | def to_set 15 | to_a.to_set 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/linnaeus/trainer.rb: -------------------------------------------------------------------------------- 1 | # Train or untrain documents from the Bayesian corpus. 2 | # 3 | # lt = Linnaeus::Trainer.new() 4 | # lt.train 'category', 'a string of text' 5 | # lt.train 'differentcategory', 'another string of text' 6 | # lt.untrain 'category', 'a document we just removed' 7 | # 8 | # == Constructor Options 9 | # persistence_class:: 10 | # A class implementing persistence - the default (Linnaeus::Persistence) uses redis. 11 | # stopwords_class:: 12 | # A class that emits a set of stopwords. The default is Linnaeus::Stopwords 13 | # skip_stemming:: 14 | # Set to true to skip porter stemming. 15 | # encoding:: 16 | # Force text to use this character set. UTF-8 by default. 17 | # redis_host:: 18 | # Passed to persistence class constructor. Defaults to "127.0.0.1" 19 | # redis_port:: 20 | # Passed to persistence class constructor. Defaults to "6379". 21 | # redis_db:: 22 | # Passed to persistence class constructor. Defaults to "0". 23 | # redis_*:: 24 | # Please see Linnaeus::Persistence for the rest of the options that're passed through directly to the Redis client connection. 25 | class Linnaeus::Trainer < Linnaeus 26 | 27 | # Add a document to the training corpus. 28 | # 29 | # == Parameters 30 | # categories:: 31 | # A string or array of categories 32 | # text:: 33 | # A string of text in this document. 34 | def train(categories, text) 35 | categories = normalize_categories categories 36 | @db.add_categories(categories) 37 | 38 | word_occurrences = count_word_occurrences text 39 | categories.each do|cat| 40 | @db.increment_word_counts_for_category cat, word_occurrences 41 | end 42 | end 43 | 44 | # Remove a document from the training corpus. 45 | # 46 | # == Parameters 47 | # categories:: 48 | # A string or array of categories 49 | # text:: 50 | # A string of text in this document. 51 | def untrain(categories, text) 52 | categories = normalize_categories categories 53 | 54 | word_occurrences = count_word_occurrences text 55 | categories.each do|cat| 56 | @db.decrement_word_counts_for_category cat, word_occurrences 57 | @db.cleanup_empty_words_in_category cat 58 | end 59 | end 60 | 61 | end 62 | -------------------------------------------------------------------------------- /linnaeus.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | # stub: linnaeus 1.2.0 ruby lib 6 | 7 | Gem::Specification.new do |s| 8 | s.name = "linnaeus" 9 | s.version = "1.2.0" 10 | 11 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= 12 | s.require_paths = ["lib"] 13 | s.authors = ["djcp"] 14 | s.date = "2014-03-18" 15 | s.description = "Linnaeus provides a redis-backed Bayesian classifier. Words are stemmed, stopwords are stopped, and redis is used to allow for persistent and concurrent training and classification." 16 | s.email = "dan@collispuro.net" 17 | s.extra_rdoc_files = [ 18 | "LICENSE.txt", 19 | "README.md" 20 | ] 21 | s.files = [ 22 | ".document", 23 | ".rspec", 24 | ".ruby-version", 25 | ".travis.yml", 26 | "Gemfile", 27 | "Gemfile.lock", 28 | "LICENSE.txt", 29 | "README.md", 30 | "Rakefile", 31 | "VERSION", 32 | "images/linnaeus.jpg", 33 | "lib/linnaeus.rb", 34 | "lib/linnaeus/classifier.rb", 35 | "lib/linnaeus/persistence.rb", 36 | "lib/linnaeus/stopwords.rb", 37 | "lib/linnaeus/trainer.rb", 38 | "linnaeus.gemspec", 39 | "spec/linnaeus_classifier_spec.rb", 40 | "spec/linnaeus_persistence_spec.rb", 41 | "spec/linnaeus_spec.rb", 42 | "spec/linnaeus_stopwords_spec.rb", 43 | "spec/linnaeus_trainer_spec.rb", 44 | "spec/spec_helper.rb" 45 | ] 46 | s.homepage = "http://github.com/djcp/linnaeus" 47 | s.licenses = ["MIT"] 48 | s.rubygems_version = "2.2.1" 49 | s.summary = "Another redis-backed Bayesian classifier" 50 | 51 | if s.respond_to? :specification_version then 52 | s.specification_version = 4 53 | 54 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 55 | s.add_runtime_dependency(%q, ["~> 3.0"]) 56 | s.add_runtime_dependency(%q, ["~> 1.0.0"]) 57 | s.add_development_dependency(%q, ["~> 2.11.0"]) 58 | s.add_development_dependency(%q, ["~> 0.7"]) 59 | s.add_development_dependency(%q, ["~> 3.12"]) 60 | s.add_development_dependency(%q, [">= 0"]) 61 | s.add_development_dependency(%q, [">= 0"]) 62 | s.add_development_dependency(%q, [">= 0"]) 63 | s.add_development_dependency(%q, [">= 0"]) 64 | else 65 | s.add_dependency(%q, ["~> 3.0"]) 66 | s.add_dependency(%q, ["~> 1.0.0"]) 67 | s.add_dependency(%q, ["~> 2.11.0"]) 68 | s.add_dependency(%q, ["~> 0.7"]) 69 | s.add_dependency(%q, ["~> 3.12"]) 70 | s.add_dependency(%q, [">= 0"]) 71 | s.add_dependency(%q, [">= 0"]) 72 | s.add_dependency(%q, [">= 0"]) 73 | s.add_dependency(%q, [">= 0"]) 74 | end 75 | else 76 | s.add_dependency(%q, ["~> 3.0"]) 77 | s.add_dependency(%q, ["~> 1.0.0"]) 78 | s.add_dependency(%q, ["~> 2.11.0"]) 79 | s.add_dependency(%q, ["~> 0.7"]) 80 | s.add_dependency(%q, ["~> 3.12"]) 81 | s.add_dependency(%q, [">= 0"]) 82 | s.add_dependency(%q, [">= 0"]) 83 | s.add_dependency(%q, [">= 0"]) 84 | s.add_dependency(%q, [">= 0"]) 85 | end 86 | end 87 | 88 | -------------------------------------------------------------------------------- /spec/linnaeus_classifier_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe Linnaeus::Classifier do 4 | context 'with no training data' do 5 | it 'should return empty values when attempting to classify' do 6 | Linnaeus::Persistence.new.clear_all_training_data 7 | subject.classify("foo bar baz").should be_empty 8 | subject.classification_scores("foo bar baz").should be_empty 9 | end 10 | end 11 | 12 | context 'with a very small dataset' do 13 | before do 14 | create_small_dataset 15 | end 16 | 17 | it 'should classify easy things well' do 18 | subject.classify('A bird that migrates').should eq('bird') 19 | subject.classify('This was directed by Gus Van Sant').should eq('movie') 20 | end 21 | 22 | it 'should return correct classification scores' do 23 | subject.classification_scores('a bird').should eq( 24 | { "movie"=>-6.272877006546167, "bird"=>-4.2626798770413155 } 25 | ) 26 | subject.classification_scores('a directorial bird').should eq( 27 | { "movie"=>-10.24316892009829, "bird"=>-10.827944847076676 } 28 | ) 29 | end 30 | end 31 | 32 | def create_small_dataset 33 | Linnaeus::Persistence.new.clear_all_training_data 34 | lt = Linnaeus::Trainer.new 35 | lt.train 'movie', "Gone with the Wind is a 1939 American historical epic film adapted from Margaret Mitchell's Pulitzer-winning 1936 novel of the same name." 36 | lt.train 'movie', "THX 1138 is a 1971 science fiction film directed by George Lucas in his feature directorial debut. The film was written by Lucas and Walter Murch." 37 | lt.train 'movie', "Top Gun is a 1986 American action drama film directed by Tony Scott, and produced by Don Simpson and Jerry Bruckheimer, in association with the Paramount Pictures company." 38 | 39 | lt.train 'bird', "The Yellow-throated Warbler (Setophaga dominica) is a small migratory songbird species breeding in temperate North America. It belongs to the New World warbler family (Parulidae)." 40 | lt.train 'bird', "The Blue Jay (Cyanocitta cristata) is a passerine bird in the family Corvidae, native to North America. It is resident through most of eastern and central United States and southern Canada, although western populations may be migratory." 41 | lt.train 'bird', "The Mallard or Wild Duck (Anas platyrhynchos) is a dabbling duck which breeds throughout the temperate and subtropical Americas, Europe, Asia, and North Africa, and has been introduced to New Zealand and Australia. This duck belongs to the subfamily Anatinae of the waterfowl family Anatidae" 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /spec/linnaeus_persistence_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe Linnaeus::Persistence do 4 | before do 5 | lp = get_linnaeus_persistence 6 | lp.clear_training_data 7 | end 8 | 9 | it "should accept an existing redis connection" do 10 | lp = Linnaeus::Persistence.new(redis_connection: Redis.new) 11 | lp.redis.should_not be_nil 12 | end 13 | 14 | it 'sets keys properly with defaults' do 15 | lp2 = get_linnaeus_persistence 16 | train_a_document_in('foobar') 17 | lp2.redis.keys('*').should match_array ['Linnaeus:category', 'Linnaeus:cat:foobar'] 18 | end 19 | 20 | context "custom scopes" do 21 | it 'sets keys properly' do 22 | lp2 = get_linnaeus_persistence(scope: 'new-scope') 23 | lp2.clear_all_training_data 24 | 25 | train_a_document_in('foobar', scope: 'new-scope') 26 | 27 | lp2.redis.keys('*').should match_array [ 28 | 'Linnaeus:new-scope:cat:foobar', 'Linnaeus:new-scope:category' 29 | ] 30 | end 31 | 32 | it 'can clear scoped training data separately' do 33 | lp = get_linnaeus_persistence 34 | 35 | train_a_document_in('foobar') 36 | 37 | lp2 = get_linnaeus_persistence(scope: 'new-scope') 38 | 39 | train_a_document_in('foobar', scope: 'new-scope') 40 | 41 | lp.redis.keys.should match_array [ 42 | "Linnaeus:cat:foobar", "Linnaeus:category", 43 | "Linnaeus:new-scope:cat:foobar", "Linnaeus:new-scope:category" 44 | ] 45 | 46 | lp2.clear_training_data 47 | 48 | lp.redis.keys.should match_array [ 49 | "Linnaeus:cat:foobar", "Linnaeus:category" 50 | ] 51 | end 52 | 53 | it 'stores categories successfully into different scopes' do 54 | lp = get_linnaeus_persistence 55 | add_categories lp 56 | 57 | lp2 = get_linnaeus_persistence(scope: 'new-scope') 58 | add_categories lp2, ['slack' , 'frop'] 59 | 60 | lp2.get_categories.should match_array ['frop', 'slack'] 61 | lp.get_categories.should match_array ['bar','baz','foo'] 62 | end 63 | end 64 | 65 | it '#clear_all_training_data' do 66 | lp = get_linnaeus_persistence 67 | train_a_document_in 'testcategory' 68 | lp.get_words_with_count_for_category('testcategory').should_not be_empty 69 | lp.clear_all_training_data 70 | lp.get_words_with_count_for_category('testcategory').should be_empty 71 | end 72 | 73 | it 'stores categories successfully' do 74 | lp = get_linnaeus_persistence 75 | add_categories lp 76 | lp.get_categories.should match_array ['bar','baz','foo'] 77 | end 78 | 79 | it 'can remove categories' do 80 | lp = get_linnaeus_persistence 81 | add_categories lp 82 | lp.remove_category 'bar' 83 | lp.get_categories.should match_array ['baz','foo'] 84 | end 85 | 86 | it '#get_words_with_count_for_category' do 87 | lp = get_linnaeus_persistence 88 | train_a_document_in 'testcategory' 89 | lp.get_words_with_count_for_category('testcategory').should eq({ 90 | "test"=>"1", "document"=>"1", "stuff"=>"1", 91 | "bayesian"=>"1", "corpu"=>"1" 92 | }) 93 | end 94 | 95 | it '#increment_word_counts_for_category' do 96 | lp = get_linnaeus_persistence 97 | train_a_document_in 'testcategory' 98 | train_a_document_in 'testcategory' 99 | lp.get_words_with_count_for_category('testcategory').should eq({ 100 | "test"=>"2", "document"=>"2", "stuff"=>"2", 101 | "bayesian"=>"2", "corpu"=>"2" 102 | }) 103 | end 104 | 105 | it '#decrement_word_counts_for_category' do 106 | lp = get_linnaeus_persistence 107 | train_a_document_in 'testcategory' 108 | train_a_document_in 'testcategory' 109 | untrain_a_document_in 'testcategory' 110 | lp.get_words_with_count_for_category('testcategory').should eq({ 111 | "test"=>"1", "document"=>"1", "stuff"=>"1", 112 | "bayesian"=>"1", "corpu"=>"1" 113 | }) 114 | end 115 | 116 | it '#cleanup_empty_words_in_category' do 117 | lp = get_linnaeus_persistence 118 | train_a_document_in 'testcategory' 119 | untrain_a_document_in 'testcategory' 120 | lp.get_words_with_count_for_category('testcategory').should eq ({}) 121 | end 122 | 123 | def add_categories(lp, categories = ['foo','bar','baz','foo', 'bar']) 124 | lp.add_categories(categories) 125 | end 126 | 127 | def get_linnaeus_persistence(options = {}) 128 | Linnaeus::Persistence.new(options) 129 | end 130 | 131 | def train_a_document_in(category, options = {}) 132 | lt = Linnaeus::Trainer.new(options) 133 | lt.train category, document 134 | end 135 | 136 | def untrain_a_document_in(category, options = {}) 137 | lt = Linnaeus::Trainer.new(options) 138 | lt.untrain category, document 139 | end 140 | 141 | def document 142 | 'I am a test document and I will have stuff in the bayesian corpus' 143 | end 144 | end 145 | -------------------------------------------------------------------------------- /spec/linnaeus_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe Linnaeus do 4 | end 5 | -------------------------------------------------------------------------------- /spec/linnaeus_stopwords_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe Linnaeus::Stopwords do 4 | subject { Linnaeus::Stopwords.new } 5 | it '.to_a' do 6 | subject.should respond_to :to_a 7 | subject.to_a.should be_an_instance_of Array 8 | subject.to_a.should include 'the' 9 | end 10 | it '.to_set' do 11 | subject.should respond_to :to_set 12 | subject.to_set.should be_an_instance_of Set 13 | subject.to_set.should include 'the' 14 | end 15 | it 'can have stopwords overridden' do 16 | subject.stopwords = ['foo','bar'] 17 | subject.to_a.should match_array ['foo','bar'] 18 | subject.to_set.should eq ['foo','bar'].to_set 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /spec/linnaeus_trainer_spec.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper') 2 | 3 | describe Linnaeus::Trainer do 4 | context 'with default options' do 5 | subject { Linnaeus::Trainer.new } 6 | 7 | it 'should count word occurrencs properly' do 8 | subject.count_word_occurrences('foo bar foo baz').should == 9 | { 'foo' => 2, 'bar' => 1, 'baz' => 1 } 10 | end 11 | 12 | it 'should not count stopwords' do 13 | subject.count_word_occurrences('foo the you').should == { 'foo' => 1 } 14 | end 15 | 16 | it 'returns an empty hash when given an empty string' do 17 | subject.count_word_occurrences.should == { } 18 | end 19 | 20 | it 'should train documents properly' do 21 | train_documents_properly 22 | end 23 | 24 | it 'should partially untrain properly' do 25 | lp = Linnaeus::Persistence.new 26 | lp.clear_all_training_data 27 | subject.train 'fruit', grape 28 | subject.train 'fruit', orange 29 | 30 | subject.untrain 'fruit', grape 31 | lp.get_words_with_count_for_category('fruit').should eq({"fruit"=>"1", "sweet"=>"1", "orang"=>"1", "round"=>"1", "citru"=>"1"}) 32 | end 33 | 34 | it 'should fully untrain properly' do 35 | lp = Linnaeus::Persistence.new 36 | lp.clear_all_training_data 37 | subject.train 'fruit', grape 38 | subject.untrain 'fruit', grape 39 | lp.get_words_with_count_for_category('fruit').should eq({}) 40 | end 41 | 42 | end 43 | 44 | context 'with non-default stopwords' do 45 | subject { Linnaeus::Trainer.new(stopwords_class: FooStop) } 46 | it 'should count word occurrences properly' do 47 | subject.count_word_occurrences('foo bar foo baz').should == { 'baz' => 1 } 48 | end 49 | end 50 | 51 | context 'with a custom scope' do 52 | it 'should train on documents properly' do 53 | train_documents_properly(scope: 'new-scope') 54 | end 55 | end 56 | 57 | def train_documents_properly(options = {}) 58 | lp = Linnaeus::Persistence.new(options) 59 | lp.clear_all_training_data 60 | subject = described_class.new(options) 61 | subject.train 'fruit', grape 62 | subject.train 'fruit', orange 63 | lp.get_words_with_count_for_category('fruit').should eq( 64 | { 65 | "grape"=>"1", "purpl"=>"1", "blue"=>"1", "green"=>"1", 66 | "fruit"=>"2", "sweet"=>"2", "wine"=>"1", "oval"=>"1", 67 | "orang"=>"1", "round"=>"1", "citru"=>"1" 68 | }) 69 | end 70 | 71 | def grape 72 | 'grape purple blue green fruit sweet wine oval' 73 | end 74 | 75 | def orange 76 | 'orange round citrus fruit sweet' 77 | end 78 | end 79 | 80 | class FooStop 81 | def to_set 82 | Set.new ['foo','bar'] 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 2 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 3 | require 'simplecov' 4 | SimpleCov.start 5 | require 'rspec' 6 | require 'linnaeus' 7 | 8 | # Requires supporting files with custom matchers and macros, etc, 9 | # in ./support/ and its subdirectories. 10 | Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f} 11 | 12 | RSpec.configure do |config| 13 | 14 | end 15 | --------------------------------------------------------------------------------