├── spec
    ├── spec.opts
    ├── spec_helper.rb
    └── basset
    │   ├── vector_collection_spec.rb
    │   ├── parser_spec.rb
    │   └── feature_collection_spec.rb
├── lib
    ├── basset.rb
    └── basset
    │   ├── parser.rb
    │   ├── vector_collection.rb
    │   └── feature_collection.rb
├── basset.gemspec
└── README.textile


/spec/spec.opts:
--------------------------------------------------------------------------------
1 | --diff
2 | --color
3 | 


--------------------------------------------------------------------------------
/lib/basset.rb:
--------------------------------------------------------------------------------
 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
 2 | 
 3 | module Basset; end;
 4 | 
 5 | require 'json'
 6 | require 'basset/parser'
 7 | require 'basset/feature_collection'
 8 | require 'basset/vector_collection'
 9 | 
module Basset
  # Library version. The same value is hard-coded as s.version in
  # basset.gemspec. Frozen so the shared constant cannot be mutated in place.
  VERSION = "2.0.1".freeze
end


--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
 1 | require "rubygems"
 2 | require "spec"
 3 | 
 4 | # gem install redgreen for colored test output
 5 | begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end
 6 | 
 7 | path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
 8 | $LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
 9 | 
10 | require "lib/basset"
11 | 


--------------------------------------------------------------------------------
/spec/basset/vector_collection_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.dirname(__FILE__) + '/../spec_helper'
 2 | 
 3 | describe "vector collection" do
 4 |   describe "entropy normalization" do
 5 |     it "calculates the global weight for each column"
 6 |     it "converts vectors online"
 7 |     it "converts vectors from input file and writes to an output file" do
 8 |       
 9 |     end
10 |   end
11 |   
12 |   describe "tf-idf" do
13 |     
14 |   end
15 | end


--------------------------------------------------------------------------------
/lib/basset/parser.rb:
--------------------------------------------------------------------------------
 1 | class Basset::Parser
 2 |   def self.parse(text, options = {})
 3 |     unigrams = clean_text(text).split
 4 |     
 5 |     ngrams = (options[:ngrams] || 1)
 6 |     (unigrams + (2..ngrams).map {|n| ngrams(unigrams, n)}).flatten
 7 |   end
 8 |   
 9 |   def self.ngrams(unigrams, n)
10 |     grams = []
11 |     unigrams.each_cons(n) {|a| grams << a.join("_")}
12 |     grams
13 |   end
14 | 
15 |   def self.clean_text(text)
16 |     #text.tr(',"#$%^&*()_=+[]{}\|<>/`~\—', " ") .tr("@'\-\'\”\‘\’0123456789", "")
17 |     text.gsub(/\W/, ' ').gsub(/\d/, ' ').tr('_', ' ').downcase
18 |   end
19 | end


--------------------------------------------------------------------------------
/spec/basset/parser_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.dirname(__FILE__) + '/../spec_helper'
 2 | 
 3 | describe "parsing" do
 4 |   it "should parse out punctuation" do
 5 |     Basset::Parser.parse("hello! there").should == %w[hello there]
 6 |   end
 7 |   
 8 |   it "should parse out numbers" do
 9 |     Basset::Parser.parse("this 234 number3").should == %w[this number]
10 |   end
11 |   
12 |   it "should optionally return bigrams" do
13 |     Basset::Parser.parse("hi there paul", :ngrams => 2).should == %w[hi there paul hi_there there_paul]
14 |   end
15 |   
16 |   it "should downcase everything" do
17 |     Basset::Parser.parse("HelLo").should == %w[hello]
18 |   end
19 | end
20 | 


--------------------------------------------------------------------------------
/basset.gemspec:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 |  
 3 | Gem::Specification.new do |s|
 4 |   s.name = %q{basset}
 5 |   s.version = "2.0.1"
 6 |  
 7 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
 8 |   s.authors = ["Paul Dix"]
 9 |   s.date = %q{2009-09-27}
10 |   s.email = %q{paul@pauldix.net}
11 |   s.files = [
12 |     "lib/basset.rb",
13 |     "lib/basset/parser.rb",
14 |     "README.textile",
15 |     "spec/spec.opts", 
16 |     "spec/spec_helper.rb",
17 |     "spec/basset/parser_spec.rb"]
18 |   s.has_rdoc = true
19 |   s.homepage = %q{http://github.com/pauldix/basset}
20 |   s.require_paths = ["lib"]
21 |   s.rubygems_version = %q{1.3.5}
22 |   s.summary = %q{A wonderful hound that finds patterns in your data using machine learning.}
23 |  
24 |   if s.respond_to? :specification_version then
25 |     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
26 |     s.specification_version = 2
27 |  
28 |     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
29 |     else
30 |     end
31 |   else
32 |   end
33 | end


--------------------------------------------------------------------------------
/README.textile:
--------------------------------------------------------------------------------
 1 | h1. Basset
 2 | 
 3 | "http://github.com/pauldix/basset":http://github.com/pauldix/basset
 4 | 
 5 | h2. Summary
 6 | 
 7 | A wonderful hound that finds patterns in your data using machine learning.
 8 | 
 9 | h2. Description
10 | 
11 | This library is under construction: I decided to reboot it rather than keep building on its former version. If for some reason you're still using the old version, it can still be found at "http://github.com/pauldix/basset/tree/1.0.1":http://github.com/pauldix/basset/tree/1.0.1
12 | 
13 | h2. Installation
14 | 
15 | <pre>
16 |   gem install basset --source http://gemcutter.org
17 | </pre>
18 | 
19 | h2. Use
20 | 
21 | awesomeness goes here
22 | 
23 | h2. LICENSE
24 | 
25 | (The MIT License)
26 |  
27 | Copyright (c) 2009:
28 |  
29 | "Paul Dix":http://pauldix.net
30 |  
31 | Permission is hereby granted, free of charge, to any person obtaining
32 | a copy of this software and associated documentation files (the
33 | 'Software'), to deal in the Software without restriction, including
34 | without limitation the rights to use, copy, modify, merge, publish,
35 | distribute, sublicense, and/or sell copies of the Software, and to
36 | permit persons to whom the Software is furnished to do so, subject to
37 | the following conditions:
38 |  
39 | The above copyright notice and this permission notice shall be
40 | included in all copies or substantial portions of the Software.
41 |  
42 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
43 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
45 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
46 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
47 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
48 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/lib/basset/vector_collection.rb:
--------------------------------------------------------------------------------
 1 | class Basset::VectorCollection
 2 |   def initialize(options)
 3 |     @global_frequencies = array_size(options[:feature_count])
 4 |     @global_weights     = array_size(options[:feature_count])
 5 |     @vector_count       = 0
 6 |   end
 7 |   
 8 |   def entropy_normalize_vectors(input_file, output_file)
 9 |     compute_weights(input_file)
10 |     
11 |     output_file.puts "#{@global_weights.size},#{@vector_count}"
12 |     
13 |     input_file.each do |line|
14 |       output_vector = []
15 |       execute_calculation_on_line(line) do |column, count|
16 |         output_vector << "#{column},#{@global_weights[column] * (count + 1)}"
17 |       end
18 |       output_file.puts(output_vector.join(";"))
19 |     end
20 |   end
21 |   
22 |   # the weights computed are used for entropy normalization as documented here:
23 |   # http://www.dcs.shef.ac.uk/~genevieve/lsa_tutorial.htm
24 |   def compute_weights(input_file)
25 |     compute_frequencies(input_file)
26 |     
27 |     vector_count_log = Math.log(@vector_count)
28 |     
29 |     input_file.each do |line|
30 |       execute_calculation_on_line(line) do |column, count|
31 |         p = count.to_f/@global_frequencies[column]
32 |         @global_weights[column] ||= 0
33 |         @global_weights[column] += (p * Math.log(p))/vector_count_log
34 |       end
35 |     end
36 |     input_file.rewind
37 |     
38 |     @global_weights = @global_weights.map {|w| w += 1}
39 |   end
40 |   
41 |   def compute_frequencies(input_file)
42 |     @vector_count = 0
43 |     input_file.each do |line|
44 |       @vector_count += 0
45 |       execute_calculation_on_line(line) do |column, count|
46 |         @global_frequencies[column] ||= 0
47 |         @global_frequencies[column] += count
48 |       end
49 |     end
50 |     
51 |     input_file.rewind
52 |   end
53 |   
54 |   def execute_calculation_on_line(line)
55 |     vector = line_to_vector(line)
56 |     vector.each do |column_and_count|
57 |       yield(column_and_count[0], column_and_count[1])
58 |     end    
59 |   end
60 |   
61 |   def line_to_vector(line)
62 |     line.split(";").map {|v| v.split(",")}
63 |   end
64 | 
65 |   def array_size(count)
66 |     return count.nil? ? [] : Array.new(count)
67 |   end  
68 | end


--------------------------------------------------------------------------------
/lib/basset/feature_collection.rb:
--------------------------------------------------------------------------------
  1 | class Basset::FeatureCollection
  2 |   attr_accessor :row_count
  3 | 
  4 |   def initialize(options = {})
  5 |     @feature_map = options[:feature_map] || {}
  6 |     @row_count = options[:row_count] || 0
  7 |     @ordered_features = []
  8 |   end
  9 | 
 10 |   def add_row(features)
 11 |     @row_count += 1
 12 |     features.uniq.each do |f|
 13 |       feature = @feature_map[f]
 14 |       if feature
 15 |         feature[1] += 1
 16 |       else
 17 |         @ordered_features << f
 18 |         @feature_map[f] = [@feature_map.size, 1]
 19 |       end
 20 |     end
 21 |   end
 22 | 
 23 |   def feature_count
 24 |     @feature_map.size
 25 |   end
 26 | 
 27 |   def features
 28 |     @feature_map.keys.sort
 29 |   end
 30 | 
 31 |   def index_of(feature)
 32 |     f = @feature_map[feature]
 33 |     f[0] if f
 34 |   end
 35 | 
 36 |   def document_frequency(feature)
 37 |     f = @feature_map[feature]
 38 |     f[1] if f
 39 |   end
 40 | 
 41 |   def features_to_vector(features)
 42 |     vector = Array.new(@feature_map.size, 0)
 43 |     features.each do |f|
 44 |       index = index_of(f)
 45 |       vector[index] += 1 if index
 46 |     end
 47 |     vector
 48 |   end
 49 | 
 50 |   # The options hash has one option :value which can be either :tf or :tf_idf.
 51 |   # Note that if you choose :tf_idf, you must do it only after you have added all rows.
 52 |   # this typically means looping through your data set twice. first to add rows, second to
 53 |   # extract sparse feature vectors
 54 |   def features_to_sparse_vector(features, options = {})
 55 |     sparse_vector = Hash.new {|h, k| h[k] = 0}
 56 | 
 57 |     features.each do |feature|
 58 |       index = index_of(feature)
 59 |       sparse_vector[index] += 1 if index
 60 |     end
 61 | 
 62 |     case options[:value]
 63 |     when :tf
 64 |       # do nothing, we're already good
 65 |     when :boolean
 66 |       sparse_vector.keys.each {|key| sparse_vector[key] = 1}
 67 |     when :tf_idf
 68 |       idf(features, sparse_vector)
 69 |     when :sublinear_tf_idf
 70 |       sparse_vector.keys.each {|key| sparse_vector[key] = 1 + Math.log10(sparse_vector[key])}
 71 |       idf(features, sparse_vector)
 72 |     end
 73 | 
 74 |     sparse_vector.keys.sort.map {|k| [k, sparse_vector[k]]}
 75 |   end
 76 | 
 77 |   def idf(features, sparse_vector)
 78 |     features.uniq.each do |feature|
 79 |       index_and_count = @feature_map[feature]
 80 |       if index_and_count
 81 |         if index_and_count.size == 3
 82 |           idf = index_and_count[2]
 83 |         else
 84 |           idf = Math.log10(@row_count / index_and_count[1])
 85 |           index_and_count << idf
 86 |         end
 87 |         index = index_and_count[0]
 88 |         val = sparse_vector[index] * idf
 89 |         if val == 0
 90 |           sparse_vector.delete index
 91 |         else
 92 |           sparse_vector[index] = val
 93 |         end
 94 |       end
 95 |     end
 96 |   end
 97 | 
 98 |   def purge_features_occuring_less_than(times)
 99 |     @feature_map.each_pair do |feature, index_and_count|
100 |       @feature_map.delete(feature) if index_and_count[1] < times
101 |     end
102 | 
103 |     index = 0
104 |     @ordered_features.each do |f|
105 |       index_and_count = @feature_map[f]
106 |       if index_and_count
107 |         index_and_count[0] = index
108 |         index += 1
109 |       end
110 |     end
111 |   end
112 | 
113 |   def serializable_hash_map
114 |     {
115 |       :row_count => @row_count,
116 |       :feature_map => @feature_map
117 |     }
118 |   end
119 | 
120 |   def to_json
121 |     serializable_hash_map.to_json
122 |   end
123 | 
124 |   def self.from_json_hash(json)
125 |     new({
126 |       :feature_map => json["feature_map"],
127 |       :row_count   => json["row_count"],
128 |     })
129 |   end
130 | 
131 |   def self.from_json(json_string)
132 |     json = JSON.parse(json_string)
133 |     from_json_hash(json)
134 |   end
135 | end


--------------------------------------------------------------------------------
/spec/basset/feature_collection_spec.rb:
--------------------------------------------------------------------------------
  1 | require File.dirname(__FILE__) + '/../spec_helper'
  2 | 
  3 | describe "feature collection" do
  4 |   describe "numbering features" do
  5 |     before(:each) do
  6 |       @collection = Basset::FeatureCollection.new
  7 |       @collection.add_row %w[hello paul hello]
  8 |     end
  9 | 
 10 |     it "takes rows of features" do
 11 |       @collection.features.should == %w[hello paul]
 12 |     end
 13 | 
 14 |     it "counts how many rows have been added" do
 15 |       @collection.row_count.should == 1
 16 |     end
 17 | 
 18 |     it "counts the number of unique features" do
 19 |       @collection.feature_count.should == 2
 20 |     end
 21 | 
 22 |     it "keeps the index of a feature" do
 23 |       @collection.index_of("hello").should == 0
 24 |       @collection.index_of("paul").should == 1
 25 |     end
 26 | 
 27 |     it "returns nil as the index for a feature not in the collection" do
 28 |       @collection.index_of("whatevs").should == nil
 29 |     end
 30 | 
 31 |     it "knows the number of rows a feature occurs in" do
 32 |       @collection.add_row %w[code hello paul and paul]
 33 |       @collection.document_frequency("code").should == 1
 34 |       @collection.document_frequency("hello").should == 2
 35 |       @collection.document_frequency("paul").should == 2
 36 |     end
 37 | 
 38 |     it "should remove features that occur under a given number of times and renumber all others while preserving insertion order" do
 39 |       collection = Basset::FeatureCollection.new
 40 |       collection.add_row %w[hello basset library hello library]
 41 |       collection.add_row %w[basset is sweet hello]
 42 |       collection.purge_features_occuring_less_than(2)
 43 |       collection.features.size.should == 2
 44 |       collection.index_of("hello").should == 0
 45 |       collection.index_of("basset").should == 1
 46 |     end
 47 |   end
 48 | 
 49 |   describe "extracing feature vectors" do
 50 |     before(:each) do
 51 |       @collection = Basset::FeatureCollection.new()
 52 |       @collection.add_row %w[hello paul basset]
 53 |       @collection.add_row %w[basset is a ruby library]
 54 |     end
 55 | 
 56 |     it "can return a regular array with feature counts" do
 57 |       @collection.features_to_vector(%w[basset is written by paul is]).should == [0, 1, 1, 2, 0, 0, 0]
 58 |     end
 59 | 
 60 |     it "can extract a sparse vector format" do
 61 |       @collection.features_to_sparse_vector(%w[basset is written by paul is library]).should == [[1,1], [2,1], [3,2], [6,1]]
 62 |     end
 63 | 
 64 |     it "calculates a sparse vector with tf counts" do
 65 |       @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :tf).should == [[1,1], [2,1], [3,2], [6,1]]
 66 |     end
 67 | 
 68 |     it "calculates a sparse vector with booleans on if a feature appeared" do
 69 |       @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :boolean).should == [[1,1], [2,1], [3,1], [6,1]]
 70 |     end
 71 | 
 72 |     it "calculates a sparse vector with tf-idf counts and should exclude values of 0" do
 73 |       @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :tf_idf).inspect.should == [[1, 0.301029995663981], [3, 0.602059991327962], [6, 0.301029995663981]].inspect
 74 |     end
 75 | 
 76 |     it "calculates a sparse vector with sublinear tf-idf counts and should exclude values of 0" do
 77 |       @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :sublinear_tf_idf).inspect.should == [[1, 0.301029995663981], [3, 0.391649053953438], [6, 0.301029995663981]].inspect
 78 |     end
 79 |   end
 80 | 
 81 |   describe "serializin a feature collection" do
 82 |     it "can serialize to json" do
 83 |       collection = Basset::FeatureCollection.new
 84 |       collection.add_row %w[hello paul]
 85 |       collection.add_row %w[basset by paul]
 86 |       JSON.parse(collection.to_json).should == {
 87 |         "row_count" => 2,
 88 |         "feature_map" => {
 89 |           "hello" => [0, 1],
 90 |           "paul" => [1, 2],
 91 |           "basset" => [2, 1],
 92 |           "by" => [3, 1]
 93 |         }
 94 |       }
 95 |     end
 96 | 
 97 |     it "can marshall from json" do
 98 |       collection = Basset::FeatureCollection.new
 99 |       collection.add_row %w[paul hello paul]
100 |       collection.add_row %w[basset by paul]
101 | 
102 |       marshalled_collection = Basset::FeatureCollection.from_json(collection.to_json)
103 |       marshalled_collection.document_frequency("paul").should == 2
104 |       marshalled_collection.index_of("by").should == 3
105 |     end
106 |   end
107 | end
108 | 


--------------------------------------------------------------------------------