├── spec ├── spec.opts ├── spec_helper.rb └── basset │ ├── vector_collection_spec.rb │ ├── parser_spec.rb │ └── feature_collection_spec.rb ├── lib ├── basset.rb └── basset │ ├── parser.rb │ ├── vector_collection.rb │ └── feature_collection.rb ├── basset.gemspec └── README.textile /spec/spec.opts: -------------------------------------------------------------------------------- 1 | --diff 2 | --color 3 | -------------------------------------------------------------------------------- /lib/basset.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__)) 2 | 3 | module Basset; end; 4 | 5 | require 'json' 6 | require 'basset/parser' 7 | require 'basset/feature_collection' 8 | require 'basset/vector_collection' 9 | 10 | module Basset 11 | VERSION = "2.0.1" 12 | end -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "rubygems" 2 | require "spec" 3 | 4 | # gem install redgreen for colored test output 5 | begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end 6 | 7 | path = File.expand_path(File.dirname(__FILE__) + "/../lib/") 8 | $LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path) 9 | 10 | require "lib/basset" 11 | -------------------------------------------------------------------------------- /spec/basset/vector_collection_spec.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/../spec_helper' 2 | 3 | describe "vector collection" do 4 | describe "entropy normalization" do 5 | it "calculates the global weight for each column" 6 | it "converts vectors online" 7 | it "converts vectors from input file and writes to an output file" do 8 | 9 | end 10 | end 11 | 12 | describe "tf-idf" do 13 | 14 | end 15 | end 
# --------------------------------------------------------------------------------
# /lib/basset/parser.rb
# --------------------------------------------------------------------------------
module Basset
  # Tokenizer that turns raw text into lower-cased word features, optionally
  # extended with underscore-joined n-grams.
  class Parser
    # Splits +text+ into features.
    #
    # text    - String to tokenize.
    # options - Hash of options; :ngrams is the largest n-gram size to emit
    #           (defaults to 1, i.e. unigrams only).
    #
    # Returns an Array of Strings: every unigram in input order, followed by
    # all bigrams, then all trigrams, and so on up to the requested size.
    def self.parse(text, options = {})
      unigrams = clean_text(text).split

      # Named max_n (not ngrams) so the local does not shadow the class method
      # that is called inside the block below.
      max_n = options[:ngrams] || 1
      unigrams + (2..max_n).map { |n| ngrams(unigrams, n) }.flatten
    end

    # Builds every run of +n+ consecutive unigrams, joined with "_".
    #
    # Returns an Array of Strings (empty when there are fewer than n unigrams).
    def self.ngrams(unigrams, n)
      grams = []
      unigrams.each_cons(n) { |window| grams << window.join("_") }
      grams
    end

    # Replaces every character that is not a letter with a single space and
    # lower-cases the result.
    #
    # The POSIX class [[:alpha:]] is used instead of \W / \d / "_" because \W
    # is ASCII-only in Ruby: with it, accented and other non-ASCII letters were
    # treated as separators and words such as "café" were cut to "caf". For
    # pure-ASCII input the result is byte-identical to the previous
    # three-pass implementation.
    def self.clean_text(text)
      text.gsub(/[^[:alpha:]]/, ' ').downcase
    end
  end
end

=begin
Remainder of this dump line, kept verbatim (not executed):

--------------------------------------------------------------------------------
/spec/basset/parser_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.dirname(__FILE__) + '/../spec_helper'
 2 |
 3 | describe "parsing" do
 4 |   it "should parse out punctuation" do
 5 |     Basset::Parser.parse("hello! there").should == %w[hello there]
 6 |   end
 7 |
 8 |   it "should parse out numbers" do
 9 |     Basset::Parser.parse("this 234 number3").should == %w[this number]
10 |   end
11 |
12 |   it "should optionally return bigrams" do
13 |     Basset::Parser.parse("hi there paul", :ngrams => 2).should == %w[hi there paul hi_there there_paul]
14 |   end
15 |
16 |   it "should downcase everything" do
17 |     Basset::Parser.parse("HelLo").should == %w[hello]
18 |   end
19 | end
20 |
--------------------------------------------------------------------------------
/basset.gemspec:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 |
 3 | Gem::Specification.new do |s|
 4 |   s.name = %q{basset}
 5 |   s.version = "2.0.1"
 6 |
 7 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to?
=end
:required_rubygems_version= 8 | s.authors = ["Paul Dix"] 9 | s.date = %q{2009-09-27} 10 | s.email = %q{paul@pauldix.net} 11 | s.files = [ 12 | "lib/basset.rb", 13 | "lib/basset/parser.rb", 14 | "README.textile", 15 | "spec/spec.opts", 16 | "spec/spec_helper.rb", 17 | "spec/basset/parser_spec.rb"] 18 | s.has_rdoc = true 19 | s.homepage = %q{http://github.com/pauldix/basset} 20 | s.require_paths = ["lib"] 21 | s.rubygems_version = %q{1.3.5} 22 | s.summary = %q{A wonderful hound that finds patterns in your data using machine learning.} 23 | 24 | if s.respond_to? :specification_version then 25 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 26 | s.specification_version = 2 27 | 28 | if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then 29 | else 30 | end 31 | else 32 | end 33 | end -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | h1. Basset 2 | 3 | "http://github.com/pauldix/basset":http://github.com/pauldix/basset 4 | 5 | h2. Summary 6 | 7 | A wonderful hound that finds patterns in your data using machine learning. 8 | 9 | h2. Description 10 | 11 | This library is under construction. I decided to reboot it from its former version. If for some reason you're still using the old version it can still be found at "http://github.com/pauldix/basset/tree/1.0.1":http://github.com/pauldix/basset/tree/1.0.1 12 | 13 | h2. Installation 14 | 15 |
16 |   gem install basset --source http://gemcutter.org
17 | 
18 | 19 | h2. Use 20 | 21 | awesomeness goes here 22 | 23 | h2. LICENSE 24 | 25 | (The MIT License) 26 | 27 | Copyright (c) 2009: 28 | 29 | "Paul Dix":http://pauldix.net 30 | 31 | Permission is hereby granted, free of charge, to any person obtaining 32 | a copy of this software and associated documentation files (the 33 | 'Software'), to deal in the Software without restriction, including 34 | without limitation the rights to use, copy, modify, merge, publish, 35 | distribute, sublicense, and/or sell copies of the Software, and to 36 | permit persons to whom the Software is furnished to do so, subject to 37 | the following conditions: 38 | 39 | The above copyright notice and this permission notice shall be 40 | included in all copies or substantial portions of the Software. 41 | 42 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 43 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 44 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 45 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 46 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 47 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 48 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# --------------------------------------------------------------------------------
# /lib/basset/vector_collection.rb
# --------------------------------------------------------------------------------
module Basset
  # Applies entropy normalization to a file of sparse count vectors.
  #
  # Each input line is one vector written as "column,count;column,count;...".
  # The input object must respond to #each (yielding lines) and #rewind,
  # because the data is read in several passes.
  class VectorCollection
    # options - Hash; :feature_count optionally pre-sizes the per-column
    #           arrays (columns beyond it are still accepted).
    def initialize(options = {})
      @feature_count = options[:feature_count]
      @global_frequencies = array_size(@feature_count)
      @global_weights = array_size(@feature_count)
      @vector_count = 0
    end

    # Computes the global weights from +input_file+, then writes a header line
    # "<number of columns>,<number of vectors>" followed by one re-weighted
    # vector per input line to +output_file+ (anything responding to #puts).
    def entropy_normalize_vectors(input_file, output_file)
      compute_weights(input_file)

      output_file.puts "#{@global_weights.size},#{@vector_count}"

      input_file.each do |line|
        output_vector = []
        execute_calculation_on_line(line) do |column, count|
          # NOTE(review): log-entropy weighting is usually defined with a local
          # weight of log(count + 1); this keeps the original (count + 1).
          # Confirm against the tutorial referenced below before changing.
          output_vector << "#{column},#{@global_weights[column] * (count + 1)}"
        end
        output_file.puts(output_vector.join(";"))
      end
      # Leave the input positioned at the start, like the other passes do.
      input_file.rewind
    end

    # the weights computed are used for entropy normalization as documented here:
    # http://www.dcs.shef.ac.uk/~genevieve/lsa_tutorial.htm
    #
    # For every column: weight = 1 + sum(p * log(p)) / log(vector count), where
    # p = count in this vector / total count of the column.
    #
    # Returns the Array of weights, indexed by column.
    def compute_weights(input_file)
      compute_frequencies(input_file)

      # One slot per known column, so columns that were pre-sized but never
      # seen end up with the neutral weight 1 instead of nil.
      entropy = Array.new(@global_frequencies.size, 0.0)

      # With fewer than two vectors log(vector count) is 0 (or undefined), and
      # dividing by it would produce NaN; the entropy term is skipped then.
      if @vector_count > 1
        vector_count_log = Math.log(@vector_count)

        input_file.each do |line|
          execute_calculation_on_line(line) do |column, count|
            next if count.zero? # 0 * log(0) would be NaN; it contributes nothing
            probability = count / @global_frequencies[column]
            entropy[column] += (probability * Math.log(probability)) / vector_count_log
          end
        end
        input_file.rewind
      end

      @global_weights = entropy.map { |sum| sum + 1 }
    end

    # Counts the vectors (lines) in +input_file+ and sums the counts of every
    # column. Both accumulators are reset first, so repeated runs do not
    # double-count.
    def compute_frequencies(input_file)
      @vector_count = 0
      @global_frequencies = array_size(@feature_count)

      input_file.each do |line|
        @vector_count += 1 # was "+= 0", which left the count at zero forever
        execute_calculation_on_line(line) do |column, count|
          @global_frequencies[column] ||= 0
          @global_frequencies[column] += count
        end
      end

      input_file.rewind
    end

    # Yields (column, count) for every entry of the vector encoded in +line+.
    def execute_calculation_on_line(line)
      line_to_vector(line).each do |column, count|
        yield(column, count)
      end
    end

    # Parses "column,count;column,count" into [[Integer, Float], ...].
    #
    # The pieces are converted to numbers here because they are used as Array
    # indices and in arithmetic; the raw Strings raised TypeError. Surrounding
    # whitespace (including the trailing newline) is ignored and a blank line
    # yields an empty vector. Malformed entries raise ArgumentError/TypeError.
    def line_to_vector(line)
      line.strip.split(";").reject { |pair| pair.empty? }.map do |pair|
        column, count = pair.split(",")
        [Integer(column), Float(count)]
      end
    end

    # Returns a nil-filled Array of +count+ slots, or an empty Array when no
    # count is given.
    def array_size(count)
      count.nil? ? [] : Array.new(count)
    end
  end
end

# --------------------------------------------------------------------------------
# /lib/basset/feature_collection.rb
# --------------------------------------------------------------------------------
module Basset
  # Assigns a stable index to every distinct feature and tracks in how many
  # rows (documents) each feature occurs.
  #
  # Internally every feature maps to a two-element Array
  # [index, document_frequency].
  class FeatureCollection
    attr_accessor :row_count

    # options - Hash with optional :feature_map (feature => [index, count])
    #           and :row_count, as produced by #serializable_hash_map.
    def initialize(options = {})
      @feature_map = options[:feature_map] || {}
      @row_count = options[:row_count] || 0
      # Rebuilt from the stored indices so that a collection restored from
      # JSON can still be purged and renumbered in insertion order.
      @ordered_features = @feature_map.keys.sort_by { |feature| @feature_map[feature][0] }
    end

    # Registers one row. Each distinct feature of the row increases that
    # feature's document frequency by one; unseen features get the next index.
    def add_row(features)
      @row_count += 1
      features.uniq.each do |feature|
        entry = @feature_map[feature]
        if entry
          entry[1] += 1
        else
          @ordered_features << feature
          @feature_map[feature] = [@feature_map.size, 1]
        end
      end
    end

    # Number of distinct features.
    def feature_count
      @feature_map.size
    end

    # All feature names, sorted.
    def features
      @feature_map.keys.sort
    end

    # Index of +feature+, or nil when it is not in the collection.
    def index_of(feature)
      entry = @feature_map[feature]
      entry[0] if entry
    end

    # Number of rows +feature+ occurred in, or nil when it is unknown.
    def document_frequency(feature)
      entry = @feature_map[feature]
      entry[1] if entry
    end

    # Dense count vector: one slot per known feature, unknown features ignored.
    def features_to_vector(features)
      vector = Array.new(@feature_map.size, 0)
      features.each do |feature|
        index = index_of(feature)
        vector[index] += 1 if index
      end
      vector
    end

    # Sparse vector as an Array of [index, value] pairs sorted by index.
    #
    # The options hash has one option :value which can be :tf (the default,
    # raw counts), :boolean, :tf_idf or :sublinear_tf_idf.
    # Note that if you choose an idf variant, you should do it only after you
    # have added all rows. This typically means looping through your data set
    # twice: first to add rows, second to extract sparse feature vectors.
    # Entries whose weighted value is 0 are left out.
    def features_to_sparse_vector(features, options = {})
      sparse_vector = Hash.new(0)

      features.each do |feature|
        index = index_of(feature)
        sparse_vector[index] += 1 if index
      end

      case options[:value]
      when :boolean
        sparse_vector.keys.each { |key| sparse_vector[key] = 1 }
      when :tf_idf
        idf(features, sparse_vector)
      when :sublinear_tf_idf
        sparse_vector.keys.each { |key| sparse_vector[key] = 1 + Math.log10(sparse_vector[key]) }
        idf(features, sparse_vector)
      end

      sparse_vector.keys.sort.map { |key| [key, sparse_vector[key]] }
    end

    # Multiplies every entry of +sparse_vector+ (index => value, modified in
    # place) by log10(row_count / document_frequency) of its feature and
    # removes entries that become 0.
    #
    # The ratio is computed in floating point: Integer division truncated
    # e.g. 3 / 2 to 1, giving an idf of 0 and silently dropping the feature.
    # The idf is recomputed on every call instead of being appended to the
    # feature-map entry, so it can neither go stale after #add_row nor leak
    # into the serialized JSON.
    def idf(features, sparse_vector)
      features.uniq.each do |feature|
        index, document_count = @feature_map[feature]
        next unless index && sparse_vector.key?(index)

        weight = sparse_vector[index] * Math.log10(@row_count.to_f / document_count)
        if weight == 0
          sparse_vector.delete(index)
        else
          sparse_vector[index] = weight
        end
      end
    end

    # Removes every feature that occurs in fewer than +times+ rows and
    # renumbers the remaining ones from 0, preserving insertion order.
    def purge_features_occuring_less_than(times)
      # delete_if instead of deleting inside each_pair while iterating.
      @feature_map.delete_if { |_feature, entry| entry[1] < times }
      # Forget purged names so a feature re-added later is not listed twice.
      @ordered_features = @ordered_features.select { |feature| @feature_map.key?(feature) }
      @ordered_features.each_with_index do |feature, index|
        @feature_map[feature][0] = index
      end
    end
    # Correctly spelled alias; the original name is kept for existing callers.
    alias_method :purge_features_occurring_less_than, :purge_features_occuring_less_than

    # Plain Hash holding everything needed to rebuild the collection.
    def serializable_hash_map
      {
        :row_count => @row_count,
        :feature_map => @feature_map
      }
    end

    # JSON representation. Accepts (and forwards) the generator arguments that
    # the json library passes when this object is nested inside another
    # structure; the zero-argument form raised ArgumentError in that case.
    def to_json(*args)
      serializable_hash_map.to_json(*args)
    end

    # Builds a collection from an already parsed JSON Hash (String keys).
    def self.from_json_hash(json)
      new({
        :feature_map => json["feature_map"],
        :row_count => json["row_count"]
      })
    end

    # Builds a collection from the String produced by #to_json.
    def self.from_json(json_string)
      from_json_hash(JSON.parse(json_string))
    end
  end
end

=begin
Remainder of this dump line, kept verbatim (not executed):

--------------------------------------------------------------------------------
/spec/basset/feature_collection_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.dirname(__FILE__) + '/../spec_helper'
 2 |
 3 | describe "feature collection" do
 4 |   describe "numbering features" do
 5 |     before(:each) do
 6 |       @collection = Basset::FeatureCollection.new
 7 |       @collection.add_row %w[hello paul hello]
 8 |     end
 9 |
10 |     it "takes rows of features" do
11 |       @collection.features.should == %w[hello paul]
12 |     end
13 |
14 |     it "counts how many rows have been added" do
15 |       @collection.row_count.should == 1
16 |     end
17 |
18 |     it "counts the number of unique features" do
19 |       @collection.feature_count.should == 2
20 |     end
21 |
22 |     it "keeps the index of a feature" do
23 |       @collection.index_of("hello").should == 0
24 |       @collection.index_of("paul").should == 1
25 |     end
26 |
27 |     it "returns nil as the index for a feature not in the collection" do
28 |       @collection.index_of("whatevs").should == nil
29 |     end
30 |
31 |     it "knows the number of rows a feature occurs in" do
32 |       @collection.add_row %w[code hello paul and paul]
33 |       @collection.document_frequency("code").should == 1
34 |       @collection.document_frequency("hello").should == 2
35 |       @collection.document_frequency("paul").should == 2
36 |     end
37 |
38 |     it "should remove features that occur under a given number of times and renumber all others while preserving insertion order" do
39 |       collection = Basset::FeatureCollection.new
40 |       collection.add_row %w[hello basset library hello library]
41 |       collection.add_row %w[basset is sweet hello]
42 |       collection.purge_features_occuring_less_than(2)
43 |       collection.features.size.should == 2
44 |       collection.index_of("hello").should == 0
45 |       collection.index_of("basset").should == 1
46 |     end
47 |   end
48 |
49 |
=end
describe "extracing feature vectors" do 50 | before(:each) do 51 | @collection = Basset::FeatureCollection.new() 52 | @collection.add_row %w[hello paul basset] 53 | @collection.add_row %w[basset is a ruby library] 54 | end 55 | 56 | it "can return a regular array with feature counts" do 57 | @collection.features_to_vector(%w[basset is written by paul is]).should == [0, 1, 1, 2, 0, 0, 0] 58 | end 59 | 60 | it "can extract a sparse vector format" do 61 | @collection.features_to_sparse_vector(%w[basset is written by paul is library]).should == [[1,1], [2,1], [3,2], [6,1]] 62 | end 63 | 64 | it "calculates a sparse vector with tf counts" do 65 | @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :tf).should == [[1,1], [2,1], [3,2], [6,1]] 66 | end 67 | 68 | it "calculates a sparse vector with booleans on if a feature appeared" do 69 | @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :boolean).should == [[1,1], [2,1], [3,1], [6,1]] 70 | end 71 | 72 | it "calculates a sparse vector with tf-idf counts and should exclude values of 0" do 73 | @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :tf_idf).inspect.should == [[1, 0.301029995663981], [3, 0.602059991327962], [6, 0.301029995663981]].inspect 74 | end 75 | 76 | it "calculates a sparse vector with sublinear tf-idf counts and should exclude values of 0" do 77 | @collection.features_to_sparse_vector(%w[basset is written by paul is library], :value => :sublinear_tf_idf).inspect.should == [[1, 0.301029995663981], [3, 0.391649053953438], [6, 0.301029995663981]].inspect 78 | end 79 | end 80 | 81 | describe "serializin a feature collection" do 82 | it "can serialize to json" do 83 | collection = Basset::FeatureCollection.new 84 | collection.add_row %w[hello paul] 85 | collection.add_row %w[basset by paul] 86 | JSON.parse(collection.to_json).should == { 87 | "row_count" => 2, 88 | "feature_map" => { 89 | 
"hello" => [0, 1], 90 | "paul" => [1, 2], 91 | "basset" => [2, 1], 92 | "by" => [3, 1] 93 | } 94 | } 95 | end 96 | 97 | it "can marshall from json" do 98 | collection = Basset::FeatureCollection.new 99 | collection.add_row %w[paul hello paul] 100 | collection.add_row %w[basset by paul] 101 | 102 | marshalled_collection = Basset::FeatureCollection.from_json(collection.to_json) 103 | marshalled_collection.document_frequency("paul").should == 2 104 | marshalled_collection.index_of("by").should == 3 105 | end 106 | end 107 | end 108 | --------------------------------------------------------------------------------