├── .autotest ├── History.txt ├── Manifest.txt ├── README.txt ├── Rakefile ├── lib └── minhash.rb └── test └── test_minhash.rb /.autotest: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | require 'autotest/restart' 4 | 5 | # Autotest.add_hook :initialize do |at| 6 | # at.extra_files << "../some/external/dependency.rb" 7 | # 8 | # at.libs << ":../some/external" 9 | # 10 | # at.add_exception 'vendor' 11 | # 12 | # at.add_mapping(/dependency.rb/) do |f, _| 13 | # at.files_matching(/test_.*rb$/) 14 | # end 15 | # 16 | # %w(TestA TestB).each do |klass| 17 | # at.extra_class_map[klass] = "test/test_misc.rb" 18 | # end 19 | # end 20 | 21 | # Autotest.add_hook :run_command do |at| 22 | # system "rake build" 23 | # end 24 | -------------------------------------------------------------------------------- /History.txt: -------------------------------------------------------------------------------- 1 | === 1.0.0 / 2009-05-03 2 | 3 | * 1 major enhancement 4 | 5 | * Birthday! 6 | 7 | -------------------------------------------------------------------------------- /Manifest.txt: -------------------------------------------------------------------------------- 1 | History.txt 2 | Manifest.txt 3 | README.txt 4 | Rakefile 5 | lib/minhash.rb 6 | test/test_minhash.rb 7 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | = minhash 2 | 3 | * http://example.com 4 | 5 | == DESCRIPTION: 6 | 7 | Runs the MinHash algorithm across some strings using the murmur hash 8 | function. 9 | 10 | == FEATURES/PROBLEMS: 11 | 12 | * Does it right. 13 | 14 | * Only works well for datasets with fewer than 2**32 (~4 billion) 15 | unique items in them. Larger than that and, because our hash 16 | function uses unsigned 64-bit integers, we run into the birthday 17 | paradox. 
18 | 19 | == SYNOPSIS: 20 | # 23 can be any unsigned 32-bit integer, a la the MurmurHash docs. 21 | MinHash.minhash(['a', 'foo', 'c'], 23) 22 | 23 | == REQUIREMENTS: 24 | 25 | * murmur_hash gem 26 | 27 | == INSTALL: 28 | 29 | * sudo gem install minhash 30 | 31 | == LICENSE: 32 | 33 | (The MIT License) 34 | 35 | Copyright (c) 2009 Jeff Hodges 36 | 37 | Permission is hereby granted, free of charge, to any person obtaining 38 | a copy of this software and associated documentation files (the 39 | 'Software'), to deal in the Software without restriction, including 40 | without limitation the rights to use, copy, modify, merge, publish, 41 | distribute, sublicense, and/or sell copies of the Software, and to 42 | permit persons to whom the Software is furnished to do so, subject to 43 | the following conditions: 44 | 45 | The above copyright notice and this permission notice shall be 46 | included in all copies or substantial portions of the Software. 47 | 48 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 49 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 50 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 51 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 52 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 53 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 54 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# --------------------------------------------------------------------------------
# /Rakefile:
# --------------------------------------------------------------------------------
# -*- ruby -*-

require 'rubygems'
require 'hoe'

# The library is loaded only so that MinHash::VERSION can be read below.
# A LoadError is deliberately ignored so the Rakefile itself still loads
# when the library (or something it requires) cannot be loaded.
begin
  require './lib/minhash.rb'
rescue LoadError
end

# When the require above failed, MinHash is not defined, and evaluating
# `MinHash::VERSION || '0.0.0'` would raise NameError before the fallback
# could ever apply. `defined?` checks for the constant without raising.
version = defined?(MinHash::VERSION) ? MinHash::VERSION : '0.0.0'

Hoe.new('minhash', version) do |p|
  # p.rubyforge_name = 'minhash' # if different than lowercase project name
  p.developer('Jeff Hodges', 'jeff@somethingsimilar.com')
  p.extra_deps << ['murmur_hash', '1.0.1']
end

# vim: syntax=Ruby

# --------------------------------------------------------------------------------
# /lib/minhash.rb:
# --------------------------------------------------------------------------------
require 'murmur_hash'

# Computes the MinHash of a collection: every item is hashed with one of
# the MurmurHash functions and the smallest hash value is returned.
module MinHash
  VERSION = '0.9.9'

  # We use murmur_hash64 because we really do need 2**64 - 1 items and
  # I'm assuming all the machines it will be deployed on are
  # little-endian.

  # Returned when there is nothing to hash (see the methods below).
  Infinity = 1.0 / 0.0

  # Returns the smallest MurmurHash.murmur_hash64 value over +history+.
  #
  # history:: the items to hash; passed through Kernel#Array first, so a
  #           single item or nil is accepted as well as an array.
  # seed::    forwarded unchanged to MurmurHash.murmur_hash64 for every
  #           item (the README describes it as any unsigned 32-bit
  #           integer).
  #
  # Returns MinHash::Infinity when +history+ yields no items.
  def self.minhash(history, seed)
    smallest_hash(history) { |item| MurmurHash.murmur_hash64(item, seed) }
  end

  # Same as MinHash.minhash, but hashes each item with
  # MurmurHash.murmur_hash instead of MurmurHash.murmur_hash64.
  #
  # Returns MinHash::Infinity when +history+ yields no items.
  def self.minhash32(history, seed)
    smallest_hash(history) { |item| MurmurHash.murmur_hash(item, seed) }
  end

  # Shared implementation: yields every item of +history+ to the block
  # and returns the smallest block result. `min` is nil for an empty
  # collection, in which case Infinity is returned instead.
  def self.smallest_hash(history)
    Array(history).map { |item| yield item }.min || Infinity
  end
  private_class_method :smallest_hash
end

# --------------------------------------------------------------------------------
# /test/test_minhash.rb:
# --------------------------------------------------------------------------------
require "test/unit"
require "rubygems"
require "minhash"

class TestMinHash < Test::Unit::TestCase
  # The result must not depend on the order of the items.
  def test_minhash_works
    m = MinHash.minhash(["foo", "bar", "baz"], 23)
    assert_equal 1157430835185436796, m

    m = MinHash.minhash(["bar", "foo", "baz"], 23)
    assert_equal 1157430835185436796, m

    m = MinHash.minhash(["baz", "bar", "foo"], 23)
    assert_equal 1157430835185436796, m
  end

  # Same order-independence check for the murmur_hash based variant.
  def test_minhash32_works
    m = MinHash.minhash32(["foo", "bar", "baz"], 23)
    assert_equal 568256043, m

    m = MinHash.minhash32(["bar", "foo", "baz"], 23)
    assert_equal 568256043, m

    m = MinHash.minhash32(["baz", "bar", "foo"], 23)
    assert_equal 568256043, m
  end

  # With nothing to hash, both variants return MinHash::Infinity.
  def test_empty_history_returns_infinity
    assert_equal MinHash::Infinity, MinHash.minhash([], 23)
    assert_equal MinHash::Infinity, MinHash.minhash32([], 23)
  end
end
# --------------------------------------------------------------------------------