├── .gitignore ├── .rspec ├── .ruby-version ├── .travis.yml ├── .yardopts ├── Gemfile ├── Gemfile.lock ├── Guardfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── bench └── blurrily ├── blurrily.gemspec ├── doc ├── bench-delete.png ├── bench-find.png ├── bench-save.png └── bench.numbers ├── ext └── blurrily │ ├── blurrily.h │ ├── extconf.rb │ ├── map_ext.c │ ├── search_tree.c │ ├── search_tree.h │ ├── storage.c │ ├── storage.h │ ├── tokeniser.c │ └── tokeniser.h ├── lib ├── blurrily.rb └── blurrily │ ├── client.rb │ ├── command_processor.rb │ ├── defaults.rb │ ├── map.rb │ ├── map_group.rb │ ├── server.rb │ └── version.rb └── spec ├── blurrily ├── client_spec.rb ├── command_processor_spec.rb ├── map_group_spec.rb ├── map_spec.rb └── server_spec.rb ├── integration_spec.rb └── spec_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | # Package 2 | *.gem 3 | pkg 4 | tmp 5 | 6 | # Generated files for C extension 7 | Makefile 8 | *.o 9 | *.bundle 10 | *.so 11 | 12 | # Data files 13 | *.bz2 14 | *.trigrams 15 | 16 | # CTags 17 | .tags 18 | .tags_sorted_by_file 19 | 20 | # Mac turds 21 | .DS_Store 22 | 23 | # Docs 24 | doc/ 25 | .yardoc/ 26 | 27 | # Coveralls 28 | coverage/ 29 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --colour 2 | --backtrace 3 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.2.0 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | rvm: 2 | - "1.9.3" 3 | - "2.0.0" 4 | - "2.1.5" 5 | - "2.2.0" 6 | # increment to force build: 001 7 | -------------------------------------------------------------------------------- /.yardopts: -------------------------------------------------------------------------------- 1 | --no-private 2 | --protected 3 | --markup-provider=redcarpet 4 | --markup=markdown 5 | 6 | lib/**/*.rb - README.md LICENSE.txt 7 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in blurrily.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | blurrily (1.0.2) 5 | activesupport (~> 4) 6 | eventmachine 7 | 8 | GEM 9 | remote: https://rubygems.org/ 10 | specs: 11 | activesupport (4.2.0) 12 | i18n (~> 0.7) 13 | json (~> 1.7, >= 1.7.7) 14 | minitest (~> 5.1) 15 | thread_safe (~> 0.3, >= 0.3.4) 16 | tzinfo (~> 1.1) 17 | benchmark-ips (2.1.1) 18 | celluloid (0.16.0) 19 | timers (~> 4.0.0) 20 | coderay (1.1.0) 21 | coveralls (0.7.2) 22 | multi_json (~> 1.3) 23 | rest-client (= 1.6.7) 24 | simplecov (>= 0.7) 25 | term-ansicolor (= 1.2.2) 26 | thor (= 0.18.1) 27 | diff-lcs (1.2.5) 28 | docile (1.1.5) 29 | eventmachine (1.0.4) 30 | ffi (1.9.6) 31 | formatador (0.2.5) 32 | guard (2.11.1) 33 | formatador (>= 0.2.4) 34 | listen (~> 2.7) 35 | lumberjack (~> 1.0) 36 | nenv (~> 0.1) 37 | notiffany (~> 0.0) 38 | pry (>= 0.9.12) 39 | shellany (~> 0.0) 40 | thor (>= 0.18.1) 41 | guard-compat (1.2.1) 42 | guard-rspec (4.5.0) 43 | guard (~> 2.1) 44 | guard-compat (~> 1.1) 45 | rspec (>= 2.99.0, < 4.0) 46 | hitimes (1.2.2) 47 | i18n (0.7.0) 48 | json (1.8.2) 49 | listen (2.8.5) 50 | celluloid (>= 0.15.2) 51 | rb-fsevent (>= 0.9.3) 52 | rb-inotify (>= 0.9) 53 | lumberjack (1.0.9) 54 | method_source (0.8.2) 55 | mime-types (2.4.3) 56 | minitest (5.5.1) 57 | multi_json (1.10.1) 58 | nenv (0.2.0) 59 | notiffany (0.0.3) 60 | nenv (~> 0.1) 61 | shellany (~> 0.0) 62 | progressbar (0.21.0) 63 | pry (0.10.1) 64 | coderay (~> 1.1.0) 65 | method_source (~> 0.8.1) 66 | slop (~> 3.4) 67 | pry-doc (0.6.0) 68 | pry (~> 0.9) 69 | yard (~> 0.8) 70 | pry-nav (0.2.4) 71 | pry (>= 0.9.10, < 0.11.0) 72 | rake (10.4.2) 73 | rake-compiler (0.9.5) 74 | rake 75 | rb-fsevent (0.9.4) 76 | rb-inotify (0.9.5) 77 | ffi (>= 0.5.0) 78 | rest-client (1.6.7) 79 | mime-types (>= 1.16) 80 | rspec (3.1.0) 81 | rspec-core (~> 3.1.0) 82 | rspec-expectations (~> 3.1.0) 83 | rspec-mocks (~> 3.1.0) 84 | rspec-core (3.1.7) 85 | rspec-support (~> 3.1.0) 86 | rspec-expectations (3.1.2) 87 | diff-lcs (>= 1.2.0, < 2.0) 88 | rspec-support (~> 3.1.0) 89 | rspec-mocks (3.1.3) 90 | rspec-support (~> 3.1.0) 91 | rspec-support (3.1.2) 92 | shellany (0.0.1) 93 | simplecov (0.9.1) 94 | docile (~> 1.1.0) 95 | multi_json (~> 1.0) 96 | simplecov-html (~> 0.8.0) 97 | simplecov-html (0.8.0) 98 | slop (3.6.0) 99 | term-ansicolor (1.2.2) 100 | tins (~> 0.8) 101 | terminal-notifier-guard (1.6.4) 102 | thor (0.18.1) 103 | thread_safe (0.3.4) 104 | timers (4.0.1) 105 | hitimes 106 | tins (0.13.2) 107 | tzinfo (1.2.2) 108 | thread_safe (~> 0.1) 109 | yard (0.8.7.6) 110 | 111 | PLATFORMS 112 | ruby 113 | 114 | DEPENDENCIES 115 | benchmark-ips 116 | blurrily! 117 | coveralls 118 | guard 119 | guard-rspec 120 | progressbar 121 | pry 122 | pry-doc 123 | pry-nav 124 | rake 125 | rake-compiler 126 | rb-fsevent 127 | rspec 128 | terminal-notifier-guard 129 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | # A sample Guardfile 2 | # More info at https://github.com/guard/guard#readme 3 | 4 | guard 'rspec', cmd: 'bundle exec rspec' do 5 | watch(%r{^spec/.+_spec\.rb$}) 6 | watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" } 7 | watch('spec/spec_helper.rb') { "spec" } 8 | end 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 HouseTrip Ltd. 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blurrily — Millisecond fuzzy string matching 2 | 3 | [![Gem Version](https://badge.fury.io/rb/blurrily.svg)](http://badge.fury.io/rb/blurrily) 4 | [![Build Status](https://travis-ci.org/mezis/blurrily.svg?branch=master)](https://travis-ci.org/mezis/blurrily) 5 | [![Dependency Status](https://gemnasium.com/mezis/blurrily.svg)](https://gemnasium.com/mezis/blurrily) 6 | [![Code Climate](https://codeclimate.com/github/mezis/blurrily.svg)](https://codeclimate.com/github/mezis/blurrily) 7 | [![Coverage Status](https://coveralls.io/repos/mezis/blurrily/badge.png)](https://coveralls.io/r/mezis/blurrily) 8 | 9 | > Show me photos of **Marakech** ! 10 | > 11 | > Here are some photos of **Marrakesh**, Morroco. 12 | > Did you mean **Martanesh**, Albania, **Marakkanam**, India, or **Marasheshty**, Romania? 13 | 14 | Blurrily finds misspelled, prefix, or partial needles in a haystack of 15 | strings, quickly. It scales well: its response time is typically 1-2ms on 16 | user-input datasets and 75-100ms on pathological datasets 17 | ([more](#benchmarks)). 18 | 19 | Blurrily is compatible and tested with all MRI Rubies from 1.9.3 to 2.2.0. 20 | It is tested on Linux 2.6 (32bit and 64bit) and MacOS X 10.8. 21 | 22 | Blurrily uses a tweaked [trigram](http://en.wikipedia.org/wiki/N-gram)-based 23 | approach to find good matches. If you're using ActiveRecord and looking for 24 | a lightweight (albeit much slower), in-process, Rails-friendly version of 25 | this, check out [fuzzily](http://github.com/mezis/fuzzily), a Ruby gem to 26 | perform fuzzy text searching in ActiveRecord. 27 | 28 | 29 | ## Installation 30 | 31 | Add this line to your application's Gemfile: 32 | 33 | gem 'blurrily' 34 | 35 | Or install it yourself as: 36 | 37 | $ gem install blurrily 38 | 39 | ## Docker 40 | 41 | You can optionally run [Burrily as a Docker Container](https://github.com/mrmattwright/docker-blurrily). Maintained by [MrMattWright](https://github.com/mrmattwright). 42 | 43 | ## Usage 44 | 45 | You can use blurrily as a client/server combination (recommended in 46 | production), or use the internals standalone. 47 | 48 | See the [API Documentation](http://rubydoc.info/github/mezis/blurrily/frames) 49 | for more details. 50 | 51 | ### Client/server 52 | 53 | Fire up a blurrily server: 54 | 55 | $ blurrily 56 | 57 | Open up a console and connect: 58 | 59 | $ irb -rubygems 60 | > require 'blurrily/client' 61 | > client = Blurrily::Client.new 62 | 63 | Store a needle with a reference: 64 | 65 | > client.put('London', 1337) 66 | 67 | Recover a reference form the haystack: 68 | 69 | > client.find('lonndon') 70 | #=> [1337] 71 | 72 | ### Standalone 73 | 74 | Create the in-memory database: 75 | 76 | > map = Blurrily::Map.new 77 | 78 | Store a needle with a reference: 79 | 80 | > map.put('London', 1337) 81 | 82 | Recover a reference form the haystack: 83 | 84 | > map.find('lonndon') 85 | #=> [1337] 86 | 87 | Save the database to disk: 88 | 89 | > map.save('/var/db/data.trigrams') 90 | 91 | Load a previously saved database: 92 | 93 | > map = Blurrily::Map.load('/var/db/data.trigrams') 94 | 95 | 96 | ## Caveats 97 | 98 | ### Diacritics, non-latin languages 99 | 100 | Blurrily forms trigrams from the 26 latin letters and a stop character (used 101 | to model start-of-string and separation between words in multi-word 102 | strings). 103 | 104 | This means that case and diacritrics are completely ignored by Blurrily. For 105 | instance, *Puy-de-Dôme* is strictly equivalent to *puy de dome*. 106 | 107 | It also means that any non-latin input will probably result in garbage data 108 | and garbage results (although it won't crash). 109 | 110 | ### Multi-word needles and edge stickyness. 111 | 112 | Multi-word needles (say, *New York*) are supported. 113 | 114 | The engine always favours matches that begin and end similarly to the 115 | needle, with a bias to the beginning of the strings. 116 | 117 | This is because internally, the string *New York* is turned into this 118 | sequence of trigrams: `**n`, `*ne`, `new`, `ew*`, `w*y`, `*yo`, `yor`, 119 | `ork`, `rk*`. 120 | 121 | ## Production notes 122 | 123 | ### Memory usage 124 | 125 | Blurrily does not store your original strings but rather a flat map of 126 | references and weights for each trigram in your input strings. 127 | 128 | In practice any database will use up a base 560KB for the index header, plus 129 | 128 bits per trigram. 130 | 131 | As a rule of thumb idea memory usages is 40MB + 8 times the size of your 132 | input data, and 50% extra on top during bulk imports (lots of writes to the 133 | database). 134 | 135 | For instance, `/usr/share/dict/words` is a list of 235k English words, and 136 | weighs 2.5MB. Importing the whole list uses up 75MB of memory, 51MB of which 137 | are the database. 138 | 139 | Note that once a database has been written to disk and loaded from disk, 140 | memory usage is minimal (560KB per database) as the database file is memory 141 | mapped. For performance you do need as much free memory as the database 142 | size. 143 | 144 | ### Disk usage 145 | 146 | Disk usage is almost exactly like memory usage, since database files are 147 | nothing more than a memory dump. 148 | 149 | In the `/usr/share/dict/words` example, on-disk size is 51MB. 150 | For the whole list of Geonames places, on-disk size is 1.1GB. 151 | 152 | ### Read v write 153 | 154 | Writing to blurrily (with `#put`) is fairly expensive—it's a search engine 155 | after all, optimized for intensive reads. 156 | 157 | Supporting writes means the engine needs to keep a hash table of all 158 | references around, typically weighing 50% of your total input. This is build 159 | lazily while writing however; so if you load a database from disk and only 160 | ever read, you will not incur the memory penalty. 161 | 162 | ### Saving & backing up 163 | 164 | Blurrily saves atomically (writing to a separate file, then using rename(2) 165 | to overwrite the old file), meaning you should never lose data. 166 | 167 | The server does this for you every 60 seconds and when quitting. If using 168 | `Blurrily::Map` directly, remember that a map loaded from disk is more 169 | memory efficient that a map in memory, so if your workload is read-heavy, 170 | you should `.load` after each `#save`. 171 | 172 | Backing up comes with a caveat: database files are only portable across 173 | architectures if endianness and pointer size are the same (tested between 174 | darwin-x86_64 and linux-amd64). 175 | 176 | Database files are very compressible; `bzip2` typically shrinks them to 20% 177 | of their original size. 178 | 179 | 180 | ## Benchmarks 181 | 182 | Blurrily is wicked fast, often 100x faster than it's ancestor, 183 | [fuzzily](http://github.com/mezis/fuzzily). This is because it's a close-to- 184 | the-metal, single-purpose index using almost exclusively libc primitives. On 185 | the inside the only expensive operations it performs are 186 | 187 | - memcpy(2) lots of data around (selection); 188 | - mergesort(3) to aggregate/count similar entries (reduction); 189 | - qsort(3) to order by counts (sort). 190 | 191 | It tends to be faster with large datasets on BSD than on Linux because the 192 | former has fast quicksort and mergesort, wheras the latter only has `qsort`, 193 | a slower, catch-all sorter. In complexity terms this is because FIND tends 194 | to be *O(n)* on BSD and *O(n ln n)* on Linux. 195 | 196 | Enough talk, here are the graphs. The `LOAD` and `PUT` operations are O(1) 197 | and take respectively ~10ms and ~100µs on any platform, so they aren't 198 | graphed here. 199 | 200 | - [FIND latency](/doc/bench-find.png) 201 | - [SAVE latency](/doc/bench-save.png) 202 | - [DELETE latency](/doc/bench-delete.png) 203 | 204 | 205 | ## Contributing 206 | 207 | 1. Fork it 208 | 2. Create your feature branch (`git checkout -b my-new-feature`) 209 | 3. Commit your changes (`git commit -am 'Add some feature'`) 210 | 4. Push to the branch (`git push origin my-new-feature`) 211 | 5. Create new Pull Request 212 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require 'rake/extensiontask' 3 | require 'rspec/core/rake_task' 4 | 5 | 6 | Rake::ExtensionTask.new('blurrily') do |ext| 7 | ext.name = 'map_ext' # indicate the name of the extension. 8 | # ext.ext_dir = 'ext/weird_world' # search for 'hello_world' inside it. 9 | ext.lib_dir = 'lib/blurrily' # put binaries into this folder. 10 | # ext.config_script = 'custom_extconf.rb' # use instead of the default 'extconf.rb'. 11 | # ext.tmp_dir = 'tmp' # temporary folder used during compilation. 12 | # ext.source_pattern = "*.{c,cpp}" # monitor file changes to allow simple rebuild. 13 | # ext.config_options << '--with-foo' # supply additional options to configure script. 14 | # ext.gem_spec = spec # optionally indicate which gem specification 15 | # # will be used. 16 | end 17 | 18 | RSpec::Core::RakeTask.new(:spec) 19 | 20 | task :default => [:compile, :spec] 21 | -------------------------------------------------------------------------------- /bin/bench: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'rubygems' 3 | require 'bundler/setup' 4 | require 'blurrily/map' 5 | require 'progressbar' 6 | require 'open-uri' 7 | require 'tempfile' 8 | require 'benchmark/ips' 9 | require 'zlib' 10 | 11 | 12 | module Blurrily 13 | class Benchmark 14 | ARCHIVES = { 15 | cities: 'http://mezis.s3.amazonaws.com/blurrily-data/cities1000-filtered.txt.gz', 16 | europe: 'http://mezis.s3.amazonaws.com/blurrily-data/de_fr_es_gb_it-filtered.txt.gz', 17 | us: 'http://mezis.s3.amazonaws.com/blurrily-data/us-filtered.txt.gz', 18 | world: 'http://mezis.s3.amazonaws.com/blurrily-data/allCountries-filtered.txt.gz', 19 | russia: 'http://mezis.s3.amazonaws.com/blurrily-data/ru-filtered.txt.gz', 20 | asia: 'http://mezis.s3.amazonaws.com/blurrily-data/ru_cn_ir-filtered.txt.gz', 21 | english: 'http://mezis.s3.amazonaws.com/blurrily-data/english.txt.gz', 22 | } 23 | 24 | SEARCH_CITIES = %w(London Paris Rome Luxembourg) + 25 | %w(Lonndon Pari Roma Luxenbour) 26 | 27 | def initialize(key) 28 | @source_url = ARCHIVES[key] or raise ArgumentError 29 | @key = key 30 | end 31 | 32 | def run 33 | log "Starting benchmark for '#{key}'" 34 | do_download 35 | do_import 36 | do_save 37 | do_bm 38 | end 39 | 40 | private 41 | 42 | attr :key, :source_url 43 | 44 | def do_download 45 | return if raw_data_path.exist? 46 | log 'download and save file' 47 | output = Tempfile.new($PROGRAM_NAME) 48 | URI.parse(source_url).open do |input| 49 | output.write input.read 50 | end 51 | output.close 52 | FileUtils.cp(output.path, raw_data_path.to_s) 53 | return 54 | end 55 | 56 | def do_import 57 | log "Counting data entries" 58 | rows = 0 59 | get_reader.each_line { |line| rows += 1 } 60 | 61 | log "Importing data" 62 | progress = ProgressBar.new(key.to_s, rows) 63 | get_reader.each_line do |line| 64 | index, needle = line.strip.split("\t") 65 | map.put(needle, index.to_i) 66 | progress.inc 67 | end 68 | progress.finish 69 | puts "#{rows} records imported, #{map.stats[:references]} refs, #{map.stats[:trigrams]} trigrams" 70 | return 71 | end 72 | 73 | def do_save(path = nil) 74 | path ||= trigram_data_path.to_s 75 | map.save path 76 | end 77 | 78 | def do_load 79 | @map.close 80 | @map = Map.load(trigram_data_path.to_s) 81 | do_gc 82 | end 83 | 84 | def do_warm 85 | # rehersal, necessary as the hash table of refs will be reconstructed 86 | map.put 'foo', 123 87 | end 88 | 89 | def do_bm 90 | log 'Benchmarking' 91 | 92 | ::Benchmark.ips do |x| 93 | x.report('find') do |times| 94 | times.times { map.find(random_city) } 95 | end 96 | 97 | x.report('put') do |times| 98 | times.times { map.put(random_city, rand(1<<31)) } 99 | end 100 | 101 | x.report('delete') do |times| 102 | times.times { map.delete(rand(1<<31)) } 103 | end 104 | 105 | x.report('stress') do |times| 106 | times.times do 107 | case rand(3) 108 | when 0 then map.delete(rand(1<<31)) 109 | when 1 then map.put(random_city, rand(1<<31)) 110 | when 2 then map.find(random_city) 111 | end 112 | end 113 | end 114 | 115 | x.report('save') do |times| 116 | times.times do 117 | path = Pathname.new "tmp/#{$$}.#{rand(1<<32)}.trigrams" 118 | do_save path.to_s 119 | path.delete 120 | end 121 | end 122 | 123 | x.report('load') do |times| 124 | times.times { do_load } 125 | end 126 | 127 | x.report('warm load') do |times| 128 | times.times { do_load ; do_warm } 129 | end 130 | end 131 | end 132 | 133 | def do_gc 134 | Thread.pass 135 | ObjectSpace.garbage_collect 136 | GC.start 137 | end 138 | 139 | def raw_data_path 140 | @raw_data_path ||= Pathname.new("#{key}.txt.gz") 141 | end 142 | 143 | def trigram_data_path 144 | @trigram_data_path ||= Pathname.new("#{key}.trigrams") 145 | end 146 | 147 | def get_reader 148 | Zlib::GzipReader.open(raw_data_path.to_s) 149 | end 150 | 151 | def log(message) 152 | $stderr.puts "[%s] %s: %s" % [Time.now.strftime('%T.%L'), $0, message] 153 | $stderr.flush 154 | end 155 | 156 | def map 157 | @map ||= Map.new 158 | end 159 | 160 | def random_city 161 | SEARCH_CITIES[rand(SEARCH_CITIES.length)] 162 | end 163 | end 164 | end 165 | 166 | $PROGRAM_NAME = 'blurrily:bench' 167 | 168 | %i(english cities europe russia asia us world).each do |key| 169 | bbm = Blurrily::Benchmark.new(key) 170 | benchmarks = bbm.run 171 | puts "-"*80 172 | benchmarks.each do |bm| 173 | puts "%s\t%s\t%1.3e" % [key, bm.label, 1e3/bm.ips] 174 | end 175 | puts "-"*80 176 | end 177 | 178 | __END__ 179 | 180 | -------------------------------------------------------------------------------- /bin/blurrily: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | $PROGRAM_NAME = 'blurrily' 3 | 4 | require "blurrily/server" 5 | require 'optparse' 6 | require 'ostruct' 7 | 8 | options = OpenStruct.new 9 | 10 | # Defaults 11 | options.port = 12021 12 | options.directory = '.' 13 | options.host = '0.0.0.0' 14 | 15 | parser = OptionParser.new do |opts| 16 | opts.banner = "Usage: #{$PROGRAM_NAME} [options]" 17 | 18 | opts.on("-p", "--port ", "Bind to PORT, defaults to 12021") do |port| 19 | puts 'Port has to be numeric value' and exit unless port =~ /\d+/ 20 | options.port = port.to_i 21 | end 22 | 23 | opts.on("-d", "--directory ", "Work in DIRECTORY, defaults to .") do |directory| 24 | options.directory = directory 25 | end 26 | 27 | opts.on("-b", "--bind
", "Bind to ADDRESS, defaults to 0.0.0.0") do |address| 28 | options.host = address || '0.0.0.0' 29 | end 30 | 31 | opts.on("-V", "--version", "Output version") do |address| 32 | puts Blurrily::VERSION 33 | exit 34 | end 35 | 36 | opts.on_tail("-h", "--help", "Show this message") do 37 | puts opts 38 | exit 39 | end 40 | end 41 | 42 | parser.parse!(ARGV) 43 | Blurrily::Server.new(:host => options.host, :port => options.port, :directory => options.directory).start -------------------------------------------------------------------------------- /blurrily.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'blurrily/version' 5 | 6 | Gem::Specification.new do |gem| 7 | gem.name = "blurrily" 8 | gem.version = Blurrily::VERSION 9 | gem.authors = ["Julien Letessier", "Dawid Sklodowski", "Marcus Mitchell"] 10 | gem.email = ["julien.letessier@gmail.com"] 11 | gem.description = %q{Native fuzzy string search} 12 | gem.summary = %q{Native fuzzy string search} 13 | gem.homepage = "http://github.com/mezis/blurrily" 14 | 15 | gem.add_dependency 'activesupport', '~> 4' 16 | gem.add_dependency 'eventmachine' 17 | 18 | gem.add_development_dependency 'rspec' 19 | gem.add_development_dependency 'rake' 20 | gem.add_development_dependency 'rake-compiler' 21 | gem.add_development_dependency 'pry' 22 | gem.add_development_dependency 'pry-nav' 23 | gem.add_development_dependency 'pry-doc' 24 | gem.add_development_dependency 'progressbar' 25 | gem.add_development_dependency 'benchmark-ips' 26 | gem.add_development_dependency 'guard' 27 | gem.add_development_dependency 'guard-rspec' 28 | gem.add_development_dependency 'rb-fsevent' 29 | gem.add_development_dependency 'terminal-notifier-guard' 30 | gem.add_development_dependency 'coveralls' 31 | 32 | gem.extensions = ['ext/blurrily/extconf.rb'] 33 | gem.files = Dir.glob('lib/**/*.rb') + 34 | Dir.glob('ext/**/*.{c,h,rb}') + 35 | Dir.glob('*.{md,txt}') + 36 | Dir.glob('bin/blurrily') 37 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 38 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 39 | gem.require_paths = ["lib"] 40 | end 41 | -------------------------------------------------------------------------------- /doc/bench-delete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mezis/blurrily/496deee64a7e1c04b1241968dab46250e7b36569/doc/bench-delete.png -------------------------------------------------------------------------------- /doc/bench-find.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mezis/blurrily/496deee64a7e1c04b1241968dab46250e7b36569/doc/bench-find.png -------------------------------------------------------------------------------- /doc/bench-save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mezis/blurrily/496deee64a7e1c04b1241968dab46250e7b36569/doc/bench-save.png -------------------------------------------------------------------------------- /doc/bench.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mezis/blurrily/496deee64a7e1c04b1241968dab46250e7b36569/doc/bench.numbers -------------------------------------------------------------------------------- /ext/blurrily/blurrily.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | blurrily.h -- 4 | 5 | Helper macros 6 | 7 | */ 8 | 9 | #ifndef __BLURRILY_H__ 10 | #define __BLURRILY_H__ 1 11 | 12 | #define BR_PACKED_STRUCT __attribute__ ((__packed__)) 13 | #define UNUSED(_IDENT) _IDENT __attribute__ ((unused)) 14 | 15 | #ifdef DEBUG 16 | #define LOG(...) fprintf(stderr, __VA_ARGS__) 17 | #else 18 | #define LOG(...) 19 | #endif 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /ext/blurrily/extconf.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | 3 | PLATFORM = `uname`.strip.upcase 4 | SHARED_FLAGS = "-DPLATFORM_#{PLATFORM} --std=c99 -Wall -Wextra -Werror" 5 | 6 | case PLATFORM 7 | when 'LINUX' 8 | # make sure ftruncate is available 9 | SHARED_FLAGS << ' -D_XOPEN_SOURCE=700' 10 | SHARED_FLAGS << ' -D_GNU_SOURCE=1' 11 | # make sure off_t is 64 bit long 12 | SHARED_FLAGS << ' -D_FILE_OFFSET_BITS=64' 13 | end 14 | 15 | # production 16 | $CFLAGS += " #{SHARED_FLAGS} -Os" 17 | 18 | # development 19 | # $CFLAGS += " #{SHARED_FLAGS} -O0 -g" 20 | 21 | create_makefile('blurrily/map_ext') 22 | -------------------------------------------------------------------------------- /ext/blurrily/map_ext.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "storage.h" 4 | #include "blurrily.h" 5 | 6 | static VALUE eClosedError = Qnil; 7 | static VALUE eBlurrilyModule = Qnil; 8 | 9 | /******************************************************************************/ 10 | 11 | static int raise_if_closed(VALUE self) 12 | { 13 | if (rb_ivar_get(self, rb_intern("@closed")) != Qtrue) return 0; 14 | rb_raise(eClosedError, "Map was freed"); 15 | return 1; 16 | } 17 | 18 | static void mark_as_closed(VALUE self) 19 | { 20 | rb_ivar_set(self, rb_intern("@closed"), Qtrue); 21 | } 22 | 23 | /******************************************************************************/ 24 | 25 | static void blurrily_free(void* haystack) 26 | { 27 | int res = -1; 28 | 29 | if (haystack == NULL) return; 30 | res = blurrily_storage_close((trigram_map*) &haystack); 31 | assert(res >= 0); 32 | } 33 | 34 | /******************************************************************************/ 35 | 36 | static void blurrily_mark(void* haystack) 37 | { 38 | if (haystack == NULL) return; 39 | blurrily_storage_mark((trigram_map) haystack); 40 | } 41 | 42 | /******************************************************************************/ 43 | 44 | static VALUE blurrily_new(VALUE class) { 45 | VALUE wrapper = Qnil; 46 | trigram_map haystack = (trigram_map)NULL; 47 | int res = -1; 48 | 49 | res = blurrily_storage_new(&haystack); 50 | if (res < 0) { rb_sys_fail(NULL); return Qnil; } 51 | 52 | wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack); 53 | rb_obj_call_init(wrapper, 0, NULL); 54 | return wrapper; 55 | } 56 | 57 | /******************************************************************************/ 58 | 59 | static VALUE blurrily_load(VALUE class, VALUE rb_path) { 60 | char* path = StringValuePtr(rb_path); 61 | VALUE wrapper = Qnil; 62 | trigram_map haystack = (trigram_map)NULL; 63 | int res = -1; 64 | 65 | res = blurrily_storage_load(&haystack, path); 66 | if (res < 0) { rb_sys_fail(NULL); return Qnil; } 67 | 68 | wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack); 69 | rb_obj_call_init(wrapper, 0, NULL); 70 | return wrapper; 71 | } 72 | 73 | /******************************************************************************/ 74 | 75 | static VALUE blurrily_initialize(VALUE UNUSED(self)) { 76 | return Qtrue; 77 | } 78 | 79 | /******************************************************************************/ 80 | 81 | static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE rb_weight) { 82 | trigram_map haystack = (trigram_map)NULL; 83 | int res = -1; 84 | char* needle = StringValuePtr(rb_needle); 85 | uint32_t reference = NUM2UINT(rb_reference); 86 | uint32_t weight = NUM2UINT(rb_weight); 87 | 88 | if (raise_if_closed(self)) return Qnil; 89 | Data_Get_Struct(self, struct trigram_map_t, haystack); 90 | 91 | res = blurrily_storage_put(haystack, needle, reference, weight); 92 | assert(res >= 0); 93 | 94 | return INT2NUM(res); 95 | } 96 | 97 | /******************************************************************************/ 98 | 99 | static VALUE blurrily_delete(VALUE self, VALUE rb_reference) { 100 | trigram_map haystack = (trigram_map)NULL; 101 | uint32_t reference = NUM2UINT(rb_reference); 102 | int res = -1; 103 | 104 | if (raise_if_closed(self)) return Qnil; 105 | Data_Get_Struct(self, struct trigram_map_t, haystack); 106 | 107 | res = blurrily_storage_delete(haystack, reference); 108 | assert(res >= 0); 109 | 110 | return INT2NUM(res); 111 | } 112 | 113 | /******************************************************************************/ 114 | 115 | static VALUE blurrily_save(VALUE self, VALUE rb_path) { 116 | trigram_map haystack = (trigram_map)NULL; 117 | int res = -1; 118 | const char* path = StringValuePtr(rb_path); 119 | 120 | if (raise_if_closed(self)) return Qnil; 121 | Data_Get_Struct(self, struct trigram_map_t, haystack); 122 | 123 | res = blurrily_storage_save(haystack, path); 124 | if (res < 0) rb_sys_fail(NULL); 125 | 126 | return Qnil; 127 | } 128 | 129 | /******************************************************************************/ 130 | 131 | static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) { 132 | trigram_map haystack = (trigram_map)NULL; 133 | int res = -1; 134 | const char* needle = StringValuePtr(rb_needle); 135 | int limit = NUM2UINT(rb_limit); 136 | trigram_match matches = NULL; 137 | VALUE rb_matches = Qnil; 138 | 139 | if (raise_if_closed(self)) return Qnil; 140 | Data_Get_Struct(self, struct trigram_map_t, haystack); 141 | 142 | if (limit <= 0) { 143 | // rb_limit = rb_const_get(eBlurrilyModule, rb_intern('LIMIT_DEFAULT')); 144 | rb_limit = rb_const_get(eBlurrilyModule, rb_intern("LIMIT_DEFAULT")); 145 | limit = NUM2UINT(rb_limit); 146 | } 147 | matches = (trigram_match) malloc(limit * sizeof(trigram_match_t)); 148 | 149 | res = blurrily_storage_find(haystack, needle, limit, matches); 150 | assert(res >= 0); 151 | 152 | /* wrap the matches into a Ruby array */ 153 | rb_matches = rb_ary_new(); 154 | for (int k = 0; k < res; ++k) { 155 | VALUE rb_match = rb_ary_new(); 156 | rb_ary_push(rb_match, rb_uint_new(matches[k].reference)); 157 | rb_ary_push(rb_match, rb_uint_new(matches[k].matches)); 158 | rb_ary_push(rb_match, rb_uint_new(matches[k].weight)); 159 | rb_ary_push(rb_matches, rb_match); 160 | } 161 | return rb_matches; 162 | } 163 | 164 | 165 | /******************************************************************************/ 166 | 167 | static VALUE blurrily_stats(VALUE self) 168 | { 169 | trigram_map haystack = (trigram_map)NULL; 170 | trigram_stat_t stats; 171 | VALUE result = rb_hash_new(); 172 | int res = -1; 173 | 174 | if (raise_if_closed(self)) return Qnil; 175 | Data_Get_Struct(self, struct trigram_map_t, haystack); 176 | 177 | res = blurrily_storage_stats(haystack, &stats); 178 | assert(res >= 0); 179 | 180 | (void) rb_hash_aset(result, ID2SYM(rb_intern("references")), UINT2NUM(stats.references)); 181 | (void) rb_hash_aset(result, ID2SYM(rb_intern("trigrams")), UINT2NUM(stats.trigrams)); 182 | 183 | return result; 184 | } 185 | 186 | /******************************************************************************/ 187 | 188 | static VALUE blurrily_close(VALUE self) 189 | { 190 | trigram_map haystack = (trigram_map)NULL; 191 | int res = -1; 192 | 193 | if (raise_if_closed(self)) return Qnil; 194 | Data_Get_Struct(self, struct trigram_map_t, haystack); 195 | 196 | res = blurrily_storage_close(&haystack); 197 | if (res < 0) rb_sys_fail(NULL); 198 | 199 | DATA_PTR(self) = NULL; 200 | mark_as_closed(self); 201 | return Qnil; 202 | } 203 | 204 | /******************************************************************************/ 205 | 206 | void Init_map_ext(void) { 207 | VALUE klass = Qnil; 208 | 209 | /* assume we haven't yet defined blurrily */ 210 | eBlurrilyModule = rb_define_module("Blurrily"); 211 | assert(eBlurrilyModule != Qnil); 212 | 213 | klass = rb_define_class_under(eBlurrilyModule, "RawMap", rb_cObject); 214 | assert(klass != Qnil); 215 | 216 | eClosedError = rb_define_class_under(klass, "ClosedError", rb_eRuntimeError); 217 | assert(klass != Qnil); 218 | 219 | rb_define_singleton_method(klass, "new", blurrily_new, 0); 220 | rb_define_singleton_method(klass, "load", blurrily_load, 1); 221 | 222 | rb_define_method(klass, "initialize", blurrily_initialize, 0); 223 | rb_define_method(klass, "put", blurrily_put, 3); 224 | rb_define_method(klass, "delete", blurrily_delete, 1); 225 | rb_define_method(klass, "save", blurrily_save, 1); 226 | rb_define_method(klass, "find", blurrily_find, 2); 227 | rb_define_method(klass, "stats", blurrily_stats, 0); 228 | rb_define_method(klass, "close", blurrily_close, 0); 229 | return; 230 | } 231 | -------------------------------------------------------------------------------- /ext/blurrily/search_tree.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "blurrily.h" 4 | #include "ruby.h" 5 | 6 | /******************************************************************************/ 7 | 8 | typedef struct blurrily_refs_t { 9 | VALUE hash; 10 | } blurrily_refs_t; 11 | 12 | /******************************************************************************/ 13 | 14 | int blurrily_refs_new(blurrily_refs_t** refs_ptr) 15 | { 16 | blurrily_refs_t* refs = NULL; 17 | 18 | refs = (blurrily_refs_t*) malloc(sizeof(blurrily_refs_t)); 19 | if (!refs) return -1; 20 | 21 | refs->hash = rb_hash_new(); 22 | *refs_ptr = refs; 23 | return 0; 24 | } 25 | 26 | /******************************************************************************/ 27 | 28 | void blurrily_refs_mark(blurrily_refs_t* refs) 29 | { 30 | rb_gc_mark(refs->hash); 31 | return; 32 | } 33 | 34 | /******************************************************************************/ 35 | 36 | void blurrily_refs_free(blurrily_refs_t** refs_ptr) 37 | { 38 | blurrily_refs_t* refs = *refs_ptr; 39 | 40 | refs->hash = Qnil; 41 | free(refs); 42 | *refs_ptr = NULL; 43 | return; 44 | } 45 | 46 | /******************************************************************************/ 47 | 48 | void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref) 49 | { 50 | (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qtrue); 51 | return; 52 | } 53 | 54 | /******************************************************************************/ 55 | 56 | void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref) 57 | { 58 | (void) rb_hash_aset(refs->hash, UINT2NUM(ref), Qnil); 59 | } 60 | 61 | /******************************************************************************/ 62 | 63 | int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref) 64 | { 65 | return rb_hash_aref(refs->hash, UINT2NUM(ref)) == Qtrue ? 1 : 0; 66 | } 67 | -------------------------------------------------------------------------------- /ext/blurrily/search_tree.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | search_tree.h -- 4 | 5 | List of all references that's fast to query for existence. 6 | 7 | */ 8 | #include 9 | 10 | 11 | typedef struct blurrily_refs_t blurrily_refs_t; 12 | 13 | 14 | /* Allocate a search tree */ 15 | int blurrily_refs_new(blurrily_refs_t** refs_ptr); 16 | 17 | /* Destroy a search tree */ 18 | void blurrily_refs_free(blurrily_refs_t** refs_ptr); 19 | 20 | /* Mark with Ruby's GC */ 21 | void blurrily_refs_mark(blurrily_refs_t* refs); 22 | 23 | /* Add a reference */ 24 | void blurrily_refs_add(blurrily_refs_t* refs, uint32_t ref); 25 | 26 | /* Remove a reference */ 27 | void blurrily_refs_remove(blurrily_refs_t* refs, uint32_t ref); 28 | 29 | /* Test for a reference (1 = present, 0 = absent) */ 30 | int blurrily_refs_test(blurrily_refs_t* refs, uint32_t ref); 31 | -------------------------------------------------------------------------------- /ext/blurrily/storage.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef PLATFORM_LINUX 12 | #include 13 | #define MERGESORT fake_mergesort 14 | #else 15 | #include 16 | #define MERGESORT mergesort 17 | #endif 18 | 19 | #ifndef PATH_MAX 20 | /* safe default ... */ 21 | #define PATH_MAX 1024 22 | #endif 23 | 24 | #include "storage.h" 25 | #include "search_tree.h" 26 | 27 | /******************************************************************************/ 28 | 29 | #define PAGE_SIZE 4096 30 | #define TRIGRAM_COUNT (TRIGRAM_BASE * TRIGRAM_BASE * TRIGRAM_BASE) 31 | #define TRIGRAM_ENTRIES_START_SIZE PAGE_SIZE/sizeof(trigram_entry_t) 32 | 33 | /******************************************************************************/ 34 | 35 | /* one trigram entry -- client reference and sorting weight */ 36 | struct BR_PACKED_STRUCT trigram_entry_t 37 | { 38 | uint32_t reference; 39 | uint32_t weight; 40 | }; 41 | typedef struct trigram_entry_t trigram_entry_t; 42 | 43 | 44 | /* collection of entries for a given trigram */ 45 | /* points to an array of entries */ 46 | /* of which are filled */ 47 | struct BR_PACKED_STRUCT trigram_entries_t 48 | { 49 | uint32_t buckets; 50 | uint32_t used; 51 | 52 | trigram_entry_t* entries; /* set when the structure is in memory */ 53 | off_t entries_offset; /* set when the structure is on disk */ 54 | 55 | uint8_t dirty; /* not optimised (presorted) yet */ 56 | }; 57 | typedef struct trigram_entries_t trigram_entries_t; 58 | 59 | 60 | /* hash map of all possible trigrams to collection of entries */ 61 | /* there are 28^3 = 19,683 possible trigrams */ 62 | struct BR_PACKED_STRUCT trigram_map_t 63 | { 64 | char magic[6]; /* the string "trigra" */ 65 | uint8_t big_endian; 66 | uint8_t pointer_size; 67 | 68 | uint32_t total_references; 69 | uint32_t total_trigrams; 70 | size_t mapped_size; /* when mapped from disk, the number of bytes mapped */ 71 | blurrily_refs_t* refs; 72 | 73 | trigram_entries_t map[TRIGRAM_COUNT]; /* this whole structure is ~500KB */ 74 | }; 75 | typedef struct trigram_map_t trigram_map_t; 76 | 77 | /******************************************************************************/ 78 | 79 | #ifdef PLATFORM_LINUX 80 | /* fake version of mergesort(3) implemented with qsort(3) as Linux lacks */ 81 | /* the specific variants */ 82 | static int fake_mergesort(void *base, size_t nel, size_t width, int (*compar)(const void *, const void *)) 83 | { 84 | qsort(base, nel, width, compar); 85 | return 0; 86 | } 87 | #endif 88 | 89 | /******************************************************************************/ 90 | 91 | #define SMALLOC(_NELEM,_TYPE) (_TYPE*) smalloc(_NELEM, sizeof(_TYPE)) 92 | 93 | static void* smalloc(size_t nelem, size_t length) 94 | { 95 | void* result = malloc(nelem * length); 96 | if (result) memset(result, 0xAA, nelem * length); 97 | return result; 98 | } 99 | 100 | /******************************************************************************/ 101 | 102 | /* 1 -> little endian, 2 -> big endian */ 103 | static uint8_t get_big_endian() 104 | { 105 | uint32_t magic = 0xAA0000BB; 106 | uint8_t head = *((uint8_t*) &magic); 107 | 108 | return (head == 0xBB) ? 1 : 2; 109 | } 110 | 111 | /******************************************************************************/ 112 | 113 | /* 4 or 8 (bytes) */ 114 | static uint8_t get_pointer_size() 115 | { 116 | return (uint8_t) sizeof(void*); 117 | } 118 | 119 | /******************************************************************************/ 120 | 121 | static int compare_entries(const void* left_p, const void* right_p) 122 | { 123 | trigram_entry_t* left = (trigram_entry_t*)left_p; 124 | trigram_entry_t* right = (trigram_entry_t*)right_p; 125 | return (int)left->reference - (int)right->reference; 126 | } 127 | 128 | /* compares matches on #matches (descending) then weight (ascending) */ 129 | static int compare_matches(const void* left_p, const void* right_p) 130 | { 131 | trigram_match_t* left = (trigram_match_t*)left_p; 132 | trigram_match_t* right = (trigram_match_t*)right_p; 133 | /* int delta = (int)left->matches - (int)right->matches; */ 134 | int delta = (int)right->matches - (int)left->matches; 135 | 136 | return (delta != 0) ? delta : ((int)left->weight - (int)right->weight); 137 | 138 | } 139 | 140 | /******************************************************************************/ 141 | 142 | static void sort_map_if_dirty(trigram_entries_t* map) 143 | { 144 | int res = -1; 145 | if (! map->dirty) return; 146 | 147 | res = MERGESORT(map->entries, map->used, sizeof(trigram_entry_t), &compare_entries); 148 | assert(res >= 0); 149 | map->dirty = 0; 150 | } 151 | 152 | /******************************************************************************/ 153 | 154 | static size_t round_to_page(size_t value) 155 | { 156 | if (value % PAGE_SIZE == 0) return value; 157 | return (value / PAGE_SIZE + 1) * PAGE_SIZE; 158 | } 159 | 160 | /******************************************************************************/ 161 | 162 | static size_t get_map_size(trigram_map haystack, int index) 163 | { 164 | return haystack->map[index].buckets * sizeof(trigram_entry_t); 165 | } 166 | 167 | /******************************************************************************/ 168 | 169 | static void free_if(void* ptr) 170 | { 171 | if (ptr == NULL) return; 172 | free(ptr); 173 | return; 174 | } 175 | 176 | /******************************************************************************/ 177 | 178 | int blurrily_storage_new(trigram_map* haystack_ptr) 179 | { 180 | trigram_map haystack = (trigram_map)NULL; 181 | trigram_entries_t* ptr = NULL; 182 | int k = 0; 183 | 184 | LOG("blurrily_storage_new\n"); 185 | haystack = SMALLOC(1, trigram_map_t); 186 | if (haystack == NULL) return -1; 187 | 188 | memcpy(haystack->magic, "trigra", 6); 189 | haystack->big_endian = get_big_endian(); 190 | haystack->pointer_size = get_pointer_size(); 191 | 192 | haystack->mapped_size = 0; /* not mapped, as we just created it in memory */ 193 | haystack->total_references = 0; 194 | haystack->total_trigrams = 0; 195 | haystack->refs = NULL; 196 | for(k = 0, ptr = haystack->map ; k < TRIGRAM_COUNT ; ++k, ++ptr) { 197 | ptr->buckets = 0; 198 | ptr->used = 0; 199 | ptr->dirty = 0; 200 | ptr->entries = (trigram_entry_t*)NULL; 201 | ptr->entries_offset = 0; 202 | } 203 | 204 | *haystack_ptr = haystack; 205 | return 0; 206 | } 207 | 208 | /******************************************************************************/ 209 | 210 | int blurrily_storage_load(trigram_map* haystack, const char* path) 211 | { 212 | int fd = -1; 213 | int res = -1; 214 | trigram_map header = NULL; 215 | uint8_t* origin = NULL; 216 | struct stat metadata; 217 | 218 | /* open and map file */ 219 | res = fd = open(path, O_RDONLY); 220 | if (res < 0) goto cleanup; 221 | 222 | res = fstat(fd, &metadata); 223 | if (res < 0) goto cleanup; 224 | 225 | /* check this file is at least lng enough to have a header */ 226 | if (metadata.st_size < (off_t) sizeof(trigram_map_t)) { 227 | errno = EPROTO; 228 | res = -1; 229 | goto cleanup; 230 | } 231 | 232 | header = (trigram_map) mmap(NULL, metadata.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); 233 | if (header == MAP_FAILED) { 234 | res = -1; 235 | header = NULL; 236 | goto cleanup; 237 | } 238 | 239 | /* fd not needed once mapping established */ 240 | res = close(fd); 241 | if (res < 0) goto cleanup; 242 | fd = -1; 243 | 244 | /* check magic */ 245 | res = memcmp(header->magic, "trigra", 6); 246 | if (res != 0 || header->big_endian != get_big_endian() || header->pointer_size != get_pointer_size()) { 247 | errno = EPROTO; 248 | res = -1; 249 | goto cleanup; 250 | } 251 | 252 | /* fix header data */ 253 | header->mapped_size = metadata.st_size; 254 | origin = (uint8_t*)header; 255 | for (int k = 0; k < TRIGRAM_COUNT; ++k) { 256 | trigram_entries_t* map = header->map + k; 257 | if (map->entries_offset == 0) continue; 258 | map->entries = (trigram_entry_t*) (origin + map->entries_offset); 259 | } 260 | *haystack = header; 261 | 262 | cleanup: 263 | if (fd > 0) (void) close(fd); 264 | if (res < 0 && header != NULL) (void) munmap(header, metadata.st_size); 265 | return res; 266 | } 267 | 268 | /******************************************************************************/ 269 | 270 | int blurrily_storage_close(trigram_map* haystack_ptr) 271 | { 272 | trigram_map haystack = *haystack_ptr; 273 | int res = 0; 274 | trigram_entries_t* ptr = haystack->map; 275 | 276 | LOG("blurrily_storage_close\n"); 277 | 278 | for(int k = 0 ; k < TRIGRAM_COUNT ; ++k) { 279 | if (ptr->entries_offset == 0) free(ptr->entries); 280 | ++ptr; 281 | } 282 | 283 | if (haystack->refs) blurrily_refs_free(&haystack->refs); 284 | 285 | if (haystack->mapped_size) { 286 | res = munmap(haystack, haystack->mapped_size); 287 | if (res < 0) goto cleanup; 288 | } else { 289 | free(haystack); 290 | } 291 | 292 | cleanup: 293 | *haystack_ptr = NULL; 294 | return res; 295 | } 296 | 297 | /******************************************************************************/ 298 | 299 | int blurrily_storage_save(trigram_map haystack, const char* path) 300 | { 301 | int fd = -1; 302 | int res = 0; 303 | uint8_t* ptr = (uint8_t*)NULL; 304 | size_t total_size = 0; 305 | size_t offset = 0; 306 | trigram_map header = NULL; 307 | char path_tmp[PATH_MAX]; 308 | 309 | /* cleanup maps in memory */ 310 | for (int k = 0; k < TRIGRAM_COUNT; ++k) { 311 | sort_map_if_dirty(haystack->map + k); 312 | } 313 | 314 | /* path for temporary file */ 315 | snprintf(path_tmp, PATH_MAX, "%s.tmp.%ld", path, random()); 316 | 317 | /* compute storage space required */ 318 | total_size += round_to_page(sizeof(trigram_map_t)); 319 | 320 | for (int k = 0; k < TRIGRAM_COUNT; ++k) { 321 | total_size += round_to_page(get_map_size(haystack, k)); 322 | } 323 | 324 | /* open and map file */ 325 | fd = open(path_tmp, O_RDWR | O_CREAT | O_TRUNC, 0644); 326 | if (fd < 0) goto cleanup; 327 | 328 | res = ftruncate(fd, total_size); 329 | if (res < 0) goto cleanup; 330 | 331 | ptr = mmap(NULL, total_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); 332 | if (ptr == MAP_FAILED) { res = -1 ; goto cleanup ; } 333 | 334 | (void) close(fd); 335 | fd = -1; 336 | 337 | /* flush data */ 338 | memset(ptr, 0xFF, total_size); 339 | 340 | /* copy header & clean copy */ 341 | memcpy(ptr, (void*)haystack, sizeof(trigram_map_t)); 342 | offset += round_to_page(sizeof(trigram_map_t)); 343 | header = (trigram_map)ptr; 344 | 345 | header->mapped_size = 0; 346 | header->refs = NULL; 347 | 348 | /* copy each map, set offset in header */ 349 | for (int k = 0; k < TRIGRAM_COUNT; ++k) { 350 | size_t block_size = get_map_size(haystack, k); 351 | 352 | if (block_size > 0) { 353 | memcpy(ptr+offset, haystack->map[k].entries, block_size); 354 | 355 | header->map[k].entries = NULL; 356 | header->map[k].entries_offset = offset; 357 | 358 | offset += round_to_page(block_size); 359 | } else { 360 | header->map[k].entries = NULL; 361 | header->map[k].entries_offset = 0; 362 | } 363 | } 364 | assert(offset == total_size); 365 | 366 | cleanup: 367 | if (ptr != NULL && total_size > 0) { 368 | res = munmap(ptr, total_size); 369 | } 370 | 371 | /* commit by renaming the file */ 372 | if (res >= 0 && path) { 373 | res = rename(path_tmp, path); 374 | } 375 | 376 | return res; 377 | } 378 | 379 | /******************************************************************************/ 380 | 381 | void add_all_refs(trigram_map haystack) 382 | { 383 | assert(haystack->refs != NULL); 384 | 385 | for (int k = 0; k < TRIGRAM_COUNT; ++k) { 386 | trigram_entries_t* map = haystack->map + k; 387 | trigram_entry_t* ptr = map->entries; 388 | assert(map->used <= map->buckets); 389 | for (uint32_t j = 0; j < map->used; ++j, ++ptr) { 390 | uint32_t ref = ptr->reference; 391 | blurrily_refs_add(haystack->refs, ref); 392 | } 393 | } 394 | } 395 | 396 | /******************************************************************************/ 397 | 398 | int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight) 399 | { 400 | int nb_trigrams = -1; 401 | size_t length = strlen(needle); 402 | trigram_t* trigrams = (trigram_t*)NULL; 403 | 404 | if (!haystack->refs) { 405 | blurrily_refs_new(&haystack->refs); 406 | add_all_refs(haystack); 407 | } 408 | if (blurrily_refs_test(haystack->refs, reference)) return 0; 409 | if (weight <= 0) weight = (uint32_t) length; 410 | 411 | trigrams = SMALLOC(length+1, trigram_t); 412 | nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams); 413 | 414 | 415 | for (int k = 0; k < nb_trigrams; ++k) { 416 | trigram_t t = trigrams[k]; 417 | trigram_entries_t* map = &haystack->map[t]; 418 | trigram_entry_t entry = { reference, weight }; 419 | 420 | assert(t < TRIGRAM_COUNT); 421 | assert(map-> used <= map-> buckets); 422 | 423 | /* allocate more space as needed (exponential growth) */ 424 | if (map->buckets == 0) { 425 | LOG("- alloc for %d\n", t); 426 | 427 | map->buckets = TRIGRAM_ENTRIES_START_SIZE; 428 | map->entries = SMALLOC(map->buckets, trigram_entry_t); 429 | } 430 | else if (map->used == map->buckets) { 431 | uint32_t new_buckets = map->buckets * 4/3; 432 | trigram_entry_t* new_entries = NULL; 433 | LOG("- realloc for %d\n", t); 434 | 435 | /* copy old data, free old pointer, zero extra space */ 436 | new_entries = SMALLOC(new_buckets, trigram_entry_t); 437 | assert(new_entries != NULL); 438 | memcpy(new_entries, map->entries, map->buckets * sizeof(trigram_entry_t)); 439 | /* scribble the rest of the map*/ 440 | // memset(new_entries + map->buckets, 0xFF, (new_buckets - map->buckets) * sizeof(trigram_entry_t)); 441 | 442 | #ifndef NDEBUG 443 | /* scribble old data */ 444 | memset(map->entries, 0xFF, map->buckets * sizeof(trigram_entry_t)); 445 | #endif 446 | 447 | if (map->entries_offset) { 448 | /* old data was on disk, just mark it as no longer on disk */ 449 | map->entries_offset = 0; 450 | } else { 451 | /* free old data */ 452 | free(map->entries); 453 | } 454 | 455 | /* swap fields */ 456 | map->buckets = new_buckets; 457 | map->entries = new_entries; 458 | } 459 | 460 | /* insert new entry */ 461 | assert(map->used < map->buckets); 462 | map->entries[map->used] = entry; 463 | map->used += 1; 464 | map->dirty = 1; 465 | } 466 | haystack->total_trigrams += nb_trigrams; 467 | haystack->total_references += 1; 468 | 469 | blurrily_refs_add(haystack->refs, reference); 470 | 471 | free((void*)trigrams); 472 | return nb_trigrams; 473 | } 474 | 475 | /******************************************************************************/ 476 | 477 | int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results) 478 | { 479 | int nb_trigrams = -1; 480 | size_t length = strlen(needle); 481 | trigram_t* trigrams = (trigram_t*)NULL; 482 | int nb_entries = -1; 483 | trigram_entry_t* entries = NULL; 484 | trigram_entry_t* entry_ptr = NULL; 485 | int nb_matches = -1; 486 | trigram_match_t* matches = NULL; 487 | trigram_match_t* match_ptr = NULL; 488 | uint32_t last_ref = (uint32_t)-1; 489 | int nb_results = 0; 490 | 491 | trigrams = SMALLOC(length+1, trigram_t); 492 | nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams); 493 | if (nb_trigrams == 0) goto cleanup; 494 | 495 | LOG("%d trigrams in '%s'\n", nb_trigrams, needle); 496 | 497 | /* measure size required for sorting */ 498 | nb_entries = 0; 499 | for (int k = 0; k < nb_trigrams; ++k) { 500 | trigram_t t = trigrams[k]; 501 | nb_entries += haystack->map[t].used; 502 | } 503 | if (nb_entries == 0) goto cleanup; 504 | 505 | /* allocate sorting memory */ 506 | entries = SMALLOC(nb_entries, trigram_entry_t); 507 | assert(entries != NULL); 508 | LOG("allocated space for %zd trigrams entries\n", nb_entries); 509 | 510 | /* copy data for sorting */ 511 | entry_ptr = entries; 512 | for (int k = 0; k < nb_trigrams; ++k) { 513 | trigram_t t = trigrams[k]; 514 | size_t buckets = haystack->map[t].used; 515 | 516 | sort_map_if_dirty(haystack->map + t); 517 | memcpy(entry_ptr, haystack->map[t].entries, buckets * sizeof(trigram_entry_t)); 518 | entry_ptr += buckets; 519 | } 520 | assert(entry_ptr == entries + nb_entries); 521 | 522 | /* sort data */ 523 | MERGESORT(entries, nb_entries, sizeof(trigram_entry_t), &compare_entries); 524 | LOG("sorting entries\n"); 525 | 526 | /* count distinct matches */ 527 | entry_ptr = entries; 528 | last_ref = -1; 529 | nb_matches = 0; 530 | for (int k = 0; k < nb_entries; ++k) { 531 | if (entry_ptr->reference != last_ref) { 532 | last_ref = entry_ptr->reference; 533 | ++nb_matches; 534 | } 535 | ++entry_ptr; 536 | } 537 | assert(entry_ptr == entries + nb_entries); 538 | LOG("total %zd distinct matches\n", nb_matches); 539 | 540 | /* allocate maches result */ 541 | matches = SMALLOC(nb_matches, trigram_match_t); 542 | assert(matches != NULL); 543 | 544 | /* reduction, counting matches per reference */ 545 | entry_ptr = entries; 546 | match_ptr = matches; 547 | match_ptr->matches = 0; 548 | match_ptr->reference = entry_ptr->reference; /* setup the first match to */ 549 | match_ptr->weight = entry_ptr->weight; /* simplify the loop */ 550 | for (int k = 0; k < nb_entries; ++k) { 551 | if (entry_ptr->reference != match_ptr->reference) { 552 | ++match_ptr; 553 | match_ptr->reference = entry_ptr->reference; 554 | match_ptr->weight = entry_ptr->weight; 555 | match_ptr->matches = 1; 556 | } else { 557 | match_ptr->matches += 1; 558 | } 559 | assert((int) match_ptr->matches <= nb_trigrams); 560 | ++entry_ptr; 561 | } 562 | assert(match_ptr == matches + nb_matches - 1); 563 | assert(entry_ptr == entries + nb_entries); 564 | 565 | /* sort by weight (qsort) */ 566 | qsort(matches, nb_matches, sizeof(trigram_match_t), &compare_matches); 567 | 568 | /* output results */ 569 | nb_results = (limit < nb_matches) ? limit : nb_matches; 570 | for (int k = 0; k < nb_results; ++k) { 571 | results[k] = matches[k]; 572 | LOG("match %d: reference %d, matchiness %d, weight %d\n", k, matches[k].reference, matches[k].matches, matches[k].weight); 573 | } 574 | 575 | cleanup: 576 | free_if(entries); 577 | free_if(matches); 578 | free_if(trigrams); 579 | return nb_results; 580 | } 581 | 582 | /******************************************************************************/ 583 | 584 | int blurrily_storage_delete(trigram_map haystack, uint32_t reference) 585 | { 586 | int trigrams_deleted = 0; 587 | 588 | for (int k = 0; k < TRIGRAM_COUNT; ++k) { 589 | trigram_entries_t* map = haystack->map + k; 590 | trigram_entry_t* entry = NULL; 591 | 592 | for (unsigned int j = 0; j < map->used; ++j) { 593 | entry = map->entries + j; 594 | if (entry->reference != reference) continue; 595 | 596 | /* swap with the last entry */ 597 | *entry = map->entries[map->used - 1]; 598 | memset(map->entries + map->used - 1, 0xFF, sizeof(trigram_entry_t)); 599 | 600 | map->used -= 1; 601 | 602 | ++trigrams_deleted; 603 | --j; 604 | } 605 | } 606 | haystack->total_trigrams -= trigrams_deleted; 607 | if (trigrams_deleted > 0) haystack->total_references -= 1; 608 | 609 | if (haystack->refs) blurrily_refs_remove(haystack->refs, reference); 610 | 611 | return trigrams_deleted; 612 | } 613 | 614 | /******************************************************************************/ 615 | 616 | int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats) 617 | { 618 | stats->references = haystack->total_references; 619 | stats->trigrams = haystack->total_trigrams; 620 | return 0; 621 | } 622 | 623 | /******************************************************************************/ 624 | 625 | void blurrily_storage_mark(trigram_map haystack) 626 | { 627 | if (haystack->refs) blurrily_refs_mark(haystack->refs); 628 | return; 629 | } 630 | -------------------------------------------------------------------------------- /ext/blurrily/storage.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | storage.h -- 4 | 5 | Trigram map creation, persistence, and qurying. 6 | 7 | */ 8 | #ifndef __STORAGE_H__ 9 | #define __STORAGE_H__ 10 | 11 | #include 12 | #include "tokeniser.h" 13 | #include "blurrily.h" 14 | 15 | struct trigram_map_t; 16 | typedef struct trigram_map_t* trigram_map; 17 | 18 | struct BR_PACKED_STRUCT trigram_match_t { 19 | uint32_t reference; 20 | uint32_t matches; 21 | uint32_t weight; 22 | }; 23 | typedef struct trigram_match_t trigram_match_t; 24 | typedef struct trigram_match_t* trigram_match; 25 | 26 | typedef struct trigram_stat_t { 27 | uint32_t references; 28 | uint32_t trigrams; 29 | 30 | } trigram_stat_t; 31 | 32 | 33 | /* 34 | Create a new trigram map, resident in memory. 35 | */ 36 | int blurrily_storage_new(trigram_map* haystack); 37 | 38 | /* 39 | Load an existing trigram map from disk. 40 | */ 41 | int blurrily_storage_load(trigram_map* haystack, const char* path); 42 | 43 | /* 44 | Release resources claimed by or . 45 | */ 46 | int blurrily_storage_close(trigram_map* haystack); 47 | 48 | /* 49 | Mark resources managed by Ruby GC. 50 | */ 51 | void blurrily_storage_mark(trigram_map haystack); 52 | 53 | 54 | /* 55 | Persist to disk what or 56 | gave you. 57 | */ 58 | int blurrily_storage_save(trigram_map haystack, const char* path); 59 | 60 | /* 61 | Add a new string to the map. is your identifier for that 62 | string, will be using to discriminate entries that match "as 63 | well" when searching. 64 | 65 | If is zero, it will be replaced by the number of characters in 66 | the . 67 | 68 | Returns positive on success, negative on failure. 69 | */ 70 | int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight); 71 | 72 | /* 73 | Check the map for an existing . 74 | 75 | Returns < 0 on error, 0 if the reference is not found, the number of trigrams 76 | for that reference otherwise. 77 | 78 | If is not NULL, will be set to the weight value passed to the put 79 | method on return (is the reference is found). 80 | 81 | If is not NULL, it should point an array long, 82 | and up to will be copied into it matching the 83 | originally passed to the put method. 84 | 85 | Not that this is a O(n) method: the whole map will be read. 86 | */ 87 | // int blurrily_storage_get(trigram_map haystack, uint32_t reference, uint32_t* weight, int nb_trigrams, trigram_t* trigrams); 88 | 89 | /* 90 | Remove a from the map. 91 | 92 | Note that this is very innefective. 93 | 94 | Returns positive on success, negative on failure. 95 | */ 96 | int blurrily_storage_delete(trigram_map haystack, uint32_t reference); 97 | 98 | /* 99 | Return at most entries matching from the . 100 | 101 | Results are written to . The first results are the ones entries 102 | sharing the most trigrams with the . Amongst entries with the same 103 | number of matches, the lightest ones (lowest ) will be returned 104 | first. 105 | 106 | should be allocated by the caller. 107 | 108 | Returns number of matches on success, negative on failure. 109 | */ 110 | int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results); 111 | 112 | /* 113 | Copies metadata into 114 | 115 | Returns positive on success, negative on failure. 116 | */ 117 | int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats); 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /ext/blurrily/tokeniser.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "tokeniser.h" 6 | #include "blurrily.h" 7 | 8 | 9 | /******************************************************************************/ 10 | 11 | static int ipow(int a, int b) 12 | { 13 | int result = 1; 14 | 15 | while (b-- > 0) result = result * a; 16 | return result; 17 | } 18 | 19 | /******************************************************************************/ 20 | 21 | static void string_to_code(const char* input, trigram_t *output) 22 | { 23 | trigram_t result = 0; 24 | 25 | for (int k = 0 ; k < 3; ++k) { 26 | if (input[k] == '*' || input[k] < 'a' || input[k] > 'z') continue; 27 | result += ipow(TRIGRAM_BASE, k) * (input[k] - 'a' + 1); 28 | } 29 | 30 | *output = result; 31 | } 32 | 33 | /******************************************************************************/ 34 | 35 | static void code_to_string(trigram_t input, char* output) 36 | { 37 | for (int k = 0 ; k < 3; ++k) { 38 | uint16_t elem = input / ipow(TRIGRAM_BASE, k) % TRIGRAM_BASE; 39 | if (elem == 0) { 40 | output[k] = '*'; 41 | } else { 42 | output[k] = ('a' + elem - 1); 43 | } 44 | } 45 | output[3] = 0; 46 | } 47 | 48 | /******************************************************************************/ 49 | 50 | static int blurrily_compare_trigrams(const void* left_p, const void* right_p) 51 | { 52 | trigram_t* left = (trigram_t*)left_p; 53 | trigram_t* right = (trigram_t*)right_p; 54 | return (int)*left - (int)*right; 55 | } 56 | 57 | /******************************************************************************/ 58 | 59 | int blurrily_tokeniser_parse_string(const char* input, trigram_t* output) 60 | { 61 | size_t length = strlen(input); 62 | char* normalized = (char*) malloc(length+5); 63 | size_t duplicates = 0; 64 | 65 | snprintf(normalized, length+4, "**%s*", input); 66 | 67 | /* replace spaces with '*' */ 68 | for (size_t k = 0; k < length+3; ++k) { 69 | if (normalized[k] == ' ') normalized[k] = '*'; 70 | } 71 | 72 | /* compute trigrams */ 73 | for (size_t k = 0; k <= length; ++k) { 74 | string_to_code(normalized+k, output+k); 75 | } 76 | 77 | /* print results */ 78 | LOG("-- normalization\n"); 79 | LOG("%s -> %s\n", input, normalized); 80 | LOG("-- tokenisation\n"); 81 | for (size_t k = 0; k <= length; ++k) { 82 | char res[4]; 83 | 84 | code_to_string(output[k], res); 85 | 86 | LOG("%c%c%c -> %d -> %s\n", 87 | normalized[k], normalized[k+1], normalized[k+2], 88 | output[k], res 89 | ); 90 | } 91 | 92 | /* sort */ 93 | qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams); 94 | 95 | /* remove duplicates */ 96 | for (size_t k = 1; k <= length; ++k) { 97 | trigram_t* previous = output + k - 1; 98 | trigram_t* current = output + k; 99 | 100 | if (*previous == *current) { 101 | *previous = 32768; 102 | ++duplicates; 103 | } 104 | } 105 | 106 | /* compact */ 107 | qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams); 108 | 109 | /* print again */ 110 | LOG("-- after sort/compact\n"); 111 | for (size_t k = 0; k <= length-duplicates; ++k) { 112 | char res[4]; 113 | code_to_string(output[k], res); 114 | LOG("%d -> %s\n", output[k], res); 115 | } 116 | 117 | free((void*)normalized); 118 | return (int) (length + 1 - duplicates); 119 | } 120 | 121 | /******************************************************************************/ 122 | 123 | int blurrily_tokeniser_trigram(trigram_t UNUSED(input), char* UNUSED(output)) 124 | { 125 | return 0; 126 | } 127 | -------------------------------------------------------------------------------- /ext/blurrily/tokeniser.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | tokeniser.h -- 4 | 5 | Split a string into an array of trigrams. 6 | 7 | The input string should be only lowercase latin letters and spaces 8 | (convert using iconv). 9 | 10 | Each trigram is a three-symbol tuple consisting of latters and the 11 | "epsilon" character used to represent spaces and beginning-of-word/end-of- 12 | word anchors. 13 | 14 | Each trigram is represented by a 16-bit integer. 15 | 16 | */ 17 | #ifndef __TOKENISER_H__ 18 | #define __TOKENISER_H__ 19 | 20 | #include 21 | 22 | #define TRIGRAM_BASE 28 23 | 24 | typedef uint16_t trigram_t; 25 | 26 | /* 27 | Parse the string and store the result in . 28 | must be allocated by the caller and provide at least as many slots 29 | as characters in , plus one. 30 | (not all will be necessarily be filled) 31 | 32 | Returns the number of trigrams on success, a negative number on failure. 33 | */ 34 | int blurrily_tokeniser_parse_string(const char* input, trigram_t* output); 35 | 36 | 37 | /* 38 | Given an returns a string representation of the trigram in . 39 | must be allocated by caller and will always be exactly 3 40 | # 25 | # ``` 26 | # 27 | # @returns the instance of {Blurrily::Client} 28 | def initialize(options = {}) 29 | @host = options.fetch(:host, DEFAULT_HOST) 30 | @port = options.fetch(:port, DEFAULT_PORT) 31 | @db_name = options.fetch(:db_name, DEFAULT_DATABASE) 32 | end 33 | 34 | # Find record references based on a given string (needle) 35 | # 36 | # @param needle The string you're searching for matches on. 37 | # Must not contain tabs. 38 | # Required 39 | # @param limit Limit the number of results retruned (default: 10). 40 | # Must be numeric. 41 | # Optional 42 | # 43 | # Examples 44 | # 45 | # ``` 46 | # @client.find('London') 47 | # # => [[123,6,3],[124,5,3]...] 48 | # ``` 49 | # 50 | # @returns an Array of matching [`ref`,`score`,`weight`] ordered by score. `ref` is the identifying value of the original record. 51 | # Note that unless modified, `weight` is simply the string length. 52 | def find(needle, limit = nil) 53 | limit ||= LIMIT_DEFAULT 54 | check_valid_needle(needle) 55 | raise(ArgumentError, "LIMIT value must be in #{LIMIT_RANGE}") unless LIMIT_RANGE.include?(limit) 56 | 57 | cmd = ["FIND", @db_name, needle, limit] 58 | send_cmd_and_get_results(cmd).map(&:to_i).each_slice(3).to_a 59 | end 60 | 61 | # Index a given record. 62 | # 63 | # @param db_name The name of the data store being targeted. Required 64 | # @param needle The string you wish to index. Must not contain tabs. Required 65 | # @param ref The indentifying value of the record being indexed. Must be numeric. Required 66 | # @param weight Weight of this particular reference. Default 0. Don't change unless you know what you're doing. Optional. 67 | # 68 | # Examples 69 | # 70 | # ``` 71 | # @client.put('location_en', 'London', 123, 0) 72 | # # => OK 73 | # ``` 74 | # 75 | # @returns something to let you know that all is well. 76 | def put(needle, ref, weight = 0) 77 | check_valid_needle(needle) 78 | check_valid_ref(ref) 79 | raise(ArgumentError, "WEIGHT value must be in #{WEIGHT_RANGE}") unless WEIGHT_RANGE.include?(weight) 80 | 81 | cmd = ["PUT", @db_name, needle, ref, weight] 82 | send_cmd_and_get_results(cmd) 83 | return 84 | end 85 | 86 | def delete(ref) 87 | check_valid_ref(ref) 88 | cmd = ['DELETE', @db_name, ref] 89 | send_cmd_and_get_results(cmd) 90 | return 91 | end 92 | 93 | def clear() 94 | send_cmd_and_get_results(['CLEAR', @db_name]) 95 | return 96 | end 97 | 98 | 99 | private 100 | 101 | 102 | PORT_RANGE = 1025..32768 103 | 104 | def check_valid_needle(needle) 105 | raise(ArgumentError, "bad needle") if !needle.kind_of?(String) || needle.empty? || needle.include?("\t") 106 | end 107 | 108 | def check_valid_ref(ref) 109 | raise(ArgumentError, "REF value must be in #{REF_RANGE}") unless REF_RANGE.include?(ref) 110 | end 111 | 112 | 113 | def connection 114 | @connection ||= TCPSocket.new(@host, @port) 115 | end 116 | 117 | def send_cmd_and_get_results(argv) 118 | output = argv.join("\t") 119 | connection.puts output 120 | input = connection.gets 121 | case input 122 | when "OK\n" 123 | return [] 124 | when /^OK\t(.*)\n/ 125 | return $1.split("\t") 126 | when /^ERROR\t(.*)\n/ 127 | raise Error, $1 128 | when nil 129 | raise Error, 'Server disconnected' 130 | else 131 | raise Error, 'Server did not respect protocol' 132 | end 133 | end 134 | 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /lib/blurrily/command_processor.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require 'blurrily/defaults' 3 | 4 | module Blurrily 5 | class CommandProcessor 6 | ProtocolError = Class.new(StandardError) 7 | 8 | def initialize(map_group) 9 | @map_group = map_group 10 | end 11 | 12 | def process_command(line) 13 | command, map_name, *args = line.split(/\t/) 14 | raise ProtocolError, 'Unknown command' unless COMMANDS.include? command 15 | raise ProtocolError, 'Invalid database name' unless map_name =~ /^[a-z_]+$/ 16 | result = send("on_#{command}", map_name, *args) 17 | ['OK', *result].compact.join("\t") 18 | rescue ArgumentError, ProtocolError => e 19 | ['ERROR', e.message].join("\t") 20 | end 21 | 22 | private 23 | 24 | COMMANDS = %w(FIND PUT DELETE CLEAR) 25 | 26 | def on_PUT(map_name, needle, ref, weight = nil) 27 | raise ProtocolError, 'Invalid reference' unless ref =~ /^\d+$/ && REF_RANGE.include?(ref.to_i) 28 | raise ProtocolError, 'Invalid weight' unless weight.nil? || (weight =~ /^\d+$/ && WEIGHT_RANGE.include?(weight.to_i)) 29 | 30 | @map_group.map(map_name).put(*[needle, ref.to_i, weight.to_i].compact) 31 | return 32 | end 33 | 34 | def on_DELETE(map_name, ref) 35 | raise ProtocolError, 'Invalid reference' unless ref =~ /^\d+$/ && REF_RANGE.include?(ref.to_i) 36 | 37 | @map_group.map(map_name).delete(ref.to_i) 38 | return 39 | end 40 | 41 | def on_FIND(map_name, needle, limit = nil) 42 | raise ProtocolError, 'Limit must be a number' if limit && !LIMIT_RANGE.include?(limit.to_i) 43 | 44 | results = @map_group.map(map_name).find(*[needle, limit && limit.to_i].compact) 45 | return results.flatten 46 | end 47 | 48 | def on_CLEAR(map_name) 49 | @map_group.clear(map_name) 50 | return 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/blurrily/defaults.rb: -------------------------------------------------------------------------------- 1 | module Blurrily 2 | DEFAULT_HOST = 'localhost' 3 | DEFAULT_PORT = 12021 4 | DEFAULT_DATABASE = 'words' 5 | 6 | LIMIT_DEFAULT = 10 7 | LIMIT_RANGE = 1..1024 8 | REF_RANGE = 1..(1<<31) 9 | WEIGHT_RANGE = 0..(1<<31) 10 | end 11 | -------------------------------------------------------------------------------- /lib/blurrily/map.rb: -------------------------------------------------------------------------------- 1 | require 'blurrily/map_ext' 2 | require 'active_support/core_ext/module/aliasing' # alias_method_chain 3 | require 'active_support/core_ext/string/multibyte' # mb_chars 4 | 5 | module Blurrily 6 | class Map < RawMap 7 | 8 | def put(needle, reference, weight=nil) 9 | weight ||= 0 10 | needle = normalize_string needle 11 | @clean_path = nil 12 | super(needle, reference, weight) 13 | end 14 | 15 | def find(needle, limit=10) 16 | needle = normalize_string needle 17 | super(needle, limit) 18 | end 19 | 20 | def delete(*args) 21 | @clean_path = nil 22 | super(*args) 23 | end 24 | 25 | def save(path) 26 | return if @clean_path == path 27 | super(path) 28 | @clean_path = path 29 | nil 30 | end 31 | 32 | def self.load(path) 33 | super(path).tap do |map| 34 | map.instance_variable_set :@clean_path, path 35 | end 36 | end 37 | 38 | private 39 | 40 | def normalize_string(needle) 41 | result = needle.downcase 42 | unless result =~ /^([a-z ])+$/ 43 | result = ActiveSupport::Multibyte::Chars.new(result).mb_chars.normalize(:kd).gsub(/[^\x00-\x7F]/,'').to_s.gsub(/[^a-z]/,' ') 44 | # result = result.mb_chars.normalize(:kd).gsub(/[^\x00-\x7F]/,'').to_s.gsub(/[^a-z]/,' ') 45 | end 46 | result.gsub(/\s+/,' ').strip 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/blurrily/map_group.rb: -------------------------------------------------------------------------------- 1 | require 'pathname' 2 | require 'blurrily/map' 3 | 4 | module Blurrily 5 | class MapGroup 6 | 7 | def initialize(directory = nil) 8 | @directory = Pathname.new(directory || Dir.pwd) 9 | @maps = {} 10 | end 11 | 12 | def map(name) 13 | @maps[name] ||= load_map(name) || Map.new 14 | end 15 | 16 | def save 17 | @directory.mkpath 18 | @maps.each do |name, map| 19 | map.save(path_for(name).to_s) 20 | end 21 | end 22 | 23 | def clear(name) 24 | @maps[name] = Map.new 25 | end 26 | 27 | private 28 | 29 | def load_map(name) 30 | Map.load(path_for(name).to_s) 31 | rescue Errno::ENOENT 32 | nil 33 | end 34 | 35 | def path_for(name) 36 | @directory.join("#{name}.trigrams") 37 | end 38 | end 39 | end -------------------------------------------------------------------------------- /lib/blurrily/server.rb: -------------------------------------------------------------------------------- 1 | require 'eventmachine' 2 | require 'blurrily/defaults' 3 | require 'blurrily/command_processor' 4 | require 'blurrily/map_group' 5 | 6 | module Blurrily 7 | class Server 8 | 9 | def initialize(options) 10 | @host = options.fetch(:host, '0.0.0.0') 11 | @port = options.fetch(:port, Blurrily::DEFAULT_PORT) 12 | directory = options.fetch(:directory, Dir.pwd) 13 | 14 | @map_group = MapGroup.new(directory) 15 | @command_processor = CommandProcessor.new(@map_group) 16 | end 17 | 18 | def start 19 | EventMachine.run do 20 | # hit Control + C to stop 21 | Signal.trap("INT") { EventMachine.stop } 22 | Signal.trap("TERM") { EventMachine.stop } 23 | 24 | saver = proc { @map_group.save } 25 | EventMachine.add_periodic_timer(60, &saver) 26 | EventMachine.add_shutdown_hook(&saver) 27 | Signal.trap("USR1", &saver) 28 | 29 | EventMachine.start_server(@host, @port, Handler, @command_processor) 30 | end 31 | end 32 | 33 | private 34 | 35 | module Handler 36 | def initialize(processor) 37 | @processor = processor 38 | end 39 | 40 | def receive_data(data) 41 | data.split("\n").each do |line| 42 | output = @processor.process_command(line.strip) 43 | output << "\n" 44 | send_data(output) 45 | end 46 | end 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/blurrily/version.rb: -------------------------------------------------------------------------------- 1 | module Blurrily 2 | VERSION = "1.0.2" 3 | end 4 | -------------------------------------------------------------------------------- /spec/blurrily/client_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'spec_helper' 4 | require "blurrily/client" 5 | require 'pathname' 6 | 7 | describe Blurrily::Client do 8 | 9 | let(:config) { { :host => '0.0.0.0', :port => 12021, :db_name => 'location_en' } } 10 | 11 | subject { described_class.new(config) } 12 | 13 | context "find" do 14 | 15 | it "fails if no needle is passed" do 16 | expect{ subject.find() }.to raise_error(ArgumentError) 17 | end 18 | 19 | it "fails if the needle has a tab char" do 20 | expect{ subject.find("needle\twith\ttabs") }.to raise_error(ArgumentError) 21 | end 22 | 23 | it "fails if limit is not numeric" do 24 | expect{ subject.find("london", "blah") }.to raise_error(ArgumentError) 25 | end 26 | 27 | it "returns records" do 28 | mock_tcp_next_request("OK\t1337\t1\t2", "FIND\tlocation_en\tlondon\t10") 29 | expect(subject.find("london")).to eq([[1337,1,2]]) 30 | end 31 | 32 | it "handles no records found correctly" do 33 | mock_tcp_next_request("OK") 34 | expect(subject.find("blah")).to be_empty 35 | end 36 | 37 | it "handles errors correctly" do 38 | mock_tcp_next_request("ERROR") 39 | expect { subject.find("blah") }.to raise_exception(described_class::Error) 40 | end 41 | end 42 | 43 | context "put" do 44 | it "fails if no needle is passed" do 45 | expect { subject.put() }.to raise_error(ArgumentError) 46 | end 47 | 48 | it "fails if needle contains a tab" do 49 | expect { subject.put("South\tLondon", 123, 0) }.to raise_error(ArgumentError) 50 | end 51 | 52 | it "fails if no ref is passed" do 53 | expect { subject.put('London') }.to raise_error(ArgumentError) 54 | end 55 | 56 | it "fails if ref is not numeric" do 57 | expect { subject.put('London', 'abc', 0) }.to raise_error(ArgumentError) 58 | end 59 | 60 | it "fails if weight is not numeric" do 61 | expect { subject.put('London', 123, 'a') }.to raise_error(ArgumentError) 62 | end 63 | 64 | it "created a well formed request command string" do 65 | mock_tcp_next_request("OK", "PUT\tlocation_en\tLondon\t123\t0") 66 | expect(subject.put("London", 123, 0)).to be_nil 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /spec/blurrily/command_processor_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'blurrily/command_processor' 3 | require 'blurrily/map_group' 4 | 5 | describe Blurrily::CommandProcessor do 6 | 7 | subject { described_class.new(Blurrily::MapGroup.new) } 8 | 9 | describe '#process_command' do 10 | # Accepts input strings: 11 | # CLEAR-> 12 | # FIND ->->->[limit] 13 | # PUT->->->->[weight] 14 | 15 | it 'PUT and FIND finds something' do 16 | expect(subject.process_command("PUT\tlocations_en\tgreat london\t12")).to eq('OK') 17 | expect(subject.process_command("PUT\tlocations_en\tgreater masovian\t13")).to eq('OK') 18 | expect(subject.process_command("FIND\tlocations_en\tgreat")).to eq("OK\t12\t6\t12\t13\t5\t16") 19 | end 20 | 21 | it 'FIND returns "OK" if nothing found' do 22 | expect(subject.process_command("FIND\tlocations_en\tgreat london")).to eq("OK") 23 | end 24 | 25 | 26 | it 'returns ERROR for bad input data' do 27 | expect(subject.process_command('Some stuff')).to match(/^ERROR\tUnknown command/) 28 | end 29 | 30 | it 'returns ERROR for bad db name' do 31 | expect(subject.process_command("FIND\tbad db name\tWhatever string")).to match(/^ERROR\tInvalid database name/) 32 | end 33 | 34 | it 'returns ERROR for not numeric limit' do 35 | expect(subject.process_command("FIND\tdb\tWhatever string\tlimit")).to match(/^ERROR\tLimit must be a number/) 36 | end 37 | 38 | it 'returns ERROR for not numeric ref' do 39 | expect(subject.process_command("PUT\tdb\tWhatever string\t12\tweight")).to match(/^ERROR\tInvalid weight/) 40 | end 41 | 42 | it 'returns ERROR for not numeric weight' do 43 | expect(subject.process_command("PUT\tdb\tWhatever string\tref")).to match(/^ERROR\tInvalid reference/) 44 | end 45 | 46 | it 'returns ERROR for too many aruments' do 47 | expect(subject.process_command("PUT\tdb\tWhatever string\tref\tweight\targument too much")).to match(/^ERROR\twrong number /) 48 | end 49 | 50 | it 'does not return ERROR for good PUT string' do 51 | expect(subject.process_command("PUT\tdb\tWhatever string\t12\t1")).to eq('OK') 52 | end 53 | 54 | it 'does not return ERROR for limit' do 55 | expect(subject.process_command("FIND\tdb\tWhatever string\t2")).to eq("OK") 56 | end 57 | 58 | # it 'CLEAR tries to clear given DB' do 59 | # subject.send(:map_group).should_receive(:clear).with('locations_en') 60 | # subject.process_command("CLEAR\tlocations_en") 61 | # end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /spec/blurrily/map_group_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'spec_helper' 4 | require 'blurrily/map_group' 5 | 6 | describe Blurrily::MapGroup do 7 | subject { described_class.new('.') } 8 | 9 | context "creating, loading and returning a db" do 10 | 11 | it "returns an instance of Map for a given DB" do 12 | expect(subject.map("location_en")).to be_a(Blurrily::Map) 13 | end 14 | 15 | it "returns the correct map, given the db name" do 16 | map1 = subject.map('location_en') 17 | map2 = subject.map('location_fr') 18 | expect(subject.map("location_en").object_id).to eq(map1.object_id) 19 | expect(subject.map("location_en").object_id).not_to eq(map2.object_id) 20 | end 21 | 22 | it "loads from file if exists rather than creating a new db" do 23 | map1 = subject.map('location_en') 24 | map1.put('aaa',123,0) 25 | subject.save 26 | loaded_map = described_class.new('.').map('location_en') 27 | expect(loaded_map.find('aaa').first.first).to eq(123) 28 | end 29 | end 30 | 31 | context "saving the map to file" do 32 | it "saves all maps" do 33 | subject.map('location_en') 34 | subject.map('location_fr') 35 | subject.save 36 | expect(Pathname('location_en.trigrams')).to exist 37 | expect(Pathname('location_fr.trigrams')).to exist 38 | end 39 | 40 | it 'saves in chosen directory' do 41 | map_group = described_class.new('tmp') 42 | map_group.map('test') 43 | map_group.save 44 | expect(Pathname('tmp/test.trigrams')).to exist 45 | end 46 | 47 | after(:each) do 48 | FileUtils.rm Dir.glob('tmp/test.trigrams') 49 | end 50 | end 51 | 52 | after(:each) do 53 | FileUtils.rm Dir.glob('location*.trigrams') 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /spec/blurrily/map_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'spec_helper' 4 | require 'pathname' 5 | require "blurrily/map" 6 | 7 | describe Blurrily::Map do 8 | subject { described_class.new } 9 | let(:path) { Pathname.new('map.test') } 10 | 11 | after do 12 | path.delete_if_exists 13 | end 14 | 15 | describe '#stats' do 16 | let(:result) { subject.stats } 17 | 18 | it 'has :references' do 19 | expect(result[:references]).to be_a_kind_of(Integer) 20 | end 21 | 22 | it 'has :trigrams' do 23 | expect(result[:trigrams]).to be_a_kind_of(Integer) 24 | end 25 | 26 | end 27 | 28 | describe '#put' do 29 | let(:references) { subject.stats[:references] } 30 | let(:trigrams) { subject.stats[:trigrams] } 31 | 32 | it 'stores references' do 33 | subject.put 'foobar', 123, 0 34 | expect(references).to eq(1) 35 | expect(trigrams).to eq(7) 36 | end 37 | 38 | it 'returns number of added trigrams' do 39 | expect(subject.put('foobar', 123)).to eq(7) 40 | expect(subject.put('foobar', 123)).to eq(0) 41 | end 42 | 43 | it 'does not store duplicate references' do 44 | 2.times { subject.put 'foobar', 123, 0 } 45 | expect(references).to eq(1) 46 | expect(trigrams).to eq(7) 47 | end 48 | 49 | it 'accepts empty strings' do 50 | subject.put '', 123, 0 51 | expect(references).to eq(1) 52 | expect(trigrams).to eq(1) 53 | end 54 | 55 | it 'accepts non-letter characters' do 56 | subject.put '@€%é', 123, 0 57 | expect(references).to eq(1) 58 | expect(trigrams).to eq(2) 59 | end 60 | 61 | it 'ignores dupes after save/load cycle' do 62 | subject.put 'london', 123 63 | subject.save path.to_s 64 | map = described_class.load path.to_s 65 | map.put 'paris', 123 66 | expect(map.find('paris')).to be_empty 67 | end 68 | 69 | it 'makes map dirty' do 70 | subject.save path.to_s 71 | path.delete_if_exists 72 | subject.put 'london', 123 73 | subject.save path.to_s 74 | expect(path).to exist 75 | end 76 | end 77 | 78 | describe '#delete' do 79 | it 'removes references' do 80 | subject.put 'london', 123, 0 81 | subject.delete 123 82 | expect(subject.stats[:trigrams]).to eq(0) 83 | expect(subject.stats[:references]).to eq(0) 84 | end 85 | 86 | it 'makes map dirty' do 87 | subject.put 'london', 123, 0 88 | subject.save path.to_s 89 | path.delete_if_exists 90 | subject.delete 123 91 | subject.save path.to_s 92 | expect(path).to exist 93 | end 94 | 95 | context 'with duplicate references' do 96 | it 'removes duplicates' do 97 | 3.times { subject.put 'london', 123, 0 } 98 | subject.delete 123 99 | expect(subject.stats[:trigrams]).to eq(0) 100 | expect(subject.stats[:references]).to eq(0) 101 | end 102 | end 103 | 104 | it 'ignores missing references' do 105 | subject.delete 123 106 | expect(subject.stats[:trigrams]).to eq(0) 107 | end 108 | 109 | it 'permits re-adds' do 110 | subject.put 'london', 1337 111 | subject.delete 1337 112 | subject.put 'paris', 1337 113 | expect(subject.find('paris')).not_to be_empty 114 | end 115 | 116 | end 117 | 118 | describe '#find' do 119 | let(:needle) { 'london' } 120 | let(:limit) { 10 } 121 | let(:result) { subject.find needle, limit } 122 | 123 | context 'with an empty map' do 124 | it 'returns no results' do 125 | expect(result).to be_empty 126 | end 127 | end 128 | 129 | context 'with an empty string' do 130 | it 'returns no results' do 131 | needle.replace '' 132 | expect(result).to be_empty 133 | end 134 | end 135 | 136 | context 'with a limit option' do 137 | let(:limit) { 2 } 138 | it 'returns fewer results' do 139 | 5.times { |idx| subject.put 'london', idx, 0 } 140 | expect(result.length).to eq(2) 141 | end 142 | end 143 | 144 | it 'works with duplicated references' do 145 | subject.put needle, 123 146 | subject.put 'london2', 123 147 | expect(result.length).to eq(1) 148 | expect(result.first.first).to eq(123) 149 | end 150 | 151 | it 'works with duplicated needles and references' do 152 | subject.put needle, 123 153 | subject.put needle, 123 154 | expect(result.length).to eq(1) 155 | expect(result.first.first).to eq(123) 156 | end 157 | 158 | it 'returns perfect matches' do 159 | subject.put 'london', 123, 0 160 | expect(result.first).to eq([123, 7, 6]) 161 | end 162 | 163 | it 'favours exact matches' do 164 | subject.put 'lon', 125, 0 165 | subject.put 'london city airport', 124, 0 166 | subject.put 'london', 123, 0 167 | expect(result.first.first).to eq(123) 168 | end 169 | 170 | it 'ignores duplicate references' do 171 | subject.put 'london', 123 172 | subject.put 'paris', 123 173 | expect(result).not_to be_empty 174 | end 175 | 176 | context 'when needle is mis-spelt' do 177 | before { subject.put 'london', 123, 0 } 178 | 179 | it 'tolerates insertions' do 180 | needle.replace 'lonXdon' 181 | expect(result).not_to be_empty 182 | end 183 | 184 | it 'tolerates deletions' do 185 | needle.replace 'lodon' 186 | expect(result).not_to be_empty 187 | end 188 | 189 | it 'tolerates substitutions' do 190 | needle.replace 'lodnon' 191 | expect(result).not_to be_empty 192 | end 193 | end 194 | 195 | it 'sorts by descending matchiness' do 196 | subject.put 'New York', 1001, 0 197 | subject.put 'Yorkshire', 1002, 0 198 | subject.put 'York', 1003, 0 199 | subject.put 'Yorkisthan', 1004, 0 200 | needle.replace 'York' 201 | expect(result.map(&:first)).to eq([1003, 1001, 1002, 1004]) 202 | end 203 | 204 | it 'favours the lighter of two matches' do 205 | subject.put 'london', 103, 103 206 | subject.put 'london', 101, 101 207 | subject.put 'london', 102, 102 208 | expect(result.map(&:first)).to eq([101, 102, 103]) 209 | end 210 | end 211 | 212 | 213 | describe '#save' do 214 | 215 | def perform 216 | subject.save path.to_s 217 | end 218 | 219 | let(:wordsize_byte) do 220 | case ['foo'].pack('p').size # size of pointer to string 221 | when 8 then "\x08" 222 | when 4 then "\x04" 223 | else raise 'unknown platform' 224 | end 225 | end 226 | 227 | let(:big_endian_byte) do 228 | bytes = [0xAABB].pack('S').bytes.to_a 229 | if bytes == [0xBB, 0xAA] 230 | "\x01" 231 | elsif bytes == [0xAA, 0xBB] 232 | "\x02" 233 | else 234 | raise 'unknown platform' 235 | end 236 | end 237 | 238 | before do 239 | path.delete_if_exists 240 | 241 | subject.put 'london', 10, 0 242 | subject.put 'paris', 11, 0 243 | subject.put 'monaco', 12, 0 244 | end 245 | 246 | it 'creates a file on disk' do 247 | perform 248 | expect(path).to exist 249 | end 250 | 251 | it 'raises exception when directory does not exist' do 252 | expect { 253 | subject.save '/var/nonexistent/foo' 254 | }.to raise_exception(Errno::ENOENT) 255 | end 256 | 257 | it 'uses a magic header' do 258 | perform 259 | header = path.read(8) 260 | expect(header[0,6]).to eq("trigra") 261 | expect(header[6,1]).to eq(big_endian_byte) 262 | expect(header[7,1]).to eq(wordsize_byte) 263 | end 264 | 265 | it 'is idempotent' do 266 | hashes = (1..3).map { perform ; path.md5sum } 267 | expect(hashes[0]).to eq(hashes[1]) 268 | expect(hashes[0]).to eq(hashes[2]) 269 | end 270 | 271 | it 'makes map clean' do 272 | perform 273 | path.delete_if_exists 274 | perform 275 | expect(path).not_to exist 276 | end 277 | 278 | end 279 | 280 | 281 | describe '.load' do 282 | subject { described_class.load path.to_s } 283 | let(:alt_path) { Pathname.new('map2.test') } 284 | 285 | before do 286 | path.delete_if_exists 287 | Blurrily::Map.new.tap do |map| 288 | map.put 'london', 10, 0 289 | map.put 'paris', 11, 0 290 | map.put 'monaco', 12, 0 291 | map.save path.to_s 292 | end 293 | end 294 | 295 | after do 296 | alt_path.delete_if_exists 297 | end 298 | 299 | it 'results in a searchable map' do 300 | expect(subject.find('london')).not_to be_empty 301 | end 302 | 303 | it 'then saves to an identical file' do 304 | subject.save alt_path.to_s 305 | expect(path.md5sum).to eq(alt_path.md5sum) 306 | end 307 | 308 | it 'raises an exception when the file does not exist' do 309 | path.delete_if_exists 310 | expect { subject }.to raise_exception(Errno::ENOENT) 311 | end 312 | 313 | it 'raises an exception if the file is incorrect' do 314 | path.delete_if_exists 315 | path.open('w') { |io| io.write 'foo' } 316 | expect { subject }.to raise_exception(Errno::EPROTO) 317 | end 318 | 319 | it 'raises an exception if the file is corrupt' do 320 | path.truncate(128) # leave the magic in, but make it the wrong size 321 | expect { subject }.to raise_exception(Errno::EPROTO) 322 | end 323 | 324 | it 'loads clean map' do 325 | subject 326 | path.delete_if_exists 327 | subject.save path.to_s 328 | expect(path).not_to exist 329 | end 330 | end 331 | 332 | describe '#close' do 333 | let(:closed_error) { described_class::ClosedError } 334 | context 'after calling #close' do 335 | before { subject.close } 336 | 337 | it '#close fails' do 338 | expect { subject.close }.to raise_exception(closed_error) 339 | end 340 | 341 | it '#put fails' do 342 | expect { subject.put('london', 123) }.to raise_exception(closed_error) 343 | end 344 | 345 | it '#find fails' do 346 | expect { subject.find('london') }.to raise_exception(closed_error) 347 | end 348 | 349 | it '#save fails' do 350 | expect { subject.save('foo') }.to raise_exception(closed_error) 351 | end 352 | end 353 | end 354 | 355 | describe 'stress check' do 356 | let(:path) { Pathname.new "tmp/#{$$}.trigrams" } 357 | 358 | after { path.delete if path.exist? } 359 | 360 | context 'with 1k iterations' do 361 | let(:count) { 1024 } # enough cycles to force reallocations 362 | 363 | it 'puts' do 364 | count.times { |index| subject.put 'Port-au-Prince', index } 365 | expect(subject.stats[:references]).to eq(count) 366 | expect(subject.find('Port-au-Prince')).not_to be_empty 367 | end 368 | 369 | it 'put/delete/find' do 370 | count.times do |index| 371 | subject.put 'Port-au-Prince', index 372 | subject.delete index 373 | expect(subject.stats).to eq({ :references => 0, :trigrams => 0 }) 374 | expect(subject.find('Port-au-Prince')).to be_empty 375 | end 376 | end 377 | 378 | it 'put/find/delete' do 379 | count.times do |index| 380 | subject.put 'Port-au-Prince', index 381 | expect(subject.stats[:references]).to eq(1) 382 | expect(subject.find('Port-au-Prince').first.first).to eq(index) 383 | subject.delete index 384 | end 385 | end 386 | 387 | it 'puts, many deletes' do 388 | count.times { |index| subject.put 'Port-au-Prince', index } 389 | count.times { |index| subject.delete index } 390 | expect(subject.stats).to eq({ :references => 0, :trigrams => 0 }) 391 | expect(subject.find('Port-au-Prince')).to be_empty 392 | end 393 | 394 | it 'puts, reload, many deletes' do 395 | count.times { |index| subject.put 'Port-au-Prince', index } 396 | 397 | subject.save(path.to_s) 398 | subject = described_class.load(path.to_s) 399 | 400 | count.times { |index| subject.delete index } 401 | expect(subject.stats).to eq({ :references => 0, :trigrams => 0 }) 402 | expect(subject.find('Port-au-Prince')).to be_empty 403 | end 404 | end 405 | 406 | context 'with 100 iterations' do 407 | let(:count) { 100 } 408 | it 'cold loads' do 409 | count.times { |index| subject.put 'Port-au-Prince', index } 410 | subject.save(path.to_s) 411 | 412 | count.times do 413 | described_class.load(path.to_s) 414 | end 415 | end 416 | 417 | it 'put/save/load/delete' do 418 | map = subject 419 | count.times do |index| 420 | map.put 'Port-au-Prince', index 421 | map.save(path.to_s) 422 | map = described_class.load(path.to_s) 423 | map.delete(index) 424 | expect(map.stats[:references]).to eq(0) 425 | end 426 | end 427 | 428 | it 'put/save/load' do 429 | map = subject 430 | count.times do |index| 431 | map.put 'Port-au-Prince', index 432 | map.save(path.to_s) 433 | map = described_class.load(path.to_s) 434 | expect(map.stats[:references]).to eq(index+1) # index starts from 0 435 | end 436 | end 437 | end 438 | end 439 | 440 | end 441 | -------------------------------------------------------------------------------- /spec/blurrily/server_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'spec_helper' 4 | require 'pathname' 5 | require 'eventmachine' 6 | require 'socket' 7 | require "blurrily/server" 8 | 9 | describe Blurrily::Server do 10 | context 'running server' do 11 | let(:socket) { TCPSocket.new(host, @port) } 12 | let(:directory) { 'tmp/data' } 13 | let(:host) { 'localhost' } 14 | 15 | before do 16 | @port, @pid = try_to_start_server(directory) 17 | end 18 | 19 | after do 20 | if @pid 21 | Process.kill('KILL', @pid) 22 | Process.wait(@pid) 23 | end 24 | end 25 | 26 | after do 27 | FileUtils.rm_rf(directory) 28 | end 29 | 30 | it 'responds' do 31 | socket.puts 'Who is most beautiful in the world?' 32 | expect(socket.gets).to match(/^ERROR\tUnknown command/) 33 | end 34 | 35 | it 'does not close the connection' do 36 | 3.times { socket.puts 'Bad command' } 37 | 3.times do 38 | expect(socket.gets).to match(/^ERROR/) 39 | end 40 | end 41 | 42 | it 'saves when quitting' do 43 | socket = TCPSocket.new('localhost', @port) 44 | socket.puts("PUT\twords\tmerveilleux\t1") 45 | socket.gets 46 | socket.close 47 | 48 | Process.kill('TERM', @pid) 49 | Process.wait(@pid) 50 | @pid = nil 51 | path = Pathname.new(directory).join('words.trigrams') 52 | expect(path).to exist 53 | end 54 | end 55 | 56 | def try_to_start_server(directory) 57 | port = find_free_port 58 | pid = fork do 59 | described_class.new(:port => port, :directory => directory).start 60 | Kernel.exit 0 61 | end 62 | wait_for_socket('localhost', port) 63 | return [port, pid] 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /spec/integration_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'spec_helper' 4 | require 'pathname' 5 | require 'fileutils' 6 | require 'blurrily/client' 7 | require 'blurrily/map' 8 | 9 | describe 'client/server integration' do 10 | let(:data_dir) { Pathname.new 'tmp/data' } 11 | let(:data_file) { data_dir.join('foobar.trigrams') } 12 | 13 | before { data_dir.rmtree if data_dir.exist? } 14 | after { data_dir.rmtree if data_dir.exist? } 15 | 16 | around do |example| 17 | @port = find_free_port 18 | @pid = fork { exec "bin/blurrily -d #{data_dir} -p #{@port}" } 19 | wait_for_socket 'localhost', @port # until server started 20 | 21 | @client = Blurrily::Client.new(:port => @port, :db_name => 'foobar') 22 | 23 | # puts 'calling example' 24 | example.call 25 | # puts 'finished example' 26 | 27 | Process.kill('KILL', @pid) 28 | Process.detach(@pid) 29 | end 30 | 31 | it 'does single find' do 32 | @client.put 'paris', 123 33 | expect(@client.find('paris')).to match([[123, 6, 5]]) 34 | expect(@client.find('pariis')).to match([[123, 5, 5]]) 35 | end 36 | 37 | it 'does put/find cycles' do 38 | @client.put 'paris', 123 39 | @client.put 'paris', 456 40 | expect(@client.find('paris').map(&:first)).to match([123, 456]) 41 | expect(@client.find('pariis').map(&:first)).to match([123, 456]) 42 | end 43 | 44 | it 'does put/delete/find cycles' do 45 | @client.put 'paris', 123 46 | @client.put 'paris', 456 47 | @client.delete 456 48 | expect(@client.find('paris').map(&:first)).to match([123]) 49 | end 50 | 51 | it 'handles multiple databases' do 52 | @other_client = Blurrily::Client.new(:port => @port, :db_name => 'qux') 53 | @client.put 'rome', 1 54 | @other_client.put 'venice', 2 55 | 56 | expect(@client.find('rome').map(&:first)).to eq([1]) 57 | expect(@client.find('venice')).to be_empty 58 | expect(@other_client.find('venice').map(&:first)).to eq([2]) 59 | expect(@other_client.find('rome')).to be_empty 60 | end 61 | 62 | it 'saves files on SIGURS1' do 63 | @client.put 'rome', 1 64 | Process.kill('USR1', @pid) 65 | wait_for_file(data_file) 66 | end 67 | 68 | it 'uses existing maps' do 69 | map = Blurrily::Map.new 70 | map.put('london', 1337) 71 | data_dir.mkpath 72 | map.save(data_file.to_s) 73 | 74 | expect(@client.find('london').map(&:first)).to eq([1337]) 75 | end 76 | 77 | end 78 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'blurrily' 2 | require 'socket' 3 | require 'timeout' 4 | require 'coveralls' 5 | 6 | Coveralls.wear! 7 | 8 | # 9 | # Example: 10 | # mock_tcp_next_request("junk") 11 | # 12 | class FakeTCPSocket 13 | def initialize(canned_response) 14 | @canned_response = canned_response 15 | end 16 | 17 | def puts(ignored = nil) 18 | end 19 | 20 | def gets 21 | "#{@canned_response}\n" 22 | end 23 | end 24 | 25 | def mock_tcp_next_request(string, client_expectation=nil) 26 | allow(TCPSocket).to receive(:new) do 27 | FakeTCPSocket.new(string).tap do |fake_socket| 28 | if client_expectation 29 | expect(fake_socket).to receive(:puts).with(client_expectation) 30 | end 31 | end 32 | end 33 | end 34 | 35 | 36 | def is_port_open?(host, port) 37 | Timeout::timeout(1.0) do 38 | TCPSocket.new(host, port).close 39 | return true 40 | end 41 | rescue Timeout::Error, Errno::ECONNREFUSED, Errno::EHOSTUNREACH 42 | return false 43 | end 44 | 45 | def find_free_port() 46 | while true 47 | port = 1024 + rand(32768 - 1024) 48 | is_port_open?('localhost', port) and next 49 | break port 50 | end 51 | end 52 | 53 | def wait_for_socket(host, port, timeout=10.0) 54 | Timeout::timeout(timeout) do 55 | sleep 50e-3 while !is_port_open?(host, port) 56 | end 57 | end 58 | 59 | def wait_for_file(path, timeout=10.0) 60 | Timeout::timeout(timeout) do 61 | sleep 50e-3 until path.exist? 62 | end 63 | end 64 | 65 | 66 | RSpec.configure do |config| 67 | config.before(:each) do 68 | end 69 | 70 | config.after(:each) do 71 | end 72 | end 73 | 74 | 75 | Pathname.class_eval do 76 | def md5sum 77 | Digest::MD5.file(self.to_s) 78 | end 79 | 80 | def delete_if_exists 81 | delete if exist? 82 | end 83 | end 84 | --------------------------------------------------------------------------------