├── .gitignore ├── .travis.yml ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── README.textile ├── Rakefile ├── VERSION.yml ├── ext └── trie │ ├── darray.c │ ├── darray.h │ ├── extconf.rb │ ├── fileutils.c │ ├── fileutils.h │ ├── tail.c │ ├── tail.h │ ├── trie-private.c │ ├── trie-private.h │ ├── trie.c │ ├── trie.h │ ├── triedefs.h │ └── typedefs.h ├── fast_trie.gemspec └── spec └── trie_spec.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *.sw? 2 | *.o 3 | *.bundle 4 | *.dylib 5 | .DS_Store 6 | coverage 7 | *~ 8 | #* 9 | *.gem 10 | rdoc 11 | Makefile 12 | *.stackdump 13 | *.def 14 | *.so 15 | tmp/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.1.0 4 | - 2.0.0 5 | - 1.9.3 6 | - 1.8.7 7 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | group :development do 4 | gem 'rake' 5 | gem 'rspec' 6 | gem 'rdoc', '~> 3.12' 7 | gem 'bundler', '~> 1.0' 8 | gem 'jeweler', '~> 2.0.1' 9 | gem 'rake-compiler' 10 | end 11 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | addressable (2.3.5) 5 | builder (3.2.2) 6 | descendants_tracker (0.0.3) 7 | diff-lcs (1.2.5) 8 | faraday (0.9.0) 9 | multipart-post (>= 1.2, < 3) 10 | git (1.2.6) 11 | github_api (0.11.2) 12 | addressable (~> 2.3) 13 | descendants_tracker (~> 0.0.1) 14 | faraday (~> 0.8, < 0.10) 15 | hashie (>= 1.2) 16 | multi_json (>= 1.7.5, < 2.0) 17 | nokogiri (~> 1.6.0) 18 | oauth2 19 | hashie (2.0.5) 20 | highline (1.6.20) 21 | jeweler (2.0.1) 22 | builder 23 | bundler 
(>= 1.0) 24 | git (>= 1.2.5) 25 | github_api 26 | highline (>= 1.6.15) 27 | nokogiri (>= 1.5.10) 28 | rake 29 | rdoc 30 | json (1.8.1) 31 | jwt (0.1.11) 32 | multi_json (>= 1.5) 33 | mini_portile (0.5.2) 34 | multi_json (1.8.4) 35 | multi_xml (0.5.5) 36 | multipart-post (2.0.0) 37 | nokogiri (1.6.1-x86-mingw32) 38 | mini_portile (~> 0.5.0) 39 | oauth2 (0.9.3) 40 | faraday (>= 0.8, < 0.10) 41 | jwt (~> 0.1.8) 42 | multi_json (~> 1.3) 43 | multi_xml (~> 0.5) 44 | rack (~> 1.2) 45 | rack (1.5.2) 46 | rake (10.1.1) 47 | rake-compiler (0.9.2) 48 | rake 49 | rdoc (3.12.2) 50 | json (~> 1.4) 51 | rspec (2.14.1) 52 | rspec-core (~> 2.14.0) 53 | rspec-expectations (~> 2.14.0) 54 | rspec-mocks (~> 2.14.0) 55 | rspec-core (2.14.7) 56 | rspec-expectations (2.14.5) 57 | diff-lcs (>= 1.1.3, < 2.0) 58 | rspec-mocks (2.14.5) 59 | 60 | PLATFORMS 61 | x86-mingw32 62 | 63 | DEPENDENCIES 64 | bundler (~> 1.0) 65 | jeweler (~> 2.0.1) 66 | rake 67 | rake-compiler 68 | rdoc (~> 3.12) 69 | rspec 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008 Tyler McMullen 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | h1. Trie 2 | 3 | !https://badge.fury.io/rb/fast_trie.svg!:https://rubygems.org/gems/fast_trie !https://travis-ci.org/tyler/trie.svg!:https://travis-ci.org/tyler/trie 4 | 5 | This is a trie for Ruby using "libdatrie":http://linux.thai.net/~thep/datrie/. It uses a dual-array system, meaning it has best-in-class memory usage and search time. 6 | 7 | 8 | h2. What is a trie? 9 | 10 | I suck at explaining things. Wikipedia doesn't. http://wikipedia.org/wiki/Trie. 11 | 12 | But in short a trie is a data structure that holds strings in a tree. So if you inserted the words 'arc', 'ark', and 'ape' in a trie you could visualize it thusly: 13 | 14 |
 15 |       p - e
 16 |     /
 17 |   a - r - c
 18 |         \
 19 |           k
 20 | 
21 | 22 | It's easy to see how this can have pretty neat implications for things like searching through lists of strings, sorting lists of strings, and things like spelling correction and autocompletion. 23 | 24 | h2. Installation 25 | 26 | From RubyGems https://rubygems.org/gems/fast_trie 27 | 28 |

 29 |   gem install fast_trie
 30 | 
31 | 32 | h2. Tutorial 33 | 34 | Let's go through building a simple autocompleter using "Trie":http://rubydoc.info/gems/fast_trie/Trie object. 35 | 36 |

 37 |   require 'trie'
 38 |   trie = Trie.new
 39 | 
40 | 41 | Anyway. So we've created our blank trie. Now, since we're creating an autocompleter, we'll need to add some words into it. We do that simply with the add method. 42 | 43 |

 44 |   words.each do |word|
 45 |     trie.add word
 46 |   end
 47 | 
48 | 49 | Or if you have some integer data to store along with the words, such as weights or scores of some kind, you'd do it like so... 50 | 51 |

 52 |   words_and_weights.each do |word,weight|
 53 |     trie.add word, weight
 54 |   end
 55 | 
56 | 57 | Great, so we've populated our trie with some words. Let's make sure those words are really there. 58 | 59 |

 60 |   trie.has_key?('widget')  #=> true
 61 | 
 62 |   trie.get('widget')  #=> -1 or your value
 63 | 
 64 |   trie.get('not-in-the-trie')  #=> nil
 65 | 
66 | 67 | If you didn't enter a value to go along with the word, calling get with it will return -1. 68 | 69 | Okay great, we have our populated trie, we've confirmed that the keys are in there. Let's make an autocompleter! For this we'll need to use the children method. We'll do this as a simple Rails action, with the assumption you've initialized the trie into TRIE. 70 | 71 |

 72 |   def autocomplete
 73 |     children = TRIE.children(params[:prefix])
 74 | 
 75 |     respond_to do |format|
 76 |       format.js { render(:string => JSON.dump(children)) }
 77 |       format.yaml { render(:string => YAML.dump(children)) }
 78 |     end
 79 |   end
 80 | 
81 | 82 | Yep, that's it. 83 | 84 | There are, of course, some more interesting and advanced ways to use a trie. For instance, this snippet takes a string, then walks down the trie, noting each word it finds along the way. 85 | 86 |

 87 |   word = 'forestry'
 88 |   node = trie.root
 89 | 
 90 |   word.split('').each do |char|
 91 |     break unless node.walk!(char)
 92 |     if node.terminal?
 93 |       puts "Found me a word: #{node.full_state}"
 94 |     end
 95 |   end
 96 | 
97 | 98 | By calling root on a Trie, you get a "TrieNode":http://rubydoc.info/gems/fast_trie/TrieNode, pointed at the root of the trie. You can then use this node to walk the trie and perceive things about each word. 99 | 100 | You can read the reference documentation at http://rubydoc.info/gems/fast_trie/frames/Trie 101 | 102 | h2. Performance Characteristics 103 | 104 | Here are some quick benchmarks on my 2.4GHz Intel Core 2 Duo MacBook Pro: 105 | 106 | For keys that are 5 characters long: 107 | 31,344 adds/second 108 | 1,827,408 searches/second 109 | 38,453 prefix searches/second 110 | 111 | For keys that are 10 characters long: 112 | 30,653 adds/second 113 | 1,802,649 searches/second 114 | 13,553 prefix searches/second 115 | 116 | For keys that are 20 characters long: 117 | 30,488 adds/second 118 | 1,851,461 searches/second 119 | 5,855 prefix searches/second 120 | 121 | For keys that are 40 characters long: 122 | 30,710 adds/second 123 | 1,838,380 searches/second 124 | 2,762 prefix searches/second 125 | 126 | 127 | There are a few takeaways from this. First, there is no strong correlation between length of keys and insert or retrieve time. They stay fairly constant as the length of keys increases. Secondly, doing prefix searches with this trie gets slower linearly with the length of the keys in the trie. 128 | 129 | This points to a limitation of this type of trie. It is based on "libdatrie":http://linux.thai.net/~thep/datrie/ ("version 0.1.99":http://linux.thai.net/svn/software/datrie/trunk/NEWS), which is a dual-array trie. When finding branches from a particular node, we must query all possible branches to determine whether or not they exist. So for each node we do 255 of these queries. 130 | 131 | There may be some tricks to speed this up, but for now it is simply a limitation of this trie. 132 | 133 | Now, let's look at the effect of the size of the trie itself on query and insertion time. 
For this test I inserted 100, 1000, 10000, 100000, and 1000000 words in the trie. We measure the insertion and retrieval time in each. The graph below shows the results. 134 | 135 | !http://codehallow.com/effect_of_size.png! 136 | 137 | So, keeping in mind that we're increasing by orders of magnitude, you can see that the insertion time does take a significant hit. Retrieval also goes down but at a very gradual rate. (It decreases by about 50% in total, despite the size increasing by 1,000,000%.) 138 | 139 | The reason the insertion time takes such a beating is due, again, to a limitation of the trie. Storing a trie in the dual array setup that is used is excellent for memory usage and retrieval time. Best in class, in fact. However, the more things are added into the trie the more complicated it gets to insert things. It often requires shuffling large pieces of the arrays. There may be room for optimization here, but ultimately insertion time will increase with the size of the trie. 140 | 141 | 142 | 143 | Copyright (c) 2008 Tyler McMullen. See LICENSE for details. 144 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'rubygems' 4 | require 'bundler' 5 | begin 6 | Bundler.setup(:default, :development) 7 | rescue Bundler::BundlerError => e 8 | $stderr.puts e.message 9 | $stderr.puts "Run `bundle install` to install missing gems" 10 | exit e.status_code 11 | end 12 | require 'rake' 13 | 14 | require 'jeweler' 15 | 16 | jeweler_tasks = Jeweler::Tasks.new do |s| 17 | s.name = "fast_trie" 18 | s.email = "tyler@scribd.com" 19 | s.homepage = "http://github.com/tyler/trie" 20 | s.description = "Ruby Trie based on libdatrie." 
21 | s.summary = s.description 22 | s.authors = ["Tyler McMullen", "Matt Hickford"] 23 | s.extensions = ['ext/trie/extconf.rb'] 24 | s.require_paths = ['ext'] 25 | s.files = FileList["[A-Z]*.*", "{spec,ext}/**/*"] 26 | s.has_rdoc = true 27 | s.rdoc_options = ['--title', 'Trie', '--line-numbers', '--op', 'rdoc', '--main', 'ext/trie/trie.c', 'README'] 28 | end 29 | Jeweler::RubygemsDotOrgTasks.new 30 | 31 | $gemspec = jeweler_tasks.gemspec 32 | $gemspec.version = jeweler_tasks.jeweler.version 33 | 34 | require 'rake/extensiontask' 35 | Rake::ExtensionTask.new('trie', $gemspec) 36 | CLEAN.include 'lib/**/*.so' 37 | 38 | require 'rspec/core/rake_task' 39 | RSpec::Core::RakeTask.new 40 | 41 | require 'rdoc/task' 42 | Rake::RDocTask.new do |rdoc| 43 | rdoc.rdoc_dir = 'rdoc' 44 | rdoc.title = 'Trie' 45 | rdoc.options << '--line-numbers' << '--inline-source' 46 | rdoc.rdoc_files.include('README*') 47 | rdoc.rdoc_files.include('ext/trie/trie.c') 48 | end 49 | 50 | task :default => [:compile, :spec] 51 | -------------------------------------------------------------------------------- /VERSION.yml: -------------------------------------------------------------------------------- 1 | --- 2 | :major: 0 3 | :minor: 5 4 | :patch: 1 5 | :build: 6 | -------------------------------------------------------------------------------- /ext/trie/darray.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * darray.c - Double-array trie structure 4 | * Created: 2006-08-13 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "trie-private.h" 13 | #include "darray.h" 14 | #include "fileutils.h" 15 | 16 | /*----------------------------------* 17 | * INTERNAL TYPES DECLARATIONS * 18 | *----------------------------------*/ 19 | 20 | typedef struct _Symbols Symbols; 21 | 22 | struct _Symbols { 23 | short num_symbols; 24 
| TrieChar symbols[256]; 25 | }; 26 | 27 | static Symbols * symbols_new (); 28 | static void symbols_free (Symbols *syms); 29 | static void symbols_add (Symbols *syms, TrieChar c); 30 | 31 | #define symbols_num(s) ((s)->num_symbols) 32 | #define symbols_get(s,i) ((s)->symbols[i]) 33 | #define symbols_add_fast(s,c) ((s)->symbols[(s)->num_symbols++] = c) 34 | 35 | /*-----------------------------------* 36 | * PRIVATE METHODS DECLARATIONS * 37 | *-----------------------------------*/ 38 | 39 | #define da_get_free_list(d) (1) 40 | 41 | static Bool da_check_free_cell (DArray *d, 42 | TrieIndex s); 43 | 44 | static Bool da_has_children (DArray *d, 45 | TrieIndex s); 46 | 47 | static Symbols * da_output_symbols (const DArray *d, 48 | TrieIndex s); 49 | 50 | static TrieChar * da_get_state_key (const DArray *d, 51 | TrieIndex state); 52 | 53 | static TrieIndex da_find_free_base (DArray *d, 54 | const Symbols *symbols); 55 | 56 | static Bool da_fit_symbols (DArray *d, 57 | TrieIndex base, 58 | const Symbols *symbols); 59 | 60 | static void da_relocate_base (DArray *d, 61 | TrieIndex s, 62 | TrieIndex new_base); 63 | 64 | static Bool da_extend_pool (DArray *d, 65 | TrieIndex to_index); 66 | 67 | static void da_alloc_cell (DArray *d, 68 | TrieIndex cell); 69 | 70 | static void da_free_cell (DArray *d, 71 | TrieIndex cell); 72 | 73 | static Bool da_enumerate_recursive (const DArray *d, 74 | TrieIndex state, 75 | DAEnumFunc enum_func, 76 | void *user_data); 77 | 78 | /* ==================== BEGIN IMPLEMENTATION PART ==================== */ 79 | 80 | /*------------------------------------* 81 | * INTERNAL TYPES IMPLEMENTATIONS * 82 | *------------------------------------*/ 83 | 84 | static Symbols * 85 | symbols_new () 86 | { 87 | Symbols *syms; 88 | 89 | syms = (Symbols *) malloc (sizeof (Symbols)); 90 | 91 | if (!syms) 92 | return NULL; 93 | 94 | syms->num_symbols = 0; 95 | 96 | return syms; 97 | } 98 | 99 | static void 100 | symbols_free (Symbols *syms) 101 | { 102 | free 
(syms); 103 | } 104 | 105 | static void 106 | symbols_add (Symbols *syms, TrieChar c) 107 | { 108 | short lower, upper; 109 | 110 | lower = 0; 111 | upper = syms->num_symbols; 112 | while (lower < upper) { 113 | short middle; 114 | 115 | middle = (lower + upper)/2; 116 | if (c > syms->symbols[middle]) 117 | lower = middle + 1; 118 | else if (c < syms->symbols[middle]) 119 | upper = middle; 120 | else 121 | return; 122 | } 123 | if (lower < syms->num_symbols) { 124 | memmove (syms->symbols + lower + 1, syms->symbols + lower, 125 | syms->num_symbols - lower); 126 | } 127 | syms->symbols[lower] = c; 128 | syms->num_symbols++; 129 | } 130 | 131 | /*------------------------------* 132 | * PRIVATE DATA DEFINITONS * 133 | *------------------------------*/ 134 | 135 | typedef struct { 136 | TrieIndex base; 137 | TrieIndex check; 138 | } DACell; 139 | 140 | struct _DArray { 141 | TrieIndex num_cells; 142 | DACell *cells; 143 | }; 144 | 145 | /*-----------------------------* 146 | * METHODS IMPLEMENTAIONS * 147 | *-----------------------------*/ 148 | 149 | #define DA_SIGNATURE 0xDAFCDAFC 150 | 151 | /* DA Header: 152 | * - Cell 0: SIGNATURE, number of cells 153 | * - Cell 1: free circular-list pointers 154 | * - Cell 2: root node 155 | * - Cell 3: DA pool begin 156 | */ 157 | #define DA_POOL_BEGIN 3 158 | 159 | DArray * 160 | da_new () 161 | { 162 | DArray *d; 163 | 164 | d = (DArray *) malloc (sizeof (DArray)); 165 | if (!d) 166 | return NULL; 167 | 168 | d->num_cells = DA_POOL_BEGIN; 169 | d->cells = (DACell *) malloc (d->num_cells * sizeof (DACell)); 170 | if (!d->cells) 171 | goto exit_da_created; 172 | d->cells[0].base = DA_SIGNATURE; 173 | d->cells[0].check = d->num_cells; 174 | d->cells[1].base = -1; 175 | d->cells[1].check = -1; 176 | d->cells[2].base = DA_POOL_BEGIN; 177 | d->cells[2].check = 0; 178 | 179 | return d; 180 | 181 | exit_da_created: 182 | free (d); 183 | return NULL; 184 | } 185 | 186 | DArray * 187 | da_read (FILE *file) 188 | { 189 | long save_pos; 
190 | DArray *d = NULL; 191 | TrieIndex n; 192 | 193 | /* check signature */ 194 | save_pos = ftell (file); 195 | if (!file_read_int32 (file, &n) || DA_SIGNATURE != (uint32) n) { 196 | fseek (file, save_pos, SEEK_SET); 197 | return NULL; 198 | } 199 | 200 | d = (DArray *) malloc (sizeof (DArray)); 201 | if (!d) 202 | return NULL; 203 | 204 | /* read number of cells */ 205 | file_read_int32 (file, &d->num_cells); 206 | d->cells = (DACell *) malloc (d->num_cells * sizeof (DACell)); 207 | if (!d->cells) 208 | goto exit_da_created; 209 | d->cells[0].base = DA_SIGNATURE; 210 | d->cells[0].check= d->num_cells; 211 | for (n = 1; n < d->num_cells; n++) { 212 | file_read_int32 (file, &d->cells[n].base); 213 | file_read_int32 (file, &d->cells[n].check); 214 | } 215 | 216 | return d; 217 | 218 | exit_da_created: 219 | free (d); 220 | return NULL; 221 | } 222 | 223 | void 224 | da_free (DArray *d) 225 | { 226 | free (d->cells); 227 | free (d); 228 | } 229 | 230 | int 231 | da_write (const DArray *d, FILE *file) 232 | { 233 | TrieIndex i; 234 | 235 | for (i = 0; i < d->num_cells; i++) { 236 | if (!file_write_int32 (file, d->cells[i].base) || 237 | !file_write_int32 (file, d->cells[i].check)) 238 | { 239 | return -1; 240 | } 241 | } 242 | 243 | return 0; 244 | } 245 | 246 | 247 | TrieIndex 248 | da_get_root (const DArray *d) 249 | { 250 | /* can be calculated value for multi-index trie */ 251 | return 2; 252 | } 253 | 254 | 255 | TrieIndex 256 | da_get_base (const DArray *d, TrieIndex s) 257 | { 258 | return (0 <= s && s < d->num_cells) ? d->cells[s].base : TRIE_INDEX_ERROR; 259 | } 260 | 261 | TrieIndex 262 | da_get_check (const DArray *d, TrieIndex s) 263 | { 264 | return (0 <= s && s < d->num_cells) ? 
d->cells[s].check : TRIE_INDEX_ERROR; 265 | } 266 | 267 | 268 | void 269 | da_set_base (DArray *d, TrieIndex s, TrieIndex val) 270 | { 271 | if (0 <= s && s < d->num_cells) { 272 | d->cells[s].base = val; 273 | } 274 | } 275 | 276 | void 277 | da_set_check (DArray *d, TrieIndex s, TrieIndex val) 278 | { 279 | if (0 <= s && s < d->num_cells) { 280 | d->cells[s].check = val; 281 | } 282 | } 283 | 284 | Bool 285 | da_walk (const DArray *d, TrieIndex *s, TrieChar c) 286 | { 287 | TrieIndex next; 288 | 289 | next = da_get_base (d, *s) + c; 290 | if (da_get_check (d, next) == *s) { 291 | *s = next; 292 | return TRUE; 293 | } 294 | return FALSE; 295 | } 296 | 297 | TrieIndex 298 | da_insert_branch (DArray *d, TrieIndex s, TrieChar c) 299 | { 300 | TrieIndex base, next; 301 | 302 | base = da_get_base (d, s); 303 | 304 | if (base > 0) { 305 | next = base + c; 306 | 307 | /* if already there, do not actually insert */ 308 | if (da_get_check (d, next) == s) 309 | return next; 310 | 311 | /* if (base + c) > TRIE_INDEX_MAX which means 'next' is overflow, 312 | * or cell [next] is not free, relocate to a free slot 313 | */ 314 | if (base > TRIE_INDEX_MAX - c || !da_check_free_cell (d, next)) { 315 | Symbols *symbols; 316 | TrieIndex new_base; 317 | 318 | /* relocate BASE[s] */ 319 | symbols = da_output_symbols (d, s); 320 | symbols_add (symbols, c); 321 | new_base = da_find_free_base (d, symbols); 322 | symbols_free (symbols); 323 | 324 | if (TRIE_INDEX_ERROR == new_base) 325 | return TRIE_INDEX_ERROR; 326 | 327 | da_relocate_base (d, s, new_base); 328 | next = new_base + c; 329 | } 330 | } else { 331 | Symbols *symbols; 332 | TrieIndex new_base; 333 | 334 | symbols = symbols_new (); 335 | symbols_add (symbols, c); 336 | new_base = da_find_free_base (d, symbols); 337 | symbols_free (symbols); 338 | 339 | if (TRIE_INDEX_ERROR == new_base) 340 | return TRIE_INDEX_ERROR; 341 | 342 | da_set_base (d, s, new_base); 343 | next = new_base + c; 344 | } 345 | da_alloc_cell (d, next); 346 
| da_set_check (d, next, s); 347 | 348 | return next; 349 | } 350 | 351 | static Bool 352 | da_check_free_cell (DArray *d, 353 | TrieIndex s) 354 | { 355 | return da_extend_pool (d, s) && da_get_check (d, s) < 0; 356 | } 357 | 358 | static Bool 359 | da_has_children (DArray *d, 360 | TrieIndex s) 361 | { 362 | TrieIndex base; 363 | TrieIndex c, max_c; 364 | 365 | base = da_get_base (d, s); 366 | if (TRIE_INDEX_ERROR == base || base < 0) 367 | return FALSE; 368 | 369 | max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - base); 370 | for (c = 0; c < max_c; c++) { 371 | if (da_get_check (d, base + c) == s) 372 | return TRUE; 373 | } 374 | 375 | return FALSE; 376 | } 377 | 378 | static Symbols * 379 | da_output_symbols (const DArray *d, 380 | TrieIndex s) 381 | { 382 | Symbols *syms; 383 | TrieIndex base; 384 | TrieIndex c, max_c; 385 | 386 | syms = symbols_new (); 387 | 388 | base = da_get_base (d, s); 389 | max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - base); 390 | for (c = 0; c < max_c; c++) { 391 | if (da_get_check (d, base + c) == s) 392 | symbols_add_fast (syms, (TrieChar) c); 393 | } 394 | 395 | return syms; 396 | } 397 | 398 | static TrieChar * 399 | da_get_state_key (const DArray *d, 400 | TrieIndex state) 401 | { 402 | TrieChar *key; 403 | int key_size, key_length; 404 | int i; 405 | 406 | key_size = 20; 407 | key_length = 0; 408 | key = (TrieChar *) malloc (key_size); 409 | 410 | /* trace back to root */ 411 | while (da_get_root (d) != state) { 412 | TrieIndex parent; 413 | 414 | if (key_length + 1 >= key_size) { 415 | key_size += 20; 416 | key = (TrieChar *) realloc (key, key_size); 417 | } 418 | parent = da_get_check (d, state); 419 | key[key_length++] = (TrieChar) (state - da_get_base (d, parent)); 420 | state = parent; 421 | } 422 | key[key_length] = '\0'; 423 | 424 | /* reverse the string */ 425 | for (i = 0; i < --key_length; i++) { 426 | TrieChar temp; 427 | 428 | temp = key[i]; 429 | key[i] = key[key_length]; 430 | key[key_length] = temp; 431 | } 432 
| 433 | return key; 434 | } 435 | 436 | static TrieIndex 437 | da_find_free_base (DArray *d, 438 | const Symbols *symbols) 439 | { 440 | TrieChar first_sym; 441 | TrieIndex s; 442 | 443 | /* find first free cell that is beyond the first symbol */ 444 | first_sym = symbols_get (symbols, 0); 445 | s = -da_get_check (d, da_get_free_list (d)); 446 | while (s != da_get_free_list (d) 447 | && s < (TrieIndex) first_sym + DA_POOL_BEGIN) 448 | { 449 | s = -da_get_check (d, s); 450 | } 451 | if (s == da_get_free_list (d)) { 452 | for (s = first_sym + DA_POOL_BEGIN; ; ++s) { 453 | if (!da_extend_pool (d, s)) 454 | return TRIE_INDEX_ERROR; 455 | if (da_get_check (d, s) < 0) 456 | break; 457 | } 458 | } 459 | 460 | /* search for next free cell that fits the symbols set */ 461 | while (!da_fit_symbols (d, s - first_sym, symbols)) { 462 | /* extend pool before getting exhausted */ 463 | if (-da_get_check (d, s) == da_get_free_list (d)) { 464 | if (!da_extend_pool (d, d->num_cells)) 465 | return TRIE_INDEX_ERROR; 466 | } 467 | 468 | s = -da_get_check (d, s); 469 | } 470 | 471 | return s - first_sym; 472 | } 473 | 474 | static Bool 475 | da_fit_symbols (DArray *d, 476 | TrieIndex base, 477 | const Symbols *symbols) 478 | { 479 | int i; 480 | 481 | for (i = 0; i < symbols_num (symbols); i++) { 482 | TrieChar sym = symbols_get (symbols, i); 483 | 484 | /* if (base + sym) > TRIE_INDEX_MAX which means it's overflow, 485 | * or cell [base + sym] is not free, the symbol is not fit. 
486 | */ 487 | if (base > TRIE_INDEX_MAX - sym || !da_check_free_cell (d, base + sym)) 488 | return FALSE; 489 | } 490 | return TRUE; 491 | } 492 | 493 | static void 494 | da_relocate_base (DArray *d, 495 | TrieIndex s, 496 | TrieIndex new_base) 497 | { 498 | TrieIndex old_base; 499 | Symbols *symbols; 500 | int i; 501 | 502 | old_base = da_get_base (d, s); 503 | symbols = da_output_symbols (d, s); 504 | 505 | for (i = 0; i < symbols_num (symbols); i++) { 506 | TrieIndex old_next, new_next, old_next_base; 507 | 508 | old_next = old_base + symbols_get (symbols, i); 509 | new_next = new_base + symbols_get (symbols, i); 510 | old_next_base = da_get_base (d, old_next); 511 | 512 | /* allocate new next node and copy BASE value */ 513 | da_alloc_cell (d, new_next); 514 | da_set_check (d, new_next, s); 515 | da_set_base (d, new_next, old_next_base); 516 | 517 | /* old_next node is now moved to new_next 518 | * so, all cells belonging to old_next 519 | * must be given to new_next 520 | */ 521 | /* preventing the case of TAIL pointer */ 522 | if (old_next_base > 0) { 523 | TrieIndex c, max_c; 524 | 525 | max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - old_next_base); 526 | for (c = 0; c < max_c; c++) { 527 | if (da_get_check (d, old_next_base + c) == old_next) 528 | da_set_check (d, old_next_base + c, new_next); 529 | } 530 | } 531 | 532 | /* free old_next node */ 533 | da_free_cell (d, old_next); 534 | } 535 | 536 | symbols_free (symbols); 537 | 538 | /* finally, make BASE[s] point to new_base */ 539 | da_set_base (d, s, new_base); 540 | } 541 | 542 | static Bool 543 | da_extend_pool (DArray *d, 544 | TrieIndex to_index) 545 | { 546 | TrieIndex new_begin; 547 | TrieIndex i; 548 | TrieIndex free_tail; 549 | 550 | if (to_index <= 0 || TRIE_INDEX_MAX <= to_index) 551 | return FALSE; 552 | 553 | if (to_index < d->num_cells) 554 | return TRUE; 555 | 556 | d->cells = (DACell *) realloc (d->cells, (to_index + 1) * sizeof (DACell)); 557 | new_begin = d->num_cells; 558 | 
d->num_cells = to_index + 1; 559 | 560 | /* initialize new free list */ 561 | for (i = new_begin; i < to_index; i++) { 562 | da_set_check (d, i, -(i + 1)); 563 | da_set_base (d, i + 1, -i); 564 | } 565 | 566 | /* merge the new circular list to the old */ 567 | free_tail = -da_get_base (d, da_get_free_list (d)); 568 | da_set_check (d, free_tail, -new_begin); 569 | da_set_base (d, new_begin, -free_tail); 570 | da_set_check (d, to_index, -da_get_free_list (d)); 571 | da_set_base (d, da_get_free_list (d), -to_index); 572 | 573 | /* update header cell */ 574 | d->cells[0].check = d->num_cells; 575 | 576 | return TRUE; 577 | } 578 | 579 | void 580 | da_prune (DArray *d, TrieIndex s) 581 | { 582 | da_prune_upto (d, da_get_root (d), s); 583 | } 584 | 585 | void 586 | da_prune_upto (DArray *d, TrieIndex p, TrieIndex s) 587 | { 588 | while (p != s && !da_has_children (d, s)) { 589 | TrieIndex parent; 590 | 591 | parent = da_get_check (d, s); 592 | da_free_cell (d, s); 593 | s = parent; 594 | } 595 | } 596 | 597 | static void 598 | da_alloc_cell (DArray *d, 599 | TrieIndex cell) 600 | { 601 | TrieIndex prev, next; 602 | 603 | prev = -da_get_base (d, cell); 604 | next = -da_get_check (d, cell); 605 | 606 | /* remove the cell from free list */ 607 | da_set_check (d, prev, -next); 608 | da_set_base (d, next, -prev); 609 | } 610 | 611 | static void 612 | da_free_cell (DArray *d, 613 | TrieIndex cell) 614 | { 615 | TrieIndex i, prev; 616 | 617 | /* find insertion point */ 618 | i = -da_get_check (d, da_get_free_list (d)); 619 | while (i != da_get_free_list (d) && i < cell) 620 | i = -da_get_check (d, i); 621 | 622 | prev = -da_get_base (d, i); 623 | 624 | /* insert cell before i */ 625 | da_set_check (d, cell, -i); 626 | da_set_base (d, cell, -prev); 627 | da_set_check (d, prev, -cell); 628 | da_set_base (d, i, -cell); 629 | } 630 | 631 | Bool 632 | da_enumerate (const DArray *d, DAEnumFunc enum_func, void *user_data) 633 | { 634 | return da_enumerate_recursive (d, da_get_root 
(d), enum_func, user_data); 635 | } 636 | 637 | static Bool 638 | da_enumerate_recursive (const DArray *d, 639 | TrieIndex state, 640 | DAEnumFunc enum_func, 641 | void *user_data) 642 | { 643 | Bool ret; 644 | TrieIndex base; 645 | 646 | base = da_get_base (d, state); 647 | 648 | if (base < 0) { 649 | TrieChar *key; 650 | 651 | key = da_get_state_key (d, state); 652 | ret = (*enum_func) (key, state, user_data); 653 | free (key); 654 | } else { 655 | Symbols *symbols; 656 | int i; 657 | 658 | ret = TRUE; 659 | symbols = da_output_symbols (d, state); 660 | for (i = 0; ret && i < symbols_num (symbols); i++) { 661 | ret = da_enumerate_recursive (d, base + symbols_get (symbols, i), 662 | enum_func, user_data); 663 | } 664 | 665 | symbols_free (symbols); 666 | } 667 | 668 | return ret; 669 | } 670 | 671 | /* 672 | vi:ts=4:ai:expandtab 673 | */ 674 | -------------------------------------------------------------------------------- /ext/trie/darray.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * darray.h - Double-array trie structure 4 | * Created: 2006-08-11 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #ifndef __DARRAY_H 9 | #define __DARRAY_H 10 | 11 | #include "triedefs.h" 12 | 13 | /** 14 | * @file darray.h 15 | * @brief Double-array trie structure 16 | */ 17 | 18 | /** 19 | * @brief Double-array structure type 20 | */ 21 | typedef struct _DArray DArray; 22 | 23 | /** 24 | * @brief Double-array entry enumeration function 25 | * 26 | * @param key : the key of the entry, up to @a sep_node 27 | * @param sep_node : the separate node of the entry 28 | * @param user_data : user-supplied data 29 | * 30 | * @return TRUE to continue enumeration, FALSE to stop 31 | */ 32 | typedef Bool (*DAEnumFunc) (const TrieChar *key, 33 | TrieIndex sep_node, 34 | void *user_data); 35 | 36 | 37 | /** 38 | * @brief Create a new double-array object 39 | * 
40 | * Create a new empty double-array object. 41 | */ 42 | DArray * da_new (); 43 | 44 | /** 45 | * @brief Read double-array data from file 46 | * 47 | * @param file : the file to read 48 | * 49 | * @return a pointer to the opened double-array, NULL on failure 50 | * 51 | * Read double-array data from the opened file, starting from the current 52 | * file pointer until the end of double array data block. On return, the 53 | * file pointer is left at the position after the read block. 54 | */ 55 | DArray * da_read (FILE *file); 56 | 57 | /** 58 | * @brief Free double-array data 59 | * 60 | * @param d : the double-array data 61 | * 62 | * Free the given double-array data. 63 | */ 64 | void da_free (DArray *d); 65 | 66 | /** 67 | * @brief Write double-array data 68 | * 69 | * @param d : the double-array data 70 | * @param file : the file to write to 71 | * 72 | * @return 0 on success, non-zero on failure 73 | * 74 | * Write double-array data to the given @a file, starting from the current 75 | * file pointer. On return, the file pointer is left after the double-array 76 | * data block. 77 | */ 78 | int da_write (const DArray *d, FILE *file); 79 | 80 | 81 | /** 82 | * @brief Get root state 83 | * 84 | * @param d : the double-array data 85 | * 86 | * @return root state of the @a index set, or TRIE_INDEX_ERROR on failure 87 | * 88 | * Get root state for stepwise walking. 89 | */ 90 | TrieIndex da_get_root (const DArray *d); 91 | 92 | 93 | /** 94 | * @brief Get BASE cell 95 | * 96 | * @param d : the double-array data 97 | * @param s : the double-array state to get data 98 | * 99 | * @return the BASE cell value for the given state 100 | * 101 | * Get BASE cell value for the given state. 
102 | */ 103 | TrieIndex da_get_base (const DArray *d, TrieIndex s); 104 | 105 | /** 106 | * @brief Get CHECK cell 107 | * 108 | * @param d : the double-array data 109 | * @param s : the double-array state to get data 110 | * 111 | * @return the CHECK cell value for the given state 112 | * 113 | * Get CHECK cell value for the given state. 114 | */ 115 | TrieIndex da_get_check (const DArray *d, TrieIndex s); 116 | 117 | 118 | /** 119 | * @brief Set BASE cell 120 | * 121 | * @param d : the double-array data 122 | * @param s : the double-array state to get data 123 | * @param val : the value to set 124 | * 125 | * Set BASE cell for the given state to the given value. 126 | */ 127 | void da_set_base (DArray *d, TrieIndex s, TrieIndex val); 128 | 129 | /** 130 | * @brief Set CHECK cell 131 | * 132 | * @param d : the double-array data 133 | * @param s : the double-array state to get data 134 | * @param val : the value to set 135 | * 136 | * Set CHECK cell for the given state to the given value. 137 | */ 138 | void da_set_check (DArray *d, TrieIndex s, TrieIndex val); 139 | 140 | /** 141 | * @brief Walk in double-array structure 142 | * 143 | * @param d : the double-array structure 144 | * @param s : current state 145 | * @param c : the input character 146 | * 147 | * @return boolean indicating success 148 | * 149 | * Walk the double-array trie from state @a *s, using input character @a c. 150 | * If there exists an edge from @a *s with arc labeled @a c, this function 151 | * returns TRUE and @a *s is updated to the new state. Otherwise, it returns 152 | * FALSE and @a *s is left unchanged. 
153 | */ 154 | Bool da_walk (const DArray *d, TrieIndex *s, TrieChar c); 155 | 156 | /** 157 | * @brief Test walkability in double-array structure 158 | * 159 | * @param d : the double-array structure 160 | * @param s : current state 161 | * @param c : the input character 162 | * 163 | * @return boolean indicating walkability 164 | * 165 | * Test if there is a transition from state @a s with input character @a c. 166 | */ 167 | /* 168 | Bool da_is_walkable (DArray *d, TrieIndex s, TrieChar c); 169 | */ 170 | #define da_is_walkable(d,s,c) \ 171 | (da_get_check ((d), da_get_base ((d), (s)) + (c)) == (s)) 172 | 173 | /** 174 | * @brief Insert a branch from trie node 175 | * 176 | * @param d : the double-array structure 177 | * @param s : the state to add branch to 178 | * @param c : the character for the branch label 179 | * 180 | * @return the index of the new node 181 | * 182 | * Insert a new arc labelled with character @a c from the trie node 183 | * represented by index @a s in double-array structure @a d. 184 | * Note that it assumes that no such arc exists before inserting. 185 | */ 186 | TrieIndex da_insert_branch (DArray *d, TrieIndex s, TrieChar c); 187 | 188 | /** 189 | * @brief Prune the single branch 190 | * 191 | * @param d : the double-array structure 192 | * @param s : the dangling state to prune off 193 | * 194 | * Prune off a non-separate path up from the final state @a s. 195 | * If @a s still has some children states, it does nothing. Otherwise, 196 | * it deletes the node and all its parents which become non-separate. 197 | */ 198 | void da_prune (DArray *d, TrieIndex s); 199 | 200 | /** 201 | * @brief Prune the single branch up to given parent 202 | * 203 | * @param d : the double-array structure 204 | * @param p : the parent up to which to be pruned 205 | * @param s : the dangling state to prune off 206 | * 207 | * Prune off a non-separate path up from the final state @a s to the 208 | * given parent @a p. 
The prunning stop when either the parent @a p 209 | * is met, or a first non-separate node is found. 210 | */ 211 | void da_prune_upto (DArray *d, TrieIndex p, TrieIndex s); 212 | 213 | /** 214 | * @brief Enumerate entries stored in double-array structure 215 | * 216 | * @param d : the double-array structure 217 | * @param enum_func : the callback function to be called on each separate node 218 | * @param user_data : user-supplied data to send as an argument to @a enum_func 219 | * 220 | * @return boolean value indicating whether all the keys are visited 221 | * 222 | * Enumerate all keys stored in double-array structure. For each entry, the 223 | * user-supplied @a enum_func callback function is called, with the entry key, 224 | * the separate node, and user-supplied data. Returning FALSE from such 225 | * callback will stop enumeration and return FALSE. 226 | */ 227 | Bool da_enumerate (const DArray *d, DAEnumFunc enum_func, void *user_data); 228 | 229 | #endif /* __DARRAY_H */ 230 | 231 | /* 232 | vi:ts=4:ai:expandtab 233 | */ 234 | -------------------------------------------------------------------------------- /ext/trie/extconf.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | create_makefile 'trie' 3 | 4 | -------------------------------------------------------------------------------- /ext/trie/fileutils.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * fileutils.h - File utility functions 4 | * Created: 2006-08-15 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include "fileutils.h" 12 | 13 | /*--------------------------------------* 14 | * INTERNAL FUNCTIONS DECLARATIONS * 15 | *--------------------------------------*/ 16 | 17 | static char * make_full_path (const char *dir, 18 | const char *name, 19 | const char *ext); 20 | 21 | /* 
==================== BEGIN IMPLEMENTATION PART ==================== */ 22 | 23 | /*--------------------------------* 24 | * FUNCTIONS IMPLEMENTATIONS * 25 | *--------------------------------*/ 26 | 27 | static char * 28 | make_full_path (const char *dir, const char *name, const char *ext) 29 | { 30 | char *path; 31 | 32 | path = (char *) malloc (strlen (dir) + strlen (name) + strlen (ext) + 2); 33 | sprintf (path, "%s/%s%s", dir, name, ext); 34 | 35 | return path; 36 | } 37 | 38 | FILE * 39 | file_open (const char *dir, const char *name, const char *ext, TrieIOMode mode) 40 | { 41 | const char *std_mode; 42 | char *full_path; 43 | FILE *file; 44 | 45 | if (mode & TRIE_IO_WRITE) 46 | std_mode = "r+"; 47 | else 48 | std_mode = "r"; 49 | 50 | full_path = make_full_path (dir, name, ext); 51 | file = fopen (full_path, std_mode); 52 | if (!file && mode & TRIE_IO_CREATE) 53 | file = fopen (full_path, "w+"); 54 | free (full_path); 55 | 56 | return file; 57 | } 58 | 59 | long 60 | file_length (FILE *file) 61 | { 62 | long cur_pos; 63 | long size; 64 | 65 | cur_pos = ftell (file); 66 | 67 | fseek (file, 0L, SEEK_END); 68 | size = ftell (file); 69 | 70 | fseek (file, cur_pos, SEEK_SET); 71 | 72 | return size; 73 | } 74 | 75 | Bool 76 | file_read_int32 (FILE *file, int32 *o_val) 77 | { 78 | unsigned char buff[4]; 79 | 80 | if (fread (buff, 4, 1, file) == 1) { 81 | *o_val = (buff[0] << 24) | (buff[1] << 16) | (buff[2] << 8) | buff[3]; 82 | return TRUE; 83 | } 84 | 85 | return FALSE; 86 | } 87 | 88 | Bool 89 | file_write_int32 (FILE *file, int32 val) 90 | { 91 | unsigned char buff[4]; 92 | 93 | buff[0] = (val >> 24) & 0xff; 94 | buff[1] = (val >> 16) & 0xff; 95 | buff[2] = (val >> 8) & 0xff; 96 | buff[3] = val & 0xff; 97 | 98 | return (fwrite (buff, 4, 1, file) == 1); 99 | } 100 | 101 | Bool 102 | file_read_int16 (FILE *file, int16 *o_val) 103 | { 104 | unsigned char buff[2]; 105 | 106 | if (fread (buff, 2, 1, file) == 1) { 107 | *o_val = (buff[0] << 8) | buff[1]; 108 | return 
TRUE; 109 | } 110 | 111 | return FALSE; 112 | } 113 | 114 | Bool 115 | file_write_int16 (FILE *file, int16 val) 116 | { 117 | unsigned char buff[2]; 118 | 119 | buff[0] = val >> 8; 120 | buff[1] = val & 0xff; 121 | 122 | return (fwrite (buff, 2, 1, file) == 1); 123 | } 124 | 125 | Bool 126 | file_read_int8 (FILE *file, int8 *o_val) 127 | { 128 | return (fread (o_val, sizeof (int8), 1, file) == 1); 129 | } 130 | 131 | Bool 132 | file_write_int8 (FILE *file, int8 val) 133 | { 134 | return (fwrite (&val, sizeof (int8), 1, file) == 1); 135 | } 136 | 137 | Bool 138 | file_read_chars (FILE *file, char *buff, int len) 139 | { 140 | return (fread (buff, sizeof (char), len, file) == len); 141 | } 142 | 143 | Bool 144 | file_write_chars (FILE *file, const char *buff, int len) 145 | { 146 | return (fwrite (buff, sizeof (char), len, file) == len); 147 | } 148 | 149 | /* 150 | vi:ts=4:ai:expandtab 151 | */ 152 | -------------------------------------------------------------------------------- /ext/trie/fileutils.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * fileutils.h - File utility functions 4 | * Created: 2006-08-14 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #ifndef __FILEUTILS_H 9 | #define __FILEUTILS_H 10 | 11 | #include 12 | 13 | #include "triedefs.h" 14 | 15 | FILE * file_open (const char *dir, const char *name, const char *ext, 16 | TrieIOMode mode); 17 | 18 | long file_length (FILE *file); 19 | 20 | Bool file_read_int32 (FILE *file, int32 *o_val); 21 | Bool file_write_int32 (FILE *file, int32 val); 22 | 23 | Bool file_read_int16 (FILE *file, int16 *o_val); 24 | Bool file_write_int16 (FILE *file, int16 val); 25 | 26 | Bool file_read_int8 (FILE *file, int8 *o_val); 27 | Bool file_write_int8 (FILE *file, int8 val); 28 | 29 | Bool file_read_chars (FILE *file, char *buff, int len); 30 | Bool file_write_chars (FILE *file, const char 
*buff, int len); 31 | 32 | #endif /* __FILEUTILS_H */ 33 | 34 | /* 35 | vi:ts=4:ai:expandtab 36 | */ 37 | -------------------------------------------------------------------------------- /ext/trie/tail.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * tail.c - trie tail for keeping suffixes 4 | * Created: 2006-08-15 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "tail.h" 13 | #include "fileutils.h" 14 | 15 | /*----------------------------------* 16 | * INTERNAL TYPES DECLARATIONS * 17 | *----------------------------------*/ 18 | 19 | /*-----------------------------------* 20 | * PRIVATE METHODS DECLARATIONS * 21 | *-----------------------------------*/ 22 | 23 | static TrieIndex tail_alloc_block (Tail *t); 24 | static void tail_free_block (Tail *t, TrieIndex block); 25 | 26 | /* ==================== BEGIN IMPLEMENTATION PART ==================== */ 27 | 28 | /*------------------------------------* 29 | * INTERNAL TYPES IMPLEMENTATIONS * 30 | *------------------------------------*/ 31 | 32 | /*------------------------------* 33 | * PRIVATE DATA DEFINITONS * 34 | *------------------------------*/ 35 | 36 | typedef struct { 37 | TrieIndex next_free; 38 | TrieData data; 39 | TrieChar *suffix; 40 | } TailBlock; 41 | 42 | struct _Tail { 43 | TrieIndex num_tails; 44 | TailBlock *tails; 45 | TrieIndex first_free; 46 | }; 47 | 48 | /*-----------------------------* 49 | * METHODS IMPLEMENTAIONS * 50 | *-----------------------------*/ 51 | 52 | #define TAIL_SIGNATURE 0xDFFCDFFC 53 | #define TAIL_START_BLOCKNO 1 54 | 55 | /* Tail Header: 56 | * INT32: signature 57 | * INT32: pointer to first free slot 58 | * INT32: number of tail blocks 59 | * 60 | * Tail Blocks: 61 | * INT32: pointer to next free block (-1 for allocated blocks) 62 | * INT32: data for the key 63 | * INT16: length 64 | * 
BYTES[length]: suffix string (no terminating '\0') 65 | */ 66 | 67 | Tail * 68 | tail_new () 69 | { 70 | Tail *t; 71 | 72 | t = (Tail *) malloc (sizeof (Tail)); 73 | if (!t) 74 | return NULL; 75 | 76 | t->first_free = 0; 77 | t->num_tails = 0; 78 | t->tails = NULL; 79 | 80 | return t; 81 | } 82 | /* Load a tail table from file; a bad signature, short read, invalid size or failed allocation yields NULL, never a half-built table. */ 83 | Tail * 84 | tail_read (FILE *file) 85 | { 86 | long save_pos; 87 | Tail *t; 88 | TrieIndex i; 89 | uint32 sig; 90 | 91 | /* check signature; rewind on mismatch so the stream is left where it was */ 92 | save_pos = ftell (file); 93 | if (!file_read_int32 (file, (int32 *) &sig) || TAIL_SIGNATURE != sig) { 94 | fseek (file, save_pos, SEEK_SET); 95 | return NULL; 96 | } 97 | 98 | t = (Tail *) malloc (sizeof (Tail)); 99 | if (!t) 100 | return NULL; 101 | t->tails = NULL; /* so tail_free() can unwind from any failure below */ 102 | if (!file_read_int32 (file, &t->first_free) || 103 | !file_read_int32 (file, &t->num_tails) || t->num_tails < 0) 104 | goto exit_tail_created; 105 | /* zero-filled, so blocks not read yet have no suffix to free; never request 0 bytes */ 106 | t->tails = (TailBlock *) calloc (t->num_tails ? t->num_tails : 1, sizeof (TailBlock)); 107 | if (!t->tails) 108 | goto exit_tail_created; 109 | for (i = 0; i < t->num_tails; i++) { 110 | int16 length; 111 | 112 | if (!file_read_int32 (file, &t->tails[i].next_free) || 113 | !file_read_int32 (file, &t->tails[i].data) || 114 | !file_read_int16 (file, &length) || length < 0) /* a negative length would index before the buffer */ 115 | goto exit_tail_created; 116 | t->tails[i].suffix = (TrieChar *) malloc (length + 1); 117 | if (!t->tails[i].suffix || (length > 0 && 118 | !file_read_chars (file, (char *)t->tails[i].suffix, length))) 119 | goto exit_tail_created; 120 | t->tails[i].suffix[length] = '\0'; 121 | } 122 | return t; 123 | exit_tail_created: 124 | tail_free (t); return NULL; 125 | } 126 | 127 | void 128 | tail_free (Tail *t) 129 | { 130 | TrieIndex i; 131 | 132 | if (t->tails) { 133 | for (i = 0; i < t->num_tails; i++) 134 | if (t->tails[i].suffix) 135 | free (t->tails[i].suffix); 136 | free (t->tails); 137 | } 138 | free (t); 139 | } 140 | 141 | int 142 | tail_write (const Tail *t, FILE *file) 143 | { 144 | TrieIndex i; 145 | 146 | if (!file_write_int32 (file, TAIL_SIGNATURE) || 147 | !file_write_int32 (file, t->first_free) || 148 | !file_write_int32 (file, t->num_tails)) 149 | { 150 | return -1; 151 | } 152 | for (i = 0; i < t->num_tails; 
i++) { 153 | int16 length; 154 | 155 | if (!file_write_int32 (file, t->tails[i].next_free) || 156 | !file_write_int32 (file, t->tails[i].data)) 157 | { 158 | return -1; 159 | } 160 | 161 | length = t->tails[i].suffix ? strlen ((const char *)t->tails[i].suffix) 162 | : 0; 163 | if (!file_write_int16 (file, length)) 164 | return -1; 165 | if (length > 0 && 166 | !file_write_chars (file, (char *)t->tails[i].suffix, length)) 167 | { 168 | return -1; 169 | } 170 | } 171 | 172 | return 0; 173 | } 174 | 175 | /* Return the suffix stored for block number index (still owned by the tail), or NULL when the number is out of range. */ 176 | const TrieChar * 177 | tail_get_suffix (const Tail *t, TrieIndex index) 178 | { 179 | index -= TAIL_START_BLOCKNO; /* block numbers start at TAIL_START_BLOCKNO; convert to array index */ 180 | return (index >= 0 && index < t->num_tails) ? t->tails[index].suffix : NULL; 181 | } 182 | /* Replace the suffix of block number index with a private copy of suffix (NULL clears it); FALSE on a bad block number or failed copy. */ 183 | Bool 184 | tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix) 185 | { 186 | index -= TAIL_START_BLOCKNO; 187 | if (index >= 0 && index < t->num_tails) { /* TrieIndex is signed: reject numbers below the first block too */ 188 | /* suffix and t->tails[index].suffix may overlap; 189 | * so, dup it before it's overwritten 190 | */ 191 | TrieChar *tmp = NULL; 192 | if (suffix && !(tmp = (TrieChar *) strdup ((const char *)suffix))) 193 | return FALSE; /* out of memory: leave the old suffix in place */ 194 | if (t->tails[index].suffix) 195 | free (t->tails[index].suffix); 196 | t->tails[index].suffix = tmp; 197 | 198 | return TRUE; 199 | } 200 | return FALSE; 201 | } 202 | /* Allocate a block, store a copy of suffix in it and return the new block number. */ 203 | TrieIndex 204 | tail_add_suffix (Tail *t, const TrieChar *suffix) 205 | { 206 | TrieIndex new_block; 207 | 208 | new_block = tail_alloc_block (t); 209 | tail_set_suffix (t, new_block, suffix); 210 | 211 | return new_block; 212 | } 213 | 214 | static TrieIndex 215 | tail_alloc_block (Tail *t) 216 | { 217 | TrieIndex block; 218 | 219 | if (0 != t->first_free) { 220 | block = t->first_free; 221 | t->first_free = t->tails[block].next_free; 222 | } else { 223 | block = t->num_tails; 224 | t->tails = (TailBlock *) realloc (t->tails, 225 | ++t->num_tails * sizeof (TailBlock)); 226 | } 227 | t->tails[block].next_free = -1; 228 | t->tails[block].data = TRIE_DATA_ERROR; 229 | t->tails[block].suffix = NULL; 230 | 231 | return block + 
TAIL_START_BLOCKNO; 232 | } 233 | /* Release block number block: drop its suffix and data and link it into the free list, which is kept sorted by array index. Out-of-range numbers are ignored. */ 234 | static void 235 | tail_free_block (Tail *t, TrieIndex block) 236 | { 237 | TrieIndex i, j; 238 | 239 | block -= TAIL_START_BLOCKNO; 240 | 241 | if (block < 0 || block >= t->num_tails) /* TrieIndex is signed: reject numbers below the first block too */ 242 | return; 243 | 244 | t->tails[block].data = TRIE_DATA_ERROR; 245 | if (NULL != t->tails[block].suffix) { 246 | free (t->tails[block].suffix); 247 | t->tails[block].suffix = NULL; 248 | } 249 | 250 | /* find insertion point */ 251 | j = 0; 252 | for (i = t->first_free; i != 0 && i < block; i = t->tails[i].next_free) 253 | j = i; 254 | /* NOTE(review): 0 doubles as the "empty list" marker, so freeing array index 0 sets first_free back to 0 and the existing free list is no longer reachable - confirm this is acceptable */ 255 | /* insert free block between j and i */ 256 | t->tails[block].next_free = i; 257 | if (0 != j) 258 | t->tails[j].next_free = block; 259 | else 260 | t->first_free = block; 261 | } 262 | /* Return the data stored for block number index, or TRIE_DATA_ERROR when the number is out of range. */ 263 | TrieData 264 | tail_get_data (const Tail *t, TrieIndex index) 265 | { 266 | index -= TAIL_START_BLOCKNO; 267 | return (index >= 0 && index < t->num_tails) ? t->tails[index].data : TRIE_DATA_ERROR; 268 | } 269 | /* Store data for block number index; FALSE when the number is out of range. */ 270 | Bool 271 | tail_set_data (Tail *t, TrieIndex index, TrieData data) 272 | { 273 | index -= TAIL_START_BLOCKNO; 274 | if (index >= 0 && index < t->num_tails) { 275 | t->tails[index].data = data; 276 | return TRUE; 277 | } 278 | return FALSE; 279 | } 280 | 281 | void 282 | tail_delete (Tail *t, TrieIndex index) 283 | { 284 | tail_free_block (t, index); 285 | } 286 | 287 | int 288 | tail_walk_str (const Tail *t, 289 | TrieIndex s, 290 | short *suffix_idx, 291 | const TrieChar *str, 292 | int len) 293 | { 294 | const TrieChar *suffix; 295 | int i; 296 | short j; 297 | 298 | suffix = tail_get_suffix (t, s); 299 | if (!suffix) 300 | return FALSE; 301 | 302 | i = 0; j = *suffix_idx; 303 | while (i < len) { 304 | if (str[i] != suffix[j]) 305 | break; 306 | ++i; 307 | /* stop and stay at null-terminator */ 308 | if (0 == suffix[j]) 309 | break; 310 | ++j; 311 | } 312 | *suffix_idx = j; 313 | return i; 314 | } 315 | 316 | Bool 317 | tail_walk_char (const Tail *t, 318 | TrieIndex s, 319 | short *suffix_idx, 320 | TrieChar c) 321 | { 322 | const TrieChar 
*suffix; 323 | TrieChar suffix_char; 324 | 325 | suffix = tail_get_suffix (t, s); 326 | if (!suffix) 327 | return FALSE; 328 | 329 | suffix_char = suffix[*suffix_idx]; 330 | if (suffix_char == c) { 331 | if (0 != suffix_char) 332 | ++*suffix_idx; 333 | return TRUE; 334 | } 335 | return FALSE; 336 | } 337 | 338 | /* 339 | vi:ts=4:ai:expandtab 340 | */ 341 | -------------------------------------------------------------------------------- /ext/trie/tail.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * tail.h - trie tail for keeping suffixes 4 | * Created: 2006-08-12 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #ifndef __TAIL_H 9 | #define __TAIL_H 10 | 11 | #include "triedefs.h" 12 | 13 | /** 14 | * @file tail.h 15 | * @brief trie tail for keeping suffixes 16 | */ 17 | 18 | /** 19 | * @brief Tail structure type 20 | */ 21 | typedef struct _Tail Tail; 22 | 23 | /** 24 | * @brief Create a new tail object 25 | * 26 | * Create a new empty tail object. 27 | */ 28 | Tail * tail_new (); 29 | 30 | /** 31 | * @brief Read tail data from file 32 | * 33 | * @param file : the file to read 34 | * 35 | * @return a pointer to the opened tail data, NULL on failure 36 | * 37 | * Read tail data from the opened file, starting from the current 38 | * file pointer until the end of tail data block. On return, the 39 | * file pointer is left at the position after the read block. 40 | */ 41 | Tail * tail_read (FILE *file); 42 | 43 | /** 44 | * @brief Free tail data 45 | * 46 | * @param t : the tail data 47 | * 48 | * This function does not return a value. 49 | * 50 | * Free the given tail data. 
51 | */ 52 | void tail_free (Tail *t); 53 | 54 | /** 55 | * @brief Write tail data 56 | * 57 | * @param t : the tail data 58 | * @param file : the file to write to 59 | * 60 | * @return 0 on success, non-zero on failure 61 | * 62 | * Write tail data to the given @a file, starting from the current file 63 | * pointer. On return, the file pointer is left after the tail data block. 64 | */ 65 | int tail_write (const Tail *t, FILE *file); 66 | 67 | 68 | /** 69 | * @brief Get suffix 70 | * 71 | * @param t : the tail data 72 | * @param index : the index of the suffix 73 | * 74 | * @return a pointer to the indexed suffix, or NULL if @a index is out of range. 75 | * 76 | * Get suffix from tail with given @a index. The returned string is owned by 77 | * the tail; the caller must not free it. 78 | */ 79 | const TrieChar * tail_get_suffix (const Tail *t, TrieIndex index); 80 | 81 | /** 82 | * @brief Set suffix of existing entry 83 | * 84 | * @param t : the tail data 85 | * @param index : the index of the suffix 86 | * @param suffix : the new suffix 87 | * 88 | * Set suffix of existing entry of given @a index in tail. 89 | */ 90 | Bool tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix); 91 | 92 | /** 93 | * @brief Add a new suffix 94 | * 95 | * @param t : the tail data 96 | * @param suffix : the new suffix 97 | * 98 | * @return the index of the newly added suffix. 99 | * 100 | * Add a new suffix entry to tail. 101 | */ 102 | TrieIndex tail_add_suffix (Tail *t, const TrieChar *suffix); 103 | 104 | /** 105 | * @brief Get data associated to suffix entry 106 | * 107 | * @param t : the tail data 108 | * @param index : the index of the suffix 109 | * 110 | * @return the data associated to the suffix entry 111 | * 112 | * Get data associated to suffix entry @a index in tail data. 
113 | */ 114 | TrieData tail_get_data (const Tail *t, TrieIndex index); 115 | 116 | /** 117 | * @brief Set data associated to suffix entry 118 | * 119 | * @param t : the tail data 120 | * @param index : the index of the suffix 121 | * @param data : the data to set 122 | * 123 | * @return boolean indicating success 124 | * 125 | * Set data associated to suffix entry @a index in tail data. 126 | */ 127 | Bool tail_set_data (Tail *t, TrieIndex index, TrieData data); 128 | 129 | /** 130 | * @brief Delete suffix entry 131 | * 132 | * @param t : the tail data 133 | * @param index : the index of the suffix to delete 134 | * 135 | * Delete suffix entry from the tail data. 136 | */ 137 | void tail_delete (Tail *t, TrieIndex index); 138 | 139 | /** 140 | * @brief Walk in tail with a string 141 | * 142 | * @param t : the tail data 143 | * @param s : the tail data index 144 | * @param suffix_idx : pointer to current character index in suffix 145 | * @param str : the string to use in walking 146 | * @param len : total characters in @a str to walk 147 | * 148 | * @return total number of characters successfully walked 149 | * 150 | * Walk in the tail data @a t at entry @a s, from given character position 151 | * @a *suffix_idx, using @a len characters of given string @a str. On return, 152 | * @a *suffix_idx is updated to the position after the last successful walk, 153 | * and the function returns the total number of character succesfully walked. 
154 | */ 155 | int tail_walk_str (const Tail *t, 156 | TrieIndex s, 157 | short *suffix_idx, 158 | const TrieChar *str, 159 | int len); 160 | 161 | /** 162 | * @brief Walk in tail with a character 163 | * 164 | * @param t : the tail data 165 | * @param s : the tail data index 166 | * @param suffix_idx : pointer to current character index in suffix 167 | * @param c : the character to use in walking 168 | * 169 | * @return boolean indicating success 170 | * 171 | * Walk in the tail data @a t at entry @a s, from given character position 172 | * @a *suffix_idx, using given character @a c. If the walk is successful, 173 | * it returns TRUE, and @a *suffix_idx is updated to the next character. 174 | * Otherwise, it returns FALSE, and @a *suffix_idx is left unchanged. 175 | */ 176 | Bool tail_walk_char (const Tail *t, 177 | TrieIndex s, 178 | short *suffix_idx, 179 | TrieChar c); 180 | 181 | /** 182 | * @brief Test walkability in tail with a character 183 | * 184 | * @param t : the tail data 185 | * @param s : the tail data index 186 | * @param suffix_idx : current character index in suffix 187 | * @param c : the character to test walkability 188 | * 189 | * @return boolean indicating walkability 190 | * 191 | * Test if the character @a c can be used to walk from given character 192 | * position @a suffix_idx of entry @a s of the tail data @a t. 
193 | */ 194 | /* 195 | Bool tail_is_walkable_char (Tail *t, 196 | TrieIndex s, 197 | short suffix_idx, 198 | const TrieChar c); 199 | */ 200 | #define tail_is_walkable_char(t,s,suffix_idx,c) \ 201 | (tail_get_suffix ((t), (s)) [suffix_idx] == (c)) 202 | 203 | #endif /* __TAIL_H */ 204 | 205 | /* 206 | vi:ts=4:ai:expandtab 207 | */ 208 | -------------------------------------------------------------------------------- /ext/trie/trie-private.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "darray.h" 5 | #include "tail.h" 6 | #include "trie.h" 7 | 8 | Trie* trie_new() { 9 | Trie *trie = (Trie*) malloc(sizeof(Trie)); 10 | trie->da = da_new(); 11 | trie->tail = tail_new(); 12 | return trie; 13 | } 14 | 15 | void trie_free(Trie *trie) { 16 | da_free(trie->da); 17 | tail_free(trie->tail); 18 | free(trie); 19 | } 20 | 21 | static Bool trie_branch_in_branch (Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) { 22 | TrieIndex new_da, new_tail; 23 | 24 | new_da = da_insert_branch (trie->da, sep_node, *suffix); 25 | if (TRIE_INDEX_ERROR == new_da) 26 | return FALSE; 27 | 28 | if ('\0' != *suffix) 29 | ++suffix; 30 | 31 | new_tail = tail_add_suffix (trie->tail, suffix); 32 | tail_set_data (trie->tail, new_tail, data); 33 | trie_da_set_tail_index (trie->da, new_da, new_tail); 34 | 35 | // trie->is_dirty = TRUE; 36 | return TRUE; 37 | } 38 | 39 | static Bool trie_branch_in_tail(Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) { 40 | TrieIndex old_tail, old_da, s; 41 | const TrieChar *old_suffix, *p; 42 | 43 | /* adjust separate point in old path */ 44 | old_tail = trie_da_get_tail_index (trie->da, sep_node); 45 | old_suffix = tail_get_suffix (trie->tail, old_tail); 46 | if (!old_suffix) 47 | return FALSE; 48 | 49 | for (p = old_suffix, s = sep_node; *p == *suffix; p++, suffix++) { 50 | TrieIndex t = da_insert_branch (trie->da, s, *p); 51 | if 
(TRIE_INDEX_ERROR == t) 52 | goto fail; 53 | s = t; 54 | } 55 | 56 | old_da = da_insert_branch (trie->da, s, *p); 57 | if (TRIE_INDEX_ERROR == old_da) 58 | goto fail; 59 | 60 | if ('\0' != *p) 61 | ++p; 62 | tail_set_suffix (trie->tail, old_tail, p); 63 | trie_da_set_tail_index (trie->da, old_da, old_tail); 64 | 65 | /* insert the new branch at the new separate point */ 66 | return trie_branch_in_branch (trie, s, suffix, data); 67 | 68 | fail: 69 | /* failed, undo previous insertions and return error */ 70 | da_prune_upto (trie->da, sep_node, s); 71 | trie_da_set_tail_index (trie->da, sep_node, old_tail); 72 | return FALSE; 73 | } 74 | 75 | Bool trie_store (Trie *trie, const TrieChar *key, TrieData data) { 76 | TrieIndex s, t; 77 | short suffix_idx; 78 | const TrieChar *p, *sep; 79 | size_t len; 80 | 81 | /* walk through branches */ 82 | s = da_get_root (trie->da); 83 | for (p = key; !trie_da_is_separate (trie->da, s); p++) { 84 | if (!da_walk (trie->da, &s, *p)) 85 | return trie_branch_in_branch (trie, s, p, data); 86 | if (0 == *p) 87 | break; 88 | } 89 | 90 | /* walk through tail */ 91 | sep = p; 92 | t = trie_da_get_tail_index (trie->da, s); 93 | suffix_idx = 0; 94 | len = strlen ((const char *) p) + 1; /* including null-terminator */ 95 | if (tail_walk_str (trie->tail, t, &suffix_idx, p, len) != len) 96 | return trie_branch_in_tail (trie, s, p, data); 97 | 98 | /* duplicated key, overwrite val */ 99 | tail_set_data (trie->tail, t, data); 100 | // trie->is_dirty = TRUE; 101 | return TRUE; 102 | } 103 | 104 | 105 | Bool trie_has_key (const Trie *trie, const TrieChar *key) { 106 | TrieIndex s; 107 | short suffix_idx; 108 | const TrieChar *p; 109 | 110 | /* walk through branches */ 111 | s = da_get_root (trie->da); 112 | for (p = key; !trie_da_is_separate (trie->da, s); p++) { 113 | if (!da_walk (trie->da, &s, *p)) 114 | return FALSE; 115 | if (0 == *p) 116 | break; 117 | } 118 | 119 | /* walk through tail */ 120 | s = trie_da_get_tail_index (trie->da, s); 121 | 
suffix_idx = 0; 122 | for ( ; ; p++) { 123 | if (!tail_walk_char (trie->tail, s, &suffix_idx, *p)) 124 | return FALSE; 125 | if (0 == *p) 126 | break; 127 | } 128 | 129 | return TRUE; 130 | } 131 | 132 | 133 | Bool trie_retrieve (const Trie *trie, const TrieChar *key, TrieData *o_data) { 134 | TrieIndex s; 135 | short suffix_idx; 136 | const TrieChar *p; 137 | 138 | /* walk through branches */ 139 | s = da_get_root (trie->da); 140 | for (p = key; !trie_da_is_separate (trie->da, s); p++) { 141 | if (!da_walk (trie->da, &s, *p)) 142 | return FALSE; 143 | if (0 == *p) 144 | break; 145 | } 146 | 147 | /* walk through tail */ 148 | s = trie_da_get_tail_index (trie->da, s); 149 | suffix_idx = 0; 150 | for ( ; ; p++) { 151 | if (!tail_walk_char (trie->tail, s, &suffix_idx, *p)) 152 | return FALSE; 153 | if (0 == *p) 154 | break; 155 | } 156 | 157 | /* found, set the val and return */ 158 | if (o_data) 159 | *o_data = tail_get_data (trie->tail, s); 160 | return TRUE; 161 | } 162 | 163 | Bool trie_delete (Trie *trie, const TrieChar *key) { 164 | TrieIndex s, t; 165 | short suffix_idx; 166 | const TrieChar *p; 167 | 168 | /* walk through branches */ 169 | s = da_get_root (trie->da); 170 | for (p = key; !trie_da_is_separate (trie->da, s); p++) { 171 | if (!da_walk (trie->da, &s, *p)) 172 | return FALSE; 173 | if (0 == *p) 174 | break; 175 | } 176 | 177 | /* walk through tail */ 178 | t = trie_da_get_tail_index (trie->da, s); 179 | suffix_idx = 0; 180 | for ( ; ; p++) { 181 | if (!tail_walk_char (trie->tail, t, &suffix_idx, *p)) 182 | return FALSE; 183 | if (0 == *p) 184 | break; 185 | } 186 | 187 | tail_delete (trie->tail, t); 188 | da_set_base (trie->da, s, TRIE_INDEX_ERROR); 189 | da_prune (trie->da, s); 190 | 191 | //trie->is_dirty = TRUE; 192 | return TRUE; 193 | } 194 | 195 | /*-------------------------------* 196 | * STEPWISE QUERY OPERATIONS * 197 | *-------------------------------*/ 198 | 199 | TrieState * trie_root (const Trie *trie) { 200 | return trie_state_new 
(trie, da_get_root (trie->da), 0, FALSE); 201 | } 202 | 203 | /*----------------* 204 | * TRIE STATE * 205 | *----------------*/ 206 | 207 | static TrieState * trie_state_new (const Trie *trie, TrieIndex index, short suffix_idx, short is_suffix) { 208 | TrieState *s; 209 | 210 | s = (TrieState *) malloc (sizeof (TrieState)); 211 | if (!s) 212 | return NULL; 213 | 214 | s->trie = trie; 215 | s->index = index; 216 | s->suffix_idx = suffix_idx; 217 | s->is_suffix = is_suffix; 218 | 219 | return s; 220 | } 221 | 222 | TrieState * trie_state_clone (const TrieState *s) { 223 | return trie_state_new (s->trie, s->index, s->suffix_idx, s->is_suffix); 224 | } 225 | 226 | void trie_state_free (TrieState *s) { 227 | free (s); 228 | } 229 | 230 | void trie_state_rewind (TrieState *s) { 231 | s->index = da_get_root (s->trie->da); 232 | s->is_suffix = FALSE; 233 | } 234 | 235 | Bool trie_state_walk (TrieState *s, TrieChar c) { 236 | if (!s->is_suffix) { 237 | Bool ret; 238 | 239 | ret = da_walk (s->trie->da, &s->index, c); 240 | 241 | if (ret && trie_da_is_separate (s->trie->da, s->index)) { 242 | s->index = trie_da_get_tail_index (s->trie->da, s->index); 243 | s->suffix_idx = 0; 244 | s->is_suffix = TRUE; 245 | } 246 | 247 | return ret; 248 | } else { 249 | return tail_walk_char (s->trie->tail, s->index, &s->suffix_idx, c); 250 | } 251 | } 252 | 253 | Bool trie_state_is_walkable (const TrieState *s, TrieChar c) { 254 | if (!s->is_suffix) 255 | return da_is_walkable (s->trie->da, s->index, c); 256 | else 257 | return tail_is_walkable_char (s->trie->tail, s->index, s->suffix_idx, c); 258 | } 259 | 260 | Bool trie_state_is_leaf (const TrieState *s) { 261 | return s->is_suffix && trie_state_is_terminal (s); 262 | } 263 | 264 | TrieData trie_state_get_data (const TrieState *s) { 265 | return s->is_suffix ? 
tail_get_data (s->trie->tail, s->index) : TRIE_DATA_ERROR; 266 | } 267 | 268 | int main(void) { 269 | Bool res; 270 | TrieData *data = (TrieData*)malloc(sizeof(TrieData)); 271 | Trie *trie = trie_new(); 272 | 273 | 274 | trie_store(trie, (const TrieChar*)"hello", 1); 275 | trie_store(trie, (const TrieChar*)"he", 4); 276 | trie_store(trie, (const TrieChar*)"hel", 3); 277 | trie_store(trie, (const TrieChar*)"h", 5); 278 | trie_store(trie, (const TrieChar*)"hell", 2); 279 | 280 | 281 | res = trie_retrieve(trie, (const TrieChar*)"hello", data); 282 | printf(res ? "Win!\n" : "Fail!\n"); 283 | 284 | res = trie_retrieve(trie, (const TrieChar*)"hell", data); 285 | printf(res ? "Win!\n" : "Fail!\n"); 286 | 287 | res = trie_retrieve(trie, (const TrieChar*)"hel", data); 288 | printf(res ? "Win!\n" : "Fail!\n"); 289 | 290 | res = trie_retrieve(trie, (const TrieChar*)"he", data); 291 | printf(res ? "Win!\n" : "Fail!\n"); 292 | 293 | res = trie_retrieve(trie, (const TrieChar*)"h", data); 294 | printf(res ? 
"Win!\n" : "Fail!\n"); 295 | 296 | 297 | trie_free(trie); 298 | return 0; 299 | } 300 | -------------------------------------------------------------------------------- /ext/trie/trie-private.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * trie-private.h - Private utilities for trie implementation 4 | * Created: 2007-08-25 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #ifndef __TRIE_PRIVATE_H 9 | #define __TRIE_PRIVATE_H 10 | 11 | #include "typedefs.h" 12 | 13 | /** 14 | * @file trie-private.h 15 | * @brief Private utilities for trie implementation 16 | */ 17 | 18 | /** 19 | * @brief Minimum value macro 20 | */ 21 | #define MIN_VAL(a,b) ((a)<(b)?(a):(b)) 22 | /** 23 | * @brief Maximum value macro 24 | */ 25 | #define MAX_VAL(a,b) ((a)>(b)?(a):(b)) 26 | 27 | #endif /* __TRIE_PRIVATE_H */ 28 | 29 | /* 30 | vi:ts=4:ai:expandtab 31 | */ 32 | -------------------------------------------------------------------------------- /ext/trie/trie.c: -------------------------------------------------------------------------------- 1 | #include "ruby.h" 2 | #include "trie.h" 3 | #include 4 | #include 5 | #include 6 | 7 | VALUE cTrie, cTrieNode; 8 | 9 | /* 10 | * Document-class: Trie 11 | * 12 | * A key-value data structure for string keys which is efficient memory usage and fast retrieval time. 13 | * 14 | */ 15 | 16 | static VALUE rb_trie_alloc(VALUE klass) { 17 | VALUE obj; 18 | obj = Data_Wrap_Struct(klass, 0, trie_free, trie_new()); 19 | return obj; 20 | } 21 | 22 | void raise_ioerror(const char * message) { 23 | VALUE rb_eIOError = rb_const_get(rb_cObject, rb_intern("IOError")); 24 | rb_raise(rb_eIOError, "%s", message); 25 | } 26 | 27 | /* 28 | * call-seq: 29 | * read(filename_base) -> Trie 30 | * 31 | * Returns a new trie with data as read from disk. 
32 | */ 33 | static VALUE rb_trie_read(VALUE self, VALUE filename_base) { 34 | VALUE da_filename = rb_str_dup(filename_base); 35 | rb_str_concat(da_filename, rb_str_new2(".da")); 36 | StringValue(da_filename); 37 | 38 | VALUE tail_filename = rb_str_dup(filename_base); 39 | rb_str_concat(tail_filename, rb_str_new2(".tail")); 40 | StringValue(tail_filename); 41 | 42 | Trie *trie = trie_new(); 43 | 44 | VALUE obj; 45 | obj = Data_Wrap_Struct(self, 0, trie_free, trie); 46 | 47 | DArray *old_da = trie->da; 48 | Tail *old_tail = trie->tail; 49 | 50 | FILE *da_file = fopen(RSTRING_PTR(da_filename), "r"); 51 | if (da_file == NULL) 52 | raise_ioerror("Error reading .da file."); 53 | 54 | trie->da = da_read(da_file); 55 | fclose(da_file); 56 | 57 | FILE *tail_file = fopen(RSTRING_PTR(tail_filename), "r"); 58 | if (tail_file == NULL) 59 | raise_ioerror("Error reading .tail file."); 60 | 61 | trie->tail = tail_read(tail_file); 62 | fclose(tail_file); 63 | 64 | da_free(old_da); 65 | tail_free(old_tail); 66 | 67 | return obj; 68 | } 69 | 70 | /* 71 | * call-seq: 72 | * has_key?(key) -> true/false 73 | * 74 | * Determines whether or not a key exists in the Trie. Use this if you don't care about the value, as it 75 | * is marginally faster than Trie#get. 76 | * 77 | */ 78 | static VALUE rb_trie_has_key(VALUE self, VALUE key) { 79 | StringValue(key); 80 | 81 | Trie *trie; 82 | Data_Get_Struct(self, Trie, trie); 83 | 84 | if(trie_has_key(trie, (TrieChar*)RSTRING_PTR(key))) 85 | return Qtrue; 86 | else 87 | return Qnil; 88 | } 89 | 90 | /* 91 | * call-seq: 92 | * get(key) -> value 93 | * [key] -> value 94 | * 95 | * Retrieves the value for a particular key (or nil) from the Trie. 
96 | * 97 | */ 98 | static VALUE rb_trie_get(VALUE self, VALUE key) { 99 | StringValue(key); 100 | 101 | Trie *trie; 102 | Data_Get_Struct(self, Trie, trie); 103 | 104 | TrieData data; 105 | if(trie_retrieve(trie, (TrieChar*)RSTRING_PTR(key), &data)) 106 | return (VALUE)data; 107 | else 108 | return Qnil; 109 | } 110 | 111 | /* 112 | * call-seq: 113 | * add(key) 114 | * add(key,value) 115 | * 116 | * Add a key, or a key and value to the Trie. If you add a key without a value it assumes true for the value. 117 | * 118 | */ 119 | static VALUE rb_trie_add(VALUE self, VALUE args) { 120 | Trie *trie; 121 | Data_Get_Struct(self, Trie, trie); 122 | 123 | int size = RARRAY_LEN(args); 124 | if(size < 1 || size > 2) 125 | return Qnil; 126 | 127 | VALUE key; 128 | key = RARRAY_PTR(args)[0]; 129 | StringValue(key); 130 | 131 | TrieData value = size == 2 ? RARRAY_PTR(args)[1] : TRIE_DATA_ERROR; 132 | 133 | if(trie_store(trie, (TrieChar*)RSTRING_PTR(key), value)) 134 | return Qtrue; 135 | else 136 | return Qnil; 137 | } 138 | 139 | /* 140 | * call-seq: 141 | * delete(key) 142 | * 143 | * Delete a key from the Trie. Returns true if it deleted a key, nil otherwise. 
144 | * 145 | */ 146 | static VALUE rb_trie_delete(VALUE self, VALUE key) { 147 | StringValue(key); 148 | 149 | Trie *trie; 150 | Data_Get_Struct(self, Trie, trie); 151 | 152 | if(trie_delete(trie, (TrieChar*)RSTRING_PTR(key))) 153 | return Qtrue; 154 | else 155 | return Qnil; 156 | } 157 | 158 | static VALUE walk_all_paths(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) { 159 | int c; 160 | for(c = 1; c < 256; c++) { 161 | if(trie_state_is_walkable(state,c)) { 162 | TrieState *next_state = trie_state_clone(state); 163 | trie_state_walk(next_state, c); 164 | 165 | prefix[prefix_size] = c; 166 | prefix[prefix_size + 1] = 0; 167 | 168 | if(trie_state_is_terminal(next_state)) { 169 | char *word = (char*) malloc(prefix_size + 2); 170 | memcpy(word, prefix, prefix_size + 2); 171 | rb_ary_push(children, rb_str_new2(word)); 172 | } 173 | 174 | walk_all_paths(trie, children, next_state, prefix, prefix_size + 1); 175 | 176 | prefix[prefix_size] = 0; 177 | trie_state_free(next_state); 178 | } 179 | } 180 | } 181 | 182 | 183 | static Bool traverse(TrieState *state, TrieChar *char_prefix) { 184 | const TrieChar *iterator = char_prefix; 185 | while(*iterator != 0) { 186 | if(!trie_state_is_walkable(state, *iterator)) 187 | return FALSE; 188 | trie_state_walk(state, *iterator); 189 | iterator++; 190 | } 191 | return TRUE; 192 | } 193 | 194 | 195 | /* 196 | * call-seq: 197 | * children(prefix) -> [ key, ... ] 198 | * 199 | * Finds all keys in the Trie beginning with the given prefix. 
200 | * 201 | */ 202 | static VALUE rb_trie_children(VALUE self, VALUE prefix) { 203 | if(NIL_P(prefix)) 204 | return rb_ary_new(); 205 | 206 | StringValue(prefix); 207 | 208 | Trie *trie; 209 | Data_Get_Struct(self, Trie, trie); 210 | 211 | int prefix_size = RSTRING_LEN(prefix); 212 | TrieState *state = trie_root(trie); 213 | VALUE children = rb_ary_new(); 214 | TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix); 215 | 216 | if(!traverse(state, char_prefix)) { 217 | return children; 218 | } 219 | 220 | if(trie_state_is_terminal(state)) 221 | rb_ary_push(children, prefix); 222 | 223 | char prefix_buffer[1024]; 224 | memcpy(prefix_buffer, char_prefix, prefix_size); 225 | prefix_buffer[prefix_size] = 0; 226 | 227 | walk_all_paths(trie, children, state, prefix_buffer, prefix_size); 228 | 229 | trie_state_free(state); 230 | return children; 231 | } 232 | 233 | static Bool walk_all_paths_until_first_terminal(Trie *trie, TrieState *state, char *prefix, int prefix_size) { 234 | int c; 235 | Bool ret = FALSE; 236 | for(c = 1; c < 256; c++) { 237 | if(trie_state_is_walkable(state,c)) { 238 | TrieState *next_state = trie_state_clone(state); 239 | trie_state_walk(next_state, c); 240 | 241 | prefix[prefix_size] = c; 242 | prefix[prefix_size + 1] = 0; 243 | 244 | if(trie_state_is_terminal(next_state)) { 245 | return TRUE; 246 | } 247 | 248 | ret = walk_all_paths_until_first_terminal(trie, next_state, prefix, prefix_size + 1); 249 | 250 | prefix[prefix_size] = 0; 251 | trie_state_free(next_state); 252 | 253 | if (ret == TRUE) { 254 | return ret; 255 | } 256 | } 257 | } 258 | 259 | return ret; 260 | } 261 | 262 | static VALUE rb_trie_has_children(VALUE self, VALUE prefix) { 263 | if(NIL_P(prefix)) 264 | return rb_ary_new(); 265 | 266 | StringValue(prefix); 267 | 268 | Trie *trie; 269 | Data_Get_Struct(self, Trie, trie); 270 | 271 | int prefix_size = RSTRING_LEN(prefix); 272 | TrieState *state = trie_root(trie); 273 | TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix); 274 
| 275 | if(!traverse(state, char_prefix)) { 276 | return Qfalse; 277 | } 278 | 279 | if(trie_state_is_terminal(state)) 280 | return Qtrue; 281 | 282 | char prefix_buffer[1024]; 283 | memcpy(prefix_buffer, char_prefix, prefix_size); 284 | prefix_buffer[prefix_size] = 0; 285 | 286 | Bool ret = walk_all_paths_until_first_terminal(trie, state, prefix_buffer, prefix_size); 287 | 288 | trie_state_free(state); 289 | return ret == TRUE ? Qtrue : Qfalse; 290 | } 291 | 292 | static VALUE walk_all_paths_with_values(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) { 293 | int c; 294 | for(c = 1; c < 256; c++) { 295 | if(trie_state_is_walkable(state,c)) { 296 | TrieState *next_state = trie_state_clone(state); 297 | trie_state_walk(next_state, c); 298 | 299 | prefix[prefix_size] = c; 300 | prefix[prefix_size + 1] = 0; 301 | 302 | if(trie_state_is_terminal(next_state)) { 303 | TrieState *end_state = trie_state_clone(next_state); 304 | trie_state_walk(end_state, '\0'); 305 | 306 | char *word = (char*) malloc(prefix_size + 2); 307 | memcpy(word, prefix, prefix_size + 2); 308 | 309 | VALUE tuple = rb_ary_new(); 310 | rb_ary_push(tuple, rb_str_new2(word)); 311 | 312 | TrieData trie_data = trie_state_get_data(end_state); 313 | rb_ary_push(tuple, (VALUE)trie_data); 314 | rb_ary_push(children, tuple); 315 | 316 | trie_state_free(end_state); 317 | } 318 | 319 | walk_all_paths_with_values(trie, children, next_state, prefix, prefix_size + 1); 320 | 321 | prefix[prefix_size] = 0; 322 | trie_state_free(next_state); 323 | } 324 | } 325 | } 326 | 327 | /* 328 | * call-seq: 329 | * children_with_values(key) -> [ [key,value], ... ] 330 | * 331 | * Finds all keys with their respective values in the Trie beginning with the given prefix. 
332 | * 333 | */ 334 | static VALUE rb_trie_children_with_values(VALUE self, VALUE prefix) { 335 | if(NIL_P(prefix)) 336 | return rb_ary_new(); 337 | 338 | StringValue(prefix); 339 | 340 | Trie *trie; 341 | Data_Get_Struct(self, Trie, trie); 342 | 343 | int prefix_size = RSTRING_LEN(prefix); 344 | TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix); 345 | 346 | VALUE children = rb_ary_new(); 347 | 348 | TrieState *state = trie_root(trie); 349 | 350 | if(!traverse(state, char_prefix)) { 351 | return children; 352 | } 353 | 354 | if(trie_state_is_terminal(state)) { 355 | TrieState *end_state = trie_state_clone(state); 356 | trie_state_walk(end_state, '\0'); 357 | 358 | VALUE tuple = rb_ary_new(); 359 | rb_ary_push(tuple, prefix); 360 | TrieData trie_data = trie_state_get_data(end_state); 361 | rb_ary_push(tuple, (VALUE)trie_data); 362 | rb_ary_push(children, tuple); 363 | 364 | trie_state_free(end_state); 365 | } 366 | 367 | char prefix_buffer[1024]; 368 | memcpy(prefix_buffer, char_prefix, prefix_size); 369 | prefix_buffer[prefix_size] = 0; 370 | 371 | walk_all_paths_with_values(trie, children, state, prefix_buffer, prefix_size); 372 | 373 | trie_state_free(state); 374 | return children; 375 | } 376 | 377 | static VALUE rb_trie_node_alloc(VALUE klass); 378 | 379 | /* 380 | * call-seq: 381 | * root -> TrieNode 382 | * 383 | * Returns a TrieNode representing the root of the Trie. 384 | * 385 | */ 386 | static VALUE rb_trie_root(VALUE self) { 387 | Trie *trie; 388 | Data_Get_Struct(self, Trie, trie); 389 | 390 | VALUE trie_node = rb_trie_node_alloc(cTrieNode); 391 | 392 | TrieState *state = trie_root(trie); 393 | RDATA(trie_node)->data = state; 394 | 395 | rb_iv_set(trie_node, "@state", Qnil); 396 | rb_iv_set(trie_node, "@full_state", rb_str_new2("")); 397 | return trie_node; 398 | } 399 | 400 | 401 | /* 402 | * Document-class: TrieNode 403 | * 404 | * Represents a single node in the Trie. It can be used as a cursor to walk around the Trie. 
405 | * You can grab a TrieNode for the root of the Trie by using Trie#root. 406 | * 407 | */ 408 | 409 | static VALUE rb_trie_node_alloc(VALUE klass) { 410 | VALUE obj; 411 | obj = Data_Wrap_Struct(klass, 0, trie_state_free, NULL); 412 | return obj; 413 | } 414 | 415 | /* nodoc */ 416 | static VALUE rb_trie_node_initialize_copy(VALUE self, VALUE from) { 417 | RDATA(self)->data = trie_state_clone(RDATA(from)->data); 418 | 419 | VALUE state = rb_iv_get(from, "@state"); 420 | rb_iv_set(self, "@state", state == Qnil ? Qnil : rb_str_dup(state)); 421 | 422 | VALUE full_state = rb_iv_get(from, "@full_state"); 423 | rb_iv_set(self, "@full_state", full_state == Qnil ? Qnil : rb_str_dup(full_state)); 424 | 425 | return self; 426 | } 427 | 428 | /* 429 | * call-seq: 430 | * state -> single character 431 | * 432 | * Returns the letter that the TrieNode instance points to. So, if the node is pointing at the "e" in "monkeys", the state is "e". 433 | * 434 | */ 435 | static VALUE rb_trie_node_get_state(VALUE self) { 436 | return rb_iv_get(self, "@state"); 437 | } 438 | 439 | /* 440 | * call-seq: 441 | * full_state -> string 442 | * 443 | * Returns the full string from the root of the Trie up to this node. So if the node pointing at the "e" in "monkeys", 444 | * the full_state is "monke". 445 | * 446 | */ 447 | static VALUE rb_trie_node_get_full_state(VALUE self) { 448 | return rb_iv_get(self, "@full_state"); 449 | } 450 | 451 | /* 452 | * call-seq: 453 | * walk!(letter) -> TrieNode 454 | * 455 | * Tries to walk down a particular branch of the Trie. It modifies the node it is called on. 
456 | * 457 | */ 458 | static VALUE rb_trie_node_walk_bang(VALUE self, VALUE rchar) { 459 | StringValue(rchar); 460 | 461 | TrieState *state; 462 | Data_Get_Struct(self, TrieState, state); 463 | 464 | if(RSTRING_LEN(rchar) != 1) 465 | return Qnil; 466 | 467 | Bool result = trie_state_walk(state, *RSTRING_PTR(rchar)); 468 | 469 | if(result) { 470 | rb_iv_set(self, "@state", rchar); 471 | VALUE full_state = rb_iv_get(self, "@full_state"); 472 | rb_str_append(full_state, rchar); 473 | rb_iv_set(self, "@full_state", full_state); 474 | return self; 475 | } else 476 | return Qnil; 477 | } 478 | 479 | /* 480 | * call-seq: 481 | * walk(letter) -> TrieNode 482 | * 483 | * Tries to walk down a particular branch of the Trie. It clones the node it is called on and 484 | * walks with that one, leaving the original unchanged. 485 | * 486 | */ 487 | static VALUE rb_trie_node_walk(VALUE self, VALUE rchar) { 488 | StringValue(rchar); 489 | 490 | VALUE new_node = rb_funcall(self, rb_intern("dup"), 0); 491 | 492 | TrieState *state; 493 | Data_Get_Struct(new_node, TrieState, state); 494 | 495 | if(RSTRING_LEN(rchar) != 1) 496 | return Qnil; 497 | 498 | Bool result = trie_state_walk(state, *RSTRING_PTR(rchar)); 499 | 500 | if(result) { 501 | rb_iv_set(new_node, "@state", rchar); 502 | VALUE full_state = rb_iv_get(new_node, "@full_state"); 503 | rb_str_append(full_state, rchar); 504 | rb_iv_set(new_node, "@full_state", full_state); 505 | return new_node; 506 | } else 507 | return Qnil; 508 | } 509 | 510 | /* 511 | * call-seq: 512 | * value 513 | * 514 | * Attempts to get the value at this node of the Trie. This only works if the node is a terminal 515 | * (i.e. end of a key), otherwise it returns nil. 
516 | * 517 | */ 518 | static VALUE rb_trie_node_value(VALUE self) { 519 | TrieState *state; 520 | TrieState *dup; 521 | Data_Get_Struct(self, TrieState, state); 522 | 523 | dup = trie_state_clone(state); 524 | 525 | trie_state_walk(dup, 0); 526 | TrieData trie_data = trie_state_get_data(dup); 527 | trie_state_free(dup); 528 | 529 | return TRIE_DATA_ERROR == trie_data ? Qnil : (VALUE)trie_data; 530 | } 531 | 532 | /* 533 | * call-seq: 534 | * terminal? -> true/false 535 | * 536 | * Returns true if this node is at the end of a key. So if you have two keys in your Trie, "he" and 537 | * "hello", and you walk all the way to the end of "hello", the "e" and the "o" will return true for terminal?. 538 | * 539 | */ 540 | static VALUE rb_trie_node_terminal(VALUE self) { 541 | TrieState *state; 542 | Data_Get_Struct(self, TrieState, state); 543 | 544 | return trie_state_is_terminal(state) ? Qtrue : Qnil; 545 | } 546 | 547 | /* 548 | * call-seq: 549 | * leaf? -> true/false 550 | * 551 | * Returns true if there are no branches at this node. 552 | */ 553 | static VALUE rb_trie_node_leaf(VALUE self) { 554 | TrieState *state; 555 | Data_Get_Struct(self, TrieState, state); 556 | 557 | return trie_state_is_leaf(state) ? Qtrue : Qnil; 558 | } 559 | 560 | /* 561 | * call-seq: 562 | * save(filename_base) -> true 563 | * 564 | * Saves the trie data to two files, filename_base.da and filename_base.tail. 565 | * Returns true if saving was successful. 
566 | */ 567 | static VALUE rb_trie_save(VALUE self, VALUE filename_base) { 568 | VALUE da_filename = rb_str_dup(filename_base); 569 | rb_str_concat(da_filename, rb_str_new2(".da")); 570 | StringValue(da_filename); 571 | 572 | VALUE tail_filename = rb_str_dup(filename_base); 573 | rb_str_concat(tail_filename, rb_str_new2(".tail")); 574 | StringValue(tail_filename); 575 | 576 | Trie *trie; 577 | Data_Get_Struct(self, Trie, trie); 578 | 579 | FILE *da_file = fopen(RSTRING_PTR(da_filename), "w"); 580 | if (da_file == NULL) 581 | raise_ioerror("Error opening .da file for writing."); 582 | if (da_write(trie->da, da_file) != 0) 583 | raise_ioerror("Error writing DArray data."); 584 | fclose(da_file); 585 | 586 | FILE *tail_file = fopen(RSTRING_PTR(tail_filename), "w"); 587 | if (tail_file == NULL) 588 | raise_ioerror("Error opening .tail file for writing."); 589 | if (tail_write(trie->tail, tail_file) != 0) 590 | raise_ioerror("Error writing Tail data."); 591 | fclose(tail_file); 592 | 593 | return Qtrue; 594 | } 595 | 596 | 597 | void Init_trie() { 598 | cTrie = rb_define_class("Trie", rb_cObject); 599 | rb_define_alloc_func(cTrie, rb_trie_alloc); 600 | rb_define_module_function(cTrie, "read", rb_trie_read, 1); 601 | rb_define_method(cTrie, "has_key?", rb_trie_has_key, 1); 602 | rb_define_method(cTrie, "get", rb_trie_get, 1); 603 | rb_define_method(cTrie, "add", rb_trie_add, -2); 604 | rb_define_method(cTrie, "delete", rb_trie_delete, 1); 605 | rb_define_method(cTrie, "children", rb_trie_children, 1); 606 | rb_define_method(cTrie, "children_with_values", rb_trie_children_with_values, 1); 607 | rb_define_method(cTrie, "has_children?", rb_trie_has_children, 1); 608 | rb_define_method(cTrie, "root", rb_trie_root, 0); 609 | rb_define_method(cTrie, "save", rb_trie_save, 1); 610 | 611 | cTrieNode = rb_define_class("TrieNode", rb_cObject); 612 | rb_define_alloc_func(cTrieNode, rb_trie_node_alloc); 613 | rb_define_method(cTrieNode, "initialize_copy", 
rb_trie_node_initialize_copy, 1); 614 | rb_define_method(cTrieNode, "state", rb_trie_node_get_state, 0); 615 | rb_define_method(cTrieNode, "full_state", rb_trie_node_get_full_state, 0); 616 | rb_define_method(cTrieNode, "walk!", rb_trie_node_walk_bang, 1); 617 | rb_define_method(cTrieNode, "walk", rb_trie_node_walk, 1); 618 | rb_define_method(cTrieNode, "value", rb_trie_node_value, 0); 619 | rb_define_method(cTrieNode, "terminal?", rb_trie_node_terminal, 0); 620 | rb_define_method(cTrieNode, "leaf?", rb_trie_node_leaf, 0); 621 | } 622 | -------------------------------------------------------------------------------- /ext/trie/trie.h: -------------------------------------------------------------------------------- 1 | #include "darray.h" 2 | #include "tail.h" 3 | 4 | typedef struct _Trie { 5 | DArray *da; 6 | Tail *tail; 7 | } Trie; 8 | 9 | typedef struct _TrieState { 10 | const Trie *trie; /**< the corresponding trie */ 11 | TrieIndex index; /**< index in double-array/tail structures */ 12 | short suffix_idx; /**< suffix character offset, if in suffix */ 13 | short is_suffix; /**< whether it is currently in suffix part */ 14 | } TrieState; 15 | 16 | 17 | #define trie_da_is_separate(da,s) (da_get_base ((da), (s)) < 0) 18 | #define trie_da_get_tail_index(da,s) (-da_get_base ((da), (s))) 19 | #define trie_da_set_tail_index(da,s,v) (da_set_base ((da), (s), -(v))) 20 | #define trie_state_is_terminal(s) trie_state_is_walkable((s),TRIE_CHAR_TERM) 21 | 22 | 23 | Trie* trie_new(); 24 | void trie_free(Trie *trie); 25 | static Bool trie_branch_in_branch (Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data); 26 | static Bool trie_branch_in_tail(Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data); 27 | Bool trie_store (Trie *trie, const TrieChar *key, TrieData data); 28 | Bool trie_retrieve (const Trie *trie, const TrieChar *key, TrieData *o_data); 29 | Bool trie_delete (Trie *trie, const TrieChar *key); 30 | TrieState * trie_root (const 
Trie *trie); 31 | static TrieState * trie_state_new (const Trie *trie, TrieIndex index, short suffix_idx, short is_suffix); 32 | TrieState * trie_state_clone (const TrieState *s); 33 | void trie_state_free (TrieState *s); 34 | void trie_state_rewind (TrieState *s); 35 | Bool trie_state_walk (TrieState *s, TrieChar c); 36 | Bool trie_state_is_walkable (const TrieState *s, TrieChar c); 37 | Bool trie_state_is_leaf (const TrieState *s); 38 | TrieData trie_state_get_data (const TrieState *s); 39 | 40 | 41 | -------------------------------------------------------------------------------- /ext/trie/triedefs.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * triedefs.h - General typedefs for trie 4 | * Created: 2006-08-11 5 | * Author: Theppitak Karoonboonyanan 6 | */ 7 | 8 | #ifndef __TRIEDEFS_H 9 | #define __TRIEDEFS_H 10 | 11 | #include "typedefs.h" 12 | 13 | /** 14 | * @file triedefs.h 15 | * @brief General typedefs for trie 16 | */ 17 | 18 | /** 19 | * @brief Trie IO modes 20 | */ 21 | typedef enum { 22 | TRIE_IO_READ = 0x01, 23 | TRIE_IO_WRITE = 0x02, 24 | TRIE_IO_CREATE = 0x04 25 | } TrieIOMode; 26 | 27 | /** 28 | * @brief Trie character type for alphabet 29 | */ 30 | typedef uint32 AlphaChar; 31 | 32 | /** 33 | * @brief Error value for alphabet character 34 | */ 35 | #define ALPHA_CHAR_ERROR (~(AlphaChar)0) 36 | 37 | /** 38 | * @brief Trie character type for key 39 | */ 40 | typedef unsigned char TrieChar; 41 | /** 42 | * @brief Trie terminator character 43 | */ 44 | #define TRIE_CHAR_TERM '\0' 45 | #define TRIE_CHAR_MAX 255 46 | 47 | /** 48 | * @brief Type of Trie index 49 | */ 50 | typedef int32 TrieIndex; 51 | /** 52 | * @brief Trie error index 53 | */ 54 | #define TRIE_INDEX_ERROR 0 55 | /** 56 | * @brief Maximum trie index value 57 | */ 58 | #define TRIE_INDEX_MAX 0x7fffffff 59 | 60 | /** 61 | * @brief Type of value associated to 
trie entries 62 | */ 63 | typedef unsigned long TrieData; 64 | /** 65 | * @brief Trie error data 66 | */ 67 | #define TRIE_DATA_ERROR -1 68 | 69 | #endif /* __TRIEDEFS_H */ 70 | 71 | /* 72 | vi:ts=4:ai:expandtab 73 | */ 74 | -------------------------------------------------------------------------------- /ext/trie/typedefs.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 2 | /* 3 | * typedefs.h - general types 4 | * Created : 11 Aug 2006 5 | * Author : Theppitak Karoonboonyanan 6 | */ 7 | 8 | #ifndef __TYPEDEFS_H 9 | #define __TYPEDEFS_H 10 | 11 | #include 12 | 13 | // fix for fast_trie on Windows. Should be easy to merge with future changes to libdatrie. MH 14 | #include 15 | #define Bool bool 16 | #define FALSE false 17 | #define TRUE true 18 | 19 | # if UCHAR_MAX == 0xff 20 | # ifndef UINT8_TYPEDEF 21 | # define UINT8_TYPEDEF 22 | typedef unsigned char uint8; 23 | # endif /* UINT8_TYPEDEF */ 24 | # endif /* UCHAR_MAX */ 25 | 26 | # if SCHAR_MAX == 0x7f 27 | # ifndef INT8_TYPEDEF 28 | # define INT8_TYPEDEF 29 | typedef signed char int8; 30 | # endif /* INT8_TYPEDEF */ 31 | # endif /* SCHAR_MAX */ 32 | 33 | # if UINT_MAX == 0xffff 34 | # ifndef UINT16_TYPEDEF 35 | # define UINT16_TYPEDEF 36 | typedef unsigned int uint16; 37 | # endif /* UINT16_TYPEDEF */ 38 | # endif /* UINT_MAX */ 39 | 40 | # if INT_MAX == 0x7fff 41 | # ifndef INT16_TYPEDEF 42 | # define INT16_TYPEDEF 43 | typedef int int16; 44 | # endif /* INT16_TYPEDEF */ 45 | # endif /* INT_MAX */ 46 | 47 | # if USHRT_MAX == 0xffff 48 | # ifndef UINT16_TYPEDEF 49 | # define UINT16_TYPEDEF 50 | typedef unsigned short uint16; 51 | # endif /* UINT16_TYPEDEF */ 52 | # endif /* USHRT_MAX */ 53 | 54 | # if SHRT_MAX == 0x7fff 55 | # ifndef INT16_TYPEDEF 56 | # define INT16_TYPEDEF 57 | typedef short int16; 58 | # endif /* INT16_TYPEDEF */ 59 | # endif /* SHRT_MAX */ 60 | 61 | # if UINT_MAX == 
0xffffffff 62 | # ifndef UINT32_TYPEDEF 63 | # define UINT32_TYPEDEF 64 | typedef unsigned int uint32; 65 | # endif /* UINT32_TYPEDEF */ 66 | # endif /* UINT_MAX */ 67 | 68 | # if INT_MAX == 0x7fffffff 69 | # ifndef INT32_TYPEDEF 70 | # define INT32_TYPEDEF 71 | typedef int int32; 72 | # endif /* INT32_TYPEDEF */ 73 | # endif /* INT_MAX */ 74 | 75 | # if ULONG_MAX == 0xffffffff 76 | # ifndef UINT32_TYPEDEF 77 | # define UINT32_TYPEDEF 78 | typedef unsigned long uint32; 79 | # endif /* UINT32_TYPEDEF */ 80 | # endif /* ULONG_MAX */ 81 | 82 | # if LONG_MAX == 0x7fffffff 83 | # ifndef INT32_TYPEDEF 84 | # define INT32_TYPEDEF 85 | typedef long int32; 86 | # endif /* INT32_TYPEDEF */ 87 | # endif /* LONG_MAX */ 88 | 89 | # ifndef UINT8_TYPEDEF 90 | # error "uint8 type is undefined!" 91 | # endif 92 | # ifndef INT8_TYPEDEF 93 | # error "int8 type is undefined!" 94 | # endif 95 | # ifndef UINT16_TYPEDEF 96 | # error "uint16 type is undefined!" 97 | # endif 98 | # ifndef INT16_TYPEDEF 99 | # error "int16 type is undefined!" 100 | # endif 101 | # ifndef UINT32_TYPEDEF 102 | # error "uint32 type is undefined!" 103 | # endif 104 | # ifndef INT32_TYPEDEF 105 | # error "int32 type is undefined!" 106 | # endif 107 | 108 | typedef uint8 byte; 109 | typedef uint16 word; 110 | typedef uint32 dword; 111 | 112 | 113 | #endif /* __TYPEDEFS_H */ 114 | 115 | /* 116 | vi:ts=4:ai:expandtab 117 | */ 118 | -------------------------------------------------------------------------------- /fast_trie.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | # stub: fast_trie 0.5.1 ruby ext 6 | # stub: ext/trie/extconf.rb 7 | 8 | Gem::Specification.new do |s| 9 | s.name = "fast_trie" 10 | s.version = "0.5.1" 11 | 12 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? 
:required_rubygems_version= 13 | s.require_paths = ["ext"] 14 | s.authors = ["Tyler McMullen", "Matt Hickford"] 15 | s.date = "2015-07-27" 16 | s.description = "Ruby Trie based on libdatrie." 17 | s.email = "tyler@scribd.com" 18 | s.extensions = ["ext/trie/extconf.rb"] 19 | s.extra_rdoc_files = [ 20 | "LICENSE", 21 | "README.textile" 22 | ] 23 | s.files = [ 24 | "Gemfile.lock", 25 | "README.textile", 26 | "VERSION.yml", 27 | "ext/trie/darray.c", 28 | "ext/trie/darray.h", 29 | "ext/trie/extconf.rb", 30 | "ext/trie/fileutils.c", 31 | "ext/trie/fileutils.h", 32 | "ext/trie/tail.c", 33 | "ext/trie/tail.h", 34 | "ext/trie/trie-private.c", 35 | "ext/trie/trie-private.h", 36 | "ext/trie/trie.c", 37 | "ext/trie/trie.h", 38 | "ext/trie/triedefs.h", 39 | "ext/trie/typedefs.h", 40 | "fast_trie.gemspec", 41 | "spec/trie_spec.rb" 42 | ] 43 | s.homepage = "http://github.com/tyler/trie" 44 | s.rdoc_options = ["--title", "Trie", "--line-numbers", "--op", "rdoc", "--main", "ext/trie/trie.c", "README"] 45 | s.rubygems_version = "2.4.5" 46 | s.summary = "Ruby Trie based on libdatrie." 47 | 48 | if s.respond_to? 
:specification_version then 49 | s.specification_version = 4 50 | 51 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 52 | s.add_development_dependency(%q, [">= 0"]) 53 | s.add_development_dependency(%q, [">= 0"]) 54 | s.add_development_dependency(%q, ["~> 3.12"]) 55 | s.add_development_dependency(%q, ["~> 1.0"]) 56 | s.add_development_dependency(%q, ["~> 2.0.1"]) 57 | s.add_development_dependency(%q, [">= 0"]) 58 | else 59 | s.add_dependency(%q, [">= 0"]) 60 | s.add_dependency(%q, [">= 0"]) 61 | s.add_dependency(%q, ["~> 3.12"]) 62 | s.add_dependency(%q, ["~> 1.0"]) 63 | s.add_dependency(%q, ["~> 2.0.1"]) 64 | s.add_dependency(%q, [">= 0"]) 65 | end 66 | else 67 | s.add_dependency(%q, [">= 0"]) 68 | s.add_dependency(%q, [">= 0"]) 69 | s.add_dependency(%q, ["~> 3.12"]) 70 | s.add_dependency(%q, ["~> 1.0"]) 71 | s.add_dependency(%q, ["~> 2.0.1"]) 72 | s.add_dependency(%q, [">= 0"]) 73 | end 74 | end 75 | 76 | -------------------------------------------------------------------------------- /spec/trie_spec.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/../lib/trie' 2 | 3 | describe Trie do 4 | before :each do 5 | @trie = Trie.new; 6 | @trie.add('rocket') 7 | @trie.add('rock') 8 | @trie.add('frederico') 9 | end 10 | 11 | describe :has_key? 
do 12 | it 'returns true for words in the trie' do 13 | @trie.has_key?('rocket').should be_true 14 | end 15 | 16 | it 'returns nil for words that are not in the trie' do 17 | @trie.has_key?('not_in_the_trie').should be_nil 18 | end 19 | end 20 | 21 | describe :get do 22 | it 'returns -1 for words in the trie without a weight' do 23 | @trie.get('rocket').should == -1 24 | end 25 | 26 | it 'returns nil if the word is not in the trie' do 27 | @trie.get('not_in_the_trie').should be_nil 28 | end 29 | end 30 | 31 | describe :add do 32 | it 'adds a word to the trie' do 33 | @trie.add('forsooth').should == true 34 | @trie.get('forsooth').should == -1 35 | end 36 | 37 | it 'adds a word with a weight to the trie' do 38 | @trie.add('chicka',123).should == true 39 | @trie.get('chicka').should == 123 40 | end 41 | 42 | it 'adds values greater than 16-bit allows' do 43 | @trie.add('chicka', 72_000).should == true 44 | @trie.get('chicka').should == 72_000 45 | end 46 | 47 | it 'adds a word with a non-numeric value to the trie' do 48 | @trie.add('doot', 'Heeey').should == true 49 | @trie.get('doot').should == 'Heeey' 50 | end 51 | end 52 | 53 | describe :delete do 54 | it 'deletes a word from the trie' do 55 | @trie.delete('rocket').should == true 56 | @trie.has_key?('rocket').should be_nil 57 | end 58 | end 59 | 60 | describe :children do 61 | it 'returns all words beginning with a given prefix' do 62 | children = @trie.children('roc') 63 | children.size.should == 2 64 | children.should include('rock') 65 | children.should include('rocket') 66 | end 67 | 68 | it 'returns blank array if prefix does not exist' do 69 | @trie.children('ajsodij').should == [] 70 | end 71 | 72 | it 'includes the prefix if the prefix is a word' do 73 | children = @trie.children('rock') 74 | children.size.should == 2 75 | children.should include('rock') 76 | children.should include('rocket') 77 | end 78 | 79 | it 'returns blank array if prefix is nil' do 80 | @trie.children(nil).should == [] 81 | end 82 
| end 83 | 84 | describe :children_with_values do 85 | before :each do 86 | @trie.add('abc',2) 87 | @trie.add('abcd',4) 88 | end 89 | 90 | it 'returns all words with values beginning with a given prefix' do 91 | children = @trie.children_with_values('ab') 92 | children.size.should == 2 93 | children.should include(['abc',2]) 94 | children.should include(['abcd',4]) 95 | end 96 | 97 | it 'returns nil if prefix does not exist' do 98 | @trie.children_with_values('ajsodij').should == [] 99 | end 100 | 101 | it 'includes the prefix if the prefix is a word' do 102 | children = @trie.children_with_values('abc') 103 | children.size.should == 2 104 | children.should include(['abc',2]) 105 | children.should include(['abcd',4]) 106 | end 107 | 108 | it 'returns blank array if prefix is nil' do 109 | @trie.children_with_values(nil).should == [] 110 | end 111 | end 112 | 113 | #describe :walk_to_terminal do 114 | # it 'returns the first word found along a path' do 115 | # @trie.add 'anderson' 116 | # @trie.add 'andreas' 117 | # @trie.add 'and' 118 | 119 | # @trie.walk_to_terminal('anderson').should == 'and' 120 | # end 121 | 122 | # it 'returns the first word and value along a path' do 123 | # @trie.add 'anderson' 124 | # @trie.add 'andreas' 125 | # @trie.add 'and', 15 126 | 127 | # @trie.walk_to_terminal('anderson',true).should == ['and', 15] 128 | # end 129 | #end 130 | 131 | describe :root do 132 | it 'returns a TrieNode' do 133 | @trie.root.should be_an_instance_of(TrieNode) 134 | end 135 | 136 | it 'returns a different TrieNode each time' do 137 | @trie.root.should_not == @trie.root 138 | end 139 | end 140 | 141 | describe 'save/read' do 142 | let(:filename_base) do 143 | dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'tmp')) 144 | FileUtils.mkdir_p(dir) 145 | File.join(dir, 'trie') 146 | end 147 | 148 | context 'when I save the populated trie to disk' do 149 | before(:each) do 150 | @trie.add('omgwtflolbbq', 123) 151 | @trie.save(filename_base) 152 | end 
153 | 154 | it 'should contain the same data when reading from disk' do 155 | trie2 = Trie.read(filename_base) 156 | trie2.get('omgwtflolbbq').should == 123 157 | end 158 | end 159 | end 160 | 161 | describe :read do 162 | context 'when the files to read from do not exist' do 163 | let(:filename_base) do 164 | "phantasy/file/path/that/does/not/exist" 165 | end 166 | 167 | it 'should raise an error when attempting a read' do 168 | lambda { Trie.read(filename_base) }.should raise_error(IOError) 169 | end 170 | end 171 | end 172 | 173 | describe :has_children? do 174 | it 'returns true when there are children matching prefix' do 175 | @trie.has_children?('r').should be_true 176 | 177 | @trie.has_children?('rock').should be_true 178 | @trie.has_children?('rocket').should be_true 179 | end 180 | 181 | it 'returns false when there are no children matching prefix' do 182 | @trie.has_children?('no').should be_false 183 | @trie.has_children?('rome').should be_false 184 | @trie.has_children?('roc_').should be_false 185 | end 186 | end 187 | end 188 | 189 | describe TrieNode do 190 | before :each do 191 | @trie = Trie.new; 192 | @trie.add('rocket',1) 193 | @trie.add('rock',2) 194 | @trie.add('frederico',3) 195 | @node = @trie.root 196 | end 197 | 198 | describe :state do 199 | it 'returns the most recent state character' do 200 | @node.walk!('r') 201 | @node.state.should == 'r' 202 | @node.walk!('o') 203 | @node.state.should == 'o' 204 | end 205 | 206 | it 'is nil when no walk has occurred' do 207 | @node.state.should == nil 208 | end 209 | end 210 | 211 | describe :full_state do 212 | it 'returns the current string' do 213 | @node.walk!('r').walk!('o').walk!('c') 214 | @node.full_state.should == 'roc' 215 | end 216 | 217 | it 'is a blank string when no walk has occurred' do 218 | @node.full_state.should == '' 219 | end 220 | end 221 | 222 | describe :walk! 
do 223 | it 'returns the updated object when the walk succeeds' do 224 | other = @node.walk!('r') 225 | other.should == @node 226 | end 227 | 228 | it 'returns nil when the walk fails' do 229 | @node.walk!('q').should be_nil 230 | end 231 | end 232 | 233 | describe :walk do 234 | it 'returns a new node object when the walk succeeds' do 235 | other = @node.walk('r') 236 | other.should_not == @node 237 | end 238 | 239 | it 'returns nil when the walk fails' do 240 | @node.walk('q').should be_nil 241 | end 242 | end 243 | 244 | 245 | describe :value do 246 | it 'returns nil when the node is not terminal' do 247 | @node.walk!('r') 248 | @node.value.should be_nil 249 | end 250 | 251 | it 'returns a value when the node is terminal' do 252 | @node.walk!('r').walk!('o').walk!('c').walk!('k') 253 | @node.value.should == 2 254 | end 255 | end 256 | 257 | describe :terminal? do 258 | it 'returns true when the node is a word end' do 259 | @node.walk!('r').walk!('o').walk!('c').walk!('k') 260 | @node.should be_terminal 261 | end 262 | 263 | it 'returns nil when the node is not a word end' do 264 | @node.walk!('r').walk!('o').walk!('c') 265 | @node.should_not be_terminal 266 | end 267 | end 268 | 269 | describe :leaf? 
do 270 | it 'returns true when this is the end of a branch of the trie' do 271 | @node.walk!('r').walk!('o').walk!('c').walk!('k').walk!('e').walk!('t') 272 | @node.should be_leaf 273 | end 274 | 275 | it 'returns nil when there are more splits on this branch' do 276 | @node.walk!('r').walk!('o').walk!('c').walk!('k') 277 | @node.should_not be_leaf 278 | end 279 | end 280 | 281 | describe :clone do 282 | it 'creates a new instance of this node which is not this node' do 283 | new_node = @node.clone 284 | new_node.should_not == @node 285 | end 286 | 287 | it 'matches the state of the current node' do 288 | new_node = @node.clone 289 | new_node.state.should == @node.state 290 | end 291 | 292 | it 'matches the full_state of the current node' do 293 | new_node = @node.clone 294 | new_node.full_state.should == @node.full_state 295 | end 296 | end 297 | end 298 | --------------------------------------------------------------------------------