├── .gitignore
├── .travis.yml
├── Gemfile
├── Gemfile.lock
├── LICENSE
├── README.textile
├── Rakefile
├── VERSION.yml
├── ext
    └── trie
    │   ├── darray.c
    │   ├── darray.h
    │   ├── extconf.rb
    │   ├── fileutils.c
    │   ├── fileutils.h
    │   ├── tail.c
    │   ├── tail.h
    │   ├── trie-private.c
    │   ├── trie-private.h
    │   ├── trie.c
    │   ├── trie.h
    │   ├── triedefs.h
    │   └── typedefs.h
├── fast_trie.gemspec
└── spec
    └── trie_spec.rb


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.sw?
 2 | *.o
 3 | *.bundle
 4 | *.dylib
 5 | .DS_Store
 6 | coverage
 7 | *~
 8 | #*
 9 | *.gem
10 | rdoc
11 | Makefile
12 | *.stackdump
13 | *.def
14 | *.so
15 | tmp/


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 | - 2.1.0
4 | - 2.0.0
5 | - 1.9.3
6 | - 1.8.7
7 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
 1 | source 'https://rubygems.org'
 2 | 
 3 | group :development do
 4 |   gem 'rake'
 5 |   gem 'rspec'
 6 |   gem 'rdoc', '~> 3.12'
 7 |   gem 'bundler', '~> 1.0'
 8 |   gem 'jeweler', '~> 2.0.1'
 9 |   gem 'rake-compiler'
10 | end
11 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
 1 | GEM
 2 |   remote: https://rubygems.org/
 3 |   specs:
 4 |     addressable (2.3.5)
 5 |     builder (3.2.2)
 6 |     descendants_tracker (0.0.3)
 7 |     diff-lcs (1.2.5)
 8 |     faraday (0.9.0)
 9 |       multipart-post (>= 1.2, < 3)
10 |     git (1.2.6)
11 |     github_api (0.11.2)
12 |       addressable (~> 2.3)
13 |       descendants_tracker (~> 0.0.1)
14 |       faraday (~> 0.8, < 0.10)
15 |       hashie (>= 1.2)
16 |       multi_json (>= 1.7.5, < 2.0)
17 |       nokogiri (~> 1.6.0)
18 |       oauth2
19 |     hashie (2.0.5)
20 |     highline (1.6.20)
21 |     jeweler (2.0.1)
22 |       builder
23 |       bundler (>= 1.0)
24 |       git (>= 1.2.5)
25 |       github_api
26 |       highline (>= 1.6.15)
27 |       nokogiri (>= 1.5.10)
28 |       rake
29 |       rdoc
30 |     json (1.8.1)
31 |     jwt (0.1.11)
32 |       multi_json (>= 1.5)
33 |     mini_portile (0.5.2)
34 |     multi_json (1.8.4)
35 |     multi_xml (0.5.5)
36 |     multipart-post (2.0.0)
37 |     nokogiri (1.6.1-x86-mingw32)
38 |       mini_portile (~> 0.5.0)
39 |     oauth2 (0.9.3)
40 |       faraday (>= 0.8, < 0.10)
41 |       jwt (~> 0.1.8)
42 |       multi_json (~> 1.3)
43 |       multi_xml (~> 0.5)
44 |       rack (~> 1.2)
45 |     rack (1.5.2)
46 |     rake (10.1.1)
47 |     rake-compiler (0.9.2)
48 |       rake
49 |     rdoc (3.12.2)
50 |       json (~> 1.4)
51 |     rspec (2.14.1)
52 |       rspec-core (~> 2.14.0)
53 |       rspec-expectations (~> 2.14.0)
54 |       rspec-mocks (~> 2.14.0)
55 |     rspec-core (2.14.7)
56 |     rspec-expectations (2.14.5)
57 |       diff-lcs (>= 1.1.3, < 2.0)
58 |     rspec-mocks (2.14.5)
59 | 
60 | PLATFORMS
61 |   x86-mingw32
62 | 
63 | DEPENDENCIES
64 |   bundler (~> 1.0)
65 |   jeweler (~> 2.0.1)
66 |   rake
67 |   rake-compiler
68 |   rdoc (~> 3.12)
69 |   rspec
70 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2008 Tyler McMullen
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.textile:
--------------------------------------------------------------------------------
  1 | h1. Trie
  2 | 
  3 | !https://badge.fury.io/rb/fast_trie.svg!:https://rubygems.org/gems/fast_trie !https://travis-ci.org/tyler/trie.svg!:https://travis-ci.org/tyler/trie
  4 | 
  5 | This is a trie for Ruby using "libdatrie":http://linux.thai.net/~thep/datrie/. It uses a dual-array system, meaning it has best-in-class memory usage and search time.
  6 | 
  7 | 
  8 | h2. What is a trie?
  9 | 
 10 | I suck at explaining things.  Wikipedia doesn't.  http://wikipedia.org/wiki/Trie.
 11 | 
 12 | But in short a trie is a data structure that holds strings in a tree.  So if you inserted the words 'arc', 'ark', and 'ape' in a trie you could visualize it thusly:
 13 | 
 14 | <pre>
 15 |       p - e
 16 |     /
 17 |   a - r - c
 18 |         \
 19 |           k
 20 | </pre>
 21 | 
 22 | It's easy to see how this can have pretty neat implications for things like searching through lists of strings, sorting lists of strings, and things like spelling correction and autocompletion.
 23 | 
 24 | h2. Installation
 25 | 
 26 | From RubyGems https://rubygems.org/gems/fast_trie
 27 | 
 28 | <pre><code>
 29 |   gem install fast_trie
 30 | </code></pre>
 31 | 
 32 | h2. Tutorial
 33 | 
 34 | Let's go through building a simple autocompleter using "Trie":http://rubydoc.info/gems/fast_trie/Trie object.
 35 | 
 36 | <pre><code>
 37 |   require 'trie'
 38 |   trie = Trie.new
 39 | </code></pre>
 40 | 
 41 | Anyway.  So we've created our blank trie.  Now, since we're creating an autocompleter, we'll need to add some words into it.  We do that simply with the add method.
 42 | 
 43 | <pre><code>
 44 |   words.each do |word|
 45 |     trie.add word
 46 |   end
 47 | </code></pre>
 48 | 
 49 | Or if you have some integer data to store along with the words, such as weights or scores of some kind, you'd do it like so...
 50 | 
 51 | <pre><code>
 52 |   words_and_weights.each do |word,weight|
 53 |     trie.add word, weight
 54 |   end
 55 | </code></pre>
 56 | 
 57 | Great, so we've populated our trie with some words. Let's make sure those words are really there.
 58 | 
 59 | <pre><code>
 60 |   trie.has_key?('widget')  #=> true
 61 | 
 62 |   trie.get('widget')  #=> -1 or your value
 63 | 
 64 |   trie.get('not-in-the-trie')  #=> nil
 65 | </code></pre>
 66 | 
 67 | If you didn't enter a value to go along with the word, calling <code>get</code> with it will return -1.
 68 | 
 69 | Okay great, we have our populated trie, we've confirmed that the keys are in there.  Let's make an autocompleter!  For this we'll need to use the <code>children</code> method.  We'll do this as a simple Rails action, with the assumption you've initialized the trie into <code>TRIE</code>.
 70 | 
 71 | <pre><code>
 72 |   def autocomplete
 73 |     children = TRIE.children(params[:prefix])
 74 | 
 75 |     respond_to do |format|
 76 |       format.js { render(:string => JSON.dump(children)) }
 77 |       format.yaml { render(:string => YAML.dump(children)) }
 78 |     end
 79 |   end
 80 | </code></pre>
 81 | 
 82 | Yep, that's it.
 83 | 
 84 | There are, of course, some more interesting and advanced ways to use a trie.  For instance, this snippet takes a string, then walks down the trie, noting each word it finds along the way.
 85 | 
 86 | <pre><code>
 87 |   word = 'forestry'
 88 |   node = trie.root
 89 | 
 90 |   word.split('').each do |char|
 91 |     break unless node.walk!(char)
 92 |     if node.terminal?
 93 |       puts "Found me a word: #{node.full_state}"
 94 |     end
 95 |   end
 96 | </code></pre>
 97 | 
 98 | By calling <code>root</code> on a Trie, you get a "TrieNode":http://rubydoc.info/gems/fast_trie/TrieNode, pointed at the root of the trie.  You can then use this node to walk the trie and perceive things about each word.
 99 | 
100 | You can read the reference documentation at http://rubydoc.info/gems/fast_trie/frames/Trie
101 | 
102 | h2. Performance Characteristics
103 | 
104 | Here are some quick benchmarks on my 2.4ghz Intel Core 2 Duo MacBook Pro:
105 | 
106 | For keys that are 5 characters long:
107 | 31,344 adds/second
108 | 1,827,408 searches/second
109 | 38,453 prefix searches/second
110 | 
111 | For keys that are 10 characters long:
112 | 30,653 adds/second
113 | 1,802,649 searches/second
114 | 13,553 prefix searches/second
115 | 
116 | For keys that are 20 characters long:
117 | 30,488 adds/second
118 | 1,851,461 searches/second
119 | 5,855 prefix searches/second
120 | 
121 | For keys that are 40 characters long:
122 | 30,710 adds/second
123 | 1,838,380 searches/second
124 | 2,762 prefix searches/second
125 | 
126 | 
127 | There are a few takeaways from this. First, there is no strong correlation between the length of keys and insert or retrieve time. Both stay fairly constant as the length of the keys increases. Secondly, doing prefix searches with this trie gets slower linearly with the length of the keys in the trie.
128 | 
129 | This points to a limitation of this type of trie.  It is based on "libdatrie":http://linux.thai.net/~thep/datrie/ ("version 0.1.99":http://linux.thai.net/svn/software/datrie/trunk/NEWS), which is a dual-array trie.  When finding branches from a particular node, we must query all possible branches to determine whether or not they exist.  So for each node we do 255 of these queries.
130 | 
131 | There may be some tricks to speed this up, but for now it is simply a limitation of this trie.
132 | 
133 | Now, let's look at the effect of the size of the trie itself on query and insertion time.  For this test I inserted 100, 1000, 10000, 100000, and 1000000 words in the trie.  We measure the insertion and retrieval time in each.  The graph below shows the results.
134 | 
135 | !http://codehallow.com/effect_of_size.png!
136 | 
137 | So, keeping in mind that we're increasing by orders of magnitude, you can see that the insertion time does take a significant hit.  Retrieval also goes down but at a very gradual rate.  (It decreases by about 50% in total, despite the size increasing by 1,000,000%.)
138 | 
139 | The reason insertion time takes such a beating is, again, a limitation of the trie.  Storing a trie in the dual array setup that is used is excellent for memory usage and retrieval time.  Best in class, in fact. However, the more things are added into the trie the more complicated it gets to insert things.  It often requires shuffling large pieces of the arrays.  There may be room for optimization here, but ultimately insertion time will increase with the size of the trie.
140 | 
141 | 
142 | 
143 | Copyright (c) 2008 Tyler McMullen. See LICENSE for details.
144 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | # Build script: sets up Bundler, declares the gem through Jeweler, and wires the compile/spec/rdoc tasks.
 3 | require 'rubygems'
 4 | require 'bundler'
 5 | begin
 6 |   Bundler.setup(:default, :development)
 7 | rescue Bundler::BundlerError => e
 8 |   $stderr.puts e.message
 9 |   $stderr.puts "Run `bundle install` to install missing gems"
10 |   exit e.status_code
11 | end
12 | require 'rake'
13 | # Gem metadata is declared through Jeweler; the returned task object is reused further down.
14 | require 'jeweler'
15 | 
16 | jeweler_tasks = Jeweler::Tasks.new do |s|
17 |     s.name = "fast_trie"
18 |     s.email = "tyler@scribd.com"
19 |     s.homepage = "http://github.com/tyler/trie"
20 |     s.description = "Ruby Trie based on libdatrie."
21 |     s.summary = s.description
22 |     s.authors = ["Tyler McMullen", "Matt Hickford"]
23 |     s.extensions = ['ext/trie/extconf.rb']
24 |     s.require_paths = ['ext']
25 |     s.files = FileList["[A-Z]*.*", "{spec,ext}/**/*"] # NOTE(review): "[A-Z]*.*" needs a dot in the name, so LICENSE looks excluded -- confirm
26 |     s.has_rdoc = true
27 |     s.rdoc_options = ['--title', 'Trie', '--line-numbers', '--op', 'rdoc', '--main', 'ext/trie/trie.c', 'README']
28 | end
29 | Jeweler::RubygemsDotOrgTasks.new
30 | # Share Jeweler's gemspec, stamped with the version Jeweler reports, with the extension task below.
31 | $gemspec         = jeweler_tasks.gemspec
32 | $gemspec.version = jeweler_tasks.jeweler.version
33 | # Extension task for the 'trie' C extension (the default task depends on :compile).
34 | require 'rake/extensiontask'
35 | Rake::ExtensionTask.new('trie', $gemspec)
36 | CLEAN.include 'lib/**/*.so'
37 | # Spec task with RSpec's default settings (the default task depends on :spec).
38 | require 'rspec/core/rake_task'
39 | RSpec::Core::RakeTask.new
40 | # RDoc output goes to rdoc/ and covers the README plus ext/trie/trie.c.
41 | require 'rdoc/task'
42 | Rake::RDocTask.new do |rdoc|
43 |   rdoc.rdoc_dir = 'rdoc'
44 |   rdoc.title    = 'Trie'
45 |   rdoc.options << '--line-numbers' << '--inline-source'
46 |   rdoc.rdoc_files.include('README*')
47 |   rdoc.rdoc_files.include('ext/trie/trie.c')
48 | end
49 | # Default: build the extension first, then run the specs.
50 | task :default => [:compile, :spec]
51 | 


--------------------------------------------------------------------------------
/VERSION.yml:
--------------------------------------------------------------------------------
1 | --- 
2 | :major: 0
3 | :minor: 5
4 | :patch: 1
5 | :build: 
6 | 


--------------------------------------------------------------------------------
/ext/trie/darray.c:
--------------------------------------------------------------------------------
  1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
  2 | /*
  3 |  * darray.c - Double-array trie structure
  4 |  * Created: 2006-08-13
  5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
  6 |  */
  7 | 
  8 | #include <string.h>
  9 | #include <stdlib.h>
 10 | #include <stdio.h>
 11 | 
 12 | #include "trie-private.h"
 13 | #include "darray.h"
 14 | #include "fileutils.h"
 15 | 
 16 | /*----------------------------------*
 17 |  *    INTERNAL TYPES DECLARATIONS   *
 18 |  *----------------------------------*/
 19 | 
 20 | typedef struct _Symbols Symbols;
 21 | /* Fixed-capacity list of transition symbols. */
 22 | struct _Symbols {
 23 |     short       num_symbols;    /* number of entries in use in symbols[] */
 24 |     TrieChar    symbols[256];   /* the symbols; symbols_add () keeps them in ascending order */
 25 | };
 26 | 
 27 | static Symbols *    symbols_new ();
 28 | static void         symbols_free (Symbols *syms);
 29 | static void         symbols_add (Symbols *syms, TrieChar c);
 30 | 
 31 | #define symbols_num(s)          ((s)->num_symbols)
 32 | #define symbols_get(s,i)        ((s)->symbols[i])
 33 | #define symbols_add_fast(s,c)   ((s)->symbols[(s)->num_symbols++] = c)
 34 | 
 35 | /*-----------------------------------*
 36 |  *    PRIVATE METHODS DECLARATIONS   *
 37 |  *-----------------------------------*/
 38 | 
 39 | #define da_get_free_list(d)      (1)
 40 | 
 41 | static Bool         da_check_free_cell (DArray         *d,
 42 |                                         TrieIndex       s);
 43 | 
 44 | static Bool         da_has_children    (DArray         *d,
 45 |                                         TrieIndex       s);
 46 | 
 47 | static Symbols *    da_output_symbols  (const DArray   *d,
 48 |                                         TrieIndex       s);
 49 | 
 50 | static TrieChar *   da_get_state_key   (const DArray   *d,
 51 |                                         TrieIndex       state);
 52 | 
 53 | static TrieIndex    da_find_free_base  (DArray         *d,
 54 |                                         const Symbols  *symbols);
 55 | 
 56 | static Bool         da_fit_symbols     (DArray         *d,
 57 |                                         TrieIndex       base,
 58 |                                         const Symbols  *symbols);
 59 | 
 60 | static void         da_relocate_base   (DArray         *d,
 61 |                                         TrieIndex       s,
 62 |                                         TrieIndex       new_base);
 63 | 
 64 | static Bool         da_extend_pool     (DArray         *d,
 65 |                                         TrieIndex       to_index);
 66 | 
 67 | static void         da_alloc_cell      (DArray         *d,
 68 |                                         TrieIndex       cell);
 69 | 
 70 | static void         da_free_cell       (DArray         *d,
 71 |                                         TrieIndex       cell);
 72 | 
 73 | static Bool         da_enumerate_recursive (const DArray   *d,
 74 |                                             TrieIndex       state,
 75 |                                             DAEnumFunc      enum_func,
 76 |                                             void           *user_data);
 77 | 
 78 | /* ==================== BEGIN IMPLEMENTATION PART ====================  */
 79 | 
 80 | /*------------------------------------*
 81 |  *   INTERNAL TYPES IMPLEMENTATIONS   *
 82 |  *------------------------------------*/
 83 | /*
 84 |  * Allocate an empty symbol set.  Returns NULL when malloc () fails.
 85 |  */
 86 | static Symbols *
 87 | symbols_new ()
 88 | {
 89 |     Symbols *set = (Symbols *) malloc (sizeof (Symbols));
 90 | 
 91 |     if (NULL != set) {
 92 |         /* a fresh set holds no symbols yet */
 93 |         set->num_symbols = 0;
 94 |     }
 95 | 
 96 |     return set;
 97 | }
 98 | /* Release a symbol set; simply hands syms to free (). */
 99 | static void
100 | symbols_free (Symbols *syms)
101 | {
102 |     free (syms);
103 | }
104 | /* Insert c into the ascending symbols[] array unless it is already present. */
105 | static void
106 | symbols_add (Symbols *syms, TrieChar c)
107 | {
108 |     short lo = 0;
109 |     short hi = syms->num_symbols;
110 | 
111 |     /* binary search over [lo, hi) for c itself or for its insertion slot */
112 |     while (lo < hi) {
113 |         short mid = (lo + hi) / 2;
114 | 
115 |         if (syms->symbols[mid] == c)
116 |             return;                 /* duplicate: leave the set untouched */
117 |         if (syms->symbols[mid] < c)
118 |             lo = mid + 1;
119 |         else
120 |             hi = mid;
121 |     }
122 | 
123 |     /* open a gap at lo by shifting the tail one slot to the right */
124 |     if (lo < syms->num_symbols)
125 |         memmove (&syms->symbols[lo + 1], &syms->symbols[lo],
126 |                  syms->num_symbols - lo);
127 |     syms->symbols[lo] = c;
128 |     syms->num_symbols++;
129 | }
130 | 
131 | /*------------------------------*
132 |  *    PRIVATE DATA DEFINITIONS  *
133 |  *------------------------------*/
134 | 
135 | typedef struct {
136 |     TrieIndex   base;       /* BASE of the cell; a free cell stores -(previous free cell) here */
137 |     TrieIndex   check;      /* CHECK: the parent state; a free cell stores -(next free cell) here */
138 | } DACell;
139 | 
140 | struct _DArray {
141 |     TrieIndex   num_cells;  /* number of entries allocated in cells[] */
142 |     DACell     *cells;      /* cell storage; cells[0] is the header (signature, cell count) */
143 | };
144 | 
145 | /*-----------------------------*
146 |  *    METHODS IMPLEMENTATIONS  *
147 |  *-----------------------------*/
148 | 
149 | #define DA_SIGNATURE 0xDAFCDAFC
150 | 
151 | /* DA Header:
152 |  * - Cell 0: SIGNATURE, number of cells
153 |  * - Cell 1: free circular-list pointers
154 |  * - Cell 2: root node
155 |  * - Cell 3: DA pool begin
156 |  */
157 | #define DA_POOL_BEGIN 3
158 | /* Create a double-array holding only its header cells; NULL when malloc () fails. */
159 | DArray *
160 | da_new ()
161 | {
162 |     DArray     *da = (DArray *) malloc (sizeof (DArray));
163 | 
164 |     if (NULL == da)
165 |         return NULL;
166 |     da->num_cells = DA_POOL_BEGIN;
167 |     da->cells     = (DACell *) malloc (da->num_cells * sizeof (DACell));
168 |     if (NULL == da->cells) {
169 |         free (da);              /* do not leak the descriptor */
170 |         return NULL;
171 |     }
172 | 
173 |     /* cell 0: signature and cell count */
174 |     da->cells[0].base  = DA_SIGNATURE;
175 |     da->cells[0].check = da->num_cells;
176 |     /* cell 1: free-list head, linked to itself while the list is empty */
177 |     da->cells[1].base  = -1;
178 |     da->cells[1].check = -1;
179 |     /* cell 2: root node, its BASE set to the start of the pool */
180 |     da->cells[2].base  = DA_POOL_BEGIN;
181 |     da->cells[2].check = 0;
182 | 
183 |     return da;
184 | }
185 | /*
186 |  * Load a double-array from file, starting at the current file position.
187 |  *
188 |  * Returns the newly allocated DArray (release it with da_free ()), or NULL
189 |  * when the signature does not match, the stored cell count is smaller than
190 |  * the header, memory cannot be allocated, or the data ends early.  On every
191 |  * failure the file position is put back to where it was on entry.
192 |  */
193 | DArray *
194 | da_read (FILE *file)
195 | {
196 |     long        save_pos;
197 |     DArray     *d = NULL;
198 |     TrieIndex   n;
199 | 
200 |     /* check signature */
201 |     save_pos = ftell (file);
202 |     if (!file_read_int32 (file, &n) || DA_SIGNATURE != (uint32) n)
203 |         goto exit_file_read;
204 | 
205 |     d = (DArray *) malloc (sizeof (DArray));
206 |     if (!d)
207 |         goto exit_file_read;
208 | 
209 |     /* read number of cells; anything below the header size (including a
210 |      * negative count) cannot describe a valid array and must not reach
211 |      * malloc () or the cells[0] stores below
212 |      */
213 |     if (!file_read_int32 (file, &d->num_cells)
214 |         || d->num_cells < DA_POOL_BEGIN)
215 |     {
216 |         goto exit_da_created;
217 |     }
218 |     d->cells     = (DACell *) malloc (d->num_cells * sizeof (DACell));
219 |     if (!d->cells)
220 |         goto exit_da_created;
221 |     d->cells[0].base = DA_SIGNATURE;
222 |     d->cells[0].check= d->num_cells;
223 |     for (n = 1; n < d->num_cells; n++) {
224 |         /* a short read would otherwise leave uninitialised cells behind */
225 |         if (!file_read_int32 (file, &d->cells[n].base) ||
226 |             !file_read_int32 (file, &d->cells[n].check))
227 |         {
228 |             goto exit_da_cells_created;
229 |         }
230 |     }
231 | 
232 |     return d;
233 | 
234 | exit_da_cells_created:
235 |     free (d->cells);
236 | exit_da_created:
237 |     free (d);
238 | exit_file_read:
239 |     fseek (file, save_pos, SEEK_SET);
240 |     return NULL;
241 | }
222 | /* Release the cell array and then the DArray itself; d is dereferenced, so it must not be NULL. */
223 | void
224 | da_free (DArray *d)
225 | {
226 |     free (d->cells);
227 |     free (d);
228 | }
229 | 
230 | int
231 | da_write (const DArray *d, FILE *file)
232 | {
233 |     TrieIndex   i;
234 | 
235 |     for (i = 0; i < d->num_cells; i++) {
236 |         if (!file_write_int32 (file, d->cells[i].base) ||
237 |             !file_write_int32 (file, d->cells[i].check))
238 |         {
239 |             return -1;
240 |         }
241 |     }
242 | 
243 |     return 0;
244 | }
245 | 
246 | /* Index of the root state: always cell 2 here, so d is not consulted. */
247 | TrieIndex
248 | da_get_root (const DArray *d)
249 | {
250 |     /* can be calculated value for multi-index trie */
251 |     return 2;
252 | }
253 | 
254 | /* BASE of state s, or TRIE_INDEX_ERROR when s lies outside the cell array. */
255 | TrieIndex
256 | da_get_base (const DArray *d, TrieIndex s)
257 | {
258 |     return (s < 0 || s >= d->num_cells) ? TRIE_INDEX_ERROR : d->cells[s].base;
259 | }
260 | /* CHECK of state s, or TRIE_INDEX_ERROR when s lies outside the cell array. */
261 | TrieIndex
262 | da_get_check (const DArray *d, TrieIndex s)
263 | {
264 |     return (s < 0 || s >= d->num_cells) ? TRIE_INDEX_ERROR : d->cells[s].check;
265 | }
266 | 
267 | /* Store val as BASE of state s; an s outside the cell array is ignored. */
268 | void
269 | da_set_base (DArray *d, TrieIndex s, TrieIndex val)
270 | {
271 |     if (s < 0 || s >= d->num_cells)
272 |         return;                     /* out of range: nothing to write */
273 |     d->cells[s].base = val;
274 | }
275 | /* Store val as CHECK of state s; an s outside the cell array is ignored. */
276 | void
277 | da_set_check (DArray *d, TrieIndex s, TrieIndex val)
278 | {
279 |     if (s < 0 || s >= d->num_cells)
280 |         return;                     /* out of range: nothing to write */
281 |     d->cells[s].check = val;
282 | }
283 | /* Follow the transition labelled c out of *s; on success *s becomes the target state. */
284 | Bool
285 | da_walk (const DArray *d, TrieIndex *s, TrieChar c)
286 | {
287 |     TrieIndex   target = da_get_base (d, *s) + c;
288 | 
289 |     /* the transition exists only if the target cell names *s as its parent */
290 |     if (da_get_check (d, target) != *s)
291 |         return FALSE;
292 | 
293 |     *s = target;
294 |     return TRUE;
295 | }
296 | /*
297 |  * Make sure state s has a child along symbol c and return that child's index.
298 |  *
299 |  * An existing transition is returned unchanged.  Otherwise a cell is taken
300 |  * from the free list; when the wanted slot is occupied or would overflow the
301 |  * index range, the children of s are first relocated to a new BASE.
302 |  *
303 |  * Returns TRIE_INDEX_ERROR when no suitable base can be found or when the
304 |  * temporary symbol set cannot be allocated.
305 |  */
306 | TrieIndex
307 | da_insert_branch (DArray *d, TrieIndex s, TrieChar c)
308 | {
309 |     TrieIndex   base, next;
310 | 
311 |     base = da_get_base (d, s);
312 | 
313 |     if (base > 0) {
314 |         next = base + c;
315 | 
316 |         /* if already there, do not actually insert */
317 |         if (da_get_check (d, next) == s)
318 |             return next;
319 | 
320 |         /* if (base + c) > TRIE_INDEX_MAX which means 'next' is overflow,
321 |          * or cell [next] is not free, relocate to a free slot
322 |          */
323 |         if (base > TRIE_INDEX_MAX - c || !da_check_free_cell (d, next)) {
324 |             Symbols    *symbols;
325 |             TrieIndex   new_base;
326 | 
327 |             /* relocate BASE[s] */
328 |             symbols = da_output_symbols (d, s);
329 |             /* symbols_add () dereferences the set, so bail out on NULL */
330 |             if (!symbols)
331 |                 return TRIE_INDEX_ERROR;
332 |             symbols_add (symbols, c);
333 |             new_base = da_find_free_base (d, symbols);
334 |             symbols_free (symbols);
335 | 
336 |             if (TRIE_INDEX_ERROR == new_base)
337 |                 return TRIE_INDEX_ERROR;
338 | 
339 |             da_relocate_base (d, s, new_base);
340 |             next = new_base + c;
341 |         }
342 |     } else {
343 |         /* s has no child block yet: pick a base for the single symbol c */
344 |         Symbols    *symbols;
345 |         TrieIndex   new_base;
346 | 
347 |         symbols = symbols_new ();
348 |         /* allocation failure is reported like any other insertion error */
349 |         if (!symbols)
350 |             return TRIE_INDEX_ERROR;
351 |         symbols_add (symbols, c);
352 |         new_base = da_find_free_base (d, symbols);
353 |         symbols_free (symbols);
354 | 
355 |         if (TRIE_INDEX_ERROR == new_base)
356 |             return TRIE_INDEX_ERROR;
357 | 
358 |         da_set_base (d, s, new_base);
359 |         next = new_base + c;
360 |     }
361 |     /* take the cell off the free list, then mark s as its parent */
362 |     da_alloc_cell (d, next);
363 |     da_set_check (d, next, s);
364 | 
365 |     return next;
366 | }
350 | /* Non-zero when cell s exists (growing the pool if needed) and is still free. */
351 | static Bool
352 | da_check_free_cell (DArray *d, TrieIndex s)
353 | {
354 |     /* a negative CHECK marks a cell that is on the free list */
355 |     return !(!da_extend_pool (d, s) || da_get_check (d, s) >= 0);
356 | }
357 | /* TRUE when some cell in the symbol range above BASE[s] names s as its parent. */
358 | static Bool
359 | da_has_children    (DArray         *d,
360 |                     TrieIndex       s)
361 | {
362 |     TrieIndex   base = da_get_base (d, s);
363 |     TrieIndex   sym, limit;
364 | 
365 |     /* an error value or a negative BASE means there is no child block */
366 |     if (base < 0 || TRIE_INDEX_ERROR == base)
367 |         return FALSE;
368 | 
369 |     /* NOTE(review): the bound is exclusive, so symbol `limit` itself is never probed */
370 |     limit = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - base);
371 |     for (sym = 0; limit > sym; sym++)
372 |         if (da_get_check (d, base + sym) == s)
373 |             return TRUE;
374 | 
375 |     return FALSE;
376 | }
377 | /* Collect the symbols of all children of s into a new set, which the caller frees. */
378 | static Symbols *
379 | da_output_symbols  (const DArray   *d,
380 |                     TrieIndex       s)
381 | {
382 |     Symbols    *syms;
383 |     TrieIndex   base;
384 |     TrieIndex   c, max_c;
385 | 
386 |     syms = symbols_new ();
387 |     /* NOTE(review): syms is not checked for NULL before symbols_add_fast () writes to it */
388 |     base = da_get_base (d, s);
389 |     max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - base);
390 |     for (c = 0; c < max_c; c++) {
391 |         if (da_get_check (d, base + c) == s)
392 |             symbols_add_fast (syms, (TrieChar) c);  /* c ascends, so appending keeps the set ordered */
393 |     }
394 | 
395 |     return syms;
396 | }
397 | /* Rebuild the key that leads from the root to state as a malloc'ed, NUL-terminated string. */
398 | static TrieChar *
399 | da_get_state_key   (const DArray   *d,
400 |                     TrieIndex       state)
401 | {
402 |     TrieChar   *key;
403 |     int         key_size, key_length;
404 |     int         i;
405 |     /* NOTE(review): the malloc () and realloc () results below are used without a NULL check */
406 |     key_size = 20;
407 |     key_length = 0;
408 |     key = (TrieChar *) malloc (key_size);
409 | 
410 |     /* trace back to root */
411 |     while (da_get_root (d) != state) {
412 |         TrieIndex   parent;
413 |         /* grow in steps of 20, always keeping room for the terminating NUL */
414 |         if (key_length + 1 >= key_size) {
415 |             key_size += 20;
416 |             key = (TrieChar *) realloc (key, key_size);
417 |         }
418 |         parent = da_get_check (d, state);
419 |         key[key_length++] = (TrieChar) (state - da_get_base (d, parent));  /* symbol = child - BASE[parent] */
420 |         state = parent;
421 |     }
422 |     key[key_length] = '\0';
423 | 
424 |     /* reverse the string; --key_length in the loop test moves the right-hand index inward */
425 |     for (i = 0; i < --key_length; i++) {
426 |         TrieChar temp;
427 | 
428 |         temp = key[i];
429 |         key[i] = key[key_length];
430 |         key[key_length] = temp;
431 |     }
432 | 
433 |     return key;
434 | }
435 | /* Find a BASE at which every symbol of the set lands on a free cell; TRIE_INDEX_ERROR if the pool cannot grow. */
436 | static TrieIndex
437 | da_find_free_base  (DArray         *d,
438 |                     const Symbols  *symbols)
439 | {
440 |     TrieChar        first_sym;
441 |     TrieIndex       s;
442 | 
443 |     /* find first free cell that is beyond the first symbol */
444 |     first_sym = symbols_get (symbols, 0);
445 |     s = -da_get_check (d, da_get_free_list (d));    /* first cell on the free list */
446 |     while (s != da_get_free_list (d)
447 |            && s < (TrieIndex) first_sym + DA_POOL_BEGIN)
448 |     {
449 |         s = -da_get_check (d, s);
450 |     }
451 |     if (s == da_get_free_list (d)) {    /* walked the whole list: probe upward, growing the pool */
452 |         for (s = first_sym + DA_POOL_BEGIN; ; ++s) {
453 |             if (!da_extend_pool (d, s))
454 |                 return TRIE_INDEX_ERROR;
455 |             if (da_get_check (d, s) < 0)
456 |                 break;
457 |         }
458 |     }
459 | 
460 |     /* search for next free cell that fits the symbols set */
461 |     while (!da_fit_symbols (d, s - first_sym, symbols)) {
462 |         /* extend pool before getting exhausted */
463 |         if (-da_get_check (d, s) == da_get_free_list (d)) {
464 |             if (!da_extend_pool (d, d->num_cells))
465 |                 return TRIE_INDEX_ERROR;
466 |         }
467 |         /* step to the next free cell; CHECK of a free cell holds its negated index */
468 |         s = -da_get_check (d, s);
469 |     }
470 | 
471 |     return s - first_sym;   /* the BASE for which BASE + first_sym == s */
472 | }
473 | /* TRUE when every symbol of the set would land on a free cell if placed at base. */
474 | static Bool
475 | da_fit_symbols     (DArray         *d,
476 |                     TrieIndex       base,
477 |                     const Symbols  *symbols)
478 | {
479 |     int         idx;
480 | 
481 |     for (idx = 0; idx < symbols_num (symbols); ++idx) {
482 |         TrieChar    sym = symbols_get (symbols, idx);
483 | 
484 |         /* base + sym must not pass TRIE_INDEX_MAX, and that cell must be free */
485 |         if (base > TRIE_INDEX_MAX - sym)
486 |             return FALSE;
487 |         if (!da_check_free_cell (d, base + sym))
488 |             return FALSE;
489 |     }
490 |     return TRUE;
491 | }
492 | /* Move every child of s from its current BASE to new_base, then point BASE[s] at new_base. */
493 | static void
494 | da_relocate_base   (DArray         *d,
495 |                     TrieIndex       s,
496 |                     TrieIndex       new_base)
497 | {
498 |     TrieIndex   old_base;
499 |     Symbols    *symbols;
500 |     int         i;
501 |     /* snapshot the current children of s before any cell is moved */
502 |     old_base = da_get_base (d, s);
503 |     symbols = da_output_symbols (d, s);
504 | 
505 |     for (i = 0; i < symbols_num (symbols); i++) {
506 |         TrieIndex   old_next, new_next, old_next_base;
507 |         /* the i-th child: its current cell and the cell it moves to */
508 |         old_next = old_base + symbols_get (symbols, i);
509 |         new_next = new_base + symbols_get (symbols, i);
510 |         old_next_base = da_get_base (d, old_next);
511 | 
512 |         /* allocate new next node and copy BASE value */
513 |         da_alloc_cell (d, new_next);
514 |         da_set_check (d, new_next, s);
515 |         da_set_base (d, new_next, old_next_base);
516 | 
517 |         /* old_next node is now moved to new_next
518 |          * so, all cells belonging to old_next
519 |          * must be given to new_next
520 |          */
521 |         /* preventing the case of TAIL pointer: only a positive BASE has child cells to re-parent */
522 |         if (old_next_base > 0) {
523 |             TrieIndex   c, max_c;
524 | 
525 |             max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - old_next_base);
526 |             for  (c = 0; c < max_c; c++) {
527 |                 if (da_get_check (d, old_next_base + c) == old_next)
528 |                     da_set_check (d, old_next_base + c, new_next);
529 |             }
530 |         }
531 | 
532 |         /* free old_next node */
533 |         da_free_cell (d, old_next);
534 |     }
535 | 
536 |     symbols_free (symbols);
537 | 
538 |     /* finally, make BASE[s] point to new_base */
539 |     da_set_base (d, s, new_base);
540 | }
541 | /*
542 |  * Make sure cell to_index exists, growing the cell array when necessary and
543 |  * linking every newly added cell into the circular free list.
544 |  *
545 |  * Returns TRUE when to_index is usable afterwards.  Returns FALSE when
546 |  * to_index is not inside (0, TRIE_INDEX_MAX) or when realloc () fails; in
547 |  * both cases the array is left exactly as it was.
548 |  */
549 | static Bool
550 | da_extend_pool     (DArray         *d,
551 |                     TrieIndex       to_index)
552 | {
553 |     DACell     *new_cells;
554 |     TrieIndex   new_begin;
555 |     TrieIndex   i;
556 |     TrieIndex   free_tail;
557 | 
558 |     if (to_index <= 0 || TRIE_INDEX_MAX <= to_index)
559 |         return FALSE;
560 | 
561 |     if (to_index < d->num_cells)
562 |         return TRUE;
563 | 
564 |     /* keep the old pointer until realloc () is known to have succeeded */
565 |     new_cells = (DACell *) realloc (d->cells, (to_index + 1) * sizeof (DACell));
566 |     if (!new_cells)
567 |         return FALSE;
568 |     d->cells = new_cells;
569 |     new_begin = d->num_cells;
570 |     d->num_cells = to_index + 1;
571 | 
572 |     /* initialize new free list */
573 |     for (i = new_begin; i < to_index; i++) {
574 |         da_set_check (d, i, -(i + 1));
575 |         da_set_base (d, i + 1, -i);
576 |     }
577 | 
578 |     /* merge the new circular list to the old */
579 |     free_tail = -da_get_base (d, da_get_free_list (d));
580 |     da_set_check (d, free_tail, -new_begin);
581 |     da_set_base (d, new_begin, -free_tail);
582 |     da_set_check (d, to_index, -da_get_free_list (d));
583 |     da_set_base (d, da_get_free_list (d), -to_index);
584 | 
585 |     /* update header cell */
586 |     d->cells[0].check = d->num_cells;
587 | 
588 |     return TRUE;
589 | }
578 | /* Prune from s back towards the root; shorthand for da_prune_upto () with the root as limit. */
579 | void
580 | da_prune (DArray *d, TrieIndex s)
581 | {
582 |     da_prune_upto (d, da_get_root (d), s);
583 | }
584 | /* Free childless cells from s upward, stopping at p or at the first node that has children. */
585 | void
586 | da_prune_upto (DArray *d, TrieIndex p, TrieIndex s)
587 | {
588 |     TrieIndex   up;
589 | 
590 |     /* the parent is read before the cell is freed, because freeing rewrites CHECK */
591 |     for (; s != p && !da_has_children (d, s); s = up) {
592 |         up = da_get_check (d, s);
593 |         da_free_cell (d, s);
594 |     }
595 | }
596 | 
597 | static void
598 | da_alloc_cell      (DArray         *d,
599 |                     TrieIndex       cell)
600 | {
601 |     TrieIndex   prev, next;
602 | 
603 |     prev = -da_get_base (d, cell);
604 |     next = -da_get_check (d, cell);
605 | 
606 |     /* remove the cell from free list */
607 |     da_set_check (d, prev, -next);
608 |     da_set_base (d, next, -prev);
609 | }
610 | 
611 | static void
612 | da_free_cell       (DArray         *d,
613 |                     TrieIndex       cell)
614 | {
615 |     TrieIndex   i, prev;
616 | 
617 |     /* find insertion point */
618 |     i = -da_get_check (d, da_get_free_list (d));
619 |     while (i != da_get_free_list (d) && i < cell)
620 |         i = -da_get_check (d, i);
621 | 
622 |     prev = -da_get_base (d, i);
623 | 
624 |     /* insert cell before i */
625 |     da_set_check (d, cell, -i);
626 |     da_set_base (d, cell, -prev);
627 |     da_set_check (d, prev, -cell);
628 |     da_set_base (d, i, -cell);
629 | }
630 | 
/*
 * Enumerate the entries of `d`, starting from its root state.
 * Forwards to da_enumerate_recursive() and returns its result.
 */
Bool
da_enumerate (const DArray *d, DAEnumFunc enum_func, void *user_data)
{
    return da_enumerate_recursive (d, da_get_root (d), enum_func, user_data);
}
636 | 
637 | static Bool
638 | da_enumerate_recursive (const DArray   *d,
639 |                         TrieIndex       state,
640 |                         DAEnumFunc      enum_func,
641 |                         void           *user_data)
642 | {
643 |     Bool        ret;
644 |     TrieIndex   base;
645 | 
646 |     base = da_get_base (d, state);
647 | 
648 |     if (base < 0) {
649 |         TrieChar   *key;
650 | 
651 |         key = da_get_state_key (d, state);
652 |         ret = (*enum_func) (key, state, user_data);
653 |         free (key);
654 |     } else {
655 |         Symbols *symbols;
656 |         int      i;
657 | 
658 |         ret = TRUE;
659 |         symbols = da_output_symbols (d, state);
660 |         for (i = 0; ret && i < symbols_num (symbols); i++) {
661 |             ret = da_enumerate_recursive (d, base + symbols_get (symbols, i),
662 |                                           enum_func, user_data);
663 |         }
664 | 
665 |         symbols_free (symbols);
666 |     }
667 | 
668 |     return ret;
669 | }
670 | 
671 | /*
672 | vi:ts=4:ai:expandtab
673 | */
674 | 


--------------------------------------------------------------------------------
/ext/trie/darray.h:
--------------------------------------------------------------------------------
  1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
  2 | /*
  3 |  * darray.h - Double-array trie structure
  4 |  * Created: 2006-08-11
  5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
  6 |  */
  7 | 
  8 | #ifndef __DARRAY_H
  9 | #define __DARRAY_H
 10 | 
 11 | #include "triedefs.h"
 12 | 
 13 | /**
 14 |  * @file darray.h
 15 |  * @brief Double-array trie structure
 16 |  */
 17 | 
 18 | /**
 19 |  * @brief Double-array structure type
 20 |  */
 21 | typedef struct _DArray  DArray;
 22 | 
 23 | /**
 24 |  * @brief Double-array entry enumeration function
 25 |  *
 26 |  * @param key       : the key of the entry, up to @a sep_node
 27 |  * @param sep_node  : the separate node of the entry
 28 |  * @param user_data : user-supplied data
 29 |  *
 30 |  * @return TRUE to continue enumeration, FALSE to stop
 31 |  */
 32 | typedef Bool (*DAEnumFunc) (const TrieChar   *key,
 33 |                             TrieIndex         sep_node,
 34 |                             void             *user_data);
 35 | 
 36 | 
 37 | /**
 38 |  * @brief Create a new double-array object
 39 |  *
 40 |  * Create a new empty double-array object.
 41 |  */
 42 | DArray * da_new ();
 43 | 
 44 | /**
 45 |  * @brief Read double-array data from file
 46 |  *
 47 |  * @param file : the file to read
 48 |  *
 49 |  * @return a pointer to the opened double-array, NULL on failure
 50 |  *
 51 |  * Read double-array data from the opened file, starting from the current
 52 |  * file pointer until the end of double array data block. On return, the
 53 |  * file pointer is left at the position after the read block.
 54 |  */
 55 | DArray * da_read (FILE *file);
 56 | 
 57 | /**
 58 |  * @brief Free double-array data
 59 |  *
 60 |  * @param d : the double-array data
 61 |  *
 62 |  * Free the given double-array data.
 63 |  */
 64 | void     da_free (DArray *d);
 65 | 
 66 | /**
 67 |  * @brief Write double-array data
 68 |  *
 69 |  * @param d     : the double-array data
 70 |  * @param file  : the file to write to
 71 |  *
 72 |  * @return 0 on success, non-zero on failure
 73 |  *
 74 |  * Write double-array data to the given @a file, starting from the current
 75 |  * file pointer. On return, the file pointer is left after the double-array
 76 |  * data block.
 77 |  */
 78 | int      da_write (const DArray *d, FILE *file);
 79 | 
 80 | 
 81 | /**
 82 |  * @brief Get root state
 83 |  *
 84 |  * @param d     : the double-array data
 85 |  *
 86 |  * @return root state of the @a index set, or TRIE_INDEX_ERROR on failure
 87 |  *
 88 |  * Get root state for stepwise walking.
 89 |  */
 90 | TrieIndex  da_get_root (const DArray *d);
 91 | 
 92 | 
 93 | /**
 94 |  * @brief Get BASE cell
 95 |  *
 96 |  * @param d : the double-array data
 97 |  * @param s : the double-array state to get data
 98 |  *
 99 |  * @return the BASE cell value for the given state
100 |  *
101 |  * Get BASE cell value for the given state.
102 |  */
103 | TrieIndex  da_get_base (const DArray *d, TrieIndex s);
104 | 
105 | /**
106 |  * @brief Get CHECK cell
107 |  *
108 |  * @param d : the double-array data
109 |  * @param s : the double-array state to get data
110 |  *
111 |  * @return the CHECK cell value for the given state
112 |  *
113 |  * Get CHECK cell value for the given state.
114 |  */
115 | TrieIndex  da_get_check (const DArray *d, TrieIndex s);
116 | 
117 | 
118 | /**
119 |  * @brief Set BASE cell
120 |  *
121 |  * @param d   : the double-array data
122 |  * @param s   : the double-array state to get data
123 |  * @param val : the value to set
124 |  *
125 |  * Set BASE cell for the given state to the given value.
126 |  */
127 | void       da_set_base (DArray *d, TrieIndex s, TrieIndex val);
128 | 
129 | /**
130 |  * @brief Set CHECK cell
131 |  *
132 |  * @param d   : the double-array data
133 |  * @param s   : the double-array state to get data
134 |  * @param val : the value to set
135 |  *
136 |  * Set CHECK cell for the given state to the given value.
137 |  */
138 | void       da_set_check (DArray *d, TrieIndex s, TrieIndex val);
139 | 
140 | /**
141 |  * @brief Walk in double-array structure
142 |  *
143 |  * @param d : the double-array structure
144 |  * @param s : current state
145 |  * @param c : the input character
146 |  *
147 |  * @return boolean indicating success
148 |  *
149 |  * Walk the double-array trie from state @a *s, using input character @a c.
150 |  * If there exists an edge from @a *s with arc labeled @a c, this function
151 |  * returns TRUE and @a *s is updated to the new state. Otherwise, it returns
152 |  * FALSE and @a *s is left unchanged.
153 |  */
154 | Bool       da_walk (const DArray *d, TrieIndex *s, TrieChar c);
155 | 
156 | /**
157 |  * @brief Test walkability in double-array structure
158 |  *
159 |  * @param d : the double-array structure
160 |  * @param s : current state
161 |  * @param c : the input character
162 |  *
163 |  * @return boolean indicating walkability
164 |  *
165 |  * Test if there is a transition from state @a s with input character @a c.
166 |  */
167 | /*
168 | Bool       da_is_walkable (DArray *d, TrieIndex s, TrieChar c);
169 | */
/* Function-like macro: (d) and (s) are each expanded twice, so avoid
 * passing arguments with side effects. */
#define    da_is_walkable(d,s,c) \
    (da_get_check ((d), da_get_base ((d), (s)) + (c)) == (s))
172 | 
173 | /**
174 |  * @brief Insert a branch from trie node
175 |  *
176 |  * @param d : the double-array structure
177 |  * @param s : the state to add branch to
178 |  * @param c : the character for the branch label
179 |  *
180 |  * @return the index of the new node
181 |  *
182 |  * Insert a new arc labelled with character @a c from the trie node 
183 |  * represented by index @a s in double-array structure @a d.
184 |  * Note that it assumes that no such arc exists before inserting.
185 |  */
186 | TrieIndex  da_insert_branch (DArray *d, TrieIndex s, TrieChar c);
187 | 
188 | /**
189 |  * @brief Prune the single branch
190 |  *
191 |  * @param d : the double-array structure
192 |  * @param s : the dangling state to prune off
193 |  *
194 |  * Prune off a non-separate path up from the final state @a s.
195 |  * If @a s still has some children states, it does nothing. Otherwise, 
196 |  * it deletes the node and all its parents which become non-separate.
197 |  */
198 | void       da_prune (DArray *d, TrieIndex s);
199 | 
200 | /**
201 |  * @brief Prune the single branch up to given parent
202 |  *
203 |  * @param d : the double-array structure
204 |  * @param p : the parent up to which to be pruned
205 |  * @param s : the dangling state to prune off
206 |  *
207 |  * Prune off a non-separate path up from the final state @a s to the
208 |  * given parent @a p. The pruning stops when either the parent @a p
209 |  * is met, or a first non-separate node is found.
210 |  */
211 | void       da_prune_upto (DArray *d, TrieIndex p, TrieIndex s);
212 | 
213 | /**
214 |  * @brief Enumerate entries stored in double-array structure
215 |  *
216 |  * @param d          : the double-array structure
217 |  * @param enum_func  : the callback function to be called on each separate node
218 |  * @param user_data  : user-supplied data to send as an argument to @a enum_func
219 |  *
220 |  * @return boolean value indicating whether all the keys are visited
221 |  *
222 |  * Enumerate all keys stored in double-array structure. For each entry, the 
223 |  * user-supplied @a enum_func callback function is called, with the entry key,
224 |  * the separate node, and user-supplied data. Returning FALSE from such
225 |  * callback will stop enumeration and return FALSE.
226 |  */
227 | Bool    da_enumerate (const DArray *d, DAEnumFunc enum_func, void *user_data);
228 | 
229 | #endif  /* __DARRAY_H */
230 | 
231 | /*
232 | vi:ts=4:ai:expandtab
233 | */
234 | 


--------------------------------------------------------------------------------
/ext/trie/extconf.rb:
--------------------------------------------------------------------------------
# Generate the Makefile used to build the 'trie' native extension.
require 'mkmf'
create_makefile 'trie'
3 | 
4 | 


--------------------------------------------------------------------------------
/ext/trie/fileutils.c:
--------------------------------------------------------------------------------
  1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
  2 | /*
  3 |  * fileutils.c - File utility functions
  4 |  * Created: 2006-08-15
  5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
  6 |  */
  7 | 
  8 | #include <string.h>
  9 | #include <stdlib.h>
 10 | 
 11 | #include "fileutils.h"
 12 | 
 13 | /*--------------------------------------*
 14 |  *    INTERNAL FUNCTIONS DECLARATIONS   *
 15 |  *--------------------------------------*/
 16 | 
 17 | static char *   make_full_path (const char *dir,
 18 |                                 const char *name,
 19 |                                 const char *ext);
 20 | 
 21 | /* ==================== BEGIN IMPLEMENTATION PART ====================  */
 22 | 
 23 | /*--------------------------------*
 24 |  *    FUNCTIONS IMPLEMENTATIONS   *
 25 |  *--------------------------------*/
 26 | 
 27 | static char *
 28 | make_full_path (const char *dir, const char *name, const char *ext)
 29 | {
 30 |     char   *path;
 31 | 
 32 |     path = (char *) malloc (strlen (dir) + strlen (name) + strlen (ext) + 2);
 33 |     sprintf (path, "%s/%s%s", dir, name, ext);
 34 | 
 35 |     return path;
 36 | }
 37 | 
 38 | FILE *
 39 | file_open (const char *dir, const char *name, const char *ext, TrieIOMode mode)
 40 | {
 41 |     const char *std_mode;
 42 |     char       *full_path;
 43 |     FILE       *file;
 44 | 
 45 |     if (mode & TRIE_IO_WRITE)
 46 |         std_mode = "r+";
 47 |     else
 48 |         std_mode = "r";
 49 | 
 50 |     full_path = make_full_path (dir, name, ext);
 51 |     file = fopen (full_path, std_mode);
 52 |     if (!file && mode & TRIE_IO_CREATE)
 53 |         file = fopen (full_path, "w+");
 54 |     free (full_path);
 55 | 
 56 |     return file;
 57 | }
 58 | 
 59 | long
 60 | file_length (FILE *file)
 61 | {
 62 |     long    cur_pos;
 63 |     long    size;
 64 | 
 65 |     cur_pos = ftell (file);
 66 | 
 67 |     fseek (file, 0L, SEEK_END);
 68 |     size = ftell (file);
 69 | 
 70 |     fseek (file, cur_pos, SEEK_SET);
 71 | 
 72 |     return size;
 73 | }
 74 | 
 75 | Bool
 76 | file_read_int32 (FILE *file, int32 *o_val)
 77 | {
 78 |     unsigned char   buff[4];
 79 | 
 80 |     if (fread (buff, 4, 1, file) == 1) {
 81 |         *o_val = (buff[0] << 24) | (buff[1] << 16) |  (buff[2] << 8) | buff[3];
 82 |         return TRUE;
 83 |     }
 84 | 
 85 |     return FALSE;
 86 | }
 87 | 
 88 | Bool
 89 | file_write_int32 (FILE *file, int32 val)
 90 | {
 91 |     unsigned char   buff[4];
 92 | 
 93 |     buff[0] = (val >> 24) & 0xff;
 94 |     buff[1] = (val >> 16) & 0xff;
 95 |     buff[2] = (val >> 8) & 0xff;
 96 |     buff[3] = val & 0xff;
 97 | 
 98 |     return (fwrite (buff, 4, 1, file) == 1);
 99 | }
100 | 
101 | Bool
102 | file_read_int16 (FILE *file, int16 *o_val)
103 | {
104 |     unsigned char   buff[2];
105 | 
106 |     if (fread (buff, 2, 1, file) == 1) {
107 |         *o_val = (buff[0] << 8) | buff[1];
108 |         return TRUE;
109 |     }
110 | 
111 |     return FALSE;
112 | }
113 | 
114 | Bool
115 | file_write_int16 (FILE *file, int16 val)
116 | {
117 |     unsigned char   buff[2];
118 | 
119 |     buff[0] = val >> 8;
120 |     buff[1] = val & 0xff;
121 | 
122 |     return (fwrite (buff, 2, 1, file) == 1);
123 | }
124 | 
125 | Bool
126 | file_read_int8 (FILE *file, int8 *o_val)
127 | {
128 |     return (fread (o_val, sizeof (int8), 1, file) == 1);
129 | }
130 | 
131 | Bool
132 | file_write_int8 (FILE *file, int8 val)
133 | {
134 |     return (fwrite (&val, sizeof (int8), 1, file) == 1);
135 | }
136 | 
137 | Bool
138 | file_read_chars (FILE *file, char *buff, int len)
139 | {
140 |     return (fread (buff, sizeof (char), len, file) == len);
141 | }
142 | 
143 | Bool
144 | file_write_chars (FILE *file, const char *buff, int len)
145 | {
146 |     return (fwrite (buff, sizeof (char), len, file) == len);
147 | }
148 | 
149 | /*
150 | vi:ts=4:ai:expandtab
151 | */
152 | 


--------------------------------------------------------------------------------
/ext/trie/fileutils.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
 2 | /*
 3 |  * fileutils.h - File utility functions
 4 |  * Created: 2006-08-14
 5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
 6 |  */
 7 | 
 8 | #ifndef __FILEUTILS_H
 9 | #define __FILEUTILS_H
10 | 
11 | #include <stdio.h>
12 | 
13 | #include "triedefs.h"
14 | 
15 | FILE * file_open (const char *dir, const char *name, const char *ext,
16 |                   TrieIOMode mode);
17 | 
18 | long   file_length (FILE *file);
19 | 
20 | Bool   file_read_int32 (FILE *file, int32 *o_val);
21 | Bool   file_write_int32 (FILE *file, int32 val);
22 | 
23 | Bool   file_read_int16 (FILE *file, int16 *o_val);
24 | Bool   file_write_int16 (FILE *file, int16 val);
25 | 
26 | Bool   file_read_int8 (FILE *file, int8 *o_val);
27 | Bool   file_write_int8 (FILE *file, int8 val);
28 | 
29 | Bool   file_read_chars (FILE *file, char *buff, int len);
30 | Bool   file_write_chars (FILE *file, const char *buff, int len);
31 | 
32 | #endif /* __FILEUTILS_H */
33 | 
34 | /*
35 | vi:ts=4:ai:expandtab
36 | */
37 | 


--------------------------------------------------------------------------------
/ext/trie/tail.c:
--------------------------------------------------------------------------------
  1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
  2 | /*
  3 |  * tail.c - trie tail for keeping suffixes
  4 |  * Created: 2006-08-15
  5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
  6 |  */
  7 | 
  8 | #include <string.h>
  9 | #include <stdlib.h>
 10 | #include <stdio.h>
 11 | 
 12 | #include "tail.h"
 13 | #include "fileutils.h"
 14 | 
 15 | /*----------------------------------*
 16 |  *    INTERNAL TYPES DECLARATIONS   *
 17 |  *----------------------------------*/
 18 | 
 19 | /*-----------------------------------*
 20 |  *    PRIVATE METHODS DECLARATIONS   *
 21 |  *-----------------------------------*/
 22 | 
 23 | static TrieIndex    tail_alloc_block (Tail *t);
 24 | static void         tail_free_block (Tail *t, TrieIndex block);
 25 | 
 26 | /* ==================== BEGIN IMPLEMENTATION PART ====================  */
 27 | 
 28 | /*------------------------------------*
 29 |  *   INTERNAL TYPES IMPLEMENTATIONS   *
 30 |  *------------------------------------*/
 31 | 
 32 | /*------------------------------*
 33 |  *   PRIVATE DATA DEFINITIONS   *
 34 |  *------------------------------*/
 35 | 
/* One tail entry: a key suffix together with the data stored for the key. */
typedef struct {
    TrieIndex   next_free;  /* next block in the free list; set to -1 when the block is allocated */
    TrieData    data;       /* data for the key; TRIE_DATA_ERROR when unset */
    TrieChar   *suffix;     /* heap-allocated, NUL-terminated suffix, or NULL */
} TailBlock;
 41 | 
/* The tail pool: a growable array of blocks plus the head of its free list. */
struct _Tail {
    TrieIndex   num_tails;  /* number of entries in tails[] */
    TailBlock  *tails;      /* the blocks; NULL while the pool is empty */
    TrieIndex   first_free; /* index into tails[] of the first free block; 0 means no free block */
};
 47 | 
 48 | /*-----------------------------*
 49 |  *   METHODS IMPLEMENTATIONS   *
 50 |  *-----------------------------*/
 51 | 
 52 | #define TAIL_SIGNATURE      0xDFFCDFFC
 53 | #define TAIL_START_BLOCKNO  1
 54 | 
 55 | /* Tail Header:
 56 |  * INT32: signature
 57 |  * INT32: pointer to first free slot
 58 |  * INT32: number of tail blocks
 59 |  *
 60 |  * Tail Blocks:
 61 |  * INT32: pointer to next free block (-1 for allocated blocks)
 62 |  * INT32: data for the key
 63 |  * INT16: length
 64 |  * BYTES[length]: suffix string (no terminating '\0')
 65 |  */
 66 | 
 67 | Tail *
 68 | tail_new ()
 69 | {
 70 |     Tail       *t;
 71 | 
 72 |     t = (Tail *) malloc (sizeof (Tail));
 73 |     if (!t)
 74 |         return NULL;
 75 | 
 76 |     t->first_free = 0;
 77 |     t->num_tails  = 0;
 78 |     t->tails      = NULL;
 79 | 
 80 |     return t;
 81 | }
 82 | 
 83 | Tail *
 84 | tail_read (FILE *file)
 85 | {
 86 |     long        save_pos;
 87 |     Tail       *t;
 88 |     TrieIndex   i;
 89 |     uint32      sig;
 90 | 
 91 |     /* check signature */
 92 |     save_pos = ftell (file);
 93 |     if (!file_read_int32 (file, (int32 *) &sig) || TAIL_SIGNATURE != sig) {
 94 |         fseek (file, save_pos, SEEK_SET);
 95 |         return NULL;
 96 |     }
 97 | 
 98 |     t = (Tail *) malloc (sizeof (Tail));
 99 |     if (!t)
100 |         return NULL;
101 | 
102 |     file_read_int32 (file, &t->first_free);
103 |     file_read_int32 (file, &t->num_tails);
104 |     t->tails = (TailBlock *) malloc (t->num_tails * sizeof (TailBlock));
105 |     if (!t->tails)
106 |         goto exit_tail_created;
107 |     for (i = 0; i < t->num_tails; i++) {
108 |         int16   length;
109 | 
110 |         file_read_int32 (file, &t->tails[i].next_free);
111 |         file_read_int32 (file, &t->tails[i].data);
112 | 
113 |         file_read_int16 (file, &length);
114 |         t->tails[i].suffix    = (TrieChar *) malloc (length + 1);
115 |         if (length > 0)
116 |             file_read_chars (file, (char *)t->tails[i].suffix, length);
117 |         t->tails[i].suffix[length] = '\0';
118 |     }
119 | 
120 |     return t;
121 | 
122 | exit_tail_created:
123 |     free (t);
124 |     return NULL;
125 | }
126 | 
127 | void
128 | tail_free (Tail *t)
129 | {
130 |     TrieIndex   i;
131 | 
132 |     if (t->tails) {
133 |         for (i = 0; i < t->num_tails; i++)
134 |             if (t->tails[i].suffix)
135 |                 free (t->tails[i].suffix);
136 |         free (t->tails);
137 |     }
138 |     free (t);
139 | }
140 | 
141 | int
142 | tail_write (const Tail *t, FILE *file)
143 | {
144 |     TrieIndex   i;
145 | 
146 |     if (!file_write_int32 (file, TAIL_SIGNATURE) ||
147 |         !file_write_int32 (file, t->first_free)  ||
148 |         !file_write_int32 (file, t->num_tails))
149 |     {
150 |         return -1;
151 |     }
152 |     for (i = 0; i < t->num_tails; i++) {
153 |         int16   length;
154 | 
155 |         if (!file_write_int32 (file, t->tails[i].next_free) ||
156 |             !file_write_int32 (file, t->tails[i].data))
157 |         {
158 |             return -1;
159 |         }
160 | 
161 |         length = t->tails[i].suffix ? strlen ((const char *)t->tails[i].suffix)
162 |                                     : 0;
163 |         if (!file_write_int16 (file, length))
164 |             return -1;
165 |         if (length > 0 &&
166 |             !file_write_chars (file, (char *)t->tails[i].suffix, length))
167 |         {
168 |             return -1;
169 |         }
170 |     }
171 | 
172 |     return 0;
173 | }
174 | 
175 | 
176 | const TrieChar *
177 | tail_get_suffix (const Tail *t, TrieIndex index)
178 | {
179 |     index -= TAIL_START_BLOCKNO;
180 |     return (index < t->num_tails) ? t->tails[index].suffix : NULL;
181 | }
182 | 
183 | Bool
184 | tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix)
185 | {
186 |     index -= TAIL_START_BLOCKNO;
187 |     if (index < t->num_tails) {
188 |         /* suffix and t->tails[index].suffix may overlap;
189 |          * so, dup it before it's overwritten
190 |          */
191 |         TrieChar *tmp = NULL;
192 |         if (suffix)
193 |             tmp = (TrieChar *) strdup ((const char *)suffix);
194 |         if (t->tails[index].suffix)
195 |             free (t->tails[index].suffix);
196 |         t->tails[index].suffix = tmp;
197 | 
198 |         return TRUE;
199 |     }
200 |     return FALSE;
201 | }
202 | 
203 | TrieIndex
204 | tail_add_suffix (Tail *t, const TrieChar *suffix)
205 | {
206 |     TrieIndex   new_block;
207 | 
208 |     new_block = tail_alloc_block (t);
209 |     tail_set_suffix (t, new_block, suffix);
210 | 
211 |     return new_block;
212 | }
213 | 
214 | static TrieIndex
215 | tail_alloc_block (Tail *t)
216 | {
217 |     TrieIndex   block;
218 | 
219 |     if (0 != t->first_free) {
220 |         block = t->first_free;
221 |         t->first_free = t->tails[block].next_free;
222 |     } else {
223 |         block = t->num_tails;
224 |         t->tails = (TailBlock *) realloc (t->tails,
225 |                                           ++t->num_tails * sizeof (TailBlock));
226 |     }
227 |     t->tails[block].next_free = -1;
228 |     t->tails[block].data = TRIE_DATA_ERROR;
229 |     t->tails[block].suffix = NULL;
230 |     
231 |     return block + TAIL_START_BLOCKNO;
232 | }
233 | 
234 | static void
235 | tail_free_block (Tail *t, TrieIndex block)
236 | {
237 |     TrieIndex   i, j;
238 | 
239 |     block -= TAIL_START_BLOCKNO;
240 | 
241 |     if (block >= t->num_tails)
242 |         return;
243 | 
244 |     t->tails[block].data = TRIE_DATA_ERROR;
245 |     if (NULL != t->tails[block].suffix) {
246 |         free (t->tails[block].suffix);
247 |         t->tails[block].suffix = NULL;
248 |     }
249 | 
250 |     /* find insertion point */
251 |     j = 0;
252 |     for (i = t->first_free; i != 0 && i < block; i = t->tails[i].next_free)
253 |         j = i;
254 | 
255 |     /* insert free block between j and i */
256 |     t->tails[block].next_free = i;
257 |     if (0 != j)
258 |         t->tails[j].next_free = block;
259 |     else
260 |         t->first_free = block;
261 | }
262 | 
263 | TrieData
264 | tail_get_data (const Tail *t, TrieIndex index)
265 | {
266 |     index -= TAIL_START_BLOCKNO;
267 |     return (index < t->num_tails) ? t->tails[index].data : TRIE_DATA_ERROR;
268 | }
269 | 
270 | Bool
271 | tail_set_data (Tail *t, TrieIndex index, TrieData data)
272 | {
273 |     index -= TAIL_START_BLOCKNO;
274 |     if (index < t->num_tails) {
275 |         t->tails[index].data = data;
276 |         return TRUE;
277 |     }
278 |     return FALSE;
279 | }
280 | 
/*
 * Delete tail entry `index`.
 * Public entry point that forwards to tail_free_block().
 */
void
tail_delete (Tail *t, TrieIndex index)
{
    tail_free_block (t, index);
}
286 | 
287 | int
288 | tail_walk_str  (const Tail      *t,
289 |                 TrieIndex        s,
290 |                 short           *suffix_idx,
291 |                 const TrieChar  *str,
292 |                 int              len)
293 | {
294 |     const TrieChar *suffix;
295 |     int             i;
296 |     short           j;
297 | 
298 |     suffix = tail_get_suffix (t, s);
299 |     if (!suffix)
300 |         return FALSE;
301 | 
302 |     i = 0; j = *suffix_idx;
303 |     while (i < len) {
304 |         if (str[i] != suffix[j])
305 |             break;
306 |         ++i;
307 |         /* stop and stay at null-terminator */
308 |         if (0 == suffix[j])
309 |             break;
310 |         ++j;
311 |     }
312 |     *suffix_idx = j;
313 |     return i;
314 | }
315 | 
316 | Bool
317 | tail_walk_char (const Tail      *t,
318 |                 TrieIndex        s,
319 |                 short           *suffix_idx,
320 |                 TrieChar         c)
321 | {
322 |     const TrieChar *suffix;
323 |     TrieChar        suffix_char;
324 | 
325 |     suffix = tail_get_suffix (t, s);
326 |     if (!suffix)
327 |         return FALSE;
328 | 
329 |     suffix_char = suffix[*suffix_idx];
330 |     if (suffix_char == c) {
331 |         if (0 != suffix_char)
332 |             ++*suffix_idx;
333 |         return TRUE;
334 |     }
335 |     return FALSE;
336 | }
337 | 
338 | /*
339 | vi:ts=4:ai:expandtab
340 | */
341 | 


--------------------------------------------------------------------------------
/ext/trie/tail.h:
--------------------------------------------------------------------------------
  1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
  2 | /*
  3 |  * tail.h - trie tail for keeping suffixes
  4 |  * Created: 2006-08-12
  5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
  6 |  */
  7 | 
  8 | #ifndef __TAIL_H
  9 | #define __TAIL_H
 10 | 
 11 | #include "triedefs.h"
 12 | 
 13 | /**
 14 |  * @file tail.h
 15 |  * @brief trie tail for keeping suffixes
 16 |  */
 17 | 
 18 | /**
 19 |  * @brief Tail structure type
 20 |  */
 21 | typedef struct _Tail  Tail;
 22 | 
 23 | /**
 24 |  * @brief Create a new tail object
 25 |  *
 26 |  * Create a new empty tail object.
 27 |  */
 28 | Tail *   tail_new ();
 29 | 
 30 | /**
 31 |  * @brief Read tail data from file
 32 |  *
 33 |  * @param file : the file to read
 34 |  *
 35 |  * @return a pointer to the opened tail data, NULL on failure
 36 |  *
 37 |  * Read tail data from the opened file, starting from the current
 38 |  * file pointer until the end of tail data block. On return, the
 39 |  * file pointer is left at the position after the read block.
 40 |  */
 41 | Tail *   tail_read (FILE *file);
 42 | 
 43 | /**
 44 |  * @brief Free tail data
 45 |  *
 46 |  * @param t : the tail data
 47 |  *
 48 |  * (This function returns no value.)
 49 |  *
 50 |  * Free the given tail data.
 51 |  */
 52 | void     tail_free (Tail *t);
 53 | 
 54 | /**
 55 |  * @brief Write tail data
 56 |  *
 57 |  * @param t     : the tail data
 58 |  * @param file  : the file to write to
 59 |  *
 60 |  * @return 0 on success, non-zero on failure
 61 |  *
 62 |  * Write tail data to the given @a file, starting from the current file
 63 |  * pointer. On return, the file pointer is left after the tail data block.
 64 |  */
 65 | int      tail_write (const Tail *t, FILE *file);
 66 | 
 67 | 
 68 | /**
 69 |  * @brief Get suffix
 70 |  *
 71 |  * @param t     : the tail data
 72 |  * @param index : the index of the suffix
 73 |  *
 74 |  * @return the indexed suffix, or NULL if @a index is out of range.
 75 |  *
 76 |  * Get suffix from tail with given @a index. The returned string is owned by
 77 |  * the tail data; the caller must not modify or free it.
 78 |  */
 79 | const TrieChar *    tail_get_suffix (const Tail *t, TrieIndex index);
 80 | 
 81 | /**
 82 |  * @brief Set suffix of existing entry
 83 |  *
 84 |  * @param t      : the tail data
 85 |  * @param index  : the index of the suffix
 86 |  * @param suffix : the new suffix
 87 |  *
 88 |  * Set suffix of existing entry of given @a index in tail.
 89 |  */
 90 | Bool     tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix);
 91 | 
 92 | /**
 93 |  * @brief Add a new suffix
 94 |  *
 95 |  * @param t      : the tail data
 96 |  * @param suffix : the new suffix
 97 |  *
 98 |  * @return the index of the newly added suffix.
 99 |  *
100 |  * Add a new suffix entry to tail.
101 |  */
102 | TrieIndex tail_add_suffix (Tail *t, const TrieChar *suffix);
103 | 
104 | /**
105 |  * @brief Get data associated to suffix entry
106 |  *
107 |  * @param t      : the tail data
108 |  * @param index  : the index of the suffix
109 |  *
110 |  * @return the data associated to the suffix entry
111 |  *
112 |  * Get data associated to suffix entry @a index in tail data.
113 |  */
114 | TrieData tail_get_data (const Tail *t, TrieIndex index);
115 | 
116 | /**
117 |  * @brief Set data associated to suffix entry
118 |  *
119 |  * @param t      : the tail data
120 |  * @param index  : the index of the suffix
121 |  * @param data   : the data to set
122 |  *
123 |  * @return boolean indicating success
124 |  *
125 |  * Set data associated to suffix entry @a index in tail data.
126 |  */
127 | Bool     tail_set_data (Tail *t, TrieIndex index, TrieData data);
128 | 
129 | /**
130 |  * @brief Delete suffix entry
131 |  *
132 |  * @param t      : the tail data
133 |  * @param index  : the index of the suffix to delete
134 |  *
135 |  * Delete suffix entry from the tail data.
136 |  */
137 | void     tail_delete (Tail *t, TrieIndex index);
138 | 
139 | /**
140 |  * @brief Walk in tail with a string
141 |  *
142 |  * @param t          : the tail data
143 |  * @param s          : the tail data index
144 |  * @param suffix_idx : pointer to current character index in suffix
145 |  * @param str        : the string to use in walking
146 |  * @param len        : total characters in @a str to walk
147 |  *
148 |  * @return total number of characters successfully walked
149 |  *
150 |  * Walk in the tail data @a t at entry @a s, from given character position
151 |  * @a *suffix_idx, using @a len characters of given string @a str. On return,
152 |  * @a *suffix_idx is updated to the position after the last successful walk,
153 |  * and the function returns the total number of characters successfully walked.
154 |  */
155 | int      tail_walk_str  (const Tail      *t,
156 |                          TrieIndex        s,
157 |                          short           *suffix_idx,
158 |                          const TrieChar  *str,
159 |                          int              len);
160 | 
161 | /**
162 |  * @brief Walk in tail with a character
163 |  *
164 |  * @param t          : the tail data
165 |  * @param s          : the tail data index
166 |  * @param suffix_idx : pointer to current character index in suffix
167 |  * @param c          : the character to use in walking
168 |  *
169 |  * @return boolean indicating success
170 |  *
171 |  * Walk in the tail data @a t at entry @a s, from given character position
172 |  * @a *suffix_idx, using given character @a c. If the walk is successful,
173 |  * it returns TRUE, and @a *suffix_idx is updated to the next character.
174 |  * Otherwise, it returns FALSE, and @a *suffix_idx is left unchanged.
175 |  */
176 | Bool     tail_walk_char (const Tail      *t,
177 |                          TrieIndex        s,
178 |                          short           *suffix_idx,
179 |                          TrieChar         c);
180 | 
181 | /**
182 |  * @brief Test walkability in tail with a character
183 |  *
184 |  * @param t          : the tail data
185 |  * @param s          : the tail data index
186 |  * @param suffix_idx : current character index in suffix
187 |  * @param c          : the character to test walkability
188 |  *
189 |  * @return boolean indicating walkability
190 |  *
191 |  * Test if the character @a c can be used to walk from given character 
192 |  * position @a suffix_idx of entry @a s of the tail data @a t.
193 |  */
194 | /*
195 | Bool     tail_is_walkable_char (Tail            *t,
196 |                                 TrieIndex        s,
197 |                                 short            suffix_idx,
198 |                                 const TrieChar   c);
199 | */
/* Non-zero when character (c) equals the character stored at offset
 * (suffix_idx) of the suffix of tail entry (s). Every argument is
 * parenthesized so compound expressions expand safely. */
#define  tail_is_walkable_char(t,s,suffix_idx,c) \
    (tail_get_suffix ((t), (s)) [(suffix_idx)] == (c))
202 | 
203 | #endif  /* __TAIL_H */
204 | 
205 | /*
206 | vi:ts=4:ai:expandtab
207 | */
208 | 


--------------------------------------------------------------------------------
/ext/trie/trie-private.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include "darray.h"
  5 | #include "tail.h"
  6 | #include "trie.h"
  7 | 
  8 | Trie* trie_new() {
  9 | 	Trie *trie = (Trie*) malloc(sizeof(Trie));
 10 | 	trie->da = da_new();
 11 | 	trie->tail = tail_new();
 12 | 	return trie;
 13 | }
 14 | 
 15 | void trie_free(Trie *trie) {
 16 | 	da_free(trie->da);
 17 | 	tail_free(trie->tail);
 18 | 	free(trie);
 19 | }
 20 | 
 21 | static Bool trie_branch_in_branch (Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) {
 22 |     TrieIndex new_da, new_tail;
 23 | 
 24 |     new_da = da_insert_branch (trie->da, sep_node, *suffix);
 25 |     if (TRIE_INDEX_ERROR == new_da)
 26 |         return FALSE;
 27 | 
 28 |     if ('\0' != *suffix)
 29 |         ++suffix;
 30 | 
 31 |     new_tail = tail_add_suffix (trie->tail, suffix);
 32 |     tail_set_data (trie->tail, new_tail, data);
 33 |     trie_da_set_tail_index (trie->da, new_da, new_tail);
 34 | 
 35 |     // trie->is_dirty = TRUE;
 36 |     return TRUE;
 37 | }
 38 | 
 39 | static Bool trie_branch_in_tail(Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) {
 40 |     TrieIndex old_tail, old_da, s;
 41 |     const TrieChar *old_suffix, *p;
 42 | 
 43 |     /* adjust separate point in old path */
 44 |     old_tail = trie_da_get_tail_index (trie->da, sep_node);
 45 |     old_suffix = tail_get_suffix (trie->tail, old_tail);
 46 |     if (!old_suffix)
 47 |         return FALSE;
 48 | 
 49 |     for (p = old_suffix, s = sep_node; *p == *suffix; p++, suffix++) {
 50 |         TrieIndex t = da_insert_branch (trie->da, s, *p);
 51 |         if (TRIE_INDEX_ERROR == t)
 52 |             goto fail;
 53 |         s = t;
 54 |     }
 55 | 
 56 |     old_da = da_insert_branch (trie->da, s, *p);
 57 |     if (TRIE_INDEX_ERROR == old_da)
 58 |         goto fail;
 59 | 
 60 |     if ('\0' != *p)
 61 |         ++p;
 62 |     tail_set_suffix (trie->tail, old_tail, p);
 63 |     trie_da_set_tail_index (trie->da, old_da, old_tail);
 64 | 
 65 |     /* insert the new branch at the new separate point */
 66 |     return trie_branch_in_branch (trie, s, suffix, data);
 67 | 
 68 | fail:
 69 |     /* failed, undo previous insertions and return error */
 70 |     da_prune_upto (trie->da, sep_node, s);
 71 |     trie_da_set_tail_index (trie->da, sep_node, old_tail);
 72 |     return FALSE;
 73 | }
 74 | 
 75 | Bool trie_store (Trie *trie, const TrieChar *key, TrieData data) {
 76 |     TrieIndex        s, t;
 77 |     short            suffix_idx;
 78 |     const TrieChar *p, *sep;
 79 | 	size_t len;
 80 | 
 81 |     /* walk through branches */
 82 |     s = da_get_root (trie->da);
 83 |     for (p = key; !trie_da_is_separate (trie->da, s); p++) {
 84 |         if (!da_walk (trie->da, &s, *p))
 85 |             return trie_branch_in_branch (trie, s, p, data);
 86 |         if (0 == *p)
 87 |             break;
 88 |     }
 89 | 
 90 |     /* walk through tail */
 91 |     sep = p;
 92 |     t = trie_da_get_tail_index (trie->da, s);
 93 |     suffix_idx = 0;
 94 |     len = strlen ((const char *) p) + 1;    /* including null-terminator */
 95 |     if (tail_walk_str (trie->tail, t, &suffix_idx, p, len) != len)
 96 |         return trie_branch_in_tail (trie, s, p, data);
 97 | 
 98 |     /* duplicated key, overwrite val */
 99 |     tail_set_data (trie->tail, t, data);
100 |     // trie->is_dirty = TRUE;
101 |     return TRUE;
102 | }
103 | 
104 | 
105 | Bool trie_has_key (const Trie *trie, const TrieChar *key) {
106 |     TrieIndex        s;
107 |     short            suffix_idx;
108 |     const TrieChar *p;
109 | 
110 |     /* walk through branches */
111 |     s = da_get_root (trie->da);
112 |     for (p = key; !trie_da_is_separate (trie->da, s); p++) {
113 |         if (!da_walk (trie->da, &s, *p))
114 |             return FALSE;
115 |         if (0 == *p)
116 |             break;
117 |     }
118 | 
119 |     /* walk through tail */
120 |     s = trie_da_get_tail_index (trie->da, s);
121 |     suffix_idx = 0;
122 |     for ( ; ; p++) {
123 |         if (!tail_walk_char (trie->tail, s, &suffix_idx, *p))
124 |             return FALSE;
125 |         if (0 == *p)
126 |             break;
127 |     }
128 | 
129 |     return TRUE;
130 | }
131 | 
132 | 
133 | Bool trie_retrieve (const Trie *trie, const TrieChar *key, TrieData *o_data) {
134 |     TrieIndex        s;
135 |     short            suffix_idx;
136 |     const TrieChar *p;
137 | 
138 |     /* walk through branches */
139 |     s = da_get_root (trie->da);
140 |     for (p = key; !trie_da_is_separate (trie->da, s); p++) {
141 |         if (!da_walk (trie->da, &s, *p))
142 |             return FALSE;
143 |         if (0 == *p)
144 |             break;
145 |     }
146 | 
147 |     /* walk through tail */
148 |     s = trie_da_get_tail_index (trie->da, s);
149 |     suffix_idx = 0;
150 |     for ( ; ; p++) {
151 |         if (!tail_walk_char (trie->tail, s, &suffix_idx, *p))
152 |             return FALSE;
153 |         if (0 == *p)
154 |             break;
155 |     }
156 | 
157 |     /* found, set the val and return */
158 |     if (o_data)
159 |         *o_data = tail_get_data (trie->tail, s);
160 |     return TRUE;
161 | }
162 | 
163 | Bool trie_delete (Trie *trie, const TrieChar *key) {
164 |     TrieIndex        s, t;
165 |     short            suffix_idx;
166 |     const TrieChar *p;
167 | 
168 |     /* walk through branches */
169 |     s = da_get_root (trie->da);
170 |     for (p = key; !trie_da_is_separate (trie->da, s); p++) {
171 |         if (!da_walk (trie->da, &s, *p))
172 |             return FALSE;
173 |         if (0 == *p)
174 |             break;
175 |     }
176 | 
177 |     /* walk through tail */
178 |     t = trie_da_get_tail_index (trie->da, s);
179 |     suffix_idx = 0;
180 |     for ( ; ; p++) {
181 |         if (!tail_walk_char (trie->tail, t, &suffix_idx, *p))
182 |             return FALSE;
183 |         if (0 == *p)
184 |             break;
185 |     }
186 | 
187 |     tail_delete (trie->tail, t);
188 |     da_set_base (trie->da, s, TRIE_INDEX_ERROR);
189 |     da_prune (trie->da, s);
190 | 
191 |     //trie->is_dirty = TRUE;
192 |     return TRUE;
193 | }
194 | 
195 | /*-------------------------------*
196 |  *   STEPWISE QUERY OPERATIONS   *
197 |  *-------------------------------*/
198 | 
199 | TrieState * trie_root (const Trie *trie) {
200 |     return trie_state_new (trie, da_get_root (trie->da), 0, FALSE);
201 | }
202 | 
203 | /*----------------*
204 |  *   TRIE STATE   *
205 |  *----------------*/
206 | 
207 | static TrieState * trie_state_new (const Trie *trie, TrieIndex index, short suffix_idx, short is_suffix) {
208 |     TrieState *s;
209 | 
210 |     s = (TrieState *) malloc (sizeof (TrieState));
211 |     if (!s)
212 |         return NULL;
213 | 
214 |     s->trie       = trie;
215 |     s->index      = index;
216 |     s->suffix_idx = suffix_idx;
217 |     s->is_suffix  = is_suffix;
218 | 
219 |     return s;
220 | }
221 | 
222 | TrieState * trie_state_clone (const TrieState *s) {
223 |     return trie_state_new (s->trie, s->index, s->suffix_idx, s->is_suffix);
224 | }
225 | 
226 | void trie_state_free (TrieState *s) {
227 |     free (s);
228 | }
229 | 
230 | void trie_state_rewind (TrieState *s) {
231 |     s->index      = da_get_root (s->trie->da);
232 |     s->is_suffix  = FALSE;
233 | }
234 | 
235 | Bool trie_state_walk (TrieState *s, TrieChar c) {
236 |     if (!s->is_suffix) {
237 |         Bool ret;
238 | 
239 |         ret = da_walk (s->trie->da, &s->index, c);
240 | 
241 |         if (ret && trie_da_is_separate (s->trie->da, s->index)) {
242 |             s->index = trie_da_get_tail_index (s->trie->da, s->index);
243 |             s->suffix_idx = 0;
244 |             s->is_suffix = TRUE;
245 |         }
246 | 
247 |         return ret;
248 |     } else {
249 |         return tail_walk_char (s->trie->tail, s->index, &s->suffix_idx, c);
250 |     }
251 | }
252 | 
253 | Bool trie_state_is_walkable (const TrieState *s, TrieChar c) {
254 |     if (!s->is_suffix)
255 |         return da_is_walkable (s->trie->da, s->index, c);
256 |     else 
257 |         return tail_is_walkable_char (s->trie->tail, s->index, s->suffix_idx, c);
258 | }
259 | 
260 | Bool trie_state_is_leaf (const TrieState *s) {
261 |     return s->is_suffix && trie_state_is_terminal (s);
262 | }
263 | 
264 | TrieData trie_state_get_data (const TrieState *s) {
265 |     return s->is_suffix ? tail_get_data (s->trie->tail, s->index) : TRIE_DATA_ERROR;
266 | }
267 | 
268 | int main(void) {
269 | 	Bool res;
270 | 	TrieData *data = (TrieData*)malloc(sizeof(TrieData));
271 | 	Trie *trie = trie_new();
272 | 
273 | 
274 | 	trie_store(trie, (const TrieChar*)"hello", 1);
275 | 	trie_store(trie, (const TrieChar*)"he", 4);
276 | 	trie_store(trie, (const TrieChar*)"hel", 3);
277 | 	trie_store(trie, (const TrieChar*)"h", 5);
278 | 	trie_store(trie, (const TrieChar*)"hell", 2);
279 | 
280 | 
281 | 	res = trie_retrieve(trie, (const TrieChar*)"hello", data);
282 | 	printf(res ? "Win!\n" : "Fail!\n");
283 | 
284 | 	res = trie_retrieve(trie, (const TrieChar*)"hell", data);
285 | 	printf(res ? "Win!\n" : "Fail!\n");
286 | 
287 | 	res = trie_retrieve(trie, (const TrieChar*)"hel", data);
288 | 	printf(res ? "Win!\n" : "Fail!\n");
289 | 
290 | 	res = trie_retrieve(trie, (const TrieChar*)"he", data);
291 | 	printf(res ? "Win!\n" : "Fail!\n");
292 | 
293 | 	res = trie_retrieve(trie, (const TrieChar*)"h", data);
294 | 	printf(res ? "Win!\n" : "Fail!\n");
295 | 
296 | 
297 | 	trie_free(trie);
298 | 	return 0;
299 | }
300 | 


--------------------------------------------------------------------------------
/ext/trie/trie-private.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
 2 | /*
 3 |  * trie-private.h - Private utilities for trie implementation
 4 |  * Created: 2007-08-25
 5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
 6 |  */
 7 | 
 8 | #ifndef __TRIE_PRIVATE_H
 9 | #define __TRIE_PRIVATE_H
10 | 
11 | #include "typedefs.h"
12 | 
13 | /**
14 |  * @file trie-private.h
15 |  * @brief Private utilities for trie implementation
16 |  */
17 | 
/**
 * @brief Minimum value macro
 *
 * Expands to the smaller of @a a and @a b. Each argument appears twice in
 * the expansion, so avoid arguments with side effects.
 */
#define MIN_VAL(a,b)  ((a)<(b)?(a):(b))
/**
 * @brief Maximum value macro
 *
 * Expands to the larger of @a a and @a b. Each argument appears twice in
 * the expansion, so avoid arguments with side effects.
 */
#define MAX_VAL(a,b)  ((a)>(b)?(a):(b))
26 | 
27 | #endif  /* __TRIE_PRIVATE_H */
28 | 
29 | /*
30 | vi:ts=4:ai:expandtab
31 | */
32 | 


--------------------------------------------------------------------------------
/ext/trie/trie.c:
--------------------------------------------------------------------------------
  1 | #include "ruby.h"
  2 | #include "trie.h"
  3 | #include <stdlib.h>
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | 
  7 | VALUE cTrie, cTrieNode;
  8 | 
  9 | /*
 10 |  * Document-class: Trie
 11 |  * 
 12 |  * A key-value data structure for string keys that is efficient in memory usage and fast in retrieval time.
 13 |  *
 14 |  */
 15 | 
 16 | static VALUE rb_trie_alloc(VALUE klass) {
 17 | 	VALUE obj;
 18 | 	obj = Data_Wrap_Struct(klass, 0, trie_free, trie_new());
 19 | 	return obj;
 20 | }
 21 | 
 22 | void raise_ioerror(const char * message) {
 23 |     VALUE rb_eIOError = rb_const_get(rb_cObject, rb_intern("IOError"));
 24 |     rb_raise(rb_eIOError, "%s", message);
 25 | }
 26 | 
 27 | /*
 28 |  * call-seq:
 29 |  *   read(filename_base) -> Trie
 30 |  *
 31 |  * Returns a new trie with data as read from disk.
 32 |  */
 33 | static VALUE rb_trie_read(VALUE self, VALUE filename_base) {
 34 |   VALUE da_filename = rb_str_dup(filename_base);
 35 |   rb_str_concat(da_filename, rb_str_new2(".da"));
 36 |   StringValue(da_filename);
 37 |     
 38 |   VALUE tail_filename = rb_str_dup(filename_base);
 39 |   rb_str_concat(tail_filename, rb_str_new2(".tail"));
 40 |   StringValue(tail_filename);
 41 | 
 42 |   Trie *trie = trie_new();
 43 | 
 44 |   VALUE obj;
 45 |   obj = Data_Wrap_Struct(self, 0, trie_free, trie);
 46 | 
 47 |   DArray *old_da = trie->da;
 48 |   Tail *old_tail = trie->tail;
 49 | 
 50 |   FILE *da_file = fopen(RSTRING_PTR(da_filename), "r");
 51 |   if (da_file == NULL)
 52 |     raise_ioerror("Error reading .da file.");
 53 | 
 54 |   trie->da = da_read(da_file);
 55 |   fclose(da_file);
 56 | 
 57 |   FILE *tail_file = fopen(RSTRING_PTR(tail_filename), "r");
 58 |   if (tail_file == NULL)
 59 |     raise_ioerror("Error reading .tail file.");
 60 | 
 61 |   trie->tail = tail_read(tail_file);
 62 |   fclose(tail_file);
 63 | 
 64 |   da_free(old_da);
 65 |   tail_free(old_tail);
 66 | 
 67 |   return obj;
 68 | }
 69 | 
 70 | /*
 71 |  * call-seq:
 72 |  *   has_key?(key) -> true/false
 73 |  *
 74 |  * Determines whether or not a key exists in the Trie.  Use this if you don't care about the value, as it
 75 |  * is marginally faster than Trie#get.
 76 |  *
 77 |  */
 78 | static VALUE rb_trie_has_key(VALUE self, VALUE key) {
 79 | 	StringValue(key);
 80 | 
 81 |     Trie *trie;
 82 |     Data_Get_Struct(self, Trie, trie);
 83 | 
 84 |     if(trie_has_key(trie, (TrieChar*)RSTRING_PTR(key)))
 85 | 		return Qtrue;
 86 |     else
 87 | 		return Qnil;
 88 | }
 89 | 
 90 | /*
 91 |  * call-seq:
 92 |  *   get(key) -> value
 93 |  *   [key]    -> value
 94 |  *
 95 |  * Retrieves the value for a particular key (or nil) from the Trie.
 96 |  *
 97 |  */
 98 | static VALUE rb_trie_get(VALUE self, VALUE key) {
 99 | 	StringValue(key);
100 | 
101 |     Trie *trie;
102 |     Data_Get_Struct(self, Trie, trie);
103 | 
104 | 	TrieData data;
105 |     if(trie_retrieve(trie, (TrieChar*)RSTRING_PTR(key), &data))
106 | 		return (VALUE)data;
107 |     else
108 | 		return Qnil;
109 | }
110 | 
111 | /*
112 |  * call-seq:
113 |  *   add(key)
114 |  *   add(key,value)
115 |  *
116 |  * Add a key, or a key and value to the Trie.  If you add a key without a value it assumes true for the value. 
117 |  *
118 |  */
119 | static VALUE rb_trie_add(VALUE self, VALUE args) {
120 | 	Trie *trie;
121 |     Data_Get_Struct(self, Trie, trie);
122 | 
123 |     int size = RARRAY_LEN(args);
124 |     if(size < 1 || size > 2)
125 | 		return Qnil;
126 | 
127 |     VALUE key;
128 |     key = RARRAY_PTR(args)[0];
129 | 	StringValue(key);
130 | 
131 |     TrieData value = size == 2 ? RARRAY_PTR(args)[1] : TRIE_DATA_ERROR;
132 |     
133 |     if(trie_store(trie, (TrieChar*)RSTRING_PTR(key), value))
134 | 		return Qtrue;
135 |     else
136 | 		return Qnil;
137 | }
138 | 
139 | /*
140 |  * call-seq:
141 |  *   delete(key)
142 |  *
143 |  * Delete a key from the Trie.  Returns true if it deleted a key, nil otherwise.
144 |  *
145 |  */
146 | static VALUE rb_trie_delete(VALUE self, VALUE key) {
147 | 	StringValue(key);
148 | 
149 | 	Trie *trie;
150 |     Data_Get_Struct(self, Trie, trie);
151 | 
152 |     if(trie_delete(trie, (TrieChar*)RSTRING_PTR(key)))
153 | 		return Qtrue;
154 |     else
155 | 		return Qnil;
156 | }
157 | 
158 | static VALUE walk_all_paths(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
159 | 	int c;
160 |     for(c = 1; c < 256; c++) {
161 | 		if(trie_state_is_walkable(state,c)) {
162 | 			TrieState *next_state = trie_state_clone(state);
163 | 			trie_state_walk(next_state, c);
164 | 
165 | 			prefix[prefix_size] = c;
166 | 			prefix[prefix_size + 1] = 0;
167 | 
168 | 			if(trie_state_is_terminal(next_state)) {
169 | 				char *word = (char*) malloc(prefix_size + 2);
170 | 				memcpy(word, prefix, prefix_size + 2);
171 | 				rb_ary_push(children, rb_str_new2(word));
172 | 			}
173 | 
174 | 			walk_all_paths(trie, children, next_state, prefix, prefix_size + 1);
175 | 			
176 | 			prefix[prefix_size] = 0;
177 | 			trie_state_free(next_state);
178 | 		}
179 |     }
180 | }
181 | 
182 | 
183 | static Bool traverse(TrieState *state, TrieChar *char_prefix) {
184 | 	const TrieChar *iterator = char_prefix;
185 | 	while(*iterator != 0) {
186 | 		if(!trie_state_is_walkable(state, *iterator))
187 | 			return FALSE;
188 | 		trie_state_walk(state, *iterator);
189 | 		iterator++;
190 | 	}
191 | 	return TRUE;
192 | }
193 | 
194 | 
195 | /*
196 |  * call-seq:
197 |  *   children(prefix) -> [ key, ... ]
198 |  *
199 |  * Finds all keys in the Trie beginning with the given prefix. 
200 |  *
201 |  */
202 | static VALUE rb_trie_children(VALUE self, VALUE prefix) {
203 |     if(NIL_P(prefix))
204 | 		return rb_ary_new();
205 | 
206 | 	StringValue(prefix);
207 | 
208 |     Trie *trie;
209 |     Data_Get_Struct(self, Trie, trie);
210 | 
211 | 	int prefix_size = RSTRING_LEN(prefix);
212 |     TrieState *state = trie_root(trie);
213 |     VALUE children = rb_ary_new();
214 | 	TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix);
215 |     
216 |     if(!traverse(state, char_prefix)) {
217 |     	return children;
218 |     }
219 | 
220 |     if(trie_state_is_terminal(state))
221 | 		rb_ary_push(children, prefix);
222 | 	
223 | 	char prefix_buffer[1024];
224 | 	memcpy(prefix_buffer, char_prefix, prefix_size);
225 | 	prefix_buffer[prefix_size] = 0;
226 | 
227 |     walk_all_paths(trie, children, state, prefix_buffer, prefix_size);
228 | 
229 |     trie_state_free(state);
230 |     return children;
231 | }
232 | 
233 | static Bool walk_all_paths_until_first_terminal(Trie *trie, TrieState *state, char *prefix, int prefix_size) {
234 | 	int c;
235 | 	Bool ret = FALSE;
236 |     for(c = 1; c < 256; c++) {
237 | 		if(trie_state_is_walkable(state,c)) {
238 | 			TrieState *next_state = trie_state_clone(state);
239 | 			trie_state_walk(next_state, c);
240 | 
241 | 			prefix[prefix_size] = c;
242 | 			prefix[prefix_size + 1] = 0;
243 | 
244 | 			if(trie_state_is_terminal(next_state)) {
245 | 				return TRUE;
246 | 			}
247 | 
248 | 			ret = walk_all_paths_until_first_terminal(trie, next_state, prefix, prefix_size + 1);
249 | 
250 | 			prefix[prefix_size] = 0;
251 | 			trie_state_free(next_state);
252 | 
253 | 			if (ret == TRUE) {
254 | 				return ret;
255 | 			}
256 | 		}
257 |     }
258 | 
259 |     return ret;
260 | }
261 | 
262 | static VALUE rb_trie_has_children(VALUE self, VALUE prefix) {
263 |     if(NIL_P(prefix))
264 | 		return rb_ary_new();
265 | 
266 | 	StringValue(prefix);
267 | 
268 |     Trie *trie;
269 |     Data_Get_Struct(self, Trie, trie);
270 | 
271 | 	int prefix_size = RSTRING_LEN(prefix);
272 |     TrieState *state = trie_root(trie);
273 | 	TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix);
274 | 
275 |     if(!traverse(state, char_prefix)) {
276 | 		return Qfalse;
277 | 	}
278 | 
279 |     if(trie_state_is_terminal(state))
280 |         return Qtrue;
281 | 
282 | 	char prefix_buffer[1024];
283 | 	memcpy(prefix_buffer, char_prefix, prefix_size);
284 | 	prefix_buffer[prefix_size] = 0;
285 | 
286 |     Bool ret = walk_all_paths_until_first_terminal(trie, state, prefix_buffer, prefix_size);
287 | 
288 |     trie_state_free(state);
289 |     return ret == TRUE ? Qtrue : Qfalse;
290 | }
291 | 
292 | static VALUE walk_all_paths_with_values(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
293 | 	int c;
294 |     for(c = 1; c < 256; c++) {
295 | 		if(trie_state_is_walkable(state,c)) {
296 | 			TrieState *next_state = trie_state_clone(state);
297 | 			trie_state_walk(next_state, c);
298 | 
299 | 			prefix[prefix_size] = c;
300 | 			prefix[prefix_size + 1] = 0;
301 | 
302 | 			if(trie_state_is_terminal(next_state)) {
303 | 				TrieState *end_state = trie_state_clone(next_state);
304 | 				trie_state_walk(end_state, '\0');
305 |  
306 | 				char *word = (char*) malloc(prefix_size + 2);
307 | 				memcpy(word, prefix, prefix_size + 2);
308 | 
309 | 				VALUE tuple = rb_ary_new();
310 | 				rb_ary_push(tuple, rb_str_new2(word));
311 | 
312 | 				TrieData trie_data = trie_state_get_data(end_state);
313 | 				rb_ary_push(tuple, (VALUE)trie_data);
314 | 				rb_ary_push(children, tuple);
315 |  
316 | 				trie_state_free(end_state);
317 | 			}
318 | 
319 | 			walk_all_paths_with_values(trie, children, next_state, prefix, prefix_size + 1);
320 | 			
321 | 			prefix[prefix_size] = 0;
322 | 			trie_state_free(next_state);
323 | 		}
324 |     }
325 | }
326 | 
327 | /*
328 |  * call-seq:
329 |  *   children_with_values(key) -> [ [key,value], ... ]
330 |  *
331 |  * Finds all keys with their respective values in the Trie beginning with the given prefix. 
332 |  * 
333 |  */
334 | static VALUE rb_trie_children_with_values(VALUE self, VALUE prefix) {
335 |     if(NIL_P(prefix))
336 | 		return rb_ary_new();
337 | 
338 | 	StringValue(prefix);
339 | 
340 |     Trie *trie;
341 |     Data_Get_Struct(self, Trie, trie);
342 | 
343 | 	int prefix_size = RSTRING_LEN(prefix);
344 |     TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix);
345 |     
346 |     VALUE children = rb_ary_new();
347 | 
348 |     TrieState *state = trie_root(trie);
349 |     
350 |     if(!traverse(state, char_prefix)) {
351 | 		return children;
352 | 	}
353 | 
354 |     if(trie_state_is_terminal(state)) {
355 | 		TrieState *end_state = trie_state_clone(state);
356 | 		trie_state_walk(end_state, '\0');
357 | 
358 | 		VALUE tuple = rb_ary_new();
359 | 		rb_ary_push(tuple, prefix);
360 | 		TrieData trie_data = trie_state_get_data(end_state);
361 | 		rb_ary_push(tuple, (VALUE)trie_data);
362 | 		rb_ary_push(children, tuple);
363 | 
364 | 		trie_state_free(end_state);
365 |     }
366 | 
367 | 	char prefix_buffer[1024];
368 | 	memcpy(prefix_buffer, char_prefix, prefix_size);
369 | 	prefix_buffer[prefix_size] = 0;
370 | 
371 |     walk_all_paths_with_values(trie, children, state, prefix_buffer, prefix_size);
372 | 
373 |     trie_state_free(state);
374 |     return children;
375 | }
376 | 
377 | static VALUE rb_trie_node_alloc(VALUE klass);
378 | 
379 | /*
380 |  * call-seq:
381 |  *   root -> TrieNode
382 |  *
383 |  * Returns a TrieNode representing the root of the Trie.
384 |  *
385 |  */
386 | static VALUE rb_trie_root(VALUE self) {
387 |     Trie *trie;
388 |     Data_Get_Struct(self, Trie, trie);
389 | 
390 |     VALUE trie_node = rb_trie_node_alloc(cTrieNode);
391 | 
392 | 	TrieState *state = trie_root(trie);
393 | 	RDATA(trie_node)->data = state;
394 |     
395 |     rb_iv_set(trie_node, "@state", Qnil);
396 |     rb_iv_set(trie_node, "@full_state", rb_str_new2(""));
397 |     return trie_node;
398 | }
399 | 
400 | 
401 | /*
402 |  * Document-class: TrieNode
403 |  * 
404 |  * Represents a single node in the Trie. It can be used as a cursor to walk around the Trie.
405 |  * You can grab a TrieNode for the root of the Trie by using Trie#root.
406 |  *
407 |  */
408 | 
409 | static VALUE rb_trie_node_alloc(VALUE klass) {
410 |     VALUE obj;
411 |     obj = Data_Wrap_Struct(klass, 0, trie_state_free, NULL);
412 |     return obj;
413 | }
414 | 
415 | /* nodoc */
416 | static VALUE rb_trie_node_initialize_copy(VALUE self, VALUE from) {
417 | 	RDATA(self)->data = trie_state_clone(RDATA(from)->data);
418 |     
419 |     VALUE state = rb_iv_get(from, "@state");
420 |     rb_iv_set(self, "@state", state == Qnil ? Qnil : rb_str_dup(state));
421 | 
422 |     VALUE full_state = rb_iv_get(from, "@full_state");
423 |     rb_iv_set(self, "@full_state", full_state == Qnil ? Qnil : rb_str_dup(full_state));
424 | 
425 |     return self;
426 | }
427 | 
428 | /*
429 |  * call-seq:
430 |  *   state -> single character
431 |  *
432 |  * Returns the letter that the TrieNode instance points to. So, if the node is pointing at the "e" in "monkeys", the state is "e".
433 |  *
434 |  */
435 | static VALUE rb_trie_node_get_state(VALUE self) {
436 |     return rb_iv_get(self, "@state");
437 | }
438 | 
439 | /*
440 |  * call-seq:
441 |  *   full_state -> string
442 |  *
443 |  * Returns the full string from the root of the Trie up to this node.  So if the node pointing at the "e" in "monkeys",
444 |  * the full_state is "monke".
445 |  *
446 |  */
447 | static VALUE rb_trie_node_get_full_state(VALUE self) {
448 |     return rb_iv_get(self, "@full_state");
449 | }
450 | 
451 | /*
452 |  * call-seq:
453 |  *   walk!(letter) -> TrieNode
454 |  *
455 |  * Tries to walk down a particular branch of the Trie.  It modifies the node it is called on.
456 |  *
457 |  */
458 | static VALUE rb_trie_node_walk_bang(VALUE self, VALUE rchar) {
459 | 	StringValue(rchar);
460 | 
461 |     TrieState *state;
462 |     Data_Get_Struct(self, TrieState, state);
463 | 
464 |     if(RSTRING_LEN(rchar) != 1)
465 | 		return Qnil;
466 | 
467 |     Bool result = trie_state_walk(state, *RSTRING_PTR(rchar));
468 |     
469 |     if(result) {
470 | 		rb_iv_set(self, "@state", rchar);
471 | 		VALUE full_state = rb_iv_get(self, "@full_state");
472 | 		rb_str_append(full_state, rchar);
473 | 		rb_iv_set(self, "@full_state", full_state);
474 | 		return self;
475 |     } else
476 | 		return Qnil;
477 | }
478 | 
479 | /*
480 |  * call-seq:
481 |  *   walk(letter) -> TrieNode
482 |  *
483 |  * Tries to walk down a particular branch of the Trie.  It clones the node it is called on and 
484 |  * walks with that one, leaving the original unchanged.
485 |  *
486 |  */
487 | static VALUE rb_trie_node_walk(VALUE self, VALUE rchar) {
488 | 	StringValue(rchar);
489 | 
490 | 	VALUE new_node = rb_funcall(self, rb_intern("dup"), 0);
491 | 
492 |     TrieState *state;
493 |     Data_Get_Struct(new_node, TrieState, state);
494 | 
495 |     if(RSTRING_LEN(rchar) != 1)
496 | 		return Qnil;
497 | 
498 |     Bool result = trie_state_walk(state, *RSTRING_PTR(rchar));
499 |     
500 |     if(result) {
501 | 		rb_iv_set(new_node, "@state", rchar);
502 | 		VALUE full_state = rb_iv_get(new_node, "@full_state");
503 | 		rb_str_append(full_state, rchar);
504 | 		rb_iv_set(new_node, "@full_state", full_state);
505 | 		return new_node;
506 |     } else
507 | 		return Qnil;
508 | }
509 | 
510 | /*
511 |  * call-seq:
512 |  *   value
513 |  *
514 |  * Attempts to get the value at this node of the Trie.  This only works if the node is a terminal 
515 |  * (i.e. end of a key), otherwise it returns nil.
516 |  *
517 |  */
518 | static VALUE rb_trie_node_value(VALUE self) {
519 |     TrieState *state;
520 | 	TrieState *dup;
521 |     Data_Get_Struct(self, TrieState, state);
522 |     
523 |     dup = trie_state_clone(state);
524 | 
525 |     trie_state_walk(dup, 0);
526 |     TrieData trie_data = trie_state_get_data(dup);
527 |     trie_state_free(dup);
528 | 
529 |     return TRIE_DATA_ERROR == trie_data ? Qnil : (VALUE)trie_data;
530 | }
531 | 
532 | /*
533 |  * call-seq:
534 |  *   terminal? -> true/false
535 |  *
536 |  * Returns true if this node is at the end of a key.  So if you have two keys in your Trie, "he" and
537 |  * "hello", and you walk all the way to the end of "hello", the "e" and the "o" will return true for terminal?.
538 |  *
539 |  */
540 | static VALUE rb_trie_node_terminal(VALUE self) {
541 |     TrieState *state;
542 |     Data_Get_Struct(self, TrieState, state);
543 |     
544 |     return trie_state_is_terminal(state) ? Qtrue : Qnil;
545 | }
546 | 
547 | /*
548 |  * call-seq:
549 |  *   leaf? -> true/false
550 |  *
551 |  * Returns true if there are no branches at this node.
552 |  */
553 | static VALUE rb_trie_node_leaf(VALUE self) {
554 |     TrieState *state;
555 |     Data_Get_Struct(self, TrieState, state);
556 |     
557 |     return trie_state_is_leaf(state) ? Qtrue : Qnil;
558 | }
559 | 
560 | /*
561 |  * call-seq:
562 |  *   save(filename_base) -> true
563 |  *
564 |  * Saves the trie data to two files, filename_base.da and filename_base.tail.
565 |  * Returns true if saving was successful.
566 |  */
567 | static VALUE rb_trie_save(VALUE self, VALUE filename_base) {
568 |   VALUE da_filename = rb_str_dup(filename_base);
569 |   rb_str_concat(da_filename, rb_str_new2(".da"));
570 |   StringValue(da_filename);
571 |     
572 |   VALUE tail_filename = rb_str_dup(filename_base);
573 |   rb_str_concat(tail_filename, rb_str_new2(".tail"));
574 |   StringValue(tail_filename);
575 | 
576 |   Trie *trie;
577 |   Data_Get_Struct(self, Trie, trie);
578 | 
579 |   FILE *da_file = fopen(RSTRING_PTR(da_filename), "w");
580 |   if (da_file == NULL)
581 |     raise_ioerror("Error opening .da file for writing.");
582 |   if (da_write(trie->da, da_file) != 0)
583 |     raise_ioerror("Error writing DArray data.");
584 |   fclose(da_file);
585 | 
586 |   FILE *tail_file = fopen(RSTRING_PTR(tail_filename), "w");
587 |   if (tail_file == NULL)
588 |     raise_ioerror("Error opening .tail file for writing.");
589 |   if (tail_write(trie->tail, tail_file) != 0)
590 |     raise_ioerror("Error writing Tail data.");
591 |   fclose(tail_file);
592 | 
593 |   return Qtrue;
594 | }
595 | 
596 |  
/*
 * Extension initializer (named per Ruby's Init_<extension> convention):
 * defines the Trie and TrieNode classes and binds their methods to the C
 * functions in this file.
 */
void Init_trie() {
    /* Trie: the key/value container. */
    cTrie = rb_define_class("Trie", rb_cObject);
    rb_define_alloc_func(cTrie, rb_trie_alloc);
    rb_define_module_function(cTrie, "read", rb_trie_read, 1);
    rb_define_method(cTrie, "has_key?", rb_trie_has_key, 1);
    rb_define_method(cTrie, "get", rb_trie_get, 1);
    /* Arity -2: rb_trie_add receives all arguments as one Ruby Array. */
    rb_define_method(cTrie, "add", rb_trie_add, -2);
    rb_define_method(cTrie, "delete", rb_trie_delete, 1);
    rb_define_method(cTrie, "children", rb_trie_children, 1);
    rb_define_method(cTrie, "children_with_values", rb_trie_children_with_values, 1);
    rb_define_method(cTrie, "has_children?", rb_trie_has_children, 1);
    rb_define_method(cTrie, "root", rb_trie_root, 0);
    rb_define_method(cTrie, "save", rb_trie_save, 1);

    /* TrieNode: a cursor over a Trie, obtained from Trie#root. */
    cTrieNode = rb_define_class("TrieNode", rb_cObject);
    rb_define_alloc_func(cTrieNode, rb_trie_node_alloc);
    rb_define_method(cTrieNode, "initialize_copy", rb_trie_node_initialize_copy, 1);
    rb_define_method(cTrieNode, "state", rb_trie_node_get_state, 0);
    rb_define_method(cTrieNode, "full_state", rb_trie_node_get_full_state, 0);
    rb_define_method(cTrieNode, "walk!", rb_trie_node_walk_bang, 1);
    rb_define_method(cTrieNode, "walk", rb_trie_node_walk, 1);
    rb_define_method(cTrieNode, "value", rb_trie_node_value, 0);
    rb_define_method(cTrieNode, "terminal?", rb_trie_node_terminal, 0);
    rb_define_method(cTrieNode, "leaf?", rb_trie_node_leaf, 0);
}
622 | 


--------------------------------------------------------------------------------
/ext/trie/trie.h:
--------------------------------------------------------------------------------
 1 | #include "darray.h"
 2 | #include "tail.h"
 3 | 
 4 | typedef struct _Trie {
 5 |     DArray     *da;
 6 |     Tail       *tail;
 7 | } Trie;
 8 | 
 9 | typedef struct _TrieState { /* a position inside a trie */
10 |     const Trie *trie;       /**< the trie this state refers to (read-only through the state) */
11 |     TrieIndex   index;      /**< index in double-array/tail structures */
12 |     short       suffix_idx; /**< suffix character offset; meaningful only while is_suffix is set */
13 |     short       is_suffix;  /**< nonzero while the state is in the suffix part */
14 | } TrieState;
15 | 
16 | /* A negative double-array base at state s marks a separate node; the negated base is its tail index. */
17 | #define trie_da_is_separate(da,s)      (da_get_base ((da), (s)) < 0)
18 | #define trie_da_get_tail_index(da,s)   (-da_get_base ((da), (s)))
19 | #define trie_da_set_tail_index(da,s,v) (da_set_base ((da), (s), -(v)))
20 | #define trie_state_is_terminal(s) trie_state_is_walkable((s),TRIE_CHAR_TERM) /* terminal = TRIE_CHAR_TERM is walkable from s */
21 | 
22 | /* Trie and TrieState API. */
23 | Trie* trie_new(void); /* (void): empty parentheses would leave the arguments unspecified in C */
24 | void trie_free(Trie *trie);
25 | static Bool trie_branch_in_branch (Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data); /* NOTE(review): static prototype in a header; presumably defined only in trie.c - confirm */
26 | static Bool trie_branch_in_tail(Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data); /* NOTE(review): same as above */
27 | Bool trie_store (Trie *trie, const TrieChar *key, TrieData data);
28 | Bool trie_retrieve (const Trie *trie, const TrieChar *key, TrieData *o_data);
29 | Bool trie_delete (Trie *trie, const TrieChar *key);
30 | TrieState * trie_root (const Trie *trie);
31 | static TrieState * trie_state_new (const Trie *trie, TrieIndex index, short suffix_idx, short is_suffix); /* NOTE(review): static prototype in a header - confirm it is only needed by trie.c */
32 | TrieState * trie_state_clone (const TrieState *s);
33 | void trie_state_free (TrieState *s);
34 | void trie_state_rewind (TrieState *s);
35 | Bool trie_state_walk (TrieState *s, TrieChar c);
36 | Bool trie_state_is_walkable (const TrieState *s, TrieChar c);
37 | Bool trie_state_is_leaf (const TrieState *s);
38 | TrieData trie_state_get_data (const TrieState *s);
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/ext/trie/triedefs.h:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
 2 | /*
 3 |  * triedefs.h - General typedefs for trie
 4 |  * Created: 2006-08-11
 5 |  * Author:  Theppitak Karoonboonyanan <thep@linux.thai.net>
 6 |  */
 7 | 
 8 | #ifndef __TRIEDEFS_H
 9 | #define __TRIEDEFS_H
10 | 
11 | #include "typedefs.h"
12 | 
13 | /**
14 |  * @file triedefs.h
15 |  * @brief General typedefs for trie
16 |  */
17 | 
18 | /**
19 |  * @brief Trie IO modes (each value is a distinct single bit; presumably combinable with | - TODO confirm)
20 |  */
21 | typedef enum {
22 |     TRIE_IO_READ   = 0x01,
23 |     TRIE_IO_WRITE  = 0x02,
24 |     TRIE_IO_CREATE = 0x04
25 | } TrieIOMode;
26 | 
27 | /**
28 |  * @brief Trie character type for alphabet (unsigned 32-bit, see uint32 in typedefs.h)
29 |  */
30 | typedef uint32         AlphaChar;
31 | 
32 | /**
33 |  * @brief Error value for alphabet character (an AlphaChar with every bit set)
34 |  */
35 | #define ALPHA_CHAR_ERROR   (~(AlphaChar)0)
36 | 
37 | /**
38 |  * @brief Trie character type for key
39 |  */
40 | typedef unsigned char  TrieChar;
41 | /**
42 |  * @brief Trie terminator character (NUL)
43 |  */
44 | #define TRIE_CHAR_TERM    '\0'
45 | #define TRIE_CHAR_MAX     255 /**< highest TrieChar value when unsigned char is 8 bits wide */
46 | 
47 | /**
48 |  * @brief Type of Trie index (signed 32-bit, see int32 in typedefs.h)
49 |  */
50 | typedef int32          TrieIndex;
51 | /**
52 |  * @brief Trie error index
53 |  */
54 | #define TRIE_INDEX_ERROR  0
55 | /**
56 |  * @brief Maximum trie index value (largest positive value of a 32-bit signed integer)
57 |  */
58 | #define TRIE_INDEX_MAX    0x7fffffff
59 | 
60 | /**
61 |  * @brief Type of value associated to trie entries
62 |  */
63 | typedef unsigned long TrieData;
64 | /**
65 |  * @brief Trie error data; TrieData is unsigned, so this converts to ULONG_MAX when stored or compared as TrieData
66 |  */
67 | #define TRIE_DATA_ERROR  (-1)
68 | 
69 | #endif  /* __TRIEDEFS_H */
70 | 
71 | /*
72 | vi:ts=4:ai:expandtab
73 | */
74 | 


--------------------------------------------------------------------------------
/ext/trie/typedefs.h:
--------------------------------------------------------------------------------
  1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
  2 | /*
  3 |  * typedefs.h - general types
  4 |  * Created : 11 Aug 2006
  5 |  * Author  : Theppitak Karoonboonyanan <thep@linux.thai.net>
  6 |  */
  7 | 
  8 | #ifndef __TYPEDEFS_H
  9 | #define __TYPEDEFS_H
 10 | 
 11 | #include <limits.h>
 12 | 
 13 | /* Bool/TRUE/FALSE are mapped onto C99 <stdbool.h>: a fix for fast_trie on Windows that should stay easy to merge with future libdatrie changes. MH */
 14 | #include <stdbool.h>
 15 | #define Bool  bool
 16 | #define FALSE false
 17 | #define TRUE  true
 18 | /* Fixed-width integer typedefs chosen from the <limits.h> maxima; the *_TYPEDEF guards keep the first match. */
 19 | # if UCHAR_MAX == 0xff
 20 | #   ifndef UINT8_TYPEDEF
 21 | #     define UINT8_TYPEDEF
 22 |       typedef unsigned char  uint8;
 23 | #   endif /* UINT8_TYPEDEF */
 24 | # endif /* UCHAR_MAX */
 25 | /* int8: signed char, when its maximum is 0x7f */
 26 | # if SCHAR_MAX == 0x7f
 27 | #   ifndef INT8_TYPEDEF
 28 | #     define INT8_TYPEDEF
 29 |       typedef signed char    int8;
 30 | #   endif /* INT8_TYPEDEF */
 31 | # endif /* SCHAR_MAX */
 32 | /* uint16: unsigned int, only where int is 16 bits wide */
 33 | # if UINT_MAX == 0xffff
 34 | #   ifndef UINT16_TYPEDEF
 35 | #     define UINT16_TYPEDEF
 36 |       typedef unsigned int   uint16;
 37 | #   endif /* UINT16_TYPEDEF */
 38 | # endif /* UINT_MAX */
 39 | /* int16: int, only where int is 16 bits wide */
 40 | # if INT_MAX == 0x7fff
 41 | #   ifndef INT16_TYPEDEF
 42 | #     define INT16_TYPEDEF
 43 |       typedef int            int16;
 44 | #   endif /* INT16_TYPEDEF */
 45 | # endif /* INT_MAX */
 46 | /* uint16: unsigned short, unless unsigned int already matched */
 47 | # if USHRT_MAX == 0xffff
 48 | #   ifndef UINT16_TYPEDEF
 49 | #     define UINT16_TYPEDEF
 50 |       typedef unsigned short uint16;
 51 | #   endif /* UINT16_TYPEDEF */
 52 | # endif /* USHRT_MAX */
 53 | /* int16: short, unless int already matched */
 54 | # if SHRT_MAX == 0x7fff
 55 | #   ifndef INT16_TYPEDEF
 56 | #     define INT16_TYPEDEF
 57 |       typedef short          int16;
 58 | #   endif /* INT16_TYPEDEF */
 59 | # endif /* SHRT_MAX */
 60 | /* uint32: unsigned int, where int is 32 bits wide */
 61 | # if UINT_MAX == 0xffffffff
 62 | #   ifndef UINT32_TYPEDEF
 63 | #     define UINT32_TYPEDEF
 64 |       typedef unsigned int   uint32;
 65 | #   endif /* UINT32_TYPEDEF */
 66 | # endif /* UINT_MAX */
 67 | /* int32: int, where int is 32 bits wide */
 68 | # if INT_MAX == 0x7fffffff
 69 | #   ifndef INT32_TYPEDEF
 70 | #     define INT32_TYPEDEF
 71 |       typedef int            int32;
 72 | #   endif /* INT32_TYPEDEF */
 73 | # endif /* INT_MAX */
 74 | /* uint32: unsigned long, unless unsigned int already matched */
 75 | # if ULONG_MAX == 0xffffffff
 76 | #   ifndef UINT32_TYPEDEF
 77 | #     define UINT32_TYPEDEF
 78 |       typedef unsigned long  uint32;
 79 | #   endif /* UINT32_TYPEDEF */
 80 | # endif /* ULONG_MAX */
 81 | /* int32: long, unless int already matched */
 82 | # if LONG_MAX == 0x7fffffff
 83 | #   ifndef INT32_TYPEDEF
 84 | #     define INT32_TYPEDEF
 85 |       typedef long           int32;
 86 | #   endif /* INT32_TYPEDEF */
 87 | # endif /* LONG_MAX */
 88 | /* Stop the build if any fixed-width type was not matched by the checks above. */
 89 | # ifndef UINT8_TYPEDEF
 90 | #   error "uint8 type is undefined!"
 91 | # endif
 92 | # ifndef INT8_TYPEDEF
 93 | #   error "int8 type is undefined!"
 94 | # endif
 95 | # ifndef UINT16_TYPEDEF
 96 | #   error "uint16 type is undefined!"
 97 | # endif
 98 | # ifndef INT16_TYPEDEF
 99 | #   error "int16 type is undefined!"
100 | # endif
101 | # ifndef UINT32_TYPEDEF
102 | #   error "uint32 type is undefined!"
103 | # endif
104 | # ifndef INT32_TYPEDEF
105 | #   error "int32 type is undefined!"
106 | # endif
107 | /* Size-named aliases for the unsigned 8-, 16- and 32-bit types. */
108 | typedef uint8  byte;
109 | typedef uint16 word;
110 | typedef uint32 dword;
111 | 
112 | 
113 | #endif /* __TYPEDEFS_H */
114 | 
115 | /*
116 | vi:ts=4:ai:expandtab
117 | */
118 | 


--------------------------------------------------------------------------------
/fast_trie.gemspec:
--------------------------------------------------------------------------------
 1 | # Generated by jeweler
 2 | # DO NOT EDIT THIS FILE DIRECTLY
 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 4 | # -*- encoding: utf-8 -*-
 5 | # stub: fast_trie 0.5.1 ruby ext
 6 | # stub: ext/trie/extconf.rb
 7 | 
 8 | Gem::Specification.new do |s|
 9 |   s.name = "fast_trie"
10 |   s.version = "0.5.1"
11 | 
12 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
13 |   s.require_paths = ["ext"] # the gem is required from ext/; s.files below lists no lib/ directory
14 |   s.authors = ["Tyler McMullen", "Matt Hickford"]
15 |   s.date = "2015-07-27"
16 |   s.description = "Ruby Trie based on libdatrie."
17 |   s.email = "tyler@scribd.com"
18 |   s.extensions = ["ext/trie/extconf.rb"] # build script for the C extension
19 |   s.extra_rdoc_files = [
20 |     "LICENSE",
21 |     "README.textile"
22 |   ]
23 |   s.files = [
24 |     "Gemfile.lock",
25 |     "README.textile",
26 |     "VERSION.yml",
27 |     "ext/trie/darray.c",
28 |     "ext/trie/darray.h",
29 |     "ext/trie/extconf.rb",
30 |     "ext/trie/fileutils.c",
31 |     "ext/trie/fileutils.h",
32 |     "ext/trie/tail.c",
33 |     "ext/trie/tail.h",
34 |     "ext/trie/trie-private.c",
35 |     "ext/trie/trie-private.h",
36 |     "ext/trie/trie.c",
37 |     "ext/trie/trie.h",
38 |     "ext/trie/triedefs.h",
39 |     "ext/trie/typedefs.h",
40 |     "fast_trie.gemspec",
41 |     "spec/trie_spec.rb"
42 |   ]
43 |   s.homepage = "http://github.com/tyler/trie"
44 |   s.rdoc_options = ["--title", "Trie", "--line-numbers", "--op", "rdoc", "--main", "ext/trie/trie.c", "README"]
45 |   s.rubygems_version = "2.4.5"
46 |   s.summary = "Ruby Trie based on libdatrie."
47 |   # The same six tools are declared in every branch below; only the registration method differs.
48 |   if s.respond_to? :specification_version then
49 |     s.specification_version = 4
50 |     # RubyGems >= 1.2.0 registers them as development dependencies; older versions fall back to add_dependency.
51 |     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52 |       s.add_development_dependency(%q<rake>, [">= 0"])
53 |       s.add_development_dependency(%q<rspec>, [">= 0"])
54 |       s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
55 |       s.add_development_dependency(%q<bundler>, ["~> 1.0"])
56 |       s.add_development_dependency(%q<jeweler>, ["~> 2.0.1"])
57 |       s.add_development_dependency(%q<rake-compiler>, [">= 0"])
58 |     else
59 |       s.add_dependency(%q<rake>, [">= 0"])
60 |       s.add_dependency(%q<rspec>, [">= 0"])
61 |       s.add_dependency(%q<rdoc>, ["~> 3.12"])
62 |       s.add_dependency(%q<bundler>, ["~> 1.0"])
63 |       s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
64 |       s.add_dependency(%q<rake-compiler>, [">= 0"])
65 |     end
66 |   else
67 |     s.add_dependency(%q<rake>, [">= 0"])
68 |     s.add_dependency(%q<rspec>, [">= 0"])
69 |     s.add_dependency(%q<rdoc>, ["~> 3.12"])
70 |     s.add_dependency(%q<bundler>, ["~> 1.0"])
71 |     s.add_dependency(%q<jeweler>, ["~> 2.0.1"])
72 |     s.add_dependency(%q<rake-compiler>, [">= 0"])
73 |   end
74 | end
75 | 
76 | 


--------------------------------------------------------------------------------
/spec/trie_spec.rb:
--------------------------------------------------------------------------------
  1 | require 'fileutils'
  2 | require File.dirname(__FILE__) + '/../lib/trie'
  3 | describe Trie do
  4 |   before :each do
  5 |     @trie = Trie.new;
  6 |     @trie.add('rocket')
  7 |     @trie.add('rock')
  8 |     @trie.add('frederico')
  9 |   end
 10 |   
 11 |   describe :has_key? do
 12 |     it 'returns true for words in the trie' do
 13 |       @trie.has_key?('rocket').should be_true
 14 |     end
 15 | 
 16 |     it 'returns nil for words that are not in the trie' do
 17 |       @trie.has_key?('not_in_the_trie').should be_nil
 18 |     end
 19 |   end
 20 | 
 21 |   describe :get do
 22 |     it 'returns -1 for words in the trie without a weight' do
 23 |       @trie.get('rocket').should == -1
 24 |     end
 25 | 
 26 |     it 'returns nil if the word is not in the trie' do
 27 |       @trie.get('not_in_the_trie').should be_nil
 28 |     end
 29 |   end
 30 | 
 31 |   describe :add do
 32 |     it 'adds a word to the trie' do
 33 |       @trie.add('forsooth').should == true
 34 |       @trie.get('forsooth').should == -1
 35 |     end
 36 | 
 37 |     it 'adds a word with a weight to the trie' do
 38 |       @trie.add('chicka',123).should == true
 39 |       @trie.get('chicka').should == 123
 40 |     end
 41 | 
 42 |     it 'adds values greater than 16-bit allows' do
 43 |       @trie.add('chicka', 72_000).should == true
 44 |       @trie.get('chicka').should == 72_000
 45 |     end
 46 | 
 47 |     it 'adds a word with a non-numeric value to the trie' do
 48 |       @trie.add('doot', 'Heeey').should == true
 49 |       @trie.get('doot').should == 'Heeey'
 50 |     end
 51 |   end
 52 | 
 53 |   describe :delete do
 54 |     it 'deletes a word from the trie' do
 55 |       @trie.delete('rocket').should == true
 56 |       @trie.has_key?('rocket').should be_nil
 57 |     end
 58 |   end
 59 | 
 60 |   describe :children do
 61 |     it 'returns all words beginning with a given prefix' do
 62 |       children = @trie.children('roc')
 63 |       children.size.should == 2
 64 |       children.should include('rock')
 65 |       children.should include('rocket')
 66 |     end
 67 | 
 68 |     it 'returns blank array if prefix does not exist' do
 69 |       @trie.children('ajsodij').should == []
 70 |     end
 71 | 
 72 |     it 'includes the prefix if the prefix is a word' do
 73 |       children = @trie.children('rock')
 74 |       children.size.should == 2
 75 |       children.should include('rock')
 76 |       children.should include('rocket')
 77 |     end
 78 | 
 79 |     it 'returns blank array if prefix is nil' do
 80 |       @trie.children(nil).should == []
 81 |     end
 82 |   end
 83 | 
 84 |   describe :children_with_values do
 85 |     before :each do
 86 |       @trie.add('abc',2)
 87 |       @trie.add('abcd',4)
 88 |     end
 89 | 
 90 |     it 'returns all words with values beginning with a given prefix' do
 91 |       children = @trie.children_with_values('ab')
 92 |       children.size.should == 2
 93 |       children.should include(['abc',2])
 94 |       children.should include(['abcd',4])
 95 |     end
 96 | 
 97 |     it 'returns nil if prefix does not exist' do
 98 |       @trie.children_with_values('ajsodij').should == []
 99 |     end
100 | 
101 |     it 'includes the prefix if the prefix is a word' do
102 |       children = @trie.children_with_values('abc')
103 |       children.size.should == 2
104 |       children.should include(['abc',2])
105 |       children.should include(['abcd',4])
106 |     end
107 | 
108 |     it 'returns blank array if prefix is nil' do
109 |       @trie.children_with_values(nil).should == []
110 |     end
111 |   end
112 | 
113 |   #describe :walk_to_terminal do
114 |   #  it 'returns the first word found along a path' do
115 |   #    @trie.add 'anderson'
116 |   #    @trie.add 'andreas'
117 |   #    @trie.add 'and'
118 | 
119 |   #    @trie.walk_to_terminal('anderson').should == 'and'
120 |   #  end
121 | 
122 |   #  it 'returns the first word and value along a path' do
123 |   #    @trie.add 'anderson'
124 |   #    @trie.add 'andreas'
125 |   #    @trie.add 'and', 15
126 | 
127 |   #    @trie.walk_to_terminal('anderson',true).should == ['and', 15]
128 |   #  end
129 |   #end
130 | 
131 |   describe :root do
132 |     it 'returns a TrieNode' do
133 |       @trie.root.should be_an_instance_of(TrieNode)
134 |     end
135 | 
136 |     it 'returns a different TrieNode each time' do
137 |       @trie.root.should_not == @trie.root
138 |     end
139 |   end
140 | 
141 |   describe 'save/read' do
142 |     let(:filename_base) do
143 |       dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'tmp'))
144 |       FileUtils.mkdir_p(dir)
145 |       File.join(dir, 'trie')
146 |     end
147 | 
148 |     context 'when I save the populated trie to disk' do
149 |       before(:each) do
150 |         @trie.add('omgwtflolbbq', 123)
151 |         @trie.save(filename_base)
152 |       end
153 | 
154 |       it 'should contain the same data when reading from disk' do
155 |         trie2 = Trie.read(filename_base)
156 |         trie2.get('omgwtflolbbq').should == 123
157 |       end
158 |     end
159 |   end
160 | 
161 |   describe :read do
162 |     context 'when the files to read from do not exist' do
163 |       let(:filename_base) do
164 |         "phantasy/file/path/that/does/not/exist"
165 |       end
166 | 
167 |       it 'should raise an error when attempting a read' do
168 |         lambda { Trie.read(filename_base) }.should raise_error(IOError)
169 |       end
170 |     end
171 |   end
172 | 
173 |   describe :has_children? do
174 |     it 'returns true when there are children matching prefix' do
175 |       @trie.has_children?('r').should be_true
176 | 
177 |       @trie.has_children?('rock').should be_true
178 |       @trie.has_children?('rocket').should be_true
179 |     end
180 | 
181 |     it 'returns false when there are no children matching prefix' do
182 |       @trie.has_children?('no').should be_false
183 |       @trie.has_children?('rome').should be_false
184 |       @trie.has_children?('roc_').should be_false
185 |     end
186 |   end
187 | end
188 | 
189 | describe TrieNode do
190 |   before :each do
191 |     @trie = Trie.new;
192 |     @trie.add('rocket',1)
193 |     @trie.add('rock',2)
194 |     @trie.add('frederico',3)
195 |     @node = @trie.root
196 |   end
197 |   
198 |   describe :state do
199 |     it 'returns the most recent state character' do
200 |       @node.walk!('r')
201 |       @node.state.should == 'r'
202 |       @node.walk!('o')
203 |       @node.state.should == 'o'
204 |     end
205 | 
206 |     it 'is nil when no walk has occurred' do
207 |       @node.state.should == nil
208 |     end
209 |   end
210 | 
211 |   describe :full_state do
212 |     it 'returns the current string' do
213 |       @node.walk!('r').walk!('o').walk!('c')
214 |       @node.full_state.should == 'roc'
215 |     end
216 | 
217 |     it 'is a blank string when no walk has occurred' do
218 |       @node.full_state.should == ''
219 |     end
220 |   end
221 |   
222 |   describe :walk! do
223 |     it 'returns the updated object when the walk succeeds' do
224 |       other = @node.walk!('r')
225 |       other.should == @node
226 |     end
227 | 
228 |     it 'returns nil when the walk fails' do
229 |       @node.walk!('q').should be_nil
230 |     end
231 |   end
232 | 
233 |   describe :walk do
234 |     it 'returns a new node object when the walk succeeds' do
235 |       other = @node.walk('r')
236 |       other.should_not == @node
237 |     end
238 | 
239 |     it 'returns nil when the walk fails' do
240 |       @node.walk('q').should be_nil
241 |     end
242 |   end
243 | 
244 | 
245 |   describe :value do
246 |     it 'returns nil when the node is not terminal' do
247 |       @node.walk!('r')
248 |       @node.value.should be_nil
249 |     end
250 | 
251 |     it 'returns a value when the node is terminal' do
252 |       @node.walk!('r').walk!('o').walk!('c').walk!('k')
253 |       @node.value.should == 2
254 |     end
255 |   end
256 | 
257 |   describe :terminal? do
258 |     it 'returns true when the node is a word end' do
259 |       @node.walk!('r').walk!('o').walk!('c').walk!('k')
260 |       @node.should be_terminal
261 |     end
262 | 
263 |     it 'returns nil when the node is not a word end' do
264 |       @node.walk!('r').walk!('o').walk!('c')
265 |       @node.should_not be_terminal
266 |     end
267 |   end
268 | 
269 |   describe :leaf? do
270 |     it 'returns true when this is the end of a branch of the trie' do
271 |       @node.walk!('r').walk!('o').walk!('c').walk!('k').walk!('e').walk!('t')
272 |       @node.should be_leaf
273 |     end
274 | 
275 |     it 'returns nil when there are more splits on this branch' do
276 |       @node.walk!('r').walk!('o').walk!('c').walk!('k')
277 |       @node.should_not be_leaf
278 |     end
279 |   end
280 | 
281 |   describe :clone do
282 |     it 'creates a new instance of this node which is not this node' do
283 |       new_node = @node.clone
284 |       new_node.should_not == @node
285 |     end
286 | 
287 |     it 'matches the state of the current node' do
288 |       new_node = @node.clone
289 |       new_node.state.should == @node.state
290 |     end
291 | 
292 |     it 'matches the full_state of the current node' do
293 |       new_node = @node.clone
294 |       new_node.full_state.should == @node.full_state
295 |     end
296 |   end
297 | end
298 | 


--------------------------------------------------------------------------------