├── .gitignore
├── .travis.yml
├── Gemfile
├── Gemfile.lock
├── LICENSE
├── README.textile
├── Rakefile
├── VERSION.yml
├── ext
└── trie
│ ├── darray.c
│ ├── darray.h
│ ├── extconf.rb
│ ├── fileutils.c
│ ├── fileutils.h
│ ├── tail.c
│ ├── tail.h
│ ├── trie-private.c
│ ├── trie-private.h
│ ├── trie.c
│ ├── trie.h
│ ├── triedefs.h
│ └── typedefs.h
├── fast_trie.gemspec
└── spec
└── trie_spec.rb
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw?
2 | *.o
3 | *.bundle
4 | *.dylib
5 | .DS_Store
6 | coverage
7 | *~
8 | #*
9 | *.gem
10 | rdoc
11 | Makefile
12 | *.stackdump
13 | *.def
14 | *.so
15 | tmp/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 | - 2.1.0
4 | - 2.0.0
5 | - 1.9.3
6 | - 1.8.7
7 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | group :development do
4 | gem 'rake'
5 | gem 'rspec'
6 | gem 'rdoc', '~> 3.12'
7 | gem 'bundler', '~> 1.0'
8 | gem 'jeweler', '~> 2.0.1'
9 | gem 'rake-compiler'
10 | end
11 |
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: https://rubygems.org/
3 | specs:
4 | addressable (2.3.5)
5 | builder (3.2.2)
6 | descendants_tracker (0.0.3)
7 | diff-lcs (1.2.5)
8 | faraday (0.9.0)
9 | multipart-post (>= 1.2, < 3)
10 | git (1.2.6)
11 | github_api (0.11.2)
12 | addressable (~> 2.3)
13 | descendants_tracker (~> 0.0.1)
14 | faraday (~> 0.8, < 0.10)
15 | hashie (>= 1.2)
16 | multi_json (>= 1.7.5, < 2.0)
17 | nokogiri (~> 1.6.0)
18 | oauth2
19 | hashie (2.0.5)
20 | highline (1.6.20)
21 | jeweler (2.0.1)
22 | builder
23 | bundler (>= 1.0)
24 | git (>= 1.2.5)
25 | github_api
26 | highline (>= 1.6.15)
27 | nokogiri (>= 1.5.10)
28 | rake
29 | rdoc
30 | json (1.8.1)
31 | jwt (0.1.11)
32 | multi_json (>= 1.5)
33 | mini_portile (0.5.2)
34 | multi_json (1.8.4)
35 | multi_xml (0.5.5)
36 | multipart-post (2.0.0)
37 | nokogiri (1.6.1-x86-mingw32)
38 | mini_portile (~> 0.5.0)
39 | oauth2 (0.9.3)
40 | faraday (>= 0.8, < 0.10)
41 | jwt (~> 0.1.8)
42 | multi_json (~> 1.3)
43 | multi_xml (~> 0.5)
44 | rack (~> 1.2)
45 | rack (1.5.2)
46 | rake (10.1.1)
47 | rake-compiler (0.9.2)
48 | rake
49 | rdoc (3.12.2)
50 | json (~> 1.4)
51 | rspec (2.14.1)
52 | rspec-core (~> 2.14.0)
53 | rspec-expectations (~> 2.14.0)
54 | rspec-mocks (~> 2.14.0)
55 | rspec-core (2.14.7)
56 | rspec-expectations (2.14.5)
57 | diff-lcs (>= 1.1.3, < 2.0)
58 | rspec-mocks (2.14.5)
59 |
60 | PLATFORMS
61 | x86-mingw32
62 |
63 | DEPENDENCIES
64 | bundler (~> 1.0)
65 | jeweler (~> 2.0.1)
66 | rake
67 | rake-compiler
68 | rdoc (~> 3.12)
69 | rspec
70 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2008 Tyler McMullen
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.textile:
--------------------------------------------------------------------------------
1 | h1. Trie
2 |
3 | !https://badge.fury.io/rb/fast_trie.svg!:https://rubygems.org/gems/fast_trie !https://travis-ci.org/tyler/trie.svg!:https://travis-ci.org/tyler/trie
4 |
5 | This is a trie for Ruby using "libdatrie":http://linux.thai.net/~thep/datrie/. It uses a dual-array system, meaning it has best-in-class memory usage and search time.
6 |
7 |
8 | h2. What is a trie?
9 |
10 | I suck at explaining things. Wikipedia doesn't. http://wikipedia.org/wiki/Trie.
11 |
12 | But in short a trie is a data structure that holds strings in a tree. So if you inserted the words 'arc', 'ark', and 'ape' in a trie you could visualize it thusly:
13 |
14 |
15 | p - e
16 | /
17 | a - r - c
18 | \
19 | k
20 |
21 |
22 | It's easy to see how this can have pretty neat implications for things like searching through lists of strings, sorting lists of strings, and things like spelling correction and autocompletion.
23 |
24 | h2. Installation
25 |
26 | From RubyGems https://rubygems.org/gems/fast_trie
27 |
28 |
29 | gem install fast_trie
30 |
31 |
32 | h2. Tutorial
33 |
34 | Let's go through building a simple autocompleter using a "Trie":http://rubydoc.info/gems/fast_trie/Trie object.
35 |
36 |
37 | require 'trie'
38 | trie = Trie.new
39 |
40 |
41 | Anyway. So we've created our blank trie. Now, since we're creating an autocompleter, we'll need to add some words into it. We do that simply with the add method.
42 |
43 |
44 | words.each do |word|
45 | trie.add word
46 | end
47 |
48 |
49 | Or if you have some integer data to store along with the words, such as weights or scores of some kind, you'd do it like so...
50 |
51 |
52 | words_and_weights.each do |word,weight|
53 | trie.add word, weight
54 | end
55 |
56 |
57 | Great, so we've populated our trie with some words. Let's make sure those words are really there.
58 |
59 |
60 | trie.has_key?('widget') #=> true
61 |
62 | trie.get('widget') #=> -1 or your value
63 |
64 | trie.get('not-in-the-trie') #=> nil
65 |
66 |
67 | If you didn't enter a value to go along with the word, calling @get@ with it will return -1.
68 |
69 | Okay great, we have our populated trie, we've confirmed that the keys are in there. Let's make an autocompleter! For this we'll need to use the @children@ method. We'll do this as a simple Rails action, with the assumption you've initialized the trie into @TRIE@.
70 |
71 |
72 | def autocomplete
73 | children = TRIE.children(params[:prefix])
74 |
75 | respond_to do |format|
76 | format.js { render(:string => JSON.dump(children)) }
77 | format.yaml { render(:string => YAML.dump(children)) }
78 | end
79 | end
80 |
81 |
82 | Yep, that's it.
83 |
84 | There are, of course, some more interesting and advanced ways to use a trie. For instance, this snippet takes a string, then walks down the trie, noting each word it finds along the way.
85 |
86 |
87 | word = 'forestry'
88 | node = trie.root
89 |
90 | word.split('').each do |char|
91 | break unless node.walk!(char)
92 | if node.terminal?
93 | puts "Found me a word: #{node.full_state}"
94 | end
95 | end
96 |
97 |
98 | By calling @root@ on a Trie, you get a "TrieNode":http://rubydoc.info/gems/fast_trie/TrieNode, pointed at the root of the trie. You can then use this node to walk the trie and perceive things about each word.
99 |
100 | You can read the reference documentation at http://rubydoc.info/gems/fast_trie/frames/Trie
101 |
102 | h2. Performance Characteristics
103 |
104 | Here are some quick benchmarks on my 2.4ghz Intel Core 2 Duo MacBook Pro:
105 |
106 | For keys that are 5 characters long:
107 | 31,344 adds/second
108 | 1,827,408 searches/second
109 | 38,453 prefix searches/second
110 |
111 | For keys that are 10 characters long:
112 | 30,653 adds/second
113 | 1,802,649 searches/second
114 | 13,553 prefix searches/second
115 |
116 | For keys that are 20 characters long:
117 | 30,488 adds/second
118 | 1,851,461 searches/second
119 | 5,855 prefix searches/second
120 |
121 | For keys that are 40 characters long:
122 | 30,710 adds/second
123 | 1,838,380 searches/second
124 | 2,762 prefix searches/second
125 |
126 |
127 | There are a few takeaways from this. First, there is no strong correlation between length of keys and insert or retrieve time. They stay fairly constant as the length of keys increases. Secondly, doing prefix searches with this trie gets slower linearly with the length of the keys in the trie.
128 |
129 | This points to a limitation of this type of trie. It is based on "libdatrie":http://linux.thai.net/~thep/datrie/ ("version 0.1.99":http://linux.thai.net/svn/software/datrie/trunk/NEWS), which is a dual-array trie. When finding branches from a particular node, we must query all possible branches to determine whether or not they exist. So for each node we do 255 of these queries.
130 |
131 | There may be some tricks to speed this up, but for now it is simply a limitation of this trie.
132 |
133 | Now, let's look at the effect of the size of the trie itself on query and insertion time. For this test I inserted 100, 1000, 10000, 100000, and 1000000 words in the trie. We measure the insertion and retrieval time in each. The graph below shows the results.
134 |
135 | !http://codehallow.com/effect_of_size.png!
136 |
137 | So, keeping in mind that we're increasing by orders of magnitude, you can see that the insertion time does take a significant hit. Retrieval also goes down but at a very gradual rate. (It decreases by about 50% in total, despite the size increasing by 1,000,000%.)
138 |
139 | The reason the insertion time takes such a beating is due, again, to a limitation of the trie. Storing a trie in the dual array setup that is used is excellent for memory usage and retrieval time. Best in class, in fact. However, the more things are added into the trie the more complicated it gets to insert things. It often requires shuffling large pieces of the arrays. There may be room for optimization here, but ultimately insertion time will increase with the size of the trie.
140 |
141 |
142 |
143 | Copyright (c) 2008 Tyler McMullen. See LICENSE for details.
144 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | require 'rubygems'
4 | require 'bundler'
5 | begin
6 | Bundler.setup(:default, :development) # activate the gems of the default and development groups
7 | rescue Bundler::BundlerError => e
8 | $stderr.puts e.message
9 | $stderr.puts "Run `bundle install` to install missing gems"
10 | exit e.status_code # leave with the status code carried by the Bundler error
11 | end
12 | require 'rake'
13 |
14 | require 'jeweler'
15 |
16 | jeweler_tasks = Jeweler::Tasks.new do |s| # gem metadata handed to Jeweler
17 | s.name = "fast_trie"
18 | s.email = "tyler@scribd.com"
19 | s.homepage = "http://github.com/tyler/trie"
20 | s.description = "Ruby Trie based on libdatrie."
21 | s.summary = s.description
22 | s.authors = ["Tyler McMullen", "Matt Hickford"]
23 | s.extensions = ['ext/trie/extconf.rb'] # the native extension is configured through extconf.rb
24 | s.require_paths = ['ext']
25 | s.files = FileList["[A-Z]*.*", "{spec,ext}/**/*"] # top-level capitalised files plus everything under spec/ and ext/
26 | s.has_rdoc = true
27 | s.rdoc_options = ['--title', 'Trie', '--line-numbers', '--op', 'rdoc', '--main', 'ext/trie/trie.c', 'README']
28 | end
29 | Jeweler::RubygemsDotOrgTasks.new
30 |
31 | $gemspec = jeweler_tasks.gemspec # kept in a global so the extension task below can use it
32 | $gemspec.version = jeweler_tasks.jeweler.version # presumably the version read from VERSION.yml -- confirm against Jeweler
33 |
34 | require 'rake/extensiontask'
35 | Rake::ExtensionTask.new('trie', $gemspec) # presumably provides the :compile task used by :default
36 | CLEAN.include 'lib/**/*.so'
37 |
38 | require 'rspec/core/rake_task'
39 | RSpec::Core::RakeTask.new # presumably provides the :spec task used by :default
40 |
41 | require 'rdoc/task'
42 | Rake::RDocTask.new do |rdoc| # API documentation is generated into rdoc/
43 | rdoc.rdoc_dir = 'rdoc'
44 | rdoc.title = 'Trie'
45 | rdoc.options << '--line-numbers' << '--inline-source'
46 | rdoc.rdoc_files.include('README*')
47 | rdoc.rdoc_files.include('ext/trie/trie.c')
48 | end
49 |
50 | task :default => [:compile, :spec] # a bare `rake` runs :compile and then :spec
51 |
--------------------------------------------------------------------------------
/VERSION.yml:
--------------------------------------------------------------------------------
1 | ---
2 | :major: 0
3 | :minor: 5
4 | :patch: 1
5 | :build:
6 |
--------------------------------------------------------------------------------
/ext/trie/darray.c:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * darray.c - Double-array trie structure
4 | * Created: 2006-08-13
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #include <string.h>
9 | #include <stdlib.h>
10 | #include <stdio.h>
11 |
12 | #include "trie-private.h"
13 | #include "darray.h"
14 | #include "fileutils.h"
15 |
16 | /*----------------------------------*
17 | * INTERNAL TYPES DECLARATIONS *
18 | *----------------------------------*/
19 |
20 | typedef struct _Symbols Symbols; /* a set of TrieChar symbols, e.g. the outgoing edges of one state */
21 |
22 | struct _Symbols {
23 | short num_symbols; /* number of entries of symbols[] currently in use */
24 | TrieChar symbols[256]; /* the stored symbols; symbols_add() keeps them in ascending order */
25 | };
26 |
27 | static Symbols * symbols_new ();
28 | static void symbols_free (Symbols *syms);
29 | static void symbols_add (Symbols *syms, TrieChar c);
30 |
31 | #define symbols_num(s) ((s)->num_symbols)
32 | #define symbols_get(s,i) ((s)->symbols[i])
33 | #define symbols_add_fast(s,c) ((s)->symbols[(s)->num_symbols++] = c)
34 |
35 | /*-----------------------------------*
36 | * PRIVATE METHODS DECLARATIONS *
37 | *-----------------------------------*/
38 |
39 | #define da_get_free_list(d) (1)
40 |
41 | static Bool da_check_free_cell (DArray *d,
42 | TrieIndex s);
43 |
44 | static Bool da_has_children (DArray *d,
45 | TrieIndex s);
46 |
47 | static Symbols * da_output_symbols (const DArray *d,
48 | TrieIndex s);
49 |
50 | static TrieChar * da_get_state_key (const DArray *d,
51 | TrieIndex state);
52 |
53 | static TrieIndex da_find_free_base (DArray *d,
54 | const Symbols *symbols);
55 |
56 | static Bool da_fit_symbols (DArray *d,
57 | TrieIndex base,
58 | const Symbols *symbols);
59 |
60 | static void da_relocate_base (DArray *d,
61 | TrieIndex s,
62 | TrieIndex new_base);
63 |
64 | static Bool da_extend_pool (DArray *d,
65 | TrieIndex to_index);
66 |
67 | static void da_alloc_cell (DArray *d,
68 | TrieIndex cell);
69 |
70 | static void da_free_cell (DArray *d,
71 | TrieIndex cell);
72 |
73 | static Bool da_enumerate_recursive (const DArray *d,
74 | TrieIndex state,
75 | DAEnumFunc enum_func,
76 | void *user_data);
77 |
78 | /* ==================== BEGIN IMPLEMENTATION PART ==================== */
79 |
80 | /*------------------------------------*
81 | * INTERNAL TYPES IMPLEMENTATIONS *
82 | *------------------------------------*/
83 |
84 | static Symbols *
85 | symbols_new ()
86 | { /* Allocate an empty symbol set; returns NULL if malloc fails. Release it with symbols_free(). */
87 | Symbols *syms;
88 |
89 | syms = (Symbols *) malloc (sizeof (Symbols));
90 |
91 | if (!syms)
92 | return NULL; /* allocation failed */
93 |
94 | syms->num_symbols = 0; /* only the count is initialised; symbols[] is left as allocated */
95 |
96 | return syms;
97 | }
98 |
99 | static void
100 | symbols_free (Symbols *syms)
101 | { /* Release a symbol set obtained from symbols_new(). */
102 | free (syms);
103 | }
104 |
105 | static void
106 | symbols_add (Symbols *syms, TrieChar c)
107 | { /* Insert c into the sorted set syms; a symbol that is already present is not added again. */
108 | short lower, upper;
109 |
110 | lower = 0;
111 | upper = syms->num_symbols;
112 | while (lower < upper) { /* binary search for c within [lower, upper) */
113 | short middle;
114 |
115 | middle = (lower + upper)/2;
116 | if (c > syms->symbols[middle])
117 | lower = middle + 1;
118 | else if (c < syms->symbols[middle])
119 | upper = middle;
120 | else
121 | return; /* c is already in the set */
122 | }
123 | if (lower < syms->num_symbols) { /* open a gap at the insertion point */
124 | memmove (syms->symbols + lower + 1, syms->symbols + lower,
125 | syms->num_symbols - lower); /* length is a byte count; assumes sizeof (TrieChar) == 1 -- TODO confirm */
126 | }
127 | syms->symbols[lower] = c;
128 | syms->num_symbols++;
129 | }
130 |
131 | /*------------------------------*
132 | * PRIVATE DATA DEFINITIONS *
133 | *------------------------------*/
134 |
135 | typedef struct { /* one double-array cell */
136 | TrieIndex base; /* BASE value; for a cell on the free list, minus the index of the previous free cell */
137 | TrieIndex check; /* CHECK value (the owning parent state); for a free cell, minus the index of the next free cell */
138 | } DACell;
139 |
140 | struct _DArray {
141 | TrieIndex num_cells; /* number of entries in cells[] */
142 | DACell *cells; /* heap-allocated cell array, header cells first */
143 | };
144 |
145 | /*-----------------------------*
146 | * METHODS IMPLEMENTATIONS *
147 | *-----------------------------*/
148 |
149 | #define DA_SIGNATURE 0xDAFCDAFC
150 |
151 | /* DA Header:
152 | * - Cell 0: SIGNATURE, number of cells
153 | * - Cell 1: free circular-list pointers
154 | * - Cell 2: root node
155 | * - Cell 3: DA pool begin
156 | */
157 | #define DA_POOL_BEGIN 3
158 |
159 | DArray *
160 | da_new ()
161 | { /* Create an empty double-array holding only the header cells; returns NULL if an allocation fails. */
162 | DArray *d;
163 |
164 | d = (DArray *) malloc (sizeof (DArray));
165 | if (!d)
166 | return NULL;
167 |
168 | d->num_cells = DA_POOL_BEGIN;
169 | d->cells = (DACell *) malloc (d->num_cells * sizeof (DACell));
170 | if (!d->cells)
171 | goto exit_da_created;
172 | d->cells[0].base = DA_SIGNATURE;
173 | d->cells[0].check = d->num_cells; /* cell 0: signature and number of cells */
174 | d->cells[1].base = -1;
175 | d->cells[1].check = -1; /* cell 1: free-list head whose links point back at itself (empty list) */
176 | d->cells[2].base = DA_POOL_BEGIN;
177 | d->cells[2].check = 0; /* cell 2: root node */
178 |
179 | return d;
180 |
181 | exit_da_created:
182 | free (d); /* the cell array could not be allocated */
183 | return NULL;
184 | }
185 |
186 | DArray *
187 | da_read (FILE *file)
188 | { /* Read a double-array from file. Returns NULL on any failure, after seeking file back to where reading started. */
189 | long save_pos;
190 | DArray *d = NULL;
191 | TrieIndex n;
192 |
193 | save_pos = ftell (file); /* check signature */
194 | if (!file_read_int32 (file, &n) || DA_SIGNATURE != (uint32) n)
195 | goto exit_file_read;
196 |
197 | d = (DArray *) malloc (sizeof (DArray));
198 | if (!d)
199 | goto exit_file_read;
200 | /* read number of cells; a short read or a count smaller than the header is invalid */
201 | if (!file_read_int32 (file, &d->num_cells) || d->num_cells < DA_POOL_BEGIN)
202 | goto exit_da_created;
203 | d->cells = (DACell *) malloc (d->num_cells * sizeof (DACell));
204 | if (!d->cells)
205 | goto exit_da_created;
206 | d->cells[0].base = DA_SIGNATURE;
207 | d->cells[0].check= d->num_cells;
208 | for (n = 1; n < d->num_cells; n++) {
209 | if (!file_read_int32 (file, &d->cells[n].base) || !file_read_int32 (file, &d->cells[n].check))
210 | goto exit_da_cells_created; /* truncated or unreadable cell data */
211 | }
212 | return d;
213 |
214 | exit_da_cells_created:
215 | free (d->cells);
216 | exit_da_created:
217 | free (d);
218 | exit_file_read:
219 | fseek (file, save_pos, SEEK_SET); /* leave the stream where the caller had it */
220 | return NULL;
221 | }
222 |
223 | void
224 | da_free (DArray *d)
225 | { /* Release the cell array and then the DArray structure itself; d is dereferenced, so it must not be NULL. */
226 | free (d->cells);
227 | free (d);
228 | }
229 |
230 | int
231 | da_write (const DArray *d, FILE *file)
232 | { /* Write every cell, header cells included, to file as BASE then CHECK. Returns 0 on success, -1 as soon as a write fails. */
233 | TrieIndex i;
234 |
235 | for (i = 0; i < d->num_cells; i++) {
236 | if (!file_write_int32 (file, d->cells[i].base) ||
237 | !file_write_int32 (file, d->cells[i].check))
238 | {
239 | return -1; /* cells already written stay in the file */
240 | }
241 | }
242 |
243 | return 0;
244 | }
245 |
246 |
247 | TrieIndex
248 | da_get_root (const DArray *d)
249 | { /* Return the index of the root state; currently the constant 2, d is not consulted. */
250 | /* can be calculated value for multi-index trie */
251 | return 2;
252 | }
253 |
254 |
255 | TrieIndex
256 | da_get_base (const DArray *d, TrieIndex s)
257 | { /* Return BASE[s], or TRIE_INDEX_ERROR when s lies outside [0, num_cells). */
258 | return (0 <= s && s < d->num_cells) ? d->cells[s].base : TRIE_INDEX_ERROR;
259 | }
260 |
261 | TrieIndex
262 | da_get_check (const DArray *d, TrieIndex s)
263 | { /* Return CHECK[s], or TRIE_INDEX_ERROR when s lies outside [0, num_cells). */
264 | return (0 <= s && s < d->num_cells) ? d->cells[s].check : TRIE_INDEX_ERROR;
265 | }
266 |
267 |
268 | void
269 | da_set_base (DArray *d, TrieIndex s, TrieIndex val)
270 | { /* Store val into BASE[s]; an s outside [0, num_cells) is silently ignored. */
271 | if (0 <= s && s < d->num_cells) {
272 | d->cells[s].base = val;
273 | }
274 | }
275 |
276 | void
277 | da_set_check (DArray *d, TrieIndex s, TrieIndex val)
278 | { /* Store val into CHECK[s]; an s outside [0, num_cells) is silently ignored. */
279 | if (0 <= s && s < d->num_cells) {
280 | d->cells[s].check = val;
281 | }
282 | }
283 |
284 | Bool
285 | da_walk (const DArray *d, TrieIndex *s, TrieChar c)
286 | { /* Follow the edge labelled c from state *s. On success *s becomes the target state and TRUE is returned; otherwise *s is untouched. */
287 | TrieIndex next;
288 |
289 | next = da_get_base (d, *s) + c; /* candidate target cell */
290 | if (da_get_check (d, next) == *s) { /* the cell names *s as its parent, so the edge exists */
291 | *s = next;
292 | return TRUE;
293 | }
294 | return FALSE;
295 | }
296 |
297 | TrieIndex
298 | da_insert_branch (DArray *d, TrieIndex s, TrieChar c)
299 | { /* Add an edge labelled c from state s. Returns the target state (the existing one if the edge is already there) or TRIE_INDEX_ERROR on failure. */
300 | TrieIndex base, next;
301 |
302 | base = da_get_base (d, s);
303 |
304 | if (base > 0) {
305 | next = base + c;
306 |
307 | /* if already there, do not actually insert */
308 | if (da_get_check (d, next) == s)
309 | return next;
310 |
311 | /* if (base + c) > TRIE_INDEX_MAX which means 'next' is overflow,
312 | * or cell [next] is not free, relocate to a free slot
313 | */
314 | if (base > TRIE_INDEX_MAX - c || !da_check_free_cell (d, next)) {
315 | Symbols *symbols;
316 | TrieIndex new_base;
317 |
318 | symbols = da_output_symbols (d, s); /* relocate BASE[s] */
319 | if (!symbols)
320 | return TRIE_INDEX_ERROR; /* no symbol set available (out of memory) */
321 | symbols_add (symbols, c);
322 | new_base = da_find_free_base (d, symbols);
323 | symbols_free (symbols);
324 | if (TRIE_INDEX_ERROR == new_base)
325 | return TRIE_INDEX_ERROR;
326 |
327 | da_relocate_base (d, s, new_base);
328 | next = new_base + c;
329 | }
330 | } else {
331 | Symbols *symbols;
332 | TrieIndex new_base;
333 | symbols = symbols_new (); /* s has no usable BASE yet: find one for the single symbol c */
334 | if (!symbols)
335 | return TRIE_INDEX_ERROR; /* out of memory */
336 | symbols_add (symbols, c);
337 | new_base = da_find_free_base (d, symbols);
338 | symbols_free (symbols);
339 | if (TRIE_INDEX_ERROR == new_base)
340 | return TRIE_INDEX_ERROR;
341 |
342 | da_set_base (d, s, new_base);
343 | next = new_base + c;
344 | }
345 | da_alloc_cell (d, next); /* take the target cell off the free list ... */
346 | da_set_check (d, next, s); /* ... and mark s as its parent */
347 |
348 | return next;
349 | }
350 |
351 | static Bool
352 | da_check_free_cell (DArray *d,
353 | TrieIndex s)
354 | { /* TRUE when cell s is addressable (the pool is grown if necessary) and its CHECK is negative, i.e. it is on the free list. */
355 | return da_extend_pool (d, s) && da_get_check (d, s) < 0;
356 | }
357 |
358 | static Bool
359 | da_has_children (DArray *d,
360 | TrieIndex s)
361 | { /* TRUE when at least one cell at BASE[s] + c, for c in [0, max_c], names s as its parent. */
362 | TrieIndex base;
363 | TrieIndex c, max_c;
364 |
365 | base = da_get_base (d, s);
366 | if (TRIE_INDEX_ERROR == base || base < 0)
367 | return FALSE; /* invalid state, or a state without a positive BASE */
368 |
369 | max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - base); /* largest offset that cannot overflow base + c */
370 | for (c = 0; c <= max_c; c++) { /* inclusive bound: max_c itself is a valid symbol (assumes TRIE_CHAR_MAX is the largest TrieChar) */
371 | if (da_get_check (d, base + c) == s)
372 | return TRUE;
373 | }
374 |
375 | return FALSE;
376 | }
377 |
378 | static Symbols *
379 | da_output_symbols (const DArray *d,
380 | TrieIndex s)
381 | { /* Collect, in ascending order, the symbols labelling the outgoing edges of s. The caller frees the result with symbols_free(). */
382 | Symbols *syms;
383 | TrieIndex base;
384 | TrieIndex c, max_c;
385 |
386 | syms = symbols_new ();
387 |
388 | base = da_get_base (d, s);
389 | max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - base); /* largest offset that cannot overflow base + c */
390 | for (c = 0; c <= max_c; c++) { /* inclusive bound: max_c itself is a valid symbol (assumes TRIE_CHAR_MAX is the largest TrieChar) */
391 | if (da_get_check (d, base + c) == s)
392 | symbols_add_fast (syms, (TrieChar) c); /* c only increases, so a plain append keeps the set sorted */
393 | }
394 |
395 | return syms;
396 | }
397 |
398 | static TrieChar *
399 | da_get_state_key (const DArray *d,
400 | TrieIndex state)
401 | { /* Build the key leading from the root to state by following CHECK links upward. Returns a malloc'ed, NUL-terminated string the caller must free, or NULL when memory runs out. */
402 | TrieChar *key, *grown;
403 | int key_size, key_length;
404 | int i;
405 |
406 | key_size = 20;
407 | key_length = 0;
408 | key = (TrieChar *) malloc (key_size);
409 | if (!key)
410 | return NULL;
411 | while (da_get_root (d) != state) { /* trace back to root */
412 | TrieIndex parent;
413 | if (key_length + 1 >= key_size) { /* always keep room for the terminator */
414 | grown = (TrieChar *) realloc (key, key_size + 20);
415 | if (!grown) { free (key); return NULL; } /* realloc left key intact, so release it here */
416 | key = grown;
417 | key_size += 20;
418 | }
419 | parent = da_get_check (d, state);
420 | key[key_length++] = (TrieChar) (state - da_get_base (d, parent)); /* the edge symbol is the offset from the parent's BASE */
421 | state = parent;
422 | }
423 | key[key_length] = '\0';
424 |
425 | /* reverse the string */
426 | for (i = 0; i < --key_length; i++) {
427 | TrieChar temp;
428 |
429 | temp = key[i];
430 | key[i] = key[key_length];
431 | key[key_length] = temp;
432 | }
433 | return key;
434 | }
435 |
436 | static TrieIndex
437 | da_find_free_base (DArray *d,
438 | const Symbols *symbols)
439 | { /* Find a base such that base + sym is a free cell for every sym in symbols. Returns TRIE_INDEX_ERROR when the pool cannot be extended. */
440 | TrieChar first_sym;
441 | TrieIndex s;
442 |
443 | /* find first free cell that is beyond the first symbol */
444 | first_sym = symbols_get (symbols, 0); /* read unconditionally: symbols is expected to hold at least one symbol */
445 | s = -da_get_check (d, da_get_free_list (d)); /* first cell on the free list */
446 | while (s != da_get_free_list (d)
447 | && s < (TrieIndex) first_sym + DA_POOL_BEGIN)
448 | {
449 | s = -da_get_check (d, s); /* advance to the next free cell */
450 | }
451 | if (s == da_get_free_list (d)) { /* free list exhausted: probe cell by cell, growing the pool as needed */
452 | for (s = first_sym + DA_POOL_BEGIN; ; ++s) {
453 | if (!da_extend_pool (d, s))
454 | return TRIE_INDEX_ERROR;
455 | if (da_get_check (d, s) < 0)
456 | break;
457 | }
458 | }
459 |
460 | /* search for next free cell that fits the symbols set */
461 | while (!da_fit_symbols (d, s - first_sym, symbols)) {
462 | /* extend pool before getting exhausted */
463 | if (-da_get_check (d, s) == da_get_free_list (d)) {
464 | if (!da_extend_pool (d, d->num_cells))
465 | return TRIE_INDEX_ERROR;
466 | }
467 |
468 | s = -da_get_check (d, s);
469 | }
470 |
471 | return s - first_sym; /* s is the cell for the first symbol, so the base lies first_sym below it */
472 | }
473 |
474 | static Bool
475 | da_fit_symbols (DArray *d,
476 | TrieIndex base,
477 | const Symbols *symbols)
478 | { /* TRUE when base + sym is a free cell for every sym in symbols. */
479 | int i;
480 |
481 | for (i = 0; i < symbols_num (symbols); i++) {
482 | TrieChar sym = symbols_get (symbols, i);
483 |
484 | /* if (base + sym) > TRIE_INDEX_MAX which means it's overflow,
485 | * or cell [base + sym] is not free, the symbol is not fit.
486 | */
487 | if (base > TRIE_INDEX_MAX - sym || !da_check_free_cell (d, base + sym))
488 | return FALSE;
489 | }
490 | return TRUE;
491 | }
492 |
493 | static void
494 | da_relocate_base (DArray *d,
495 | TrieIndex s,
496 | TrieIndex new_base)
497 | { /* Move every child of s from BASE[s] + sym to new_base + sym, re-parent the grandchildren, then set BASE[s] to new_base. */
498 | TrieIndex old_base;
499 | Symbols *symbols;
500 | int i;
501 |
502 | old_base = da_get_base (d, s);
503 | symbols = da_output_symbols (d, s);
504 |
505 | for (i = 0; i < symbols_num (symbols); i++) {
506 | TrieIndex old_next, new_next, old_next_base;
507 |
508 | old_next = old_base + symbols_get (symbols, i);
509 | new_next = new_base + symbols_get (symbols, i);
510 | old_next_base = da_get_base (d, old_next);
511 |
512 | /* allocate new next node and copy BASE value */
513 | da_alloc_cell (d, new_next);
514 | da_set_check (d, new_next, s);
515 | da_set_base (d, new_next, old_next_base);
516 |
517 | /* old_next node is now moved to new_next
518 | * so, all cells belonging to old_next
519 | * must be given to new_next
520 | */
521 | /* preventing the case of TAIL pointer */
522 | if (old_next_base > 0) {
523 | TrieIndex c, max_c;
524 |
525 | max_c = MIN_VAL (TRIE_CHAR_MAX, TRIE_INDEX_MAX - old_next_base);
526 | for (c = 0; c <= max_c; c++) { /* inclusive bound so the child at offset max_c is re-parented too */
527 | if (da_get_check (d, old_next_base + c) == old_next)
528 | da_set_check (d, old_next_base + c, new_next);
529 | }
530 | }
531 |
532 | /* free old_next node */
533 | da_free_cell (d, old_next);
534 | }
535 |
536 | symbols_free (symbols);
537 |
538 | /* finally, make BASE[s] point to new_base */
539 | da_set_base (d, s, new_base);
540 | }
541 |
542 | static Bool
543 | da_extend_pool (DArray *d,
544 | TrieIndex to_index)
545 | { /* Make sure cell to_index exists, growing the array and chaining the new cells onto the free list. Returns FALSE for an invalid index or when memory runs out. */
546 | TrieIndex new_begin;
547 | TrieIndex i;
548 | TrieIndex free_tail;
549 | DACell *new_cells;
550 | if (to_index <= 0 || TRIE_INDEX_MAX <= to_index)
551 | return FALSE;
552 |
553 | if (to_index < d->num_cells)
554 | return TRUE; /* already large enough */
555 | new_cells = (DACell *) realloc (d->cells, (to_index + 1) * sizeof (DACell));
556 | if (!new_cells)
557 | return FALSE; /* d->cells is still valid and unchanged */
558 | d->cells = new_cells;
559 | new_begin = d->num_cells;
560 | d->num_cells = to_index + 1;
561 | for (i = new_begin; i < to_index; i++) { /* initialize new free list */
562 | da_set_check (d, i, -(i + 1));
563 | da_set_base (d, i + 1, -i);
564 | }
565 |
566 | /* merge the new circular list to the old */
567 | free_tail = -da_get_base (d, da_get_free_list (d));
568 | da_set_check (d, free_tail, -new_begin);
569 | da_set_base (d, new_begin, -free_tail);
570 | da_set_check (d, to_index, -da_get_free_list (d));
571 | da_set_base (d, da_get_free_list (d), -to_index);
572 |
573 | /* update header cell */
574 | d->cells[0].check = d->num_cells;
575 |
576 | return TRUE;
577 | }
578 |
579 | void
580 | da_prune (DArray *d, TrieIndex s)
581 | { /* Prune the branch ending at s, going no further up than the root state. */
582 | da_prune_upto (d, da_get_root (d), s);
583 | }
584 |
585 | void
586 | da_prune_upto (DArray *d, TrieIndex p, TrieIndex s)
587 | { /* Free s and then each ancestor in turn, stopping at p or at the first state that still has children. */
588 | while (p != s && !da_has_children (d, s)) {
589 | TrieIndex parent;
590 |
591 | parent = da_get_check (d, s); /* remember the parent before the cell is recycled */
592 | da_free_cell (d, s);
593 | s = parent;
594 | }
595 | }
596 |
597 | static void
598 | da_alloc_cell (DArray *d,
599 | TrieIndex cell)
600 | { /* Unlink cell from the free list; assumes cell is currently on that list (not checked here). */
601 | TrieIndex prev, next;
602 |
603 | prev = -da_get_base (d, cell); /* a free cell keeps minus its predecessor in BASE ... */
604 | next = -da_get_check (d, cell); /* ... and minus its successor in CHECK */
605 |
606 | /* remove the cell from free list */
607 | da_set_check (d, prev, -next);
608 | da_set_base (d, next, -prev);
609 | }
610 |
611 | static void
612 | da_free_cell (DArray *d,
613 | TrieIndex cell)
614 | { /* Put cell back on the free list, keeping the list ordered by cell index. */
615 | TrieIndex i, prev;
616 |
617 | /* find insertion point */
618 | i = -da_get_check (d, da_get_free_list (d));
619 | while (i != da_get_free_list (d) && i < cell) /* stop at the first free cell not below cell, or back at the list head */
620 | i = -da_get_check (d, i);
621 |
622 | prev = -da_get_base (d, i);
623 |
624 | /* insert cell before i */
625 | da_set_check (d, cell, -i);
626 | da_set_base (d, cell, -prev);
627 | da_set_check (d, prev, -cell);
628 | da_set_base (d, i, -cell);
629 | }
630 |
631 | Bool
632 | da_enumerate (const DArray *d, DAEnumFunc enum_func, void *user_data)
633 | { /* Enumerate the entries below the root state; returns what da_enumerate_recursive() returns. */
634 | return da_enumerate_recursive (d, da_get_root (d), enum_func, user_data);
635 | }
636 |
637 | static Bool
638 | da_enumerate_recursive (const DArray *d,
639 | TrieIndex state,
640 | DAEnumFunc enum_func,
641 | void *user_data)
642 | { /* Depth-first walk from state, calling enum_func on every state whose BASE is negative. Returns FALSE once enum_func returns FALSE or memory runs out. */
643 | Bool ret;
644 | TrieIndex base;
645 |
646 | base = da_get_base (d, state);
647 | if (base < 0) { /* negative BASE: this state ends the double-array part of the key */
648 | TrieChar *key;
649 | key = da_get_state_key (d, state);
650 | if (!key)
651 | return FALSE; /* no key could be built: stop the enumeration */
652 | ret = (*enum_func) (key, state, user_data);
653 | free (key);
654 | } else {
655 | Symbols *symbols;
656 | int i;
657 | ret = TRUE;
658 | symbols = da_output_symbols (d, state);
659 | if (!symbols)
660 | return FALSE; /* out of memory: stop the enumeration */
661 | for (i = 0; ret && i < symbols_num (symbols); i++) {
662 | ret = da_enumerate_recursive (d, base + symbols_get (symbols, i),
663 | enum_func, user_data);
664 | }
665 | symbols_free (symbols);
666 | }
667 |
668 | return ret;
669 | }
670 |
671 | /*
672 | vi:ts=4:ai:expandtab
673 | */
674 |
--------------------------------------------------------------------------------
/ext/trie/darray.h:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * darray.h - Double-array trie structure
4 | * Created: 2006-08-11
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #ifndef __DARRAY_H
9 | #define __DARRAY_H
10 |
11 | #include "triedefs.h"
12 |
13 | /**
14 | * @file darray.h
15 | * @brief Double-array trie structure
16 | */
17 |
18 | /**
19 | * @brief Double-array structure type
20 | */
21 | typedef struct _DArray DArray;
22 |
23 | /**
24 | * @brief Double-array entry enumeration function
25 | *
26 | * @param key : the key of the entry, up to @a sep_node
27 | * @param sep_node : the separate node of the entry
28 | * @param user_data : user-supplied data
29 | *
30 | * @return TRUE to continue enumeration, FALSE to stop
31 | */
32 | typedef Bool (*DAEnumFunc) (const TrieChar *key,
33 | TrieIndex sep_node,
34 | void *user_data);
35 |
36 |
37 | /**
38 | * @brief Create a new double-array object
39 | *
40 | * Create a new empty double-array object.
41 | */
42 | DArray * da_new ();
43 |
44 | /**
45 | * @brief Read double-array data from file
46 | *
47 | * @param file : the file to read
48 | *
49 | * @return a pointer to the opened double-array, NULL on failure
50 | *
51 | * Read double-array data from the opened file, starting from the current
52 | * file pointer until the end of double array data block. On return, the
53 | * file pointer is left at the position after the read block.
54 | */
55 | DArray * da_read (FILE *file);
56 |
57 | /**
58 | * @brief Free double-array data
59 | *
60 | * @param d : the double-array data
61 | *
62 | * Free the given double-array data.
63 | */
64 | void da_free (DArray *d);
65 |
66 | /**
67 | * @brief Write double-array data
68 | *
69 | * @param d : the double-array data
70 | * @param file : the file to write to
71 | *
72 | * @return 0 on success, non-zero on failure
73 | *
74 | * Write double-array data to the given @a file, starting from the current
75 | * file pointer. On return, the file pointer is left after the double-array
76 | * data block.
77 | */
78 | int da_write (const DArray *d, FILE *file);
79 |
80 |
81 | /**
82 | * @brief Get root state
83 | *
84 | * @param d : the double-array data
85 | *
86 | * @return root state of the @a index set, or TRIE_INDEX_ERROR on failure
87 | *
88 | * Get root state for stepwise walking.
89 | */
90 | TrieIndex da_get_root (const DArray *d);
91 |
92 |
93 | /**
94 | * @brief Get BASE cell
95 | *
96 | * @param d : the double-array data
97 | * @param s : the double-array state to get data
98 | *
99 | * @return the BASE cell value for the given state
100 | *
101 | * Get BASE cell value for the given state.
102 | */
103 | TrieIndex da_get_base (const DArray *d, TrieIndex s);
104 |
105 | /**
106 | * @brief Get CHECK cell
107 | *
108 | * @param d : the double-array data
109 | * @param s : the double-array state to get data
110 | *
111 | * @return the CHECK cell value for the given state
112 | *
113 | * Get CHECK cell value for the given state.
114 | */
115 | TrieIndex da_get_check (const DArray *d, TrieIndex s);
116 |
117 |
118 | /**
119 | * @brief Set BASE cell
120 | *
121 | * @param d : the double-array data
122 | * @param s : the double-array state to set data
123 | * @param val : the value to set
124 | *
125 | * Set BASE cell for the given state to the given value.
126 | */
127 | void da_set_base (DArray *d, TrieIndex s, TrieIndex val);
128 |
129 | /**
130 | * @brief Set CHECK cell
131 | *
132 | * @param d : the double-array data
133 | * @param s : the double-array state to set data
134 | * @param val : the value to set
135 | *
136 | * Set CHECK cell for the given state to the given value.
137 | */
138 | void da_set_check (DArray *d, TrieIndex s, TrieIndex val);
139 |
140 | /**
141 | * @brief Walk in double-array structure
142 | *
143 | * @param d : the double-array structure
144 | * @param s : current state
145 | * @param c : the input character
146 | *
147 | * @return boolean indicating success
148 | *
149 | * Walk the double-array trie from state @a *s, using input character @a c.
150 | * If there exists an edge from @a *s with arc labeled @a c, this function
151 | * returns TRUE and @a *s is updated to the new state. Otherwise, it returns
152 | * FALSE and @a *s is left unchanged.
153 | */
154 | Bool da_walk (const DArray *d, TrieIndex *s, TrieChar c);
155 |
156 | /**
157 | * @brief Test walkability in double-array structure
158 | *
159 | * @param d : the double-array structure
160 | * @param s : current state
161 | * @param c : the input character
162 | *
163 | * @return boolean indicating walkability
164 | *
165 | * Test if there is a transition from state @a s with input character @a c.
166 | */
167 | /*
168 | Bool da_is_walkable (DArray *d, TrieIndex s, TrieChar c);
169 | */
170 | #define da_is_walkable(d,s,c) \
171 | (da_get_check ((d), da_get_base ((d), (s)) + (c)) == (s))
172 |
173 | /**
174 | * @brief Insert a branch from trie node
175 | *
176 | * @param d : the double-array structure
177 | * @param s : the state to add branch to
178 | * @param c : the character for the branch label
179 | *
180 | * @return the index of the new node
181 | *
182 | * Insert a new arc labelled with character @a c from the trie node
183 | * represented by index @a s in double-array structure @a d.
184 | * Note that it assumes that no such arc exists before inserting.
185 | */
186 | TrieIndex da_insert_branch (DArray *d, TrieIndex s, TrieChar c);
187 |
188 | /**
189 | * @brief Prune the single branch
190 | *
191 | * @param d : the double-array structure
192 | * @param s : the dangling state to prune off
193 | *
194 | * Prune off a non-separate path up from the final state @a s.
195 | * If @a s still has some children states, it does nothing. Otherwise,
196 | * it deletes the node and all its parents which become non-separate.
197 | */
198 | void da_prune (DArray *d, TrieIndex s);
199 |
200 | /**
201 | * @brief Prune the single branch up to given parent
202 | *
203 | * @param d : the double-array structure
204 | * @param p : the parent up to which to be pruned
205 | * @param s : the dangling state to prune off
206 | *
207 | * Prune off a non-separate path up from the final state @a s to the
208 | * given parent @a p. The pruning stops when either the parent @a p
209 | * is met, or a first non-separate node is found.
210 | */
211 | void da_prune_upto (DArray *d, TrieIndex p, TrieIndex s);
212 |
213 | /**
214 | * @brief Enumerate entries stored in double-array structure
215 | *
216 | * @param d : the double-array structure
217 | * @param enum_func : the callback function to be called on each separate node
218 | * @param user_data : user-supplied data to send as an argument to @a enum_func
219 | *
220 | * @return boolean value indicating whether all the keys are visited
221 | *
222 | * Enumerate all keys stored in double-array structure. For each entry, the
223 | * user-supplied @a enum_func callback function is called, with the entry key,
224 | * the separate node, and user-supplied data. Returning FALSE from such
225 | * callback will stop enumeration and return FALSE.
226 | */
227 | Bool da_enumerate (const DArray *d, DAEnumFunc enum_func, void *user_data);
228 |
229 | #endif /* __DARRAY_H */
230 |
231 | /*
232 | vi:ts=4:ai:expandtab
233 | */
234 |
--------------------------------------------------------------------------------
/ext/trie/extconf.rb:
--------------------------------------------------------------------------------
# Build configuration for the native extension: mkmf generates a Makefile
# that compiles the C sources in this directory into the 'trie' library.
require 'mkmf'

create_makefile('trie')
3 |
4 |
--------------------------------------------------------------------------------
/ext/trie/fileutils.c:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * fileutils.c - File utility functions
4 | * Created: 2006-08-15
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #include
9 | #include
10 |
11 | #include "fileutils.h"
12 |
13 | /*--------------------------------------*
14 | * INTERNAL FUNCTIONS DECLARATIONS *
15 | *--------------------------------------*/
16 |
17 | static char * make_full_path (const char *dir,
18 | const char *name,
19 | const char *ext);
20 |
21 | /* ==================== BEGIN IMPLEMENTATION PART ==================== */
22 |
23 | /*--------------------------------*
24 | * FUNCTIONS IMPLEMENTATIONS *
25 | *--------------------------------*/
26 |
/*
 * Build the path "<dir>/<name><ext>" in a freshly allocated buffer.
 *
 * Returns the new string, which the caller must release with free(),
 * or NULL when the buffer cannot be allocated.
 */
static char *
make_full_path (const char *dir, const char *name, const char *ext)
{
    size_t  size;
    char   *path;

    /* one extra byte for the '/' separator, one for the terminating '\0' */
    size = strlen (dir) + strlen (name) + strlen (ext) + 2;

    path = (char *) malloc (size);
    if (!path)
        return NULL;

    sprintf (path, "%s/%s%s", dir, name, ext);

    return path;
}
37 |
38 | FILE *
39 | file_open (const char *dir, const char *name, const char *ext, TrieIOMode mode)
40 | {
41 | const char *std_mode;
42 | char *full_path;
43 | FILE *file;
44 |
45 | if (mode & TRIE_IO_WRITE)
46 | std_mode = "r+";
47 | else
48 | std_mode = "r";
49 |
50 | full_path = make_full_path (dir, name, ext);
51 | file = fopen (full_path, std_mode);
52 | if (!file && mode & TRIE_IO_CREATE)
53 | file = fopen (full_path, "w+");
54 | free (full_path);
55 |
56 | return file;
57 | }
58 |
/*
 * Return the total length of the file in bytes, leaving the current file
 * position unchanged.
 *
 * Returns -1 if the position cannot be queried, moved to the end, or
 * restored afterwards.
 */
long
file_length (FILE *file)
{
    long    cur_pos;
    long    size;

    cur_pos = ftell (file);
    if (cur_pos < 0)
        return -1L;

    if (fseek (file, 0L, SEEK_END) != 0)
        return -1L;
    size = ftell (file);

    /* put the stream back where the caller left it */
    if (fseek (file, cur_pos, SEEK_SET) != 0)
        return -1L;

    return size;
}
74 |
75 | Bool
76 | file_read_int32 (FILE *file, int32 *o_val)
77 | {
78 | unsigned char buff[4];
79 |
80 | if (fread (buff, 4, 1, file) == 1) {
81 | *o_val = (buff[0] << 24) | (buff[1] << 16) | (buff[2] << 8) | buff[3];
82 | return TRUE;
83 | }
84 |
85 | return FALSE;
86 | }
87 |
88 | Bool
89 | file_write_int32 (FILE *file, int32 val)
90 | {
91 | unsigned char buff[4];
92 |
93 | buff[0] = (val >> 24) & 0xff;
94 | buff[1] = (val >> 16) & 0xff;
95 | buff[2] = (val >> 8) & 0xff;
96 | buff[3] = val & 0xff;
97 |
98 | return (fwrite (buff, 4, 1, file) == 1);
99 | }
100 |
101 | Bool
102 | file_read_int16 (FILE *file, int16 *o_val)
103 | {
104 | unsigned char buff[2];
105 |
106 | if (fread (buff, 2, 1, file) == 1) {
107 | *o_val = (buff[0] << 8) | buff[1];
108 | return TRUE;
109 | }
110 |
111 | return FALSE;
112 | }
113 |
114 | Bool
115 | file_write_int16 (FILE *file, int16 val)
116 | {
117 | unsigned char buff[2];
118 |
119 | buff[0] = val >> 8;
120 | buff[1] = val & 0xff;
121 |
122 | return (fwrite (buff, 2, 1, file) == 1);
123 | }
124 |
125 | Bool
126 | file_read_int8 (FILE *file, int8 *o_val)
127 | {
128 | return (fread (o_val, sizeof (int8), 1, file) == 1);
129 | }
130 |
131 | Bool
132 | file_write_int8 (FILE *file, int8 val)
133 | {
134 | return (fwrite (&val, sizeof (int8), 1, file) == 1);
135 | }
136 |
137 | Bool
138 | file_read_chars (FILE *file, char *buff, int len)
139 | {
140 | return (fread (buff, sizeof (char), len, file) == len);
141 | }
142 |
143 | Bool
144 | file_write_chars (FILE *file, const char *buff, int len)
145 | {
146 | return (fwrite (buff, sizeof (char), len, file) == len);
147 | }
148 |
149 | /*
150 | vi:ts=4:ai:expandtab
151 | */
152 |
--------------------------------------------------------------------------------
/ext/trie/fileutils.h:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * fileutils.h - File utility functions
4 | * Created: 2006-08-14
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #ifndef __FILEUTILS_H
9 | #define __FILEUTILS_H
10 |
11 | #include
12 |
13 | #include "triedefs.h"
14 |
15 | FILE * file_open (const char *dir, const char *name, const char *ext,
16 | TrieIOMode mode);
17 |
18 | long file_length (FILE *file);
19 |
20 | Bool file_read_int32 (FILE *file, int32 *o_val);
21 | Bool file_write_int32 (FILE *file, int32 val);
22 |
23 | Bool file_read_int16 (FILE *file, int16 *o_val);
24 | Bool file_write_int16 (FILE *file, int16 val);
25 |
26 | Bool file_read_int8 (FILE *file, int8 *o_val);
27 | Bool file_write_int8 (FILE *file, int8 val);
28 |
29 | Bool file_read_chars (FILE *file, char *buff, int len);
30 | Bool file_write_chars (FILE *file, const char *buff, int len);
31 |
32 | #endif /* __FILEUTILS_H */
33 |
34 | /*
35 | vi:ts=4:ai:expandtab
36 | */
37 |
--------------------------------------------------------------------------------
/ext/trie/tail.c:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * tail.c - trie tail for keeping suffixes
4 | * Created: 2006-08-15
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #include
9 | #include
10 | #include
11 |
12 | #include "tail.h"
13 | #include "fileutils.h"
14 |
15 | /*----------------------------------*
16 | * INTERNAL TYPES DECLARATIONS *
17 | *----------------------------------*/
18 |
19 | /*-----------------------------------*
20 | * PRIVATE METHODS DECLARATIONS *
21 | *-----------------------------------*/
22 |
23 | static TrieIndex tail_alloc_block (Tail *t);
24 | static void tail_free_block (Tail *t, TrieIndex block);
25 |
26 | /* ==================== BEGIN IMPLEMENTATION PART ==================== */
27 |
28 | /*------------------------------------*
29 | * INTERNAL TYPES IMPLEMENTATIONS *
30 | *------------------------------------*/
31 |
32 | /*------------------------------*
33 | * PRIVATE DATA DEFINITIONS *
34 | *------------------------------*/
35 |
36 | typedef struct {
37 | TrieIndex next_free;
38 | TrieData data;
39 | TrieChar *suffix;
40 | } TailBlock;
41 |
42 | struct _Tail {
43 | TrieIndex num_tails;
44 | TailBlock *tails;
45 | TrieIndex first_free;
46 | };
47 |
48 | /*-----------------------------*
49 | * METHODS IMPLEMENTATIONS *
50 | *-----------------------------*/
51 |
52 | #define TAIL_SIGNATURE 0xDFFCDFFC
53 | #define TAIL_START_BLOCKNO 1
54 |
55 | /* Tail Header:
56 | * INT32: signature
57 | * INT32: pointer to first free slot
58 | * INT32: number of tail blocks
59 | *
60 | * Tail Blocks:
61 | * INT32: pointer to next free block (-1 for allocated blocks)
62 | * INT32: data for the key
63 | * INT16: length
64 | * BYTES[length]: suffix string (no terminating '\0')
65 | */
66 |
67 | Tail *
68 | tail_new ()
69 | {
70 | Tail *t;
71 |
72 | t = (Tail *) malloc (sizeof (Tail));
73 | if (!t)
74 | return NULL;
75 |
76 | t->first_free = 0;
77 | t->num_tails = 0;
78 | t->tails = NULL;
79 |
80 | return t;
81 | }
82 |
83 | Tail *
84 | tail_read (FILE *file)
85 | {
86 | long save_pos;
87 | Tail *t;
88 | TrieIndex i;
89 | uint32 sig;
90 |
91 | /* check signature */
92 | save_pos = ftell (file);
93 | if (!file_read_int32 (file, (int32 *) &sig) || TAIL_SIGNATURE != sig) {
94 | fseek (file, save_pos, SEEK_SET);
95 | return NULL;
96 | }
97 |
98 | t = (Tail *) malloc (sizeof (Tail));
99 | if (!t)
100 | return NULL;
101 |
102 | file_read_int32 (file, &t->first_free);
103 | file_read_int32 (file, &t->num_tails);
104 | t->tails = (TailBlock *) malloc (t->num_tails * sizeof (TailBlock));
105 | if (!t->tails)
106 | goto exit_tail_created;
107 | for (i = 0; i < t->num_tails; i++) {
108 | int16 length;
109 |
110 | file_read_int32 (file, &t->tails[i].next_free);
111 | file_read_int32 (file, &t->tails[i].data);
112 |
113 | file_read_int16 (file, &length);
114 | t->tails[i].suffix = (TrieChar *) malloc (length + 1);
115 | if (length > 0)
116 | file_read_chars (file, (char *)t->tails[i].suffix, length);
117 | t->tails[i].suffix[length] = '\0';
118 | }
119 |
120 | return t;
121 |
122 | exit_tail_created:
123 | free (t);
124 | return NULL;
125 | }
126 |
127 | void
128 | tail_free (Tail *t)
129 | {
130 | TrieIndex i;
131 |
132 | if (t->tails) {
133 | for (i = 0; i < t->num_tails; i++)
134 | if (t->tails[i].suffix)
135 | free (t->tails[i].suffix);
136 | free (t->tails);
137 | }
138 | free (t);
139 | }
140 |
141 | int
142 | tail_write (const Tail *t, FILE *file)
143 | {
144 | TrieIndex i;
145 |
146 | if (!file_write_int32 (file, TAIL_SIGNATURE) ||
147 | !file_write_int32 (file, t->first_free) ||
148 | !file_write_int32 (file, t->num_tails))
149 | {
150 | return -1;
151 | }
152 | for (i = 0; i < t->num_tails; i++) {
153 | int16 length;
154 |
155 | if (!file_write_int32 (file, t->tails[i].next_free) ||
156 | !file_write_int32 (file, t->tails[i].data))
157 | {
158 | return -1;
159 | }
160 |
161 | length = t->tails[i].suffix ? strlen ((const char *)t->tails[i].suffix)
162 | : 0;
163 | if (!file_write_int16 (file, length))
164 | return -1;
165 | if (length > 0 &&
166 | !file_write_chars (file, (char *)t->tails[i].suffix, length))
167 | {
168 | return -1;
169 | }
170 | }
171 |
172 | return 0;
173 | }
174 |
175 |
176 | const TrieChar *
177 | tail_get_suffix (const Tail *t, TrieIndex index)
178 | {
179 | index -= TAIL_START_BLOCKNO;
180 | return (index < t->num_tails) ? t->tails[index].suffix : NULL;
181 | }
182 |
183 | Bool
184 | tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix)
185 | {
186 | index -= TAIL_START_BLOCKNO;
187 | if (index < t->num_tails) {
188 | /* suffix and t->tails[index].suffix may overlap;
189 | * so, dup it before it's overwritten
190 | */
191 | TrieChar *tmp = NULL;
192 | if (suffix)
193 | tmp = (TrieChar *) strdup ((const char *)suffix);
194 | if (t->tails[index].suffix)
195 | free (t->tails[index].suffix);
196 | t->tails[index].suffix = tmp;
197 |
198 | return TRUE;
199 | }
200 | return FALSE;
201 | }
202 |
203 | TrieIndex
204 | tail_add_suffix (Tail *t, const TrieChar *suffix)
205 | {
206 | TrieIndex new_block;
207 |
208 | new_block = tail_alloc_block (t);
209 | tail_set_suffix (t, new_block, suffix);
210 |
211 | return new_block;
212 | }
213 |
214 | static TrieIndex
215 | tail_alloc_block (Tail *t)
216 | {
217 | TrieIndex block;
218 |
219 | if (0 != t->first_free) {
220 | block = t->first_free;
221 | t->first_free = t->tails[block].next_free;
222 | } else {
223 | block = t->num_tails;
224 | t->tails = (TailBlock *) realloc (t->tails,
225 | ++t->num_tails * sizeof (TailBlock));
226 | }
227 | t->tails[block].next_free = -1;
228 | t->tails[block].data = TRIE_DATA_ERROR;
229 | t->tails[block].suffix = NULL;
230 |
231 | return block + TAIL_START_BLOCKNO;
232 | }
233 |
234 | static void
235 | tail_free_block (Tail *t, TrieIndex block)
236 | {
237 | TrieIndex i, j;
238 |
239 | block -= TAIL_START_BLOCKNO;
240 |
241 | if (block >= t->num_tails)
242 | return;
243 |
244 | t->tails[block].data = TRIE_DATA_ERROR;
245 | if (NULL != t->tails[block].suffix) {
246 | free (t->tails[block].suffix);
247 | t->tails[block].suffix = NULL;
248 | }
249 |
250 | /* find insertion point */
251 | j = 0;
252 | for (i = t->first_free; i != 0 && i < block; i = t->tails[i].next_free)
253 | j = i;
254 |
255 | /* insert free block between j and i */
256 | t->tails[block].next_free = i;
257 | if (0 != j)
258 | t->tails[j].next_free = block;
259 | else
260 | t->first_free = block;
261 | }
262 |
263 | TrieData
264 | tail_get_data (const Tail *t, TrieIndex index)
265 | {
266 | index -= TAIL_START_BLOCKNO;
267 | return (index < t->num_tails) ? t->tails[index].data : TRIE_DATA_ERROR;
268 | }
269 |
270 | Bool
271 | tail_set_data (Tail *t, TrieIndex index, TrieData data)
272 | {
273 | index -= TAIL_START_BLOCKNO;
274 | if (index < t->num_tails) {
275 | t->tails[index].data = data;
276 | return TRUE;
277 | }
278 | return FALSE;
279 | }
280 |
281 | void
282 | tail_delete (Tail *t, TrieIndex index)
283 | {
284 | tail_free_block (t, index);
285 | }
286 |
287 | int
288 | tail_walk_str (const Tail *t,
289 | TrieIndex s,
290 | short *suffix_idx,
291 | const TrieChar *str,
292 | int len)
293 | {
294 | const TrieChar *suffix;
295 | int i;
296 | short j;
297 |
298 | suffix = tail_get_suffix (t, s);
299 | if (!suffix)
300 | return FALSE;
301 |
302 | i = 0; j = *suffix_idx;
303 | while (i < len) {
304 | if (str[i] != suffix[j])
305 | break;
306 | ++i;
307 | /* stop and stay at null-terminator */
308 | if (0 == suffix[j])
309 | break;
310 | ++j;
311 | }
312 | *suffix_idx = j;
313 | return i;
314 | }
315 |
316 | Bool
317 | tail_walk_char (const Tail *t,
318 | TrieIndex s,
319 | short *suffix_idx,
320 | TrieChar c)
321 | {
322 | const TrieChar *suffix;
323 | TrieChar suffix_char;
324 |
325 | suffix = tail_get_suffix (t, s);
326 | if (!suffix)
327 | return FALSE;
328 |
329 | suffix_char = suffix[*suffix_idx];
330 | if (suffix_char == c) {
331 | if (0 != suffix_char)
332 | ++*suffix_idx;
333 | return TRUE;
334 | }
335 | return FALSE;
336 | }
337 |
338 | /*
339 | vi:ts=4:ai:expandtab
340 | */
341 |
--------------------------------------------------------------------------------
/ext/trie/tail.h:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * tail.h - trie tail for keeping suffixes
4 | * Created: 2006-08-12
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #ifndef __TAIL_H
9 | #define __TAIL_H
10 |
11 | #include "triedefs.h"
12 |
13 | /**
14 | * @file tail.h
15 | * @brief trie tail for keeping suffixes
16 | */
17 |
18 | /**
19 | * @brief Tail structure type
20 | */
21 | typedef struct _Tail Tail;
22 |
23 | /**
24 | * @brief Create a new tail object
25 | *
26 | * Create a new empty tail object.
27 | */
28 | Tail * tail_new ();
29 |
30 | /**
31 | * @brief Read tail data from file
32 | *
33 | * @param file : the file to read
34 | *
35 | * @return a pointer to the opened tail data, NULL on failure
36 | *
37 | * Read tail data from the opened file, starting from the current
38 | * file pointer until the end of tail data block. On return, the
39 | * file pointer is left at the position after the read block.
40 | */
41 | Tail * tail_read (FILE *file);
42 |
43 | /**
44 | * @brief Free tail data
45 | *
46 | * @param t : the tail data
47 | *
48 | * This function returns no value.
49 | *
50 | * Free the given tail data.
51 | */
52 | void tail_free (Tail *t);
53 |
54 | /**
55 | * @brief Write tail data
56 | *
57 | * @param t : the tail data
58 | * @param file : the file to write to
59 | *
60 | * @return 0 on success, non-zero on failure
61 | *
62 | * Write tail data to the given @a file, starting from the current file
63 | * pointer. On return, the file pointer is left after the tail data block.
64 | */
65 | int tail_write (const Tail *t, FILE *file);
66 |
67 |
68 | /**
69 | * @brief Get suffix
70 | *
71 | * @param t : the tail data
72 | * @param index : the index of the suffix
73 | *
74 | * @return the indexed suffix, or NULL if @a index is out of range.
75 | *
76 | * Get suffix from tail with given @a index. The returned string is owned by
77 | * the tail data; the caller must not modify or free it.
78 | */
79 | const TrieChar * tail_get_suffix (const Tail *t, TrieIndex index);
80 |
81 | /**
82 | * @brief Set suffix of existing entry
83 | *
84 | * @param t : the tail data
85 | * @param index : the index of the suffix
86 | * @param suffix : the new suffix
87 | *
88 | * Set suffix of existing entry of given @a index in tail.
89 | */
90 | Bool tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix);
91 |
92 | /**
93 | * @brief Add a new suffix
94 | *
95 | * @param t : the tail data
96 | * @param suffix : the new suffix
97 | *
98 | * @return the index of the newly added suffix.
99 | *
100 | * Add a new suffix entry to tail.
101 | */
102 | TrieIndex tail_add_suffix (Tail *t, const TrieChar *suffix);
103 |
104 | /**
105 | * @brief Get data associated to suffix entry
106 | *
107 | * @param t : the tail data
108 | * @param index : the index of the suffix
109 | *
110 | * @return the data associated to the suffix entry
111 | *
112 | * Get data associated to suffix entry @a index in tail data.
113 | */
114 | TrieData tail_get_data (const Tail *t, TrieIndex index);
115 |
116 | /**
117 | * @brief Set data associated to suffix entry
118 | *
119 | * @param t : the tail data
120 | * @param index : the index of the suffix
121 | * @param data : the data to set
122 | *
123 | * @return boolean indicating success
124 | *
125 | * Set data associated to suffix entry @a index in tail data.
126 | */
127 | Bool tail_set_data (Tail *t, TrieIndex index, TrieData data);
128 |
129 | /**
130 | * @brief Delete suffix entry
131 | *
132 | * @param t : the tail data
133 | * @param index : the index of the suffix to delete
134 | *
135 | * Delete suffix entry from the tail data.
136 | */
137 | void tail_delete (Tail *t, TrieIndex index);
138 |
139 | /**
140 | * @brief Walk in tail with a string
141 | *
142 | * @param t : the tail data
143 | * @param s : the tail data index
144 | * @param suffix_idx : pointer to current character index in suffix
145 | * @param str : the string to use in walking
146 | * @param len : total characters in @a str to walk
147 | *
148 | * @return total number of characters successfully walked
149 | *
150 | * Walk in the tail data @a t at entry @a s, from given character position
151 | * @a *suffix_idx, using @a len characters of given string @a str. On return,
152 | * @a *suffix_idx is updated to the position after the last successful walk,
153 | * and the function returns the total number of characters successfully walked.
154 | */
155 | int tail_walk_str (const Tail *t,
156 | TrieIndex s,
157 | short *suffix_idx,
158 | const TrieChar *str,
159 | int len);
160 |
161 | /**
162 | * @brief Walk in tail with a character
163 | *
164 | * @param t : the tail data
165 | * @param s : the tail data index
166 | * @param suffix_idx : pointer to current character index in suffix
167 | * @param c : the character to use in walking
168 | *
169 | * @return boolean indicating success
170 | *
171 | * Walk in the tail data @a t at entry @a s, from given character position
172 | * @a *suffix_idx, using given character @a c. If the walk is successful,
173 | * it returns TRUE, and @a *suffix_idx is updated to the next character.
174 | * Otherwise, it returns FALSE, and @a *suffix_idx is left unchanged.
175 | */
176 | Bool tail_walk_char (const Tail *t,
177 | TrieIndex s,
178 | short *suffix_idx,
179 | TrieChar c);
180 |
181 | /**
182 | * @brief Test walkability in tail with a character
183 | *
184 | * @param t : the tail data
185 | * @param s : the tail data index
186 | * @param suffix_idx : current character index in suffix
187 | * @param c : the character to test walkability
188 | *
189 | * @return boolean indicating walkability
190 | *
191 | * Test if the character @a c can be used to walk from given character
192 | * position @a suffix_idx of entry @a s of the tail data @a t.
193 | */
194 | /*
195 | Bool tail_is_walkable_char (Tail *t,
196 | TrieIndex s,
197 | short suffix_idx,
198 | const TrieChar c);
199 | */
200 | #define tail_is_walkable_char(t,s,suffix_idx,c) \
201 | (tail_get_suffix ((t), (s)) [suffix_idx] == (c))
202 |
203 | #endif /* __TAIL_H */
204 |
205 | /*
206 | vi:ts=4:ai:expandtab
207 | */
208 |
--------------------------------------------------------------------------------
/ext/trie/trie-private.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "darray.h"
5 | #include "tail.h"
6 | #include "trie.h"
7 |
8 | Trie* trie_new() {
9 | Trie *trie = (Trie*) malloc(sizeof(Trie));
10 | trie->da = da_new();
11 | trie->tail = tail_new();
12 | return trie;
13 | }
14 |
15 | void trie_free(Trie *trie) {
16 | da_free(trie->da);
17 | tail_free(trie->tail);
18 | free(trie);
19 | }
20 |
21 | static Bool trie_branch_in_branch (Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) {
22 | TrieIndex new_da, new_tail;
23 |
24 | new_da = da_insert_branch (trie->da, sep_node, *suffix);
25 | if (TRIE_INDEX_ERROR == new_da)
26 | return FALSE;
27 |
28 | if ('\0' != *suffix)
29 | ++suffix;
30 |
31 | new_tail = tail_add_suffix (trie->tail, suffix);
32 | tail_set_data (trie->tail, new_tail, data);
33 | trie_da_set_tail_index (trie->da, new_da, new_tail);
34 |
35 | // trie->is_dirty = TRUE;
36 | return TRUE;
37 | }
38 |
39 | static Bool trie_branch_in_tail(Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data) {
40 | TrieIndex old_tail, old_da, s;
41 | const TrieChar *old_suffix, *p;
42 |
43 | /* adjust separate point in old path */
44 | old_tail = trie_da_get_tail_index (trie->da, sep_node);
45 | old_suffix = tail_get_suffix (trie->tail, old_tail);
46 | if (!old_suffix)
47 | return FALSE;
48 |
49 | for (p = old_suffix, s = sep_node; *p == *suffix; p++, suffix++) {
50 | TrieIndex t = da_insert_branch (trie->da, s, *p);
51 | if (TRIE_INDEX_ERROR == t)
52 | goto fail;
53 | s = t;
54 | }
55 |
56 | old_da = da_insert_branch (trie->da, s, *p);
57 | if (TRIE_INDEX_ERROR == old_da)
58 | goto fail;
59 |
60 | if ('\0' != *p)
61 | ++p;
62 | tail_set_suffix (trie->tail, old_tail, p);
63 | trie_da_set_tail_index (trie->da, old_da, old_tail);
64 |
65 | /* insert the new branch at the new separate point */
66 | return trie_branch_in_branch (trie, s, suffix, data);
67 |
68 | fail:
69 | /* failed, undo previous insertions and return error */
70 | da_prune_upto (trie->da, sep_node, s);
71 | trie_da_set_tail_index (trie->da, sep_node, old_tail);
72 | return FALSE;
73 | }
74 |
75 | Bool trie_store (Trie *trie, const TrieChar *key, TrieData data) {
76 | TrieIndex s, t;
77 | short suffix_idx;
78 | const TrieChar *p, *sep;
79 | size_t len;
80 |
81 | /* walk through branches */
82 | s = da_get_root (trie->da);
83 | for (p = key; !trie_da_is_separate (trie->da, s); p++) {
84 | if (!da_walk (trie->da, &s, *p))
85 | return trie_branch_in_branch (trie, s, p, data);
86 | if (0 == *p)
87 | break;
88 | }
89 |
90 | /* walk through tail */
91 | sep = p;
92 | t = trie_da_get_tail_index (trie->da, s);
93 | suffix_idx = 0;
94 | len = strlen ((const char *) p) + 1; /* including null-terminator */
95 | if (tail_walk_str (trie->tail, t, &suffix_idx, p, len) != len)
96 | return trie_branch_in_tail (trie, s, p, data);
97 |
98 | /* duplicated key, overwrite val */
99 | tail_set_data (trie->tail, t, data);
100 | // trie->is_dirty = TRUE;
101 | return TRUE;
102 | }
103 |
104 |
105 | Bool trie_has_key (const Trie *trie, const TrieChar *key) {
106 | TrieIndex s;
107 | short suffix_idx;
108 | const TrieChar *p;
109 |
110 | /* walk through branches */
111 | s = da_get_root (trie->da);
112 | for (p = key; !trie_da_is_separate (trie->da, s); p++) {
113 | if (!da_walk (trie->da, &s, *p))
114 | return FALSE;
115 | if (0 == *p)
116 | break;
117 | }
118 |
119 | /* walk through tail */
120 | s = trie_da_get_tail_index (trie->da, s);
121 | suffix_idx = 0;
122 | for ( ; ; p++) {
123 | if (!tail_walk_char (trie->tail, s, &suffix_idx, *p))
124 | return FALSE;
125 | if (0 == *p)
126 | break;
127 | }
128 |
129 | return TRUE;
130 | }
131 |
132 |
133 | Bool trie_retrieve (const Trie *trie, const TrieChar *key, TrieData *o_data) {
134 | TrieIndex s;
135 | short suffix_idx;
136 | const TrieChar *p;
137 |
138 | /* walk through branches */
139 | s = da_get_root (trie->da);
140 | for (p = key; !trie_da_is_separate (trie->da, s); p++) {
141 | if (!da_walk (trie->da, &s, *p))
142 | return FALSE;
143 | if (0 == *p)
144 | break;
145 | }
146 |
147 | /* walk through tail */
148 | s = trie_da_get_tail_index (trie->da, s);
149 | suffix_idx = 0;
150 | for ( ; ; p++) {
151 | if (!tail_walk_char (trie->tail, s, &suffix_idx, *p))
152 | return FALSE;
153 | if (0 == *p)
154 | break;
155 | }
156 |
157 | /* found, set the val and return */
158 | if (o_data)
159 | *o_data = tail_get_data (trie->tail, s);
160 | return TRUE;
161 | }
162 |
163 | Bool trie_delete (Trie *trie, const TrieChar *key) {
164 | TrieIndex s, t;
165 | short suffix_idx;
166 | const TrieChar *p;
167 |
168 | /* walk through branches */
169 | s = da_get_root (trie->da);
170 | for (p = key; !trie_da_is_separate (trie->da, s); p++) {
171 | if (!da_walk (trie->da, &s, *p))
172 | return FALSE;
173 | if (0 == *p)
174 | break;
175 | }
176 |
177 | /* walk through tail */
178 | t = trie_da_get_tail_index (trie->da, s);
179 | suffix_idx = 0;
180 | for ( ; ; p++) {
181 | if (!tail_walk_char (trie->tail, t, &suffix_idx, *p))
182 | return FALSE;
183 | if (0 == *p)
184 | break;
185 | }
186 |
187 | tail_delete (trie->tail, t);
188 | da_set_base (trie->da, s, TRIE_INDEX_ERROR);
189 | da_prune (trie->da, s);
190 |
191 | //trie->is_dirty = TRUE;
192 | return TRUE;
193 | }
194 |
195 | /*-------------------------------*
196 | * STEPWISE QUERY OPERATIONS *
197 | *-------------------------------*/
198 |
199 | TrieState * trie_root (const Trie *trie) {
200 | return trie_state_new (trie, da_get_root (trie->da), 0, FALSE);
201 | }
202 |
203 | /*----------------*
204 | * TRIE STATE *
205 | *----------------*/
206 |
207 | static TrieState * trie_state_new (const Trie *trie, TrieIndex index, short suffix_idx, short is_suffix) {
208 | TrieState *s;
209 |
210 | s = (TrieState *) malloc (sizeof (TrieState));
211 | if (!s)
212 | return NULL;
213 |
214 | s->trie = trie;
215 | s->index = index;
216 | s->suffix_idx = suffix_idx;
217 | s->is_suffix = is_suffix;
218 |
219 | return s;
220 | }
221 |
222 | TrieState * trie_state_clone (const TrieState *s) {
223 | return trie_state_new (s->trie, s->index, s->suffix_idx, s->is_suffix);
224 | }
225 |
226 | void trie_state_free (TrieState *s) {
227 | free (s);
228 | }
229 |
230 | void trie_state_rewind (TrieState *s) {
231 | s->index = da_get_root (s->trie->da);
232 | s->is_suffix = FALSE;
233 | }
234 |
235 | Bool trie_state_walk (TrieState *s, TrieChar c) {
236 | if (!s->is_suffix) {
237 | Bool ret;
238 |
239 | ret = da_walk (s->trie->da, &s->index, c);
240 |
241 | if (ret && trie_da_is_separate (s->trie->da, s->index)) {
242 | s->index = trie_da_get_tail_index (s->trie->da, s->index);
243 | s->suffix_idx = 0;
244 | s->is_suffix = TRUE;
245 | }
246 |
247 | return ret;
248 | } else {
249 | return tail_walk_char (s->trie->tail, s->index, &s->suffix_idx, c);
250 | }
251 | }
252 |
253 | Bool trie_state_is_walkable (const TrieState *s, TrieChar c) {
254 | if (!s->is_suffix)
255 | return da_is_walkable (s->trie->da, s->index, c);
256 | else
257 | return tail_is_walkable_char (s->trie->tail, s->index, s->suffix_idx, c);
258 | }
259 |
260 | Bool trie_state_is_leaf (const TrieState *s) {
261 | return s->is_suffix && trie_state_is_terminal (s);
262 | }
263 |
264 | TrieData trie_state_get_data (const TrieState *s) {
265 | return s->is_suffix ? tail_get_data (s->trie->tail, s->index) : TRIE_DATA_ERROR;
266 | }
267 |
268 | /*
269 |  * Ad-hoc smoke test: stores five overlapping keys, then looks each one up
270 |  * and prints "Win!" or "Fail!" per lookup. Exits with 1 if no trie could be
271 |  * created, 0 otherwise.
272 |  */
273 | int main(void) {
274 |     static const char *const stored[] = { "hello", "he", "hel", "h", "hell" };
275 |     static const TrieData values[] = { 1, 4, 3, 5, 2 };
276 |     static const char *const lookups[] = { "hello", "hell", "hel", "he", "h" };
277 |     TrieData data; /* lives on the stack: nothing to free afterwards */
278 |     size_t i;
279 |     Trie *trie = trie_new();
280 |
281 |     if (trie == NULL)
282 |         return 1;
283 |
284 |     for (i = 0; i < sizeof (stored) / sizeof (stored[0]); i++)
285 |         trie_store(trie, (const TrieChar*)stored[i], values[i]);
286 |
287 |     for (i = 0; i < sizeof (lookups) / sizeof (lookups[0]); i++) {
288 |         Bool res = trie_retrieve(trie, (const TrieChar*)lookups[i], &data);
289 |         /* fputs instead of printf: the text is not a format string */
290 |         fputs(res ? "Win!\n" : "Fail!\n", stdout);
291 |     }
292 |
293 |     trie_free(trie);
294 |     return 0;
295 | }
300 |
--------------------------------------------------------------------------------
/ext/trie/trie-private.h:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * trie-private.h - Private utilities for trie implementation
4 | * Created: 2007-08-25
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #ifndef __TRIE_PRIVATE_H
9 | #define __TRIE_PRIVATE_H
10 |
11 | #include "typedefs.h"
12 |
13 | /**
14 | * @file trie-private.h
15 | * @brief Private utilities for trie implementation
16 | */
17 |
18 | /**
19 | * @brief Smaller of a and b; the selected argument is evaluated twice, so avoid side effects
20 | */
21 | #define MIN_VAL(a,b) ((a)<(b)?(a):(b))
22 | /**
23 | * @brief Larger of a and b; the selected argument is evaluated twice, so avoid side effects
24 | */
25 | #define MAX_VAL(a,b) ((a)>(b)?(a):(b))
26 |
27 | #endif /* __TRIE_PRIVATE_H */
28 |
29 | /*
30 | vi:ts=4:ai:expandtab
31 | */
32 |
--------------------------------------------------------------------------------
/ext/trie/trie.c:
--------------------------------------------------------------------------------
1 | #include "ruby.h"
2 | #include "trie.h"
3 | #include <stdlib.h> /* malloc, free */
4 | #include <stdio.h> /* FILE, fopen, fclose */
5 | #include <string.h> /* memcpy */
6 |
7 | VALUE cTrie, cTrieNode;
8 |
9 | /*
10 | * Document-class: Trie
11 | *
12 | * A key-value data structure for string keys that offers efficient memory usage and fast retrieval time.
13 | *
14 | */
15 |
16 | /* Allocator for Trie: wraps a brand-new trie and registers trie_free as its free function. */
17 | static VALUE rb_trie_alloc(VALUE klass) {
18 |     Trie *fresh = trie_new();
19 |
20 |     return Data_Wrap_Struct(klass, 0, trie_free, fresh);
21 | }
21 |
22 | /* Look up the IOError class by name and raise it with the given message; does not return. */
23 | void raise_ioerror(const char * message) {
24 |     ID io_error_id = rb_intern("IOError");
25 |     VALUE io_error_class = rb_const_get(rb_cObject, io_error_id);
26 |
27 |     rb_raise(io_error_class, "%s", message);
28 | }
26 |
27 | /*
28 |  * call-seq:
29 |  *   read(filename_base) -> Trie
30 |  *
31 |  * Returns a new trie with data as read from disk, loading
32 |  * filename_base.da and filename_base.tail. Raises IOError when either file
33 |  * cannot be opened or yields no data, and TypeError when filename_base is
34 |  * not string-like.
35 |  */
36 | static VALUE rb_trie_read(VALUE self, VALUE filename_base) {
37 |     /* coerce first: rb_str_dup must only ever see a real String */
38 |     StringValue(filename_base);
39 |
40 |     VALUE da_filename = rb_str_dup(filename_base);
41 |     rb_str_concat(da_filename, rb_str_new2(".da"));
42 |
43 |     VALUE tail_filename = rb_str_dup(filename_base);
44 |     rb_str_concat(tail_filename, rb_str_new2(".tail"));
45 |
46 |     Trie *trie = trie_new();
47 |
48 |     /* wrapped straight away so the trie is released by the GC if we raise below */
49 |     VALUE obj = Data_Wrap_Struct(self, 0, trie_free, trie);
50 |
51 |     /* NOTE(review): the "r" mode is text mode on Windows; kept so it matches what save writes */
52 |     FILE *da_file = fopen(StringValueCStr(da_filename), "r");
53 |     if (da_file == NULL)
54 |         raise_ioerror("Error reading .da file.");
55 |
56 |     DArray *new_da = da_read(da_file);
57 |     fclose(da_file);
58 |     /* a NULL result is treated as a failed read -- TODO confirm against da_read */
59 |     if (new_da == NULL)
60 |         raise_ioerror("Error reading .da file.");
61 |
62 |     FILE *tail_file = fopen(StringValueCStr(tail_filename), "r");
63 |     if (tail_file == NULL) {
64 |         da_free(new_da);
65 |         raise_ioerror("Error reading .tail file.");
66 |     }
67 |
68 |     Tail *new_tail = tail_read(tail_file);
69 |     fclose(tail_file);
70 |     if (new_tail == NULL) {
71 |         da_free(new_da);
72 |         raise_ioerror("Error reading .tail file.");
73 |     }
74 |
75 |     /* both parts loaded: only now swap them in, so a failure never leaves a half-loaded trie */
76 |     da_free(trie->da);
77 |     tail_free(trie->tail);
78 |     trie->da = new_da;
79 |     trie->tail = new_tail;
80 |
81 |     return obj;
82 | }
69 |
70 | /*
71 |  * call-seq:
72 |  *   has_key?(key) -> true/nil
73 |  *
74 |  * Tells whether a key exists in the Trie: true when it does, nil when it does
75 |  * not. Use this if you don't care about the value, as it is marginally
76 |  * faster than Trie#get.
77 |  *
78 |  */
79 | static VALUE rb_trie_has_key(VALUE self, VALUE key) {
80 |     Trie *trie;
81 |     TrieChar *raw_key;
82 |
83 |     StringValue(key);
84 |     Data_Get_Struct(self, Trie, trie);
85 |
86 |     raw_key = (TrieChar*)RSTRING_PTR(key);
87 |     return trie_has_key(trie, raw_key) ? Qtrue : Qnil;
88 | }
89 |
90 | /*
91 |  * call-seq:
92 |  *   get(key) -> value
93 |  *   [key] -> value
94 |  *
95 |  * Looks the key up and returns the value stored for it, or nil when the key
96 |  * is not in the Trie. The stored TrieData is handed back as a Ruby VALUE.
97 |  *
98 |  */
99 | static VALUE rb_trie_get(VALUE self, VALUE key) {
100 |     Trie *trie;
101 |     TrieData stored;
102 |
103 |     StringValue(key);
104 |     Data_Get_Struct(self, Trie, trie);
105 |
106 |     if(!trie_retrieve(trie, (TrieChar*)RSTRING_PTR(key), &stored))
107 |         return Qnil;
108 |
109 |     return (VALUE)stored;
110 | }
110 |
111 | /*
112 |  * call-seq:
113 |  *   add(key)
114 |  *   add(key,value)
115 |  *
116 |  * Stores a key, optionally with a value. Without a value, TRIE_DATA_ERROR is
117 |  * stored for the key. Returns true when the key was stored, and nil when the
118 |  * store failed or when the call had fewer than one or more than two arguments.
119 |  *
120 |  */
121 | static VALUE rb_trie_add(VALUE self, VALUE args) {
122 |     Trie *trie;
123 |     VALUE key;
124 |     TrieData value;
125 |     int argc;
126 |
127 |     Data_Get_Struct(self, Trie, trie);
128 |
129 |     /* args is the Ruby Array holding every argument of the call */
130 |     argc = RARRAY_LEN(args);
131 |     if(argc != 1 && argc != 2)
132 |         return Qnil;
133 |
134 |     key = RARRAY_PTR(args)[0];
135 |     StringValue(key);
136 |
137 |     if(argc == 2)
138 |         value = RARRAY_PTR(args)[1];
139 |     else
140 |         value = TRIE_DATA_ERROR;
141 |
142 |     return trie_store(trie, (TrieChar*)RSTRING_PTR(key), value) ? Qtrue : Qnil;
143 | }
138 |
139 | /*
140 |  * call-seq:
141 |  *   delete(key)
142 |  *
143 |  * Removes a key from the Trie. Returns true if a key was deleted, nil otherwise.
144 |  *
145 |  */
146 | static VALUE rb_trie_delete(VALUE self, VALUE key) {
147 |     Trie *trie;
148 |     Bool removed;
149 |
150 |     StringValue(key);
151 |     Data_Get_Struct(self, Trie, trie);
152 |
153 |     removed = trie_delete(trie, (TrieChar*)RSTRING_PTR(key));
154 |     return removed ? Qtrue : Qnil;
155 | }
157 |
158 | /*
159 |  * Depth-first walk below `state`: every key found is pushed onto `children`
160 |  * as a new Ruby string. `prefix` holds the characters walked so far
161 |  * (prefix_size of them, NUL-terminated) and is extended in place, so it must
162 |  * have room for the deepest key plus the terminator.
163 |  * NOTE(review): the buffer capacity is not known here, so depth is unchecked.
164 |  * Returns `children`.
165 |  */
166 | static VALUE walk_all_paths(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
167 |     int c;
168 |     for(c = 1; c < 256; c++) {
169 |         if(!trie_state_is_walkable(state, c))
170 |             continue;
171 |
172 |         TrieState *next_state = trie_state_clone(state);
173 |         trie_state_walk(next_state, c);
174 |
175 |         prefix[prefix_size] = c;
176 |         prefix[prefix_size + 1] = 0;
177 |
178 |         /* rb_str_new2 copies the bytes, so no temporary heap copy is needed */
179 |         if(trie_state_is_terminal(next_state))
180 |             rb_ary_push(children, rb_str_new2(prefix));
181 |
182 |         walk_all_paths(trie, children, next_state, prefix, prefix_size + 1);
183 |
184 |         prefix[prefix_size] = 0;
185 |         trie_state_free(next_state);
186 |     }
187 |     return children;
188 | }
181 |
182 |
183 | /*
184 |  * Walk `state` along the NUL-terminated char_prefix. Stops and returns FALSE
185 |  * at the first character that cannot be walked (state stays where it got
186 |  * to); returns TRUE once the whole prefix has been consumed.
187 |  */
188 | static Bool traverse(TrieState *state, TrieChar *char_prefix) {
189 |     const TrieChar *cursor;
190 |
191 |     for(cursor = char_prefix; *cursor != 0; cursor++) {
192 |         if(!trie_state_is_walkable(state, *cursor))
193 |             return FALSE;
194 |         trie_state_walk(state, *cursor);
195 |     }
196 |     return TRUE;
197 | }
193 |
194 |
195 | /*
196 |  * call-seq:
197 |  *   children(prefix) -> [ key, ... ]
198 |  *
199 |  * Finds all keys in the Trie beginning with the given prefix. The prefix
200 |  * itself is included when it is a key. Returns an empty array for nil or for
201 |  * a prefix that is not in the Trie. Raises ArgumentError when a matching
202 |  * prefix does not fit the internal 1024-byte key buffer.
203 |  *
204 |  */
205 | static VALUE rb_trie_children(VALUE self, VALUE prefix) {
206 |     if(NIL_P(prefix))
207 |         return rb_ary_new();
208 |
209 |     StringValue(prefix);
210 |
211 |     Trie *trie;
212 |     Data_Get_Struct(self, Trie, trie);
213 |
214 |     char prefix_buffer[1024];
215 |     long prefix_size = RSTRING_LEN(prefix);
216 |     TrieState *state = trie_root(trie);
217 |     VALUE children = rb_ary_new();
218 |     TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix);
219 |
220 |     if(traverse(state, char_prefix)) {
221 |         /* the walker appends one character plus a NUL behind the prefix */
222 |         if(prefix_size > (long)sizeof(prefix_buffer) - 2) {
223 |             trie_state_free(state);
224 |             rb_raise(rb_eArgError, "prefix is too long");
225 |         }
226 |
227 |         if(trie_state_is_terminal(state))
228 |             rb_ary_push(children, prefix);
229 |
230 |         memcpy(prefix_buffer, char_prefix, prefix_size);
231 |         prefix_buffer[prefix_size] = 0;
232 |
233 |         walk_all_paths(trie, children, state, prefix_buffer, (int)prefix_size);
234 |     }
235 |
236 |     /* single exit: the state is released whether or not the prefix matched */
237 |     trie_state_free(state);
238 |     return children;
239 | }
232 |
233 | /*
234 |  * Depth-first search below `state` that stops at the first key it meets.
235 |  * Returns TRUE as soon as a walked-to state is terminal, FALSE when the
236 |  * whole subtree holds no key. `prefix` is scratch space extended in place
237 |  * (prefix_size characters are already in it).
238 |  */
239 | static Bool walk_all_paths_until_first_terminal(Trie *trie, TrieState *state, char *prefix, int prefix_size) {
240 |     int c;
241 |     for(c = 1; c < 256; c++) {
242 |         if(!trie_state_is_walkable(state, c))
243 |             continue;
244 |
245 |         TrieState *next_state = trie_state_clone(state);
246 |         trie_state_walk(next_state, c);
247 |
248 |         prefix[prefix_size] = c;
249 |         prefix[prefix_size + 1] = 0;
250 |
251 |         Bool found = trie_state_is_terminal(next_state)
252 |             || walk_all_paths_until_first_terminal(trie, next_state, prefix, prefix_size + 1);
253 |
254 |         /* freed on every path, including the early exit */
255 |         trie_state_free(next_state);
256 |         if(found)
257 |             return TRUE;
258 |
259 |         prefix[prefix_size] = 0;
260 |     }
261 |
262 |     return FALSE;
263 | }
261 |
262 | /*
263 |  * call-seq:
264 |  *   has_children?(prefix) -> true/false
265 |  *
266 |  * Tells whether at least one key in the Trie begins with the given prefix
267 |  * (the prefix itself counts when it is a key). Returns false for nil and for
268 |  * a prefix that is not in the Trie. Raises ArgumentError when a matching
269 |  * prefix does not fit the internal 1024-byte key buffer.
270 |  *
271 |  */
272 | static VALUE rb_trie_has_children(VALUE self, VALUE prefix) {
273 |     /* a predicate must not answer with a (truthy) empty array */
274 |     if(NIL_P(prefix))
275 |         return Qfalse;
276 |
277 |     StringValue(prefix);
278 |
279 |     Trie *trie;
280 |     Data_Get_Struct(self, Trie, trie);
281 |
282 |     char prefix_buffer[1024];
283 |     long prefix_size = RSTRING_LEN(prefix);
284 |     TrieState *state = trie_root(trie);
285 |     TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix);
286 |     Bool found = FALSE;
287 |
288 |     if(traverse(state, char_prefix)) {
289 |         if(trie_state_is_terminal(state)) {
290 |             found = TRUE;
291 |         } else {
292 |             /* the walker appends one character plus a NUL behind the prefix */
293 |             if(prefix_size > (long)sizeof(prefix_buffer) - 2) {
294 |                 trie_state_free(state);
295 |                 rb_raise(rb_eArgError, "prefix is too long");
296 |             }
297 |
298 |             memcpy(prefix_buffer, char_prefix, prefix_size);
299 |             prefix_buffer[prefix_size] = 0;
300 |
301 |             found = walk_all_paths_until_first_terminal(trie, state, prefix_buffer, (int)prefix_size);
302 |         }
303 |     }
304 |
305 |     /* single exit: the state is released on every path */
306 |     trie_state_free(state);
307 |     return found ? Qtrue : Qfalse;
308 | }
291 |
292 | /*
293 |  * Depth-first walk below `state`: for every key found, a [key, value] pair
294 |  * is pushed onto `children`. The value is read from a clone of the state
295 |  * that has been walked over the terminator. `prefix` holds the characters
296 |  * walked so far (prefix_size of them, NUL-terminated) and is extended in
297 |  * place, so it must have room for the deepest key plus the terminator.
298 |  * NOTE(review): the buffer capacity is not known here, so depth is unchecked.
299 |  * Returns `children`.
300 |  */
301 | static VALUE walk_all_paths_with_values(Trie *trie, VALUE children, TrieState *state, char *prefix, int prefix_size) {
302 |     int c;
303 |     for(c = 1; c < 256; c++) {
304 |         if(!trie_state_is_walkable(state, c))
305 |             continue;
306 |
307 |         TrieState *next_state = trie_state_clone(state);
308 |         trie_state_walk(next_state, c);
309 |
310 |         prefix[prefix_size] = c;
311 |         prefix[prefix_size + 1] = 0;
312 |
313 |         if(trie_state_is_terminal(next_state)) {
314 |             TrieState *end_state = trie_state_clone(next_state);
315 |             trie_state_walk(end_state, '\0');
316 |
317 |             /* rb_str_new2 copies the bytes, so no temporary heap copy is needed */
318 |             VALUE tuple = rb_ary_new();
319 |             rb_ary_push(tuple, rb_str_new2(prefix));
320 |
321 |             TrieData trie_data = trie_state_get_data(end_state);
322 |             rb_ary_push(tuple, (VALUE)trie_data);
323 |             rb_ary_push(children, tuple);
324 |
325 |             trie_state_free(end_state);
326 |         }
327 |
328 |         walk_all_paths_with_values(trie, children, next_state, prefix, prefix_size + 1);
329 |
330 |         prefix[prefix_size] = 0;
331 |         trie_state_free(next_state);
332 |     }
333 |     return children;
334 | }
326 |
327 | /*
328 |  * call-seq:
329 |  *   children_with_values(key) -> [ [key,value], ... ]
330 |  *
331 |  * Finds all keys with their respective values in the Trie beginning with
332 |  * the given prefix. The prefix itself is included when it is a key. Returns
333 |  * an empty array for nil or for a prefix that is not in the Trie. Raises
334 |  * ArgumentError when a matching prefix does not fit the internal 1024-byte
335 |  * key buffer.
336 |  *
337 |  */
338 | static VALUE rb_trie_children_with_values(VALUE self, VALUE prefix) {
339 |     if(NIL_P(prefix))
340 |         return rb_ary_new();
341 |
342 |     StringValue(prefix);
343 |
344 |     Trie *trie;
345 |     Data_Get_Struct(self, Trie, trie);
346 |
347 |     char prefix_buffer[1024];
348 |     long prefix_size = RSTRING_LEN(prefix);
349 |     TrieChar *char_prefix = (TrieChar*)RSTRING_PTR(prefix);
350 |
351 |     VALUE children = rb_ary_new();
352 |
353 |     TrieState *state = trie_root(trie);
354 |
355 |     if(traverse(state, char_prefix)) {
356 |         /* the walker appends one character plus a NUL behind the prefix */
357 |         if(prefix_size > (long)sizeof(prefix_buffer) - 2) {
358 |             trie_state_free(state);
359 |             rb_raise(rb_eArgError, "prefix is too long");
360 |         }
361 |
362 |         if(trie_state_is_terminal(state)) {
363 |             /* step a clone over the terminator to reach the stored data */
364 |             TrieState *end_state = trie_state_clone(state);
365 |             trie_state_walk(end_state, '\0');
366 |
367 |             VALUE tuple = rb_ary_new();
368 |             rb_ary_push(tuple, prefix);
369 |             TrieData trie_data = trie_state_get_data(end_state);
370 |             rb_ary_push(tuple, (VALUE)trie_data);
371 |             rb_ary_push(children, tuple);
372 |
373 |             trie_state_free(end_state);
374 |         }
375 |
376 |         memcpy(prefix_buffer, char_prefix, prefix_size);
377 |         prefix_buffer[prefix_size] = 0;
378 |
379 |         walk_all_paths_with_values(trie, children, state, prefix_buffer, (int)prefix_size);
380 |     }
381 |
382 |     /* single exit: the state is released whether or not the prefix matched */
383 |     trie_state_free(state);
384 |     return children;
385 | }
376 |
377 | static VALUE rb_trie_node_alloc(VALUE klass);
378 |
379 | /*
380 |  * call-seq:
381 |  *   root -> TrieNode
382 |  *
383 |  * Builds a new TrieNode positioned at the root of the Trie. Its state is nil
384 |  * and its full_state is the empty string.
385 |  *
386 |  */
387 | static VALUE rb_trie_root(VALUE self) {
388 |     Trie *trie;
389 |     VALUE node;
390 |
391 |     Data_Get_Struct(self, Trie, trie);
392 |
393 |     /* the node starts without a TrieState; attach a root cursor to it */
394 |     node = rb_trie_node_alloc(cTrieNode);
395 |     RDATA(node)->data = trie_root(trie);
396 |
397 |     rb_iv_set(node, "@state", Qnil);
398 |     rb_iv_set(node, "@full_state", rb_str_new2(""));
399 |     return node;
400 | }
399 |
400 |
401 | /*
402 | * Document-class: TrieNode
403 | *
404 | * Represents a single node in the Trie. It can be used as a cursor to walk around the Trie.
405 | * You can grab a TrieNode for the root of the Trie by using Trie#root.
406 | *
407 | */
408 |
409 | /* Allocator for TrieNode: wraps a NULL TrieState for now, with trie_state_free as the free function. */
410 | static VALUE rb_trie_node_alloc(VALUE klass) {
411 |     return Data_Wrap_Struct(klass, 0, trie_state_free, NULL);
412 | }
414 |
415 | /* nodoc */
416 | static VALUE rb_trie_node_initialize_copy(VALUE self, VALUE from) {
417 |     VALUE state;
418 |     VALUE full_state;
419 |
420 |     /* the copy gets its own TrieState, so the two nodes walk independently */
421 |     RDATA(self)->data = trie_state_clone(RDATA(from)->data);
422 |
423 |     /* both strings are duplicated as well, unless they are nil */
424 |     state = rb_iv_get(from, "@state");
425 |     if(state != Qnil)
426 |         state = rb_str_dup(state);
427 |     rb_iv_set(self, "@state", state);
428 |
429 |     full_state = rb_iv_get(from, "@full_state");
430 |     if(full_state != Qnil)
431 |         full_state = rb_str_dup(full_state);
432 |     rb_iv_set(self, "@full_state", full_state);
433 |
434 |     return self;
435 | }
427 |
428 | /*
429 |  * call-seq:
430 |  *   state -> single character
431 |  *
432 |  * Returns the letter the TrieNode instance points to. So, if the node is
433 |  * pointing at the "e" in "monkeys", the state is "e".
434 |  *
435 |  */
436 | static VALUE rb_trie_node_get_state(VALUE self) {
437 |     VALUE current = rb_iv_get(self, "@state");
438 |
439 |     return current;
440 | }
438 |
439 | /*
440 |  * call-seq:
441 |  *   full_state -> string
442 |  *
443 |  * Returns the full string from the root of the Trie up to this node. So if
444 |  * the node is pointing at the "e" in "monkeys", the full_state is "monke".
445 |  *
446 |  */
447 | static VALUE rb_trie_node_get_full_state(VALUE self) {
448 |     VALUE path = rb_iv_get(self, "@full_state");
449 |
450 |     return path;
451 | }
450 |
451 | /*
452 |  * call-seq:
453 |  *   walk!(letter) -> TrieNode
454 |  *
455 |  * Tries to walk down a particular branch of the Trie, modifying the node it
456 |  * is called on. Returns the node itself on success; returns nil when the
457 |  * letter is not exactly one byte long or the branch does not exist.
458 |  *
459 |  */
460 | static VALUE rb_trie_node_walk_bang(VALUE self, VALUE rchar) {
461 |     TrieState *state;
462 |     VALUE full_state;
463 |
464 |     StringValue(rchar);
465 |     Data_Get_Struct(self, TrieState, state);
466 |
467 |     if(RSTRING_LEN(rchar) != 1)
468 |         return Qnil;
469 |
470 |     if(!trie_state_walk(state, *RSTRING_PTR(rchar)))
471 |         return Qnil;
472 |
473 |     /* record the step: last letter, and the path walked so far */
474 |     rb_iv_set(self, "@state", rchar);
475 |     full_state = rb_iv_get(self, "@full_state");
476 |     rb_str_append(full_state, rchar);
477 |     rb_iv_set(self, "@full_state", full_state);
478 |     return self;
479 | }
478 |
479 | /*
480 |  * call-seq:
481 |  *   walk(letter) -> TrieNode
482 |  *
483 |  * Tries to walk down a particular branch of the Trie on a dup of the node,
484 |  * leaving the original unchanged. Returns the walked copy on success;
485 |  * returns nil when the letter is not exactly one byte long or the branch
486 |  * does not exist.
487 |  *
488 |  */
489 | static VALUE rb_trie_node_walk(VALUE self, VALUE rchar) {
490 |     TrieState *state;
491 |     VALUE new_node;
492 |     VALUE full_state;
493 |
494 |     StringValue(rchar);
495 |
496 |     /* dup goes through initialize_copy, which clones the TrieState */
497 |     new_node = rb_funcall(self, rb_intern("dup"), 0);
498 |     Data_Get_Struct(new_node, TrieState, state);
499 |
500 |     if(RSTRING_LEN(rchar) != 1)
501 |         return Qnil;
502 |
503 |     if(!trie_state_walk(state, *RSTRING_PTR(rchar)))
504 |         return Qnil;
505 |
506 |     rb_iv_set(new_node, "@state", rchar);
507 |     full_state = rb_iv_get(new_node, "@full_state");
508 |     rb_str_append(full_state, rchar);
509 |     rb_iv_set(new_node, "@full_state", full_state);
510 |     return new_node;
511 | }
509 |
510 | /*
511 |  * call-seq:
512 |  *   value
513 |  *
514 |  * Attempts to get the value at this node of the Trie. A clone of the node's
515 |  * state is walked over the terminator and its data is read; the result is
516 |  * nil when that data is TRIE_DATA_ERROR (for instance when the node is not
517 |  * the end of a key).
518 |  *
519 |  */
520 | static VALUE rb_trie_node_value(VALUE self) {
521 |     TrieState *state;
522 |     TrieState *probe;
523 |     TrieData trie_data;
524 |
525 |     Data_Get_Struct(self, TrieState, state);
526 |
527 |     /* work on a clone so the node itself does not move */
528 |     probe = trie_state_clone(state);
529 |     trie_state_walk(probe, 0);
530 |     trie_data = trie_state_get_data(probe);
531 |     trie_state_free(probe);
532 |
533 |     if(TRIE_DATA_ERROR == trie_data)
534 |         return Qnil;
535 |
536 |     return (VALUE)trie_data;
537 | }
531 |
532 | /*
533 |  * call-seq:
534 |  *   terminal? -> true/nil
535 |  *
536 |  * Returns true if this node is at the end of a key, nil otherwise. So if you
537 |  * have two keys in your Trie, "he" and "hello", and you walk all the way to
538 |  * the end of "hello", the "e" and the "o" will return true for terminal?.
539 |  *
540 |  */
541 | static VALUE rb_trie_node_terminal(VALUE self) {
542 |     TrieState *state;
543 |
544 |     Data_Get_Struct(self, TrieState, state);
545 |
546 |     if(trie_state_is_terminal(state))
547 |         return Qtrue;
548 |
549 |     return Qnil;
550 | }
546 |
547 | /*
548 |  * call-seq:
549 |  *   leaf? -> true/nil
550 |  *
551 |  * Returns true if there are no branches at this node, nil otherwise.
552 |  */
553 | static VALUE rb_trie_node_leaf(VALUE self) {
554 |     TrieState *state;
555 |
556 |     Data_Get_Struct(self, TrieState, state);
557 |
558 |     if(trie_state_is_leaf(state))
559 |         return Qtrue;
560 |
561 |     return Qnil;
562 | }
559 |
560 | /*
561 |  * call-seq:
562 |  *   save(filename_base) -> true
563 |  *
564 |  * Saves the trie data to two files, filename_base.da and filename_base.tail.
565 |  * Returns true if saving was successful. Raises IOError when a file cannot
566 |  * be opened, written or closed, and TypeError when filename_base is not
567 |  * string-like.
568 |  */
569 | static VALUE rb_trie_save(VALUE self, VALUE filename_base) {
570 |     /* coerce first: rb_str_dup must only ever see a real String */
571 |     StringValue(filename_base);
572 |
573 |     VALUE da_filename = rb_str_dup(filename_base);
574 |     rb_str_concat(da_filename, rb_str_new2(".da"));
575 |
576 |     VALUE tail_filename = rb_str_dup(filename_base);
577 |     rb_str_concat(tail_filename, rb_str_new2(".tail"));
578 |
579 |     Trie *trie;
580 |     Data_Get_Struct(self, Trie, trie);
581 |
582 |     int failed;
583 |
584 |     /* NOTE(review): the "w" mode is text mode on Windows; kept so it matches what read expects */
585 |     FILE *da_file = fopen(StringValueCStr(da_filename), "w");
586 |     if (da_file == NULL)
587 |         raise_ioerror("Error opening .da file for writing.");
588 |     failed = (da_write(trie->da, da_file) != 0);
589 |     /* close before raising so the handle is never leaked; a failed close means lost data */
590 |     if (fclose(da_file) != 0)
591 |         failed = 1;
592 |     if (failed)
593 |         raise_ioerror("Error writing DArray data.");
594 |
595 |     FILE *tail_file = fopen(StringValueCStr(tail_filename), "w");
596 |     if (tail_file == NULL)
597 |         raise_ioerror("Error opening .tail file for writing.");
598 |     failed = (tail_write(trie->tail, tail_file) != 0);
599 |     if (fclose(tail_file) != 0)
600 |         failed = 1;
601 |     if (failed)
602 |         raise_ioerror("Error writing Tail data.");
603 |
604 |     return Qtrue;
605 | }
595 |
596 |
597 | void Init_trie() { /* defines the Trie and TrieNode classes and binds every method to its C function */
598 | cTrie = rb_define_class("Trie", rb_cObject);
599 | rb_define_alloc_func(cTrie, rb_trie_alloc);
600 | rb_define_module_function(cTrie, "read", rb_trie_read, 1); /* NOTE(review): module_function used on a class -- confirm this is intended */
601 | rb_define_method(cTrie, "has_key?", rb_trie_has_key, 1);
602 | rb_define_method(cTrie, "get", rb_trie_get, 1);
603 | rb_define_method(cTrie, "add", rb_trie_add, -2); /* -2: rb_trie_add receives all arguments as one Ruby Array */
604 | rb_define_method(cTrie, "delete", rb_trie_delete, 1);
605 | rb_define_method(cTrie, "children", rb_trie_children, 1);
606 | rb_define_method(cTrie, "children_with_values", rb_trie_children_with_values, 1);
607 | rb_define_method(cTrie, "has_children?", rb_trie_has_children, 1);
608 | rb_define_method(cTrie, "root", rb_trie_root, 0);
609 | rb_define_method(cTrie, "save", rb_trie_save, 1);
610 |
611 | cTrieNode = rb_define_class("TrieNode", rb_cObject);
612 | rb_define_alloc_func(cTrieNode, rb_trie_node_alloc);
613 | rb_define_method(cTrieNode, "initialize_copy", rb_trie_node_initialize_copy, 1); /* used by rb_trie_node_walk, which calls dup */
614 | rb_define_method(cTrieNode, "state", rb_trie_node_get_state, 0);
615 | rb_define_method(cTrieNode, "full_state", rb_trie_node_get_full_state, 0);
616 | rb_define_method(cTrieNode, "walk!", rb_trie_node_walk_bang, 1);
617 | rb_define_method(cTrieNode, "walk", rb_trie_node_walk, 1);
618 | rb_define_method(cTrieNode, "value", rb_trie_node_value, 0);
619 | rb_define_method(cTrieNode, "terminal?", rb_trie_node_terminal, 0);
620 | rb_define_method(cTrieNode, "leaf?", rb_trie_node_leaf, 0);
621 | }
622 |
--------------------------------------------------------------------------------
/ext/trie/trie.h:
--------------------------------------------------------------------------------
1 | #ifndef FAST_TRIE_TRIE_H
2 | #define FAST_TRIE_TRIE_H
3 |
4 | /* Guarded so that including this header twice does not redefine the structs below. */
5 |
6 | #include "darray.h"
7 | #include "tail.h"
8 |
9 | /* A trie made of two parts: a double-array structure and a tail structure. */
10 | typedef struct _Trie {
11 |     DArray *da;
12 |     Tail *tail;
13 | } Trie;
14 |
15 | /* A cursor into a Trie; it does not own the trie it points to. */
16 | typedef struct _TrieState {
17 |     const Trie *trie; /**< the corresponding trie */
18 |     TrieIndex index; /**< index in double-array/tail structures */
19 |     short suffix_idx; /**< suffix character offset, if in suffix */
20 |     short is_suffix; /**< whether it is currently in suffix part */
21 | } TrieState;
22 |
23 |
24 | /* A negative base marks a separate node; the negated base is its tail index. */
25 | #define trie_da_is_separate(da,s) (da_get_base ((da), (s)) < 0)
26 | #define trie_da_get_tail_index(da,s) (-da_get_base ((da), (s)))
27 | #define trie_da_set_tail_index(da,s,v) (da_set_base ((da), (s), -(v)))
28 | /* A state is terminal when the terminator character can be walked from it. */
29 | #define trie_state_is_terminal(s) trie_state_is_walkable((s),TRIE_CHAR_TERM)
30 |
31 |
32 | Trie* trie_new(void);
33 | void trie_free(Trie *trie);
34 | /* The static prototypes are forward declarations for the implementation file. */
35 | static Bool trie_branch_in_branch (Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data);
36 | static Bool trie_branch_in_tail(Trie *trie, TrieIndex sep_node, const TrieChar *suffix, TrieData data);
37 | Bool trie_store (Trie *trie, const TrieChar *key, TrieData data);
38 | Bool trie_retrieve (const Trie *trie, const TrieChar *key, TrieData *o_data);
39 | Bool trie_delete (Trie *trie, const TrieChar *key);
40 | TrieState * trie_root (const Trie *trie);
41 | static TrieState * trie_state_new (const Trie *trie, TrieIndex index, short suffix_idx, short is_suffix);
42 | TrieState * trie_state_clone (const TrieState *s);
43 | void trie_state_free (TrieState *s);
44 | void trie_state_rewind (TrieState *s);
45 | Bool trie_state_walk (TrieState *s, TrieChar c);
46 | Bool trie_state_is_walkable (const TrieState *s, TrieChar c);
47 | Bool trie_state_is_leaf (const TrieState *s);
48 | TrieData trie_state_get_data (const TrieState *s);
49 |
50 | #endif /* FAST_TRIE_TRIE_H */
39 |
40 |
41 |
--------------------------------------------------------------------------------
/ext/trie/triedefs.h:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * triedefs.h - General typedefs for trie
4 | * Created: 2006-08-11
5 | * Author: Theppitak Karoonboonyanan
6 | */
7 |
8 | #ifndef __TRIEDEFS_H
9 | #define __TRIEDEFS_H
10 |
11 | #include "typedefs.h"
12 |
13 | /**
14 | * @file triedefs.h
15 | * @brief General typedefs for trie
16 | */
17 |
18 | /**
19 | * @brief Trie IO modes; each mode has its own bit value (0x01, 0x02, 0x04)
20 | */
21 | typedef enum {
22 | TRIE_IO_READ = 0x01,
23 | TRIE_IO_WRITE = 0x02,
24 | TRIE_IO_CREATE = 0x04
25 | } TrieIOMode;
26 |
27 | /**
28 | * @brief Trie character type for alphabet (an unsigned 32-bit value)
29 | */
30 | typedef uint32 AlphaChar;
31 |
32 | /**
33 | * @brief Error value for alphabet character (all bits set)
34 | */
35 | #define ALPHA_CHAR_ERROR (~(AlphaChar)0)
36 |
37 | /**
38 | * @brief Trie character type for key (one unsigned char per key character)
39 | */
40 | typedef unsigned char TrieChar;
41 | /**
42 | * @brief Trie terminator character, followed by the largest TrieChar value
43 | */
44 | #define TRIE_CHAR_TERM '\0'
45 | #define TRIE_CHAR_MAX 255
46 |
47 | /**
48 | * @brief Type of Trie index (a signed 32-bit value)
49 | */
50 | typedef int32 TrieIndex;
51 | /**
52 | * @brief Trie error index
53 | */
54 | #define TRIE_INDEX_ERROR 0
55 | /**
56 | * @brief Maximum trie index value (the largest positive int32)
57 | */
58 | #define TRIE_INDEX_MAX 0x7fffffff
59 |
60 | /**
61 | * @brief Type of value associated with trie entries
62 | */
63 | typedef unsigned long TrieData;
64 | /**
65 | * @brief Trie error data; -1 becomes the largest value once converted to the unsigned TrieData
66 | */
67 | #define TRIE_DATA_ERROR -1
68 |
69 | #endif /* __TRIEDEFS_H */
70 |
71 | /*
72 | vi:ts=4:ai:expandtab
73 | */
74 |
--------------------------------------------------------------------------------
/ext/trie/typedefs.h:
--------------------------------------------------------------------------------
1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 | /*
3 | * typedefs.h - general types
4 | * Created : 11 Aug 2006
5 | * Author : Theppitak Karoonboonyanan
6 | */
7 |
8 | #ifndef __TYPEDEFS_H
9 | #define __TYPEDEFS_H
10 |
11 | #include <limits.h>
12 |
13 | // fix for fast_trie on Windows. Should be easy to merge with future changes to libdatrie. MH
14 | #include <stdbool.h>
15 | #define Bool bool
16 | #define FALSE false
17 | #define TRUE true
18 |
19 | # if UCHAR_MAX == 0xff /* unsigned char tops out at 255: use it as uint8 */
20 | # ifndef UINT8_TYPEDEF
21 | # define UINT8_TYPEDEF
22 | typedef unsigned char uint8;
23 | # endif /* UINT8_TYPEDEF */
24 | # endif /* UCHAR_MAX */
25 |
26 | # if SCHAR_MAX == 0x7f /* signed char tops out at 127: use it as int8 */
27 | # ifndef INT8_TYPEDEF
28 | # define INT8_TYPEDEF
29 | typedef signed char int8;
30 | # endif /* INT8_TYPEDEF */
31 | # endif /* SCHAR_MAX */
32 |
33 | # if UINT_MAX == 0xffff /* 16-bit unsigned int; the *_TYPEDEF guards make the first matching type win */
34 | # ifndef UINT16_TYPEDEF
35 | # define UINT16_TYPEDEF
36 | typedef unsigned int uint16;
37 | # endif /* UINT16_TYPEDEF */
38 | # endif /* UINT_MAX */
39 |
40 | # if INT_MAX == 0x7fff /* 16-bit int */
41 | # ifndef INT16_TYPEDEF
42 | # define INT16_TYPEDEF
43 | typedef int int16;
44 | # endif /* INT16_TYPEDEF */
45 | # endif /* INT_MAX */
46 |
47 | # if USHRT_MAX == 0xffff /* otherwise fall back to unsigned short for uint16 */
48 | # ifndef UINT16_TYPEDEF
49 | # define UINT16_TYPEDEF
50 | typedef unsigned short uint16;
51 | # endif /* UINT16_TYPEDEF */
52 | # endif /* USHRT_MAX */
53 |
54 | # if SHRT_MAX == 0x7fff /* otherwise fall back to short for int16 */
55 | # ifndef INT16_TYPEDEF
56 | # define INT16_TYPEDEF
57 | typedef short int16;
58 | # endif /* INT16_TYPEDEF */
59 | # endif /* SHRT_MAX */
60 |
61 | # if UINT_MAX == 0xffffffff /* 32-bit unsigned int */
62 | # ifndef UINT32_TYPEDEF
63 | # define UINT32_TYPEDEF
64 | typedef unsigned int uint32;
65 | # endif /* UINT32_TYPEDEF */
66 | # endif /* UINT_MAX */
67 |
68 | # if INT_MAX == 0x7fffffff /* 32-bit int */
69 | # ifndef INT32_TYPEDEF
70 | # define INT32_TYPEDEF
71 | typedef int int32;
72 | # endif /* INT32_TYPEDEF */
73 | # endif /* INT_MAX */
74 |
75 | # if ULONG_MAX == 0xffffffff /* otherwise fall back to unsigned long for uint32 */
76 | # ifndef UINT32_TYPEDEF
77 | # define UINT32_TYPEDEF
78 | typedef unsigned long uint32;
79 | # endif /* UINT32_TYPEDEF */
80 | # endif /* ULONG_MAX */
81 |
82 | # if LONG_MAX == 0x7fffffff /* otherwise fall back to long for int32 */
83 | # ifndef INT32_TYPEDEF
84 | # define INT32_TYPEDEF
85 | typedef long int32;
86 | # endif /* INT32_TYPEDEF */
87 | # endif /* LONG_MAX */
88 |
89 | # ifndef UINT8_TYPEDEF
90 | # error "uint8 type is undefined!"
91 | # endif
92 | # ifndef INT8_TYPEDEF
93 | # error "int8 type is undefined!"
94 | # endif
95 | # ifndef UINT16_TYPEDEF
96 | # error "uint16 type is undefined!"
97 | # endif
98 | # ifndef INT16_TYPEDEF
99 | # error "int16 type is undefined!"
100 | # endif
101 | # ifndef UINT32_TYPEDEF
102 | # error "uint32 type is undefined!"
103 | # endif
104 | # ifndef INT32_TYPEDEF
105 | # error "int32 type is undefined!"
106 | # endif
107 |
108 | typedef uint8 byte; /* 8-bit unsigned */
109 | typedef uint16 word; /* 16-bit unsigned */
110 | typedef uint32 dword; /* 32-bit unsigned */
111 |
112 |
113 | #endif /* __TYPEDEFS_H */
114 |
115 | /*
116 | vi:ts=4:ai:expandtab
117 | */
118 |
--------------------------------------------------------------------------------
/fast_trie.gemspec:
--------------------------------------------------------------------------------
1 | # Generated by jeweler
2 | # DO NOT EDIT THIS FILE DIRECTLY
3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4 | # -*- encoding: utf-8 -*-
5 | # stub: fast_trie 0.5.1 ruby ext
6 | # stub: ext/trie/extconf.rb
7 |
8 | Gem::Specification.new do |s|
9 | s.name = "fast_trie"
10 | s.version = "0.5.1"
11 |
12 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
13 | s.require_paths = ["ext"]
14 | s.authors = ["Tyler McMullen", "Matt Hickford"]
15 | s.date = "2015-07-27"
16 | s.description = "Ruby Trie based on libdatrie."
17 | s.email = "tyler@scribd.com"
18 | s.extensions = ["ext/trie/extconf.rb"]
19 | s.extra_rdoc_files = [
20 | "LICENSE",
21 | "README.textile"
22 | ]
23 | s.files = [
24 | "Gemfile.lock",
25 | "README.textile",
26 | "VERSION.yml",
27 | "ext/trie/darray.c",
28 | "ext/trie/darray.h",
29 | "ext/trie/extconf.rb",
30 | "ext/trie/fileutils.c",
31 | "ext/trie/fileutils.h",
32 | "ext/trie/tail.c",
33 | "ext/trie/tail.h",
34 | "ext/trie/trie-private.c",
35 | "ext/trie/trie-private.h",
36 | "ext/trie/trie.c",
37 | "ext/trie/trie.h",
38 | "ext/trie/triedefs.h",
39 | "ext/trie/typedefs.h",
40 | "fast_trie.gemspec",
41 | "spec/trie_spec.rb"
42 | ]
43 | s.homepage = "http://github.com/tyler/trie"
44 | s.rdoc_options = ["--title", "Trie", "--line-numbers", "--op", "rdoc", "--main", "ext/trie/trie.c", "README"]
45 | s.rubygems_version = "2.4.5"
46 | s.summary = "Ruby Trie based on libdatrie."
47 |
48 | if s.respond_to? :specification_version then
49 | s.specification_version = 4
50 |
51 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52 | s.add_development_dependency(%q, [">= 0"])
53 | s.add_development_dependency(%q, [">= 0"])
54 | s.add_development_dependency(%q, ["~> 3.12"])
55 | s.add_development_dependency(%q, ["~> 1.0"])
56 | s.add_development_dependency(%q, ["~> 2.0.1"])
57 | s.add_development_dependency(%q, [">= 0"])
58 | else
59 | s.add_dependency(%q, [">= 0"])
60 | s.add_dependency(%q, [">= 0"])
61 | s.add_dependency(%q, ["~> 3.12"])
62 | s.add_dependency(%q, ["~> 1.0"])
63 | s.add_dependency(%q, ["~> 2.0.1"])
64 | s.add_dependency(%q, [">= 0"])
65 | end
66 | else
67 | s.add_dependency(%q, [">= 0"])
68 | s.add_dependency(%q, [">= 0"])
69 | s.add_dependency(%q, ["~> 3.12"])
70 | s.add_dependency(%q, ["~> 1.0"])
71 | s.add_dependency(%q, ["~> 2.0.1"])
72 | s.add_dependency(%q, [">= 0"])
73 | end
74 | end
75 |
76 |
--------------------------------------------------------------------------------
/spec/trie_spec.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/../lib/trie'
2 |
3 | describe Trie do
4 | before :each do
5 | @trie = Trie.new;
6 | @trie.add('rocket')
7 | @trie.add('rock')
8 | @trie.add('frederico')
9 | end
10 |
11 | describe :has_key? do
12 | it 'returns true for words in the trie' do
13 | @trie.has_key?('rocket').should be_true
14 | end
15 |
16 | it 'returns nil for words that are not in the trie' do
17 | @trie.has_key?('not_in_the_trie').should be_nil
18 | end
19 | end
20 |
21 | describe :get do
22 | it 'returns -1 for words in the trie without a weight' do
23 | @trie.get('rocket').should == -1
24 | end
25 |
26 | it 'returns nil if the word is not in the trie' do
27 | @trie.get('not_in_the_trie').should be_nil
28 | end
29 | end
30 |
31 | describe :add do
32 | it 'adds a word to the trie' do
33 | @trie.add('forsooth').should == true
34 | @trie.get('forsooth').should == -1
35 | end
36 |
37 | it 'adds a word with a weight to the trie' do
38 | @trie.add('chicka',123).should == true
39 | @trie.get('chicka').should == 123
40 | end
41 |
42 | it 'adds values greater than 16-bit allows' do
43 | @trie.add('chicka', 72_000).should == true
44 | @trie.get('chicka').should == 72_000
45 | end
46 |
47 | it 'adds a word with a non-numeric value to the trie' do
48 | @trie.add('doot', 'Heeey').should == true
49 | @trie.get('doot').should == 'Heeey'
50 | end
51 | end
52 |
53 | describe :delete do
54 | it 'deletes a word from the trie' do
55 | @trie.delete('rocket').should == true
56 | @trie.has_key?('rocket').should be_nil
57 | end
58 | end
59 |
# Trie#children lists every stored word that starts with the given prefix.
# A prefix that is itself a word is included; an unknown or nil prefix yields
# an empty array.
# NOTE(review): the fixture words 'rock' and 'rocket' are assumed to come from
# the enclosing `before` hook outside this view.
describe :children do
  it 'returns all words beginning with a given prefix' do
    matches = @trie.children('roc')
    matches.size.should eq(2)
    matches.should include('rock', 'rocket')
  end

  it 'returns blank array if prefix does not exist' do
    matches = @trie.children('ajsodij')
    matches.should eq([])
  end

  it 'includes the prefix if the prefix is a word' do
    matches = @trie.children('rock')
    matches.size.should eq(2)
    matches.should include('rock', 'rocket')
  end

  it 'returns blank array if prefix is nil' do
    matches = @trie.children(nil)
    matches.should eq([])
  end
end
83 |
# Trie#children_with_values lists [word, value] pairs for every stored word
# that starts with the given prefix. A prefix that is itself a word is
# included; an unknown or nil prefix yields an empty array.
describe :children_with_values do
  # Two weighted words sharing the 'ab' prefix, added on top of whatever the
  # enclosing fixture already holds.
  before :each do
    @trie.add('abc', 2)
    @trie.add('abcd', 4)
  end

  it 'returns all words with values beginning with a given prefix' do
    children = @trie.children_with_values('ab')
    children.size.should == 2
    children.should include(['abc', 2])
    children.should include(['abcd', 4])
  end

  # The title previously claimed nil, but the assertion (and the matching
  # example for Trie#children) expects an empty array.
  it 'returns blank array if prefix does not exist' do
    @trie.children_with_values('ajsodij').should == []
  end

  it 'includes the prefix if the prefix is a word' do
    children = @trie.children_with_values('abc')
    children.size.should == 2
    children.should include(['abc', 2])
    children.should include(['abcd', 4])
  end

  it 'returns blank array if prefix is nil' do
    @trie.children_with_values(nil).should == []
  end
end
112 |
113 | #describe :walk_to_terminal do
114 | # it 'returns the first word found along a path' do
115 | # @trie.add 'anderson'
116 | # @trie.add 'andreas'
117 | # @trie.add 'and'
118 |
119 | # @trie.walk_to_terminal('anderson').should == 'and'
120 | # end
121 |
122 | # it 'returns the first word and value along a path' do
123 | # @trie.add 'anderson'
124 | # @trie.add 'andreas'
125 | # @trie.add 'and', 15
126 |
127 | # @trie.walk_to_terminal('anderson',true).should == ['and', 15]
128 | # end
129 | #end
130 |
# Trie#root hands out a TrieNode, and each call produces a node that does not
# compare equal to the one from the previous call.
describe :root do
  it 'returns a TrieNode' do
    node = @trie.root
    node.should be_instance_of(TrieNode)
  end

  it 'returns a different TrieNode each time' do
    first_node = @trie.root
    second_node = @trie.root
    first_node.should_not == second_node
  end
end
140 |
# Round trip: a trie written with Trie#save can be loaded with Trie.read and
# still returns the stored value.
describe 'save/read' do
  # Base path handed to save/read: <repo>/tmp/trie. The tmp directory is
  # created on demand.
  let(:filename_base) do
    tmp_dir = File.expand_path('../tmp', File.dirname(__FILE__))
    FileUtils.mkdir_p(tmp_dir)
    File.join(tmp_dir, 'trie')
  end

  context 'when I save the populated trie to disk' do
    before(:each) do
      @trie.add('omgwtflolbbq', 123)
      @trie.save(filename_base)
    end

    it 'should contain the same data when reading from disk' do
      reloaded = Trie.read(filename_base)
      reloaded.get('omgwtflolbbq').should eq(123)
    end
  end
end
160 |
# Trie.read must raise IOError when the backing files cannot be found.
describe :read do
  context 'when the files to read from do not exist' do
    # A path that is not expected to exist on any machine.
    let(:filename_base) { "phantasy/file/path/that/does/not/exist" }

    it 'should raise an error when attempting a read' do
      attempt = lambda { Trie.read(filename_base) }
      attempt.should raise_error(IOError)
    end
  end
end
172 |
# Trie#has_children? reports whether any stored word starts with the prefix.
# Per the expectations below, a prefix that is itself a stored word
# ('rock', 'rocket') also counts as having children.
# NOTE(review): the fixture words come from the enclosing `before` hook
# outside this view.
describe :has_children? do
  it 'returns true when there are children matching prefix' do
    %w[r rock rocket].each do |prefix|
      @trie.has_children?(prefix).should be_true
    end
  end

  it 'returns false when there are no children matching prefix' do
    %w[no rome roc_].each do |prefix|
      @trie.has_children?(prefix).should be_false
    end
  end
end
187 | end
188 |
# Specs for TrieNode, the node object returned by Trie#root and moved through
# the trie one character at a time with walk / walk!.
describe TrieNode do
  # Every example starts from a fresh trie holding three weighted words and a
  # node positioned at the root (no characters walked yet).
  before :each do
    @trie = Trie.new
    @trie.add('rocket', 1)
    @trie.add('rock', 2)
    @trie.add('frederico', 3)
    @node = @trie.root
  end

  # TrieNode#state is the last character walked, or nil at the root.
  describe :state do
    it 'returns the most recent state character' do
      @node.walk!('r')
      @node.state.should == 'r'
      @node.walk!('o')
      @node.state.should == 'o'
    end

    it 'is nil when no walk has occurred' do
      @node.state.should be_nil
    end
  end

  # TrieNode#full_state is the whole string walked so far.
  describe :full_state do
    it 'returns the current string' do
      @node.walk!('r').walk!('o').walk!('c')
      @node.full_state.should == 'roc'
    end

    it 'is a blank string when no walk has occurred' do
      @node.full_state.should == ''
    end
  end

  # TrieNode#walk! advances the receiver in place and returns it, which is
  # what makes the chained walk! calls in the other examples possible.
  describe :walk! do
    it 'returns the updated object when the walk succeeds' do
      other = @node.walk!('r')
      other.should == @node
    end

    it 'returns nil when the walk fails' do
      @node.walk!('q').should be_nil
    end
  end

  # TrieNode#walk returns a separate node instead of returning the receiver.
  describe :walk do
    it 'returns a new node object when the walk succeeds' do
      other = @node.walk('r')
      other.should_not == @node
    end

    it 'returns nil when the walk fails' do
      @node.walk('q').should be_nil
    end
  end

  # TrieNode#value is the stored value at a word end, nil elsewhere.
  describe :value do
    it 'returns nil when the node is not terminal' do
      @node.walk!('r')
      @node.value.should be_nil
    end

    it 'returns a value when the node is terminal' do
      @node.walk!('r').walk!('o').walk!('c').walk!('k')
      @node.value.should == 2
    end
  end

  # TrieNode#terminal? is truthy only where a stored word ends.
  describe :terminal? do
    it 'returns true when the node is a word end' do
      @node.walk!('r').walk!('o').walk!('c').walk!('k')
      @node.should be_terminal
    end

    it 'returns nil when the node is not a word end' do
      @node.walk!('r').walk!('o').walk!('c')
      @node.should_not be_terminal
    end
  end

  # TrieNode#leaf? is truthy only at the end of a branch: 'rocket' is a leaf,
  # while 'rock' is not because 'rocket' continues past it.
  describe :leaf? do
    it 'returns true when this is the end of a branch of the trie' do
      @node.walk!('r').walk!('o').walk!('c').walk!('k').walk!('e').walk!('t')
      @node.should be_leaf
    end

    it 'returns nil when there are more splits on this branch' do
      @node.walk!('r').walk!('o').walk!('c').walk!('k')
      @node.should_not be_leaf
    end
  end

  # TrieNode#clone yields a node that does not compare equal to the original
  # but reports the same state and full_state.
  describe :clone do
    it 'creates a new instance of this node which is not this node' do
      new_node = @node.clone
      new_node.should_not == @node
    end

    it 'matches the state of the current node' do
      new_node = @node.clone
      new_node.state.should == @node.state
    end

    it 'matches the full_state of the current node' do
      new_node = @node.clone
      new_node.full_state.should == @node.full_state
    end
  end
end
298 |
--------------------------------------------------------------------------------