├── .gitignore
├── .travis.yml
├── AUTHORS.rst
├── CHANGES.rst
├── LICENSE
├── MANIFEST.in
├── README.rst
├── bench.ini
├── bench
    ├── __init__.py
    ├── speed.py
    └── words100k.txt.zip
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    └── make.bat
├── lib
    ├── AUTHORS
    ├── COPYING
    ├── b64
    │   ├── AUTHORS
    │   ├── LICENSE
    │   ├── cdecode.c
    │   ├── cdecode.h
    │   ├── cencode.c
    │   ├── cencode.h
    │   ├── decode.h
    │   └── encode.h
    └── dawgdic
    │   ├── base-types.h
    │   ├── base-unit.h
    │   ├── bit-pool.h
    │   ├── completer.h
    │   ├── dawg-builder.h
    │   ├── dawg-unit.h
    │   ├── dawg.h
    │   ├── dictionary-builder.h
    │   ├── dictionary-extra-unit.h
    │   ├── dictionary-unit.h
    │   ├── dictionary.h
    │   ├── guide-builder.h
    │   ├── guide-unit.h
    │   ├── guide.h
    │   ├── link-table.h
    │   ├── object-pool.h
    │   ├── ranked-completer-candidate.h
    │   ├── ranked-completer-node.h
    │   ├── ranked-completer.h
    │   ├── ranked-guide-builder.h
    │   ├── ranked-guide-link.h
    │   ├── ranked-guide-unit.h
    │   └── ranked-guide.h
├── setup.py
├── src
    ├── _base_types.cpp
    ├── _base_types.pxd
    ├── _completer.cpp
    ├── _completer.pxd
    ├── _dawg.cpp
    ├── _dawg.pxd
    ├── _dawg_builder.cpp
    ├── _dawg_builder.pxd
    ├── _dictionary.cpp
    ├── _dictionary.pxd
    ├── _dictionary_builder.cpp
    ├── _dictionary_builder.pxd
    ├── _dictionary_unit.cpp
    ├── _dictionary_unit.pxd
    ├── _guide.cpp
    ├── _guide.pxd
    ├── _guide_builder.cpp
    ├── _guide_builder.pxd
    ├── _guide_unit.cpp
    ├── _guide_unit.pxd
    ├── b64_decode.cpp
    ├── b64_decode.pxd
    ├── dawg.cpp
    ├── dawg.pyx
    ├── iostream.cpp
    └── iostream.pxd
├── tests
    ├── __init__.py
    ├── test_dawg.py
    ├── test_payload_dawg.py
    └── test_prediction.py
├── tox.ini
└── update_cpp.sh


/.gitignore:
--------------------------------------------------------------------------------
 1 | MANIFEST
 2 | src/*.html
 3 | 
 4 | *.py[cod]
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Packages
10 | *.egg
11 | *.egg-info
12 | dist
13 | build
14 | eggs
15 | parts
16 | bin
17 | var
18 | sdist
19 | develop-eggs
20 | .installed.cfg
21 | lib
22 | lib64
23 | __pycache__
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | 
28 | # Unit test / coverage reports
29 | .coverage
30 | .tox
31 | nosetests.xml
32 | 
33 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | matrix:
 4 |   include:
 5 |   - python: 3.5
 6 |     env: TOXENV=py35
 7 |   - python: 3.5
 8 |     env: TOXENV=py35-locale
 9 |   - python: 3.6
10 |     env: TOXENV=py36
11 |   - python: 3.7
12 |     env: TOXENV=py37
13 |   - python: 3.8
14 |     env: TOXENV=py38
15 | 
16 | install:
17 | - pip install -U tox
18 | 
19 | script: tox
20 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | Authors & Contributors
 2 | ----------------------
 3 | 
 4 | * Mikhail Korobov <kmike84@gmail.com>;
 5 | * Dan Blanchard;
 6 | * Jakub Wilk;
 7 | * Alex Moiseenko;
 8 | * `Matt Hickford <https://github.com/matt-hickford>`_;
 9 | * `Ikuya Yamada <https://github.com/ikuyamada>`_.
10 | 
11 | This module uses `dawgdic`_ C++ library by
12 | Susumu Yata & contributors.
13 | 
14 | base64 decoder is a modified version of libb64_ (original author
15 | is Chris Venter).
16 | 
17 | .. _libb64: http://libb64.sourceforge.net/
18 | .. _dawgdic: https://code.google.com/p/dawgdic/
19 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Changes
  3 | =======
  4 | 
  5 | 0.8.0 (2020-02-19)
  6 | ------------------
  7 | 
  8 | * Python 3.8 support is added
  9 | * Python 3.2, 3.3 and 3.4 support is dropped
 10 | * Extension is rebuilt with Cython 0.29.15
 11 | 
 12 | 0.7.8 (2015-04-18)
 13 | ------------------
 14 | 
 15 | * extra type annotations are added to make the code a bit faster;
 16 | * mercurial mirror at bitbucket is dropped;
 17 | * wrapper is rebuilt with Cython 0.22.
 18 | 
 19 | 0.7.7 (2014-11-19)
 20 | ------------------
 21 | 
 22 | * ``DAWG.b_prefixes`` method for avoiding utf8 encoding/decoding
 23 |   (thanks Ikuya Yamada);
 24 | * wrapper is rebuilt with Cython 0.21.1.
 25 | 
 26 | 0.7.6 (2014-08-10)
 27 | ------------------
 28 | 
 29 | * Wrapper is rebuilt with Cython 0.20.2 to fix some issues.
 30 | 
 31 | 0.7.5 (2014-06-05)
 32 | ------------------
 33 | 
 34 | * Switched to setuptools;
 35 | * some wheels are uploaded to pypi.
 36 | 
 37 | 0.7.4 (2014-05-29)
 38 | ------------------
 39 | 
 40 | * Fixed a bug in DAWG building: input should be sorted according to its
 41 |   binary representation.
 42 | 
 43 | 0.7.3 (2014-05-29)
 44 | ------------------
 45 | 
 46 | * Wrapper is rebuilt with Cython 0.21dev;
 47 | * Python 3.4 compatibility is verified.
 48 | 
 49 | 0.7.2 (2013-10-03)
 50 | ------------------
 51 | 
 52 | * ``has_keys_with_prefix(prefix)`` method (thanks
 53 |   `Matt Hickford <https://github.com/matt-hickford>`_)
 54 | 
 55 | 0.7.1 (2013-05-25)
 56 | ------------------
 57 | 
 58 | - Extension is rebuilt with Cython 0.19.1;
 59 | - fixed segfault that happened on lookup from incorrectly loaded DAWG
 60 |   (thanks Alex Moiseenko).
 61 | 
 62 | 0.7 (2013-04-05)
 63 | ----------------
 64 | 
 65 | - IntCompletionDAWG
 66 | 
 67 | 0.6.1 (2013-03-23)
 68 | ------------------
 69 | 
 70 | - Installation issues in environments with LC_ALL=C are fixed;
 71 | - PyPy is officially unsupported now (use DAWG-Python_ with PyPy).
 72 | 
 73 | .. _DAWG-Python: https://github.com/pytries/DAWG-Python
 74 | 
 75 | 0.6 (2013-03-22)
 76 | ----------------
 77 | 
 78 | - many thread-safety bugs are fixed (at the cost of slowing library down).
 79 | 
 80 | 0.5.5 (2013-02-19)
 81 | ------------------
 82 | 
 83 | - fix installation under PyPy (note: DAWG is slow under PyPy
 84 |   and may have bugs).
 85 | 
 86 | 0.5.4 (2013-02-14)
 87 | ------------------
 88 | 
 89 | - small tweaks for docstrings;
 90 | - the extension is rebuilt using Cython 0.18.
 91 | 
 92 | 0.5.3 (2013-01-03)
 93 | ------------------
 94 | 
 95 | - small improvements to ``.compile_replaces`` method;
 96 | - benchmarks for ``.similar_items`` method;
 97 | - the extension is rebuilt with Cython pre-0.18; this made
 98 |   ``.prefixes`` and ``.iterprefixes`` methods faster
 99 |   (up to 6x in some cases).
100 | 
101 | 0.5.2 (2013-01-02)
102 | ------------------
103 | 
104 | - tests are included in source distribution;
105 | - benchmark results in README were nonrepresentative because of my
106 |   broken (slow) Python 3.2 install;
107 | - installation is fixed under Python 3.x with ``LC_ALL=C`` (thanks
108 |   Jakub Wilk).
109 | 
110 | 0.5.1 (2012-10-11)
111 | ------------------
112 | 
113 | - better error reporting while building DAWGs;
114 | - ``__contains__`` is fixed for keys with zero bytes;
115 | - ``dawg.Error`` exception class;
116 | - building of ``BytesDAWG`` and ``RecordDAWG`` fails instead of
117 |   producing incorrect results if some of the keys have unsupported characters.
118 | 
119 | 
120 | 0.5 (2012-10-08)
121 | ----------------
122 | 
123 | The storage scheme of ``BytesDAWG`` and ``RecordDAWG`` is changed in
124 | this release in order to provide the alphabetical ordering of items.
125 | 
126 | This is a backwards-incompatible release. In order to read ``BytesDAWG`` or
127 | ``RecordDAWG`` created with previous versions of DAWG use ``payload_separator``
128 | constructor argument::
129 | 
130 |     >>> BytesDAWG(payload_separator=b'\xff').load('old.dawg')
131 | 
132 | 
133 | 0.4.1 (2012-10-01)
134 | ------------------
135 | 
136 | - Segfaults with empty DAWGs are fixed by updating dawgdic to latest svn.
137 | 
138 | 0.4 (2012-09-26)
139 | ----------------
140 | 
141 | - ``iterkeys``, ``iteritems`` and ``iterprefixes`` methods
142 |   (thanks Dan Blanchard).
143 | 
144 | 0.3.2 (2012-09-24)
145 | ------------------
146 | 
147 | - ``prefixes`` method for finding all prefixes of a given key.
148 | 
149 | 0.3.1 (2012-09-20)
150 | ------------------
151 | 
152 | - bundled dawgdic C++ library is updated to the latest version.
153 | 
154 | 0.3 (2012-09-13)
155 | ----------------
156 | 
157 | - ``similar_keys``, ``similar_items`` and ``similar_item_values`` methods
158 |   for more permissive lookups (they may be useful e.g. for umlaut handling);
159 | - ``load`` method returns self;
160 | - Python 3.3 support.
161 | 
162 | 0.2 (2012-09-08)
163 | ----------------
164 | 
165 | Greatly improved memory usage for DAWGs loaded with ``load`` method.
166 | 
167 | There is currently a bug somewhere in a wrapper so DAWGs loaded with
168 | ``read()`` method or unpickled DAWGs use 3x-4x memory compared to DAWGs
169 | loaded with ``load()`` method. ``load()`` is fixed in this release but
170 | other methods are not.
171 | 
172 | 0.1 (2012-09-08)
173 | ----------------
174 | 
175 | Initial release.
176 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) Mikhail Korobov, 2012-2014
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is furnished
 8 | to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR
15 | A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
16 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
17 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
18 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include README.rst
 2 | include AUTHORS.rst
 3 | include CHANGES.rst
 4 | include LICENSE
 5 | include tox.ini
 6 | include update_cpp.sh
 7 | include lib/COPYING
 8 | 
 9 | recursive-include docs *.rst *.py Makefile make.bat
10 | 
11 | recursive-include src *.cpp *.pxd *.pyx
12 | recursive-include lib *.c *.h
13 | recursive-include tests *.py


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | DAWG
 2 | ====
 3 | 
 4 | .. image:: https://travis-ci.org/pytries/DAWG.png?branch=master
 5 |     :target: https://travis-ci.org/pytries/DAWG
 6 | 
 7 | This package provides DAWG(DAFSA_)-based dictionary-like
 8 | read-only objects for Python (2.x and 3.x).
 9 | 
10 | String data in a DAWG may take 200x less memory than in
11 | a standard Python dict and the raw lookup speed is comparable;
12 | it also provides fast advanced methods like prefix search.
13 | 
14 | .. _DAFSA: https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
15 | 
16 | * Docs: https://dawg.readthedocs.org
17 | * Source code: https://github.com/pytries/DAWG
18 | * Issue tracker: https://github.com/pytries/DAWG/issues
19 | 
20 | License
21 | =======
22 | 
23 | Wrapper code is licensed under MIT License.
24 | Bundled `dawgdic`_ C++ library is licensed under BSD license.
25 | Bundled libb64_ is Public Domain.
26 | 
27 | .. _dawgdic: https://code.google.com/p/dawgdic/
28 | .. _libb64: http://libb64.sourceforge.net/
29 | 


--------------------------------------------------------------------------------
/bench.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27,py35,py36,py37
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     pytest
 7 | commands=
 8 |     python setup.py install
 9 |     python bench/speed.py
10 | 


--------------------------------------------------------------------------------
/bench/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import


--------------------------------------------------------------------------------
/bench/speed.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | from __future__ import absolute_import, unicode_literals, division
  4 | import random
  5 | import string
  6 | import timeit
  7 | import os
  8 | import zipfile
  9 | import struct
 10 | #import pstats
 11 | #import cProfile
 12 | 
 13 | import dawg
 14 | 
 15 | def words100k():
 16 |     zip_name = os.path.join(
 17 |         os.path.abspath(os.path.dirname(__file__)),
 18 |         'words100k.txt.zip'
 19 |     )
 20 |     zf = zipfile.ZipFile(zip_name)
 21 |     txt = zf.open(zf.namelist()[0]).read().decode('utf8')
 22 |     return txt.splitlines()
 23 | 
 24 | def random_words(num):
 25 |     russian = 'абвгдеёжзиклмнопрстуфхцчъыьэюя'
 26 |     alphabet = '%s%s' % (russian, string.ascii_letters)
 27 |     return [
 28 |         "".join([random.choice(alphabet) for x in range(random.randint(1,15))])
 29 |         for y in range(num)
 30 |     ]
 31 | 
 32 | def truncated_words(words):
 33 |     return [word[:3] for word in words]
 34 | 
 35 | def prefixes1k(words, prefix_len):
 36 |     words = [w for w in words if len(w) >= prefix_len]
 37 |     every_nth = int(len(words)/1000)
 38 |     _words = [w[:prefix_len] for w in words[::every_nth]]
 39 |     return _words[:1000]
 40 | 
 41 | def leet_words(words, replaces):
 42 |     for key, value in replaces.items():
 43 |         words = [w.replace(key, value) for w in words]
 44 |     return words
 45 | 
 46 | 
# Benchmark inputs, built once at import time and imported by the timeit
# setup snippets via ``from __main__ import ...``.
WORDS100k = words100k()
# Every word cut down to its first three characters.
MIXED_WORDS100k = truncated_words(WORDS100k)
# 100000 randomly generated strings.
NON_WORDS100k = random_words(100000)
# Up to 1000 sampled prefixes of the given length each.
PREFIXES_3_1k = prefixes1k(WORDS100k, 3)
PREFIXES_5_1k = prefixes1k(WORDS100k, 5)
PREFIXES_8_1k = prefixes1k(WORDS100k, 8)
PREFIXES_15_1k = prefixes1k(WORDS100k, 15)

# Single-character substitutions used to build LEET_50k below and passed to
# ``compile_replaces`` in the benchmark setup code.
LEET_REPLACES = {
    'o': '0',
    'O': '0',
    'u': '0',
    'l': '1',
    'i': '1',
    'e': '3',
    'E': '3',
    'A': '4',
    'a': '4',
    'h': '4',
    's': 'z',
}
# The first 50000 words with the substitutions above applied.
LEET_50k = leet_words(WORDS100k[:50000], LEET_REPLACES)
 69 | 
 70 | def format_result(key, value, text_width):
 71 |     key = key.ljust(text_width)
 72 |     print("    %s %s" % (key, value))
 73 | 
 74 | 
 75 | def bench(name, timer, descr='M ops/sec', op_count=0.1, repeats=3, runs=5,
 76 |           text_width=33):
 77 |     try:
 78 |         times = []
 79 |         for x in range(runs):
 80 |             times.append(timer.timeit(repeats))
 81 | 
 82 |         def op_time(time):
 83 |             return op_count*repeats / time
 84 | 
 85 |         val = "%0.3f%s" % (op_time(min(times)), descr)
 86 |         format_result(name, val, text_width)
 87 |     except (AttributeError, TypeError) as e:
 88 |         format_result(name, "not supported", text_width)
 89 | 
 90 | def create_dawg():
 91 |     words = words100k()
 92 |     return dawg.DAWG(words)
 93 | 
 94 | def create_bytes_dawg():
 95 |     words = words100k()
 96 |     values = [struct.pack(str('<H'), len(word)) for word in words]
 97 |     return dawg.BytesDAWG(zip(words, values))
 98 | 
 99 | def create_record_dawg():
100 |     words = words100k()
101 |     values = [ [len(word)] for word in words]
102 |     return dawg.RecordDAWG(str('<H'), zip(words, values))
103 | 
def create_int_dawg():
    """Build a ``dawg.IntDAWG`` mapping each word to its length."""
    keys = words100k()
    lengths = list(map(len, keys))
    return dawg.IntDAWG(zip(keys, lengths))
108 | 
def create_leet_dawg():
    """Build a ``dawg.DAWG`` from the module-level ``LEET_50k`` word list."""
    leet_keys = LEET_50k
    return dawg.DAWG(leet_keys)
111 | 
112 | 
def benchmark():
    """Run the whole benchmark suite and print one row per measurement.

    Three groups are timed with ``timeit`` and reported through ``bench``:
    the mapping-style operations on a dict and on each DAWG flavour,
    ``similar_keys`` on two DAWGs, and the prefix-related methods on every
    DAWG flavour.
    """
    print('\n====== Benchmarks (100k unique unicode words) =======\n')

    # Each entry: (label, statement to time, unit label, op_count, repeats).
    # op_count and repeats are forwarded to bench(), which reports
    # op_count * repeats / best_time.
    tests = [
        ('__getitem__ (hits)', "for word in WORDS100k: data[word]", 'M ops/sec', 0.1, 3),
        ('get() (hits)', "for word in WORDS100k: data.get(word)", 'M ops/sec', 0.1, 3),
        ('get() (misses)', "for word in NON_WORDS_10k: data.get(word)", 'M ops/sec', 0.01, 5),
        ('__contains__ (hits)', "for word in WORDS100k: word in data", 'M ops/sec', 0.1, 3),
        ('__contains__ (misses)', "for word in NON_WORDS100k: word in data", 'M ops/sec', 0.1, 3),
        ('items()', 'list(data.items())', ' ops/sec', 1, 1),
        ('keys()', 'list(data.keys())', ' ops/sec', 1, 1),
#        ('values()', 'list(data.values())', ' ops/sec', 1, 1),
    ]

    # Setup code shared by every timer: it pulls the factories and word lists
    # from __main__, so this only works when the file is run as a script.
    common_setup = """
from __main__ import create_dawg, create_bytes_dawg, create_record_dawg, create_int_dawg, create_leet_dawg
from __main__ import WORDS100k, NON_WORDS100k, MIXED_WORDS100k
from __main__ import PREFIXES_3_1k, PREFIXES_5_1k, PREFIXES_8_1k, PREFIXES_15_1k
from __main__ import LEET_50k, LEET_REPLACES
NON_WORDS_10k = NON_WORDS100k[:10000]
NON_WORDS_1k = ['ыва', 'xyz', 'соы', 'Axx', 'avы']*200
"""
    # Per-structure setup: each one binds the object under test to `data`.
    dict_setup = common_setup + 'data = dict((word, len(word)) for word in WORDS100k);'
    dawg_setup = common_setup + 'data = create_dawg(); repl = data.compile_replaces(LEET_REPLACES);'
    bytes_dawg_setup = common_setup + 'data = create_bytes_dawg();'
    record_dawg_setup = common_setup + 'data = create_record_dawg();'
    int_dawg_setup = common_setup + 'data = create_int_dawg();'
    leet_dawg_setup = common_setup + 'data = create_leet_dawg(); repl = data.compile_replaces(LEET_REPLACES);'

    structures = [
        ('dict', dict_setup),
        ('DAWG', dawg_setup),
        ('BytesDAWG', bytes_dawg_setup),
        ('RecordDAWG', record_dawg_setup),
        ('IntDAWG', int_dawg_setup),
    ]
    # Run every generic test against every structure; the trailing 9 is
    # bench()'s `runs` argument.
    for test_name, test, descr, op_count, repeats in tests:
        for name, setup in structures:
            timer = timeit.Timer(test, setup)
            full_test_name = "%s %s" % (name, test_name)
            bench(full_test_name, timer, descr, op_count, repeats, 9)

    # DAWG-specific benchmarks

    # benchmark for similar_keys
    # First on the DAWG built from the original words...
    bench(
        "DAWG.similar_keys  (no replaces)",
        timeit.Timer(
            "for word in WORDS100k[:50000]: data.similar_keys(word, repl)",
            setup=dawg_setup,
        ),
        op_count=0.05
    )
    # ...then on the DAWG built from LEET_50k.
    bench(
        "DAWG.similar_keys  (l33t)",
        timeit.Timer(
            "for word in WORDS100k[:50000]: data.similar_keys(word, repl)",
            setup=leet_dawg_setup,
        ),
        op_count=0.05
    )

    # structures[1:] skips the plain dict entry: only the DAWG flavours are
    # exercised with the prefix-related methods below.
    for struct_name, setup in structures[1:]:

        # prefixes of a given key
        _bench_data = [
            ('hits', 'WORDS100k'),
            ('mixed', 'MIXED_WORDS100k'),
            ('misses', 'NON_WORDS100k'),
        ]

        for meth in ['prefixes']:
            for name, data in _bench_data:
                bench(
                    '%s.%s (%s)' % (struct_name, meth, name),
                    timeit.Timer(
                        "for word in %s:\n"
                        "   data.%s(word)" % (data, meth),
                        setup
                    ),
                    runs=3,
                )

        # The iterator variant is wrapped in list() so it is fully consumed.
        for meth in ['iterprefixes']:
            for name, data in _bench_data:
                bench(
                    '%s.%s (%s)' % (struct_name, meth, name),
                    timeit.Timer(
                        "for word in %s:\n"
                        "   list(data.%s(word))" % (data, meth),
                        setup
                    ),
                    runs=3,
                )

        # keys with a given prefix
        # Each entry: (prefix shape shown in the label, note shown in the
        # label, name of the prefix list to iterate over).
        _bench_data = [
            ('xxx', 'avg_len(res)==415', 'PREFIXES_3_1k'),
            ('xxxxx', 'avg_len(res)==17', 'PREFIXES_5_1k'),
            ('xxxxxxxx', 'avg_len(res)==3', 'PREFIXES_8_1k'),
            ('xxxxx..xx', 'avg_len(res)==1.4', 'PREFIXES_15_1k'),
            ('xxx', 'NON_EXISTING', 'NON_WORDS_1k'),
        ]
        for xxx, avg, data in _bench_data:
            for meth in ['keys', 'items']:
                bench(
                    '%s.%s(prefix="%s"), %s' % (struct_name, meth, xxx, avg),
                    timeit.Timer(
                        "for word in %s: data.%s(word)" % (data, meth),
                        setup
                    ),
                    'K ops/sec',
                    op_count=1,
                    runs=3,
                    text_width=60,
                )
            # Iterator variants, again consumed through list().
            for meth in ['iterkeys', 'iteritems']:
                bench(
                    '%s.%s(prefix="%s"), %s' % (struct_name, meth, xxx, avg),
                    timeit.Timer(
                        "for word in %s: list(data.%s(word))" % (data, meth),
                        setup
                    ),
                    'K ops/sec',
                    op_count=1,
                    runs=3,
                    text_width=60,
                )
241 | 
242 | 
def check_dawg(trie, words):
    """Sum ``trie[word]`` over ``words`` and raise if the sum differs from ``len(words)``.

    Raises:
        Exception: when the accumulated value is not equal to ``len(words)``.
    """
    total = sum(trie[word] for word in words)
    if total == len(words):
        return
    raise Exception()
249 | 
def profiling():
    """Profile item lookups on a BytesDAWG and print the most expensive calls.

    Builds the structure with ``create_bytes_dawg()``, looks up every word
    returned by ``words100k()`` under cProfile, dumps the raw statistics to
    the file ``Profile.prof`` and prints the first 20 entries sorted by
    the "time" key.
    """
    import pstats
    import cProfile
    print('\n====== Profiling =======\n')
    d = create_bytes_dawg()
    WORDS = words100k()

    def check_getitem(trie, words):
        # Look up every word; the result is discarded on purpose.
        for word in words:
            trie[word]

    # The statement string is evaluated against locals(), so the local names
    # `check_getitem`, `d` and `WORDS` must keep exactly these spellings.
    cProfile.runctx("check_getitem(d, WORDS)", globals(), locals(), "Profile.prof")

#    def check_prefixes(trie, words):
#        for word in words:
#            trie.keys(word)
#    cProfile.runctx("check_prefixes(d, NON_WORDS_1k)", globals(), locals(), "Profile.prof")
#
    # NOTE(review): `check_trie` is not defined in the visible part of this
    # file; confirm the intended helper before re-enabling the line below.
    #cProfile.runctx("check_trie(d, WORDS)", globals(), locals(), "Profile.prof")

    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats(20)
272 | 
273 | 
if __name__ == '__main__':

    # Script entry point: run the benchmark suite; profiling() is left
    # commented out as an opt-in alternative.
    benchmark()
    #profiling()
    print('\n~~~~~~~~~~~~~~\n')


--------------------------------------------------------------------------------
/bench/words100k.txt.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG/239a9ae6896789a5174b3f55107282d5b0f0c6a8/bench/words100k.txt.zip


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | # the i18n builder cannot share the environment and doctrees with the others
 15 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 16 | 
 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 18 | 
 19 | help:
 20 | 	@echo "Please use \`make <target>' where <target> is one of"
 21 | 	@echo "  html       to make standalone HTML files"
 22 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 23 | 	@echo "  singlehtml to make a single large HTML file"
 24 | 	@echo "  pickle     to make pickle files"
 25 | 	@echo "  json       to make JSON files"
 26 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 27 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 29 | 	@echo "  epub       to make an epub"
 30 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 31 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 32 | 	@echo "  text       to make text files"
 33 | 	@echo "  man        to make manual pages"
 34 | 	@echo "  texinfo    to make Texinfo files"
 35 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 36 | 	@echo "  gettext    to make PO message catalogs"
 37 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 38 | 	@echo "  linkcheck  to check all external links for integrity"
 39 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 40 | 
 41 | clean:
 42 | 	-rm -rf $(BUILDDIR)/*
 43 | 
 44 | html:
 45 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 48 | 
 49 | dirhtml:
 50 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 53 | 
 54 | singlehtml:
 55 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 58 | 
 59 | pickle:
 60 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the pickle files."
 63 | 
 64 | json:
 65 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 66 | 	@echo
 67 | 	@echo "Build finished; now you can process the JSON files."
 68 | 
 69 | htmlhelp:
 70 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 71 | 	@echo
 72 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 73 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 74 | 
 75 | qthelp:
 76 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 77 | 	@echo
 78 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 79 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 80 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DAWG.qhcp"
 81 | 	@echo "To view the help file:"
 82 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DAWG.qhc"
 83 | 
 84 | devhelp:
 85 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 86 | 	@echo
 87 | 	@echo "Build finished."
 88 | 	@echo "To view the help file:"
 89 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/DAWG"
 90 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/DAWG"
 91 | 	@echo "# devhelp"
 92 | 
 93 | epub:
 94 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 95 | 	@echo
 96 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 97 | 
 98 | latex:
 99 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | 	@echo
101 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | 	      "(use \`make latexpdf' here to do that automatically)."
104 | 
105 | latexpdf:
106 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | 	@echo "Running LaTeX files through pdflatex..."
108 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 | 
111 | text:
112 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | 	@echo
114 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
115 | 
116 | man:
117 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | 	@echo
119 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 | 
121 | texinfo:
122 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | 	@echo
124 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
126 | 	      "(use \`make info' here to do that automatically)."
127 | 
# Build Texinfo sources, then run makeinfo on them. The recursive call uses
# $(MAKE), as the latexpdf target does, so command-line flags and the chosen
# make binary are passed on to the sub-make.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 | 
134 | gettext:
135 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | 	@echo
137 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 | 
139 | changes:
140 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | 	@echo
142 | 	@echo "The overview file is in $(BUILDDIR)/changes."
143 | 
144 | linkcheck:
145 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | 	@echo
147 | 	@echo "Link check complete; look for any errors in the above output " \
148 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
149 | 
150 | doctest:
151 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | 	@echo "Testing of doctests in the sources finished, look at the " \
153 | 	      "results in $(BUILDDIR)/doctest/output.txt."
154 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # DAWG documentation build configuration file, created by
  5 | # sphinx-quickstart on Sat Mar 23 00:33:42 2013.
  6 | #
  7 | # This file is execfile()d with the current directory set to its containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | import sys, os
 16 | 
 17 | # If extensions (or modules to document with autodoc) are in another directory,
 18 | # add these directories to sys.path here. If the directory is relative to the
 19 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 20 | #sys.path.insert(0, os.path.abspath('.'))
 21 | 
 22 | # -- General configuration -----------------------------------------------------
 23 | 
 24 | # If your documentation needs a minimal Sphinx version, state it here.
 25 | #needs_sphinx = '1.0'
 26 | 
 27 | # Add any Sphinx extension module names here, as strings. They can be extensions
 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 29 | extensions = []
 30 | 
 31 | # Add any paths that contain templates here, relative to this directory.
 32 | templates_path = ['_templates']
 33 | 
 34 | # The suffix of source filenames.
 35 | source_suffix = '.rst'
 36 | 
 37 | # The encoding of source files.
 38 | #source_encoding = 'utf-8-sig'
 39 | 
 40 | # The master toctree document.
 41 | master_doc = 'index'
 42 | 
 43 | # General information about the project.
 44 | project = 'DAWG'
 45 | copyright = '2015, Mikhail Korobov'
 46 | 
 47 | # The version info for the project you're documenting, acts as replacement for
 48 | # |version| and |release|, also used in various other places throughout the
 49 | # built documents.
 50 | #
 51 | # The short X.Y version.
 52 | version = '0.6'
 53 | # The full version, including alpha/beta/rc tags.
 54 | release = '0.6'
 55 | 
 56 | # The language for content autogenerated by Sphinx. Refer to documentation
 57 | # for a list of supported languages.
 58 | #language = None
 59 | 
 60 | # There are two options for replacing |today|: either, you set today to some
 61 | # non-false value, then it is used:
 62 | #today = ''
 63 | # Else, today_fmt is used as the format for a strftime call.
 64 | #today_fmt = '%B %d, %Y'
 65 | 
 66 | # List of patterns, relative to source directory, that match files and
 67 | # directories to ignore when looking for source files.
 68 | exclude_patterns = ['_build']
 69 | 
 70 | # The reST default role (used for this markup: `text`) to use for all documents.
 71 | #default_role = None
 72 | 
 73 | # If true, '()' will be appended to :func: etc. cross-reference text.
 74 | #add_function_parentheses = True
 75 | 
 76 | # If true, the current module name will be prepended to all description
 77 | # unit titles (such as .. function::).
 78 | #add_module_names = True
 79 | 
 80 | # If true, sectionauthor and moduleauthor directives will be shown in the
 81 | # output. They are ignored by default.
 82 | #show_authors = False
 83 | 
 84 | # The name of the Pygments (syntax highlighting) style to use.
 85 | pygments_style = 'sphinx'
 86 | 
 87 | # A list of ignored prefixes for module index sorting.
 88 | #modindex_common_prefix = []
 89 | 
 90 | 
 91 | # -- Options for HTML output ---------------------------------------------------
 92 | 
 93 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 94 | # a list of builtin themes.
 95 | html_theme = 'default'
 96 | 
 97 | # Theme options are theme-specific and customize the look and feel of a theme
 98 | # further.  For a list of options available for each theme, see the
 99 | # documentation.
100 | #html_theme_options = {}
101 | 
102 | # Add any paths that contain custom themes here, relative to this directory.
103 | #html_theme_path = []
104 | 
105 | # The name for this set of Sphinx documents.  If None, it defaults to
106 | # "<project> v<release> documentation".
107 | #html_title = None
108 | 
109 | # A shorter title for the navigation bar.  Default is the same as html_title.
110 | #html_short_title = None
111 | 
112 | # The name of an image file (relative to this directory) to place at the top
113 | # of the sidebar.
114 | #html_logo = None
115 | 
116 | # The name of an image file (within the static path) to use as favicon of the
117 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
118 | # pixels large.
119 | #html_favicon = None
120 | 
121 | # Add any paths that contain custom static files (such as style sheets) here,
122 | # relative to this directory. They are copied after the builtin static files,
123 | # so a file named "default.css" will overwrite the builtin "default.css".
124 | html_static_path = ['_static']
125 | 
126 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
127 | # using the given strftime format.
128 | #html_last_updated_fmt = '%b %d, %Y'
129 | 
130 | # If true, SmartyPants will be used to convert quotes and dashes to
131 | # typographically correct entities.
132 | #html_use_smartypants = True
133 | 
134 | # Custom sidebar templates, maps document names to template names.
135 | #html_sidebars = {}
136 | 
137 | # Additional templates that should be rendered to pages, maps page names to
138 | # template names.
139 | #html_additional_pages = {}
140 | 
141 | # If false, no module index is generated.
142 | #html_domain_indices = True
143 | 
144 | # If false, no index is generated.
145 | #html_use_index = True
146 | 
147 | # If true, the index is split into individual pages for each letter.
148 | #html_split_index = False
149 | 
150 | # If true, links to the reST sources are added to the pages.
151 | #html_show_sourcelink = True
152 | 
153 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
154 | #html_show_sphinx = True
155 | 
156 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
157 | #html_show_copyright = True
158 | 
159 | # If true, an OpenSearch description file will be output, and all pages will
160 | # contain a <link> tag referring to it.  The value of this option must be the
161 | # base URL from which the finished HTML is served.
162 | #html_use_opensearch = ''
163 | 
164 | # This is the file name suffix for HTML files (e.g. ".xhtml").
165 | #html_file_suffix = None
166 | 
167 | # Output file base name for HTML help builder.
168 | htmlhelp_basename = 'DAWGdoc'
169 | 
170 | 
171 | # -- Options for LaTeX output --------------------------------------------------
172 | 
173 | latex_elements = {
174 | # The paper size ('letterpaper' or 'a4paper').
175 | #'papersize': 'letterpaper',
176 | 
177 | # The font size ('10pt', '11pt' or '12pt').
178 | #'pointsize': '10pt',
179 | 
180 | # Additional stuff for the LaTeX preamble.
181 | #'preamble': '',
182 | }
183 | 
184 | # Grouping the document tree into LaTeX files. List of tuples
185 | # (source start file, target name, title, author, documentclass [howto/manual]).
186 | latex_documents = [
187 |   ('index', 'DAWG.tex', 'DAWG Documentation',
188 |    'Mikhail Korobov', 'manual'),
189 | ]
190 | 
191 | # The name of an image file (relative to this directory) to place at the top of
192 | # the title page.
193 | #latex_logo = None
194 | 
195 | # For "manual" documents, if this is true, then toplevel headings are parts,
196 | # not chapters.
197 | #latex_use_parts = False
198 | 
199 | # If true, show page references after internal links.
200 | #latex_show_pagerefs = False
201 | 
202 | # If true, show URL addresses after external links.
203 | #latex_show_urls = False
204 | 
205 | # Documents to append as an appendix to all manuals.
206 | #latex_appendices = []
207 | 
208 | # If false, no module index is generated.
209 | #latex_domain_indices = True
210 | 
211 | 
212 | # -- Options for manual page output --------------------------------------------
213 | 
214 | # One entry per manual page. List of tuples
215 | # (source start file, name, description, authors, manual section).
216 | man_pages = [
217 |     ('index', 'dawg', 'DAWG Documentation',
218 |      ['Mikhail Korobov'], 1)
219 | ]
220 | 
221 | # If true, show URL addresses after external links.
222 | #man_show_urls = False
223 | 
224 | 
225 | # -- Options for Texinfo output ------------------------------------------------
226 | 
227 | # Grouping the document tree into Texinfo files. List of tuples
228 | # (source start file, target name, title, author,
229 | #  dir menu entry, description, category)
230 | texinfo_documents = [
231 |   ('index', 'DAWG', 'DAWG Documentation',
232 |    'Mikhail Korobov', 'DAWG', 'One line description of project.',
233 |    'Miscellaneous'),
234 | ]
235 | 
236 | # Documents to append as an appendix to all manuals.
237 | #texinfo_appendices = []
238 | 
239 | # If false, no module index is generated.
240 | #texinfo_domain_indices = True
241 | 
242 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
243 | #texinfo_show_urls = 'footnote'
244 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
  1 | ==================
  2 | DAWG documentation
  3 | ==================
  4 | 
  5 | This package provides DAWG(DAFSA_)-based dictionary-like
  6 | read-only objects for Python (2.x and 3.x).
  7 | 
  8 | String data in a DAWG may take 200x less memory than in
  9 | a standard Python dict and the raw lookup speed is comparable;
 10 | it also provides fast advanced methods like prefix search.
 11 | 
 12 | Based on `dawgdic`_ C++ library.
 13 | 
 14 | .. _dawgdic: https://code.google.com/p/dawgdic/
 15 | .. _libb64: http://libb64.sourceforge.net/
 16 | .. _DAFSA: https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
 17 | 
 18 | License
 19 | =======
 20 | 
 21 | Wrapper code is licensed under MIT License.
 22 | Bundled `dawgdic`_ C++ library is licensed under BSD license.
 23 | Bundled libb64_ is Public Domain.
 24 | 
 25 | Installation
 26 | ============
 27 | 
 28 | From `PyPI <https://pypi.python.org/pypi/DAWG>`_::
 29 | 
 30 |     pip install DAWG
 31 | 
 32 | Usage
 33 | =====
 34 | 
 35 | There are several DAWG classes in this package:
 36 | 
 37 | * ``dawg.DAWG`` - basic DAWG wrapper; it can store unicode keys
 38 |   and do exact lookups;
 39 | 
 40 | * ``dawg.CompletionDAWG`` - ``dawg.DAWG`` subclass that supports
 41 |   key completion and prefix lookups (but requires more memory);
 42 | 
 43 | * ``dawg.BytesDAWG`` - ``dawg.CompletionDAWG`` subclass that
 44 |   maps unicode keys to lists of ``bytes`` objects.
 45 | 
 46 | * ``dawg.RecordDAWG`` - ``dawg.BytesDAWG`` subclass that
 47 |   maps unicode keys to lists of data tuples.
 48 |   All tuples must be of the same format (the data is packed
 49 |   using python ``struct`` module).
 50 | 
 51 | * ``dawg.IntDAWG`` - ``dawg.DAWG`` subclass that maps unicode keys
 52 |   to integer values.
 53 | 
 54 | * ``dawg.IntCompletionDAWG`` - ``dawg.CompletionDAWG`` subclass
 55 |   that maps unicode keys to integer values.
 56 | 
 57 | DAWG and CompletionDAWG
 58 | -----------------------
 59 | 
 60 | ``DAWG`` and ``CompletionDAWG`` are useful when you need
 61 | fast & memory efficient simple string storage. These classes
 62 | do not support assigning values to keys.
 63 | 
 64 | ``DAWG`` and ``CompletionDAWG`` constructors accept an iterable with keys::
 65 | 
 66 |     >>> import dawg
 67 |     >>> words = [u'foo', u'bar', u'foobar', u'foö', u'bör']
 68 |     >>> base_dawg = dawg.DAWG(words)
 69 |     >>> completion_dawg = dawg.CompletionDAWG(words)
 70 | 
 71 | It is then possible to check if the key is in a DAWG::
 72 | 
 73 |     >>> u'foo' in base_dawg
 74 |     True
 75 |     >>> u'baz' in completion_dawg
 76 |     False
 77 | 
 78 | It is possible to find all keys that start with a given
 79 | prefix in a ``CompletionDAWG``::
 80 | 
 81 |     >>> completion_dawg.keys(u'foo')
 82 |     [u'foo', u'foobar']
 83 | 
 84 | to test whether some key begins with a given prefix::
 85 | 
 86 |     >>> completion_dawg.has_keys_with_prefix(u'foo')
 87 |     True
 88 | 
 89 | and to find all prefixes of a given key::
 90 | 
 91 |     >>> base_dawg.prefixes(u'foobarz')
 92 |     [u'foo', u'foobar']
 93 | 
 94 | Iterator versions are also available::
 95 | 
 96 |     >>> for key in completion_dawg.iterkeys(u'foo'):
 97 |     ...     print(key)
 98 |     foo
 99 |     foobar
100 |     >>> for prefix in base_dawg.iterprefixes(u'foobarz'):
101 |     ...     print(prefix)
102 |     foo
103 |     foobar
104 | 
105 | It is possible to find all keys similar to a given key (using a one-way
106 | char translation table)::
107 | 
108 |     >>> replaces = dawg.DAWG.compile_replaces({u'o': u'ö'})
109 |     >>> base_dawg.similar_keys(u'foo', replaces)
110 |     [u'foo', u'foö']
111 |     >>> base_dawg.similar_keys(u'foö', replaces)
112 |     [u'foö']
113 |     >>> base_dawg.similar_keys(u'bor', replaces)
114 |     [u'bör']
115 | 
116 | BytesDAWG
117 | ---------
118 | 
119 | ``BytesDAWG`` is a ``CompletionDAWG`` subclass that can store
120 | binary data for each key.
121 | 
122 | ``BytesDAWG`` constructor accepts an iterable with
123 | ``(unicode_key, bytes_value)`` tuples::
124 | 
125 |     >>> data = [(u'key1', b'value1'), (u'key2', b'value2'), (u'key1', b'value3')]
126 |     >>> bytes_dawg = dawg.BytesDAWG(data)
127 | 
128 | There can be duplicate keys; all unique values are stored in this case::
129 | 
130 |     >>> bytes_dawg[u'key1']
131 |     [b'value1', b'value3']
132 | 
133 | For unique keys a list with a single value is returned for consistency::
134 | 
135 |     >>> bytes_dawg[u'key2']
136 |     [b'value2']
137 | 
138 | ``KeyError`` is raised for missing keys; use ``get`` method if you need
139 | a default value instead::
140 | 
141 |     >>> bytes_dawg.get(u'foo', None)
142 |     None
143 | 
144 | ``BytesDAWG`` supports ``keys``, ``items``, ``iterkeys`` and ``iteritems``
145 | methods (they all accept optional key prefix). There is also support for
146 | ``similar_keys``, ``similar_items`` and ``similar_item_values`` methods.
147 | 
148 | RecordDAWG
149 | ----------
150 | 
151 | ``RecordDAWG`` is a ``BytesDAWG`` subclass that automatically
152 | packs & unpacks the binary data from/to Python objects
153 | using ``struct`` module from the standard library.
154 | 
155 | First, you have to define a format of the data. Consult Python docs
156 | (http://docs.python.org/library/struct.html#format-strings) for the format
157 | string specification.
158 | 
159 | For example, let's store 3 short unsigned numbers (in a Big-Endian byte order)
160 | as values::
161 | 
162 |     >>> format = ">HHH"
163 | 
164 | ``RecordDAWG`` constructor accepts an iterable with
165 | ``(unicode_key, value_tuple)``. Let's create such iterable
166 | using ``zip`` function::
167 | 
168 |     >>> keys = [u'foo', u'bar', u'foobar', u'foo']
169 |     >>> values = [(1, 2, 3), (2, 1, 0), (3, 3, 3), (2, 1, 5)]
170 |     >>> data = zip(keys, values)
171 |     >>> record_dawg = dawg.RecordDAWG(format, data)
172 | 
173 | As with ``BytesDAWG``, there can be several values for the same key::
174 | 
175 |     >>> record_dawg['foo']
176 |     [(1, 2, 3), (2, 1, 5)]
177 |     >>> record_dawg['foobar']
178 |     [(3, 3, 3)]
179 | 
180 | 
181 | BytesDAWG and RecordDAWG implementation details
182 | -----------------------------------------------
183 | 
184 | ``BytesDAWG`` and ``RecordDAWG`` store data at the end of the keys::
185 | 
186 |     <utf8-encoded key><separator><base64-encoded data>
187 | 
188 | Data is encoded to base64 because dawgdic_ C++ library doesn't allow
189 | zero bytes in keys (it uses null-terminated strings) and such keys are
190 | very likely in binary data.
191 | 
192 | In DAWG versions prior to 0.5 ``<separator>`` was ``chr(255)`` byte.
193 | It was chosen because keys are stored as UTF8-encoded strings and
194 | ``chr(255)`` is guaranteed not to appear in valid UTF8, so the end of
195 | text part of the key is not ambiguous.
196 | 
197 | But ``chr(255)`` was proven to be problematic: it changes the order
198 | of the keys. Keys are naturally returned in lexicographical order by DAWG.
199 | But if ``chr(255)`` appears at the end of each text part of a key then the
200 | visible order would change. Imagine ``'foo'`` key with some payload
201 | and ``'foobar'`` key with some payload. ``'foo'`` key would be greater
202 | than ``'foobar'`` key: values compared would be ``'foo<sep>'`` and ``'foobar<sep>'``
203 | and ``ord(<sep>)==255`` is greater than ``ord(<any other character>)``.
204 | 
205 | So now the default ``<separator>`` is chr(1). This is the lowest allowed
206 | character and so it preserves the alphabetical order.
207 | 
208 | It is not strictly correct to use chr(1) as a separator because chr(1)
209 | is a valid UTF8 character. But I think in practice this won't be an issue:
210 | such control character is very unlikely in text keys, and binary keys
211 | are not supported anyway because dawgdic_ doesn't support keys containing
212 | chr(0).
213 | 
214 | If you can't guarantee chr(1) is not a part of keys, lexicographical order
215 | is not important to you or there is a need to read
216 | a ``BytesDAWG``/``RecordDAWG`` created by DAWG < 0.5 then pass
217 | ``payload_separator`` argument to the constructor::
218 | 
219 |     >>> dawg.BytesDAWG(payload_separator=b'\xff').load('old.dawg')
220 | 
221 | The storage scheme has one more implication: values of ``BytesDAWG``
222 | and ``RecordDAWG`` are also sorted lexicographically.
223 | 
224 | For ``RecordDAWG`` there is a gotcha: in order to have meaningful
225 | ordering of numeric values store them in big-endian format::
226 | 
227 |     >>> data = [('foo', (3, 2, 256)), ('foo', (3, 2, 1)), ('foo', (3, 2, 3))]
228 |     >>> d = dawg.RecordDAWG("3H", data)
229 |     >>> d.items()
230 |     [(u'foo', (3, 2, 256)), (u'foo', (3, 2, 1)), (u'foo', (3, 2, 3))]
231 | 
232 |     >>> d2 = dawg.RecordDAWG(">3H", data)
233 |     >>> d2.items()
234 |     [(u'foo', (3, 2, 1)), (u'foo', (3, 2, 3)), (u'foo', (3, 2, 256))]
235 | 
236 | IntDAWG and IntCompletionDAWG
237 | -----------------------------
238 | 
239 | ``IntDAWG`` is a ``{unicode -> int}`` mapping. It is possible to
240 | use ``RecordDAWG`` for this, but ``IntDAWG`` is natively
241 | supported by dawgdic_ C++ library and so ``__getitem__`` is much faster.
242 | 
243 | Unlike ``BytesDAWG`` and ``RecordDAWG``, ``IntDAWG`` doesn't support
244 | having several values for the same key.
245 | 
246 | ``IntDAWG`` constructor accepts an iterable with (unicode_key, integer_value)
247 | tuples::
248 | 
249 |     >>> data = [ (u'foo', 1), (u'bar', 2) ]
250 |     >>> int_dawg = dawg.IntDAWG(data)
251 | 
252 | It is then possible to get a value from the IntDAWG::
253 | 
254 |     >>> int_dawg[u'foo']
255 |     1
256 | 
257 | ``IntCompletionDAWG`` supports all ``IntDAWG`` and ``CompletionDAWG`` methods,
258 | plus ``.items()`` and ``.iteritems()``.
259 | 
260 | Persistence
261 | -----------
262 | 
263 | All DAWGs support saving/loading and pickling/unpickling.
264 | 
265 | Write DAWG to a stream::
266 | 
267 |     >>> with open('words.dawg', 'wb') as f:
268 |     ...     d.write(f)
269 | 
270 | Save DAWG to a file::
271 | 
272 |     >>> d.save('words.dawg')
273 | 
274 | Load DAWG from a file::
275 | 
276 |     >>> d = dawg.DAWG()
277 |     >>> d.load('words.dawg')
278 | 
279 | .. warning::
280 | 
281 |     Reading DAWGs from streams and unpickling are currently using 3x memory
282 |     compared to loading DAWGs using ``load`` method; please avoid them until
283 |     the issue is fixed.
284 | 
285 | Read DAWG from a stream::
286 | 
287 |     >>> d = dawg.RecordDAWG(format_string)
288 |     >>> with open('words.record-dawg', 'rb') as f:
289 |     ...     d.read(f)
290 | 
291 | DAWG objects are picklable::
292 | 
293 |     >>> import pickle
294 |     >>> data = pickle.dumps(d)
295 |     >>> d2 = pickle.loads(data)
296 | 
297 | Benchmarks
298 | ==========
299 | 
300 | For a list of 3000000 (3 million) Russian words memory consumption
301 | with different data structures (under Python 2.7):
302 | 
303 | * dict(unicode words -> word lengths): about 600M
304 | * list(unicode words) : about 300M
305 | * ``marisa_trie.RecordTrie`` : 11M
306 | * ``marisa_trie.Trie``: 7M
307 | * ``dawg.DAWG``: 2M
308 | * ``dawg.CompletionDAWG``: 3M
309 | * ``dawg.IntDAWG``: 2.7M
310 | * ``dawg.RecordDAWG``: 4M
311 | 
312 | 
313 | .. note::
314 | 
315 |     Lengths of words were not stored as values in ``dawg.DAWG``,
316 |     ``dawg.CompletionDAWG`` and ``marisa_trie.Trie`` because they don't
317 |     support this.
318 | 
319 | .. note::
320 | 
321 |     `marisa-trie`_ is often more memory efficient than
322 |     DAWG (depending on data); it can also handle larger datasets
323 |     and provides memory-mapped IO, so don't dismiss `marisa-trie`_
324 |     based on this README file. It is still several times slower than
325 |     DAWG though.
326 | 
327 | .. _marisa-trie: https://github.com/pytries/marisa-trie
328 | 
329 | Benchmark results (100k unicode words, integer values (lengths of the words),
330 | Python 3.3, MacBook Air i5 1.8 GHz)::
331 | 
332 |     dict __getitem__ (hits)           7.300M ops/sec
333 |     DAWG __getitem__ (hits)           not supported
334 |     BytesDAWG __getitem__ (hits)      1.230M ops/sec
335 |     RecordDAWG __getitem__ (hits)     0.792M ops/sec
336 |     IntDAWG __getitem__ (hits)        4.217M ops/sec
337 |     dict get() (hits)                 3.775M ops/sec
338 |     DAWG get() (hits)                 not supported
339 |     BytesDAWG get() (hits)            1.027M ops/sec
340 |     RecordDAWG get() (hits)           0.733M ops/sec
341 |     IntDAWG get() (hits)              3.162M ops/sec
342 |     dict get() (misses)               4.533M ops/sec
343 |     DAWG get() (misses)               not supported
344 |     BytesDAWG get() (misses)          3.545M ops/sec
345 |     RecordDAWG get() (misses)         3.485M ops/sec
346 |     IntDAWG get() (misses)            3.928M ops/sec
347 | 
348 |     dict __contains__ (hits)          7.090M ops/sec
349 |     DAWG __contains__ (hits)          4.685M ops/sec
350 |     BytesDAWG __contains__ (hits)     3.885M ops/sec
351 |     RecordDAWG __contains__ (hits)    3.898M ops/sec
352 |     IntDAWG __contains__ (hits)       4.612M ops/sec
353 | 
354 |     dict __contains__ (misses)        5.617M ops/sec
355 |     DAWG __contains__ (misses)        6.204M ops/sec
356 |     BytesDAWG __contains__ (misses)   6.026M ops/sec
357 |     RecordDAWG __contains__ (misses)  6.007M ops/sec
358 |     IntDAWG __contains__ (misses)     6.180M ops/sec
359 | 
360 |     DAWG.similar_keys  (no replaces)  0.492M ops/sec
361 |     DAWG.similar_keys  (l33t)         0.413M ops/sec
362 | 
363 |     dict items()                      55.032 ops/sec
364 |     DAWG items()                      not supported
365 |     BytesDAWG items()                 14.826 ops/sec
366 |     RecordDAWG items()                9.436 ops/sec
367 |     IntDAWG items()                   not supported
368 | 
369 |     dict keys()                       200.788 ops/sec
370 |     DAWG keys()                       not supported
371 |     BytesDAWG keys()                  20.657 ops/sec
372 |     RecordDAWG keys()                 20.873 ops/sec
373 |     IntDAWG keys()                    not supported
374 | 
375 |     DAWG.prefixes (hits)              1.552M ops/sec
376 |     DAWG.prefixes (mixed)             4.342M ops/sec
377 |     DAWG.prefixes (misses)            4.094M ops/sec
378 |     DAWG.iterprefixes (hits)          0.391M ops/sec
379 |     DAWG.iterprefixes (mixed)         0.476M ops/sec
380 |     DAWG.iterprefixes (misses)        0.498M ops/sec
381 | 
382 |     RecordDAWG.keys(prefix="xxx"), avg_len(res)==415             5.562K ops/sec
383 |     RecordDAWG.keys(prefix="xxxxx"), avg_len(res)==17            104.011K ops/sec
384 |     RecordDAWG.keys(prefix="xxxxxxxx"), avg_len(res)==3          318.129K ops/sec
385 |     RecordDAWG.keys(prefix="xxxxx..xx"), avg_len(res)==1.4       462.238K ops/sec
386 |     RecordDAWG.keys(prefix="xxx"), NON_EXISTING                  4292.625K ops/sec
387 | 
388 | 
389 | Please take these benchmark results with a grain of salt; this
390 | is a very simple benchmark on a single data set.
391 | 
392 | 
393 | Current limitations
394 | ===================
395 | 
396 | * ``IntDAWG`` is currently a subclass of ``DAWG`` and so it doesn't
397 |   support ``keys()`` and ``items()`` methods;
398 | * ``read()`` method reads the whole stream (DAWG must be the last or the
399 |   only item in a stream if it is read with ``read()`` method) - pickling
400 |   doesn't have this limitation;
401 | * DAWGs loaded with ``read()`` and unpickled DAWGs use 3x-4x memory
402 |   compared to DAWGs loaded with ``load()`` method;
403 | * there are ``keys()`` and ``items()`` methods but no ``values()`` method;
404 | * iterator versions of methods are not always implemented;
405 | * ``BytesDAWG`` and ``RecordDAWG`` have a limitation: values
406 |   larger than 8KB are unsupported;
407 | * the maximum number of DAWG units is limited: number of DAWG units
408 |   (and thus transitions - but not elements) should be less than 2^29;
409 |   this means that it may be impossible to build an especially huge DAWG
410 |   (you may split your data into several DAWGs or try `marisa-trie`_ in
411 |   this case).
412 | 
413 | Contributions are welcome!
414 | 
415 | 
416 | Contributing
417 | ============
418 | 
419 | Development happens at github: https://github.com/pytries/DAWG
420 | 
421 | Issue tracker: https://github.com/pytries/DAWG/issues
422 | 
423 | Feel free to submit ideas, bugs or pull requests.
424 | 
425 | If you found a bug in a C++ part please report it to the original
426 | `bug tracker <https://code.google.com/p/dawgdic/issues/list>`_.
427 | 
428 | How is source code organized
429 | ----------------------------
430 | 
431 | There are 4 folders in the repository:
432 | 
433 | * ``bench`` - benchmarks & benchmark data;
434 | * ``lib`` - original unmodified `dawgdic`_ C++ library and
435 |   a customized version of `libb64`_ library. They are bundled
436 |   for easier distribution; if something has to be fixed in these
437 |   libraries consider fixing it in the original repositories;
438 | * ``src`` - wrapper code; ``src/dawg.pyx`` is a wrapper implementation;
439 |   ``src/*.pxd`` files are Cython headers for corresponding C++ headers;
440 |   ``src/*.cpp`` files are the pre-built extension code and shouldn't be
441 |   modified directly (they should be updated via ``update_cpp.sh`` script).
442 | * ``tests`` - the test suite.
443 | 
444 | 
445 | Running tests and benchmarks
446 | ----------------------------
447 | 
448 | Make sure `tox`_ is installed and run
449 | 
450 | ::
451 | 
452 |     $ tox
453 | 
454 | from the source checkout. Tests should pass under python 2.7, 3.5-3.7.
455 | 
456 | In order to run benchmarks, type
457 | 
458 | ::
459 | 
460 |     $ tox -c bench.ini
461 | 
462 | .. _cython: http://cython.org
463 | .. _tox: http://tox.testrun.org
464 | 
465 | .. include:: ../AUTHORS.rst
466 | 
467 | .. include:: ../CHANGES.rst
468 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  linkcheck  to check all external links for integrity
 37 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 38 | 	goto end
 39 | )
 40 | 
 41 | if "%1" == "clean" (
 42 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 43 | 	del /q /s %BUILDDIR%\*
 44 | 	goto end
 45 | )
 46 | 
 47 | if "%1" == "html" (
 48 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 49 | 	if errorlevel 1 exit /b 1
 50 | 	echo.
 51 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 52 | 	goto end
 53 | )
 54 | 
 55 | if "%1" == "dirhtml" (
 56 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 57 | 	if errorlevel 1 exit /b 1
 58 | 	echo.
 59 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 60 | 	goto end
 61 | )
 62 | 
 63 | if "%1" == "singlehtml" (
 64 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 65 | 	if errorlevel 1 exit /b 1
 66 | 	echo.
 67 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 68 | 	goto end
 69 | )
 70 | 
 71 | if "%1" == "pickle" (
 72 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 73 | 	if errorlevel 1 exit /b 1
 74 | 	echo.
 75 | 	echo.Build finished; now you can process the pickle files.
 76 | 	goto end
 77 | )
 78 | 
 79 | if "%1" == "json" (
 80 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 81 | 	if errorlevel 1 exit /b 1
 82 | 	echo.
 83 | 	echo.Build finished; now you can process the JSON files.
 84 | 	goto end
 85 | )
 86 | 
 87 | if "%1" == "htmlhelp" (
 88 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
 89 | 	if errorlevel 1 exit /b 1
 90 | 	echo.
 91 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
 92 | .hhp project file in %BUILDDIR%/htmlhelp.
 93 | 	goto end
 94 | )
 95 | 
 96 | if "%1" == "qthelp" (
 97 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
 98 | 	if errorlevel 1 exit /b 1
 99 | 	echo.
100 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
101 | .qhcp project file in %BUILDDIR%/qthelp, like this:
102 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\DAWG.qhcp
103 | 	echo.To view the help file:
104 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\DAWG.qhc
105 | 	goto end
106 | )
107 | 
108 | if "%1" == "devhelp" (
109 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
110 | 	if errorlevel 1 exit /b 1
111 | 	echo.
112 | 	echo.Build finished.
113 | 	goto end
114 | )
115 | 
116 | if "%1" == "epub" (
117 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118 | 	if errorlevel 1 exit /b 1
119 | 	echo.
120 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "latex" (
125 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129 | 	goto end
130 | )
131 | 
132 | if "%1" == "text" (
133 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134 | 	if errorlevel 1 exit /b 1
135 | 	echo.
136 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
137 | 	goto end
138 | )
139 | 
140 | if "%1" == "man" (
141 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142 | 	if errorlevel 1 exit /b 1
143 | 	echo.
144 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
145 | 	goto end
146 | )
147 | 
148 | if "%1" == "texinfo" (
149 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150 | 	if errorlevel 1 exit /b 1
151 | 	echo.
152 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153 | 	goto end
154 | )
155 | 
156 | if "%1" == "gettext" (
157 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158 | 	if errorlevel 1 exit /b 1
159 | 	echo.
160 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161 | 	goto end
162 | )
163 | 
164 | if "%1" == "changes" (
165 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166 | 	if errorlevel 1 exit /b 1
167 | 	echo.
168 | 	echo.The overview file is in %BUILDDIR%/changes.
169 | 	goto end
170 | )
171 | 
172 | if "%1" == "linkcheck" (
173 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174 | 	if errorlevel 1 exit /b 1
175 | 	echo.
176 | 	echo.Link check complete; look for any errors in the above output ^
177 | or in %BUILDDIR%/linkcheck/output.txt.
178 | 	goto end
179 | )
180 | 
181 | if "%1" == "doctest" (
182 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183 | 	if errorlevel 1 exit /b 1
184 | 	echo.
185 | 	echo.Testing of doctests in the sources finished, look at the ^
186 | results in %BUILDDIR%/doctest/output.txt.
187 | 	goto end
188 | )
189 | 
190 | :end
191 | 


--------------------------------------------------------------------------------
/lib/AUTHORS:
--------------------------------------------------------------------------------
1 | Susumu Yata <syata@acm.org>
2 | 


--------------------------------------------------------------------------------
/lib/COPYING:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2009-2012, Susumu Yata
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 5 | 
 6 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
 7 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
 8 | - Neither the name of the University of Tokushima nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
 9 | 
10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
11 | 


--------------------------------------------------------------------------------
/lib/b64/AUTHORS:
--------------------------------------------------------------------------------
1 | libb64: Base64 Encoding/Decoding Routines
2 | ======================================
3 | 
4 | Authors:
5 | -------
6 | 
7 | Chris Venter	chris.venter@gmail.com	http://rocketpod.blogspot.com
8 | 


--------------------------------------------------------------------------------
/lib/b64/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright-Only Dedication (based on United States law) 
 2 | or Public Domain Certification
 3 | 
 4 | The person or persons who have associated work with this document (the
 5 | "Dedicator" or "Certifier") hereby either (a) certifies that, to the best of
 6 | his knowledge, the work of authorship identified is in the public domain of the
 7 | country from which the work is published, or (b) hereby dedicates whatever
 8 | copyright the dedicators holds in the work of authorship identified below (the
 9 | "Work") to the public domain. A certifier, moreover, dedicates any copyright
10 | interest he may have in the associated work, and for these purposes, is
11 | described as a "dedicator" below.
12 | 
13 | A certifier has taken reasonable steps to verify the copyright status of this
14 | work. Certifier recognizes that his good faith efforts may not shield him from
15 | liability if in fact the work certified is not in the public domain.
16 | 
17 | Dedicator makes this dedication for the benefit of the public at large and to
18 | the detriment of the Dedicator's heirs and successors. Dedicator intends this
19 | dedication to be an overt act of relinquishment in perpetuity of all present
20 | and future rights under copyright law, whether vested or contingent, in the
21 | Work. Dedicator understands that such relinquishment of all rights includes
22 | the relinquishment of all rights to enforce (by lawsuit or otherwise) those
23 | copyrights in the Work.
24 | 
25 | Dedicator recognizes that, once placed in the public domain, the Work may be
26 | freely reproduced, distributed, transmitted, used, modified, built upon, or
27 | otherwise exploited by anyone for any purpose, commercial or non-commercial,
28 | and in any way, including by methods that have not yet been invented or
29 | conceived.


--------------------------------------------------------------------------------
/lib/b64/cdecode.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | cdecoder.c - c source to a base64 decoding algorithm implementation
 3 | 
 4 | This is part of the libb64 project, and has been placed in the public domain.
 5 | For details, see http://sourceforge.net/projects/libb64
 6 | */
 7 | 
 8 | #include <b64/cdecode.h>
 9 | 
/*
 * Maps one character of base64 text to its 6-bit value.
 *
 * Returns 0..63 for a character of the base64 alphabet, -2 for the padding
 * character '=', and -1 for every other character.
 */
int base64_decode_value(char value_in)
{
	/* Lookup table for the ASCII range '+' (43) .. 'z' (122).
	   signed char keeps the -1/-2 markers negative even where plain char
	   is unsigned. */
	static const signed char decoding[] = {62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51};
	static const int decoding_size = (int)sizeof(decoding);
	/* Compute the index in int from the unsigned character code so the
	   result does not depend on the signedness of plain char. */
	const int index = (int)(unsigned char)value_in - 43;
	/* >= (not >): index == decoding_size is already one past the table. */
	if (index < 0 || index >= decoding_size) return -1;
	return decoding[index];
}
18 | 
19 | void base64_init_decodestate(base64_decodestate* state_in)
20 | {
21 | 	state_in->step = step_a;
22 | 	state_in->plainchar = 0;
23 | }
24 | 
25 | int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in)
26 | {
27 | 	const char* codechar = code_in;
28 | 	char* plainchar = plaintext_out;
29 | 	char fragment;
30 | 	
31 | 	*plainchar = state_in->plainchar;
32 | 	
33 | 	switch (state_in->step)
34 | 	{
35 | 		while (1)
36 | 		{
37 | 	case step_a:
38 | 			do {
39 | 				if (codechar == code_in+length_in)
40 | 				{
41 | 					state_in->step = step_a;
42 | 					state_in->plainchar = *plainchar;
43 | 					return plainchar - plaintext_out;
44 | 				}
45 | 				fragment = (char)base64_decode_value(*codechar++);
46 | 			} while (fragment < 0);
47 | 			*plainchar    = (fragment & 0x03f) << 2;
48 | 	case step_b:
49 | 			do {
50 | 				if (codechar == code_in+length_in)
51 | 				{
52 | 					state_in->step = step_b;
53 | 					state_in->plainchar = *plainchar;
54 | 					return plainchar - plaintext_out;
55 | 				}
56 | 				fragment = (char)base64_decode_value(*codechar++);
57 | 			} while (fragment < 0);
58 | 			*plainchar++ |= (fragment & 0x030) >> 4;
59 | 			*plainchar    = (fragment & 0x00f) << 4;
60 | 	case step_c:
61 | 			do {
62 | 				if (codechar == code_in+length_in)
63 | 				{
64 | 					state_in->step = step_c;
65 | 					state_in->plainchar = *plainchar;
66 | 					return plainchar - plaintext_out;
67 | 				}
68 | 				fragment = (char)base64_decode_value(*codechar++);
69 | 			} while (fragment < 0);
70 | 			*plainchar++ |= (fragment & 0x03c) >> 2;
71 | 			*plainchar    = (fragment & 0x003) << 6;
72 | 	case step_d:
73 | 			do {
74 | 				if (codechar == code_in+length_in)
75 | 				{
76 | 					state_in->step = step_d;
77 | 					state_in->plainchar = *plainchar;
78 | 					return plainchar - plaintext_out;
79 | 				}
80 | 				fragment = (char)base64_decode_value(*codechar++);
81 | 			} while (fragment < 0);
82 | 			*plainchar++   |= (fragment & 0x03f);
83 | 		}
84 | 	}
85 | 	/* control should not reach here */
86 | 	return plainchar - plaintext_out;
87 | }
88 | 
89 | 


--------------------------------------------------------------------------------
/lib/b64/cdecode.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | cdecode.h - c header for a base64 decoding algorithm
 3 | 
 4 | This is part of the libb64 project, and has been placed in the public domain.
 5 | For details, see http://sourceforge.net/projects/libb64
 6 | */
 7 | 
 8 | #ifndef BASE64_CDECODE_H
 9 | #define BASE64_CDECODE_H
10 | 
/* Position of the next expected character inside a 4-character base64
   group: step_a is the first character, step_d the fourth. */
typedef enum
{
	step_a, step_b, step_c, step_d
} base64_decodestep;

/* Decoder state carried between calls to base64_decode_block(). */
typedef struct
{
	/* Where in the 4-character group decoding resumes. */
	base64_decodestep step;
	/* The output byte that was only partially assembled so far. */
	char plainchar;
} base64_decodestate;

/* Resets state_in to the start of a new base64 stream. */
void base64_init_decodestate(base64_decodestate* state_in);

/* Returns the decoded value of a single base64 character; the result is
   negative for characters that base64_decode_block() skips. */
int base64_decode_value(char value_in);

/* Decodes length_in characters from code_in into plaintext_out, resuming
   from and updating state_in; returns the number of bytes produced. */
int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in);
27 | 
28 | #endif /* BASE64_CDECODE_H */
29 | 


--------------------------------------------------------------------------------
/lib/b64/cencode.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | cencoder.c - c source to a base64 encoding algorithm implementation
  3 | 
  4 | This is part of the libb64 project, and has been placed in the public domain.
  5 | For details, see http://sourceforge.net/projects/libb64
  6 | */
  7 | 
  8 | #include <b64/cencode.h>
  9 | 
 10 | const int CHARS_PER_LINE = 72;
 11 | 
 12 | void base64_init_encodestate(base64_encodestate* state_in)
 13 | {
 14 | 	state_in->step = step_A;
 15 | 	state_in->result = 0;
 16 | 	state_in->stepcount = 0;
 17 | }
 18 | 
 19 | char base64_encode_value(char value_in)
 20 | {
 21 | 	static const char* encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 22 | 	if (value_in > 63) return '=';
 23 | 	return encoding[(int)value_in];
 24 | }
 25 | 
/*
 * Encodes length_in bytes from plaintext_in as base64 text into code_out,
 * resuming from and saving progress to state_in so input may be fed in
 * pieces. Bits that do not yet fill an output character are kept in
 * state_in->result; base64_encode_blockend() flushes them.
 *
 * Returns the number of characters written to code_out.
 */
int base64_encode_block(const char* plaintext_in, int length_in, char* code_out, base64_encodestate* state_in)
{
	const char* plainchar = plaintext_in;
	const char* const plaintextend = plaintext_in + length_in;
	char* codechar = code_out;
	char result;
	char fragment;
	
	/* Restore the bits left over from the previous call. */
	result = state_in->result;
	
	/* The switch jumps into the middle of the loop so encoding resumes at
	   the position within the 3-byte group where the last call stopped. */
	switch (state_in->step)
	{
		while (1)
		{
	case step_A:
			/* 1st byte: emits its top 6 bits, keeps the low 2 bits. */
			if (plainchar == plaintextend)
			{
				state_in->result = result;
				state_in->step = step_A;
				return codechar - code_out;
			}
			fragment = *plainchar++;
			result = (fragment & 0x0fc) >> 2;
			*codechar++ = base64_encode_value(result);
			result = (fragment & 0x003) << 4;
	case step_B:
			/* 2nd byte: completes a character with its top 4 bits, keeps the low 4 bits. */
			if (plainchar == plaintextend)
			{
				state_in->result = result;
				state_in->step = step_B;
				return codechar - code_out;
			}
			fragment = *plainchar++;
			result |= (fragment & 0x0f0) >> 4;
			*codechar++ = base64_encode_value(result);
			result = (fragment & 0x00f) << 2;
	case step_C:
			/* 3rd byte: completes a character with its top 2 bits, then emits its low 6 bits. */
			if (plainchar == plaintextend)
			{
				state_in->result = result;
				state_in->step = step_C;
				return codechar - code_out;
			}
			fragment = *plainchar++;
			result |= (fragment & 0x0c0) >> 6;
			*codechar++ = base64_encode_value(result);
			result  = (fragment & 0x03f) >> 0;
			*codechar++ = base64_encode_value(result);
			
			/* One full group (4 output characters) is done; break the
			   line after CHARS_PER_LINE characters. */
			++(state_in->stepcount);
			if (state_in->stepcount == CHARS_PER_LINE/4)
			{
				*codechar++ = '\n';
				state_in->stepcount = 0;
			}
		}
	}
	/* control should not reach here */
	return codechar - code_out;
}
 86 | 
 87 | int base64_encode_blockend(char* code_out, base64_encodestate* state_in)
 88 | {
 89 | 	char* codechar = code_out;
 90 | 	
 91 | 	switch (state_in->step)
 92 | 	{
 93 | 	case step_B:
 94 | 		*codechar++ = base64_encode_value(state_in->result);
 95 | 		*codechar++ = '=';
 96 | 		*codechar++ = '=';
 97 | 		break;
 98 | 	case step_C:
 99 | 		*codechar++ = base64_encode_value(state_in->result);
100 | 		*codechar++ = '=';
101 | 		break;
102 | 	case step_A:
103 | 		break;
104 | 	}
105 | 	*codechar++ = '\n';
106 | 	
107 | 	return codechar - code_out;
108 | }
109 | 
110 | 


--------------------------------------------------------------------------------
/lib/b64/cencode.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | cencode.h - c header for a base64 encoding algorithm
 3 | 
 4 | This is part of the libb64 project, and has been placed in the public domain.
 5 | For details, see http://sourceforge.net/projects/libb64
 6 | */
 7 | 
 8 | #ifndef BASE64_CENCODE_H
 9 | #define BASE64_CENCODE_H
10 | 
/* Position of the next input byte inside a 3-byte group: step_A is the
   first byte, step_C the third. */
typedef enum
{
	step_A, step_B, step_C
} base64_encodestep;

/* Encoder state carried between calls to base64_encode_block(). */
typedef struct
{
	/* Where in the 3-byte group encoding resumes. */
	base64_encodestep step;
	/* Bits of the last input byte that have not been emitted yet. */
	char result;
	/* Number of complete groups emitted on the current output line. */
	int stepcount;
} base64_encodestate;

/* Resets state_in to the start of a new stream. */
void base64_init_encodestate(base64_encodestate* state_in);

/* Returns the base64 alphabet character for a 6-bit value. */
char base64_encode_value(char value_in);

/* Encodes length_in bytes from plaintext_in into code_out, resuming from
   and updating state_in; returns the number of characters written. */
int base64_encode_block(const char* plaintext_in, int length_in, char* code_out, base64_encodestate* state_in);

/* Flushes pending bits, writes padding and a final newline into code_out;
   returns the number of characters written. */
int base64_encode_blockend(char* code_out, base64_encodestate* state_in);
30 | 
31 | #endif /* BASE64_CENCODE_H */
32 | 


--------------------------------------------------------------------------------
/lib/b64/decode.h:
--------------------------------------------------------------------------------
 1 | // :mode=c++:
 2 | /*
 3 | decode.h - c++ wrapper for a base64 decoding algorithm
 4 | 
 5 | This is part of the libb64 project, and has been placed in the public domain.
 6 | For details, see http://sourceforge.net/projects/libb64
 7 | */
 8 | #ifndef BASE64_DECODE_H
 9 | #define BASE64_DECODE_H
10 | 
11 | #include <iostream>
12 | 
13 | namespace base64
14 | {
15 | 	extern "C"
16 | 	{
17 | 		#include "cdecode.h"
18 | 	}
19 | 
20 | 	struct decoder
21 | 	{
22 | 		base64_decodestate _state;
23 | 		int _buffersize;
24 | 
25 | 		decoder(int buffersize_in = 4096)
26 | 		: _buffersize(buffersize_in)
27 | 		{}
28 | 
29 | 		int decode(char value_in)
30 | 		{
31 | 			return base64_decode_value(value_in);
32 | 		}
33 | 
34 | 		int decode(const char* code_in, const int length_in, char* plaintext_out)
35 | 		{
36 | 			return base64_decode_block(code_in, length_in, plaintext_out, &_state);
37 | 		}
38 | 
39 | 		void decode(std::istream& istream_in, std::ostream& ostream_in)
40 | 		{
41 | 			base64_init_decodestate(&_state);
42 | 			//
43 | 			const int N = _buffersize;
44 | 			char* code = new char[N];
45 | 			char* plaintext = new char[N];
46 | 			int codelength;
47 | 			int plainlength;
48 | 
49 | 			do
50 | 			{
51 | 				istream_in.read((char*)code, N);
52 | 				codelength = istream_in.gcount();
53 | 				plainlength = decode(code, codelength, plaintext);
54 | 				ostream_in.write((const char*)plaintext, plainlength);
55 | 			}
56 | 			while (istream_in.good() && codelength > 0);
57 | 			//
58 | 			base64_init_decodestate(&_state);
59 | 
60 | 			delete [] code;
61 | 			delete [] plaintext;
62 | 		}
63 | 
64 | 		void init()
65 | 		{
66 | 			base64_init_decodestate(&_state);
67 | 		}
68 | 	};
69 | 
70 | } // namespace base64
71 | 
72 | 
73 | 
74 | #endif // BASE64_DECODE_H
75 | 
76 | 


--------------------------------------------------------------------------------
/lib/b64/encode.h:
--------------------------------------------------------------------------------
 1 | // :mode=c++:
 2 | /*
 3 | encode.h - c++ wrapper for a base64 encoding algorithm
 4 | 
 5 | This is part of the libb64 project, and has been placed in the public domain.
 6 | For details, see http://sourceforge.net/projects/libb64
 7 | */
 8 | #ifndef BASE64_ENCODE_H
 9 | #define BASE64_ENCODE_H
10 | 
11 | #include <iostream>
12 | 
13 | namespace base64
14 | {
15 | 	extern "C" 
16 | 	{
17 | 		#include "cencode.h"
18 | 	}
19 | 
20 | 	struct encoder
21 | 	{
22 | 		base64_encodestate _state;
23 | 		int _buffersize;
24 | 
25 | 		encoder(int buffersize_in = BUFFERSIZE)
26 | 		: _buffersize(buffersize_in)
27 | 		{}
28 | 
29 | 		int encode(char value_in)
30 | 		{
31 | 			return base64_encode_value(value_in);
32 | 		}
33 | 
34 | 		int encode(const char* code_in, const int length_in, char* plaintext_out)
35 | 		{
36 | 			return base64_encode_block(code_in, length_in, plaintext_out, &_state);
37 | 		}
38 | 
39 | 		int encode_end(char* plaintext_out)
40 | 		{
41 | 			return base64_encode_blockend(plaintext_out, &_state);
42 | 		}
43 | 
44 | 		void encode(std::istream& istream_in, std::ostream& ostream_in)
45 | 		{
46 | 			base64_init_encodestate(&_state);
47 | 			//
48 | 			const int N = _buffersize;
49 | 			char* plaintext = new char[N];
50 | 			char* code = new char[2*N];
51 | 			int plainlength;
52 | 			int codelength;
53 | 
54 | 			do
55 | 			{
56 | 				istream_in.read(plaintext, N);
57 | 				plainlength = istream_in.gcount();
58 | 				//
59 | 				codelength = encode(plaintext, plainlength, code);
60 | 				ostream_in.write(code, codelength);
61 | 			}
62 | 			while (istream_in.good() && plainlength > 0);
63 | 
64 | 			codelength = encode_end(code);
65 | 			ostream_in.write(code, codelength);
66 | 			//
67 | 			base64_init_encodestate(&_state);
68 | 
69 | 			delete [] code;
70 | 			delete [] plaintext;
71 | 		}
72 | 	};
73 | 
74 | } // namespace base64
75 | 
76 | #endif // BASE64_ENCODE_H
77 | 
78 | 


--------------------------------------------------------------------------------
/lib/dawgdic/base-types.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_BASE_TYPES_H
 2 | #define DAWGDIC_BASE_TYPES_H
 3 | 
 4 | #include <cstddef>
 5 | 
// Fundamental type aliases shared by the dawgdic headers.
namespace dawgdic {

// 8-bit characters.
// CharType is the character type of keys; UCharType is the unsigned form
// used when a character serves as a transition label.
typedef char CharType;
typedef unsigned char UCharType;

// 32-bit integer.
// Type of the value stored with a key.
// NOTE(review): the typedef does not enforce a width; this assumes `int`
// is 32 bits on the target platform.
typedef int ValueType;

// 32-bit unsigned integer.
// Type of unit contents and unit indices.
// NOTE(review): same width assumption as above for `unsigned int`.
typedef unsigned int BaseType;

// 32 or 64-bit unsigned integer.
// Type used for sizes and counts.
typedef std::size_t SizeType;

}  // namespace dawgdic
22 | 
23 | #endif  // DAWGDIC_BASE_TYPES_H
24 | 


--------------------------------------------------------------------------------
/lib/dawgdic/base-unit.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_BASE_UNIT_H
 2 | #define DAWGDIC_BASE_UNIT_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | // Unit for building a dawg.
 9 | class BaseUnit {
10 |  public:
11 |   BaseUnit() : base_(0) {}
12 | 
13 |   // Writes values.
14 |   void set_base(BaseType base) {
15 |     base_ = base;
16 |   }
17 |   BaseType base() const {
18 |     return base_;
19 |   }
20 | 
21 |   // Reads values.
22 |   BaseType child() const {
23 |     return base_ >> 2;
24 |   }
25 |   bool has_sibling() const {
26 |     return (base_ & 1) ? true : false;
27 |   }
28 |   ValueType value() const {
29 |     return static_cast<ValueType>(base_ >> 1);
30 |   }
31 |   bool is_state() const {
32 |     return (base_ & 2) ? true : false;
33 |   }
34 | 
35 |  private:
36 |   BaseType base_;
37 | 
38 |   // Copyable.
39 | };
40 | 
41 | }  // namespace dawgdic
42 | 
43 | #endif  // DAWGDIC_BASE_UNIT_H
44 | 


--------------------------------------------------------------------------------
/lib/dawgdic/bit-pool.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_BIT_POOL_H
 2 | #define DAWGDIC_BIT_POOL_H
 3 | 
 4 | #include "object-pool.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | // This class works as an array of bit flags with compact memory management.
 9 | template <SizeType BLOCK_SIZE = 1 << 10>
10 | class BitPool {
11 |  public:
12 |   BitPool() : pool_(), size_(0) {}
13 | 
14 |   // Accessors.
15 |   void set(SizeType index, bool bit) {
16 |     SizeType pool_index = PoolIndex(index);
17 |     UCharType bit_flag = BitFlag(index);
18 |     if (bit) {
19 |       pool_[pool_index] |= bit_flag;
20 |     } else {
21 |       pool_[pool_index] &= ~bit_flag;
22 |     }
23 |   }
24 |   bool get(SizeType index) const {
25 |     SizeType pool_index = PoolIndex(index);
26 |     UCharType bit_flag = BitFlag(index);
27 |     return (pool_[pool_index] & bit_flag) ? true : false;
28 |   }
29 | 
30 |   // Deletes all bits and frees memory.
31 |   void Clear() {
32 |     pool_.Clear();
33 |     size_ = 0;
34 |   }
35 | 
36 |   // Swaps bit pools.
37 |   void Swap(BitPool *bit_pool) {
38 |     pool_.Swap(&bit_pool->pool_);
39 |   }
40 | 
41 |   // Allocates memory for a new bit and returns its ID.
42 |   // Note: Allocated bits are filled with false.
43 |   SizeType Allocate() {
44 |     SizeType pool_index = PoolIndex(size_);
45 |     if (pool_index == pool_.size()) {
46 |       pool_.Allocate();
47 |       pool_[pool_index] = '\0';
48 |     }
49 |     return size_++;
50 |   }
51 | 
52 |  private:
53 |   ObjectPool<UCharType> pool_;
54 |   SizeType size_;
55 | 
56 |   // Disallows copies.
57 |   BitPool(const BitPool &);
58 |   BitPool &operator=(const BitPool &);
59 | 
60 |   static SizeType PoolIndex(SizeType index) {
61 |     return index / 8;
62 |   }
63 |   static UCharType BitFlag(BaseType index) {
64 |     return static_cast<UCharType>(1) << (index % 8);
65 |   }
66 | };
67 | 
68 | }  // namespace dawgdic
69 | 
70 | #endif  // DAWGDIC_BIT_POOL_H
71 | 


--------------------------------------------------------------------------------
/lib/dawgdic/completer.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_COMPLETER_H
  2 | #define DAWGDIC_COMPLETER_H
  3 | 
  4 | #include "dictionary.h"
  5 | #include "guide.h"
  6 | 
  7 | #include <vector>
  8 | 
namespace dawgdic {

// Enumerates the keys reachable from a dictionary index by walking the
// dictionary with the help of a guide. Usage: Start() once, then call
// Next() repeatedly; while it returns true, key(), length() and value()
// describe the current key.
class Completer {
 public:
  Completer()
    : dic_(NULL), guide_(NULL), key_(), index_stack_(), last_index_(0) {}
  // Stores the addresses of `dic` and `guide`; both objects are accessed
  // through these pointers by later calls.
  Completer(const Dictionary &dic, const Guide &guide)
    : dic_(&dic), guide_(&guide), key_(), index_stack_(), last_index_(0) {}

  void set_dic(const Dictionary &dic) {
    dic_ = &dic;
  }
  void set_guide(const Guide &guide) {
    guide_ = &guide;
  }

  // These dereference the stored pointers, which are NULL after default
  // construction until set_dic() / set_guide() have been called.
  const Dictionary &dic() const {
    return *dic_;
  }
  const Guide &guide() const {
    return *guide_;
  }

  // These member functions are available only when Next() returns true.
  // key() points at the NUL-terminated key buffer.
  const char *key() const {
    return reinterpret_cast<const char *>(&key_[0]);
  }
  // Key length without the terminating NUL.
  SizeType length() const {
    return key_.size() - 1;
  }
  // Value stored at the terminal found by the last Next().
  ValueType value() const {
    return dic_->value(last_index_);
  }

  // Starts completing keys from given index and prefix.
  // This overload takes a NUL-terminated prefix and measures its length.
  void Start(BaseType index, const char *prefix = "") {
    SizeType length = 0;
    for (const char *p = prefix; *p != '\0'; ++p) {
      ++length;
    }
    Start(index, prefix, length);
  }
  // Same, with an explicit prefix length. The prefix is copied into the key
  // buffer and a terminating NUL is appended.
  void Start(BaseType index, const char *prefix, SizeType length) {
    key_.resize(length + 1);
    for (SizeType i = 0; i < length; ++i) {
      key_[i] = prefix[i];
    }
    key_[length] = '\0';

    index_stack_.clear();
    // With an empty guide the stack stays empty, so Next() returns false.
    if (guide_->size() != 0) {
      index_stack_.push_back(index);
      // last_index_ == root marks "Next() has not produced a key yet".
      last_index_ = dic_->root();
    }
  }

  // Gets the next key.
  // Returns false when there are no more keys or a transition fails.
  bool Next() {
    if (index_stack_.empty()) {
      return false;
    }
    BaseType index = index_stack_.back();

    // Skipped on the first call after Start(): the starting index itself is
    // examined first. Afterwards the walk advances past the previous key.
    if (last_index_ != dic_->root()) {
      UCharType child_label = guide_->child(index);
      if (child_label != '\0') {
        // Follows a transition to the first child.
        if (!Follow(child_label, &index))
          return false;
      } else {
        // No child: climb until some node on the path has a sibling.
        for ( ; ; ) {
          // Read before popping: the sibling label belongs to the node
          // that is about to be left.
          UCharType sibling_label = guide_->sibling(index);

          // Moves to the previous node.
          if (key_.size() > 1) {
            key_.resize(key_.size() - 1);
            key_.back() = '\0';
          }
          index_stack_.resize(index_stack_.size() - 1);
          if (index_stack_.empty()) {
            return false;
          }

          index = index_stack_.back();
          if (sibling_label != '\0') {
            // Follows a transition to the next sibling.
            if (!Follow(sibling_label, &index)) {
              return false;
            }
            break;
          }
        }
      }
    }

    // Finds a terminal.
    return FindTerminal(index);
  }

 private:
  const Dictionary *dic_;
  const Guide *guide_;
  // Current key, always kept NUL-terminated once Start() has run.
  std::vector<UCharType> key_;
  // Dictionary indices along the current path; back() is the current node.
  std::vector<BaseType> index_stack_;
  // Index of the terminal found by the last successful Next().
  BaseType last_index_;

  // Disallows copies.
  Completer(const Completer &);
  Completer &operator=(const Completer &);

  // Follows a transition.
  // On success appends `label` to the key and pushes the new index.
  bool Follow(UCharType label, BaseType *index) {
    if (!dic_->Follow(label, index)) {
      return false;
    }

    // Overwrite the terminating NUL with the label, then re-terminate.
    key_.back() = label;
    key_.push_back('\0');
    index_stack_.push_back(*index);
    return true;
  }

  // Finds a terminal.
  // Descends along the guide's child labels until a node with a value is
  // reached, extending the key and the index stack on the way.
  bool FindTerminal(BaseType index) {
    while (!dic_->has_value(index)) {
      UCharType label = guide_->child(index);
      if (!dic_->Follow(label, &index)) {
        return false;
      }

      key_.back() = label;
      key_.push_back('\0');
      index_stack_.push_back(index);
    }

    last_index_ = index;
    return true;
  }
};

}  // namespace dawgdic
150 | 
151 | #endif  // DAWGDIC_COMPLETER_H
152 | 


--------------------------------------------------------------------------------
/lib/dawgdic/dawg-builder.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_DAWG_BUILDER_H
  2 | #define DAWGDIC_DAWG_BUILDER_H
  3 | 
  4 | #include <algorithm>
  5 | #include <stack>
  6 | #include <vector>
  7 | 
  8 | #include "dawg.h"
  9 | #include "dawg-unit.h"
 10 | 
 11 | namespace dawgdic {
 12 | 
 13 | // DAWG builder.
 14 | class DawgBuilder {
 15 |  public:
 16 |   explicit DawgBuilder(SizeType initial_hash_table_size =
 17 |                        DEFAULT_INITIAL_HASH_TABLE_SIZE)
 18 |     : initial_hash_table_size_(initial_hash_table_size),
 19 |       base_pool_(), label_pool_(), flag_pool_(), unit_pool_(),
 20 |       hash_table_(), unfixed_units_(), unused_units_(), num_of_states_(1),
 21 |       num_of_merged_transitions_(0), num_of_merging_states_(0) {}
 22 | 
 23 |   // Number of units.
 24 |   SizeType size() const {
 25 |     return base_pool_.size();
 26 |   }
 27 |   // Number of transitions.
 28 |   SizeType num_of_transitions() const {
 29 |     return base_pool_.size() - 1;
 30 |   }
 31 |   // Number of states.
 32 |   SizeType num_of_states() const {
 33 |     return num_of_states_;
 34 |   }
 35 |   // Number of merged transitions.
 36 |   SizeType num_of_merged_transitions() const {
 37 |     return num_of_merged_transitions_;
 38 |   }
 39 |   // Number of merged states.
 40 |   SizeType num_of_merged_states() const {
 41 |     return num_of_transitions()
 42 |         + num_of_merged_transitions() + 1 - num_of_states();
 43 |   }
 44 |   // Number of merging states.
 45 |   SizeType num_of_merging_states() const {
 46 |     return num_of_merging_states_;
 47 |   }
 48 | 
 49 |   // Initializes a builder.
 50 |   void Clear() {
 51 |     base_pool_.Clear();
 52 |     label_pool_.Clear();
 53 |     flag_pool_.Clear();
 54 |     unit_pool_.Clear();
 55 | 
 56 |     std::vector<BaseType>(0).swap(hash_table_);
 57 |     while (!unfixed_units_.empty()) {
 58 |       unfixed_units_.pop();
 59 |     }
 60 |     while (!unused_units_.empty()) {
 61 |       unused_units_.pop();
 62 |     }
 63 | 
 64 |     num_of_states_ = 1;
 65 |     num_of_merged_transitions_ = 0;
 66 |     num_of_merging_states_ = 0;
 67 |   }
 68 | 
 69 |   // Inserts a key.
 70 |   bool Insert(const CharType *key, ValueType value = 0) {
 71 |     if (key == NULL || *key == '\0' || value < 0) {
 72 |       return false;
 73 |     }
 74 |     SizeType length = 1;
 75 |     while (key[length]) {
 76 |       ++length;
 77 |     }
 78 |     return InsertKey(key, length, value);
 79 |   }
 80 | 
 81 |   // Inserts a key.
 82 |   bool Insert(const CharType *key, SizeType length, ValueType value) {
 83 |     if (key == NULL || length <= 0 || value < 0) {
 84 |       return false;
 85 |     }
 86 |     for (SizeType i = 0; i < length; ++i) {
 87 |       if (key[i] == '\0') {
 88 |         return false;
 89 |       }
 90 |     }
 91 |     return InsertKey(key, length, value);
 92 |   }
 93 | 
 94 |   // Finishes building a dawg.
 95 |   bool Finish(Dawg *dawg) {
 96 |     // Initializes a builder if not initialized.
 97 |     if (hash_table_.empty()) {
 98 |       Init();
 99 |     }
100 | 
101 |     FixUnits(0);
102 |     base_pool_[0].set_base(unit_pool_[0].base());
103 |     label_pool_[0] = unit_pool_[0].label();
104 | 
105 |     dawg->set_num_of_states(num_of_states_);
106 |     dawg->set_num_of_merged_transitions(num_of_merged_transitions_);
107 |     dawg->set_num_of_merged_states(num_of_merged_states());
108 |     dawg->set_num_of_merging_states(num_of_merging_states_);
109 | 
110 |     dawg->SwapBasePool(&base_pool_);
111 |     dawg->SwapLabelPool(&label_pool_);
112 |     dawg->SwapFlagPool(&flag_pool_);
113 | 
114 |     Clear();
115 |     return true;
116 |   }
117 | 
118 |  private:
119 |   enum {
120 |     DEFAULT_INITIAL_HASH_TABLE_SIZE = 1 << 8
121 |   };
122 | 
123 |   const SizeType initial_hash_table_size_;
124 |   ObjectPool<BaseUnit> base_pool_;
125 |   ObjectPool<UCharType> label_pool_;
126 |   BitPool<> flag_pool_;
127 |   ObjectPool<DawgUnit> unit_pool_;
128 |   std::vector<BaseType> hash_table_;
129 |   std::stack<BaseType> unfixed_units_;
130 |   std::stack<BaseType> unused_units_;
131 |   SizeType num_of_states_;
132 |   SizeType num_of_merged_transitions_;
133 |   SizeType num_of_merging_states_;
134 | 
135 |   // Disallows copies.
136 |   DawgBuilder(const DawgBuilder &);
137 |   DawgBuilder &operator=(const DawgBuilder &);
138 | 
139 |   // Inserts a key and its value. Returns false if the key is out of order.
140 |   bool InsertKey(const CharType *key, SizeType length, ValueType value) {
141 |     // Initializes a builder if not initialized.
142 |     if (hash_table_.empty()) {
143 |       Init();
144 |     }
145 | 
146 |     BaseType index = 0;  // Starts at unit 0, which Init() sets up.
147 |     SizeType key_pos = 0;
148 | 
149 |     // Finds a separate unit: follows child links while the labels match the key.
150 |     for ( ; key_pos <= length; ++key_pos) {
151 |       BaseType child_index = unit_pool_[index].child();
152 |       if (!child_index) {
153 |         break;
154 |       }
155 | 
156 |       UCharType key_label = static_cast<UCharType>(
157 |           (key_pos < length) ? key[key_pos] : '\0');  // '\0' once key_pos == length.
158 |       UCharType unit_label = unit_pool_[child_index].label();
159 | 
160 |       // Checks the order of keys: a label smaller than the existing child's fails.
161 |       if (key_label < unit_label) {
162 |         return false;
163 |       } else if (key_label > unit_label) {
164 |         unit_pool_[child_index].set_has_sibling(true);  // A sibling is added by the loop below.
165 |         FixUnits(child_index);  // Fixes the units stacked above child_index, then pops it.
166 |         break;
167 |       }
168 | 
169 |       index = child_index;
170 |     }
171 | 
172 |     // Adds new units: one per remaining label, ending with the '\0' label.
173 |     for ( ; key_pos <= length; ++key_pos) {
174 |       UCharType key_label = static_cast<UCharType>(
175 |           (key_pos < length) ? key[key_pos] : '\0');
176 |       BaseType child_index = AllocateUnit();
177 | 
178 |       if (!unit_pool_[index].child()) {  // The first child of a unit gets the is_state flag.
179 |         unit_pool_[child_index].set_is_state(true);
180 |       }
181 |       unit_pool_[child_index].set_sibling(unit_pool_[index].child());  // Prepends to the sibling list.
182 |       unit_pool_[child_index].set_label(key_label);
183 |       unit_pool_[index].set_child(child_index);
184 |       unfixed_units_.push(child_index);
185 | 
186 |       index = child_index;
187 |     }
188 |     unit_pool_[index].set_value(value);  // DawgUnit keeps the value in its child field.
189 |     return true;
190 |   }
191 | 
192 |   // Initializes an object: sizes the hash table and sets up unit 0.
193 |   void Init() {
194 |     hash_table_.resize(initial_hash_table_size_, 0);  // 0 marks an empty slot.
195 |     AllocateUnit();  // Presumably yields unit 0, used below - assumes unit_pool_ is empty here.
196 |     AllocateTransition();  // Presumably yields transition 0; the hash table never stores index 0.
197 |     unit_pool_[0].set_label(0xFF);
198 |     unfixed_units_.push(0);  // Unit 0 becomes the bottom of the unfixed stack.
199 |   }
200 | 
201 |   // Fixes units corresponding to the last inserted key: pops unfixed units down to and including index.
202 |   // Also, some of units are merged into equivalent transitions.
203 |   void FixUnits(BaseType index) {
204 |     while (unfixed_units_.top() != index) {
205 |       BaseType unfixed_index = unfixed_units_.top();  // Head of the sibling list to fix.
206 |       unfixed_units_.pop();
207 | 
208 |       if (num_of_states_ >= hash_table_.size() - (hash_table_.size() >> 2)) {  // At 3/4 of the table size.
209 |         ExpandHashTable();
210 |       }
211 | 
212 |       BaseType num_of_siblings = 0;  // Length of the sibling list starting at unfixed_index.
213 |       for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) {
214 |         ++num_of_siblings;
215 |       }
216 | 
217 |       BaseType hash_id;  // Set by FindUnit() to the slot where probing stopped.
218 |       BaseType matched_index = FindUnit(unfixed_index, &hash_id);
219 |       if (matched_index != 0) {  // An equal list is already fixed: reuse it.
220 |         num_of_merged_transitions_ += num_of_siblings;
221 | 
222 |         // Records a merging state (counted once, the first time it is matched).
223 |         if (flag_pool_.get(matched_index) == false) {
224 |           ++num_of_merging_states_;
225 |           flag_pool_.set(matched_index, true);
226 |         }
227 |       } else {
228 |         // Fixes units into pairs of base values and labels.
229 |         BaseType transition_index = 0;
230 |         for (BaseType i = 0; i < num_of_siblings; ++i) {
231 |           transition_index = AllocateTransition();  // Ends as the last allocated index.
232 |         }
233 |         for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) {  // Fills from the last index down.
234 |           base_pool_[transition_index].set_base(unit_pool_[i].base());
235 |           label_pool_[transition_index] = unit_pool_[i].label();
236 |           --transition_index;
237 |         }
238 |         matched_index = transition_index + 1;  // First transition of the new group.
239 |         hash_table_[hash_id] = matched_index;  // hash_id is the empty slot FindUnit() stopped at.
240 |         ++num_of_states_;
241 |       }
242 | 
243 |       // Deletes fixed units (the sibling link is read before each unit is freed).
244 |       for (BaseType current = unfixed_index, next;
245 |            current != 0; current = next) {
246 |         next = unit_pool_[current].sibling();
247 |         FreeUnit(current);
248 |       }
249 | 
250 |       unit_pool_[unfixed_units_.top()].set_child(matched_index);  // The unit below now points at the transition.
251 |     }
252 |     unfixed_units_.pop();  // Removes index itself from the stack.
253 |   }
254 | 
255 |   // Expands a hash table: doubles its size and registers every state again.
256 |   void ExpandHashTable() {
257 |     // The old table is swapped into a temporary so that its storage is
258 |     // dropped before the larger, zero-filled table is created.
259 |     const SizeType hash_table_size = hash_table_.size() << 1;
260 |     std::vector<BaseType>(0).swap(hash_table_);
261 |     hash_table_.resize(hash_table_size, 0);
262 |     // Builds a new hash table. Only transitions with a '\0' label or the
263 |     // is_state flag are inserted; index 0 is skipped (0 marks an empty slot).
264 |     for (SizeType i = 1; i < base_pool_.size(); ++i) {
265 |       const BaseType index = static_cast<BaseType>(i);
266 |       if (label_pool_[index] == '\0' || base_pool_[index].is_state()) {
267 |         BaseType hash_id;
268 |         FindTransition(index, &hash_id);  // Leaves hash_id at an empty slot.
269 |         hash_table_[hash_id] = index;
270 |       }
271 |     }
272 |   }
273 | 
274 |   // Locates the empty hash table slot in which a fixed transition belongs.
275 |   BaseType FindTransition(BaseType index, BaseType *hash_id) const {
276 |     const SizeType table_size = hash_table_.size();
277 |     BaseType slot = HashTransition(index) % table_size;
278 | 
279 |     // Linear probing. Occupied slots are stepped over without comparing
280 |     // them: there must not be the same base value in the table.
281 |     while (hash_table_[slot] != 0) {
282 |       slot = (slot + 1) % table_size;
283 |     }
284 |     *hash_id = slot;
285 |     return 0;
286 |   }
287 | 
288 |   // Looks up a fixed state equal to the sibling list headed by unit_index.
289 |   // Returns its transition index, or 0 when there is none. In both cases
290 |   // *hash_id is left at the slot where probing stopped.
291 |   BaseType FindUnit(BaseType unit_index, BaseType *hash_id) const {
292 |     const SizeType table_size = hash_table_.size();
293 |     BaseType slot = HashUnit(unit_index) % table_size;
294 |     BaseType candidate = hash_table_[slot];
295 |     // Linear probing; an empty slot (0) ends the search without a match.
296 |     while (candidate != 0 && !AreEqual(unit_index, candidate)) {
297 |       slot = (slot + 1) % table_size;
298 |       candidate = hash_table_[slot];
299 |     }
300 |     *hash_id = slot;
301 |     return candidate;
302 |   }
303 | 
304 |   // Compares a unit (head of an unfixed sibling list) and a transition (first of a fixed group).
305 |   bool AreEqual(BaseType unit_index, BaseType transition_index) const {
306 |     // Compares the numbers of transitions: advances one transition per extra sibling unit.
307 |     for (BaseType i = unit_pool_[unit_index].sibling(); i != 0;
308 |          i = unit_pool_[i].sibling()) {
309 |       if (base_pool_[transition_index].has_sibling() == false) {  // The fixed group is shorter.
310 |         return false;
311 |       }
312 |       ++transition_index;
313 |     }
314 |     if (base_pool_[transition_index].has_sibling() == true) {  // The fixed group is longer.
315 |       return false;
316 |     }
317 | 
318 |     // Compares out-transitions: units in list order against transitions in descending index order.
319 |     for (BaseType i = unit_index; i;
320 |          i = unit_pool_[i].sibling(), --transition_index) {
321 |       if (unit_pool_[i].base() != base_pool_[transition_index].base() ||
322 |           unit_pool_[i].label() != label_pool_[transition_index]) {
323 |         return false;
324 |       }
325 |     }
326 |     return true;
327 |   }
328 | 
329 |   // Calculates a hash value from a transition and the siblings that follow it.
330 |   BaseType HashTransition(BaseType index) const {
331 |     BaseType hash_value = 0;
332 |     for ( ; index != 0; ++index) {
333 |       const BaseType base = base_pool_[index].base();
334 |       // Widened before shifting: labels >= 0x80 must not reach an int's sign bit.
335 |       const BaseType label = static_cast<BaseType>(label_pool_[index]);
336 |       hash_value ^= Hash((label << 24) ^ base);  // XOR: same result as HashUnit() on equal content.
337 |       if (base_pool_[index].has_sibling() == false) {
338 |         break;  // Last transition of the group.
339 |       }
340 |     }
341 |     return hash_value;
342 |   }
343 | 
344 |   // Calculates a hash value from a unit and the units on its sibling list.
345 |   BaseType HashUnit(BaseType index) const {
346 |     BaseType hash_value = 0;
347 |     for ( ; index != 0; index = unit_pool_[index].sibling()) {
348 |       const BaseType base = unit_pool_[index].base();
349 |       const BaseType label = static_cast<BaseType>(unit_pool_[index].label());  // Widened before the shift.
350 |       hash_value ^= Hash((label << 24) ^ base);  // No signed shift for labels >= 0x80.
351 |     }
352 |     return hash_value;
353 |   }
354 | 
355 |   // Integer mix function (written for 32-bit keys); see
356 |   // http://www.concentric.net/~Ttwang/tech/inthash.htm
357 |   static BaseType Hash(BaseType key) {
358 |     key = (key << 15) + ~key;  // Same as (key << 15) - key - 1.
359 |     key ^= key >> 12;
360 |     key += key << 2;
361 |     key ^= key >> 4;
362 |     key *= 2057;  // Same as (key + (key << 3)) + (key << 11).
363 |     key ^= key >> 16;
364 |     return key;
365 |   }
366 | 
367 |   // Gets a transition from object pools: allocates one entry in each of the three pools.
368 |   BaseType AllocateTransition() {
369 |     flag_pool_.Allocate();
370 |     base_pool_.Allocate();
371 |     return static_cast<BaseType>(label_pool_.Allocate());  // Presumably the same index in all three pools - TODO confirm.
372 |   }
373 | 
374 |   // Hands out a cleared unit, reusing a freed one when there is any.
375 |   BaseType AllocateUnit() {
376 |     BaseType unit_id = 0;
377 |     if (!unused_units_.empty()) {
378 |       unit_id = unused_units_.top();
379 |       unused_units_.pop();
380 |     } else {
381 |       unit_id = static_cast<BaseType>(unit_pool_.Allocate());
382 |     }
383 |     unit_pool_[unit_id].Clear();
384 |     return unit_id;
385 |   }
386 | 
387 |   // Returns a unit to an object pool: the index is kept for reuse by AllocateUnit().
388 |   void FreeUnit(BaseType index) {
389 |     unused_units_.push(index);  // The unit itself is cleared on reuse, not here.
390 |   }
391 | };
392 | 
393 | }  // namespace dawgdic
394 | 
395 | #endif  // DAWGDIC_DAWG_BUILDER_H
396 | 


--------------------------------------------------------------------------------
/lib/dawgdic/dawg-unit.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_DAWG_UNIT_H
 2 | #define DAWGDIC_DAWG_UNIT_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | // Mutable node that DawgBuilder keeps for the keys still being built.
 9 | class DawgUnit {
10 |  public:
11 |   // Starts out zeroed: no child, no sibling, label '\0', both flags off.
12 |   DawgUnit()
13 |     : child_(0),
14 |       sibling_(0),
15 |       label_('\0'),
16 |       is_state_(false),
17 |       has_sibling_(false) {}
18 | 
19 |   // Setters.
20 | 
21 |   // Stores the index of the child.
22 |   void set_child(BaseType child) { child_ = child; }
23 |   // Stores the index of the next sibling.
24 |   void set_sibling(BaseType sibling) { sibling_ = sibling; }
25 |   // Stores a value; it shares the field used for the child index.
26 |   void set_value(ValueType value) { child_ = value; }
27 |   // Stores the label.
28 |   void set_label(UCharType label) { label_ = label; }
29 |   // Stores the is_state flag.
30 |   void set_is_state(bool is_state) { is_state_ = is_state; }
31 |   // Stores the has_sibling flag.
32 |   void set_has_sibling(bool has_sibling) { has_sibling_ = has_sibling; }
33 | 
34 |   // Getters.
35 | 
36 |   // Index of the child.
37 |   BaseType child() const { return child_; }
38 |   // Index of the next sibling.
39 |   BaseType sibling() const { return sibling_; }
40 |   // The child field converted to a value (see set_value()).
41 |   ValueType value() const { return static_cast<ValueType>(child_); }
42 |   // Label.
43 |   UCharType label() const { return label_; }
44 |   // is_state flag.
45 |   bool is_state() const { return is_state_; }
46 |   // has_sibling flag.
47 |   bool has_sibling() const { return has_sibling_; }
48 | 
49 |   // Calculates a base value of a unit by packing the child field together
50 |   // with the flags:
51 |   //   label '\0':  (child << 1) | has_sibling
52 |   //   otherwise:   (child << 2) | (is_state << 1) | has_sibling
53 |   BaseType base() const {
54 |     const BaseType sibling_bit = has_sibling_ ? 1 : 0;
55 |     if (label_ == '\0') {
56 |       return (child_ << 1) | sibling_bit;
57 |     }
58 |     const BaseType state_bit = is_state_ ? 2 : 0;
59 |     return (child_ << 2) | state_bit | sibling_bit;
60 |   }
61 | 
62 |   // Resets every field to the state produced by the constructor.
63 |   void Clear() {
64 |     child_ = 0;
65 |     sibling_ = 0;
66 |     label_ = '\0';
67 |     is_state_ = false;
68 |     has_sibling_ = false;
69 |   }
70 | 
71 |  private:
72 |   // Child index; doubles as the stored value (see set_value()).
73 |   BaseType child_;
74 |   BaseType sibling_;
75 |   UCharType label_;
76 |   bool is_state_;
77 |   bool has_sibling_;
78 | 
79 |   // Copyable.
80 | };
81 | 
82 | }  // namespace dawgdic
83 | 
84 | #endif  // DAWGDIC_DAWG_UNIT_H
85 | 


--------------------------------------------------------------------------------
/lib/dawgdic/dawg.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_DAWG_H
  2 | #define DAWGDIC_DAWG_H
  3 | 
  4 | #include "base-unit.h"
  5 | #include "bit-pool.h"
  6 | #include "object-pool.h"
  7 | 
  8 | namespace dawgdic {
  9 | 
 10 | class Dawg {  // List-form dawg: parallel pools of base units, labels and flags.
 11 |  public:
 12 |   Dawg()
 13 |     : base_pool_(), label_pool_(), flag_pool_(),
 14 |       num_of_states_(0), num_of_merged_transitions_(0),
 15 |       num_of_merged_states_(0), num_of_merging_states_(0) {}
 16 | 
 17 |   // The root index.
 18 |   BaseType root() const {
 19 |     return 0;
 20 |   }
 21 | 
 22 |   // Number of units.
 23 |   SizeType size() const {
 24 |     return base_pool_.size();
 25 |   }
 26 |   // Number of transitions: every unit except the root; 0 for an empty dawg.
 27 |   SizeType num_of_transitions() const {
 28 |     return base_pool_.size() > 0 ? base_pool_.size() - 1 : 0;
 29 |   }
 30 |   // Number of states.
 31 |   SizeType num_of_states() const {
 32 |     return num_of_states_;
 33 |   }
 34 |   // Number of merged transitions.
 35 |   SizeType num_of_merged_transitions() const {
 36 |     return num_of_merged_transitions_;
 37 |   }
 38 |   // Number of merged states.
 39 |   SizeType num_of_merged_states() const {
 40 |     return num_of_merged_states_;
 41 |   }
 42 |   // Number of merging states.
 43 |   SizeType num_of_merging_states() const {
 44 |     return num_of_merging_states_;
 45 |   }
 46 | 
 47 |   // Reads values.
 48 |   BaseType child(BaseType index) const {
 49 |     return base_pool_[index].child();
 50 |   }
 51 |   BaseType sibling(BaseType index) const {  // Siblings are adjacent; 0 means no sibling.
 52 |     return base_pool_[index].has_sibling() ? (index + 1) : 0;
 53 |   }
 54 |   ValueType value(BaseType index) const {
 55 |     return base_pool_[index].value();
 56 |   }
 57 | 
 58 |   bool is_leaf(BaseType index) const {  // A leaf is a unit whose label is '\0'.
 59 |     return label(index) == '\0';
 60 |   }
 61 |   UCharType label(BaseType index) const {
 62 |     return label_pool_[index];
 63 |   }
 64 |   bool is_merging(BaseType index) const {
 65 |     return flag_pool_.get(index);
 66 |   }
 67 | 
 68 |   // Clears object pools and resets all four counters.
 69 |   void Clear() {
 70 |     base_pool_.Clear();
 71 |     label_pool_.Clear();
 72 |     flag_pool_.Clear();
 73 |     num_of_states_ = num_of_merged_transitions_ = 0;
 74 |     num_of_merged_states_ = num_of_merging_states_ = 0;
 75 |   }
 76 | 
 77 |   // Swaps dawgs: exchanges the three pools and the four counters.
 78 |   void Swap(Dawg *dawg) {
 79 |     base_pool_.Swap(&dawg->base_pool_);
 80 |     label_pool_.Swap(&dawg->label_pool_);
 81 |     flag_pool_.Swap(&dawg->flag_pool_);
 82 |     std::swap(num_of_states_, dawg->num_of_states_);
 83 |     std::swap(num_of_merged_transitions_, dawg->num_of_merged_transitions_);
 84 |     std::swap(num_of_merged_states_, dawg->num_of_merged_states_);
 85 |     std::swap(num_of_merging_states_, dawg->num_of_merging_states_);
 86 |   }
 87 | 
 88 |  public:
 89 |   // Following member functions are called from DawgBuilder.
 90 | 
 91 |   // Sets the number of states.
 92 |   void set_num_of_states(SizeType num_of_states) {
 93 |     num_of_states_ = num_of_states;
 94 |   }
 95 |   // Sets the number of merged transitions.
 96 |   void set_num_of_merged_transitions(SizeType num_of_merged_transitions) {
 97 |     num_of_merged_transitions_ = num_of_merged_transitions;
 98 |   }
 99 |   // Sets the number of merged states.
100 |   void set_num_of_merged_states(SizeType num_of_merged_states) {
101 |     num_of_merged_states_ = num_of_merged_states;
102 |   }
103 |   // Sets the number of merging states.
104 |   void set_num_of_merging_states(SizeType num_of_merging_states) {
105 |     num_of_merging_states_ = num_of_merging_states;
106 |   }
107 | 
108 |   // Swaps base pools.
109 |   void SwapBasePool(ObjectPool<BaseUnit> *base_pool) {
110 |     base_pool_.Swap(base_pool);
111 |   }
112 |   // Swaps label pools.
113 |   void SwapLabelPool(ObjectPool<UCharType> *label_pool) {
114 |     label_pool_.Swap(label_pool);
115 |   }
116 |   // Swaps flag pools.
117 |   void SwapFlagPool(BitPool<> *flag_pool) {
118 |     flag_pool_.Swap(flag_pool);
119 |   }
120 | 
121 |  private:
122 |   ObjectPool<BaseUnit> base_pool_;
123 |   ObjectPool<UCharType> label_pool_;
124 |   BitPool<> flag_pool_;
125 |   SizeType num_of_states_;
126 |   SizeType num_of_merged_transitions_;
127 |   SizeType num_of_merged_states_;
128 |   SizeType num_of_merging_states_;
129 | 
130 |   // Disallows copies.
131 |   Dawg(const Dawg &);
132 |   Dawg &operator=(const Dawg &);
133 | };
134 | 
135 | }  // namespace dawgdic
136 | 
137 | #endif  // DAWGDIC_DAWG_H
138 | 


--------------------------------------------------------------------------------
/lib/dawgdic/dictionary-builder.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_DICTIONARY_BUILDER_H
  2 | #define DAWGDIC_DICTIONARY_BUILDER_H
  3 | 
  4 | #include <vector>
  5 | 
  6 | #include "dawg.h"
  7 | #include "dictionary.h"
  8 | #include "dictionary-extra-unit.h"
  9 | #include "link-table.h"
 10 | 
 11 | namespace dawgdic {
 12 | 
 13 | class DictionaryBuilder {  // Converts a list-form Dawg into the unit array of a Dictionary.
 14 |  public:
 15 |   enum {
 16 |     // Number of units in a block.
 17 |     BLOCK_SIZE = 256,
 18 |     // Number of blocks kept unfixed.
 19 |     NUM_OF_UNFIXED_BLOCKS = 16,
 20 |     // Number of units kept unfixed.
 21 |     UNFIXED_SIZE = BLOCK_SIZE * NUM_OF_UNFIXED_BLOCKS
 22 |   };
 23 | 
 24 |   // Builds a dictionary from a list-form dawg. Returns false on failure.
 25 |   static bool Build(const Dawg &dawg, Dictionary *dic,
 26 |                     BaseType *num_of_unused_units = NULL) {
 27 |     DictionaryBuilder builder(dawg, dic);
 28 |     if (!builder.BuildDictionary()) {
 29 |       return false;
 30 |     }
 31 |     if (num_of_unused_units != NULL) {  // Optional output, written only on success.
 32 |       *num_of_unused_units = builder.num_of_unused_units_;
 33 |     }
 34 |     return true;
 35 |   }
 36 | 
 37 |  private:
 38 |   const Dawg &dawg_;
 39 |   Dictionary *dic_;
 40 | 
 41 |   std::vector<DictionaryUnit> units_;
 42 |   std::vector<DictionaryExtraUnit *> extras_;  // One array of BLOCK_SIZE extra units per block.
 43 |   std::vector<UCharType> labels_;  // Child labels of the node being arranged.
 44 |   LinkTable link_table_;  // Offsets already chosen for merging states.
 45 |   BaseType unfixed_index_;  // Entry point into the circular list of unused units.
 46 |   BaseType num_of_unused_units_;
 47 | 
 48 |   // Masks for offsets.
 49 |   static const BaseType UPPER_MASK = ~(DictionaryUnit::OFFSET_MAX - 1);
 50 |   static const BaseType LOWER_MASK = 0xFF;
 51 | 
 52 |   // Disallows copies.
 53 |   DictionaryBuilder(const DictionaryBuilder &);
 54 |   DictionaryBuilder &operator=(const DictionaryBuilder &);
 55 | 
 56 |   DictionaryBuilder(const Dawg &dawg, Dictionary *dic)
 57 |     : dawg_(dawg), dic_(dic), units_(), extras_(), labels_(),
 58 |       link_table_(), unfixed_index_(), num_of_unused_units_(0) {}
 59 |   ~DictionaryBuilder() {
 60 |     for (SizeType i = 0; i < extras_.size(); ++i) {
 61 |       delete [] extras_[i];  // Entries may be null (see ExpandDictionary()).
 62 |     }
 63 |   }
 64 | 
 65 |   // Accesses units.
 66 |   DictionaryUnit &units(BaseType index) {
 67 |     return units_[index];
 68 |   }
 69 |   const DictionaryUnit &units(BaseType index) const {
 70 |     return units_[index];
 71 |   }
 72 |   DictionaryExtraUnit &extras(BaseType index) {
 73 |     return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE];
 74 |   }
 75 |   const DictionaryExtraUnit &extras(BaseType index) const {
 76 |     return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE];
 77 |   }
 78 | 
 79 |   // Number of units.
 80 |   BaseType num_of_units() const {
 81 |     return static_cast<BaseType>(units_.size());
 82 |   }
 83 |   // Number of blocks.
 84 |   BaseType num_of_blocks() const {
 85 |     return static_cast<BaseType>(extras_.size());
 86 |   }
 87 | 
 88 |   // Builds a dictionary from a list-form dawg.
 89 |   bool BuildDictionary() {
 90 |     link_table_.Init(dawg_.num_of_merging_states() +
 91 |         (dawg_.num_of_merging_states() >> 1));  // 1.5 times the number of merging states.
 92 | 
 93 |     ReserveUnit(0);  // Unit 0 is the root.
 94 |     extras(0).set_is_used();
 95 |     units(0).set_offset(1);
 96 |     units(0).set_label('\0');
 97 | 
 98 |     if (dawg_.size() > 1) {
 99 |       if (!BuildDictionary(dawg_.root(), 0)) {
100 |         return false;
101 |       }
102 |     }
103 | 
104 |     FixAllBlocks();
105 | 
106 |     dic_->SwapUnitsBuf(&units_);  // Hands the finished units over to the dictionary.
107 |     return true;
108 |   }
109 | 
110 |   // Builds a dictionary from a dawg, recursively from the given pair of indices.
111 |   bool BuildDictionary(BaseType dawg_index, BaseType dic_index) {
112 |     if (dawg_.is_leaf(dawg_index)) {
113 |       return true;
114 |     }
115 | 
116 |     // Uses an existing offset if available.
117 |     BaseType dawg_child_index = dawg_.child(dawg_index);
118 |     if (dawg_.is_merging(dawg_child_index)) {
119 |       BaseType offset = link_table_.Find(dawg_child_index);
120 |       if (offset != 0) {
121 |         offset ^= dic_index;  // Offsets are stored relative to the unit.
122 |         if ((!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) &&
123 |             units(dic_index).set_offset(offset)) {  // false: too large, arrange anew below.
124 |           if (dawg_.is_leaf(dawg_child_index)) {
125 |             units(dic_index).set_has_leaf();
126 |           }
127 |           return true;
128 |         }
129 |       }
130 |     }
131 | 
132 |     // Finds a good offset and arranges child nodes.
133 |     BaseType offset = ArrangeChildNodes(dawg_index, dic_index);
134 |     if (offset == 0) {
135 |       return false;
136 |     }
137 | 
138 |     if (dawg_.is_merging(dawg_child_index)) {
139 |       link_table_.Insert(dawg_child_index, offset);
140 |     }
141 | 
142 |     // Builds a double-array in depth-first order.
143 |     do {
144 |       BaseType dic_child_index = offset ^ dawg_.label(dawg_child_index);
145 |       if (!BuildDictionary(dawg_child_index, dic_child_index)) {
146 |         return false;
147 |       }
148 |       dawg_child_index = dawg_.sibling(dawg_child_index);
149 |     } while (dawg_child_index != 0);
150 | 
151 |     return true;
152 |   }
153 | 
154 |   // Arranges child nodes. Returns the chosen offset, or 0 if it cannot be stored.
155 |   BaseType ArrangeChildNodes(BaseType dawg_index, BaseType dic_index) {
156 |     labels_.clear();
157 | 
158 |     BaseType dawg_child_index = dawg_.child(dawg_index);
159 |     while (dawg_child_index != 0) {
160 |       labels_.push_back(dawg_.label(dawg_child_index));
161 |       dawg_child_index = dawg_.sibling(dawg_child_index);
162 |     }
163 | 
164 |     // Finds a good offset.
165 |     BaseType offset = FindGoodOffset(dic_index);
166 |     if (!units(dic_index).set_offset(dic_index ^ offset)) {
167 |       return 0;
168 |     }
169 | 
170 |     dawg_child_index = dawg_.child(dawg_index);
171 |     for (SizeType i = 0; i < labels_.size(); ++i) {
172 |       BaseType dic_child_index = offset ^ labels_[i];
173 |       ReserveUnit(dic_child_index);
174 | 
175 |       if (dawg_.is_leaf(dawg_child_index)) {
176 |         units(dic_index).set_has_leaf();
177 |         units(dic_child_index).set_value(dawg_.value(dawg_child_index));
178 |       } else {
179 |         units(dic_child_index).set_label(labels_[i]);
180 |       }
181 | 
182 |       dawg_child_index = dawg_.sibling(dawg_child_index);
183 |     }
184 |     extras(offset).set_is_used();
185 | 
186 |     return offset;
187 |   }
188 | 
189 |   // Finds a good offset for the labels collected in labels_.
190 |   BaseType FindGoodOffset(BaseType index) const {
191 |     if (unfixed_index_ >= num_of_units()) {  // No unused unit is left.
192 |       return num_of_units() | (index & 0xFF);
193 |     }
194 | 
195 |     // Scans unused units to find a good offset.
196 |     BaseType unfixed_index = unfixed_index_;
197 |     do {
198 |       BaseType offset = unfixed_index ^ labels_[0];  // Puts the first child on this unused unit.
199 |       if (IsGoodOffset(index, offset)) {
200 |         return offset;
201 |       }
202 |       unfixed_index = extras(unfixed_index).next();
203 |     } while (unfixed_index != unfixed_index_);
204 | 
205 |     return num_of_units() | (index & 0xFF);  // Falls back to an offset past the current units.
206 |   }
207 | 
208 |   // Checks if a given offset is valid or not.
209 |   bool IsGoodOffset(BaseType index, BaseType offset) const {
210 |     if (extras(offset).is_used()) {
211 |       return false;
212 |     }
213 | 
214 |     BaseType relative_offset = index ^ offset;  // Must be small, or have its low 8 bits clear.
215 |     if ((relative_offset & LOWER_MASK) && (relative_offset & UPPER_MASK)) {
216 |       return false;
217 |     }
218 | 
219 |     // Finds a collision (labels_[0] maps to a unit taken from the unused list).
220 |     for (SizeType i = 1; i < labels_.size(); ++i) {
221 |       if (extras(offset ^ labels_[i]).is_fixed()) {
222 |         return false;
223 |       }
224 |     }
225 | 
226 |     return true;
227 |   }
228 | 
229 |   // Reserves an unused unit.
230 |   void ReserveUnit(BaseType index) {
231 |     if (index >= num_of_units()) {
232 |       ExpandDictionary();
233 |     }
234 | 
235 |     // Removes an unused unit from a circular linked list.
236 |     if (index == unfixed_index_) {
237 |       unfixed_index_ = extras(index).next();
238 |       if (unfixed_index_ == index) {  // It was the only element of the list.
239 |         unfixed_index_ = num_of_units();
240 |       }
241 |     }
242 |     extras(extras(index).prev()).set_next(extras(index).next());
243 |     extras(extras(index).next()).set_prev(extras(index).prev());
244 |     extras(index).set_is_fixed();
245 |   }
246 | 
247 |   // Expands a dictionary by one block of BLOCK_SIZE units.
248 |   void ExpandDictionary() {
249 |     BaseType src_num_of_units = num_of_units();
250 |     BaseType src_num_of_blocks = num_of_blocks();
251 | 
252 |     BaseType dest_num_of_units = src_num_of_units + BLOCK_SIZE;
253 |     BaseType dest_num_of_blocks = src_num_of_blocks + 1;
254 | 
255 |     // Fixes an old block.
256 |     if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) {
257 |       FixBlock(src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS);
258 |     }
259 | 
260 |     units_.resize(dest_num_of_units);
261 |     extras_.resize(dest_num_of_blocks, 0);
262 | 
263 |     // Allocates memory to a new block (reusing the array of the block just fixed).
264 |     if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) {
265 |       BaseType block_id = src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS;
266 |       std::swap(extras_[block_id], extras_.back());  // The fixed block keeps a null entry.
267 |       for (BaseType i = src_num_of_units; i < dest_num_of_units; ++i) {
268 |         extras(i).clear();
269 |       }
270 |     } else {
271 |       extras_.back() = new DictionaryExtraUnit[BLOCK_SIZE];
272 |     }
273 | 
274 |     // Creates a circular linked list for a new block.
275 |     for (BaseType i = src_num_of_units + 1; i < dest_num_of_units; ++i) {
276 |       extras(i - 1).set_next(i);
277 |       extras(i).set_prev(i - 1);
278 |     }
279 | 
280 |     extras(src_num_of_units).set_prev(dest_num_of_units - 1);
281 |     extras(dest_num_of_units - 1).set_next(src_num_of_units);
282 | 
283 |     // Merges 2 circular linked lists (no change when unfixed_index_ is the new block's first unit).
284 |     extras(src_num_of_units).set_prev(extras(unfixed_index_).prev());
285 |     extras(dest_num_of_units - 1).set_next(unfixed_index_);
286 | 
287 |     extras(extras(unfixed_index_).prev()).set_next(src_num_of_units);
288 |     extras(unfixed_index_).set_prev(dest_num_of_units - 1);
289 |   }
290 | 
291 |   // Fixes all blocks to avoid invalid transitions (only the last NUM_OF_UNFIXED_BLOCKS remain).
292 |   void FixAllBlocks() {
293 |     BaseType begin = 0;
294 |     if (num_of_blocks() > NUM_OF_UNFIXED_BLOCKS) {
295 |       begin = num_of_blocks() - NUM_OF_UNFIXED_BLOCKS;
296 |     }
297 |     BaseType end = num_of_blocks();
298 | 
299 |     for (BaseType block_id = begin; block_id != end; ++block_id) {
300 |       FixBlock(block_id);
301 |     }
302 |   }
303 | 
304 |   // Adjusts labels of unused units in a given block.
305 |   void FixBlock(BaseType block_id) {
306 |     BaseType begin = block_id * BLOCK_SIZE;
307 |     BaseType end = begin + BLOCK_SIZE;
308 | 
309 |     // Finds an unused offset (stays 0 when every offset of the block is used).
310 |     BaseType unused_offset_for_label = 0;
311 |     for (BaseType offset = begin; offset != end; ++offset) {
312 |       if (!extras(offset).is_used()) {
313 |         unused_offset_for_label = offset;
314 |         break;
315 |       }
316 |     }
317 | 
318 |     // Labels of unused units are modified: each gets the label that belongs to the unused offset.
319 |     for (BaseType index = begin; index != end; ++index) {
320 |       if (!extras(index).is_fixed()) {
321 |         ReserveUnit(index);
322 |         units(index).set_label(
323 |             static_cast<UCharType>(index ^ unused_offset_for_label));
324 |         ++num_of_unused_units_;
325 |       }
326 |     }
327 |   }
328 | };
329 | 
330 | }  // namespace dawgdic
331 | 
332 | #endif  // DAWGDIC_DICTIONARY_BUILDER_H
333 | 


--------------------------------------------------------------------------------
/lib/dawgdic/dictionary-extra-unit.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_DICTIONARY_EXTRA_UNIT_H
 2 | #define DAWGDIC_DICTIONARY_EXTRA_UNIT_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | // Extra unit for building a dictionary.
 9 | //
10 | // Each word packs a flag into bit 0 and an index into the remaining bits:
11 | //   lo_values_ = (next << 1) | is_fixed
12 | //   hi_values_ = (prev << 1) | is_used
13 | // The shift drops the top bit of an index given to set_next()/set_prev().
14 | class DictionaryExtraUnit {
15 |  public:
16 |   // Both words start at zero: flags off, next and prev equal to 0.
17 |   DictionaryExtraUnit() : lo_values_(0), hi_values_(0) {}
18 | 
19 |   // Puts the unit back into its freshly constructed state.
20 |   void clear() {
21 |     lo_values_ = 0;
22 |     hi_values_ = 0;
23 |   }
24 | 
25 |   // Marks the unit as fixed; the next index is left untouched.
26 |   void set_is_fixed() { lo_values_ |= 1; }
27 |   // Replaces the next index, keeping the fixed flag.
28 |   void set_next(BaseType next) {
29 |     const BaseType fixed_flag = lo_values_ & 1;
30 |     lo_values_ = (next << 1) | fixed_flag;
31 |   }
32 |   // Marks the index as used as an offset; the prev index is left untouched.
33 |   void set_is_used() { hi_values_ |= 1; }
34 |   // Replaces the prev index, keeping the used flag.
35 |   void set_prev(BaseType prev) {
36 |     const BaseType used_flag = hi_values_ & 1;
37 |     hi_values_ = (prev << 1) | used_flag;
38 |   }
39 | 
40 |   // Reads if a unit is fixed or not.
41 |   bool is_fixed() const { return (lo_values_ & 1) != 0; }
42 |   // Reads the index stored by set_next().
43 |   BaseType next() const { return lo_values_ >> 1; }
44 |   // Reads if an index is used as an offset or not.
45 |   bool is_used() const { return (hi_values_ & 1) != 0; }
46 |   // Reads the index stored by set_prev().
47 |   BaseType prev() const { return hi_values_ >> 1; }
48 | 
49 |  private:
50 |   // Fixed flag (bit 0) and index of the next unused unit (other bits).
51 |   BaseType lo_values_;
52 |   // Used flag (bit 0) and index of the previous unused unit (other bits).
53 |   BaseType hi_values_;
54 | 
55 |   // Copyable.
56 | };
57 | 
58 | }  // namespace dawgdic
59 | 
60 | #endif  // DAWGDIC_DICTIONARY_EXTRA_UNIT_H
61 | 


--------------------------------------------------------------------------------
/lib/dawgdic/dictionary-unit.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_DICTIONARY_UNIT_H
 2 | #define DAWGDIC_DICTIONARY_UNIT_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | // Unit of a dictionary: a single BaseType word.
 9 | //   leaf unit:      IS_LEAF_BIT | value
10 | //   non-leaf unit:  offset field | EXTENSION_BIT | HAS_LEAF_BIT | label
11 | // Bit positions are given by the constants below.
12 | class DictionaryUnit
13 | {
14 |  public:
15 |   static const BaseType OFFSET_MAX = static_cast<BaseType>(1) << 21;
16 |   static const BaseType IS_LEAF_BIT = static_cast<BaseType>(1) << 31;
17 |   static const BaseType HAS_LEAF_BIT = static_cast<BaseType>(1) << 8;
18 |   static const BaseType EXTENSION_BIT = static_cast<BaseType>(1) << 9;
19 | 
20 |   DictionaryUnit() : base_(0) {}
21 | 
22 |   // Flags the unit as having a leaf among its children.
23 |   void set_has_leaf() { base_ |= HAS_LEAF_BIT; }
24 |   // Overwrites the unit with a leaf carrying the given value.
25 |   void set_value(ValueType value) {
26 |     base_ = IS_LEAF_BIT | static_cast<BaseType>(value);
27 |   }
28 |   // Replaces the low 8 bits with a label.
29 |   void set_label(UCharType label) {
30 |     base_ &= ~static_cast<BaseType>(0xFF);
31 |     base_ |= label;
32 |   }
33 |   // Stores an offset. Returns false, leaving the unit untouched, when
34 |   // offset >= OFFSET_MAX << 8. An offset of OFFSET_MAX or more is stored
35 |   // shifted by 2 with EXTENSION_BIT set, so its low 8 bits must be zero:
36 |   // they would land on the label and flag bits and offset() drops them.
37 |   bool set_offset(BaseType offset) {
38 |     if (offset >= (OFFSET_MAX << 8)) {
39 |       return false;
40 |     }
41 |     const BaseType offset_bits = (offset < OFFSET_MAX)
42 |         ? (offset << 10) : ((offset << 2) | EXTENSION_BIT);
43 |     base_ = (base_ & (IS_LEAF_BIT | HAS_LEAF_BIT | 0xFF)) | offset_bits;
44 |     return true;
45 |   }
46 | 
47 |   // True when set_has_leaf() flagged this unit.
48 |   bool has_leaf() const {
49 |     return (base_ & HAS_LEAF_BIT) != 0;
50 |   }
51 |   // Reads the value of a leaf unit: every bit except IS_LEAF_BIT.
52 |   ValueType value() const {
53 |     return static_cast<ValueType>(base_ & ~IS_LEAF_BIT);
54 |   }
55 |   // Reads a label with a leaf flag from a non-leaf unit.
56 |   BaseType label() const { return base_ & (IS_LEAF_BIT | 0xFF); }
57 |   // Reads the offset to child units, undoing the shift of set_offset().
58 |   BaseType offset() const {
59 |     const BaseType extension_shift = (base_ & EXTENSION_BIT) >> 6;  // 0 or 8
60 |     return (base_ >> 10) << extension_shift;
61 |   }
62 | 
63 |  private:
64 |   BaseType base_;
65 | 
66 |   // Copyable.
67 | };
68 | 
69 | }  // namespace dawgdic
70 | 
71 | #endif  // DAWGDIC_DICTIONARY_UNIT_H
72 | 


--------------------------------------------------------------------------------
/lib/dawgdic/dictionary.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_DICTIONARY_H
  2 | #define DAWGDIC_DICTIONARY_H
  3 | 
  4 | #include <iostream>
  5 | #include <vector>
  6 | 
  7 | #include "base-types.h"
  8 | #include "dictionary-unit.h"
  9 | 
 10 | namespace dawgdic {
 11 | 
 12 | // Dictionary class for retrieval and binary I/O.
 13 | class Dictionary {
 14 |  public:
 15 |   Dictionary() : units_(NULL), size_(0), units_buf_() {}
 16 | 
 17 |   const DictionaryUnit *units() const {
 18 |     return units_;
 19 |   }
 20 |   SizeType size() const {
 21 |     return size_;
 22 |   }
 23 |   SizeType total_size() const {
 24 |     return sizeof(DictionaryUnit) * size_;
 25 |   }
 26 |   SizeType file_size() const {
 27 |     return sizeof(BaseType) + total_size();
 28 |   }
 29 | 
 30 |   // Root index.
 31 |   BaseType root() const {
 32 |     return 0;
 33 |   }
 34 | 
 35 |   // Checks if a given index is related to the end of a key.
 36 |   bool has_value(BaseType index) const {
 37 |     return units_[index].has_leaf();
 38 |   }
 39 |   // Gets the value stored in the unit reached from a given index.
 40 |   ValueType value(BaseType index) const {
 41 |     return units_[index ^ units_[index].offset()].value();
 42 |   }
 43 | 
 44 |   // Reads a unit count and then the units; the dictionary owns the copy.
 45 |   bool Read(std::istream *input) {
 46 |     BaseType base_size;
 47 |     if (!input->read(reinterpret_cast<char *>(&base_size), sizeof(BaseType))) {
 48 |       return false;
 49 |     }
 50 | 
 51 |     SizeType size = static_cast<SizeType>(base_size);
 52 |     std::vector<DictionaryUnit> units_buf(size);
 53 |     // An empty vector has no element 0, so there is nothing to read into.
 54 |     if (size != 0 &&
 55 |         !input->read(reinterpret_cast<char *>(&units_buf[0]),
 56 |                      sizeof(DictionaryUnit) * size)) {
 57 |       return false;
 58 |     }
 59 | 
 60 |     SwapUnitsBuf(&units_buf);
 61 |     return true;
 62 |   }
 63 | 
 64 |   // Writes the unit count and then the units to an output stream.
 65 |   bool Write(std::ostream *output) const {
 66 |     BaseType base_size = static_cast<BaseType>(size_);
 67 |     if (!output->write(reinterpret_cast<const char *>(&base_size),
 68 |                        sizeof(BaseType))) {
 69 |       return false;
 70 |     }
 71 | 
 72 |     if (!output->write(reinterpret_cast<const char *>(units_),
 73 |                        sizeof(DictionaryUnit) * size_)) {
 74 |       return false;
 75 |     }
 76 | 
 77 |     return true;
 78 |   }
 79 | 
 80 |   // Exact matching: true if the key is registered.
 81 |   bool Contains(const CharType *key) const {
 82 |     BaseType index = root();
 83 |     if (!Follow(key, &index)) {
 84 |       return false;
 85 |     }
 86 |     return has_value(index);
 87 |   }
 88 |   bool Contains(const CharType *key, SizeType length) const {
 89 |     BaseType index = root();
 90 |     if (!Follow(key, length, &index)) {
 91 |       return false;
 92 |     }
 93 |     return has_value(index);
 94 |   }
 95 | 
 96 |   // Exact matching: yields the value of a key, or -1 / false if absent.
 97 |   ValueType Find(const CharType *key) const {
 98 |     BaseType index = root();
 99 |     if (!Follow(key, &index)) {
100 |       return -1;
101 |     }
102 |     return has_value(index) ? value(index) : -1;
103 |   }
104 |   ValueType Find(const CharType *key, SizeType length) const {
105 |     BaseType index = root();
106 |     if (!Follow(key, length, &index)) {
107 |       return -1;
108 |     }
109 |     return has_value(index) ? value(index) : -1;
110 |   }
111 |   bool Find(const CharType *key, ValueType *value) const {
112 |     BaseType index = root();
113 |     if (!Follow(key, &index) || !has_value(index)) {
114 |       return false;
115 |     }
116 |     *value = this->value(index);
117 |     return true;
118 |   }
119 |   bool Find(const CharType *key, SizeType length, ValueType *value) const {
120 |     BaseType index = root();
121 |     if (!Follow(key, length, &index) || !has_value(index)) {
122 |       return false;
123 |     }
124 |     *value = this->value(index);
125 |     return true;
126 |   }
127 | 
128 |   // Follows a transition. *index is updated only if the label matches.
129 |   bool Follow(CharType label, BaseType *index) const {
130 |     BaseType next_index =
131 |         *index ^ units_[*index].offset() ^ static_cast<UCharType>(label);
132 |     if (units_[next_index].label() != static_cast<UCharType>(label)) {
133 |       return false;
134 |     }
135 |     *index = next_index;
136 |     return true;
137 |   }
138 | 
139 |   // Follows transitions for a NUL-terminated string (*count += steps taken).
140 |   bool Follow(const CharType *s, BaseType *index) const {
141 |     while (*s != '\0' && Follow(*s, index)) {
142 |       ++s;
143 |     }
144 |     return *s == '\0';
145 |   }
146 |   bool Follow(const CharType *s, BaseType *index, SizeType *count) const {
147 |     while (*s != '\0' && Follow(*s, index)) {
148 |       ++s, ++*count;
149 |     }
150 |     return *s == '\0';
151 |   }
152 | 
153 |   // Follows transitions for a string with an explicit length.
154 |   bool Follow(const CharType *s, SizeType length, BaseType *index) const {
155 |     for (SizeType i = 0; i < length; ++i) {
156 |       if (!Follow(s[i], index)) {
157 |         return false;
158 |       }
159 |     }
160 |     return true;
161 |   }
162 |   bool Follow(const CharType *s, SizeType length, BaseType *index,
163 |               SizeType *count) const {
164 |     for (SizeType i = 0; i < length; ++i, ++*count) {
165 |       if (!Follow(s[i], index)) {
166 |         return false;
167 |       }
168 |     }
169 |     return true;
170 |   }
171 | 
172 |   // Maps external memory (not copied): serialized form, or units + count.
173 |   void Map(const void *address) {
174 |     Clear();
175 |     units_ = reinterpret_cast<const DictionaryUnit *>(
176 |         static_cast<const BaseType *>(address) + 1);
177 |     size_ = *static_cast<const BaseType *>(address);
178 |   }
179 |   void Map(const void *address, SizeType size) {
180 |     Clear();
181 |     units_ = static_cast<const DictionaryUnit *>(address);
182 |     size_ = size;
183 |   }
184 | 
185 |   // Initializes a dictionary.
186 |   void Clear() {
187 |     units_ = NULL;
188 |     size_ = 0;
189 |     std::vector<DictionaryUnit>(0).swap(units_buf_);
190 |   }
191 | 
192 |   // Swaps dictionaries.
193 |   void Swap(Dictionary *dic) {
194 |     std::swap(units_, dic->units_);
195 |     std::swap(size_, dic->size_);
196 |     units_buf_.swap(dic->units_buf_);
197 |   }
198 | 
199 |   // Shrinks the owned buffer by copying it (no-op if size == capacity).
200 |   void Shrink() {
201 |     if (units_buf_.size() == units_buf_.capacity()) {
202 |       return;
203 |     }
204 | 
205 |     std::vector<DictionaryUnit> units_buf(units_buf_);
206 |     SwapUnitsBuf(&units_buf);
207 |   }
208 | 
209 |   // Takes over the contents of *units_buf (which receives the old buffer).
210 |   // Used by Read() and Shrink(); public so that builder code can call it.
211 |   void SwapUnitsBuf(std::vector<DictionaryUnit> *units_buf) {
212 |     units_ = units_buf->empty() ? NULL : &(*units_buf)[0];
213 |     size_ = static_cast<BaseType>(units_buf->size());
214 |     units_buf_.swap(*units_buf);
215 |   }
216 | 
217 |  private:
218 |   const DictionaryUnit *units_;
219 |   SizeType size_;
220 |   std::vector<DictionaryUnit> units_buf_;
221 | 
222 |   // Disallows copies.
223 |   Dictionary(const Dictionary &);
224 |   Dictionary &operator=(const Dictionary &);
225 | };
226 | 
227 | }  // namespace dawgdic
228 | 
229 | #endif  // DAWGDIC_DICTIONARY_H
230 | 


--------------------------------------------------------------------------------
/lib/dawgdic/guide-builder.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_GUIDE_BUILDER_H
  2 | #define DAWGDIC_GUIDE_BUILDER_H
  3 | 
  4 | #include "guide.h"
  5 | #include "dawg.h"
  6 | #include "dictionary.h"
  7 | 
  8 | #include <vector>
  9 | 
 10 | namespace dawgdic {
 11 | 
 12 | class GuideBuilder {
 13 |  public:
 14 |   // Builds a guide for completing keys. Returns false on failure.
 15 |   static bool Build(const Dawg &dawg, const Dictionary &dic, Guide *guide) {
 16 |     GuideBuilder builder(dawg, dic, guide);
 17 |     return builder.BuildGuide();
 18 |   }
 19 | 
 20 |  private:
 21 |   const Dawg &dawg_;
 22 |   const Dictionary &dic_;
 23 |   Guide *guide_;
 24 | 
 25 |   std::vector<GuideUnit> units_;
 26 |   std::vector<UCharType> is_fixed_table_;
 27 | 
 28 |   // Disallows copies.
 29 |   GuideBuilder(const GuideBuilder &);
 30 |   GuideBuilder &operator=(const GuideBuilder &);
 31 | 
 32 |   GuideBuilder(const Dawg &dawg, const Dictionary &dic, Guide *guide)
 33 |     : dawg_(dawg), dic_(dic), guide_(guide), units_(), is_fixed_table_() {}
 34 | 
 35 |   bool BuildGuide() {
 36 |     // Initializes units and flags: one flag bit per unit, rounded up to bytes.
 37 |     units_.resize(dic_.size());
 38 |     is_fixed_table_.resize((dic_.size() + 7) / 8, '\0');
 39 | 
 40 |     if (dawg_.size() <= 1) {
 41 |       return true;
 42 |     }
 43 | 
 44 |     if (!BuildGuide(dawg_.root(), dic_.root())) {
 45 |       return false;
 46 |     }
 47 | 
 48 |     guide_->SwapUnitsBuf(&units_);
 49 |     return true;
 50 |   }
 51 | 
 52 |   // Builds a guide recursively; units already marked as fixed are skipped.
 53 |   bool BuildGuide(BaseType dawg_index, BaseType dic_index) {
 54 |     if (is_fixed(dic_index)) {
 55 |       return true;
 56 |     }
 57 |     set_is_fixed(dic_index);
 58 | 
 59 |     // Finds the first non-terminal child.
 60 |     BaseType dawg_child_index = dawg_.child(dawg_index);
 61 |     if (dawg_.label(dawg_child_index) == '\0') {
 62 |       dawg_child_index = dawg_.sibling(dawg_child_index);
 63 |       if (dawg_child_index == 0) {
 64 |         return true;
 65 |       }
 66 |     }
 67 |     units_[dic_index].set_child(dawg_.label(dawg_child_index));
 68 | 
 69 |     do {
 70 |       UCharType child_label = dawg_.label(dawg_child_index);
 71 |       BaseType dic_child_index = dic_index;
 72 |       if (!dic_.Follow(child_label, &dic_child_index)) {
 73 |         return false;
 74 |       }
 75 | 
 76 |       if (!BuildGuide(dawg_child_index, dic_child_index)) {
 77 |         return false;
 78 |       }
 79 | 
 80 |       // Records the next sibling's label; sibling index 0 means "none".
 81 |       BaseType dawg_sibling_index = dawg_.sibling(dawg_child_index);
 82 |       if (dawg_sibling_index != 0) {
 83 |         units_[dic_child_index].set_sibling(dawg_.label(dawg_sibling_index));
 84 |       }
 85 | 
 86 |       dawg_child_index = dawg_sibling_index;
 87 |     } while (dawg_child_index != 0);
 88 | 
 89 |     return true;
 90 |   }
 91 | 
 92 |   void set_is_fixed(BaseType index) {
 93 |     is_fixed_table_[index / 8] |= 1 << (index % 8);
 94 |   }
 95 | 
 96 |   bool is_fixed(BaseType index) const {
 97 |     return (is_fixed_table_[index / 8] & (1 << (index % 8))) != 0;
 98 |   }
 99 | };
100 | 
101 | }  // namespace dawgdic
102 | 
103 | #endif  // DAWGDIC_GUIDE_BUILDER_H
104 | 


--------------------------------------------------------------------------------
/lib/dawgdic/guide-unit.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_GUIDE_UNIT_H
 2 | #define DAWGDIC_GUIDE_UNIT_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | class GuideUnit {
 9 |  public:
10 |   // Both labels start out as '\0'.
11 |   GuideUnit() : child_('\0'), sibling_('\0') {}
12 | 
13 |   // Readers.
14 |   UCharType child() const { return child_; }
15 |   UCharType sibling() const { return sibling_; }
16 | 
17 |   // Writers.
18 |   void set_child(UCharType child) { child_ = child; }
19 |   void set_sibling(UCharType sibling) { sibling_ = sibling; }
20 | 
21 |  private:
22 |   // Label stored by set_child(); GuideBuilder stores the label of the
23 |   // first non-terminal child here.
24 |   UCharType child_;
25 |   // Label stored by set_sibling(); GuideBuilder stores the label of the
26 |   // next sibling here.
27 |   UCharType sibling_;
28 | 
29 |   // The two members are the whole object state, so the implicit copy
30 |   // operations are sufficient (copyable).
31 | };
32 | 
33 | }  // namespace dawgdic
34 | 
35 | #endif  // DAWGDIC_GUIDE_UNIT_H
36 | 


--------------------------------------------------------------------------------
/lib/dawgdic/guide.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_GUIDE_H
  2 | #define DAWGDIC_GUIDE_H
  3 | 
  4 | #include "dictionary.h"
  5 | #include "guide-unit.h"
  6 | 
  7 | #include <iostream>
  8 | #include <vector>
  9 | 
 10 | namespace dawgdic {
 11 | 
 12 | class Guide {
 13 |  public:
 14 |   Guide() : units_(NULL), size_(0), units_buf_() {}
 15 | 
 16 |   const GuideUnit *units() const {
 17 |     return units_;
 18 |   }
 19 |   SizeType size() const {
 20 |     return size_;
 21 |   }
 22 |   SizeType total_size() const {
 23 |     return sizeof(GuideUnit) * size_;
 24 |   }
 25 |   SizeType file_size() const {
 26 |     return sizeof(BaseType) + total_size();
 27 |   }
 28 | 
 29 |   // The root index.
 30 |   BaseType root() const {
 31 |     return 0;
 32 |   }
 33 | 
 34 |   UCharType child(BaseType index) const {
 35 |     return units_[index].child();
 36 |   }
 37 |   UCharType sibling(BaseType index) const {
 38 |     return units_[index].sibling();
 39 |   }
 40 | 
 41 |   // Reads a guide from an input stream: a unit count, then the units.
 42 |   bool Read(std::istream *input) {
 43 |     BaseType base_size;
 44 |     if (!input->read(reinterpret_cast<char *>(&base_size), sizeof(BaseType))) {
 45 |       return false;
 46 |     }
 47 | 
 48 |     SizeType size = static_cast<SizeType>(base_size);
 49 |     std::vector<GuideUnit> units_buf(size);
 50 |     // An empty vector has no element 0, so there is nothing to read into.
 51 |     if (size != 0 &&
 52 |         !input->read(reinterpret_cast<char *>(&units_buf[0]),
 53 |                      sizeof(GuideUnit) * size)) {
 54 |       return false;
 55 |     }
 56 | 
 57 |     SwapUnitsBuf(&units_buf);
 58 |     return true;
 59 |   }
 60 | 
 61 |   // Writes a guide to an output stream: a unit count, then the units.
 62 |   bool Write(std::ostream *output) const {
 63 |     BaseType base_size = static_cast<BaseType>(size_);
 64 |     if (!output->write(reinterpret_cast<const char *>(&base_size),
 65 |                        sizeof(BaseType))) {
 66 |       return false;
 67 |     }
 68 | 
 69 |     if (!output->write(reinterpret_cast<const char *>(units_),
 70 |                        sizeof(GuideUnit) * size_)) {
 71 |       return false;
 72 |     }
 73 | 
 74 |     return true;
 75 |   }
 76 | 
 77 |   // Maps external memory (not copied): serialized form, or units + count.
 78 |   void Map(const void *address) {
 79 |     Clear();
 80 |     units_ = reinterpret_cast<const GuideUnit *>(
 81 |         static_cast<const BaseType *>(address) + 1);
 82 |     size_ = *static_cast<const BaseType *>(address);
 83 |   }
 84 |   void Map(const void *address, SizeType size) {
 85 |     Clear();
 86 |     units_ = static_cast<const GuideUnit *>(address);
 87 |     size_ = size;
 88 |   }
 89 | 
 90 |   // Swaps Guides.
 91 |   void Swap(Guide *guide) {
 92 |     std::swap(units_, guide->units_);
 93 |     std::swap(size_, guide->size_);
 94 |     units_buf_.swap(guide->units_buf_);
 95 |   }
 96 | 
 97 |   // Initializes a Guide.
 98 |   void Clear() {
 99 |     units_ = NULL;
100 |     size_ = 0;
101 |     std::vector<GuideUnit>(0).swap(units_buf_);
102 |   }
103 | 
104 |   // Takes over the contents of *units_buf (which receives the old buffer).
105 |   // Used by Read() and by GuideBuilder; units_ is NULL for an empty buffer.
106 |   void SwapUnitsBuf(std::vector<GuideUnit> *units_buf) {
107 |     units_ = units_buf->empty() ? NULL : &(*units_buf)[0];
108 |     size_ = static_cast<BaseType>(units_buf->size());
109 |     units_buf_.swap(*units_buf);
110 |   }
111 | 
112 |  private:
113 |   const GuideUnit *units_;
114 |   SizeType size_;
115 |   std::vector<GuideUnit> units_buf_;
116 | 
117 |   // Disables copies.
118 |   Guide(const Guide &);
119 |   Guide &operator=(const Guide &);
120 | };
121 | 
122 | }  // namespace dawgdic
123 | 
124 | #endif  // DAWGDIC_GUIDE_H
125 | 


--------------------------------------------------------------------------------
/lib/dawgdic/link-table.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_LINK_TABLE_H
 2 | #define DAWGDIC_LINK_TABLE_H
 3 | 
 4 | #include "base-types.h"
 5 | #include "dictionary-unit.h"
 6 | 
 7 | #include <vector>
 8 | 
 9 | namespace dawgdic {
10 | 
11 | class LinkTable {
12 |  public:
13 |   explicit LinkTable() : hash_table_() {}
14 | 
15 |   // Replaces the table with table_size empty slots, i.e. (0, 0) pairs.
16 |   void Init(SizeType table_size) {
17 |     std::vector<PairType> fresh_table(table_size, PairType(0, 0));
18 |     hash_table_.swap(fresh_table);
19 |   }
20 | 
21 |   // Finds an offset that corresponds to a given index. The result is the
22 |   // offset of whichever slot the probe in FindId() stops at.
23 |   BaseType Find(BaseType index) const {
24 |     return hash_table_[FindId(index)].second;
25 |   }
26 | 
27 |   // Inserts an index with its offset (an existing entry is overwritten).
28 |   void Insert(BaseType index, BaseType offset) {
29 |     PairType &slot = hash_table_[FindId(index)];
30 |     slot.first = index;
31 |     slot.second = offset;
32 |   }
33 | 
34 |  private:
35 |   typedef std::pair<BaseType, BaseType> PairType;
36 | 
37 |   std::vector<PairType> hash_table_;
38 | 
39 |   // Disallows copies.
40 |   LinkTable(const LinkTable &);
41 |   LinkTable &operator=(const LinkTable &);
42 | 
43 |   // Linear probing: returns the slot that holds the index, or the first
44 |   // empty slot (first == 0) met while probing from its hash position.
45 |   BaseType FindId(BaseType index) const {
46 |     BaseType hash_id = Hash(index) % hash_table_.size();
47 |     for (;;) {
48 |       const BaseType stored_index = hash_table_[hash_id].first;
49 |       if (stored_index == 0 || stored_index == index) {
50 |         return hash_id;
51 |       }
52 |       hash_id = (hash_id + 1) % hash_table_.size();
53 |     }
54 |   }
55 | 
56 |   // 32-bit mix function.
57 |   // http://www.concentric.net/~Ttwang/tech/inthash.htm
58 |   static BaseType Hash(BaseType key) {
59 |     key = ~key + (key << 15);  // key = (key << 15) - key - 1;
60 |     key ^= key >> 12;
61 |     key += key << 2;
62 |     key ^= key >> 4;
63 |     key *= 2057;  // key = (key + (key << 3)) + (key << 11);
64 |     key ^= key >> 16;
65 |     return key;
66 |   }
67 | };
68 | 
69 | }  // namespace dawgdic
70 | 
71 | #endif  // DAWGDIC_LINK_TABLE_H
72 | 


--------------------------------------------------------------------------------
/lib/dawgdic/object-pool.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_OBJECT_POOL_H
 2 | #define DAWGDIC_OBJECT_POOL_H
 3 | 
 4 | #include <vector>
 5 | 
 6 | #include "base-types.h"
 7 | 
 8 | namespace dawgdic {
 9 | 
10 | // This class works like an array of objects with compact memory management.
11 | template <typename OBJECT_TYPE, SizeType BLOCK_SIZE = 1 << 10>
12 | class ObjectPool {
13 |  public:
14 |   typedef OBJECT_TYPE ObjectType;
15 | 
16 |   ObjectPool() : blocks_(), size_(0) {}
17 |   ~ObjectPool() { Clear(); }
18 | 
19 |   // Accessors. The index must be an ID returned by Allocate().
20 |   ObjectType &operator[](SizeType index) {
21 |     return blocks_[index / BLOCK_SIZE][index % BLOCK_SIZE];
22 |   }
23 |   const ObjectType &operator[](SizeType index) const {
24 |     return blocks_[index / BLOCK_SIZE][index % BLOCK_SIZE];
25 |   }
26 | 
27 |   // Number of allocated objects.
28 |   SizeType size() const { return size_; }
29 | 
30 |   // Deletes all objects and frees memory.
31 |   void Clear() {
32 |     for (SizeType i = 0; i < blocks_.size(); ++i) {
33 |       delete [] blocks_[i];
34 |     }
35 |     std::vector<ObjectType *>(0).swap(blocks_);
36 |     size_ = 0;
37 |   }
38 | 
39 |   // Swaps object pools.
40 |   void Swap(ObjectPool *pool) {
41 |     blocks_.swap(pool->blocks_);
42 |     std::swap(size_, pool->size_);
43 |   }
44 | 
45 |   // Allocates memory for a new object and returns its ID.
46 |   SizeType Allocate() {
47 |     if (size_ == BLOCK_SIZE * blocks_.size()) {
48 |       // Grows the vector before allocating the block: push_back() then has
49 |       // spare capacity and cannot throw, so the new block is never leaked.
50 |       if (blocks_.size() == blocks_.capacity()) {
51 |         blocks_.reserve(blocks_.empty() ? 1 : blocks_.size() * 2);
52 |       }
53 |       blocks_.push_back(new ObjectType[BLOCK_SIZE]);
54 |     }
55 |     return size_++;
56 |   }
57 | 
58 |  private:
59 |   std::vector<ObjectType *> blocks_;
60 |   SizeType size_;
61 | 
62 |   // Disallows copies.
63 |   ObjectPool(const ObjectPool &);
64 |   ObjectPool &operator=(const ObjectPool &);
65 | };
66 | 
67 | }  // namespace dawgdic
68 | 
69 | #endif  // DAWGDIC_OBJECT_POOL_H
70 | 


--------------------------------------------------------------------------------
/lib/dawgdic/ranked-completer-candidate.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_RANKED_COMPLETER_CANDIDATE_H
 2 | #define DAWGDIC_RANKED_COMPLETER_CANDIDATE_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | class RankedCompleterCandidate {
 9 |  public:
10 |   // A fresh candidate refers to node 0 and carries the value -1.
11 |   RankedCompleterCandidate() : node_index_(0), value_(-1) {}
12 | 
13 |   // Index of the completer node this candidate refers to.
14 |   BaseType node_index() const { return node_index_; }
15 |   void set_node_index(BaseType node_index) { node_index_ = node_index; }
16 | 
17 |   // Value that Comparer ranks this candidate by.
18 |   ValueType value() const { return value_; }
19 |   void set_value(ValueType value) { value_ = value; }
20 | 
21 |   // Compares candidates by value with a user-supplied comparer. When the
22 |   // values are equal, the result is lhs.node_index() > rhs.node_index().
23 |   template <typename VALUE_COMPARER_TYPE>
24 |   class Comparer {
25 |    public:
26 |     typedef VALUE_COMPARER_TYPE ValueComparerType;
27 | 
28 |     explicit Comparer(ValueComparerType value_comparer)
29 |       : value_comparer_(value_comparer) {}
30 | 
31 |     bool operator()(const RankedCompleterCandidate &lhs,
32 |                     const RankedCompleterCandidate &rhs) const {
33 |       const ValueType lhs_value = lhs.value();
34 |       const ValueType rhs_value = rhs.value();
35 |       if (lhs_value == rhs_value) {
36 |         return lhs.node_index() > rhs.node_index();
37 |       }
38 |       return value_comparer_(lhs_value, rhs_value);
39 |     }
40 | 
41 |    private:
42 |     ValueComparerType value_comparer_;
43 |   };
44 | 
45 |   // Wraps a value comparer into a Comparer, deducing its type.
46 |   template <typename VALUE_COMPARER_TYPE>
47 |   static Comparer<VALUE_COMPARER_TYPE> MakeComparer(
48 |       VALUE_COMPARER_TYPE value_comparer) {
49 |     return Comparer<VALUE_COMPARER_TYPE>(value_comparer);
50 |   }
51 | 
52 |  private:
53 |   BaseType node_index_;
54 |   ValueType value_;
55 | 
56 |   // Copyable.
57 | };
58 | 
59 | }  // namespace dawgdic
60 | 
61 | #endif  // DAWGDIC_RANKED_COMPLETER_CANDIDATE_H
62 | 


--------------------------------------------------------------------------------
/lib/dawgdic/ranked-completer-node.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_RANKED_COMPLETER_NODE_H
 2 | #define DAWGDIC_RANKED_COMPLETER_NODE_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | class RankedCompleterNode {
 9 |  public:
10 |   // All fields start out zeroed: indices 0, label '\0', flags false.
11 |   RankedCompleterNode()
12 |     : dic_index_(0), prev_node_index_(0),
13 |       label_('\0'), is_queued_(false), has_terminal_(false) {}
14 | 
15 |   // Index of the dictionary unit this node stands for.
16 |   BaseType dic_index() const { return dic_index_; }
17 |   void set_dic_index(BaseType dic_index) { dic_index_ = dic_index; }
18 | 
19 |   // Index of the node this one was reached from.
20 |   BaseType prev_node_index() const { return prev_node_index_; }
21 |   void set_prev_node_index(BaseType prev_node_index) {
22 |     prev_node_index_ = prev_node_index;
23 |   }
24 | 
25 |   // Label of the transition that leads to this node.
26 |   UCharType label() const { return label_; }
27 |   void set_label(UCharType label) { label_ = label; }
28 | 
29 |   // Queued flag. The setter takes no argument: the flag can be raised
30 |   // but is never lowered again.
31 |   bool is_queued() const { return is_queued_; }
32 |   void set_is_queued() { is_queued_ = true; }
33 | 
34 |   // Terminal flag. Unlike the queued flag it can be switched both ways,
35 |   // so the setter takes the new state.
36 |   bool has_terminal() const { return has_terminal_; }
37 |   void set_has_terminal(bool has_terminal) {
38 |     has_terminal_ = has_terminal;
39 |   }
40 | 
41 |  private:
42 |   // Index into the dictionary's unit array.
43 |   BaseType dic_index_;
44 |   // Index of the predecessor node.
45 |   BaseType prev_node_index_;
46 |   // Transition label; '\0' until set_label() is called.
47 |   UCharType label_;
48 |   // True once set_is_queued() has been called.
49 |   bool is_queued_;
50 |   // Last state passed to set_has_terminal(); false initially.
51 |   bool has_terminal_;
52 | 
53 |   // Copyable.
54 | };
55 | 
56 | }  // namespace dawgdic
57 | 
58 | #endif  // DAWGDIC_RANKED_COMPLETER_NODE_H
59 | 


--------------------------------------------------------------------------------
/lib/dawgdic/ranked-completer.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_RANKED_COMPLETER_H
  2 | #define DAWGDIC_RANKED_COMPLETER_H
  3 | 
  4 | #include "dictionary.h"
  5 | #include "ranked-completer-candidate.h"
  6 | #include "ranked-completer-node.h"
  7 | #include "ranked-guide.h"
  8 | 
  9 | #include <algorithm>
 10 | #include <functional>
 11 | #include <queue>
 12 | #include <vector>
 13 | 
 14 | namespace dawgdic {
 15 | 
 16 | template <typename VALUE_COMPARER_TYPE = std::less<ValueType> >
 17 | class RankedCompleterBase {  // Completes keys one per Next() call, taken from a priority queue.
 18 |  public:
 19 |   typedef VALUE_COMPARER_TYPE ValueComparerType;
 20 | 
 21 |   explicit RankedCompleterBase(
 22 |       ValueComparerType value_comparer = ValueComparerType())
 23 |     : dic_(NULL), guide_(NULL), key_(), prefix_length_(0), value_(-1),
 24 |       nodes_(), node_queue_(), candidate_queue_(
 25 |           RankedCompleterCandidate::MakeComparer(value_comparer)) {}  // Call set_dic()/set_guide() before Start().
 26 |   RankedCompleterBase(const Dictionary &dic, const RankedGuide &guide,
 27 |       ValueComparerType value_comparer = ValueComparerType())
 28 |     : dic_(&dic), guide_(&guide), key_(), prefix_length_(0), value_(-1),
 29 |       nodes_(), node_queue_(), candidate_queue_(
 30 |           RankedCompleterCandidate::MakeComparer(value_comparer)) {}
 31 | 
 32 |   void set_dic(const Dictionary &dic) {
 33 |     dic_ = &dic;  // Only the address is kept; the dictionary is not copied.
 34 |   }
 35 |   void set_guide(const RankedGuide &guide) {
 36 |     guide_ = &guide;  // Only the address is kept; the guide is not copied.
 37 |   }
 38 | 
 39 |   const Dictionary &dic() const {
 40 |     return *dic_;
 41 |   }
 42 |   const RankedGuide &guide() const {
 43 |     return *guide_;
 44 |   }
 45 | 
 46 |   // These member functions are available only when Next() returns true.
 47 |   const char *key() const {
 48 |     return reinterpret_cast<const char *>(&key_[0]);
 49 |   }
 50 |   SizeType length() const {
 51 |     return key_.size() - 1;  // Excludes the '\0' that Next() appends.
 52 |   }
 53 |   ValueType value() const {
 54 |     return value_;
 55 |   }
 56 | 
 57 |   // Starts completing keys from given index and prefix (resets all state).
 58 |   void Start(BaseType index, const char *prefix = "") {
 59 |     SizeType length = 0;  // Length of the NUL-terminated prefix.
 60 |     for (const char *p = prefix; *p != '\0'; ++p) {
 61 |       ++length;
 62 |     }
 63 | 
 64 |     Start(index, prefix, length);
 65 |   }
 66 |   void Start(BaseType index, const char *prefix, SizeType length) {
 67 |     key_.resize(length);
 68 |     for (SizeType i = 0; i < length; ++i) {
 69 |       key_[i] = prefix[i];
 70 |     }
 71 |     prefix_length_ = length;
 72 |     value_ = -1;
 73 | 
 74 |     nodes_.clear();
 75 |     node_queue_.clear();
 76 |     while (!candidate_queue_.empty()) {
 77 |       candidate_queue_.pop();
 78 |     }
 79 | 
 80 |     if (guide_->size() != 0) {  // With an empty guide nothing is queued, so Next() returns false.
 81 |       CreateNode(index, 0, 'X');  // Root node (index 0); Next() never copies its label into key_.
 82 |       EnqueueNode(0);
 83 |     }
 84 |   }
 85 | 
 86 |   // Gets the next key. Returns false when no candidate is left.
 87 |   bool Next() {
 88 |     for (SizeType i = 0; i < node_queue_.size(); ++i) {
 89 |       BaseType node_index = node_queue_[i];
 90 |       if (value_ != -1 && !FindSibling(&node_index)) {  // value_ is still -1 on the first call after Start().
 91 |         continue;
 92 |       }
 93 |       node_index = FindTerminal(node_index);
 94 |       EnqueueCandidate(node_index);
 95 |     }
 96 |     node_queue_.clear();
 97 | 
 98 |     // Returns false if there is no candidate.
 99 |     if (candidate_queue_.empty()) {
100 |       return false;
101 |     }
102 | 
103 |     const RankedCompleterCandidate &candidate = candidate_queue_.top();  // Used until pop() below.
104 | 
105 |     BaseType node_index = candidate.node_index();
106 |     EnqueueNode(node_index);
107 |     node_index = nodes_[node_index].prev_node_index();
108 | 
109 |     key_.resize(prefix_length_);
110 |     while (node_index != 0) {  // Walks back towards node 0, collecting labels.
111 |       key_.push_back(nodes_[node_index].label());
112 |       EnqueueNode(node_index);
113 |       node_index = nodes_[node_index].prev_node_index();
114 |     }
115 |     std::reverse(key_.begin() + prefix_length_, key_.end());  // Labels were collected in reverse order.
116 |     key_.push_back('\0');
117 | 
118 |     value_ = candidate.value();
119 |     candidate_queue_.pop();
120 | 
121 |     return true;
122 |   }
123 | 
124 |  private:
125 |   const Dictionary *dic_;
126 |   const RankedGuide *guide_;
127 |   std::vector<UCharType> key_;
128 |   SizeType prefix_length_;
129 |   ValueType value_;
130 | 
131 |   std::vector<RankedCompleterNode> nodes_;
132 |   std::vector<BaseType> node_queue_;
133 |   std::priority_queue<RankedCompleterCandidate,
134 |                       std::vector<RankedCompleterCandidate>,
135 |                       RankedCompleterCandidate::Comparer<ValueComparerType> >
136 |       candidate_queue_;
137 | 
138 |   // Disallows copies.
139 |   RankedCompleterBase(const RankedCompleterBase &);
140 |   RankedCompleterBase &operator=(const RankedCompleterBase &);
141 | 
142 |   // Pushes a node to queue unless it has been queued before.
143 |   void EnqueueNode(BaseType node_index) {
144 |     if (nodes_[node_index].is_queued()) {
145 |       return;
146 |     }
147 | 
148 |     node_queue_.push_back(node_index);
149 |     nodes_[node_index].set_is_queued();
150 |   }
151 | 
152 |   // Pushes a candidate to priority queue, with the value of its unit.
153 |   void EnqueueCandidate(BaseType node_index) {
154 |     RankedCompleterCandidate candidate;
155 |     candidate.set_node_index(node_index);
156 |     candidate.set_value(
157 |         dic_->units()[nodes_[node_index].dic_index()].value());
158 |     candidate_queue_.push(candidate);
159 |   }
160 | 
161 |   // Finds a sibling of a given node. Returns false if there is none.
162 |   bool FindSibling(BaseType *node_index) {
163 |     BaseType prev_node_index = nodes_[*node_index].prev_node_index();
164 |     BaseType dic_index = nodes_[*node_index].dic_index();
165 | 
166 |     UCharType sibling_label = guide_->sibling(dic_index);
167 |     if (sibling_label == '\0') {
168 |       if (!nodes_[prev_node_index].has_terminal()) {
169 |         return false;
170 |       }
171 |       nodes_[prev_node_index].set_has_terminal(false);  // The '\0' transition is taken only once.
172 |     }
173 | 
174 |     // Follows a transition to sibling and creates a node for the sibling.
175 |     BaseType dic_prev_index = nodes_[prev_node_index].dic_index();
176 |     dic_index = FollowWithoutCheck(dic_prev_index, sibling_label);
177 |     *node_index = CreateNode(dic_index, prev_node_index, sibling_label);
178 | 
179 |     return true;
180 |   }
181 | 
182 |   // Follows transitions and finds a terminal, creating a node per step.
183 |   BaseType FindTerminal(BaseType node_index) {
184 |     while (nodes_[node_index].label() != '\0') {  // Stops at a node whose label is '\0'.
185 |       BaseType dic_index = nodes_[node_index].dic_index();
186 |       UCharType child_label = guide_->child(dic_index);
187 |       if (child_label == '\0') {
188 |         nodes_[node_index].set_has_terminal(false);  // The '\0' transition is being taken now.
189 |       }
190 | 
191 |       // Follows a transition to child and creates a node for the child.
192 |       dic_index = FollowWithoutCheck(dic_index, child_label);
193 |       node_index = CreateNode(dic_index, node_index, child_label);
194 |     }
195 |     return node_index;
196 |   }
197 | 
198 |   // Follows a transition without any check.
199 |   BaseType FollowWithoutCheck(BaseType index, UCharType label) const {
200 |     return index ^ dic_->units()[index].offset() ^ label;
201 |   }
202 | 
203 |   // Creates a node and returns its index in nodes_.
204 |   BaseType CreateNode(BaseType dic_index, BaseType prev_node_index,
205 |                       UCharType label) {
206 |     RankedCompleterNode node;
207 |     node.set_dic_index(dic_index);
208 |     node.set_prev_node_index(prev_node_index);
209 |     node.set_label(label);
210 |     if (node.label() != '\0') {  // Nodes with a '\0' label keep has_terminal == false.
211 |       node.set_has_terminal(dic_->has_value(node.dic_index()));
212 |     }
213 |     nodes_.push_back(node);
214 | 
215 |     return static_cast<BaseType>(nodes_.size() - 1);
216 |   }
217 | };
218 | 
219 | typedef RankedCompleterBase<> RankedCompleter;
220 | 
221 | }  // namespace dawgdic
222 | 
223 | #endif  // DAWGDIC_RANKED_COMPLETER_H
224 | 


--------------------------------------------------------------------------------
/lib/dawgdic/ranked-guide-builder.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_RANKED_GUIDE_BUILDER_H
  2 | #define DAWGDIC_RANKED_GUIDE_BUILDER_H
  3 | 
  4 | #include "dawg.h"
  5 | #include "dictionary.h"
  6 | #include "ranked-guide.h"
  7 | #include "ranked-guide-link.h"
  8 | 
  9 | #include <algorithm>
 10 | #include <functional>
 11 | #include <vector>
 12 | 
 13 | namespace dawgdic {
 14 | 
 15 | class RankedGuideBuilder {
 16 |  public:
 17 |   // Builds a ranked guide for completing keys (std::less value order).
 18 |   static bool Build(const Dawg &dawg, const Dictionary &dic,
 19 |                     RankedGuide *guide) {
 20 |     std::less<ValueType> default_comparer;
 21 |     return Build(dawg, dic, guide, default_comparer);
 22 |   }
 23 |   // Builds a ranked guide whose values are ordered by the given comparer.
 24 |   template <typename VALUE_COMPARER_TYPE>
 25 |   static bool Build(const Dawg &dawg, const Dictionary &dic,
 26 |                     RankedGuide *guide, VALUE_COMPARER_TYPE value_comparer) {
 27 |     RankedGuideBuilder builder(dawg, dic, guide);
 28 |     return builder.BuildRankedGuide(value_comparer);
 29 |   }
 30 | 
 31 |  private:
 32 |   const Dawg &dawg_;
 33 |   const Dictionary &dic_;
 34 |   RankedGuide *guide_;
 35 | 
 36 |   std::vector<RankedGuideUnit> units_;
 37 |   std::vector<RankedGuideLink> links_;
 38 |   std::vector<UCharType> is_fixed_table_;
 39 | 
 40 |   // Disallows copies.
 41 |   RankedGuideBuilder(const RankedGuideBuilder &);
 42 |   RankedGuideBuilder &operator=(const RankedGuideBuilder &);
 43 | 
 44 |   RankedGuideBuilder(const Dawg &dawg, const Dictionary &dic,
 45 |                      RankedGuide *guide)
 46 |     : dawg_(dawg), dic_(dic), guide_(guide),  // Stored as references and a pointer; nothing is copied.
 47 |       units_(), links_(), is_fixed_table_() {}  // Work buffers start empty.
 48 | 
 49 |   template <typename VALUE_COMPARER_TYPE>
 50 |   bool BuildRankedGuide(VALUE_COMPARER_TYPE value_comparer) {
 51 |     // Initializes units and flags.
 52 |     units_.resize(dic_.size());
 53 |     is_fixed_table_.resize(dic_.size() / 8, '\0');
 54 | 
 55 |     if (dawg_.size() <= 1) {
 56 |       return true;
 57 |     }
 58 | 
 59 |     ValueType max_value = -1;
 60 |     if (!BuildRankedGuide(dawg_.root(), dic_.root(),
 61 |                           &max_value, value_comparer)) {
 62 |       return false;
 63 |     }
 64 | 
 65 |     guide_->SwapUnitsBuf(&units_);
 66 |     return true;
 67 |   }
 68 | 
 69 |   // Builds a guide recursively.
 70 |   template <typename VALUE_COMPARER_TYPE>
 71 |   bool BuildRankedGuide(BaseType dawg_index, BaseType dic_index,
 72 |                         ValueType *max_value,
 73 |                         VALUE_COMPARER_TYPE value_comparer) {
 74 |     if (is_fixed(dic_index)) {
 75 |       return FindMaxValue(dic_index, max_value);
 76 |     }
 77 |     set_is_fixed(dic_index);
 78 | 
 79 |     SizeType initial_num_links = links_.size();
 80 | 
 81 |     // Enumerates links to the next states.
 82 |     if (!EnumerateLinks(dawg_index, dic_index, value_comparer)) {
 83 |       return false;
 84 |     }
 85 | 
 86 |     std::stable_sort(links_.begin() + initial_num_links, links_.end(),
 87 |       RankedGuideLink::MakeComparer(value_comparer));
 88 | 
 89 |     // Reflects links into units.
 90 |     if (!TurnLinksToUnits(dic_index, initial_num_links)) {
 91 |       return false;
 92 |     }
 93 | 
 94 |     *max_value = links_[initial_num_links].value();
 95 |     links_.resize(initial_num_links);
 96 | 
 97 |     return true;
 98 |   }
 99 | 
100 |   // Finds the maximum value by using fixed units.
101 |   bool FindMaxValue(BaseType dic_index, ValueType *max_value) const {
102 |     while (units_[dic_index].child() != '\0') {
103 |       UCharType child_label = units_[dic_index].child();
104 |       if (!dic_.Follow(child_label, &dic_index)) {
105 |         return false;
106 |       }
107 |     }
108 |     if (!dic_.has_value(dic_index)) {
109 |       return false;
110 |     }
111 |     *max_value = dic_.value(dic_index);
112 |     return true;
113 |   }
114 | 
115 |   // Enumerates links to the next states.
116 |   template <typename VALUE_COMPARER_TYPE>
117 |   bool EnumerateLinks(BaseType dawg_index, BaseType dic_index,
118 |                       VALUE_COMPARER_TYPE value_comparer) {
119 |     for (BaseType dawg_child_index = dawg_.child(dawg_index);
120 |         dawg_child_index != 0;
121 |         dawg_child_index = dawg_.sibling(dawg_child_index)) {
122 |       ValueType value = -1;
123 |       UCharType child_label = dawg_.label(dawg_child_index);
124 |       if (child_label == '\0') {
125 |         if (!dic_.has_value(dic_index)) {
126 |           return false;
127 |         }
128 |         value = dic_.value(dic_index);
129 |       } else {
130 |         BaseType dic_child_index = dic_index;
131 |         if (!dic_.Follow(child_label, &dic_child_index)) {
132 |           return false;
133 |         }
134 | 
135 |         if (!BuildRankedGuide(dawg_child_index, dic_child_index,
136 |                               &value, value_comparer)) {
137 |           return false;
138 |         }
139 |       }
140 |       links_.push_back(RankedGuideLink(child_label, value));
141 |     }
142 | 
143 |     return true;
144 |   }
145 | 
146 |   // Modifies units.
147 |   bool TurnLinksToUnits(BaseType dic_index, SizeType links_begin) {
148 |     // The first child.
149 |     UCharType first_label = links_[links_begin].label();
150 |     units_[dic_index].set_child(first_label);
151 |     BaseType dic_child_index = FollowWithoutCheck(dic_index, first_label);
152 | 
153 |     // Other children.
154 |     for (SizeType i = links_begin + 1; i < links_.size(); ++i) {
155 |       UCharType sibling_label = links_[i].label();
156 | 
157 |       BaseType dic_sibling_index =
158 |           FollowWithoutCheck(dic_index, sibling_label);
159 |       units_[dic_child_index].set_sibling(sibling_label);
160 |       dic_child_index = dic_sibling_index;
161 |     }
162 | 
163 |     return true;
164 |   }
165 | 
166 |   // Follows a transition without any check.
167 |   BaseType FollowWithoutCheck(BaseType index, UCharType label) const {
168 |     return index ^ dic_.units()[index].offset() ^ label;
169 |   }
170 | 
171 |   void set_is_fixed(BaseType index) {
172 |     is_fixed_table_[index / 8] |= 1 << (index % 8);
173 |   }
174 | 
175 |   bool is_fixed(BaseType index) const {
176 |     return (is_fixed_table_[index / 8] & (1 << (index % 8))) != 0;
177 |   }
178 | };
179 | 
180 | }  // namespace dawgdic
181 | 
182 | #endif  // DAWGDIC_RANKED_GUIDE_BUILDER_H
183 | 


--------------------------------------------------------------------------------
/lib/dawgdic/ranked-guide-link.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_RANKED_GUIDE_LINK_H
 2 | #define DAWGDIC_RANKED_GUIDE_LINK_H
 3 | 
 4 | namespace dawgdic {
 5 | 
 6 | class RankedGuideLink {
 7 |  public:
 8 |   // A default link carries the '\0' label and the value -1.
 9 |   RankedGuideLink() : label_('\0'), value_(-1) {}
10 |   RankedGuideLink(UCharType label, ValueType value)
11 |     : label_(label), value_(value) {}
12 | 
13 |   // Setters.
14 |   void set_label(UCharType label) { label_ = label; }
15 |   void set_value(ValueType value) { value_ = value; }
16 | 
17 |   // Getters.
18 |   UCharType label() const { return label_; }
19 |   ValueType value() const { return value_; }
20 | 
21 |   // Orders links for sorting.  value_comparer is called with its arguments
22 |   // swapped, so with std::less the larger value sorts first; links whose
23 |   // values are equal are ordered by ascending label.
24 |   template <typename VALUE_COMPARER_TYPE>
25 |   class Comparer {
26 |    public:
27 |     typedef VALUE_COMPARER_TYPE ValueComparerType;
28 | 
29 |     explicit Comparer(ValueComparerType value_comparer)
30 |       : value_comparer_(value_comparer) {}
31 | 
32 |     bool operator()(const RankedGuideLink &lhs,
33 |                     const RankedGuideLink &rhs) const {
34 |       const ValueType lhs_value = lhs.value();
35 |       const ValueType rhs_value = rhs.value();
36 |       if (lhs_value == rhs_value) {
37 |         return lhs.label() < rhs.label();
38 |       }
39 |       return value_comparer_(rhs_value, lhs_value);
40 |     }
41 | 
42 |    private:
43 |     ValueComparerType value_comparer_;
44 |   };
45 | 
46 |   // Wraps value_comparer in a Comparer, deducing the template argument.
47 |   template <typename VALUE_COMPARER_TYPE>
48 |   static Comparer<VALUE_COMPARER_TYPE> MakeComparer(
49 |       VALUE_COMPARER_TYPE value_comparer) {
50 |     return Comparer<VALUE_COMPARER_TYPE>(value_comparer);
51 |   }
52 | 
53 |  private:
54 |   UCharType label_;
55 |   ValueType value_;
56 | 
57 |   // Copyable.
58 | };
59 | 
60 | }  // namespace dawgdic
61 | 
62 | #endif  // DAWGDIC_RANKED_GUIDE_LINK_H
63 | 


--------------------------------------------------------------------------------
/lib/dawgdic/ranked-guide-unit.h:
--------------------------------------------------------------------------------
 1 | #ifndef DAWGDIC_RANKED_GUIDE_UNIT_H
 2 | #define DAWGDIC_RANKED_GUIDE_UNIT_H
 3 | 
 4 | #include "base-types.h"
 5 | 
 6 | namespace dawgdic {
 7 | 
 8 | // Holds two labels for one unit: the label of a child and the label of a
 9 | // sibling.
10 | class RankedGuideUnit {
11 |  public:
12 |   // Both labels start out as '\0'.
13 |   RankedGuideUnit() : child_('\0'), sibling_('\0') {}
14 | 
15 |   // Setters.
16 |   void set_child(UCharType child) { child_ = child; }
17 |   void set_sibling(UCharType sibling) { sibling_ = sibling; }
18 | 
19 |   // Getters.
20 |   UCharType child() const { return child_; }
21 |   UCharType sibling() const { return sibling_; }
22 | 
23 |  private:
24 |   // RankedGuide reads and writes arrays of these objects as raw bytes
25 |   // (sizeof(RankedGuideUnit) per unit), so the data members and their
26 |   // order define the stored format.
27 |   UCharType child_;
28 |   UCharType sibling_;
29 | 
30 |   // Copyable.
31 | };
32 | 
33 | }  // namespace dawgdic
34 | 
35 | #endif  // DAWGDIC_RANKED_GUIDE_UNIT_H
36 | 


--------------------------------------------------------------------------------
/lib/dawgdic/ranked-guide.h:
--------------------------------------------------------------------------------
  1 | #ifndef DAWGDIC_RANKED_GUIDE_H
  2 | #define DAWGDIC_RANKED_GUIDE_H
  3 | 
  4 | #include "dictionary.h"
  5 | #include "ranked-guide-unit.h"
  6 | 
  7 | #include <iostream>
  8 | #include <vector>
  9 | 
 10 | namespace dawgdic {
 11 | 
 12 | class RankedGuide {
 13 |  public:
 14 |   RankedGuide() : units_(NULL), size_(0), units_buf_() {}
 15 | 
 16 |   const RankedGuideUnit *units() const {
 17 |     return units_;
 18 |   }
 19 |   SizeType size() const {
 20 |     return size_;
 21 |   }
 22 |   SizeType total_size() const {  // bytes taken by the units
 23 |     return sizeof(RankedGuideUnit) * size_;
 24 |   }
 25 |   SizeType file_size() const {  // units plus the BaseType count header
 26 |     return sizeof(BaseType) + total_size();
 27 |   }
 28 | 
 29 |   // The root index.
 30 |   BaseType root() const {
 31 |     return 0;
 32 |   }
 33 | 
 34 |   UCharType child(BaseType index) const {
 35 |     return units_[index].child();
 36 |   }
 37 |   UCharType sibling(BaseType index) const {
 38 |     return units_[index].sibling();
 39 |   }
 40 | 
 41 |   // Reads a guide from an input stream: a BaseType count, then raw units.
 42 |   bool Read(std::istream *input) {
 43 |     BaseType base_size;
 44 |     if (!input->read(reinterpret_cast<char *>(&base_size), sizeof(BaseType))) {
 45 |       return false;
 46 |     }
 47 | 
 48 |     // An empty vector has no element 0, so skip the read when size is 0.
 49 |     SizeType size = static_cast<SizeType>(base_size);
 50 |     std::vector<RankedGuideUnit> units_buf(size);
 51 |     if (size != 0 && !input->read(reinterpret_cast<char *>(&units_buf[0]),
 52 |                                   sizeof(RankedGuideUnit) * size)) {
 53 |       return false;
 54 |     }
 55 |     SwapUnitsBuf(&units_buf);
 56 |     return true;
 57 |   }
 58 | 
 59 |   // Writes a guide to an output stream: a BaseType count, then raw units.
 60 |   bool Write(std::ostream *output) const {
 61 |     BaseType base_size = static_cast<BaseType>(size_);
 62 |     if (!output->write(reinterpret_cast<const char *>(&base_size),
 63 |                        sizeof(BaseType))) {
 64 |       return false;
 65 |     }
 66 | 
 67 |     if (!output->write(reinterpret_cast<const char *>(units_),
 68 |                        sizeof(RankedGuideUnit) * size_)) {
 69 |       return false;
 70 |     }
 71 | 
 72 |     return true;
 73 |   }
 74 | 
 75 |   // Maps memory with its size.  The memory is referenced, not copied.
 76 |   void Map(const void *address) {  // layout: BaseType count, then units
 77 |     Clear();
 78 |     units_ = reinterpret_cast<const RankedGuideUnit *>(
 79 |         static_cast<const BaseType *>(address) + 1);
 80 |     size_ = *static_cast<const BaseType *>(address);
 81 |   }
 82 |   void Map(const void *address, SizeType size) {  // layout: units only
 83 |     Clear();
 84 |     units_ = static_cast<const RankedGuideUnit *>(address);
 85 |     size_ = size;
 86 |   }
 87 | 
 88 |   // Swaps RankedGuides.
 89 |   void Swap(RankedGuide *guide) {
 90 |     std::swap(units_, guide->units_);
 91 |     std::swap(size_, guide->size_);
 92 |     units_buf_.swap(guide->units_buf_);
 93 |   }
 94 | 
 95 |   // Initializes a RankedGuide, releasing the owned buffer.
 96 |   void Clear() {
 97 |     units_ = NULL;
 98 |     size_ = 0;
 99 |     std::vector<RankedGuideUnit>(0).swap(units_buf_);
100 |   }
101 | 
102 |  public:
103 |   // Following member function is called from RankedGuideBuilder.
104 | 
105 |   // Takes over the storage of units_buf; an empty buffer maps to NULL.
106 |   void SwapUnitsBuf(std::vector<RankedGuideUnit> *units_buf) {
107 |     units_ = units_buf->empty() ? NULL : &(*units_buf)[0];
108 |     size_ = static_cast<BaseType>(units_buf->size());
109 |     units_buf_.swap(*units_buf);
110 |   }
111 | 
112 |  private:
113 |   const RankedGuideUnit *units_;
114 |   SizeType size_;
115 |   std::vector<RankedGuideUnit> units_buf_;
116 | 
117 |   // Disables copies.
118 |   RankedGuide(const RankedGuide &);
119 |   RankedGuide &operator=(const RankedGuide &);
120 | };
121 | 
122 | }  // namespace dawgdic
123 | 
124 | #endif  // DAWGDIC_RANKED_GUIDE_H
125 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | import glob
 3 | from setuptools import setup, Extension
 4 | 
 5 | setup(
 6 |     name="DAWG",
 7 |     version="0.8.0",
 8 |     description="Fast and memory efficient DAWG (DAFSA) for Python",
 9 |     long_description=open('README.rst').read() + '\n\n' + open('CHANGES.rst').read(),
10 |     author='Mikhail Korobov',
11 |     author_email='kmike84@gmail.com',
12 |     url='https://github.com/pytries/DAWG/',
13 | 
14 |     ext_modules=[
15 |         Extension(
16 |             "dawg",
17 |             sources=glob.glob('src/*.cpp') + glob.glob('lib/b64/*.c'),
18 |             include_dirs=['lib'],
19 |             language="c++",
20 |         )
21 |     ],
22 | 
23 |     classifiers=[
24 |         'Development Status :: 4 - Beta',
25 |         'Intended Audience :: Developers',
26 |         'Intended Audience :: Science/Research',
27 |         'License :: OSI Approved :: MIT License',
28 |         'Programming Language :: Cython',
29 |         'Programming Language :: Python',
30 |         'Programming Language :: Python :: 2',
31 |         'Programming Language :: Python :: 2.7',
32 |         'Programming Language :: Python :: 3',
33 |         'Programming Language :: Python :: 3.5',
34 |         'Programming Language :: Python :: 3.6',
35 |         'Programming Language :: Python :: 3.7',
36 |         'Programming Language :: Python :: 3.8',
37 |         'Programming Language :: Python :: Implementation :: CPython',
38 |         'Topic :: Software Development :: Libraries :: Python Modules',
39 |         'Topic :: Scientific/Engineering :: Information Analysis',
40 |         'Topic :: Text Processing :: Linguistic',
41 |     ],
42 | )
43 | 


--------------------------------------------------------------------------------
/src/_base_types.pxd:
--------------------------------------------------------------------------------
 1 | cdef extern from "../lib/dawgdic/base-types.h" namespace "dawgdic":
 2 |     # 8-bit characters.
 3 |     ctypedef char CharType
 4 |     ctypedef unsigned char UCharType
 5 | 
 6 |     # 32-bit integer.
 7 |     ctypedef int ValueType
 8 | 
 9 |     # 32-bit unsigned integer.
10 |     ctypedef unsigned int BaseType
11 | 
12 |     # 32 or 64-bit unsigned integer (NOTE(review): declared as a signed int here; confirm this is intended).
13 |     ctypedef int SizeType
14 | 
15 | 


--------------------------------------------------------------------------------
/src/_completer.pxd:
--------------------------------------------------------------------------------
 1 | from _base_types cimport BaseType, SizeType, ValueType
 2 | from _dawg cimport Dawg
 3 | from _dictionary cimport Dictionary
 4 | from _guide cimport Guide
 5 | 
 6 | cdef extern from "../lib/dawgdic/completer.h" namespace "dawgdic" nogil:
 7 |     cdef cppclass Completer:
 8 |         Completer()
 9 |         Completer(Dictionary &dic, Guide &guide)
10 | 
11 |         void set_dic(Dictionary &dic)
12 |         void set_guide(Guide &guide)
13 | 
14 |         Dictionary &dic()
15 |         Guide &guide()
16 | 
17 |         # These member functions are available only when Next() returns true.
18 |         char *key()
19 |         SizeType length()
20 |         ValueType value()
21 | 
22 |         # Starts completing keys from given index and prefix.
23 |         void Start(BaseType index)
24 |         void Start(BaseType index, char *prefix)
25 |         void Start(BaseType index, char *prefix, SizeType length)
26 | 
27 |         # Gets the next key; key(), length() and value() describe it once this returns true.
28 |         bint Next()


--------------------------------------------------------------------------------
/src/_dawg.pxd:
--------------------------------------------------------------------------------
 1 | from _base_types cimport BaseType, SizeType, ValueType, UCharType
 2 | 
 3 | cdef extern from "../lib/dawgdic/dawg.h" namespace "dawgdic":
 4 | 
 5 |     cdef cppclass Dawg:
 6 |         Dawg()
 7 | 
 8 |         # The root index.
 9 |         BaseType root() nogil
10 | 
11 |         # Number of units.
12 |         SizeType size() nogil
13 | 
14 |         # Number of transitions.
15 |         SizeType num_of_transitions() nogil
16 | 
17 |         # Number of states.
18 |         SizeType num_of_states() nogil
19 | 
20 |         # Number of merged transitions.
21 |         SizeType num_of_merged_transitions() nogil
22 | 
23 |         # Number of merged states.
24 |         SizeType num_of_merged_states() nogil
25 | 
26 |         # Number of merging states.
27 |         SizeType num_of_merging_states() nogil
28 | 
29 |         # Reads values. RankedGuideBuilder::EnumerateLinks treats a 0 from child()/sibling() as the end of the chain.
30 |         BaseType child(BaseType index) nogil
31 | 
32 |         BaseType sibling(BaseType index) nogil
33 | 
34 |         ValueType value(BaseType index) nogil
35 | 
36 |         bint is_leaf(BaseType index) nogil
37 | 
38 |         UCharType label(BaseType index) nogil
39 | 
40 |         bint is_merging(BaseType index) nogil
41 | 
42 |         # Clears object pools.
43 |         void Clear() nogil
44 | 
45 |         # Swaps dawgs.
46 |         void Swap(Dawg *dawg) nogil
47 | 


--------------------------------------------------------------------------------
/src/_dawg_builder.pxd:
--------------------------------------------------------------------------------
 1 | from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
 2 | from _dawg cimport Dawg
 3 | 
 4 | cdef extern from "../lib/dawgdic/dawg-builder.h" namespace "dawgdic":
 5 |     cdef cppclass DawgBuilder:
 6 | 
 7 |         DawgBuilder() nogil  #(SizeType initial_hash_table_size = DEFAULT_INITIAL_HASH_TABLE_SIZE)
 8 | 
 9 |         # Number of units.
10 |         SizeType size() nogil
11 | 
12 |         # Number of transitions.
13 |         SizeType num_of_transitions() nogil
14 | 
15 |         # Number of states.
16 |         SizeType num_of_states() nogil
17 | 
18 |         # Number of merged transitions.
19 |         SizeType num_of_merged_transitions() nogil
20 | 
21 |         # Number of merged states.
22 |         SizeType num_of_merged_states() nogil
23 | 
24 |         # Number of merging states.
25 |         SizeType num_of_merging_states() nogil
26 | 
27 |         # Initializes a builder.
28 |         void Clear() nogil
29 | 
30 |         # Inserts a key; DAWG._build_from_iterable raises Error when this returns false.
31 |         bint Insert(CharType *key)
32 |         bint Insert(CharType *key, ValueType value)
33 |         bint Insert(CharType *key, SizeType length, ValueType value)
34 | 
35 |         # Finishes building a dawg; DAWG._build_from_iterable raises Error when this returns false.
36 |         bint Finish(Dawg *dawg)
37 | 


--------------------------------------------------------------------------------
/src/_dictionary.pxd:
--------------------------------------------------------------------------------
 1 | from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
 2 | from _dictionary_unit cimport DictionaryUnit
 3 | from iostream cimport istream, ostream
 4 | 
 5 | cdef extern from "../lib/dawgdic/dictionary.h" namespace "dawgdic":
 6 |     cdef cppclass Dictionary:
 7 | 
 8 |         Dictionary() nogil
 9 | 
10 |         DictionaryUnit *units() nogil
11 |         SizeType size() nogil
12 |         SizeType total_size() nogil
13 |         SizeType file_size() nogil
14 | 
15 |         # Root index.
16 |         BaseType root() nogil
17 | 
18 |         # Checks if a given index is related to the end of a key.
19 |         bint has_value(BaseType index) nogil
20 | 
21 |         # Gets a value from a given index.
22 |         ValueType value(BaseType index) nogil
23 | 
24 |         # Reads a dictionary from an input stream.
25 |         bint Read(istream *input) nogil except +
26 | 
27 |         # Writes a dictionary to an output stream.
28 |         bint Write(ostream *output) nogil except +
29 | 
30 |         # Exact matching.
31 |         bint Contains(CharType *key) nogil
32 |         bint Contains(CharType *key, SizeType length) nogil
33 | 
34 |         # Exact matching.
35 |         ValueType Find(CharType *key) nogil
36 |         ValueType Find(CharType *key, SizeType length) nogil
37 |         bint Find(CharType *key, ValueType *value) nogil
38 |         bint Find(CharType *key, SizeType length, ValueType *value) nogil
39 | 
40 |         # Follows a transition; *index is both the start state and the output.
41 |         bint Follow(CharType label, BaseType *index) nogil
42 | 
43 |         # Follows transitions.
44 |         bint Follow(CharType *s, BaseType *index) nogil
45 |         bint Follow(CharType *s, BaseType *index, SizeType *count) nogil
46 | 
47 |         # Follows transitions.
48 |         bint Follow(CharType *s, SizeType length, BaseType *index) nogil
49 |         bint Follow(CharType *s, SizeType length, BaseType *index, SizeType *count) nogil
50 | 
51 |         # Maps memory with its size.
52 |         void Map(void *address) nogil
53 |         void Map(void *address, SizeType size) nogil
54 | 
55 |         # Initializes a dictionary.
56 |         void Clear() nogil
57 | 
58 |         # Swaps dictionaries.
59 |         void Swap(Dictionary *dic) nogil
60 |         # Shrinks a vector.
61 |         void Shrink() nogil


--------------------------------------------------------------------------------
/src/_dictionary_builder.pxd:
--------------------------------------------------------------------------------
1 | from _base_types cimport BaseType
2 | from _dawg cimport Dawg
3 | from _dictionary cimport Dictionary
4 | 
5 | cdef extern from "../lib/dawgdic/dictionary-builder.h" namespace "dawgdic::DictionaryBuilder":
6 |     cdef bint Build (Dawg &dawg, Dictionary *dic) nogil  # DAWG._build_from_iterable raises Error when this returns false
7 | 
8 | 


--------------------------------------------------------------------------------
/src/_dictionary_unit.pxd:
--------------------------------------------------------------------------------
 1 | from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
 2 | 
 3 | cdef extern from "../lib/dawgdic/dictionary-unit.h" namespace "dawgdic":
 4 |     cdef cppclass DictionaryUnit:
 5 | 
 6 |         DictionaryUnit() nogil
 7 | 
 8 |         # Sets a flag to show that a unit has a leaf as a child.
 9 |         void set_has_leaf() nogil
10 | 
11 |         # Sets a value to a leaf unit.
12 |         void set_value(ValueType value) nogil
13 | 
14 |         # Sets a label to a non-leaf unit.
15 |         void set_label(UCharType label) nogil
16 | 
17 |         # Sets an offset to a non-leaf unit.
18 |         bint set_offset(BaseType offset) nogil
19 | 
20 | 
21 |         # Checks if a unit has a leaf as a child or not.
22 |         bint has_leaf() nogil
23 | 
24 |         # Presumably reads the value of a leaf unit (NOTE(review): the previous comment described a leaf check; confirm against dictionary-unit.h).
25 |         ValueType value() nogil
26 | 
27 |         # Reads a label with a leaf flag from a non-leaf unit.
28 |         BaseType label() nogil
29 | 
30 |         # Reads an offset to child units from a non-leaf unit.
31 |         BaseType offset() nogil
32 | 


--------------------------------------------------------------------------------
/src/_guide.pxd:
--------------------------------------------------------------------------------
 1 | from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
 2 | from _guide_unit cimport GuideUnit
 3 | from iostream cimport istream, ostream
 4 | 
 5 | cdef extern from "../lib/dawgdic/guide.h" namespace "dawgdic":
 6 |     cdef cppclass Guide:
 7 | 
 8 |         Guide()
 9 | 
10 |         GuideUnit *units()
11 |         SizeType size()
12 |         SizeType total_size()
13 |         SizeType file_size()
14 | 
15 |         # The root index.
16 |         BaseType root()
17 | 
18 |         UCharType child(BaseType index)
19 |         UCharType sibling(BaseType index)
20 | 
21 |         # Reads a guide from an input stream.
22 |         bint Read(istream *input)
23 | 
24 |         # Writes a guide to an output stream.
25 |         bint Write(ostream *output)
26 | 
27 |         # Maps memory with its size.
28 |         void Map(void *address)
29 | 
30 |         # Swaps Guides.
31 |         void Swap(Guide *Guide)
32 | 
33 |         # Initializes a Guide.
34 |         void Clear()


--------------------------------------------------------------------------------
/src/_guide_builder.pxd:
--------------------------------------------------------------------------------
1 | from _base_types cimport BaseType
2 | from _dawg cimport Dawg
3 | from _dictionary cimport Dictionary
4 | from _guide cimport Guide
5 | 
6 | cdef extern from "../lib/dawgdic/guide-builder.h" namespace "dawgdic::GuideBuilder":
7 |     cdef bint Build (Dawg &dawg, Dictionary &dic, Guide* guide) nogil  # static builder; presumably false on failure -- TODO confirm in guide-builder.h
8 | 
9 | 


--------------------------------------------------------------------------------
/src/_guide_unit.pxd:
--------------------------------------------------------------------------------
 1 | from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
 2 | 
 3 | cdef extern from "../lib/dawgdic/guide-unit.h" namespace "dawgdic":
 4 |     cdef cppclass GuideUnit:
 5 |         GuideUnit() nogil
 6 |         # Setter/getter pairs for the unit's child and sibling labels.
 7 |         void set_child(UCharType child) nogil
 8 |         void set_sibling(UCharType sibling) nogil
 9 |         UCharType child() nogil
10 |         UCharType sibling() nogil
11 | 


--------------------------------------------------------------------------------
/src/b64_decode.pxd:
--------------------------------------------------------------------------------
 1 | from iostream cimport istream, ostream
 2 | 
 3 | cdef extern from "../lib/b64/decode.h" namespace "base64":
 4 | 
 5 |     cdef cppclass decoder:
 6 |         decoder()
 7 |         decoder(int buffersize_in)
 8 | 
 9 |         int decode(char* code_in, int length_in, char* plaintext_out)
10 |         void init()
11 |         # NOTE(review): streams are declared by value here; the C++ signature presumably takes references -- confirm in decode.h.
12 |         void decode(istream istream_in, ostream ostream_in)


--------------------------------------------------------------------------------
/src/dawg.pyx:
--------------------------------------------------------------------------------
  1 | # cython: profile=False
  2 | # cython: embedsignature=True
  3 | from __future__ import unicode_literals
  4 | from libcpp.string cimport string
  5 | from libcpp.vector cimport vector
  6 | from iostream cimport stringstream, istream, ostream, ifstream
  7 | cimport iostream
  8 | 
  9 | cimport _dawg
 10 | from _dawg_builder cimport DawgBuilder
 11 | from _dictionary cimport Dictionary
 12 | from _guide cimport Guide
 13 | from _completer cimport Completer
 14 | from _base_types cimport BaseType, SizeType, CharType
 15 | cimport _guide_builder
 16 | cimport _dictionary_builder
 17 | cimport b64_decode
 18 | 
 19 | try:
 20 |     from collections.abc import Mapping
 21 | except ImportError:
 22 |     # Python 2.7
 23 |     from collections import Mapping
 24 | import struct
 25 | import sys
 26 | from binascii import b2a_base64
 27 | 
 28 | 
 29 | class Error(Exception):  # raised by this module, e.g. when building a DAWG fails
 30 |     pass
 31 | 
 32 | 
 33 | cdef class DAWG:
 34 |     """
 35 |     Base DAWG wrapper.
 36 |     """
 37 |     cdef Dictionary dct
 38 |     cdef _dawg.Dawg dawg
 39 | 
 40 |     def __init__(self, arg=None, input_is_sorted=False):
 41 |         # Unless the caller declares the input sorted, unicode keys are
 42 |         # encoded to UTF-8 and the whole input is sorted before building.
 43 |         keys = [] if arg is None else arg
 44 |         if not input_is_sorted:
 45 |             keys = sorted(
 46 |                 (<unicode>key).encode('utf8') if isinstance(key, unicode) else key
 47 |                 for key in keys
 48 |             )
 49 |         self._build_from_iterable(keys)
 50 | 
 51 |     def __dealloc__(self):
 52 |         self.dct.Clear()  # re-initializes the dictionary
 53 |         self.dawg.Clear()  # clears the dawg's object pools
 54 | 
 55 |     def _build_from_iterable(self, iterable):
 56 |         # Inserts every item into a DawgBuilder (an item is a key or a
 57 |         # (key, value) pair), then builds self.dawg and self.dct from it.
 58 |         cdef DawgBuilder dawg_builder
 59 |         cdef bytes b_key
 60 |         cdef int value
 61 | 
 62 |         for key in iterable:
 63 |             value = 0
 64 |             if isinstance(key, (tuple, list)):
 65 |                 key, value = key
 66 |                 if value < 0:
 67 |                     raise ValueError("Negative values are not supported")
 68 | 
 69 |             b_key = (
 70 |                 (<unicode>key).encode('utf8') if isinstance(key, unicode) else key
 71 |             )
 72 | 
 73 |             if not dawg_builder.Insert(b_key, len(b_key), value):
 74 |                 raise Error("Can't insert key %r (with value %r)" % (b_key, value))
 75 | 
 76 |         if not dawg_builder.Finish(&self.dawg):
 77 |             raise Error("dawg_builder.Finish error")
 78 | 
 79 |         if not _dictionary_builder.Build(self.dawg, &self.dct):
 80 |             raise Error("Can't build dictionary")
 81 | 
 82 |     def __contains__(self, key):
 83 |         # Unicode keys go through has_key; anything else is looked up as bytes.
 84 |         is_text = isinstance(key, unicode)
 85 |         return self.has_key(<unicode>key) if is_text else self.b_has_key(key)
 86 | 
 87 |     cpdef bint has_key(self, unicode key) except -1:
 88 |         return self.b_has_key(<bytes>key.encode('utf8'))  # look the key up by its UTF-8 encoding
 89 | 
 90 |     cpdef bint b_has_key(self, bytes key) except -1:
 91 |         return self.dct.Contains(key, len(key))  # exact match; the length is passed explicitly
 92 | 
 93 |     cpdef bytes tobytes(self) except +:
 94 |         """Return raw DAWG content as bytes; raise IOError if writing fails."""
 95 |         cdef stringstream stream
 96 |         cdef bytes res
 97 |         if not self.dct.Write(<ostream *> &stream):
 98 |             raise IOError("Can't write DAWG data")
 99 |         res = stream.str()
100 |         return res
101 | 
102 |     cpdef frombytes(self, bytes data):
103 |         """
104 |         Load DAWG from bytes ``data`` and return ``self`` (IOError on bad data).
105 | 
106 |         FIXME: it seems there is a memory leak here (DAWG uses 3x memory
107 |         when loaded using ``.frombytes`` compared to DAWG loaded
108 |         using ``.load``).
109 |         """
110 |         cdef string s_data = data
111 |         cdef stringstream* stream = new stringstream(s_data)
112 |         # The stream is heap-allocated, so ``finally`` always deletes it.
113 |         try:
114 |             res = self.dct.Read(<istream *> stream)
115 | 
116 |             if not res:
117 |                 self.dct.Clear()
118 |                 raise IOError("Invalid data format")
119 | 
120 |             return self
121 |         finally:
122 |             del stream
123 | 
124 |     def read(self, f):
125 |         """
126 |         Load DAWG from a file-like object and return ``self``.
127 | 
128 |         FIXME: this method shouldn't read the whole stream.
129 |         """
130 |         return self.frombytes(f.read())
131 | 
132 |     def write(self, f):
133 |         """
134 |         Write DAWG to a file-like object.
135 |         """
136 |         f.write(self.tobytes())  # the whole DAWG is serialized in memory first
137 | 
138 |     def load(self, path):
139 |         """
140 |         Load DAWG from the file at ``path`` and return ``self``; raise IOError if it can't be opened or parsed.
141 |         """
142 |         if isinstance(path, unicode):
143 |             path = path.encode(sys.getfilesystemencoding())
144 | 
145 |         cdef ifstream stream
146 |         stream.open(path, iostream.binary)
147 |         if stream.fail():
148 |             raise IOError("It's not possible to read file stream")
149 | 
150 |         res = self.dct.Read(<istream*> &stream)
151 |         # Close the stream before reporting a failed read.
152 |         stream.close()
153 | 
154 |         if not res:
155 |             self.dct.Clear()
156 |             raise IOError("Invalid data format")
157 | 
158 |         return self
159 | 
160 |     def save(self, path):
161 |         """
162 |         Save DAWG to the file at ``path`` (opened in binary write mode).
163 |         """
164 |         with open(path, 'wb') as f:
165 |             self.write(f)
166 | 
167 |     # pickling support
168 |     def __reduce__(self):
169 |         return self.__class__, tuple(), self.tobytes()  # (callable, args, state); the state is restored by __setstate__
170 | 
171 |     def __setstate__(self, state):
172 |         self.frombytes(state)  # state is the bytes produced by tobytes() in __reduce__
173 | 
174 |     # half-internal methods
175 |     def _size(self):
176 |         return self.dct.size()  # size() as reported by the underlying Dictionary
177 | 
178 |     def _total_size(self):
179 |         return self.dct.total_size()  # total_size() as reported by the underlying Dictionary
180 | 
181 |     def _file_size(self):
182 |         return self.dct.file_size()  # file_size() as reported by the underlying Dictionary
183 | 
184 |     cdef bint _has_value(self, BaseType index):
185 |         return  self.dct.has_value(index)  # true when index is related to the end of a key
186 | 
187 |     cdef list _similar_keys(self, unicode current_prefix, unicode key, BaseType cur_index, dict replace_chars):
188 |         cdef BaseType next_index, index = cur_index
189 |         cdef unicode prefix, u_replace_char, found_key
190 |         cdef bytes b_step, b_replace_char
191 |         cdef list res = []
192 |         cdef list extra_keys
193 |         # Matching resumes right after the part covered by current_prefix.
194 |         cdef int start_pos = len(current_prefix)
195 |         cdef int end_pos = len(key)
196 |         cdef int word_pos = start_pos
197 | 
198 |         while word_pos < end_pos:
199 |             b_step = <bytes>(key[word_pos].encode('utf8'))
200 |             # If this char has a replacement, follow it and recurse on the rest.
201 |             if b_step in replace_chars:
202 |                 next_index = index
203 |                 b_replace_char, u_replace_char = <tuple>replace_chars[b_step]
204 | 
205 |                 if self.dct.Follow(b_replace_char, &next_index):
206 |                     prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
207 |                     extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
208 |                     res.extend(extra_keys)
209 |             # Then continue with the original char; stop if it can't be followed.
210 |             if not self.dct.Follow(b_step, &index):
211 |                 break
212 |             word_pos += 1
213 | 
214 |         else:  # runs only when the loop ended without ``break``
215 |             if self._has_value(index):
216 |                 found_key = current_prefix + key[start_pos:]
217 |                 res.insert(0, found_key)
218 | 
219 |         return res
220 | 
221 |     cpdef list similar_keys(self, unicode key, dict replaces):
222 |         """
223 |         Return all variants of ``key`` in this DAWG according to
224 |         ``replaces``.
225 | 
226 |         ``replaces`` is an object obtained from
227 |         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
228 |         that maps single-char unicode strings to other single-char
229 |         unicode strings.
230 | 
231 |         This may be useful e.g. for handling single-character umlauts.
232 |         """
233 |         # Start the recursive search at the root with nothing matched yet.
234 |         cdef unicode nothing_matched = ""
235 |         return self._similar_keys(nothing_matched, key, self.dct.root(), replaces)
234 | 
235 |     cpdef list prefixes(self, unicode key):
236 |         '''
237 |         Return a list with keys of this DAWG that are prefixes of the ``key``.
238 | 
239 |         The search itself is done by ``b_prefixes`` on the utf8-encoded
240 |         key; its results are decoded back to unicode.
241 |         '''
242 |         cdef bytes b_key = <bytes>key.encode('utf8')
243 |         cdef list b_found = self.b_prefixes(b_key)
244 |         return [b_prefix.decode('utf8') for b_prefix in b_found]
240 | 
241 |     cpdef list b_prefixes(self, bytes b_key):
242 |         '''
243 |         Bytes flavour of ``prefixes``: return a list with the leading
244 |         slices of ``b_key`` that are keys of this DAWG, shortest first.
245 |         '''
246 |         cdef list found = []
247 |         cdef BaseType index = self.dct.root()
248 |         cdef int consumed = 0
249 |         cdef CharType ch
250 | 
251 |         for ch in b_key:
252 |             if not self.dct.Follow(ch, &index):
253 |                 # No stored key continues with this byte.
254 |                 break
255 |             consumed += 1
256 |             if self._has_value(index):
257 |                 found.append(b_key[:consumed])
258 | 
259 |         return found
255 | 
256 |     def iterprefixes(self, unicode key):
257 |         '''
258 |         Return a generator with keys of this DAWG that are prefixes of the ``key``.
259 | 
260 |         Prefixes are produced from the shortest to the longest while the
261 |         utf8-encoded key is followed byte by byte.
262 |         '''
263 |         cdef bytes b_key = <bytes>key.encode('utf8')
264 |         cdef BaseType index = self.dct.root()
265 |         cdef int consumed = 0
266 |         cdef CharType ch
267 | 
268 |         for ch in b_key:
269 |             if not self.dct.Follow(ch, &index):
270 |                 return
271 |             consumed += 1
272 |             if self._has_value(index):
273 |                 yield b_key[:consumed].decode('utf8')
271 | 
272 |     @classmethod
273 |     def compile_replaces(cls, replaces):
274 |         """
275 |         Turn ``replaces`` (a mapping of single-char unicode strings to
276 |         single-char unicode strings) into the dict expected by the
277 |         ``similar_*`` methods::
278 | 
279 |             {utf8-encoded key char: (utf8-encoded replacement, unicode replacement)}
280 | 
281 |         Raises ``ValueError`` if any key or value is not exactly one
282 |         character long; all pairs are validated before anything is encoded.
283 |         """
284 |         if any(len(src) != 1 or len(dst) != 1 for src, dst in replaces.items()):
285 |             raise ValueError("Keys and values must be single-char unicode strings.")
286 | 
287 |         compiled = {}
288 |         for src, dst in replaces.items():
289 |             b_src = src.encode('utf8')
290 |             compiled[b_src] = (dst.encode('utf8'), unicode(dst))
291 |         return compiled
286 | 
287 | 
288 | cdef void init_completer(Completer& completer, Dictionary& dic, Guide& guide):  # prepare a default-constructed Completer for use
289 |     completer.set_dic(dic)  # attach the dictionary
290 |     completer.set_guide(guide)  # attach the guide
291 | 
292 | 
293 | cdef class CompletionDAWG(DAWG):
294 |     """
295 |     DAWG with key completion support.
296 | 
297 |     Besides the dictionary inherited from ``DAWG`` it owns a ``Guide``;
298 |     a ``Completer`` initialized with both is used to enumerate keys.
299 |     """
300 |     cdef Guide guide
301 | 
302 |     def __init__(self, arg=None, input_is_sorted=False):
303 |         """
304 |         Initialize the base ``DAWG`` with the same arguments and then
305 |         build the guide; raises ``Error`` if the guide can't be built.
306 |         """
307 |         super(CompletionDAWG, self).__init__(arg, input_is_sorted)
308 |         built = _guide_builder.Build(self.dawg, self.dct, &self.guide)
309 |         if not built:
310 |             raise Error("Error building completion information")
311 | 
312 |     def __dealloc__(self):
313 |         # Clear the guide when the object is deallocated.
314 |         self.guide.Clear()
315 | 
316 |     cpdef list keys(self, unicode prefix=""):
317 |         """
318 |         Return a list of the keys produced by completing ``prefix``;
319 |         the list is empty if ``prefix`` can't be followed in the dictionary.
320 |         """
321 |         cdef list found = []
322 |         cdef bytes b_prefix = prefix.encode('utf8')
323 |         cdef BaseType index = self.dct.root()
324 |         cdef Completer completer
325 | 
326 |         if not self.dct.Follow(b_prefix, &index):
327 |             return found
328 | 
329 |         init_completer(completer, self.dct, self.guide)
330 |         completer.Start(index, b_prefix)
331 | 
332 |         while completer.Next():
333 |             found.append((<char*>completer.key()).decode('utf8'))
334 | 
335 |         return found
336 | 
337 |     def iterkeys(self, unicode prefix=""):
338 |         """
339 |         Generator flavour of ``keys()``: yield the completed keys one
340 |         at a time instead of collecting them in a list.
341 |         """
342 |         cdef bytes b_prefix = prefix.encode('utf8')
343 |         cdef BaseType index = self.dct.root()
344 |         cdef Completer completer
345 | 
346 |         if not self.dct.Follow(b_prefix, &index):
347 |             return
348 | 
349 |         init_completer(completer, self.dct, self.guide)
350 |         completer.Start(index, b_prefix)
351 | 
352 |         while completer.Next():
353 |             yield (<char*>completer.key()).decode('utf8')
354 | 
355 |     def has_keys_with_prefix(self, unicode prefix):
356 |         """
357 |         Return True if the completer finds at least one key for
358 |         ``prefix`` and False otherwise.
359 |         """
360 |         cdef bytes b_prefix = prefix.encode('utf8')
361 |         cdef BaseType index = self.dct.root()
362 |         cdef Completer completer
363 | 
364 |         if not self.dct.Follow(b_prefix, &index):
365 |             return False
366 | 
367 |         init_completer(completer, self.dct, self.guide)
368 |         completer.Start(index, b_prefix)
369 | 
370 |         # A single successful step is enough to answer the question.
371 |         return completer.Next()
372 | 
373 |     cpdef bytes tobytes(self) except +:
374 |         """
375 |         Return raw DAWG content as bytes: the serialized dictionary
376 |         immediately followed by the serialized guide.
377 |         """
378 |         cdef stringstream buf
379 |         cdef bytes serialized
380 | 
381 |         self.dct.Write(<ostream *> &buf)
382 |         self.guide.Write(<ostream *> &buf)
383 |         serialized = buf.str()
384 |         return serialized
385 | 
386 |     cpdef frombytes(self, bytes data):
387 |         """
388 |         Load DAWG from bytes ``data`` and return ``self``.
389 | 
390 |         The dictionary is read first and the guide right after it, from
391 |         the same stream.  ``IOError`` is raised if either of them can't
392 |         be read; whatever was loaded is cleared before raising.
393 | 
394 |         FIXME: it seems there is memory leak here (DAWG uses 3x memory when
395 |         loaded using frombytes vs load).
396 |         """
397 |         cdef stringstream buf
398 |         cdef char* raw = data
399 | 
400 |         # Copy the bytes into a C++ stream and rewind it for reading.
401 |         buf.write(raw, len(data))
402 |         buf.seekg(0)
403 | 
404 |         if not self.dct.Read(<istream*> &buf):
405 |             self.dct.Clear()
406 |             raise IOError("Invalid data format: can't load _dawg.Dictionary")
407 | 
408 |         if not self.guide.Read(<istream*> &buf):
409 |             self.guide.Clear()
410 |             self.dct.Clear()
411 |             raise IOError("Invalid data format: can't load _dawg.Guide")
412 | 
413 |         return self
414 | 
415 |     def load(self, path):
416 |         """
417 |         Load DAWG from the file at ``path`` and return ``self``.
418 | 
419 |         A unicode ``path`` is encoded with the filesystem encoding first.
420 |         ``IOError`` is raised if the file can't be opened or if the
421 |         dictionary or the guide can't be read from it.
422 |         """
423 |         if isinstance(path, unicode):
424 |             path = path.encode(sys.getfilesystemencoding())
425 | 
426 |         cdef ifstream stream
427 |         stream.open(path, iostream.binary)
428 |         if stream.fail():
429 |             raise IOError("It's not possible to read file stream")
430 | 
431 |         try:
432 |             if not self.dct.Read(<istream*> &stream):
433 |                 self.dct.Clear()
434 |                 raise IOError("Invalid data format: can't load _dawg.Dictionary")
435 | 
436 |             if not self.guide.Read(<istream*> &stream):
437 |                 self.guide.Clear()
438 |                 self.dct.Clear()
439 |                 raise IOError("Invalid data format: can't load _dawg.Guide")
440 |         finally:
441 |             # The stream is closed whether or not loading succeeded.
442 |             stream.close()
443 | 
444 |         return self
445 | 
446 |     def _transitions(self):
447 |         """
448 |         Return a sorted list of ``(from_index, byte, to_index)`` tuples,
449 |         one for every distinct step made while each key produced by the
450 |         completer is followed again from the dictionary root.
451 |         """
452 |         cdef BaseType index, prev_index
453 |         cdef char* key
454 |         cdef Completer completer
455 | 
456 |         seen = set()
457 |         init_completer(completer, self.dct, self.guide)
458 |         completer.Start(self.dct.root())
459 | 
460 |         while completer.Next():
461 |             key = <char*>completer.key()
462 |             index = self.dct.root()
463 | 
464 |             for pos in range(completer.length()):
465 |                 prev_index = index
466 |                 self.dct.Follow(&(key[pos]), 1, &index)
467 |                 seen.add((prev_index, <unsigned char>key[pos], index))
468 | 
469 |         return sorted(seen)
439 | 
440 | 
441 | # The byte 0xff never occurs in utf8-encoded text, so it would be a safe
442 | # separator between a utf8-encoded key and its binary payload.
443 | # It has a drawback, however: the sort order of utf8-encoded keys changes.
444 | # 'foo' becomes greater than 'foox' because the strings are compared as
445 | # 'foo<sep>' and 'foox<sep>', and ord(<sep>) == 255 is greater than
446 | # ord(<any other character>).
447 | # DEF PAYLOAD_SEPARATOR = b'\xff'
448 | 
449 | # That's why chr(1) is used as the separator by default: it is the lowest
450 | # allowed character, so it preserves the alphabetical order of keys.
451 | # Strictly speaking, chr(1) is not a safe separator because it is a valid
452 | # UTF8 character. In practice this should not be an issue: such a control
453 | # character is very unlikely in text keys, and binary keys are not
454 | # supported anyway because dawgdic doesn't support keys containing
455 | # chr(0).
456 | cdef bytes PAYLOAD_SEPARATOR = b'\x01'
457 | 
458 | DEF MAX_VALUE_SIZE = 32768
459 | 
460 | cdef class BytesDAWG(CompletionDAWG):
461 |     """
462 |     DAWG that is able to transparently store extra binary payload in keys;
463 |     there may be several payloads for the same key.
464 | 
465 |     In other words, this class implements read-only DAWG-based
466 |     {unicode -> list of bytes objects} mapping.
467 | 
468 |     Every stored key has the form
469 |     ``<utf8-encoded key><payload separator><base64-encoded payload>``.
470 | 
471 |     NOTE(review): payloads are decoded into fixed buffers of
472 |     MAX_VALUE_SIZE bytes and nothing in this class checks the payload
473 |     size -- confirm that larger payloads are rejected elsewhere.
474 |     """
475 | 
476 |     cdef bytes _b_payload_separator
477 |     cdef CharType _c_payload_separator
478 |     cdef Completer* _completer
479 | 
480 |     def __init__(self, arg=None, input_is_sorted=False, bytes payload_separator=PAYLOAD_SEPARATOR):
481 |         """
482 |         ``arg`` must be an iterable of tuples (unicode_key, bytes_payload).
483 | 
484 |         ``payload_separator`` is the single byte placed between the
485 |         utf8-encoded key and the encoded payload.
486 |         """
487 |         if arg is None:
488 |             arg = []
489 | 
490 |         self._b_payload_separator = payload_separator
491 |         self._c_payload_separator = <unsigned int>ord(payload_separator)
492 | 
493 |         keys = (self._raw_key(d[0], d[1]) for d in arg)
494 |         super(BytesDAWG, self).__init__(keys, input_is_sorted)
495 | 
496 |         self._update_completer()
497 | 
498 |     def __dealloc__(self):
499 |         if self._completer:
500 |             del self._completer
501 | 
502 |     cpdef bytes _raw_key(self, unicode key, bytes payload):
503 |         """
504 |         Return the bytes actually stored for ``(key, payload)``: the
505 |         utf8-encoded key, the separator and the payload encoded with
506 |         ``b2a_base64``.
507 | 
508 |         Raises ``Error`` if the encoded key contains the separator.
509 |         """
510 |         cdef bytes b_key = <bytes>key.encode('utf8')
511 | 
512 |         if self._b_payload_separator in b_key:
513 |             raise Error("Payload separator (%r) is found within utf8-encoded key ('%s')" % (self._b_payload_separator, key))
514 | 
515 |         cdef bytes encoded_payload = b2a_base64(payload)
516 |         return b_key + self._b_payload_separator + encoded_payload
517 | 
518 |     cdef _update_completer(self):
519 |         # (Re)create the shared completer; it has to be rebuilt whenever
520 |         # the dictionary or the guide is replaced.
521 |         if self._completer:
522 |             del self._completer
523 |         self._completer = new Completer(self.dct, self.guide)
524 | 
525 |     cdef int _find_separator(self, char* raw_key, int length):
526 |         # Return the position of the first payload separator within the
527 |         # first ``length`` bytes of ``raw_key``, or ``length`` if there
528 |         # is none.
529 |         cdef int pos = 0
530 |         while pos < length and raw_key[pos] != self._c_payload_separator:
531 |             pos += 1
532 |         return pos
533 | 
534 |     def load(self, path):
535 |         """
536 |         Load DAWG from a file (see ``CompletionDAWG.load``) and refresh
537 |         the shared completer.
538 |         """
539 |         res = super(BytesDAWG, self).load(path)
540 |         self._update_completer()
541 |         return res
542 | 
543 |     cpdef frombytes(self, bytes data):
544 |         """
545 |         Load DAWG from bytes (see ``CompletionDAWG.frombytes``) and
546 |         refresh the shared completer.
547 |         """
548 |         res = super(BytesDAWG, self).frombytes(data)
549 |         self._update_completer()
550 |         return res
551 | 
552 |     cpdef bint b_has_key(self, bytes key) except -1:
553 |         """
554 |         Return True if the utf8-encoded ``key`` followed by the payload
555 |         separator can be followed in the dictionary.
556 |         """
557 |         cdef BaseType index
558 |         return self._follow_key(key, &index)
559 | 
560 |     def __getitem__(self, key):
561 |         res = self.get(key)
562 |         if res is None:
563 |             raise KeyError(key)
564 |         return res
565 | 
566 |     cpdef get(self, key, default=None):
567 |         """
568 |         Return a list of payloads (as byte objects) for a given key
569 |         or ``default`` if the key is not found.
570 | 
571 |         ``key`` may be unicode or utf8-encoded bytes.
572 |         """
573 |         if isinstance(key, unicode):
574 |             res = self.get_value(<unicode>key)
575 |         else:
576 |             res = self.b_get_value(key)
577 | 
578 |         if not res:
579 |             return default
580 |         return res
581 | 
582 |     cdef bint _follow_key(self, bytes key, BaseType* index):
583 |         # Follow ``key`` and then the separator starting from the root.
584 |         # On success ``index[0]`` is the position right after the
585 |         # separator, i.e. where the payloads of ``key`` start.
586 |         index[0] = self.dct.root()
587 |         if not self.dct.Follow(key, len(key), index):
588 |             return False
589 |         return self.dct.Follow(self._c_payload_separator, index)
590 | 
591 |     cpdef list get_value(self, unicode key):
592 |         """
593 |         Return the list of payloads for the unicode ``key``
594 |         (an empty list if the key is not found).
595 |         """
596 |         return self.b_get_value(<bytes>key.encode('utf8'))
597 | 
598 |     cdef list _value_for_index(self, BaseType index):
599 | 
600 |         # We want to use shared Completer instance because allocating
601 |         # a Completer makes this function (and thus __getitem__) 2x slower.
602 |         # This could be not thread-safe; GIL helps us, but we should be careful
603 |         # not to accidentally switch to another thread by interacting
604 |         # with Python interpreter in any way (switch happens
605 |         # between bytecode instructions).
606 | 
607 |         cdef int key_len
608 |         cdef b64_decode.decoder b64_decoder
609 |         cdef char[MAX_VALUE_SIZE] b64_decoder_storage
610 |         cdef vector[string] results
611 | 
612 |         # ``index`` already points past the separator, so everything the
613 |         # completer produces from here is an encoded payload.
614 |         self._completer.Start(index)
615 | 
616 |         while self._completer.Next():
617 |             b64_decoder.init()
618 |             key_len = b64_decoder.decode(
619 |                 self._completer.key(),
620 |                 self._completer.length(),
621 |                 b64_decoder_storage
622 |             )
623 |             results.push_back(string(b64_decoder_storage, key_len))
624 | 
625 |         return results
626 | 
627 |     cpdef list b_get_value(self, bytes key):
628 |         """
629 |         Return the list of payloads for the utf8-encoded ``key``
630 |         (an empty list if the key is not found).
631 |         """
632 |         cdef BaseType index
633 |         if not self._follow_key(key, &index):
634 |             return []
635 |         return self._value_for_index(index)
636 | 
637 |     cpdef list items(self, unicode prefix=""):
638 |         """
639 |         Return a list of ``(unicode_key, bytes_payload)`` tuples for the
640 |         stored keys completing ``prefix``; a key with several payloads
641 |         contributes one tuple per payload.
642 |         """
643 |         cdef bytes b_prefix = prefix.encode('utf8')
644 |         cdef bytes value
645 |         cdef list res = []
646 |         cdef char* raw_key
647 |         cdef int key_len, sep_pos, value_start
648 | 
649 |         cdef BaseType index = self.dct.root()
650 |         if not self.dct.Follow(b_prefix, &index):
651 |             return res
652 | 
653 |         cdef int _len
654 |         cdef b64_decode.decoder _b64_decoder
655 |         cdef char[MAX_VALUE_SIZE] _b64_decoder_storage
656 | 
657 |         cdef Completer completer
658 |         init_completer(completer, self.dct, self.guide)
659 |         completer.Start(index, b_prefix)
660 | 
661 |         while completer.Next():
662 |             raw_key = <char*>completer.key()
663 |             key_len = completer.length()
664 |             sep_pos = self._find_separator(raw_key, key_len)
665 | 
666 |             # The payload starts right after the separator: the separator
667 |             # byte itself must not be handed to the base64 decoder.
668 |             value_start = sep_pos + 1 if sep_pos < key_len else key_len
669 | 
670 |             _b64_decoder.init()
671 |             _len = _b64_decoder.decode(&(raw_key[value_start]), key_len - value_start, _b64_decoder_storage)
672 |             value = _b64_decoder_storage[:_len]
673 | 
674 |             u_key = raw_key[:sep_pos].decode('utf8')
675 |             res.append(
676 |                 (u_key, value)
677 |             )
678 | 
679 |         return res
680 | 
681 |     def iteritems(self, unicode prefix=""):
682 |         """
683 |         Generator flavour of ``items()``: yield ``(unicode_key,
684 |         bytes_payload)`` tuples one at a time.
685 |         """
686 |         cdef bytes b_prefix = prefix.encode('utf8')
687 |         cdef bytes value
688 |         cdef char* raw_key
689 |         cdef int key_len, sep_pos, value_start
690 | 
691 |         cdef BaseType index = self.dct.root()
692 |         if not self.dct.Follow(b_prefix, &index):
693 |             return
694 | 
695 |         cdef int _len
696 |         cdef b64_decode.decoder _b64_decoder
697 |         cdef char[MAX_VALUE_SIZE] _b64_decoder_storage
698 | 
699 |         cdef Completer completer
700 |         init_completer(completer, self.dct, self.guide)
701 |         completer.Start(index, b_prefix)
702 | 
703 |         while completer.Next():
704 |             raw_key = <char*>completer.key()
705 |             key_len = completer.length()
706 |             sep_pos = self._find_separator(raw_key, key_len)
707 | 
708 |             # Skip the separator byte; only the encoded payload is decoded.
709 |             value_start = sep_pos + 1 if sep_pos < key_len else key_len
710 | 
711 |             _b64_decoder.init()
712 |             _len = _b64_decoder.decode(&(raw_key[value_start]), key_len - value_start, _b64_decoder_storage)
713 |             value = _b64_decoder_storage[:_len]
714 | 
715 |             u_key = raw_key[:sep_pos].decode('utf8')
716 |             yield (u_key, value)
717 | 
718 |     cpdef list keys(self, unicode prefix=""):
719 |         """
720 |         Return a list of unicode keys (payloads stripped) for the stored
721 |         keys completing ``prefix``; a key is listed once per payload.
722 |         """
723 |         cdef bytes b_prefix = prefix.encode('utf8')
724 |         cdef int sep_pos
725 |         cdef list res = []
726 |         cdef char* raw_key
727 | 
728 |         cdef BaseType index = self.dct.root()
729 |         if not self.dct.Follow(b_prefix, &index):
730 |             return res
731 | 
732 |         cdef Completer completer
733 |         init_completer(completer, self.dct, self.guide)
734 |         completer.Start(index, b_prefix)
735 | 
736 |         while completer.Next():
737 |             raw_key = <char*>completer.key()
738 |             sep_pos = self._find_separator(raw_key, completer.length())
739 | 
740 |             u_key = raw_key[:sep_pos].decode('utf8')
741 |             res.append(u_key)
742 |         return res
743 | 
744 |     def iterkeys(self, unicode prefix=""):
745 |         """
746 |         Generator flavour of ``keys()``.
747 |         """
748 |         cdef bytes b_prefix = prefix.encode('utf8')
749 |         cdef int sep_pos
750 |         cdef char* raw_key
751 | 
752 |         cdef BaseType index = self.dct.root()
753 |         if not self.dct.Follow(b_prefix, &index):
754 |             return
755 | 
756 |         cdef Completer completer
757 |         init_completer(completer, self.dct, self.guide)
758 |         completer.Start(index, b_prefix)
759 | 
760 |         while completer.Next():
761 |             raw_key = <char*>completer.key()
762 |             sep_pos = self._find_separator(raw_key, completer.length())
763 | 
764 |             u_key = raw_key[:sep_pos].decode('utf8')
765 |             yield u_key
766 | 
767 |     cdef bint _has_value(self, BaseType index):
768 |         # In this class a key "has a value" when the payload separator can
769 |         # be followed from its position.  A copy of ``index`` is used so
770 |         # the probe has no effect on the caller.
771 |         cdef BaseType _index = index
772 |         return self.dct.Follow(self._c_payload_separator, &_index)
773 | 
774 |     cdef list _similar_items(self, unicode current_prefix, unicode key, BaseType cur_index, dict replace_chars):
775 |         # Recursive worker behind ``similar_items``: follows ``key`` from
776 |         # ``len(current_prefix)`` on, branching into a recursive call for
777 |         # every character that has a replacement in ``replace_chars``.
778 |         cdef BaseType next_index, index = cur_index
779 |         cdef unicode prefix, u_replace_char, found_key
780 |         cdef bytes b_step, b_replace_char
781 |         cdef list res = []
782 |         cdef list extra_items, value
783 | 
784 |         cdef int start_pos = len(current_prefix)
785 |         cdef int end_pos = len(key)
786 |         cdef int word_pos = start_pos
787 | 
788 |         while word_pos < end_pos:
789 |             b_step = <bytes>(key[word_pos].encode('utf8'))
790 | 
791 |             if b_step in replace_chars:
792 |                 next_index = index
793 |                 b_replace_char, u_replace_char = <tuple>replace_chars[b_step]
794 | 
795 |                 if self.dct.Follow(b_replace_char, &next_index):
796 |                     prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
797 |                     extra_items = self._similar_items(prefix, key, next_index, replace_chars)
798 |                     res.extend(extra_items)
799 | 
800 |             if not self.dct.Follow(b_step, &index):
801 |                 break
802 |             word_pos += 1
803 | 
804 |         else:
805 |             # The whole key was followed; if payloads start here, the key
806 |             # itself goes in front of the variants found so far.
807 |             if self.dct.Follow(self._c_payload_separator, &index):
808 |                 found_key = current_prefix + key[start_pos:]
809 |                 value = self._value_for_index(index)
810 |                 res.insert(0, (found_key, value))
811 | 
812 |         return res
813 | 
814 |     cpdef list similar_items(self, unicode key, dict replaces):
815 |         """
816 |         Return a list of (key, value) tuples for all variants of ``key``
817 |         in this DAWG according to ``replaces``.
818 | 
819 |         ``replaces`` is an object obtained from
820 |         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
821 |         that maps single-char unicode strings to other single-char
822 |         unicode strings.
823 |         """
824 |         return self._similar_items("", key, self.dct.root(), replaces)
825 | 
826 |     cdef list _similar_item_values(self, int start_pos, unicode key, BaseType cur_index, dict replace_chars):
827 |         # Same search as ``_similar_items`` but only values are collected,
828 |         # so the position in ``key`` is passed instead of the matched prefix.
829 |         cdef BaseType next_index, index = cur_index
830 |         cdef unicode u_replace_char
831 |         cdef bytes b_step, b_replace_char
832 |         cdef list res = []
833 |         cdef list extra_items, value
834 | 
835 |         cdef int end_pos = len(key)
836 |         cdef int word_pos = start_pos
837 | 
838 |         while word_pos < end_pos:
839 |             b_step = <bytes>(key[word_pos].encode('utf8'))
840 | 
841 |             if b_step in replace_chars:
842 |                 next_index = index
843 |                 b_replace_char, u_replace_char = <tuple>replace_chars[b_step]
844 | 
845 |                 if self.dct.Follow(b_replace_char, &next_index):
846 |                     extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
847 |                     res.extend(extra_items)
848 | 
849 |             if not self.dct.Follow(b_step, &index):
850 |                 break
851 |             word_pos += 1
852 | 
853 |         else:
854 |             if self.dct.Follow(self._c_payload_separator, &index):
855 |                 value = self._value_for_index(index)
856 |                 res.insert(0, value)
857 | 
858 |         return res
859 | 
860 |     cpdef list similar_item_values(self, unicode key, dict replaces):
861 |         """
862 |         Return a list of values for all variants of the ``key``
863 |         in this DAWG according to ``replaces``.
864 | 
865 |         ``replaces`` is an object obtained from
866 |         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
867 |         that maps single-char unicode strings to other single-char
868 |         unicode strings.
869 |         """
870 |         return self._similar_item_values(0, key, self.dct.root(), replaces)
804 | 
805 | 
806 | 
807 | cdef class RecordDAWG(BytesDAWG):
808 |     """
809 |     DAWG that is able to transparently store binary payload in keys;
810 |     there may be several payloads for the same key.
811 | 
812 |     The payload format must be defined at creation time using ``fmt``
813 |     constructor argument; it has the same meaning as ``fmt`` argument
814 |     for functions from ``struct`` module; take a look at
815 |     http://docs.python.org/library/struct.html#format-strings for the
816 |     specification.
817 | 
818 |     In other words, this class implements read-only DAWG-based
819 |     {unicode -> list of tuples} mapping where all tuples are of the
820 |     same structure and may be packed with the same format string.
821 |     """
822 |     cdef _struct
823 | 
824 |     def __init__(self, fmt, arg=None, input_is_sorted=False, bytes payload_separator=PAYLOAD_SEPARATOR):
825 |         """
826 |         ``arg`` must be an iterable of tuples (unicode_key, data_tuple).
827 |         Each data tuple is packed into bytes with the ``struct.Struct``
828 |         compiled from ``fmt`` before it is stored as a payload.
829 | 
830 |         Take a look at
831 |         http://docs.python.org/library/struct.html#format-strings for the
832 |         format string specification.
833 |         """
834 |         self._struct = struct.Struct(str(fmt))
835 |         pack = self._struct.pack
836 | 
837 |         records = [] if arg is None else arg
838 |         packed = ((item[0], pack(*item[1])) for item in records)
839 |         super(RecordDAWG, self).__init__(packed, input_is_sorted, payload_separator)
840 | 
841 |     cdef list _value_for_index(self, BaseType index):
842 |         # Fetch the raw payloads from BytesDAWG and unpack each of them
843 |         # into a tuple.
844 |         cdef list raw_values = BytesDAWG._value_for_index(self, index)
845 |         unpack = self._struct.unpack
846 |         return [unpack(raw) for raw in raw_values]
847 | 
848 |     cpdef list items(self, unicode prefix=""):
849 |         """
850 |         Same as ``BytesDAWG.items`` but every payload is unpacked
851 |         into a tuple.
852 |         """
853 |         cdef list raw_items = BytesDAWG.items(self, prefix)
854 |         unpack = self._struct.unpack
855 |         return [(key, unpack(raw)) for (key, raw) in raw_items]
856 | 
857 |     def iteritems(self, unicode prefix=""):
858 |         """
859 |         Same as ``BytesDAWG.iteritems`` but every payload is unpacked
860 |         into a tuple.
861 |         """
862 |         unpack = self._struct.unpack
863 |         for key, raw in BytesDAWG.iteritems(self, prefix):
864 |             yield (key, unpack(raw))
853 | 
854 | 
855 | def _iterable_from_argument(arg):
856 |     """
857 |     Normalize a constructor argument: ``None`` becomes an empty list,
858 |     a ``Mapping`` becomes a generator of ``(key, value)`` pairs and
859 |     anything else is returned unchanged.
860 |     """
861 |     if arg is None:
862 |         return []
863 | 
864 |     if not isinstance(arg, Mapping):
865 |         return arg
866 | 
867 |     return ((key, arg[key]) for key in arg)
863 | 
864 | DEF LOOKUP_ERROR = -1
865 | 
866 | cdef class IntDAWG(DAWG):
867 |     """
868 |     Dict-like class based on DAWG.
869 |     It can store integer values for unicode keys.
870 |     """
871 |     def __init__(self, arg=None, input_is_sorted=False):
872 |         """
873 |         ``arg`` must be an iterable of tuples (unicode_key, int_value)
874 |         or a dict {unicode_key: int_value}.
875 |         """
876 |         pairs = _iterable_from_argument(arg)
877 |         super(IntDAWG, self).__init__(pairs, input_is_sorted)
878 | 
879 |     def __getitem__(self, key):
880 |         # LOOKUP_ERROR doubles as the "missing" marker passed to get().
881 |         cdef int value = self.get(key, LOOKUP_ERROR)
882 |         if value != LOOKUP_ERROR:
883 |             return value
884 |         raise KeyError(key)
885 | 
886 |     cpdef get(self, key, default=None):
887 |         """
888 |         Return value for the given key or ``default`` if the key is not found.
889 | 
890 |         ``key`` may be unicode or utf8-encoded bytes.
891 |         """
892 |         cdef int value
893 | 
894 |         if not isinstance(key, unicode):
895 |             value = self.b_get_value(key)
896 |         else:
897 |             value = self.get_value(<unicode>key)
898 | 
899 |         return default if value == LOOKUP_ERROR else value
900 | 
901 |     cpdef int get_value(self, unicode key):
902 |         """
903 |         Return what the dictionary finds for the unicode ``key``
904 |         (the key is utf8-encoded first).
905 |         """
906 |         cdef bytes encoded = <bytes>key.encode('utf8')
907 |         return self.dct.Find(encoded)
908 | 
909 |     cpdef int b_get_value(self, bytes key):
910 |         """
911 |         Return what the dictionary finds for the utf8-encoded ``key``.
912 |         """
913 |         cdef int value = self.dct.Find(key)
914 |         return value
906 | 
907 | 
908 | # FIXME: code duplication.
909 | cdef class IntCompletionDAWG(CompletionDAWG):
910 |     """
911 |     Dict-like class based on DAWG.
912 |     It can store integer values for unicode keys and support key completion.
913 |     """
914 | 
915 |     def __init__(self, arg=None, input_is_sorted=False):
916 |         """
917 |         ``arg`` must be an iterable of tuples (unicode_key, int_value)
918 |         or a dict {unicode_key: int_value}.
919 |         """
920 |         pairs = _iterable_from_argument(arg)
921 |         super(IntCompletionDAWG, self).__init__(pairs, input_is_sorted)
922 | 
923 |     def __getitem__(self, key):
924 |         # LOOKUP_ERROR doubles as the "missing" marker passed to get().
925 |         cdef int value = self.get(key, LOOKUP_ERROR)
926 |         if value != LOOKUP_ERROR:
927 |             return value
928 |         raise KeyError(key)
929 | 
930 |     cpdef get(self, key, default=None):
931 |         """
932 |         Return value for the given key or ``default`` if the key is not found.
933 | 
934 |         ``key`` may be unicode or utf8-encoded bytes.
935 |         """
936 |         cdef int value
937 | 
938 |         if not isinstance(key, unicode):
939 |             value = self.b_get_value(key)
940 |         else:
941 |             value = self.get_value(<unicode>key)
942 | 
943 |         return default if value == LOOKUP_ERROR else value
944 | 
945 |     cpdef int get_value(self, unicode key):
946 |         """
947 |         Return what the dictionary finds for the unicode ``key``
948 |         (the key is utf8-encoded first).
949 |         """
950 |         cdef bytes encoded = <bytes>key.encode('utf8')
951 |         return self.dct.Find(encoded)
952 | 
953 |     cpdef int b_get_value(self, bytes key):
954 |         """
955 |         Return what the dictionary finds for the utf8-encoded ``key``.
956 |         """
957 |         cdef int value = self.dct.Find(key)
958 |         return value
959 | 
960 |     cpdef list items(self, unicode prefix=""):
961 |         """
962 |         Return a list of ``(unicode_key, int_value)`` tuples for the
963 |         keys produced by completing ``prefix``; the list is empty if
964 |         ``prefix`` can't be followed in the dictionary.
965 |         """
966 |         cdef list pairs = []
967 |         cdef bytes b_prefix = prefix.encode('utf8')
968 |         cdef BaseType index = self.dct.root()
969 |         cdef int value
970 |         cdef Completer completer
971 | 
972 |         if not self.dct.Follow(b_prefix, &index):
973 |             return pairs
974 | 
975 |         init_completer(completer, self.dct, self.guide)
976 |         completer.Start(index, b_prefix)
977 | 
978 |         while completer.Next():
979 |             u_key = (<char*>completer.key()).decode('utf8')
980 |             value = completer.value()
981 |             pairs.append((u_key, value))
982 | 
983 |         return pairs
984 | 
985 |     def iteritems(self, unicode prefix=""):
986 |         """
987 |         Generator flavour of ``items()``: yield ``(unicode_key,
988 |         int_value)`` tuples one at a time.
989 |         """
990 |         cdef bytes b_prefix = prefix.encode('utf8')
991 |         cdef BaseType index = self.dct.root()
992 |         cdef int value
993 |         cdef Completer completer
994 | 
995 |         if not self.dct.Follow(b_prefix, &index):
996 |             return
997 | 
998 |         init_completer(completer, self.dct, self.guide)
999 |         completer.Start(index, b_prefix)
1000 | 
1001 |         while completer.Next():
1002 |             u_key = (<char*>completer.key()).decode('utf8')
1003 |             value = completer.value()
1004 |             yield u_key, value
987 | 


--------------------------------------------------------------------------------
/src/iostream.pxd:
--------------------------------------------------------------------------------
 1 | from libcpp.string cimport string
 2 | from libcpp cimport bool
 3 | 
 4 | cdef extern from "<istream>" namespace "std" nogil:  # minimal declarations of the std stream base classes
 5 |     cdef cppclass istream:  # input stream; dawg.pyx casts other streams to istream* before reading
 6 |         istream() except +
 7 |         istream& read (char* s, int n) except +  # "except +": C++ exceptions are turned into Python ones
 8 | 
 9 |     cdef cppclass ostream:  # output stream; dawg.pyx casts stringstream to ostream* before writing
10 |         ostream() except +
11 |         ostream& write (char* s, int n) except +
12 | 
13 | cdef extern from "<fstream>" namespace "std" nogil:  # file input stream used by the load() methods
14 |     cdef cppclass ifstream:
15 |         ifstream() except +
16 |         ifstream(char* filename) except +  # constructors must carry the name of the class itself
17 |         ifstream(char* filename, int mode) except +
18 | 
19 |         bool fail() except +  # checked right after open() to detect a file that can't be read
20 | 
21 |         void open(char* filename) except +
22 |         void open(char* filename, int mode) except +  # mode: an open-mode flag such as iostream.binary
23 |         void close() except +
24 | 
25 |         ifstream& read (char* s, int n) except +
26 | 
27 | 
28 | cdef extern from "<sstream>" namespace "std":  # in-memory stream used by tobytes() / frombytes()
29 | 
30 |     cdef cppclass stringstream:
31 |         stringstream()
32 |         stringstream(string s)
33 |         stringstream(string s, int options)
34 |         string str ()  # contents of the stream as a std::string
35 |         stringstream& write (char* s, int n)  # append n bytes starting at s
36 |         stringstream& seekg (int pos)  # set the read position (frombytes() rewinds with seekg(0))
37 | 
38 | 
39 | cdef extern from "<sstream>" namespace "std::stringstream":  # open-mode flags, emitted as std::stringstream::<name>
40 | 
41 | #    int in   (left undeclared -- presumably because "in" is a Python keyword; TODO confirm)
42 |     int out
43 |     int binary  # passed to ifstream.open() by the load() methods in dawg.pyx
44 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import


--------------------------------------------------------------------------------
/tests/test_dawg.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import, unicode_literals
  3 | import pickle
  4 | import tempfile
  5 | from io import BytesIO
  6 | 
  7 | import pytest
  8 | import dawg
  9 | 
 10 | def test_contains():
 11 |     d = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3})
 12 | 
 13 |     assert 'foo' in d
 14 |     assert 'bar' in d
 15 |     assert 'foobar' in d
 16 |     assert 'fo' not in d
 17 |     assert 'x' not in d
 18 | 
 19 |     assert b'foo' in d
 20 |     assert b'x' not in d
 21 | 
 22 | 
 23 | class TestDAWG(object):
 24 | 
 25 |     def test_sorted_iterable(self):
 26 | 
 27 |         sorted_data = ['bar', 'foo', 'foobar']
 28 |         contents = "\n".join(sorted_data).encode('utf8')
 29 |         with tempfile.NamedTemporaryFile() as f:
 30 |             f.write(contents)
 31 |             f.seek(0)
 32 | 
 33 |             words = (line.strip() for line in f)
 34 |             d = dawg.DAWG(words, input_is_sorted=True)
 35 | 
 36 |         assert 'bar' in d
 37 |         assert 'foo' in d
 38 | 
 39 |     def test_no_segfaults_on_invalid_file(self):
 40 |         d = dawg.DAWG()
 41 |         fd, path = tempfile.mkstemp()
 42 |         with open(path, 'w') as f:
 43 |             f.write('foo')
 44 | 
 45 |         with pytest.raises(IOError) as e:
 46 |             d.load(path)
 47 |             assert 'Invalid' in e.args[0]
 48 | 
 49 |         with open(path, 'rb') as f:
 50 |             with pytest.raises(IOError) as e:
 51 |                 d.read(f)
 52 |                 assert 'Invalid' in e.args[0]
 53 | 
 54 |     def test_no_segfaults_after_wrong_stream(self):
 55 |         d = dawg.DAWG()
 56 |         wrong_path = tempfile.mktemp()  # file doesn't exists
 57 | 
 58 |         with pytest.raises(IOError):
 59 |             d.load(wrong_path)
 60 | 
 61 |         assert 'random-key' not in d # there is possible segfault
 62 | 
 63 |     def test_build_errors(self):
 64 |         with pytest.raises(dawg.Error):
 65 |             data = [b'foo\x00bar', b'bar']
 66 |             dawg.DAWG(data)
 67 | 
 68 |     def test_contains_with_null_bytes(self):
 69 |         d = dawg.DAWG(['foo'])
 70 |         assert b'foo' in d
 71 |         assert b'foo\x00bar' not in d
 72 | 
 73 |     def test_unicode_sorting(self):
 74 |         key1 = '\U00010345\U0001033f\U00010337\U00010330\U0001033d'
 75 |         key2 = '\uff72\uff9c\uff90\uff7b\uff9e\uff9c'
 76 | 
 77 |         # This apparently depends on Python version:
 78 |         # assert key1 < key2
 79 |         # assert key1.encode('utf8') > key2.encode('utf8')
 80 | 
 81 |         # Constructor should sort data according to utf8 values,
 82 |         # not according to unicode sorting rules. It will raise an exception
 83 |         # if data is sorted according to unicode rules.
 84 |         dawg.DAWG([key1, key2])
 85 | 
 86 | 
 87 | 
 88 | class TestIntDAWG(object):
 89 | 
 90 |     IntDAWG = dawg.IntDAWG
 91 | 
 92 |     def dawg(self):
 93 |         payload = {'foo': 1, 'bar': 5, 'foobar': 3}
 94 |         d = self.IntDAWG(payload)
 95 |         return payload, d
 96 | 
 97 |     def test_getitem(self):
 98 |         payload, d = self.dawg()
 99 |         for key in payload:
100 |             assert d[key] == payload[key]
101 | 
102 |         with pytest.raises(KeyError):
103 |             d['fo']
104 | 
105 | 
106 |     def test_dumps_loads(self):
107 |         payload, d = self.dawg()
108 |         data = d.tobytes()
109 | 
110 |         d2 = self.IntDAWG()
111 |         d2.frombytes(data)
112 |         for key, value in payload.items():
113 |             assert key in d2
114 |             assert d2[key] == value
115 | 
116 |     def test_dump_load(self):
117 |         payload, _ = self.dawg()
118 | 
119 |         buf = BytesIO()
120 |         self.IntDAWG(payload).write(buf)
121 |         buf.seek(0)
122 | 
123 |         d = self.IntDAWG()
124 |         d.read(buf)
125 | 
126 |         for key, value in payload.items():
127 |             assert key in d
128 |             assert d[key] == value
129 | 
130 |     def test_pickling(self):
131 |         payload, d = self.dawg()
132 | 
133 |         data = pickle.dumps(d)
134 |         d2 = pickle.loads(data)
135 | 
136 |         for key, value in payload.items():
137 |             assert key in d2
138 |             assert d[key] == value
139 | 
140 |     def test_int_value_ranges(self):
141 |         for val in [0, 5, 2**16-1, 2**31-1]:
142 |             d = self.IntDAWG({'f': val})
143 |             assert d['f'] == val
144 | 
145 |         with pytest.raises(ValueError):
146 |             self.IntDAWG({'f': -1})
147 | 
148 |         with pytest.raises(OverflowError):
149 |             self.IntDAWG({'f': 2**32-1})
150 | 
151 | 
152 | class TestIntCompletionDAWG(TestIntDAWG):
153 |     IntDAWG = dawg.IntCompletionDAWG  # re-runs every inherited TestIntDAWG test against IntCompletionDAWG
154 | 
155 | 
156 | class TestCompletionDAWG(object):
157 |     keys = ['f', 'bar', 'foo', 'foobar']
158 | 
159 |     def dawg(self):
160 |         return dawg.CompletionDAWG(self.keys)
161 | 
162 |     def empty_dawg(self):
163 |         return dawg.CompletionDAWG()
164 | 
165 |     def test_contains(self):
166 |         d = self.dawg()
167 |         for key in self.keys:
168 |             assert key in d
169 | 
170 |     def test_keys(self):
171 |         d = self.dawg()
172 |         assert d.keys() == sorted(self.keys)
173 | 
174 |     def test_iterkeys(self):
175 |         d = self.dawg()
176 |         assert list(d.iterkeys()) == sorted(self.keys)
177 |         assert list(d.iterkeys()) == d.keys()
178 | 
179 |     def test_prefixes(self):
180 |         d = self.dawg()
181 |         assert d.prefixes("foobarz") == ["f", "foo", "foobar"]
182 |         assert d.prefixes("x") == []
183 |         assert d.prefixes("bar") == ["bar"]
184 | 
185 |     def test_b_prefixes(self):
186 |         d = self.dawg()
187 |         assert d.b_prefixes(b"foobarz") == [b"f", b"foo", b"foobar"]
188 |         assert d.b_prefixes(b"x") == []
189 |         assert d.b_prefixes(b"bar") == [b"bar"]
190 | 
191 |     def test_iterprefixes(self):
192 |         d = self.dawg()
193 |         assert list(d.iterprefixes("foobarz")) == d.prefixes("foobarz")
194 |         assert list(d.iterprefixes("x")) == d.prefixes("x")
195 |         assert list(d.iterprefixes("bar")) == d.prefixes("bar")
196 | 
197 |     def test_completion(self):
198 |         d = self.dawg()
199 | 
200 |         assert d.keys('z') == []
201 |         assert d.keys('b') == ['bar']
202 |         assert d.keys('foo') == ['foo', 'foobar']
203 | 
204 |     def test_has_keys_with_prefix(self):
205 |         assert self.empty_dawg().has_keys_with_prefix('') == False
206 | 
207 |         d = self.dawg()
208 |         assert d.has_keys_with_prefix('') == True
209 |         assert d.has_keys_with_prefix('b') == True
210 |         assert d.has_keys_with_prefix('fo') == True
211 |         assert d.has_keys_with_prefix('bo') == False
212 | 
213 |     def test_completion_dawg_saveload(self):
214 |         buf = BytesIO()
215 |         self.dawg().write(buf)
216 |         buf.seek(0)
217 | 
218 |         d = self.empty_dawg()
219 |         d.read(buf)
220 | 
221 |         for key in self.keys:
222 |             assert key in d
223 | 
224 |         assert d.keys('foo') == ['foo', 'foobar']
225 |         assert d.keys('b') == ['bar']
226 |         assert d.keys('z') == []
227 | 
228 |     def test_no_segfaults_on_invalid_file(self):
229 |         d = self.dawg()
230 |         fd, path = tempfile.mkstemp()
231 |         with open(path, 'w') as f:
232 |             f.write('foo')
233 | 
234 |         with pytest.raises(IOError) as e:
235 |             d.load(path)
236 |             assert "can't load _dawg.Dictionary" in e.args[0]
237 | 
238 |     def test_no_segfaults_on_empty_dawg(self):
239 |         d = dawg.CompletionDAWG([])
240 |         assert d.keys() == []
241 | 
242 | 
243 | class TestIntCompletionDAWGComplete(TestCompletionDAWG):
244 |     keys = ['f', 'bar', 'foo', 'foobar']
245 | 
246 |     def dawg(self):
247 |         return dawg.IntCompletionDAWG((k, len(k)) for k in self.keys)
248 | 
249 |     def empty_dawg(self):
250 |         return dawg.IntCompletionDAWG()
251 | 
252 |     def test_no_segfaults_on_empty_dawg(self):
253 |         d = dawg.IntCompletionDAWG([])
254 |         assert d.keys() == []
255 | 
256 |     def test_items(self):
257 |         d = self.dawg()
258 |         items = d.items()
259 |         assert isinstance(items, list)
260 |         for key, value in items:
261 |             assert len(key) == value
262 | 
263 |     def test_iteritems(self):
264 |         d = self.dawg()
265 |         for key, value in d.iteritems():
266 |             assert len(key) == value
267 | 
268 |     def test_items_prefix(self):
269 |         d = self.dawg()
270 |         assert d.items('fo') == [('foo', 3), ('foobar', 6)]
271 | 


--------------------------------------------------------------------------------
/tests/test_payload_dawg.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import, unicode_literals
  3 | 
  4 | import pytest
  5 | import dawg
  6 | 
  7 | class TestBytesDAWG(object):
  8 | 
  9 |     DATA = (
 10 |         ('foo', b'data3'),
 11 |         ('bar', b'data2'),
 12 |         ('foo', b'data1'),
 13 |         ('foobar', b'data4')
 14 |     )
 15 | 
 16 |     DATA_KEYS = list(zip(*DATA))[0]
 17 | 
 18 |     def dawg(self, **kwargs):
 19 |         return dawg.BytesDAWG(self.DATA, **kwargs)
 20 | 
 21 |     def test_contains(self):
 22 |         d = self.dawg()
 23 |         for key, val in self.DATA:
 24 |             assert key in d
 25 | 
 26 |         assert 'food' not in d
 27 |         assert 'x' not in d
 28 |         assert 'fo' not in d
 29 | 
 30 | 
 31 |     def test_getitem(self):
 32 |         d = self.dawg()
 33 | 
 34 |         assert d['foo'] == [b'data1', b'data3']
 35 |         assert d['bar'] == [b'data2']
 36 |         assert d['foobar'] == [b'data4']
 37 | 
 38 |         with pytest.raises(KeyError):
 39 |             d['f']
 40 | 
 41 |         with pytest.raises(KeyError):
 42 |             d['food']
 43 | 
 44 |         with pytest.raises(KeyError):
 45 |             d['foobarz']
 46 | 
 47 |         with pytest.raises(KeyError):
 48 |             d['x']
 49 | 
 50 |     def test_prefixes(self):
 51 |         d = self.dawg()
 52 |         assert d.prefixes("foobarz") == ["foo", "foobar"]
 53 |         assert d.prefixes("x") == []
 54 |         assert d.prefixes("bar") == ["bar"]
 55 | 
 56 |     def test_keys(self):
 57 |         d = self.dawg()
 58 |         assert d.keys() == sorted(self.DATA_KEYS)
 59 | 
 60 |     def test_keys_ordering(self):
 61 |         data = [('foo', b'v1'), ('foobar', b'v2'), ('bar', b'v3')]
 62 | 
 63 |         d = dawg.BytesDAWG(data, payload_separator=b'\xff')
 64 |         assert d.keys() == ['bar', 'foobar', 'foo']
 65 | 
 66 |         d2 = dawg.BytesDAWG(data, payload_separator=b'\x01')
 67 |         assert d2.keys() == ['bar', 'foo', 'foobar']
 68 | 
 69 |     def test_iterkeys(self):
 70 |         d = self.dawg()
 71 |         assert list(d.iterkeys()) == d.keys()
 72 |         assert list(d.iterkeys()) == sorted(self.DATA_KEYS)
 73 | 
 74 |     def test_items(self):
 75 |         d = self.dawg()
 76 |         assert d.items() == sorted(self.DATA)
 77 | 
 78 |     def test_iteritems(self):
 79 |         d = self.dawg()
 80 |         assert list(d.iteritems()) == d.items()
 81 | 
 82 |     def test_build_error(self):
 83 |         with pytest.raises(dawg.Error):
 84 |             self.dawg(payload_separator=b'f')
 85 | 
 86 | 
 87 | 
 88 | class TestRecordDAWG(object):
 89 | 
 90 |     STRUCTURED_DATA = (
 91 |         ('foo',     (3, 2, 256)),
 92 |         ('bar',     (3, 1, 0)),
 93 |         ('foo',     (3, 2, 1)),
 94 |         ('foobar',  (6, 3, 0))
 95 |     )
 96 | 
 97 |     def dawg(self):
 98 |         return dawg.RecordDAWG(">3H", self.STRUCTURED_DATA)
 99 | 
100 |     def test_record_getitem(self):
101 |         d = self.dawg()
102 |         assert d['foo'] == [(3, 2, 1), (3, 2, 256)]
103 |         assert d['bar'] == [(3, 1, 0)]
104 |         assert d['foobar'] == [(6, 3, 0)]
105 | 
106 |     def test_record_items(self):
107 |         d = self.dawg()
108 |         assert d.items() == sorted(self.STRUCTURED_DATA)
109 | 
110 |     def test_record_keys(self):
111 |         d = self.dawg()
112 |         assert d.keys() == ['bar', 'foo', 'foo', 'foobar',]
113 | 
114 |     def test_record_iterkeys(self):
115 |         d = self.dawg()
116 |         assert list(d.iterkeys()) == d.keys()
117 | 
118 |     def test_record_iteritems(self):
119 |         d = self.dawg()
120 |         assert list(d.iteritems()) == d.items()
121 | 
122 |     def test_record_keys_prefix(self):
123 |         d = self.dawg()
124 |         assert d.keys('fo') == ['foo', 'foo', 'foobar']
125 |         assert d.keys('bar') == ['bar']
126 |         assert d.keys('barz') == []
127 | 
128 |     def test_prefixes(self):
129 |         d = self.dawg()
130 |         assert d.prefixes("foobarz") == ["foo", "foobar"]
131 |         assert d.prefixes("x") == []
132 |         assert d.prefixes("bar") == ["bar"]
133 | 


--------------------------------------------------------------------------------
/tests/test_prediction.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import, unicode_literals
 3 | import pytest
 4 | import dawg
 5 | 
 6 | class TestPrediction(object):  # similar_* lookups using a compiled replacement table
 7 |     DATA = ['ЁЖИК', 'ЁЖИКЕ', 'ЁЖ', 'ДЕРЕВНЯ', 'ДЕРЁВНЯ', 'ЕМ', 'ОЗЕРА', 'ОЗЁРА', 'ОЗЕРО']
 8 |     LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA)))  # (word, (len(word),)) pairs
 9 |     # Replacement table compiled from {'Е': 'Ё'}; passed to every similar_* call below.
10 |     REPLACES = dawg.DAWG.compile_replaces({'Е': 'Ё'})
11 |     # (query word, expected similar keys) pairs shared by the parametrized tests.
12 |     SUITE = [
13 |         ('УЖ', []),
14 |         ('ЕМ', ['ЕМ']),
15 |         ('ЁМ', []),
16 |         ('ЁЖ', ['ЁЖ']),
17 |         ('ЕЖ', ['ЁЖ']),
18 |         ('ЁЖИК', ['ЁЖИК']),
19 |         ('ЕЖИКЕ', ['ЁЖИКЕ']),
20 |         ('ДЕРЕВНЯ', ['ДЕРЕВНЯ', 'ДЕРЁВНЯ']),
21 |         ('ДЕРЁВНЯ', ['ДЕРЁВНЯ']),
22 |         ('ОЗЕРА', ['ОЗЕРА', 'ОЗЁРА']),
23 |         ('ОЗЕРО', ['ОЗЕРО']),
24 |     ]
25 |     # Same queries; each expected key is paired with its one-record value list.
26 |     SUITE_ITEMS = [
27 |         (
28 |             it[0], # key
29 |             [
30 |                 (w, [(len(w),)]) # item, value pair
31 |                 for w in it[1]
32 |             ]
33 |         )
34 |         for it in SUITE
35 |     ]
36 |     # Same queries; only the value lists of the expected keys are kept.
37 |     SUITE_VALUES = [
38 |         (
39 |             it[0], # key
40 |             [[(len(w),)] for w in it[1]]
41 |         )
42 |         for it in SUITE
43 |     ]
44 | 
45 |     # NOTE(review): str("=H") presumably keeps a native str format under unicode_literals -- confirm.
46 |     @pytest.mark.parametrize(("word", "prediction"), SUITE)
47 |     def test_dawg_prediction(self, word, prediction):
48 |         d = dawg.DAWG(self.DATA)
49 |         assert d.similar_keys(word, self.REPLACES) == prediction
50 | 
51 |     @pytest.mark.parametrize(("word", "prediction"), SUITE)
52 |     def test_record_dawg_prediction(self, word, prediction):
53 |         d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
54 |         assert d.similar_keys(word, self.REPLACES) == prediction
55 | 
56 |     @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS)
57 |     def test_record_dawg_items(self, word, prediction):
58 |         d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
59 |         assert d.similar_items(word, self.REPLACES) == prediction
60 | 
61 |     @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES)
62 |     def test_record_dawg_items_values(self, word, prediction):
63 |         d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
64 |         assert d.similar_item_values(word, self.REPLACES) == prediction
65 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27,py35,py35-locale,py36,py37,py38
 3 | # Shared settings: each environment installs the package, then runs py.test.
 4 | [testenv]
 5 | deps =
 6 |     pytest
 7 | commands=
 8 |     python setup.py install
 9 |     py.test []
10 | # Python 3.5 again, with LC_ALL=C set for the run.
11 | [testenv:py35-locale]
12 | basepython = python3.5
13 | setenv =
14 |     LC_ALL=C
15 | 


--------------------------------------------------------------------------------
/update_cpp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cython src/*.pyx src/*.pxd -a --cplus -2
3 | # Runs Cython over every .pyx/.pxd in src/ with C++ output (--cplus); -a and -2 presumably request annotation and Python 2 semantics -- confirm.


--------------------------------------------------------------------------------