├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bench.ini ├── bench ├── __init__.py ├── speed.py └── words100k.txt.zip ├── docs ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── lib ├── AUTHORS ├── COPYING ├── b64 │ ├── AUTHORS │ ├── LICENSE │ ├── cdecode.c │ ├── cdecode.h │ ├── cencode.c │ ├── cencode.h │ ├── decode.h │ └── encode.h └── dawgdic │ ├── base-types.h │ ├── base-unit.h │ ├── bit-pool.h │ ├── completer.h │ ├── dawg-builder.h │ ├── dawg-unit.h │ ├── dawg.h │ ├── dictionary-builder.h │ ├── dictionary-extra-unit.h │ ├── dictionary-unit.h │ ├── dictionary.h │ ├── guide-builder.h │ ├── guide-unit.h │ ├── guide.h │ ├── link-table.h │ ├── object-pool.h │ ├── ranked-completer-candidate.h │ ├── ranked-completer-node.h │ ├── ranked-completer.h │ ├── ranked-guide-builder.h │ ├── ranked-guide-link.h │ ├── ranked-guide-unit.h │ └── ranked-guide.h ├── setup.py ├── src ├── _base_types.cpp ├── _base_types.pxd ├── _completer.cpp ├── _completer.pxd ├── _dawg.cpp ├── _dawg.pxd ├── _dawg_builder.cpp ├── _dawg_builder.pxd ├── _dictionary.cpp ├── _dictionary.pxd ├── _dictionary_builder.cpp ├── _dictionary_builder.pxd ├── _dictionary_unit.cpp ├── _dictionary_unit.pxd ├── _guide.cpp ├── _guide.pxd ├── _guide_builder.cpp ├── _guide_builder.pxd ├── _guide_unit.cpp ├── _guide_unit.pxd ├── b64_decode.cpp ├── b64_decode.pxd ├── dawg.cpp ├── dawg.pyx ├── iostream.cpp └── iostream.pxd ├── tests ├── __init__.py ├── test_dawg.py ├── test_payload_dawg.py └── test_prediction.py ├── tox.ini └── update_cpp.sh /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | src/*.html 3 | 4 | *.py[cod] 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | parts 16 | bin 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | __pycache__ 24 | 25 | # Installer logs 26 | pip-log.txt 27 | 28 | # 
Unit test / coverage reports 29 | .coverage 30 | .tox 31 | nosetests.xml 32 | 33 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: 3.5 6 | env: TOXENV=py35 7 | - python: 3.5 8 | env: TOXENV=py35-locale 9 | - python: 3.6 10 | env: TOXENV=py36 11 | - python: 3.7 12 | env: TOXENV=py37 13 | - python: 3.8 14 | env: TOXENV=py38 15 | 16 | install: 17 | - pip install -U tox 18 | 19 | script: tox 20 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Authors & Contributors 2 | ---------------------- 3 | 4 | * Mikhail Korobov ; 5 | * Dan Blanchard; 6 | * Jakub Wilk; 7 | * Alex Moiseenko; 8 | * `Matt Hickford `_; 9 | * `Ikuya Yamada `_. 10 | 11 | This module uses `dawgdic`_ C++ library by 12 | Susumu Yata & contributors. 13 | 14 | base64 decoder is a modified version of libb64_ (original author 15 | is Chris Venter). 16 | 17 | .. _libb64: http://libb64.sourceforge.net/ 18 | .. _dawgdic: https://code.google.com/p/dawgdic/ 19 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | 2 | Changes 3 | ======= 4 | 5 | 0.8.0 (2020-02-19) 6 | ------------------ 7 | 8 | * Python 3.8 support is added 9 | * Python 3.2, 3.3 and 3.4 support is dropped 10 | * Extension is rebuilt with Cython 0.29.15 11 | 12 | 0.7.8 (2015-04-18) 13 | ------------------ 14 | 15 | * extra type annotations are added to make the code a bit faster; 16 | * mercurial mirror at bitbucket is dropped; 17 | * wrapper is rebuilt with Cython 0.22. 
18 | 19 | 0.7.7 (2014-11-19) 20 | ------------------ 21 | 22 | * ``DAWG.b_prefixes`` method for avoiding utf8 encoding/decoding 23 | (thanks Ikuya Yamada); 24 | * wrapper is rebuilt with Cython 0.21.1. 25 | 26 | 0.7.6 (2014-08-10) 27 | ------------------ 28 | 29 | * Wrapper is rebuilt with Cython 0.20.2 to fix some issues. 30 | 31 | 0.7.5 (2014-06-05) 32 | ------------------ 33 | 34 | * Switched to setuptools; 35 | * some wheels are uploaded to pypi. 36 | 37 | 0.7.4 (2014-05-29) 38 | ------------------ 39 | 40 | * Fixed a bug in DAWG building: input should be sorted according to its 41 | binary representation. 42 | 43 | 0.7.3 (2014-05-29) 44 | ------------------ 45 | 46 | * Wrapper is rebuilt with Cython 0.21dev; 47 | * Python 3.4 compatibility is verified. 48 | 49 | 0.7.2 (2013-10-03) 50 | ------------------ 51 | 52 | * ``has_keys_with_prefix(prefix)`` method (thanks 53 | `Matt Hickford `_) 54 | 55 | 0.7.1 (2013-05-25) 56 | ------------------ 57 | 58 | - Extension is rebuilt with Cython 0.19.1; 59 | - fixed segfault that happened on lookup from incorrectly loaded DAWG 60 | (thanks Alex Moiseenko). 61 | 62 | 0.7 (2013-04-05) 63 | ---------------- 64 | 65 | - IntCompletionDAWG 66 | 67 | 0.6.1 (2013-03-23) 68 | ------------------ 69 | 70 | - Installation issues in environments with LC_ALL=C are fixed; 71 | - PyPy is officially unsupported now (use DAWG-Python_ with PyPy). 72 | 73 | .. _DAWG-Python: https://github.com/pytries/DAWG-Python 74 | 75 | 0.6 (2013-03-22) 76 | ---------------- 77 | 78 | - many thread-safety bugs are fixed (at the cost of slowing library down). 79 | 80 | 0.5.5 (2013-02-19) 81 | ------------------ 82 | 83 | - fix installation under PyPy (note: DAWG is slow under PyPy 84 | and may have bugs). 85 | 86 | 0.5.4 (2013-02-14) 87 | ------------------ 88 | 89 | - small tweaks for docstrings; 90 | - the extension is rebuilt using Cython 0.18. 
91 | 92 | 0.5.3 (2013-01-03) 93 | ------------------ 94 | 95 | - small improvements to ``.compile_replaces`` method; 96 | - benchmarks for ``.similar_items`` method; 97 | - the extension is rebuilt with Cython pre-0.18; this made 98 | ``.prefixes`` and ``.iterprefixes`` methods faster 99 | (up to 6x in some cases). 100 | 101 | 0.5.2 (2013-01-02) 102 | ------------------ 103 | 104 | - tests are included in source distribution; 105 | - benchmark results in README was nonrepresentative because of my 106 | broken (slow) Python 3.2 install; 107 | - installation is fixed under Python 3.x with ``LC_ALL=C`` (thanks 108 | Jakub Wilk). 109 | 110 | 0.5.1 (2012-10-11) 111 | ------------------ 112 | 113 | - better error reporting while building DAWGs; 114 | - ``__contains__`` is fixed for keys with zero bytes; 115 | - ``dawg.Error`` exception class; 116 | - building of ``BytesDAWG`` and ``RecordDAWG`` fails instead of 117 | producing incorrect results if some of the keys has unsupported characters. 118 | 119 | 120 | 0.5 (2012-10-08) 121 | ---------------- 122 | 123 | The storage scheme of ``BytesDAWG`` and ``RecordDAWG`` is changed in 124 | this release in order to provide the alphabetical ordering of items. 125 | 126 | This is a backwards-incompatible release. In order to read ``BytesDAWG`` or 127 | ``RecordDAWG`` created with previous versions of DAWG use ``payload_separator`` 128 | constructor argument:: 129 | 130 | >>> BytesDAWG(payload_separator=b'\xff').load('old.dawg') 131 | 132 | 133 | 0.4.1 (2012-10-01) 134 | ------------------ 135 | 136 | - Segfaults with empty DAWGs are fixed by updating dawgdic to latest svn. 137 | 138 | 0.4 (2012-09-26) 139 | ---------------- 140 | 141 | - ``iterkeys``, ``iteritems`` and ``iterprefixes`` methods 142 | (thanks Dan Blanchard). 143 | 144 | 0.3.2 (2012-09-24) 145 | ------------------ 146 | 147 | - ``prefixes`` method for finding all prefixes of a given key. 
148 | 149 | 0.3.1 (2012-09-20) 150 | ------------------ 151 | 152 | - bundled dawgdic C++ library is updated to the latest version. 153 | 154 | 0.3 (2012-09-13) 155 | ---------------- 156 | 157 | - ``similar_keys``, ``similar_items`` and ``similar_item_values`` methods 158 | for more permissive lookups (they may be useful e.g. for umlaut handling); 159 | - ``load`` method returns self; 160 | - Python 3.3 support. 161 | 162 | 0.2 (2012-09-08) 163 | ---------------- 164 | 165 | Greatly improved memory usage for DAWGs loaded with ``load`` method. 166 | 167 | There is currently a bug somewhere in a wrapper so DAWGs loaded with 168 | ``read()`` method or unpickled DAWGs uses 3x-4x memory compared to DAWGs 169 | loaded with ``load()`` method. ``load()`` is fixed in this release but 170 | other methods are not. 171 | 172 | 0.1 (2012-09-08) 173 | ---------------- 174 | 175 | Initial release. 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Mikhail Korobov, 2012-2014 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR 15 | A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include AUTHORS.rst 3 | include CHANGES.rst 4 | include LICENSE 5 | include tox.ini 6 | include update_cpp.sh 7 | include lib/COPYING 8 | 9 | recursive-include docs *.rst *.py Makefile make.bat 10 | 11 | recursive-include src *.cpp *.pxd *.pyx 12 | recursive-include lib *.c *.h 13 | recursive-include tests *.py -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DAWG 2 | ==== 3 | 4 | .. image:: https://travis-ci.org/pytries/DAWG.png?branch=master 5 | :target: https://travis-ci.org/pytries/DAWG 6 | 7 | This package provides DAWG(DAFSA_)-based dictionary-like 8 | read-only objects for Python (2.x and 3.x). 9 | 10 | String data in a DAWG may take 200x less memory than in 11 | a standard Python dict and the raw lookup speed is comparable; 12 | it also provides fast advanced methods like prefix search. 13 | 14 | .. _DAFSA: https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton 15 | 16 | * Docs: https://dawg.readthedocs.org 17 | * Source code: https://github.com/pytries/DAWG 18 | * Issue tracker: https://github.com/pytries/DAWG/issues 19 | 20 | License 21 | ======= 22 | 23 | Wrapper code is licensed under MIT License. 24 | Bundled `dawgdic`_ C++ library is licensed under BSD license. 25 | Bundled libb64_ is Public Domain. 26 | 27 | .. _dawgdic: https://code.google.com/p/dawgdic/ 28 | .. 
_libb64: http://libb64.sourceforge.net/ 29 | -------------------------------------------------------------------------------- /bench.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py35,py36,py37 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | commands= 8 | python setup.py install 9 | python bench/speed.py 10 | -------------------------------------------------------------------------------- /bench/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import -------------------------------------------------------------------------------- /bench/speed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import absolute_import, unicode_literals, division 4 | import random 5 | import string 6 | import timeit 7 | import os 8 | import zipfile 9 | import struct 10 | #import pstats 11 | #import cProfile 12 | 13 | import dawg 14 | 15 | def words100k(): 16 | zip_name = os.path.join( 17 | os.path.abspath(os.path.dirname(__file__)), 18 | 'words100k.txt.zip' 19 | ) 20 | zf = zipfile.ZipFile(zip_name) 21 | txt = zf.open(zf.namelist()[0]).read().decode('utf8') 22 | return txt.splitlines() 23 | 24 | def random_words(num): 25 | russian = 'абвгдеёжзиклмнопрстуфхцчъыьэюя' 26 | alphabet = '%s%s' % (russian, string.ascii_letters) 27 | return [ 28 | "".join([random.choice(alphabet) for x in range(random.randint(1,15))]) 29 | for y in range(num) 30 | ] 31 | 32 | def truncated_words(words): 33 | return [word[:3] for word in words] 34 | 35 | def prefixes1k(words, prefix_len): 36 | words = [w for w in words if len(w) >= prefix_len] 37 | every_nth = int(len(words)/1000) 38 | _words = [w[:prefix_len] for w in words[::every_nth]] 39 | return _words[:1000] 40 | 41 | def leet_words(words, replaces): 42 | for key, value in 
replaces.items(): 43 | words = [w.replace(key, value) for w in words] 44 | return words 45 | 46 | 47 | WORDS100k = words100k() 48 | MIXED_WORDS100k = truncated_words(WORDS100k) 49 | NON_WORDS100k = random_words(100000) 50 | PREFIXES_3_1k = prefixes1k(WORDS100k, 3) 51 | PREFIXES_5_1k = prefixes1k(WORDS100k, 5) 52 | PREFIXES_8_1k = prefixes1k(WORDS100k, 8) 53 | PREFIXES_15_1k = prefixes1k(WORDS100k, 15) 54 | 55 | LEET_REPLACES = { 56 | 'o': '0', 57 | 'O': '0', 58 | 'u': '0', 59 | 'l': '1', 60 | 'i': '1', 61 | 'e': '3', 62 | 'E': '3', 63 | 'A': '4', 64 | 'a': '4', 65 | 'h': '4', 66 | 's': 'z', 67 | } 68 | LEET_50k = leet_words(WORDS100k[:50000], LEET_REPLACES) 69 | 70 | def format_result(key, value, text_width): 71 | key = key.ljust(text_width) 72 | print(" %s %s" % (key, value)) 73 | 74 | 75 | def bench(name, timer, descr='M ops/sec', op_count=0.1, repeats=3, runs=5, 76 | text_width=33): 77 | try: 78 | times = [] 79 | for x in range(runs): 80 | times.append(timer.timeit(repeats)) 81 | 82 | def op_time(time): 83 | return op_count*repeats / time 84 | 85 | val = "%0.3f%s" % (op_time(min(times)), descr) 86 | format_result(name, val, text_width) 87 | except (AttributeError, TypeError) as e: 88 | format_result(name, "not supported", text_width) 89 | 90 | def create_dawg(): 91 | words = words100k() 92 | return dawg.DAWG(words) 93 | 94 | def create_bytes_dawg(): 95 | words = words100k() 96 | values = [struct.pack(str('' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or 
PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DAWG.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DAWG.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/DAWG" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/DAWG" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 
125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # DAWG documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Mar 23 00:33:42 2013. 6 | # 7 | # This file is execfile()d with the current directory set to its containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | import sys, os 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | #sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | #needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = [] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | #source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 44 | project = 'DAWG' 45 | copyright = '2015, Mikhail Korobov' 46 | 47 | # The version info for the project you're documenting, acts as replacement for 48 | # |version| and |release|, also used in various other places throughout the 49 | # built documents. 50 | # 51 | # The short X.Y version. 52 | version = '0.6' 53 | # The full version, including alpha/beta/rc tags. 54 | release = '0.6' 55 | 56 | # The language for content autogenerated by Sphinx. Refer to documentation 57 | # for a list of supported languages. 58 | #language = None 59 | 60 | # There are two options for replacing |today|: either, you set today to some 61 | # non-false value, then it is used: 62 | #today = '' 63 | # Else, today_fmt is used as the format for a strftime call. 
64 | #today_fmt = '%B %d, %Y' 65 | 66 | # List of patterns, relative to source directory, that match files and 67 | # directories to ignore when looking for source files. 68 | exclude_patterns = ['_build'] 69 | 70 | # The reST default role (used for this markup: `text`) to use for all documents. 71 | #default_role = None 72 | 73 | # If true, '()' will be appended to :func: etc. cross-reference text. 74 | #add_function_parentheses = True 75 | 76 | # If true, the current module name will be prepended to all description 77 | # unit titles (such as .. function::). 78 | #add_module_names = True 79 | 80 | # If true, sectionauthor and moduleauthor directives will be shown in the 81 | # output. They are ignored by default. 82 | #show_authors = False 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = 'sphinx' 86 | 87 | # A list of ignored prefixes for module index sorting. 88 | #modindex_common_prefix = [] 89 | 90 | 91 | # -- Options for HTML output --------------------------------------------------- 92 | 93 | # The theme to use for HTML and HTML Help pages. See the documentation for 94 | # a list of builtin themes. 95 | html_theme = 'default' 96 | 97 | # Theme options are theme-specific and customize the look and feel of a theme 98 | # further. For a list of options available for each theme, see the 99 | # documentation. 100 | #html_theme_options = {} 101 | 102 | # Add any paths that contain custom themes here, relative to this directory. 103 | #html_theme_path = [] 104 | 105 | # The name for this set of Sphinx documents. If None, it defaults to 106 | # " v documentation". 107 | #html_title = None 108 | 109 | # A shorter title for the navigation bar. Default is the same as html_title. 110 | #html_short_title = None 111 | 112 | # The name of an image file (relative to this directory) to place at the top 113 | # of the sidebar. 
114 | #html_logo = None 115 | 116 | # The name of an image file (within the static path) to use as favicon of the 117 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 118 | # pixels large. 119 | #html_favicon = None 120 | 121 | # Add any paths that contain custom static files (such as style sheets) here, 122 | # relative to this directory. They are copied after the builtin static files, 123 | # so a file named "default.css" will overwrite the builtin "default.css". 124 | html_static_path = ['_static'] 125 | 126 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 127 | # using the given strftime format. 128 | #html_last_updated_fmt = '%b %d, %Y' 129 | 130 | # If true, SmartyPants will be used to convert quotes and dashes to 131 | # typographically correct entities. 132 | #html_use_smartypants = True 133 | 134 | # Custom sidebar templates, maps document names to template names. 135 | #html_sidebars = {} 136 | 137 | # Additional templates that should be rendered to pages, maps page names to 138 | # template names. 139 | #html_additional_pages = {} 140 | 141 | # If false, no module index is generated. 142 | #html_domain_indices = True 143 | 144 | # If false, no index is generated. 145 | #html_use_index = True 146 | 147 | # If true, the index is split into individual pages for each letter. 148 | #html_split_index = False 149 | 150 | # If true, links to the reST sources are added to the pages. 151 | #html_show_sourcelink = True 152 | 153 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 154 | #html_show_sphinx = True 155 | 156 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 157 | #html_show_copyright = True 158 | 159 | # If true, an OpenSearch description file will be output, and all pages will 160 | # contain a tag referring to it. The value of this option must be the 161 | # base URL from which the finished HTML is served. 
162 | #html_use_opensearch = '' 163 | 164 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 165 | #html_file_suffix = None 166 | 167 | # Output file base name for HTML help builder. 168 | htmlhelp_basename = 'DAWGdoc' 169 | 170 | 171 | # -- Options for LaTeX output -------------------------------------------------- 172 | 173 | latex_elements = { 174 | # The paper size ('letterpaper' or 'a4paper'). 175 | #'papersize': 'letterpaper', 176 | 177 | # The font size ('10pt', '11pt' or '12pt'). 178 | #'pointsize': '10pt', 179 | 180 | # Additional stuff for the LaTeX preamble. 181 | #'preamble': '', 182 | } 183 | 184 | # Grouping the document tree into LaTeX files. List of tuples 185 | # (source start file, target name, title, author, documentclass [howto/manual]). 186 | latex_documents = [ 187 | ('index', 'DAWG.tex', 'DAWG Documentation', 188 | 'Mikhail Korobov', 'manual'), 189 | ] 190 | 191 | # The name of an image file (relative to this directory) to place at the top of 192 | # the title page. 193 | #latex_logo = None 194 | 195 | # For "manual" documents, if this is true, then toplevel headings are parts, 196 | # not chapters. 197 | #latex_use_parts = False 198 | 199 | # If true, show page references after internal links. 200 | #latex_show_pagerefs = False 201 | 202 | # If true, show URL addresses after external links. 203 | #latex_show_urls = False 204 | 205 | # Documents to append as an appendix to all manuals. 206 | #latex_appendices = [] 207 | 208 | # If false, no module index is generated. 209 | #latex_domain_indices = True 210 | 211 | 212 | # -- Options for manual page output -------------------------------------------- 213 | 214 | # One entry per manual page. List of tuples 215 | # (source start file, name, description, authors, manual section). 216 | man_pages = [ 217 | ('index', 'dawg', 'DAWG Documentation', 218 | ['Mikhail Korobov'], 1) 219 | ] 220 | 221 | # If true, show URL addresses after external links. 
222 | #man_show_urls = False 223 | 224 | 225 | # -- Options for Texinfo output ------------------------------------------------ 226 | 227 | # Grouping the document tree into Texinfo files. List of tuples 228 | # (source start file, target name, title, author, 229 | # dir menu entry, description, category) 230 | texinfo_documents = [ 231 | ('index', 'DAWG', 'DAWG Documentation', 232 | 'Mikhail Korobov', 'DAWG', 'One line description of project.', 233 | 'Miscellaneous'), 234 | ] 235 | 236 | # Documents to append as an appendix to all manuals. 237 | #texinfo_appendices = [] 238 | 239 | # If false, no module index is generated. 240 | #texinfo_domain_indices = True 241 | 242 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 243 | #texinfo_show_urls = 'footnote' 244 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | DAWG documentation 3 | ================== 4 | 5 | This package provides DAWG(DAFSA_)-based dictionary-like 6 | read-only objects for Python (2.x and 3.x). 7 | 8 | String data in a DAWG may take 200x less memory than in 9 | a standard Python dict and the raw lookup speed is comparable; 10 | it also provides fast advanced methods like prefix search. 11 | 12 | Based on `dawgdic`_ C++ library. 13 | 14 | .. _dawgdic: https://code.google.com/p/dawgdic/ 15 | .. _libb64: http://libb64.sourceforge.net/ 16 | .. _DAFSA: https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton 17 | 18 | License 19 | ======= 20 | 21 | Wrapper code is licensed under MIT License. 22 | Bundled `dawgdic`_ C++ library is licensed under BSD license. 23 | Bundled libb64_ is Public Domain. 
24 | 25 | Installation 26 | ============ 27 | 28 | From `PyPI <https://pypi.org/project/DAWG/>`_:: 29 | 30 | pip install DAWG 31 | 32 | Usage 33 | ===== 34 | 35 | There are several DAWG classes in this package: 36 | 37 | * ``dawg.DAWG`` - basic DAWG wrapper; it can store unicode keys 38 | and do exact lookups; 39 | 40 | * ``dawg.CompletionDAWG`` - ``dawg.DAWG`` subclass that supports 41 | key completion and prefix lookups (but requires more memory); 42 | 43 | * ``dawg.BytesDAWG`` - ``dawg.CompletionDAWG`` subclass that 44 | maps unicode keys to lists of ``bytes`` objects. 45 | 46 | * ``dawg.RecordDAWG`` - ``dawg.BytesDAWG`` subclass that 47 | maps unicode keys to lists of data tuples. 48 | All tuples must be of the same format (the data is packed 49 | using python ``struct`` module). 50 | 51 | * ``dawg.IntDAWG`` - ``dawg.DAWG`` subclass that maps unicode keys 52 | to integer values. 53 | 54 | * ``dawg.IntCompletionDAWG`` - ``dawg.CompletionDAWG`` subclass 55 | that maps unicode keys to integer values. 56 | 57 | DAWG and CompletionDAWG 58 | ----------------------- 59 | 60 | ``DAWG`` and ``CompletionDAWG`` are useful when you need 61 | fast & memory efficient simple string storage. These classes 62 | do not support assigning values to keys. 
63 | 64 | ``DAWG`` and ``CompletionDAWG`` constructors accept an iterable with keys:: 65 | 66 | >>> import dawg 67 | >>> words = [u'foo', u'bar', u'foobar', u'foö', u'bör'] 68 | >>> base_dawg = dawg.DAWG(words) 69 | >>> completion_dawg = dawg.CompletionDAWG(words) 70 | 71 | It is then possible to check if the key is in a DAWG:: 72 | 73 | >>> u'foo' in base_dawg 74 | True 75 | >>> u'baz' in completion_dawg 76 | False 77 | 78 | It is possible to find all keys that start with a given 79 | prefix in a ``CompletionDAWG``:: 80 | 81 | >>> completion_dawg.keys(u'foo') 82 | [u'foo', u'foobar'] 83 | 84 | to test whether some key begins with a given prefix:: 85 | 86 | >>> completion_dawg.has_keys_with_prefix(u'foo') 87 | True 88 | 89 | and to find all prefixes of a given key:: 90 | 91 | >>> base_dawg.prefixes(u'foobarz') 92 | [u'foo', u'foobar'] 93 | 94 | Iterator versions are also available:: 95 | 96 | >>> for key in completion_dawg.iterkeys(u'foo'): 97 | ... print(key) 98 | foo 99 | foobar 100 | >>> for prefix in base_dawg.iterprefixes(u'foobarz'): 101 | ... print(prefix) 102 | foo 103 | foobar 104 | 105 | It is possible to find all keys similar to a given key (using a one-way 106 | char translation table):: 107 | 108 | >>> replaces = dawg.DAWG.compile_replaces({u'o': u'ö'}) 109 | >>> base_dawg.similar_keys(u'foo', replaces) 110 | [u'foo', u'foö'] 111 | >>> base_dawg.similar_keys(u'foö', replaces) 112 | [u'foö'] 113 | >>> base_dawg.similar_keys(u'bor', replaces) 114 | [u'bör'] 115 | 116 | BytesDAWG 117 | --------- 118 | 119 | ``BytesDAWG`` is a ``CompletionDAWG`` subclass that can store 120 | binary data for each key. 
121 | 122 | ``BytesDAWG`` constructor accepts an iterable with 123 | ``(unicode_key, bytes_value)`` tuples:: 124 | 125 | >>> data = [(u'key1', b'value1'), (u'key2', b'value2'), (u'key1', b'value3')] 126 | >>> bytes_dawg = dawg.BytesDAWG(data) 127 | 128 | There can be duplicate keys; all unique values are stored in this case:: 129 | 130 | >>> bytes_dawg[u'key1'] 131 | [b'value1', b'value3'] 132 | 133 | For unique keys a list with a single value is returned for consistency:: 134 | 135 | >>> bytes_dawg[u'key2'] 136 | [b'value2'] 137 | 138 | ``KeyError`` is raised for missing keys; use ``get`` method if you need 139 | a default value instead:: 140 | 141 | >>> bytes_dawg.get(u'foo', None) 142 | None 143 | 144 | ``BytesDAWG`` supports ``keys``, ``items``, ``iterkeys`` and ``iteritems`` 145 | methods (they all accept optional key prefix). There is also support for 146 | ``similar_keys``, ``similar_items`` and ``similar_item_values`` methods. 147 | 148 | RecordDAWG 149 | ---------- 150 | 151 | ``RecordDAWG`` is a ``BytesDAWG`` subclass that automatically 152 | packs & unpacks the binary data from/to Python objects 153 | using ``struct`` module from the standard library. 154 | 155 | First, you have to define a format of the data. Consult Python docs 156 | (http://docs.python.org/library/struct.html#format-strings) for the format 157 | string specification. 158 | 159 | For example, let's store 3 short unsigned numbers (in a Big-Endian byte order) 160 | as values:: 161 | 162 | >>> format = ">HHH" 163 | 164 | ``RecordDAWG`` constructor accepts an iterable with 165 | ``(unicode_key, value_tuple)``.
Let's create such an iterable 166 | using ``zip`` function:: 167 | 168 | >>> keys = [u'foo', u'bar', u'foobar', u'foo'] 169 | >>> values = [(1, 2, 3), (2, 1, 0), (3, 3, 3), (2, 1, 5)] 170 | >>> data = zip(keys, values) 171 | >>> record_dawg = RecordDAWG(format, data) 172 | 173 | As with ``BytesDAWG``, there can be several values for the same key:: 174 | 175 | >>> record_dawg['foo'] 176 | [(1, 2, 3), (2, 1, 5)] 177 | >>> record_dawg['foobar'] 178 | [(3, 3, 3)] 179 | 180 | 181 | BytesDAWG and RecordDAWG implementation details 182 | ----------------------------------------------- 183 | 184 | ``BytesDAWG`` and ``RecordDAWG`` store data at the end of the keys:: 185 | 186 | <utf8-encoded key><separator><base64-encoded data> 187 | 188 | Data is encoded to base64 because dawgdic_ C++ library doesn't allow 189 | zero bytes in keys (it uses null-terminated strings) and such keys are 190 | very likely in binary data. 191 | 192 | In DAWG versions prior to 0.5 ``<separator>`` was ``chr(255)`` byte. 193 | It was chosen because keys are stored as UTF8-encoded strings and 194 | ``chr(255)`` is guaranteed not to appear in valid UTF8, so the end of 195 | text part of the key is not ambiguous. 196 | 197 | But ``chr(255)`` was proven to be problematic: it changes the order 198 | of the keys. Keys are naturally returned in lexicographical order by DAWG. 199 | But if ``chr(255)`` appears at the end of each text part of a key then the 200 | visible order would change. Imagine ``'foo'`` key with some payload 201 | and ``'foobar'`` key with some payload. ``'foo'`` key would be greater 202 | than ``'foobar'`` key: values compared would be ``'foo<separator>'`` and ``'foobar'`` 203 | and ``ord(<separator>)==255`` is greater than ``ord(<any other character>)``. 204 | 205 | So now the default ``<separator>`` is chr(1). This is the lowest allowed 206 | character and so it preserves the alphabetical order. 207 | 208 | It is not strictly correct to use chr(1) as a separator because chr(1) 209 | is a valid UTF8 character.
But I think in practice this won't be an issue: 210 | such control character is very unlikely in text keys, and binary keys 211 | are not supported anyway because dawgdic_ doesn't support keys containing 212 | chr(0). 213 | 214 | If you can't guarantee chr(1) is not a part of keys, lexicographical order 215 | is not important to you or there is a need to read 216 | a ``BytesDAWG``/``RecordDAWG`` created by DAWG < 0.5 then pass 217 | ``payload_separator`` argument to the constructor:: 218 | 219 | >>> BytesDAWG(payload_separator=b'\xff').load('old.dawg') 220 | 221 | The storage scheme has one more implication: values of ``BytesDAWG`` 222 | and ``RecordDAWG`` are also sorted lexicographically. 223 | 224 | For ``RecordDAWG`` there is a gotcha: in order to have meaningful 225 | ordering of numeric values store them in big-endian format:: 226 | 227 | >>> data = [('foo', (3, 2, 256)), ('foo', (3, 2, 1)), ('foo', (3, 2, 3))] 228 | >>> d = RecordDAWG("3H", data) 229 | >>> d.items() 230 | [(u'foo', (3, 2, 256)), (u'foo', (3, 2, 1)), (u'foo', (3, 2, 3))] 231 | 232 | >>> d2 = RecordDAWG(">3H", data) 233 | >>> d2.items() 234 | [(u'foo', (3, 2, 1)), (u'foo', (3, 2, 3)), (u'foo', (3, 2, 256))] 235 | 236 | IntDAWG and IntCompletionDAWG 237 | ----------------------------- 238 | 239 | ``IntDAWG`` is a ``{unicode -> int}`` mapping. It is possible to 240 | use ``RecordDAWG`` for this, but ``IntDAWG`` is natively 241 | supported by dawgdic_ C++ library and so ``__getitem__`` is much faster. 242 | 243 | Unlike ``BytesDAWG`` and ``RecordDAWG``, ``IntDAWG`` doesn't support 244 | having several values for the same key. 
245 | 246 | ``IntDAWG`` constructor accepts an iterable with (unicode_key, integer_value) 247 | tuples:: 248 | 249 | >>> data = [ (u'foo', 1), (u'bar', 2) ] 250 | >>> int_dawg = dawg.IntDAWG(data) 251 | 252 | It is then possible to get a value from the IntDAWG:: 253 | 254 | >>> int_dawg[u'foo'] 255 | 1 256 | 257 | ``IntCompletionDAWG`` supports all ``IntDAWG`` and ``CompletionDAWG`` methods, 258 | plus ``.items()`` and ``.iteritems()``. 259 | 260 | Persistence 261 | ----------- 262 | 263 | All DAWGs support saving/loading and pickling/unpickling. 264 | 265 | Write DAWG to a stream:: 266 | 267 | >>> with open('words.dawg', 'wb') as f: 268 | ... d.write(f) 269 | 270 | Save DAWG to a file:: 271 | 272 | >>> d.save('words.dawg') 273 | 274 | Load DAWG from a file:: 275 | 276 | >>> d = dawg.DAWG() 277 | >>> d.load('words.dawg') 278 | 279 | .. warning:: 280 | 281 | Reading DAWGs from streams and unpickling are currently using 3x memory 282 | compared to loading DAWGs using ``load`` method; please avoid them until 283 | the issue is fixed. 284 | 285 | Read DAWG from a stream:: 286 | 287 | >>> d = dawg.RecordDAWG(format_string) 288 | >>> with open('words.record-dawg', 'rb') as f: 289 | ... d.read(f) 290 | 291 | DAWG objects are picklable:: 292 | 293 | >>> import pickle 294 | >>> data = pickle.dumps(d) 295 | >>> d2 = pickle.loads(data) 296 | 297 | Benchmarks 298 | ========== 299 | 300 | For a list of 3000000 (3 million) Russian words memory consumption 301 | with different data structures (under Python 2.7): 302 | 303 | * dict(unicode words -> word lengths): about 600M 304 | * list(unicode words) : about 300M 305 | * ``marisa_trie.RecordTrie`` : 11M 306 | * ``marisa_trie.Trie``: 7M 307 | * ``dawg.DAWG``: 2M 308 | * ``dawg.CompletionDAWG``: 3M 309 | * ``dawg.IntDAWG``: 2.7M 310 | * ``dawg.RecordDAWG``: 4M 311 | 312 | 313 | .. 
note:: 314 | 315 | Lengths of words were not stored as values in ``dawg.DAWG``, 316 | ``dawg.CompletionDAWG`` and ``marisa_trie.Trie`` because they don't 317 | support this. 318 | 319 | .. note:: 320 | 321 | `marisa-trie`_ is often more memory efficient than 322 | DAWG (depending on data); it can also handle larger datasets 323 | and provides memory-mapped IO, so don't dismiss `marisa-trie`_ 324 | based on this README file. It is still several times slower than 325 | DAWG though. 326 | 327 | .. _marisa-trie: https://github.com/pytries/marisa-trie 328 | 329 | Benchmark results (100k unicode words, integer values (lengths of the words), 330 | Python 3.3, macbook air i5 1.8 Ghz):: 331 | 332 | dict __getitem__ (hits) 7.300M ops/sec 333 | DAWG __getitem__ (hits) not supported 334 | BytesDAWG __getitem__ (hits) 1.230M ops/sec 335 | RecordDAWG __getitem__ (hits) 0.792M ops/sec 336 | IntDAWG __getitem__ (hits) 4.217M ops/sec 337 | dict get() (hits) 3.775M ops/sec 338 | DAWG get() (hits) not supported 339 | BytesDAWG get() (hits) 1.027M ops/sec 340 | RecordDAWG get() (hits) 0.733M ops/sec 341 | IntDAWG get() (hits) 3.162M ops/sec 342 | dict get() (misses) 4.533M ops/sec 343 | DAWG get() (misses) not supported 344 | BytesDAWG get() (misses) 3.545M ops/sec 345 | RecordDAWG get() (misses) 3.485M ops/sec 346 | IntDAWG get() (misses) 3.928M ops/sec 347 | 348 | dict __contains__ (hits) 7.090M ops/sec 349 | DAWG __contains__ (hits) 4.685M ops/sec 350 | BytesDAWG __contains__ (hits) 3.885M ops/sec 351 | RecordDAWG __contains__ (hits) 3.898M ops/sec 352 | IntDAWG __contains__ (hits) 4.612M ops/sec 353 | 354 | dict __contains__ (misses) 5.617M ops/sec 355 | DAWG __contains__ (misses) 6.204M ops/sec 356 | BytesDAWG __contains__ (misses) 6.026M ops/sec 357 | RecordDAWG __contains__ (misses) 6.007M ops/sec 358 | IntDAWG __contains__ (misses) 6.180M ops/sec 359 | 360 | DAWG.similar_keys (no replaces) 0.492M ops/sec 361 | DAWG.similar_keys (l33t) 0.413M ops/sec 362 | 363 | dict items() 
55.032 ops/sec 364 | DAWG items() not supported 365 | BytesDAWG items() 14.826 ops/sec 366 | RecordDAWG items() 9.436 ops/sec 367 | IntDAWG items() not supported 368 | 369 | dict keys() 200.788 ops/sec 370 | DAWG keys() not supported 371 | BytesDAWG keys() 20.657 ops/sec 372 | RecordDAWG keys() 20.873 ops/sec 373 | IntDAWG keys() not supported 374 | 375 | DAWG.prefixes (hits) 1.552M ops/sec 376 | DAWG.prefixes (mixed) 4.342M ops/sec 377 | DAWG.prefixes (misses) 4.094M ops/sec 378 | DAWG.iterprefixes (hits) 0.391M ops/sec 379 | DAWG.iterprefixes (mixed) 0.476M ops/sec 380 | DAWG.iterprefixes (misses) 0.498M ops/sec 381 | 382 | RecordDAWG.keys(prefix="xxx"), avg_len(res)==415 5.562K ops/sec 383 | RecordDAWG.keys(prefix="xxxxx"), avg_len(res)==17 104.011K ops/sec 384 | RecordDAWG.keys(prefix="xxxxxxxx"), avg_len(res)==3 318.129K ops/sec 385 | RecordDAWG.keys(prefix="xxxxx..xx"), avg_len(res)==1.4 462.238K ops/sec 386 | RecordDAWG.keys(prefix="xxx"), NON_EXISTING 4292.625K ops/sec 387 | 388 | 389 | Please take this benchmark results with a grain of salt; this 390 | is a very simple benchmark on a single data set. 
391 | 392 | 393 | Current limitations 394 | =================== 395 | 396 | * ``IntDAWG`` is currently a subclass of ``DAWG`` and so it doesn't 397 | support ``keys()`` and ``items()`` methods; 398 | * ``read()`` method reads the whole stream (DAWG must be the last or the 399 | only item in a stream if it is read with ``read()`` method) - pickling 400 | doesn't have this limitation; 401 | * DAWGs loaded with ``read()`` and unpickled DAWGs use 3x-4x memory 402 | compared to DAWGs loaded with ``load()`` method; 403 | * there are ``keys()`` and ``items()`` methods but no ``values()`` method; 404 | * iterator versions of methods are not always implemented; 405 | * ``BytesDAWG`` and ``RecordDAWG`` have a limitation: values 406 | larger than 8KB are unsupported; 407 | * the maximum number of DAWG units is limited: number of DAWG units 408 | (and thus transitions - but not elements) should be less than 2^29; 409 | this means that it may be impossible to build an especially huge DAWG 410 | (you may split your data into several DAWGs or try `marisa-trie`_ in 411 | this case). 412 | 413 | Contributions are welcome! 414 | 415 | 416 | Contributing 417 | ============ 418 | 419 | Development happens at github: https://github.com/pytries/DAWG 420 | 421 | Issue tracker: https://github.com/pytries/DAWG/issues 422 | 423 | Feel free to submit ideas, bugs or pull requests. 424 | 425 | If you find a bug in a C++ part please report it to the original 426 | `bug tracker <https://code.google.com/p/dawgdic/issues/list>`_. 427 | 428 | How is source code organized 429 | ---------------------------- 430 | 431 | There are 4 folders in repository: 432 | 433 | * ``bench`` - benchmarks & benchmark data; 434 | * ``lib`` - original unmodified `dawgdic`_ C++ library and 435 | a customized version of `libb64`_ library.
They are bundled 436 | for easier distribution; if something is have to be fixed in these 437 | libraries consider fixing it in the original repositories; 438 | * ``src`` - wrapper code; ``src/dawg.pyx`` is a wrapper implementation; 439 | ``src/*.pxd`` files are Cython headers for corresponding C++ headers; 440 | ``src/*.cpp`` files are the pre-built extension code and shouldn't be 441 | modified directly (they should be updated via ``update_cpp.sh`` script). 442 | * ``tests`` - the test suite. 443 | 444 | 445 | Running tests and benchmarks 446 | ---------------------------- 447 | 448 | Make sure `tox`_ is installed and run 449 | 450 | :: 451 | 452 | $ tox 453 | 454 | from the source checkout. Tests should pass under python 2.7, 3.5-3.7. 455 | 456 | In order to run benchmarks, type 457 | 458 | :: 459 | 460 | $ tox -c bench.ini 461 | 462 | .. _cython: http://cython.org 463 | .. _tox: http://tox.testrun.org 464 | 465 | .. include:: ../AUTHORS.rst 466 | 467 | .. include:: ../CHANGES.rst 468 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. 
htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 
84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\DAWG.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\DAWG.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 
145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /lib/AUTHORS: -------------------------------------------------------------------------------- 1 | Susumu Yata 2 | -------------------------------------------------------------------------------- /lib/COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009-2012, Susumu Yata 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | - Neither the name of the University of Tokushima nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
11 | -------------------------------------------------------------------------------- /lib/b64/AUTHORS: -------------------------------------------------------------------------------- 1 | libb64: Base64 Encoding/Decoding Routines 2 | ====================================== 3 | 4 | Authors: 5 | ------- 6 | 7 | Chris Venter chris.venter@gmail.com http://rocketpod.blogspot.com 8 | -------------------------------------------------------------------------------- /lib/b64/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright-Only Dedication (based on United States law) 2 | or Public Domain Certification 3 | 4 | The person or persons who have associated work with this document (the 5 | "Dedicator" or "Certifier") hereby either (a) certifies that, to the best of 6 | his knowledge, the work of authorship identified is in the public domain of the 7 | country from which the work is published, or (b) hereby dedicates whatever 8 | copyright the dedicators holds in the work of authorship identified below (the 9 | "Work") to the public domain. A certifier, moreover, dedicates any copyright 10 | interest he may have in the associated work, and for these purposes, is 11 | described as a "dedicator" below. 12 | 13 | A certifier has taken reasonable steps to verify the copyright status of this 14 | work. Certifier recognizes that his good faith efforts may not shield him from 15 | liability if in fact the work certified is not in the public domain. 16 | 17 | Dedicator makes this dedication for the benefit of the public at large and to 18 | the detriment of the Dedicator's heirs and successors. Dedicator intends this 19 | dedication to be an overt act of relinquishment in perpetuity of all present 20 | and future rights under copyright law, whether vested or contingent, in the 21 | Work. 
Dedicator understands that such relinquishment of all rights includes 22 | the relinquishment of all rights to enforce (by lawsuit or otherwise) those 23 | copyrights in the Work. 24 | 25 | Dedicator recognizes that, once placed in the public domain, the Work may be 26 | freely reproduced, distributed, transmitted, used, modified, built upon, or 27 | otherwise exploited by anyone for any purpose, commercial or non-commercial, 28 | and in any way, including by methods that have not yet been invented or 29 | conceived. -------------------------------------------------------------------------------- /lib/b64/cdecode.c: -------------------------------------------------------------------------------- 1 | /* 2 | cdecoder.c - c source to a base64 decoding algorithm implementation 3 | 4 | This is part of the libb64 project, and has been placed in the public domain. 5 | For details, see http://sourceforge.net/projects/libb64 6 | */ 7 | 8 | #include 9 | 10 | int base64_decode_value(char value_in) 11 | { 12 | static const char decoding[] = {62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51}; 13 | static const char decoding_size = sizeof(decoding); 14 | value_in -= 43; 15 | if (value_in < 0 || value_in > decoding_size) return -1; 16 | return decoding[(int)value_in]; 17 | } 18 | 19 | void base64_init_decodestate(base64_decodestate* state_in) 20 | { 21 | state_in->step = step_a; 22 | state_in->plainchar = 0; 23 | } 24 | 25 | int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in) 26 | { 27 | const char* codechar = code_in; 28 | char* plainchar = plaintext_out; 29 | char fragment; 30 | 31 | *plainchar = state_in->plainchar; 32 | 33 | switch (state_in->step) 34 | { 35 | while (1) 36 | { 37 | case step_a: 38 | do { 39 | if (codechar == 
code_in+length_in) 40 | { 41 | state_in->step = step_a; 42 | state_in->plainchar = *plainchar; 43 | return plainchar - plaintext_out; 44 | } 45 | fragment = (char)base64_decode_value(*codechar++); 46 | } while (fragment < 0); 47 | *plainchar = (fragment & 0x03f) << 2; 48 | case step_b: 49 | do { 50 | if (codechar == code_in+length_in) 51 | { 52 | state_in->step = step_b; 53 | state_in->plainchar = *plainchar; 54 | return plainchar - plaintext_out; 55 | } 56 | fragment = (char)base64_decode_value(*codechar++); 57 | } while (fragment < 0); 58 | *plainchar++ |= (fragment & 0x030) >> 4; 59 | *plainchar = (fragment & 0x00f) << 4; 60 | case step_c: 61 | do { 62 | if (codechar == code_in+length_in) 63 | { 64 | state_in->step = step_c; 65 | state_in->plainchar = *plainchar; 66 | return plainchar - plaintext_out; 67 | } 68 | fragment = (char)base64_decode_value(*codechar++); 69 | } while (fragment < 0); 70 | *plainchar++ |= (fragment & 0x03c) >> 2; 71 | *plainchar = (fragment & 0x003) << 6; 72 | case step_d: 73 | do { 74 | if (codechar == code_in+length_in) 75 | { 76 | state_in->step = step_d; 77 | state_in->plainchar = *plainchar; 78 | return plainchar - plaintext_out; 79 | } 80 | fragment = (char)base64_decode_value(*codechar++); 81 | } while (fragment < 0); 82 | *plainchar++ |= (fragment & 0x03f); 83 | } 84 | } 85 | /* control should not reach here */ 86 | return plainchar - plaintext_out; 87 | } 88 | 89 | -------------------------------------------------------------------------------- /lib/b64/cdecode.h: -------------------------------------------------------------------------------- 1 | /* 2 | cdecode.h - c header for a base64 decoding algorithm 3 | 4 | This is part of the libb64 project, and has been placed in the public domain. 
5 | For details, see http://sourceforge.net/projects/libb64 6 | */ 7 | 8 | #ifndef BASE64_CDECODE_H 9 | #define BASE64_CDECODE_H 10 | 11 | typedef enum 12 | { 13 | step_a, step_b, step_c, step_d 14 | } base64_decodestep; 15 | 16 | typedef struct 17 | { 18 | base64_decodestep step; 19 | char plainchar; 20 | } base64_decodestate; 21 | 22 | void base64_init_decodestate(base64_decodestate* state_in); 23 | 24 | int base64_decode_value(char value_in); 25 | 26 | int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in); 27 | 28 | #endif /* BASE64_CDECODE_H */ 29 | -------------------------------------------------------------------------------- /lib/b64/cencode.c: -------------------------------------------------------------------------------- 1 | /* 2 | cencoder.c - c source to a base64 encoding algorithm implementation 3 | 4 | This is part of the libb64 project, and has been placed in the public domain. 5 | For details, see http://sourceforge.net/projects/libb64 6 | */ 7 | 8 | #include 9 | 10 | const int CHARS_PER_LINE = 72; 11 | 12 | void base64_init_encodestate(base64_encodestate* state_in) 13 | { 14 | state_in->step = step_A; 15 | state_in->result = 0; 16 | state_in->stepcount = 0; 17 | } 18 | 19 | char base64_encode_value(char value_in) 20 | { 21 | static const char* encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 22 | if (value_in > 63) return '='; 23 | return encoding[(int)value_in]; 24 | } 25 | 26 | int base64_encode_block(const char* plaintext_in, int length_in, char* code_out, base64_encodestate* state_in) 27 | { 28 | const char* plainchar = plaintext_in; 29 | const char* const plaintextend = plaintext_in + length_in; 30 | char* codechar = code_out; 31 | char result; 32 | char fragment; 33 | 34 | result = state_in->result; 35 | 36 | switch (state_in->step) 37 | { 38 | while (1) 39 | { 40 | case step_A: 41 | if (plainchar == plaintextend) 42 | { 43 | state_in->result = 
result; 44 | state_in->step = step_A; 45 | return codechar - code_out; 46 | } 47 | fragment = *plainchar++; 48 | result = (fragment & 0x0fc) >> 2; 49 | *codechar++ = base64_encode_value(result); 50 | result = (fragment & 0x003) << 4; 51 | case step_B: 52 | if (plainchar == plaintextend) 53 | { 54 | state_in->result = result; 55 | state_in->step = step_B; 56 | return codechar - code_out; 57 | } 58 | fragment = *plainchar++; 59 | result |= (fragment & 0x0f0) >> 4; 60 | *codechar++ = base64_encode_value(result); 61 | result = (fragment & 0x00f) << 2; 62 | case step_C: 63 | if (plainchar == plaintextend) 64 | { 65 | state_in->result = result; 66 | state_in->step = step_C; 67 | return codechar - code_out; 68 | } 69 | fragment = *plainchar++; 70 | result |= (fragment & 0x0c0) >> 6; 71 | *codechar++ = base64_encode_value(result); 72 | result = (fragment & 0x03f) >> 0; 73 | *codechar++ = base64_encode_value(result); 74 | 75 | ++(state_in->stepcount); 76 | if (state_in->stepcount == CHARS_PER_LINE/4) 77 | { 78 | *codechar++ = '\n'; 79 | state_in->stepcount = 0; 80 | } 81 | } 82 | } 83 | /* control should not reach here */ 84 | return codechar - code_out; 85 | } 86 | 87 | int base64_encode_blockend(char* code_out, base64_encodestate* state_in) 88 | { 89 | char* codechar = code_out; 90 | 91 | switch (state_in->step) 92 | { 93 | case step_B: 94 | *codechar++ = base64_encode_value(state_in->result); 95 | *codechar++ = '='; 96 | *codechar++ = '='; 97 | break; 98 | case step_C: 99 | *codechar++ = base64_encode_value(state_in->result); 100 | *codechar++ = '='; 101 | break; 102 | case step_A: 103 | break; 104 | } 105 | *codechar++ = '\n'; 106 | 107 | return codechar - code_out; 108 | } 109 | 110 | -------------------------------------------------------------------------------- /lib/b64/cencode.h: -------------------------------------------------------------------------------- 1 | /* 2 | cencode.h - c header for a base64 encoding algorithm 3 | 4 | This is part of the libb64 project, 
and has been placed in the public domain. 5 | For details, see http://sourceforge.net/projects/libb64 6 | */ 7 | 8 | #ifndef BASE64_CENCODE_H 9 | #define BASE64_CENCODE_H 10 | 11 | typedef enum 12 | { 13 | step_A, step_B, step_C 14 | } base64_encodestep; 15 | 16 | typedef struct 17 | { 18 | base64_encodestep step; 19 | char result; 20 | int stepcount; 21 | } base64_encodestate; 22 | 23 | void base64_init_encodestate(base64_encodestate* state_in); 24 | 25 | char base64_encode_value(char value_in); 26 | 27 | int base64_encode_block(const char* plaintext_in, int length_in, char* code_out, base64_encodestate* state_in); 28 | 29 | int base64_encode_blockend(char* code_out, base64_encodestate* state_in); 30 | 31 | #endif /* BASE64_CENCODE_H */ 32 | -------------------------------------------------------------------------------- /lib/b64/decode.h: -------------------------------------------------------------------------------- 1 | // :mode=c++: 2 | /* 3 | decode.h - c++ wrapper for a base64 decoding algorithm 4 | 5 | This is part of the libb64 project, and has been placed in the public domain. 
6 | For details, see http://sourceforge.net/projects/libb64 7 | */ 8 | #ifndef BASE64_DECODE_H 9 | #define BASE64_DECODE_H 10 | 11 | #include 12 | 13 | namespace base64 14 | { 15 | extern "C" 16 | { 17 | #include "cdecode.h" 18 | } 19 | 20 | struct decoder 21 | { 22 | base64_decodestate _state; 23 | int _buffersize; 24 | 25 | decoder(int buffersize_in = 4096) 26 | : _buffersize(buffersize_in) 27 | {} 28 | 29 | int decode(char value_in) 30 | { 31 | return base64_decode_value(value_in); 32 | } 33 | 34 | int decode(const char* code_in, const int length_in, char* plaintext_out) 35 | { 36 | return base64_decode_block(code_in, length_in, plaintext_out, &_state); 37 | } 38 | 39 | void decode(std::istream& istream_in, std::ostream& ostream_in) 40 | { 41 | base64_init_decodestate(&_state); 42 | // 43 | const int N = _buffersize; 44 | char* code = new char[N]; 45 | char* plaintext = new char[N]; 46 | int codelength; 47 | int plainlength; 48 | 49 | do 50 | { 51 | istream_in.read((char*)code, N); 52 | codelength = istream_in.gcount(); 53 | plainlength = decode(code, codelength, plaintext); 54 | ostream_in.write((const char*)plaintext, plainlength); 55 | } 56 | while (istream_in.good() && codelength > 0); 57 | // 58 | base64_init_decodestate(&_state); 59 | 60 | delete [] code; 61 | delete [] plaintext; 62 | } 63 | 64 | void init() 65 | { 66 | base64_init_decodestate(&_state); 67 | } 68 | }; 69 | 70 | } // namespace base64 71 | 72 | 73 | 74 | #endif // BASE64_DECODE_H 75 | 76 | -------------------------------------------------------------------------------- /lib/b64/encode.h: -------------------------------------------------------------------------------- 1 | // :mode=c++: 2 | /* 3 | encode.h - c++ wrapper for a base64 encoding algorithm 4 | 5 | This is part of the libb64 project, and has been placed in the public domain. 
6 | For details, see http://sourceforge.net/projects/libb64 7 | */ 8 | #ifndef BASE64_ENCODE_H 9 | #define BASE64_ENCODE_H 10 | 11 | #include 12 | 13 | namespace base64 14 | { 15 | extern "C" 16 | { 17 | #include "cencode.h" 18 | } 19 | 20 | struct encoder 21 | { 22 | base64_encodestate _state; 23 | int _buffersize; 24 | 25 | encoder(int buffersize_in = BUFFERSIZE) 26 | : _buffersize(buffersize_in) 27 | {} 28 | 29 | int encode(char value_in) 30 | { 31 | return base64_encode_value(value_in); 32 | } 33 | 34 | int encode(const char* code_in, const int length_in, char* plaintext_out) 35 | { 36 | return base64_encode_block(code_in, length_in, plaintext_out, &_state); 37 | } 38 | 39 | int encode_end(char* plaintext_out) 40 | { 41 | return base64_encode_blockend(plaintext_out, &_state); 42 | } 43 | 44 | void encode(std::istream& istream_in, std::ostream& ostream_in) 45 | { 46 | base64_init_encodestate(&_state); 47 | // 48 | const int N = _buffersize; 49 | char* plaintext = new char[N]; 50 | char* code = new char[2*N]; 51 | int plainlength; 52 | int codelength; 53 | 54 | do 55 | { 56 | istream_in.read(plaintext, N); 57 | plainlength = istream_in.gcount(); 58 | // 59 | codelength = encode(plaintext, plainlength, code); 60 | ostream_in.write(code, codelength); 61 | } 62 | while (istream_in.good() && plainlength > 0); 63 | 64 | codelength = encode_end(code); 65 | ostream_in.write(code, codelength); 66 | // 67 | base64_init_encodestate(&_state); 68 | 69 | delete [] code; 70 | delete [] plaintext; 71 | } 72 | }; 73 | 74 | } // namespace base64 75 | 76 | #endif // BASE64_ENCODE_H 77 | 78 | -------------------------------------------------------------------------------- /lib/dawgdic/base-types.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_BASE_TYPES_H 2 | #define DAWGDIC_BASE_TYPES_H 3 | 4 | #include 5 | 6 | namespace dawgdic { 7 | 8 | // 8-bit characters. 
9 | typedef char CharType; 10 | typedef unsigned char UCharType; 11 | 12 | // 32-bit integer. 13 | typedef int ValueType; 14 | 15 | // 32-bit unsigned integer. 16 | typedef unsigned int BaseType; 17 | 18 | // 32 or 64-bit unsigned integer. 19 | typedef std::size_t SizeType; 20 | 21 | } // namespace dawgdic 22 | 23 | #endif // DAWGDIC_BASE_TYPES_H 24 | -------------------------------------------------------------------------------- /lib/dawgdic/base-unit.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_BASE_UNIT_H 2 | #define DAWGDIC_BASE_UNIT_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | // Unit for building a dawg. 9 | class BaseUnit { 10 | public: 11 | BaseUnit() : base_(0) {} 12 | 13 | // Writes values. 14 | void set_base(BaseType base) { 15 | base_ = base; 16 | } 17 | BaseType base() const { 18 | return base_; 19 | } 20 | 21 | // Reads values. 22 | BaseType child() const { 23 | return base_ >> 2; 24 | } 25 | bool has_sibling() const { 26 | return (base_ & 1) ? true : false; 27 | } 28 | ValueType value() const { 29 | return static_cast(base_ >> 1); 30 | } 31 | bool is_state() const { 32 | return (base_ & 2) ? true : false; 33 | } 34 | 35 | private: 36 | BaseType base_; 37 | 38 | // Copyable. 39 | }; 40 | 41 | } // namespace dawgdic 42 | 43 | #endif // DAWGDIC_BASE_UNIT_H 44 | -------------------------------------------------------------------------------- /lib/dawgdic/bit-pool.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_BIT_POOL_H 2 | #define DAWGDIC_BIT_POOL_H 3 | 4 | #include "object-pool.h" 5 | 6 | namespace dawgdic { 7 | 8 | // This class works as an array of bit flags with compact memory management. 9 | template 10 | class BitPool { 11 | public: 12 | BitPool() : pool_(), size_(0) {} 13 | 14 | // Accessors. 
15 | void set(SizeType index, bool bit) { 16 | SizeType pool_index = PoolIndex(index); 17 | UCharType bit_flag = BitFlag(index); 18 | if (bit) { 19 | pool_[pool_index] |= bit_flag; 20 | } else { 21 | pool_[pool_index] &= ~bit_flag; 22 | } 23 | } 24 | bool get(SizeType index) const { 25 | SizeType pool_index = PoolIndex(index); 26 | UCharType bit_flag = BitFlag(index); 27 | return (pool_[pool_index] & bit_flag) ? true : false; 28 | } 29 | 30 | // Deletes all bits and frees memory. 31 | void Clear() { 32 | pool_.Clear(); 33 | size_ = 0; 34 | } 35 | 36 | // Swaps bit pools. 37 | void Swap(BitPool *bit_pool) { 38 | pool_.Swap(&bit_pool->pool_); 39 | } 40 | 41 | // Allocates memory for a new bit and returns its ID. 42 | // Note: Allocated bits are filled with false. 43 | SizeType Allocate() { 44 | SizeType pool_index = PoolIndex(size_); 45 | if (pool_index == pool_.size()) { 46 | pool_.Allocate(); 47 | pool_[pool_index] = '\0'; 48 | } 49 | return size_++; 50 | } 51 | 52 | private: 53 | ObjectPool pool_; 54 | SizeType size_; 55 | 56 | // Disallows copies. 
57 | BitPool(const BitPool &); 58 | BitPool &operator=(const BitPool &); 59 | 60 | static SizeType PoolIndex(SizeType index) { 61 | return index / 8; 62 | } 63 | static UCharType BitFlag(BaseType index) { 64 | return static_cast(1) << (index % 8); 65 | } 66 | }; 67 | 68 | } // namespace dawgdic 69 | 70 | #endif // DAWGDIC_BIT_POOL_H 71 | -------------------------------------------------------------------------------- /lib/dawgdic/completer.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_COMPLETER_H 2 | #define DAWGDIC_COMPLETER_H 3 | 4 | #include "dictionary.h" 5 | #include "guide.h" 6 | 7 | #include 8 | 9 | namespace dawgdic { 10 | 11 | class Completer { 12 | public: 13 | Completer() 14 | : dic_(NULL), guide_(NULL), key_(), index_stack_(), last_index_(0) {} 15 | Completer(const Dictionary &dic, const Guide &guide) 16 | : dic_(&dic), guide_(&guide), key_(), index_stack_(), last_index_(0) {} 17 | 18 | void set_dic(const Dictionary &dic) { 19 | dic_ = &dic; 20 | } 21 | void set_guide(const Guide &guide) { 22 | guide_ = &guide; 23 | } 24 | 25 | const Dictionary &dic() const { 26 | return *dic_; 27 | } 28 | const Guide &guide() const { 29 | return *guide_; 30 | } 31 | 32 | // These member functions are available only when Next() returns true. 33 | const char *key() const { 34 | return reinterpret_cast(&key_[0]); 35 | } 36 | SizeType length() const { 37 | return key_.size() - 1; 38 | } 39 | ValueType value() const { 40 | return dic_->value(last_index_); 41 | } 42 | 43 | // Starts completing keys from given index and prefix. 
44 | void Start(BaseType index, const char *prefix = "") { 45 | SizeType length = 0; 46 | for (const char *p = prefix; *p != '\0'; ++p) { 47 | ++length; 48 | } 49 | Start(index, prefix, length); 50 | } 51 | void Start(BaseType index, const char *prefix, SizeType length) { 52 | key_.resize(length + 1); 53 | for (SizeType i = 0; i < length; ++i) { 54 | key_[i] = prefix[i]; 55 | } 56 | key_[length] = '\0'; 57 | 58 | index_stack_.clear(); 59 | if (guide_->size() != 0) { 60 | index_stack_.push_back(index); 61 | last_index_ = dic_->root(); 62 | } 63 | } 64 | 65 | // Gets the next key. 66 | bool Next() { 67 | if (index_stack_.empty()) { 68 | return false; 69 | } 70 | BaseType index = index_stack_.back(); 71 | 72 | if (last_index_ != dic_->root()) { 73 | UCharType child_label = guide_->child(index); 74 | if (child_label != '\0') { 75 | // Follows a transition to the first child. 76 | if (!Follow(child_label, &index)) 77 | return false; 78 | } else { 79 | for ( ; ; ) { 80 | UCharType sibling_label = guide_->sibling(index); 81 | 82 | // Moves to the previous node. 83 | if (key_.size() > 1) { 84 | key_.resize(key_.size() - 1); 85 | key_.back() = '\0'; 86 | } 87 | index_stack_.resize(index_stack_.size() - 1); 88 | if (index_stack_.empty()) { 89 | return false; 90 | } 91 | 92 | index = index_stack_.back(); 93 | if (sibling_label != '\0') { 94 | // Follows a transition to the next sibling. 95 | if (!Follow(sibling_label, &index)) { 96 | return false; 97 | } 98 | break; 99 | } 100 | } 101 | } 102 | } 103 | 104 | // Finds a terminal. 105 | return FindTerminal(index); 106 | } 107 | 108 | private: 109 | const Dictionary *dic_; 110 | const Guide *guide_; 111 | std::vector key_; 112 | std::vector index_stack_; 113 | BaseType last_index_; 114 | 115 | // Disallows copies. 116 | Completer(const Completer &); 117 | Completer &operator=(const Completer &); 118 | 119 | // Follows a transition. 
120 | bool Follow(UCharType label, BaseType *index) { 121 | if (!dic_->Follow(label, index)) { 122 | return false; 123 | } 124 | 125 | key_.back() = label; 126 | key_.push_back('\0'); 127 | index_stack_.push_back(*index); 128 | return true; 129 | } 130 | 131 | // Finds a terminal. 132 | bool FindTerminal(BaseType index) { 133 | while (!dic_->has_value(index)) { 134 | UCharType label = guide_->child(index); 135 | if (!dic_->Follow(label, &index)) { 136 | return false; 137 | } 138 | 139 | key_.back() = label; 140 | key_.push_back('\0'); 141 | index_stack_.push_back(index); 142 | } 143 | 144 | last_index_ = index; 145 | return true; 146 | } 147 | }; 148 | 149 | } // namespace dawgdic 150 | 151 | #endif // DAWGDIC_COMPLETER_H 152 | -------------------------------------------------------------------------------- /lib/dawgdic/dawg-builder.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_DAWG_BUILDER_H 2 | #define DAWGDIC_DAWG_BUILDER_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "dawg.h" 9 | #include "dawg-unit.h" 10 | 11 | namespace dawgdic { 12 | 13 | // DAWG builder. 14 | class DawgBuilder { 15 | public: 16 | explicit DawgBuilder(SizeType initial_hash_table_size = 17 | DEFAULT_INITIAL_HASH_TABLE_SIZE) 18 | : initial_hash_table_size_(initial_hash_table_size), 19 | base_pool_(), label_pool_(), flag_pool_(), unit_pool_(), 20 | hash_table_(), unfixed_units_(), unused_units_(), num_of_states_(1), 21 | num_of_merged_transitions_(0), num_of_merging_states_(0) {} 22 | 23 | // Number of units. 24 | SizeType size() const { 25 | return base_pool_.size(); 26 | } 27 | // Number of transitions. 28 | SizeType num_of_transitions() const { 29 | return base_pool_.size() - 1; 30 | } 31 | // Number of states. 32 | SizeType num_of_states() const { 33 | return num_of_states_; 34 | } 35 | // Number of merged transitions. 
36 | SizeType num_of_merged_transitions() const { 37 | return num_of_merged_transitions_; 38 | } 39 | // Number of merged states. 40 | SizeType num_of_merged_states() const { 41 | return num_of_transitions() 42 | + num_of_merged_transitions() + 1 - num_of_states(); 43 | } 44 | // Number of merging states. 45 | SizeType num_of_merging_states() const { 46 | return num_of_merging_states_; 47 | } 48 | 49 | // Initializes a builder. 50 | void Clear() { 51 | base_pool_.Clear(); 52 | label_pool_.Clear(); 53 | flag_pool_.Clear(); 54 | unit_pool_.Clear(); 55 | 56 | std::vector(0).swap(hash_table_); 57 | while (!unfixed_units_.empty()) { 58 | unfixed_units_.pop(); 59 | } 60 | while (!unused_units_.empty()) { 61 | unused_units_.pop(); 62 | } 63 | 64 | num_of_states_ = 1; 65 | num_of_merged_transitions_ = 0; 66 | num_of_merging_states_ = 0; 67 | } 68 | 69 | // Inserts a key. 70 | bool Insert(const CharType *key, ValueType value = 0) { 71 | if (key == NULL || *key == '\0' || value < 0) { 72 | return false; 73 | } 74 | SizeType length = 1; 75 | while (key[length]) { 76 | ++length; 77 | } 78 | return InsertKey(key, length, value); 79 | } 80 | 81 | // Inserts a key. 82 | bool Insert(const CharType *key, SizeType length, ValueType value) { 83 | if (key == NULL || length <= 0 || value < 0) { 84 | return false; 85 | } 86 | for (SizeType i = 0; i < length; ++i) { 87 | if (key[i] == '\0') { 88 | return false; 89 | } 90 | } 91 | return InsertKey(key, length, value); 92 | } 93 | 94 | // Finishes building a dawg. 95 | bool Finish(Dawg *dawg) { 96 | // Initializes a builder if not initialized. 
97 | if (hash_table_.empty()) { 98 | Init(); 99 | } 100 | 101 | FixUnits(0); 102 | base_pool_[0].set_base(unit_pool_[0].base()); 103 | label_pool_[0] = unit_pool_[0].label(); 104 | 105 | dawg->set_num_of_states(num_of_states_); 106 | dawg->set_num_of_merged_transitions(num_of_merged_transitions_); 107 | dawg->set_num_of_merged_states(num_of_merged_states()); 108 | dawg->set_num_of_merging_states(num_of_merging_states_); 109 | 110 | dawg->SwapBasePool(&base_pool_); 111 | dawg->SwapLabelPool(&label_pool_); 112 | dawg->SwapFlagPool(&flag_pool_); 113 | 114 | Clear(); 115 | return true; 116 | } 117 | 118 | private: 119 | enum { 120 | DEFAULT_INITIAL_HASH_TABLE_SIZE = 1 << 8 121 | }; 122 | 123 | const SizeType initial_hash_table_size_; 124 | ObjectPool base_pool_; 125 | ObjectPool label_pool_; 126 | BitPool<> flag_pool_; 127 | ObjectPool unit_pool_; 128 | std::vector hash_table_; 129 | std::stack unfixed_units_; 130 | std::stack unused_units_; 131 | SizeType num_of_states_; 132 | SizeType num_of_merged_transitions_; 133 | SizeType num_of_merging_states_; 134 | 135 | // Disallows copies. 136 | DawgBuilder(const DawgBuilder &); 137 | DawgBuilder &operator=(const DawgBuilder &); 138 | 139 | // Inserts a key. 140 | bool InsertKey(const CharType *key, SizeType length, ValueType value) { 141 | // Initializes a builder if not initialized. 142 | if (hash_table_.empty()) { 143 | Init(); 144 | } 145 | 146 | BaseType index = 0; 147 | SizeType key_pos = 0; 148 | 149 | // Finds a separate unit. 150 | for ( ; key_pos <= length; ++key_pos) { 151 | BaseType child_index = unit_pool_[index].child(); 152 | if (!child_index) { 153 | break; 154 | } 155 | 156 | UCharType key_label = static_cast( 157 | (key_pos < length) ? key[key_pos] : '\0'); 158 | UCharType unit_label = unit_pool_[child_index].label(); 159 | 160 | // Checks the order of keys. 
161 | if (key_label < unit_label) { 162 | return false; 163 | } else if (key_label > unit_label) { 164 | unit_pool_[child_index].set_has_sibling(true); 165 | FixUnits(child_index); 166 | break; 167 | } 168 | 169 | index = child_index; 170 | } 171 | 172 | // Adds new units. 173 | for ( ; key_pos <= length; ++key_pos) { 174 | UCharType key_label = static_cast( 175 | (key_pos < length) ? key[key_pos] : '\0'); 176 | BaseType child_index = AllocateUnit(); 177 | 178 | if (!unit_pool_[index].child()) { 179 | unit_pool_[child_index].set_is_state(true); 180 | } 181 | unit_pool_[child_index].set_sibling(unit_pool_[index].child()); 182 | unit_pool_[child_index].set_label(key_label); 183 | unit_pool_[index].set_child(child_index); 184 | unfixed_units_.push(child_index); 185 | 186 | index = child_index; 187 | } 188 | unit_pool_[index].set_value(value); 189 | return true; 190 | } 191 | 192 | // Initializes an object. 193 | void Init() { 194 | hash_table_.resize(initial_hash_table_size_, 0); 195 | AllocateUnit(); 196 | AllocateTransition(); 197 | unit_pool_[0].set_label(0xFF); 198 | unfixed_units_.push(0); 199 | } 200 | 201 | // Fixes units corresponding to the last inserted key. 202 | // Also, some of units are merged into equivalent transitions. 203 | void FixUnits(BaseType index) { 204 | while (unfixed_units_.top() != index) { 205 | BaseType unfixed_index = unfixed_units_.top(); 206 | unfixed_units_.pop(); 207 | 208 | if (num_of_states_ >= hash_table_.size() - (hash_table_.size() >> 2)) { 209 | ExpandHashTable(); 210 | } 211 | 212 | BaseType num_of_siblings = 0; 213 | for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) { 214 | ++num_of_siblings; 215 | } 216 | 217 | BaseType hash_id; 218 | BaseType matched_index = FindUnit(unfixed_index, &hash_id); 219 | if (matched_index != 0) { 220 | num_of_merged_transitions_ += num_of_siblings; 221 | 222 | // Records a merging state. 
223 | if (flag_pool_.get(matched_index) == false) { 224 | ++num_of_merging_states_; 225 | flag_pool_.set(matched_index, true); 226 | } 227 | } else { 228 | // Fixes units into pairs of base values and labels. 229 | BaseType transition_index = 0; 230 | for (BaseType i = 0; i < num_of_siblings; ++i) { 231 | transition_index = AllocateTransition(); 232 | } 233 | for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) { 234 | base_pool_[transition_index].set_base(unit_pool_[i].base()); 235 | label_pool_[transition_index] = unit_pool_[i].label(); 236 | --transition_index; 237 | } 238 | matched_index = transition_index + 1; 239 | hash_table_[hash_id] = matched_index; 240 | ++num_of_states_; 241 | } 242 | 243 | // Deletes fixed units. 244 | for (BaseType current = unfixed_index, next; 245 | current != 0; current = next) { 246 | next = unit_pool_[current].sibling(); 247 | FreeUnit(current); 248 | } 249 | 250 | unit_pool_[unfixed_units_.top()].set_child(matched_index); 251 | } 252 | unfixed_units_.pop(); 253 | } 254 | 255 | // Expands a hash table. 256 | void ExpandHashTable() { 257 | SizeType hash_table_size = hash_table_.size() << 1; 258 | std::vector(0).swap(hash_table_); 259 | hash_table_.resize(hash_table_size, 0); 260 | 261 | // Builds a new hash table. 262 | BaseType count = 0; 263 | for (SizeType i = 1; i < base_pool_.size(); ++i) { 264 | BaseType index = static_cast(i); 265 | if (label_pool_[index] == '\0' || base_pool_[index].is_state()) { 266 | BaseType hash_id; 267 | FindTransition(index, &hash_id); 268 | hash_table_[hash_id] = index; 269 | ++count; 270 | } 271 | } 272 | } 273 | 274 | // Finds a transition from a hash table. 
275 | BaseType FindTransition(BaseType index, BaseType *hash_id) const { 276 | *hash_id = HashTransition(index) % hash_table_.size(); 277 | for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) { 278 | BaseType transition_id = hash_table_[*hash_id]; 279 | if (transition_id == 0) { 280 | break; 281 | } 282 | 283 | // There must not be the same base value. 284 | } 285 | return 0; 286 | } 287 | 288 | // Finds a unit from a hash table. 289 | BaseType FindUnit(BaseType unit_index, BaseType *hash_id) const { 290 | *hash_id = HashUnit(unit_index) % hash_table_.size(); 291 | for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) { 292 | BaseType transition_id = hash_table_[*hash_id]; 293 | if (transition_id == 0) { 294 | break; 295 | } 296 | 297 | if (AreEqual(unit_index, transition_id)) { 298 | return transition_id; 299 | } 300 | } 301 | return 0; 302 | } 303 | 304 | // Compares a unit and a transition. 305 | bool AreEqual(BaseType unit_index, BaseType transition_index) const { 306 | // Compares the numbers of transitions. 307 | for (BaseType i = unit_pool_[unit_index].sibling(); i != 0; 308 | i = unit_pool_[i].sibling()) { 309 | if (base_pool_[transition_index].has_sibling() == false) { 310 | return false; 311 | } 312 | ++transition_index; 313 | } 314 | if (base_pool_[transition_index].has_sibling() == true) { 315 | return false; 316 | } 317 | 318 | // Compares out-transitions. 319 | for (BaseType i = unit_index; i; 320 | i = unit_pool_[i].sibling(), --transition_index) { 321 | if (unit_pool_[i].base() != base_pool_[transition_index].base() || 322 | unit_pool_[i].label() != label_pool_[transition_index]) { 323 | return false; 324 | } 325 | } 326 | return true; 327 | } 328 | 329 | // Calculates a hash value from a transition. 
330 | BaseType HashTransition(BaseType index) const { 331 | BaseType hash_value = 0; 332 | for ( ; index != 0; ++index) { 333 | BaseType base = base_pool_[index].base(); 334 | UCharType label = label_pool_[index]; 335 | hash_value ^= Hash((label << 24) ^ base); 336 | 337 | if (base_pool_[index].has_sibling() == false) { 338 | break; 339 | } 340 | } 341 | return hash_value; 342 | } 343 | 344 | // Calculates a hash value from a unit. 345 | BaseType HashUnit(BaseType index) const { 346 | BaseType hash_value = 0; 347 | for ( ; index != 0; index = unit_pool_[index].sibling()) { 348 | BaseType base = unit_pool_[index].base(); 349 | UCharType label = unit_pool_[index].label(); 350 | hash_value ^= Hash((label << 24) ^ base); 351 | } 352 | return hash_value; 353 | } 354 | 355 | // 32-bit mix function. 356 | // http://www.concentric.net/~Ttwang/tech/inthash.htm 357 | static BaseType Hash(BaseType key) { 358 | key = ~key + (key << 15); // key = (key << 15) - key - 1; 359 | key = key ^ (key >> 12); 360 | key = key + (key << 2); 361 | key = key ^ (key >> 4); 362 | key = key * 2057; // key = (key + (key << 3)) + (key << 11); 363 | key = key ^ (key >> 16); 364 | return key; 365 | } 366 | 367 | // Gets a transition from object pools. 368 | BaseType AllocateTransition() { 369 | flag_pool_.Allocate(); 370 | base_pool_.Allocate(); 371 | return static_cast(label_pool_.Allocate()); 372 | } 373 | 374 | // Gets a unit from an object pool. 375 | BaseType AllocateUnit() { 376 | BaseType index = 0; 377 | if (unused_units_.empty()) { 378 | index = static_cast(unit_pool_.Allocate()); 379 | } else { 380 | index = unused_units_.top(); 381 | unused_units_.pop(); 382 | } 383 | unit_pool_[index].Clear(); 384 | return index; 385 | } 386 | 387 | // Returns a unit to an object pool. 
388 | void FreeUnit(BaseType index) { 389 | unused_units_.push(index); 390 | } 391 | }; 392 | 393 | } // namespace dawgdic 394 | 395 | #endif // DAWGDIC_DAWG_BUILDER_H 396 | -------------------------------------------------------------------------------- /lib/dawgdic/dawg-unit.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_DAWG_UNIT_H 2 | #define DAWGDIC_DAWG_UNIT_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | // Unit for building a dawg. 9 | class DawgUnit { 10 | public: 11 | DawgUnit() 12 | : child_(0), sibling_(0), label_('\0'), 13 | is_state_(false), has_sibling_(false) {} 14 | 15 | // Writes values. 16 | void set_child(BaseType child) { 17 | child_ = child; 18 | } 19 | void set_sibling(BaseType sibling) { 20 | sibling_ = sibling; 21 | } 22 | void set_value(ValueType value) { 23 | child_ = value; 24 | } 25 | void set_label(UCharType label) { 26 | label_ = label; 27 | } 28 | void set_is_state(bool is_state) { 29 | is_state_ = is_state; 30 | } 31 | void set_has_sibling(bool has_sibling) { 32 | has_sibling_ = has_sibling; 33 | } 34 | 35 | // Reads values. 36 | BaseType child() const { 37 | return child_; 38 | } 39 | BaseType sibling() const { 40 | return sibling_; 41 | } 42 | ValueType value() const { 43 | return static_cast(child_); 44 | } 45 | UCharType label() const { 46 | return label_; 47 | } 48 | bool is_state() const { 49 | return is_state_; 50 | } 51 | bool has_sibling() const { 52 | return has_sibling_; 53 | } 54 | 55 | // Calculates a base value of a unit. 56 | BaseType base() const { 57 | if (label_ == '\0') { 58 | return (child_ << 1) | (has_sibling_ ? 1 : 0); 59 | } 60 | return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0); 61 | } 62 | 63 | // Initializes a unit. 
64 | void Clear() { 65 | child_ = 0; 66 | sibling_ = 0; 67 | label_ = '\0'; 68 | is_state_ = false; 69 | has_sibling_ = false; 70 | } 71 | 72 | private: 73 | BaseType child_; 74 | BaseType sibling_; 75 | UCharType label_; 76 | bool is_state_; 77 | bool has_sibling_; 78 | 79 | // Copyable. 80 | }; 81 | 82 | } // namespace dawgdic 83 | 84 | #endif // DAWGDIC_DAWG_UNIT_H 85 | -------------------------------------------------------------------------------- /lib/dawgdic/dawg.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_DAWG_H 2 | #define DAWGDIC_DAWG_H 3 | 4 | #include "base-unit.h" 5 | #include "bit-pool.h" 6 | #include "object-pool.h" 7 | 8 | namespace dawgdic { 9 | 10 | class Dawg { 11 | public: 12 | Dawg() 13 | : base_pool_(), label_pool_(), flag_pool_(), 14 | num_of_states_(0), num_of_merged_transitions_(0), 15 | num_of_merged_states_(0), num_of_merging_states_(0) {} 16 | 17 | // The root index. 18 | BaseType root() const { 19 | return 0; 20 | } 21 | 22 | // Number of units. 23 | SizeType size() const { 24 | return base_pool_.size(); 25 | } 26 | // Number of transitions. 27 | SizeType num_of_transitions() const { 28 | return base_pool_.size() - 1; 29 | } 30 | // Number of states. 31 | SizeType num_of_states() const { 32 | return num_of_states_; 33 | } 34 | // Number of merged transitions. 35 | SizeType num_of_merged_transitions() const { 36 | return num_of_merged_transitions_; 37 | } 38 | // Number of merged states. 39 | SizeType num_of_merged_states() const { 40 | return num_of_merged_states_; 41 | } 42 | // Number of merging states. 43 | SizeType num_of_merging_states() const { 44 | return num_of_merging_states_; 45 | } 46 | 47 | // Reads values. 48 | BaseType child(BaseType index) const { 49 | return base_pool_[index].child(); 50 | } 51 | BaseType sibling(BaseType index) const { 52 | return base_pool_[index].has_sibling() ? 
(index + 1) : 0; 53 | } 54 | ValueType value(BaseType index) const { 55 | return base_pool_[index].value(); 56 | } 57 | 58 | bool is_leaf(BaseType index) const { 59 | return label(index) == '\0'; 60 | } 61 | UCharType label(BaseType index) const { 62 | return label_pool_[index]; 63 | } 64 | bool is_merging(BaseType index) const { 65 | return flag_pool_.get(index); 66 | } 67 | 68 | // Clears object pools. 69 | void Clear() { 70 | base_pool_.Clear(); 71 | label_pool_.Clear(); 72 | flag_pool_.Clear(); 73 | num_of_states_ = 0; 74 | num_of_merged_states_ = 0; 75 | } 76 | 77 | // Swaps dawgs. 78 | void Swap(Dawg *dawg) { 79 | base_pool_.Swap(&dawg->base_pool_); 80 | label_pool_.Swap(&dawg->label_pool_); 81 | flag_pool_.Swap(&dawg->flag_pool_); 82 | std::swap(num_of_states_, dawg->num_of_states_); 83 | std::swap(num_of_merged_transitions_, dawg->num_of_merged_transitions_); 84 | std::swap(num_of_merged_states_, dawg->num_of_merged_states_); 85 | std::swap(num_of_merging_states_, dawg->num_of_merging_states_); 86 | } 87 | 88 | public: 89 | // Following member functions are called from DawgBuilder. 90 | 91 | // Sets the number of states. 92 | void set_num_of_states(SizeType num_of_states) { 93 | num_of_states_ = num_of_states; 94 | } 95 | // Sets the number of merged transitions. 96 | void set_num_of_merged_transitions(SizeType num_of_merged_transitions) { 97 | num_of_merged_transitions_ = num_of_merged_transitions; 98 | } 99 | // Sets the number of merged states. 100 | void set_num_of_merged_states(SizeType num_of_merged_states) { 101 | num_of_merged_states_ = num_of_merged_states; 102 | } 103 | // Sets the number of merging states. 104 | void set_num_of_merging_states(SizeType num_of_merging_states) { 105 | num_of_merging_states_ = num_of_merging_states; 106 | } 107 | 108 | // Swaps base pools. 109 | void SwapBasePool(ObjectPool *base_pool) { 110 | base_pool_.Swap(base_pool); 111 | } 112 | // Swaps label pools. 
113 | void SwapLabelPool(ObjectPool *label_pool) { 114 | label_pool_.Swap(label_pool); 115 | } 116 | // Swaps flag pools. 117 | void SwapFlagPool(BitPool<> *flag_pool) { 118 | flag_pool_.Swap(flag_pool); 119 | } 120 | 121 | private: 122 | ObjectPool base_pool_; 123 | ObjectPool label_pool_; 124 | BitPool<> flag_pool_; 125 | SizeType num_of_states_; 126 | SizeType num_of_merged_transitions_; 127 | SizeType num_of_merged_states_; 128 | SizeType num_of_merging_states_; 129 | 130 | // Disallows copies. 131 | Dawg(const Dawg &); 132 | Dawg &operator=(const Dawg &); 133 | }; 134 | 135 | } // namespace dawgdic 136 | 137 | #endif // DAWGDIC_DAWG_H 138 | -------------------------------------------------------------------------------- /lib/dawgdic/dictionary-builder.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_DICTIONARY_BUILDER_H 2 | #define DAWGDIC_DICTIONARY_BUILDER_H 3 | 4 | #include 5 | 6 | #include "dawg.h" 7 | #include "dictionary.h" 8 | #include "dictionary-extra-unit.h" 9 | #include "link-table.h" 10 | 11 | namespace dawgdic { 12 | 13 | class DictionaryBuilder { 14 | public: 15 | enum { 16 | // Number of units in a block. 17 | BLOCK_SIZE = 256, 18 | // Number of blocks kept unfixed. 19 | NUM_OF_UNFIXED_BLOCKS = 16, 20 | // Number of units kept unfixed. 21 | UNFIXED_SIZE = BLOCK_SIZE * NUM_OF_UNFIXED_BLOCKS 22 | }; 23 | 24 | // Builds a dictionary from a list-form dawg. 
25 | static bool Build(const Dawg &dawg, Dictionary *dic, 26 | BaseType *num_of_unused_units = NULL) { 27 | DictionaryBuilder builder(dawg, dic); 28 | if (!builder.BuildDictionary()) { 29 | return false; 30 | } 31 | if (num_of_unused_units != NULL) { 32 | *num_of_unused_units = builder.num_of_unused_units_; 33 | } 34 | return true; 35 | } 36 | 37 | private: 38 | const Dawg &dawg_; 39 | Dictionary *dic_; 40 | 41 | std::vector units_; 42 | std::vector extras_; 43 | std::vector labels_; 44 | LinkTable link_table_; 45 | BaseType unfixed_index_; 46 | BaseType num_of_unused_units_; 47 | 48 | // Masks for offsets. 49 | static const BaseType UPPER_MASK = ~(DictionaryUnit::OFFSET_MAX - 1); 50 | static const BaseType LOWER_MASK = 0xFF; 51 | 52 | // Disallows copies. 53 | DictionaryBuilder(const DictionaryBuilder &); 54 | DictionaryBuilder &operator=(const DictionaryBuilder &); 55 | 56 | DictionaryBuilder(const Dawg &dawg, Dictionary *dic) 57 | : dawg_(dawg), dic_(dic), units_(), extras_(), labels_(), 58 | link_table_(), unfixed_index_(), num_of_unused_units_(0) {} 59 | ~DictionaryBuilder() { 60 | for (SizeType i = 0; i < extras_.size(); ++i) { 61 | delete [] extras_[i]; 62 | } 63 | } 64 | 65 | // Accesses units. 66 | DictionaryUnit &units(BaseType index) { 67 | return units_[index]; 68 | } 69 | const DictionaryUnit &units(BaseType index) const { 70 | return units_[index]; 71 | } 72 | DictionaryExtraUnit &extras(BaseType index) { 73 | return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE]; 74 | } 75 | const DictionaryExtraUnit &extras(BaseType index) const { 76 | return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE]; 77 | } 78 | 79 | // Number of units. 80 | BaseType num_of_units() const { 81 | return static_cast(units_.size()); 82 | } 83 | // Number of blocks. 84 | BaseType num_of_blocks() const { 85 | return static_cast(extras_.size()); 86 | } 87 | 88 | // Builds a dictionary from a list-form dawg. 
89 | bool BuildDictionary() { 90 | link_table_.Init(dawg_.num_of_merging_states() + 91 | (dawg_.num_of_merging_states() >> 1)); 92 | 93 | ReserveUnit(0); 94 | extras(0).set_is_used(); 95 | units(0).set_offset(1); 96 | units(0).set_label('\0'); 97 | 98 | if (dawg_.size() > 1) { 99 | if (!BuildDictionary(dawg_.root(), 0)) { 100 | return false; 101 | } 102 | } 103 | 104 | FixAllBlocks(); 105 | 106 | dic_->SwapUnitsBuf(&units_); 107 | return true; 108 | } 109 | 110 | // Builds a dictionary from a dawg. 111 | bool BuildDictionary(BaseType dawg_index, BaseType dic_index) { 112 | if (dawg_.is_leaf(dawg_index)) { 113 | return true; 114 | } 115 | 116 | // Uses an existing offset if available. 117 | BaseType dawg_child_index = dawg_.child(dawg_index); 118 | if (dawg_.is_merging(dawg_child_index)) { 119 | BaseType offset = link_table_.Find(dawg_child_index); 120 | if (offset != 0) { 121 | offset ^= dic_index; 122 | if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { 123 | if (dawg_.is_leaf(dawg_child_index)) { 124 | units(dic_index).set_has_leaf(); 125 | } 126 | units(dic_index).set_offset(offset); 127 | return true; 128 | } 129 | } 130 | } 131 | 132 | // Finds a good offset and arranges child nodes. 133 | BaseType offset = ArrangeChildNodes(dawg_index, dic_index); 134 | if (offset == 0) { 135 | return false; 136 | } 137 | 138 | if (dawg_.is_merging(dawg_child_index)) 139 | link_table_.Insert(dawg_child_index, offset); { 140 | } 141 | 142 | // Builds a double-array in depth-first order. 143 | do { 144 | BaseType dic_child_index = offset ^ dawg_.label(dawg_child_index); 145 | if (!BuildDictionary(dawg_child_index, dic_child_index)) { 146 | return false; 147 | } 148 | dawg_child_index = dawg_.sibling(dawg_child_index); 149 | } while (dawg_child_index != 0); 150 | 151 | return true; 152 | } 153 | 154 | // Arranges child nodes. 
155 | BaseType ArrangeChildNodes(BaseType dawg_index, BaseType dic_index) { 156 | labels_.clear(); 157 | 158 | BaseType dawg_child_index = dawg_.child(dawg_index); 159 | while (dawg_child_index != 0) { 160 | labels_.push_back(dawg_.label(dawg_child_index)); 161 | dawg_child_index = dawg_.sibling(dawg_child_index); 162 | } 163 | 164 | // Finds a good offset. 165 | BaseType offset = FindGoodOffset(dic_index); 166 | if (!units(dic_index).set_offset(dic_index ^ offset)) { 167 | return 0; 168 | } 169 | 170 | dawg_child_index = dawg_.child(dawg_index); 171 | for (SizeType i = 0; i < labels_.size(); ++i) { 172 | BaseType dic_child_index = offset ^ labels_[i]; 173 | ReserveUnit(dic_child_index); 174 | 175 | if (dawg_.is_leaf(dawg_child_index)) { 176 | units(dic_index).set_has_leaf(); 177 | units(dic_child_index).set_value(dawg_.value(dawg_child_index)); 178 | } else { 179 | units(dic_child_index).set_label(labels_[i]); 180 | } 181 | 182 | dawg_child_index = dawg_.sibling(dawg_child_index); 183 | } 184 | extras(offset).set_is_used(); 185 | 186 | return offset; 187 | } 188 | 189 | // Finds a good offset. 190 | BaseType FindGoodOffset(BaseType index) const { 191 | if (unfixed_index_ >= num_of_units()) { 192 | return num_of_units() | (index & 0xFF); 193 | } 194 | 195 | // Scans unused units to find a good offset. 196 | BaseType unfixed_index = unfixed_index_; 197 | do { 198 | BaseType offset = unfixed_index ^ labels_[0]; 199 | if (IsGoodOffset(index, offset)) { 200 | return offset; 201 | } 202 | unfixed_index = extras(unfixed_index).next(); 203 | } while (unfixed_index != unfixed_index_); 204 | 205 | return num_of_units() | (index & 0xFF); 206 | } 207 | 208 | // Checks if a given offset is valid or not. 
209 | bool IsGoodOffset(BaseType index, BaseType offset) const { 210 | if (extras(offset).is_used()) { 211 | return false; 212 | } 213 | 214 | BaseType relative_offset = index ^ offset; 215 | if ((relative_offset & LOWER_MASK) && (relative_offset & UPPER_MASK)) { 216 | return false; 217 | } 218 | 219 | // Finds a collision. 220 | for (SizeType i = 1; i < labels_.size(); ++i) { 221 | if (extras(offset ^ labels_[i]).is_fixed()) { 222 | return false; 223 | } 224 | } 225 | 226 | return true; 227 | } 228 | 229 | // Reserves an unused unit. 230 | void ReserveUnit(BaseType index) { 231 | if (index >= num_of_units()) { 232 | ExpandDictionary(); 233 | } 234 | 235 | // Removes an unused unit from a circular linked list. 236 | if (index == unfixed_index_) { 237 | unfixed_index_ = extras(index).next(); 238 | if (unfixed_index_ == index) { 239 | unfixed_index_ = num_of_units(); 240 | } 241 | } 242 | extras(extras(index).prev()).set_next(extras(index).next()); 243 | extras(extras(index).next()).set_prev(extras(index).prev()); 244 | extras(index).set_is_fixed(); 245 | } 246 | 247 | // Expands a dictionary. 248 | void ExpandDictionary() { 249 | BaseType src_num_of_units = num_of_units(); 250 | BaseType src_num_of_blocks = num_of_blocks(); 251 | 252 | BaseType dest_num_of_units = src_num_of_units + BLOCK_SIZE; 253 | BaseType dest_num_of_blocks = src_num_of_blocks + 1; 254 | 255 | // Fixes an old block. 256 | if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) { 257 | FixBlock(src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS); 258 | } 259 | 260 | units_.resize(dest_num_of_units); 261 | extras_.resize(dest_num_of_blocks, 0); 262 | 263 | // Allocates memory to a new block. 
264 | if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) { 265 | BaseType block_id = src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS; 266 | std::swap(extras_[block_id], extras_.back()); 267 | for (BaseType i = src_num_of_units; i < dest_num_of_units; ++i) { 268 | extras(i).clear(); 269 | } 270 | } else { 271 | extras_.back() = new DictionaryExtraUnit[BLOCK_SIZE]; 272 | } 273 | 274 | // Creates a circular linked list for a new block. 275 | for (BaseType i = src_num_of_units + 1; i < dest_num_of_units; ++i) { 276 | extras(i - 1).set_next(i); 277 | extras(i).set_prev(i - 1); 278 | } 279 | 280 | extras(src_num_of_units).set_prev(dest_num_of_units - 1); 281 | extras(dest_num_of_units - 1).set_next(src_num_of_units); 282 | 283 | // Merges 2 circular linked lists. 284 | extras(src_num_of_units).set_prev(extras(unfixed_index_).prev()); 285 | extras(dest_num_of_units - 1).set_next(unfixed_index_); 286 | 287 | extras(extras(unfixed_index_).prev()).set_next(src_num_of_units); 288 | extras(unfixed_index_).set_prev(dest_num_of_units - 1); 289 | } 290 | 291 | // Fixes all blocks to avoid invalid transitions. 292 | void FixAllBlocks() { 293 | BaseType begin = 0; 294 | if (num_of_blocks() > NUM_OF_UNFIXED_BLOCKS) { 295 | begin = num_of_blocks() - NUM_OF_UNFIXED_BLOCKS; 296 | } 297 | BaseType end = num_of_blocks(); 298 | 299 | for (BaseType block_id = begin; block_id != end; ++block_id) { 300 | FixBlock(block_id); 301 | } 302 | } 303 | 304 | // Adjusts labels of unused units in a given block. 305 | void FixBlock(BaseType block_id) { 306 | BaseType begin = block_id * BLOCK_SIZE; 307 | BaseType end = begin + BLOCK_SIZE; 308 | 309 | // Finds an unused offset. 310 | BaseType unused_offset_for_label = 0; 311 | for (BaseType offset = begin; offset != end; ++offset) { 312 | if (!extras(offset).is_used()) { 313 | unused_offset_for_label = offset; 314 | break; 315 | } 316 | } 317 | 318 | // Labels of unused units are modified. 
319 | for (BaseType index = begin; index != end; ++index) { 320 | if (!extras(index).is_fixed()) { 321 | ReserveUnit(index); 322 | units(index).set_label( 323 | static_cast(index ^ unused_offset_for_label)); 324 | ++num_of_unused_units_; 325 | } 326 | } 327 | } 328 | }; 329 | 330 | } // namespace dawgdic 331 | 332 | #endif // DAWGDIC_DICTIONARY_BUILDER_H 333 | -------------------------------------------------------------------------------- /lib/dawgdic/dictionary-extra-unit.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_DICTIONARY_EXTRA_UNIT_H 2 | #define DAWGDIC_DICTIONARY_EXTRA_UNIT_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | // Extra unit for building a dictionary. 9 | class DictionaryExtraUnit { 10 | public: 11 | DictionaryExtraUnit() : lo_values_(0), hi_values_(0) {} 12 | 13 | void clear() { 14 | lo_values_ = hi_values_ = 0; 15 | } 16 | 17 | // Sets if a unit is fixed or not. 18 | void set_is_fixed() { 19 | lo_values_ |= 1; 20 | } 21 | // Sets an index of the next unused unit. 22 | void set_next(BaseType next) { 23 | lo_values_ = (lo_values_ & 1) | (next << 1); 24 | } 25 | // Sets if an index is used as an offset or not. 26 | void set_is_used() { 27 | hi_values_ |= 1; 28 | } 29 | // Sets an index of the previous unused unit. 30 | void set_prev(BaseType prev) { 31 | hi_values_ = (hi_values_ & 1) | (prev << 1); 32 | } 33 | 34 | // Reads if a unit is fixed or not. 35 | bool is_fixed() const { 36 | return (lo_values_ & 1) == 1; 37 | } 38 | // Reads an index of the next unused unit. 39 | BaseType next() const { 40 | return lo_values_ >> 1; 41 | } 42 | // Reads if an index is used as an offset or not. 43 | bool is_used() const { 44 | return (hi_values_ & 1) == 1; 45 | } 46 | // Reads an index of the previous unused unit. 47 | BaseType prev() const { 48 | return hi_values_ >> 1; 49 | } 50 | 51 | private: 52 | BaseType lo_values_; 53 | BaseType hi_values_; 54 | 55 | // Copyable. 
56 | }; 57 | 58 | } // namespace dawgdic 59 | 60 | #endif // DAWGDIC_DICTIONARY_EXTRA_UNIT_H 61 | -------------------------------------------------------------------------------- /lib/dawgdic/dictionary-unit.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_DICTIONARY_UNIT_H 2 | #define DAWGDIC_DICTIONARY_UNIT_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | // Unit of a dictionary. 9 | class DictionaryUnit 10 | { 11 | public: 12 | static const BaseType OFFSET_MAX = static_cast(1) << 21; 13 | static const BaseType IS_LEAF_BIT = static_cast(1) << 31; 14 | static const BaseType HAS_LEAF_BIT = static_cast(1) << 8; 15 | static const BaseType EXTENSION_BIT = static_cast(1) << 9; 16 | 17 | DictionaryUnit() : base_(0) {} 18 | 19 | // Sets a flag to show that a unit has a leaf as a child. 20 | void set_has_leaf() { 21 | base_ |= HAS_LEAF_BIT; 22 | } 23 | // Sets a value to a leaf unit. 24 | void set_value(ValueType value) { 25 | base_ = static_cast(value) | IS_LEAF_BIT; 26 | } 27 | // Sets a label to a non-leaf unit. 28 | void set_label(UCharType label) { 29 | base_ = (base_ & ~static_cast(0xFF)) | label; 30 | } 31 | // Sets an offset to a non-leaf unit. 32 | bool set_offset(BaseType offset) { 33 | if (offset >= (OFFSET_MAX << 8)) { 34 | return false; 35 | } 36 | 37 | base_ &= IS_LEAF_BIT | HAS_LEAF_BIT | 0xFF; 38 | if (offset < OFFSET_MAX) { 39 | base_ |= (offset << 10); 40 | } else { 41 | base_ |= (offset << 2) | EXTENSION_BIT; 42 | } 43 | return true; 44 | } 45 | 46 | // Checks if a unit has a leaf as a child or not. 47 | bool has_leaf() const { 48 | return (base_ & HAS_LEAF_BIT) ? true : false; 49 | } 50 | // Checks if a unit corresponds to a leaf or not. 51 | ValueType value() const { 52 | return static_cast(base_ & ~IS_LEAF_BIT); 53 | } 54 | // Reads a label with a leaf flag from a non-leaf unit. 
55 | BaseType label() const { 56 | return base_ & (IS_LEAF_BIT | 0xFF); 57 | } 58 | // Reads an offset to child units from a non-leaf unit. 59 | BaseType offset() const { 60 | return (base_ >> 10) << ((base_ & EXTENSION_BIT) >> 6); 61 | } 62 | 63 | private: 64 | BaseType base_; 65 | 66 | // Copyable. 67 | }; 68 | 69 | } // namespace dawgdic 70 | 71 | #endif // DAWGDIC_DICTIONARY_UNIT_H 72 | -------------------------------------------------------------------------------- /lib/dawgdic/dictionary.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_DICTIONARY_H 2 | #define DAWGDIC_DICTIONARY_H 3 | 4 | #include 5 | #include 6 | 7 | #include "base-types.h" 8 | #include "dictionary-unit.h" 9 | 10 | namespace dawgdic { 11 | 12 | // Dictionary class for retrieval and binary I/O. 13 | class Dictionary { 14 | public: 15 | Dictionary() : units_(NULL), size_(0), units_buf_() {} 16 | 17 | const DictionaryUnit *units() const { 18 | return units_; 19 | } 20 | SizeType size() const { 21 | return size_; 22 | } 23 | SizeType total_size() const { 24 | return sizeof(DictionaryUnit) * size_; 25 | } 26 | SizeType file_size() const { 27 | return sizeof(BaseType) + total_size(); 28 | } 29 | 30 | // Root index. 31 | BaseType root() const { 32 | return 0; 33 | } 34 | 35 | // Checks if a given index is related to the end of a key. 36 | bool has_value(BaseType index) const { 37 | return units_[index].has_leaf(); 38 | } 39 | // Gets a value from a given index. 40 | ValueType value(BaseType index) const { 41 | return units_[index ^ units_[index].offset()].value(); 42 | } 43 | 44 | // Reads a dictionary from an input stream. 
45 | bool Read(std::istream *input) { 46 | BaseType base_size; 47 | if (!input->read(reinterpret_cast(&base_size), sizeof(BaseType))) { 48 | return false; 49 | } 50 | 51 | SizeType size = static_cast(base_size); 52 | std::vector units_buf(size); 53 | if (!input->read(reinterpret_cast(&units_buf[0]), 54 | sizeof(DictionaryUnit) * size)) { 55 | return false; 56 | } 57 | 58 | SwapUnitsBuf(&units_buf); 59 | return true; 60 | } 61 | 62 | // Writes a dictionry to an output stream. 63 | bool Write(std::ostream *output) const { 64 | BaseType base_size = static_cast(size_); 65 | if (!output->write(reinterpret_cast(&base_size), 66 | sizeof(BaseType))) { 67 | return false; 68 | } 69 | 70 | if (!output->write(reinterpret_cast(units_), 71 | sizeof(DictionaryUnit) * size_)) { 72 | return false; 73 | } 74 | 75 | return true; 76 | } 77 | 78 | // Exact matching. 79 | bool Contains(const CharType *key) const { 80 | BaseType index = root(); 81 | if (!Follow(key, &index)) { 82 | return false; 83 | } 84 | return has_value(index); 85 | } 86 | bool Contains(const CharType *key, SizeType length) const { 87 | BaseType index = root(); 88 | if (!Follow(key, length, &index)) { 89 | return false; 90 | } 91 | return has_value(index); 92 | } 93 | 94 | // Exact matching. 95 | ValueType Find(const CharType *key) const { 96 | BaseType index = root(); 97 | if (!Follow(key, &index)) { 98 | return -1; 99 | } 100 | return has_value(index) ? value(index) : -1; 101 | } 102 | ValueType Find(const CharType *key, SizeType length) const { 103 | BaseType index = root(); 104 | if (!Follow(key, length, &index)) { 105 | return -1; 106 | } 107 | return has_value(index) ? 
value(index) : -1; 108 | } 109 | bool Find(const CharType *key, ValueType *value) const { 110 | BaseType index = root(); 111 | if (!Follow(key, &index) || !has_value(index)) { 112 | return false; 113 | } 114 | *value = this->value(index); 115 | return true; 116 | } 117 | bool Find(const CharType *key, SizeType length, ValueType *value) const { 118 | BaseType index = root(); 119 | if (!Follow(key, length, &index) || !has_value(index)) { 120 | return false; 121 | } 122 | *value = this->value(index); 123 | return true; 124 | } 125 | 126 | // Follows a transition. 127 | bool Follow(CharType label, BaseType *index) const { 128 | BaseType next_index = 129 | *index ^ units_[*index].offset() ^ static_cast(label); 130 | if (units_[next_index].label() != static_cast(label)) { 131 | return false; 132 | } 133 | *index = next_index; 134 | return true; 135 | } 136 | 137 | // Follows transitions. 138 | bool Follow(const CharType *s, BaseType *index) const { 139 | while (*s != '\0' && Follow(*s, index)) { 140 | ++s; 141 | } 142 | return *s == '\0'; 143 | } 144 | bool Follow(const CharType *s, BaseType *index, SizeType *count) const { 145 | while (*s != '\0' && Follow(*s, index)) { 146 | ++s, ++*count; 147 | } 148 | return *s == '\0'; 149 | } 150 | 151 | // Follows transitions. 152 | bool Follow(const CharType *s, SizeType length, BaseType *index) const { 153 | for (SizeType i = 0; i < length; ++i) { 154 | if (!Follow(s[i], index)) { 155 | return false; 156 | } 157 | } 158 | return true; 159 | } 160 | bool Follow(const CharType *s, SizeType length, BaseType *index, 161 | SizeType *count) const { 162 | for (SizeType i = 0; i < length; ++i, ++*count) { 163 | if (!Follow(s[i], index)) { 164 | return false; 165 | } 166 | } 167 | return true; 168 | } 169 | 170 | // Maps memory with its size. 
171 | void Map(const void *address) { 172 | Clear(); 173 | units_ = reinterpret_cast( 174 | static_cast(address) + 1); 175 | size_ = *static_cast(address); 176 | } 177 | void Map(const void *address, SizeType size) { 178 | Clear(); 179 | units_ = static_cast(address); 180 | size_ = size; 181 | } 182 | 183 | // Initializes a dictionary. 184 | void Clear() { 185 | units_ = NULL; 186 | size_ = 0; 187 | std::vector(0).swap(units_buf_); 188 | } 189 | 190 | // Swaps dictionaries. 191 | void Swap(Dictionary *dic) { 192 | std::swap(units_, dic->units_); 193 | std::swap(size_, dic->size_); 194 | units_buf_.swap(dic->units_buf_); 195 | } 196 | 197 | // Shrinks a vector. 198 | void Shrink() { 199 | if (units_buf_.size() == units_buf_.capacity()) { 200 | return; 201 | } 202 | 203 | std::vector units_buf(units_buf_); 204 | SwapUnitsBuf(&units_buf); 205 | } 206 | 207 | public: 208 | // Following member function is called from DawgBuilder. 209 | 210 | // Swaps buffers for units. 211 | void SwapUnitsBuf(std::vector *units_buf) { 212 | units_ = &(*units_buf)[0]; 213 | size_ = static_cast(units_buf->size()); 214 | units_buf_.swap(*units_buf); 215 | } 216 | 217 | private: 218 | const DictionaryUnit *units_; 219 | SizeType size_; 220 | std::vector units_buf_; 221 | 222 | // Disallows copies. 223 | Dictionary(const Dictionary &); 224 | Dictionary &operator=(const Dictionary &); 225 | }; 226 | 227 | } // namespace dawgdic 228 | 229 | #endif // DAWGDIC_DICTIONARY_H 230 | -------------------------------------------------------------------------------- /lib/dawgdic/guide-builder.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_GUIDE_BUILDER_H 2 | #define DAWGDIC_GUIDE_BUILDER_H 3 | 4 | #include "guide.h" 5 | #include "dawg.h" 6 | #include "dictionary.h" 7 | 8 | #include 9 | 10 | namespace dawgdic { 11 | 12 | class GuideBuilder { 13 | public: 14 | // Builds a dictionary for completing keys. 
15 | static bool Build(const Dawg &dawg, const Dictionary &dic, Guide *guide) { 16 | GuideBuilder builder(dawg, dic, guide); 17 | return builder.BuildGuide(); 18 | } 19 | 20 | private: 21 | const Dawg &dawg_; 22 | const Dictionary &dic_; 23 | Guide *guide_; 24 | 25 | std::vector units_; 26 | std::vector is_fixed_table_; 27 | 28 | // Disallows copies. 29 | GuideBuilder(const GuideBuilder &); 30 | GuideBuilder &operator=(const GuideBuilder &); 31 | 32 | GuideBuilder(const Dawg &dawg, const Dictionary &dic, Guide *guide) 33 | : dawg_(dawg), dic_(dic), guide_(guide), units_(), is_fixed_table_() {} 34 | 35 | bool BuildGuide() { 36 | // Initializes units and flags. 37 | units_.resize(dic_.size()); 38 | is_fixed_table_.resize(dic_.size() / 8, '\0'); 39 | 40 | if (dawg_.size() <= 1) { 41 | return true; 42 | } 43 | 44 | if (!BuildGuide(dawg_.root(), dic_.root())) { 45 | return false; 46 | } 47 | 48 | guide_->SwapUnitsBuf(&units_); 49 | return true; 50 | } 51 | 52 | // Builds a guide recursively. 53 | bool BuildGuide(BaseType dawg_index, BaseType dic_index) { 54 | if (is_fixed(dic_index)) { 55 | return true; 56 | } 57 | set_is_fixed(dic_index); 58 | 59 | // Finds the first non-terminal child. 
60 | BaseType dawg_child_index = dawg_.child(dawg_index); 61 | if (dawg_.label(dawg_child_index) == '\0') { 62 | dawg_child_index = dawg_.sibling(dawg_child_index); 63 | if (dawg_child_index == 0) { 64 | return true; 65 | } 66 | } 67 | units_[dic_index].set_child(dawg_.label(dawg_child_index)); 68 | 69 | do { 70 | UCharType child_label = dawg_.label(dawg_child_index); 71 | BaseType dic_child_index = dic_index; 72 | if (!dic_.Follow(child_label, &dic_child_index)) { 73 | return false; 74 | } 75 | 76 | if (!BuildGuide(dawg_child_index, dic_child_index)) { 77 | return false; 78 | } 79 | 80 | BaseType dawg_sibling_index = dawg_.sibling(dawg_child_index); 81 | UCharType sibling_label = dawg_.label(dawg_sibling_index); 82 | if (dawg_sibling_index != 0) { 83 | units_[dic_child_index].set_sibling(sibling_label); 84 | } 85 | 86 | dawg_child_index = dawg_sibling_index; 87 | } while (dawg_child_index != 0); 88 | 89 | return true; 90 | } 91 | 92 | void set_is_fixed(BaseType index) { 93 | is_fixed_table_[index / 8] |= 1 << (index % 8); 94 | } 95 | 96 | bool is_fixed(BaseType index) const { 97 | return (is_fixed_table_[index / 8] & (1 << (index % 8))) != 0; 98 | } 99 | }; 100 | 101 | } // namespace dawgdic 102 | 103 | #endif // DAWGDIC_GUIDE_BUILDER_H 104 | -------------------------------------------------------------------------------- /lib/dawgdic/guide-unit.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_GUIDE_UNIT_H 2 | #define DAWGDIC_GUIDE_UNIT_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | class GuideUnit { 9 | public: 10 | GuideUnit() : child_('\0'), sibling_('\0') {} 11 | 12 | void set_child(UCharType child) { 13 | child_ = child; 14 | } 15 | void set_sibling(UCharType sibling) { 16 | sibling_ = sibling; 17 | } 18 | 19 | UCharType child() const { 20 | return child_; 21 | } 22 | UCharType sibling() const { 23 | return sibling_; 24 | } 25 | 26 | private: 27 | UCharType child_; 28 | UCharType 
sibling_; 29 | 30 | // Copyable. 31 | }; 32 | 33 | } // namespace dawgdic 34 | 35 | #endif // DAWGDIC_GUIDE_UNIT_H 36 | -------------------------------------------------------------------------------- /lib/dawgdic/guide.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_GUIDE_H 2 | #define DAWGDIC_GUIDE_H 3 | 4 | #include "dictionary.h" 5 | #include "guide-unit.h" 6 | 7 | #include 8 | #include 9 | 10 | namespace dawgdic { 11 | 12 | class Guide { 13 | public: 14 | Guide() : units_(NULL), size_(0), units_buf_() {} 15 | 16 | const GuideUnit *units() const { 17 | return units_; 18 | } 19 | SizeType size() const { 20 | return size_; 21 | } 22 | SizeType total_size() const { 23 | return sizeof(GuideUnit) * size_; 24 | } 25 | SizeType file_size() const { 26 | return sizeof(BaseType) + total_size(); 27 | } 28 | 29 | // The root index. 30 | BaseType root() const { 31 | return 0; 32 | } 33 | 34 | UCharType child(BaseType index) const { 35 | return units_[index].child(); 36 | } 37 | UCharType sibling(BaseType index) const { 38 | return units_[index].sibling(); 39 | } 40 | 41 | // Reads a dictionary from an input stream. 42 | bool Read(std::istream *input) { 43 | BaseType base_size; 44 | if (!input->read(reinterpret_cast(&base_size), sizeof(BaseType))) { 45 | return false; 46 | } 47 | 48 | SizeType size = static_cast(base_size); 49 | std::vector units_buf(size); 50 | if (!input->read(reinterpret_cast(&units_buf[0]), 51 | sizeof(GuideUnit) * size)) { 52 | return false; 53 | } 54 | 55 | SwapUnitsBuf(&units_buf); 56 | return true; 57 | } 58 | 59 | // Writes a dictionry to an output stream. 
60 | bool Write(std::ostream *output) const { 61 | BaseType base_size = static_cast(size_); 62 | if (!output->write(reinterpret_cast(&base_size), 63 | sizeof(BaseType))) { 64 | return false; 65 | } 66 | 67 | if (!output->write(reinterpret_cast(units_), 68 | sizeof(GuideUnit) * size_)) { 69 | return false; 70 | } 71 | 72 | return true; 73 | } 74 | 75 | // Maps memory with its size. 76 | void Map(const void *address) { 77 | Clear(); 78 | units_ = reinterpret_cast( 79 | static_cast(address) + 1); 80 | size_ = *static_cast(address); 81 | } 82 | void Map(const void *address, SizeType size) { 83 | Clear(); 84 | units_ = static_cast(address); 85 | size_ = size; 86 | } 87 | 88 | // Swaps Guides. 89 | void Swap(Guide *guide) { 90 | std::swap(units_, guide->units_); 91 | std::swap(size_, guide->size_); 92 | units_buf_.swap(guide->units_buf_); 93 | } 94 | 95 | // Initializes a Guide. 96 | void Clear() { 97 | units_ = NULL; 98 | size_ = 0; 99 | std::vector(0).swap(units_buf_); 100 | } 101 | 102 | public: 103 | // Following member function is called from DawgBuilder. 104 | 105 | // Swaps buffers for units. 106 | void SwapUnitsBuf(std::vector *units_buf) { 107 | units_ = &(*units_buf)[0]; 108 | size_ = static_cast(units_buf->size()); 109 | units_buf_.swap(*units_buf); 110 | } 111 | 112 | private: 113 | const GuideUnit *units_; 114 | SizeType size_; 115 | std::vector units_buf_; 116 | 117 | // Disables copies. 
118 | Guide(const Guide &); 119 | Guide &operator=(const Guide &); 120 | }; 121 | 122 | } // namespace dawgdic 123 | 124 | #endif // DAWGDIC_GUIDE_H 125 | -------------------------------------------------------------------------------- /lib/dawgdic/link-table.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_LINK_TABLE_H 2 | #define DAWGDIC_LINK_TABLE_H 3 | 4 | #include "base-types.h" 5 | #include "dictionary-unit.h" 6 | 7 | #include 8 | 9 | namespace dawgdic { 10 | 11 | class LinkTable { 12 | public: 13 | explicit LinkTable() : hash_table_() {} 14 | 15 | // Initializes a hash table. 16 | void Init(SizeType table_size) { 17 | PairType initial_pair(0, 0); 18 | std::vector table(table_size, initial_pair); 19 | hash_table_.swap(table); 20 | } 21 | 22 | // Finds an offset that corresponds to a given index. 23 | BaseType Find(BaseType index) const { 24 | BaseType hash_id = FindId(index); 25 | return hash_table_[hash_id].second; 26 | } 27 | 28 | // Inserts an index with its offset. 29 | void Insert(BaseType index, BaseType offset) { 30 | BaseType hash_id = FindId(index); 31 | hash_table_[hash_id].first = index; 32 | hash_table_[hash_id].second = offset; 33 | } 34 | 35 | private: 36 | typedef std::pair PairType; 37 | 38 | std::vector hash_table_; 39 | 40 | // Disallows copies. 41 | LinkTable(const LinkTable &); 42 | LinkTable &operator=(const LinkTable &); 43 | 44 | // Finds an Id from an upper table. 45 | BaseType FindId(BaseType index) const { 46 | BaseType hash_id = Hash(index) % hash_table_.size(); 47 | while (hash_table_[hash_id].first != 0) { 48 | if (index == hash_table_[hash_id].first) { 49 | return hash_id; 50 | } 51 | hash_id = (hash_id + 1) % hash_table_.size(); 52 | } 53 | return hash_id; 54 | } 55 | 56 | // 32-bit mix function. 
57 | // http://www.concentric.net/~Ttwang/tech/inthash.htm 58 | static BaseType Hash(BaseType key) { 59 | key = ~key + (key << 15); // key = (key << 15) - key - 1; 60 | key = key ^ (key >> 12); 61 | key = key + (key << 2); 62 | key = key ^ (key >> 4); 63 | key = key * 2057; // key = (key + (key << 3)) + (key << 11); 64 | key = key ^ (key >> 16); 65 | return key; 66 | } 67 | }; 68 | 69 | } // namespace dawgdic 70 | 71 | #endif // DAWGDIC_LINK_TABLE_H 72 | -------------------------------------------------------------------------------- /lib/dawgdic/object-pool.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_OBJECT_POOL_H 2 | #define DAWGDIC_OBJECT_POOL_H 3 | 4 | #include 5 | 6 | #include "base-types.h" 7 | 8 | namespace dawgdic { 9 | 10 | // This class works like an array of objects with compact memory management. 11 | template 12 | class ObjectPool { 13 | public: 14 | typedef OBJECT_TYPE ObjectType; 15 | 16 | ObjectPool() : blocks_(), size_(0) {} 17 | ~ObjectPool() { 18 | Clear(); 19 | } 20 | 21 | // Accessors. 22 | ObjectType &operator[](SizeType index) { 23 | return blocks_[index / BLOCK_SIZE][index % BLOCK_SIZE]; 24 | } 25 | const ObjectType &operator[](SizeType index) const { 26 | return blocks_[index / BLOCK_SIZE][index % BLOCK_SIZE]; 27 | } 28 | 29 | // Number of allocated objects. 30 | SizeType size() const { 31 | return size_; 32 | } 33 | 34 | // Deletes all objects and frees memory. 35 | void Clear() { 36 | for (SizeType i = 0; i < blocks_.size(); ++i) { 37 | delete [] blocks_[i]; 38 | } 39 | 40 | std::vector(0).swap(blocks_); 41 | size_ = 0; 42 | } 43 | 44 | // Swaps object pools. 45 | void Swap(ObjectPool *pool) { 46 | blocks_.swap(pool->blocks_); 47 | std::swap(size_, pool->size_); 48 | } 49 | 50 | // Allocates memory for a new object and returns its ID. 
51 | SizeType Allocate() { 52 | if (size_ == BLOCK_SIZE * blocks_.size()) { 53 | blocks_.push_back(new ObjectType[BLOCK_SIZE]); 54 | } 55 | return size_++; 56 | } 57 | 58 | private: 59 | std::vector blocks_; 60 | SizeType size_; 61 | 62 | // Disallows copies. 63 | ObjectPool(const ObjectPool &); 64 | ObjectPool &operator=(const ObjectPool &); 65 | }; 66 | 67 | } // namespace dawgdic 68 | 69 | #endif // DAWGDIC_OBJECT_POOL_H 70 | -------------------------------------------------------------------------------- /lib/dawgdic/ranked-completer-candidate.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_RANKED_COMPLETER_CANDIDATE_H 2 | #define DAWGDIC_RANKED_COMPLETER_CANDIDATE_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | class RankedCompleterCandidate { 9 | public: 10 | RankedCompleterCandidate() : node_index_(0), value_(-1) {} 11 | 12 | void set_node_index(BaseType node_index) { 13 | node_index_ = node_index; 14 | } 15 | void set_value(ValueType value) { 16 | value_ = value; 17 | } 18 | 19 | BaseType node_index() const { 20 | return node_index_; 21 | } 22 | ValueType value() const { 23 | return value_; 24 | } 25 | 26 | template 27 | class Comparer { 28 | public: 29 | typedef VALUE_COMPARER_TYPE ValueComparerType; 30 | 31 | explicit Comparer(ValueComparerType value_comparer) 32 | : value_comparer_(value_comparer) {} 33 | 34 | bool operator()(const RankedCompleterCandidate &lhs, 35 | const RankedCompleterCandidate &rhs) const { 36 | if (lhs.value() != rhs.value()) { 37 | return value_comparer_(lhs.value(), rhs.value()); 38 | } 39 | return lhs.node_index() > rhs.node_index(); 40 | } 41 | 42 | private: 43 | ValueComparerType value_comparer_; 44 | }; 45 | 46 | template 47 | static Comparer MakeComparer( 48 | VALUE_COMPARER_TYPE value_comparer) { 49 | return Comparer(value_comparer); 50 | } 51 | 52 | private: 53 | BaseType node_index_; 54 | ValueType value_; 55 | 56 | // Copyable. 
57 | }; 58 | 59 | } // namespace dawgdic 60 | 61 | #endif // DAWGDIC_RANKED_COMPLETER_CANDIDATE_H 62 | -------------------------------------------------------------------------------- /lib/dawgdic/ranked-completer-node.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_RANKED_COMPLETER_NODE_H 2 | #define DAWGDIC_RANKED_COMPLETER_NODE_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | class RankedCompleterNode { 9 | public: 10 | RankedCompleterNode() 11 | : dic_index_(0), prev_node_index_(0), 12 | label_('\0'), is_queued_(false), has_terminal_(false) {} 13 | 14 | void set_dic_index(BaseType dic_index) { 15 | dic_index_ = dic_index; 16 | } 17 | void set_prev_node_index(BaseType prev_node_index) { 18 | prev_node_index_ = prev_node_index; 19 | } 20 | void set_label(UCharType label) { 21 | label_ = label; 22 | } 23 | void set_is_queued() { 24 | is_queued_ = true; 25 | } 26 | void set_has_terminal(bool has_terminal) { 27 | has_terminal_ = has_terminal; 28 | } 29 | 30 | BaseType dic_index() const { 31 | return dic_index_; 32 | } 33 | BaseType prev_node_index() const { 34 | return prev_node_index_; 35 | } 36 | UCharType label() const { 37 | return label_; 38 | } 39 | bool is_queued() const { 40 | return is_queued_; 41 | } 42 | bool has_terminal() const { 43 | return has_terminal_; 44 | } 45 | 46 | private: 47 | BaseType dic_index_; 48 | BaseType prev_node_index_; 49 | UCharType label_; 50 | bool is_queued_; 51 | bool has_terminal_; 52 | 53 | // Copyable. 
54 | }; 55 | 56 | } // namespace dawgdic 57 | 58 | #endif // DAWGDIC_RANKED_COMPLETER_NODE_H 59 | -------------------------------------------------------------------------------- /lib/dawgdic/ranked-completer.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_RANKED_COMPLETER_H 2 | #define DAWGDIC_RANKED_COMPLETER_H 3 | 4 | #include "dictionary.h" 5 | #include "ranked-completer-candidate.h" 6 | #include "ranked-completer-node.h" 7 | #include "ranked-guide.h" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace dawgdic { 15 | 16 | template > 17 | class RankedCompleterBase { 18 | public: 19 | typedef VALUE_COMPARER_TYPE ValueComparerType; 20 | 21 | explicit RankedCompleterBase( 22 | ValueComparerType value_comparer = ValueComparerType()) 23 | : dic_(NULL), guide_(NULL), key_(), prefix_length_(0), value_(-1), 24 | nodes_(), node_queue_(), candidate_queue_( 25 | RankedCompleterCandidate::MakeComparer(value_comparer)) {} 26 | RankedCompleterBase(const Dictionary &dic, const RankedGuide &guide, 27 | ValueComparerType value_comparer = ValueComparerType()) 28 | : dic_(&dic), guide_(&guide), key_(), prefix_length_(0), value_(-1), 29 | nodes_(), node_queue_(), candidate_queue_( 30 | RankedCompleterCandidate::MakeComparer(value_comparer)) {} 31 | 32 | void set_dic(const Dictionary &dic) { 33 | dic_ = &dic; 34 | } 35 | void set_guide(const RankedGuide &guide) { 36 | guide_ = &guide; 37 | } 38 | 39 | const Dictionary &dic() const { 40 | return *dic_; 41 | } 42 | const RankedGuide &guide() const { 43 | return *guide_; 44 | } 45 | 46 | // These member functions are available only when Next() returns true. 47 | const char *key() const { 48 | return reinterpret_cast(&key_[0]); 49 | } 50 | SizeType length() const { 51 | return key_.size() - 1; 52 | } 53 | ValueType value() const { 54 | return value_; 55 | } 56 | 57 | // Starts completing keys from given index and prefix. 
58 | void Start(BaseType index, const char *prefix = "") { 59 | SizeType length = 0; 60 | for (const char *p = prefix; *p != '\0'; ++p) { 61 | ++length; 62 | } 63 | 64 | Start(index, prefix, length); 65 | } 66 | void Start(BaseType index, const char *prefix, SizeType length) { 67 | key_.resize(length); 68 | for (SizeType i = 0; i < length; ++i) { 69 | key_[i] = prefix[i]; 70 | } 71 | prefix_length_ = length; 72 | value_ = -1; 73 | 74 | nodes_.clear(); 75 | node_queue_.clear(); 76 | while (!candidate_queue_.empty()) { 77 | candidate_queue_.pop(); 78 | } 79 | 80 | if (guide_->size() != 0) { 81 | CreateNode(index, 0, 'X'); 82 | EnqueueNode(0); 83 | } 84 | } 85 | 86 | // Gets the next key. 87 | bool Next() { 88 | for (SizeType i = 0; i < node_queue_.size(); ++i) { 89 | BaseType node_index = node_queue_[i]; 90 | if (value_ != -1 && !FindSibling(&node_index)) { 91 | continue; 92 | } 93 | node_index = FindTerminal(node_index); 94 | EnqueueCandidate(node_index); 95 | } 96 | node_queue_.clear(); 97 | 98 | // Returns false if there is no candidate. 
99 | if (candidate_queue_.empty()) { 100 | return false; 101 | } 102 | 103 | const RankedCompleterCandidate &candidate = candidate_queue_.top(); 104 | 105 | BaseType node_index = candidate.node_index(); 106 | EnqueueNode(node_index); 107 | node_index = nodes_[node_index].prev_node_index(); 108 | 109 | key_.resize(prefix_length_); 110 | while (node_index != 0) { 111 | key_.push_back(nodes_[node_index].label()); 112 | EnqueueNode(node_index); 113 | node_index = nodes_[node_index].prev_node_index(); 114 | } 115 | std::reverse(key_.begin() + prefix_length_, key_.end()); 116 | key_.push_back('\0'); 117 | 118 | value_ = candidate.value(); 119 | candidate_queue_.pop(); 120 | 121 | return true; 122 | } 123 | 124 | private: 125 | const Dictionary *dic_; 126 | const RankedGuide *guide_; 127 | std::vector key_; 128 | SizeType prefix_length_; 129 | ValueType value_; 130 | 131 | std::vector nodes_; 132 | std::vector node_queue_; 133 | std::priority_queue, 135 | RankedCompleterCandidate::Comparer > 136 | candidate_queue_; 137 | 138 | // Disallows copies. 139 | RankedCompleterBase(const RankedCompleterBase &); 140 | RankedCompleterBase &operator=(const RankedCompleterBase &); 141 | 142 | // Pushes a node to queue. 143 | void EnqueueNode(BaseType node_index) { 144 | if (nodes_[node_index].is_queued()) { 145 | return; 146 | } 147 | 148 | node_queue_.push_back(node_index); 149 | nodes_[node_index].set_is_queued(); 150 | } 151 | 152 | // Pushes a candidate to priority queue. 153 | void EnqueueCandidate(BaseType node_index) { 154 | RankedCompleterCandidate candidate; 155 | candidate.set_node_index(node_index); 156 | candidate.set_value( 157 | dic_->units()[nodes_[node_index].dic_index()].value()); 158 | candidate_queue_.push(candidate); 159 | } 160 | 161 | // Finds a sibling of a given node. 
162 | bool FindSibling(BaseType *node_index) { 163 | BaseType prev_node_index = nodes_[*node_index].prev_node_index(); 164 | BaseType dic_index = nodes_[*node_index].dic_index(); 165 | 166 | UCharType sibling_label = guide_->sibling(dic_index); 167 | if (sibling_label == '\0') { 168 | if (!nodes_[prev_node_index].has_terminal()) { 169 | return false; 170 | } 171 | nodes_[prev_node_index].set_has_terminal(false); 172 | } 173 | 174 | // Follows a transition to sibling and creates a node for the sibling. 175 | BaseType dic_prev_index = nodes_[prev_node_index].dic_index(); 176 | dic_index = FollowWithoutCheck(dic_prev_index, sibling_label); 177 | *node_index = CreateNode(dic_index, prev_node_index, sibling_label); 178 | 179 | return true; 180 | } 181 | 182 | // Follows transitions and finds a terminal. 183 | BaseType FindTerminal(BaseType node_index) { 184 | while (nodes_[node_index].label() != '\0') { 185 | BaseType dic_index = nodes_[node_index].dic_index(); 186 | UCharType child_label = guide_->child(dic_index); 187 | if (child_label == '\0') { 188 | nodes_[node_index].set_has_terminal(false); 189 | } 190 | 191 | // Follows a transition to child and creates a node for the child. 192 | dic_index = FollowWithoutCheck(dic_index, child_label); 193 | node_index = CreateNode(dic_index, node_index, child_label); 194 | } 195 | return node_index; 196 | } 197 | 198 | // Follows a transition without any check. 199 | BaseType FollowWithoutCheck(BaseType index, UCharType label) const { 200 | return index ^ dic_->units()[index].offset() ^ label; 201 | } 202 | 203 | // Creates a node. 
204 | BaseType CreateNode(BaseType dic_index, BaseType prev_node_index, 205 | UCharType label) { 206 | RankedCompleterNode node; 207 | node.set_dic_index(dic_index); 208 | node.set_prev_node_index(prev_node_index); 209 | node.set_label(label); 210 | if (node.label() != '\0') { 211 | node.set_has_terminal(dic_->has_value(node.dic_index())); 212 | } 213 | nodes_.push_back(node); 214 | 215 | return static_cast(nodes_.size() - 1); 216 | } 217 | }; 218 | 219 | typedef RankedCompleterBase<> RankedCompleter; 220 | 221 | } // namespace dawgdic 222 | 223 | #endif // DAWGDIC_RANKED_COMPLETER_H 224 | -------------------------------------------------------------------------------- /lib/dawgdic/ranked-guide-builder.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_RANKED_GUIDE_BUILDER_H 2 | #define DAWGDIC_RANKED_GUIDE_BUILDER_H 3 | 4 | #include "dawg.h" 5 | #include "dictionary.h" 6 | #include "ranked-guide.h" 7 | #include "ranked-guide-link.h" 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace dawgdic { 14 | 15 | class RankedGuideBuilder { 16 | public: 17 | // Builds a dictionary for completing keys. 18 | static bool Build(const Dawg &dawg, const Dictionary &dic, 19 | RankedGuide *guide) { 20 | return Build(dawg, dic, guide, std::less()); 21 | } 22 | 23 | // Builds a dictionary for completing keys. 24 | template 25 | static bool Build(const Dawg &dawg, const Dictionary &dic, 26 | RankedGuide *guide, VALUE_COMPARER_TYPE value_comparer) { 27 | RankedGuideBuilder builder(dawg, dic, guide); 28 | return builder.BuildRankedGuide(value_comparer); 29 | } 30 | 31 | private: 32 | const Dawg &dawg_; 33 | const Dictionary &dic_; 34 | RankedGuide *guide_; 35 | 36 | std::vector units_; 37 | std::vector links_; 38 | std::vector is_fixed_table_; 39 | 40 | // Disallows copies. 
41 | RankedGuideBuilder(const RankedGuideBuilder &); 42 | RankedGuideBuilder &operator=(const RankedGuideBuilder &); 43 | 44 | RankedGuideBuilder(const Dawg &dawg, const Dictionary &dic, 45 | RankedGuide *guide) 46 | : dawg_(dawg), dic_(dic), guide_(guide), 47 | units_(), links_(), is_fixed_table_() {} 48 | 49 | template 50 | bool BuildRankedGuide(VALUE_COMPARER_TYPE value_comparer) { 51 | // Initializes units and flags. 52 | units_.resize(dic_.size()); 53 | is_fixed_table_.resize(dic_.size() / 8, '\0'); 54 | 55 | if (dawg_.size() <= 1) { 56 | return true; 57 | } 58 | 59 | ValueType max_value = -1; 60 | if (!BuildRankedGuide(dawg_.root(), dic_.root(), 61 | &max_value, value_comparer)) { 62 | return false; 63 | } 64 | 65 | guide_->SwapUnitsBuf(&units_); 66 | return true; 67 | } 68 | 69 | // Builds a guide recursively. 70 | template 71 | bool BuildRankedGuide(BaseType dawg_index, BaseType dic_index, 72 | ValueType *max_value, 73 | VALUE_COMPARER_TYPE value_comparer) { 74 | if (is_fixed(dic_index)) { 75 | return FindMaxValue(dic_index, max_value); 76 | } 77 | set_is_fixed(dic_index); 78 | 79 | SizeType initial_num_links = links_.size(); 80 | 81 | // Enumerates links to the next states. 82 | if (!EnumerateLinks(dawg_index, dic_index, value_comparer)) { 83 | return false; 84 | } 85 | 86 | std::stable_sort(links_.begin() + initial_num_links, links_.end(), 87 | RankedGuideLink::MakeComparer(value_comparer)); 88 | 89 | // Reflects links into units. 90 | if (!TurnLinksToUnits(dic_index, initial_num_links)) { 91 | return false; 92 | } 93 | 94 | *max_value = links_[initial_num_links].value(); 95 | links_.resize(initial_num_links); 96 | 97 | return true; 98 | } 99 | 100 | // Finds the maximum value by using fixed units. 
101 | bool FindMaxValue(BaseType dic_index, ValueType *max_value) const { 102 | while (units_[dic_index].child() != '\0') { 103 | UCharType child_label = units_[dic_index].child(); 104 | if (!dic_.Follow(child_label, &dic_index)) { 105 | return false; 106 | } 107 | } 108 | if (!dic_.has_value(dic_index)) { 109 | return false; 110 | } 111 | *max_value = dic_.value(dic_index); 112 | return true; 113 | } 114 | 115 | // Enumerates links to the next states. 116 | template 117 | bool EnumerateLinks(BaseType dawg_index, BaseType dic_index, 118 | VALUE_COMPARER_TYPE value_comparer) { 119 | for (BaseType dawg_child_index = dawg_.child(dawg_index); 120 | dawg_child_index != 0; 121 | dawg_child_index = dawg_.sibling(dawg_child_index)) { 122 | ValueType value = -1; 123 | UCharType child_label = dawg_.label(dawg_child_index); 124 | if (child_label == '\0') { 125 | if (!dic_.has_value(dic_index)) { 126 | return false; 127 | } 128 | value = dic_.value(dic_index); 129 | } else { 130 | BaseType dic_child_index = dic_index; 131 | if (!dic_.Follow(child_label, &dic_child_index)) { 132 | return false; 133 | } 134 | 135 | if (!BuildRankedGuide(dawg_child_index, dic_child_index, 136 | &value, value_comparer)) { 137 | return false; 138 | } 139 | } 140 | links_.push_back(RankedGuideLink(child_label, value)); 141 | } 142 | 143 | return true; 144 | } 145 | 146 | // Modifies units. 147 | bool TurnLinksToUnits(BaseType dic_index, SizeType links_begin) { 148 | // The first child. 149 | UCharType first_label = links_[links_begin].label(); 150 | units_[dic_index].set_child(first_label); 151 | BaseType dic_child_index = FollowWithoutCheck(dic_index, first_label); 152 | 153 | // Other children. 
154 | for (SizeType i = links_begin + 1; i < links_.size(); ++i) { 155 | UCharType sibling_label = links_[i].label(); 156 | 157 | BaseType dic_sibling_index = 158 | FollowWithoutCheck(dic_index, sibling_label); 159 | units_[dic_child_index].set_sibling(sibling_label); 160 | dic_child_index = dic_sibling_index; 161 | } 162 | 163 | return true; 164 | } 165 | 166 | // Follows a transition without any check. 167 | BaseType FollowWithoutCheck(BaseType index, UCharType label) const { 168 | return index ^ dic_.units()[index].offset() ^ label; 169 | } 170 | 171 | void set_is_fixed(BaseType index) { 172 | is_fixed_table_[index / 8] |= 1 << (index % 8); 173 | } 174 | 175 | bool is_fixed(BaseType index) const { 176 | return (is_fixed_table_[index / 8] & (1 << (index % 8))) != 0; 177 | } 178 | }; 179 | 180 | } // namespace dawgdic 181 | 182 | #endif // DAWGDIC_RANKED_GUIDE_BUILDER_H 183 | -------------------------------------------------------------------------------- /lib/dawgdic/ranked-guide-link.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_RANKED_GUIDE_LINK_H 2 | #define DAWGDIC_RANKED_GUIDE_LINK_H 3 | 4 | namespace dawgdic { 5 | 6 | class RankedGuideLink { 7 | public: 8 | RankedGuideLink() : label_('\0'), value_(-1) {} 9 | RankedGuideLink(UCharType label, ValueType value) 10 | : label_(label), value_(value) {} 11 | 12 | void set_label(UCharType label) { 13 | label_ = label; 14 | } 15 | void set_value(ValueType value) { 16 | value_ = value; 17 | } 18 | 19 | UCharType label() const { 20 | return label_; 21 | } 22 | ValueType value() const { 23 | return value_; 24 | } 25 | 26 | // For sortings links in descending value order. 
27 | template 28 | class Comparer { 29 | public: 30 | typedef VALUE_COMPARER_TYPE ValueComparerType; 31 | 32 | explicit Comparer(ValueComparerType value_comparer) 33 | : value_comparer_(value_comparer) {} 34 | 35 | bool operator()(const RankedGuideLink &lhs, 36 | const RankedGuideLink &rhs) const { 37 | if (lhs.value() != rhs.value()) { 38 | return value_comparer_(rhs.value(), lhs.value()); 39 | } 40 | return lhs.label() < rhs.label(); 41 | } 42 | 43 | private: 44 | ValueComparerType value_comparer_; 45 | }; 46 | 47 | template 48 | static Comparer MakeComparer( 49 | VALUE_COMPARER_TYPE value_comparer) { 50 | return Comparer(value_comparer); 51 | } 52 | 53 | private: 54 | UCharType label_; 55 | ValueType value_; 56 | 57 | // Copyable. 58 | }; 59 | 60 | } // namespace dawgdic 61 | 62 | #endif // DAWGDIC_RANKED_GUIDE_LINK_H 63 | -------------------------------------------------------------------------------- /lib/dawgdic/ranked-guide-unit.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_RANKED_GUIDE_UNIT_H 2 | #define DAWGDIC_RANKED_GUIDE_UNIT_H 3 | 4 | #include "base-types.h" 5 | 6 | namespace dawgdic { 7 | 8 | class RankedGuideUnit { 9 | public: 10 | RankedGuideUnit() : child_('\0'), sibling_('\0') {} 11 | 12 | void set_child(UCharType child) { 13 | child_ = child; 14 | } 15 | void set_sibling(UCharType sibling) { 16 | sibling_ = sibling; 17 | } 18 | 19 | UCharType child() const { 20 | return child_; 21 | } 22 | UCharType sibling() const { 23 | return sibling_; 24 | } 25 | 26 | private: 27 | UCharType child_; 28 | UCharType sibling_; 29 | 30 | // Copyable. 
31 | }; 32 | 33 | } // namespace dawgdic 34 | 35 | #endif // DAWGDIC_RANKED_GUIDE_UNIT_H 36 | -------------------------------------------------------------------------------- /lib/dawgdic/ranked-guide.h: -------------------------------------------------------------------------------- 1 | #ifndef DAWGDIC_RANKED_GUIDE_H 2 | #define DAWGDIC_RANKED_GUIDE_H 3 | 4 | #include "dictionary.h" 5 | #include "ranked-guide-unit.h" 6 | 7 | #include 8 | #include 9 | 10 | namespace dawgdic { 11 | 12 | class RankedGuide { 13 | public: 14 | RankedGuide() : units_(NULL), size_(0), units_buf_() {} 15 | 16 | const RankedGuideUnit *units() const { 17 | return units_; 18 | } 19 | SizeType size() const { 20 | return size_; 21 | } 22 | SizeType total_size() const { 23 | return sizeof(RankedGuideUnit) * size_; 24 | } 25 | SizeType file_size() const { 26 | return sizeof(BaseType) + total_size(); 27 | } 28 | 29 | // The root index. 30 | BaseType root() const { 31 | return 0; 32 | } 33 | 34 | UCharType child(BaseType index) const { 35 | return units_[index].child(); 36 | } 37 | UCharType sibling(BaseType index) const { 38 | return units_[index].sibling(); 39 | } 40 | 41 | // Reads a dictionary from an input stream. 42 | bool Read(std::istream *input) { 43 | BaseType base_size; 44 | if (!input->read(reinterpret_cast(&base_size), sizeof(BaseType))) { 45 | return false; 46 | } 47 | 48 | SizeType size = static_cast(base_size); 49 | std::vector units_buf(size); 50 | if (!input->read(reinterpret_cast(&units_buf[0]), 51 | sizeof(RankedGuideUnit) * size)) { 52 | return false; 53 | } 54 | 55 | SwapUnitsBuf(&units_buf); 56 | return true; 57 | } 58 | 59 | // Writes a dictionry to an output stream. 
60 | bool Write(std::ostream *output) const { 61 | BaseType base_size = static_cast(size_); 62 | if (!output->write(reinterpret_cast(&base_size), 63 | sizeof(BaseType))) { 64 | return false; 65 | } 66 | 67 | if (!output->write(reinterpret_cast(units_), 68 | sizeof(RankedGuideUnit) * size_)) { 69 | return false; 70 | } 71 | 72 | return true; 73 | } 74 | 75 | // Maps memory with its size. 76 | void Map(const void *address) { 77 | Clear(); 78 | units_ = reinterpret_cast( 79 | static_cast(address) + 1); 80 | size_ = *static_cast(address); 81 | } 82 | void Map(const void *address, SizeType size) { 83 | Clear(); 84 | units_ = static_cast(address); 85 | size_ = size; 86 | } 87 | 88 | // Swaps RankedGuides. 89 | void Swap(RankedGuide *guide) { 90 | std::swap(units_, guide->units_); 91 | std::swap(size_, guide->size_); 92 | units_buf_.swap(guide->units_buf_); 93 | } 94 | 95 | // Initializes a RankedGuide. 96 | void Clear() { 97 | units_ = NULL; 98 | size_ = 0; 99 | std::vector(0).swap(units_buf_); 100 | } 101 | 102 | public: 103 | // Following member function is called from DawgBuilder. 104 | 105 | // Swaps buffers for units. 106 | void SwapUnitsBuf(std::vector *units_buf) { 107 | units_ = &(*units_buf)[0]; 108 | size_ = static_cast(units_buf->size()); 109 | units_buf_.swap(*units_buf); 110 | } 111 | 112 | private: 113 | const RankedGuideUnit *units_; 114 | SizeType size_; 115 | std::vector units_buf_; 116 | 117 | // Disables copies. 118 | RankedGuide(const RankedGuide &); 119 | RankedGuide &operator=(const RankedGuide &); 120 | }; 121 | 122 | } // namespace dawgdic 123 | 124 | #endif // DAWGDIC_RANKED_GUIDE_H 125 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | import glob 3 | from setuptools import setup, Extension 4 | 5 | setup( 6 | name="DAWG", 7 | version="0.8.0", 8 | description="Fast and memory efficient DAWG (DAFSA) for Python", 9 | long_description=open('README.rst').read() + '\n\n' + open('CHANGES.rst').read(), 10 | author='Mikhail Korobov', 11 | author_email='kmike84@gmail.com', 12 | url='https://github.com/pytries/DAWG/', 13 | 14 | ext_modules=[ 15 | Extension( 16 | "dawg", 17 | sources=glob.glob('src/*.cpp') + glob.glob('lib/b64/*.c'), 18 | include_dirs=['lib'], 19 | language="c++", 20 | ) 21 | ], 22 | 23 | classifiers=[ 24 | 'Development Status :: 4 - Beta', 25 | 'Intended Audience :: Developers', 26 | 'Intended Audience :: Science/Research', 27 | 'License :: OSI Approved :: MIT License', 28 | 'Programming Language :: Cython', 29 | 'Programming Language :: Python', 30 | 'Programming Language :: Python :: 2', 31 | 'Programming Language :: Python :: 2.7', 32 | 'Programming Language :: Python :: 3', 33 | 'Programming Language :: Python :: 3.5', 34 | 'Programming Language :: Python :: 3.6', 35 | 'Programming Language :: Python :: 3.7', 36 | 'Programming Language :: Python :: 3.8', 37 | 'Programming Language :: Python :: Implementation :: CPython', 38 | 'Topic :: Software Development :: Libraries :: Python Modules', 39 | 'Topic :: Scientific/Engineering :: Information Analysis', 40 | 'Topic :: Text Processing :: Linguistic', 41 | ], 42 | ) 43 | -------------------------------------------------------------------------------- /src/_base_types.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "../lib/dawgdic/base-types.h" namespace "dawgdic": 2 | # 8-bit characters. 3 | ctypedef char CharType 4 | ctypedef unsigned char UCharType 5 | 6 | # 32-bit integer. 7 | ctypedef int ValueType 8 | 9 | # 32-bit unsigned integer. 10 | ctypedef unsigned int BaseType 11 | 12 | # 32 or 64-bit unsigned integer. 
    # NOTE(review): declared as a signed int although the comment above this
    # line describes an unsigned 32/64-bit type -- confirm against
    # base-types.h.
    ctypedef int SizeType


--------------------------------------------------------------------------------
/src/_completer.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType, SizeType, ValueType
from _dawg cimport Dawg
from _dictionary cimport Dictionary
from _guide cimport Guide

# Cython declarations for dawgdic::Completer (lib/dawgdic/completer.h).
# The whole block is declared nogil.
cdef extern from "../lib/dawgdic/completer.h" namespace "dawgdic" nogil:
    cdef cppclass Completer:
        Completer()
        Completer(Dictionary &dic, Guide &guide)

        # Setters for the dictionary and the guide used by the completer.
        void set_dic(Dictionary &dic)
        void set_guide(Guide &guide)

        Dictionary &dic()
        Guide &guide()

        # These member functions are available only when Next() returns true.
        char *key()
        SizeType length()
        ValueType value()

        # Starts completing keys from given index and prefix.
        void Start(BaseType index)
        void Start(BaseType index, char *prefix)
        void Start(BaseType index, char *prefix, SizeType length)

        # Gets the next key.
        bint Next()
--------------------------------------------------------------------------------
/src/_dawg.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType, SizeType, ValueType, UCharType

# Cython declarations for dawgdic::Dawg (lib/dawgdic/dawg.h).
cdef extern from "../lib/dawgdic/dawg.h" namespace "dawgdic":

    cdef cppclass Dawg:
        Dawg()

        # The root index.
        BaseType root() nogil

        # Number of units.
        SizeType size() nogil

        # Number of transitions.
        SizeType num_of_transitions() nogil

        # Number of states.
        SizeType num_of_states() nogil

        # Number of merged transitions.
        SizeType num_of_merged_transitions() nogil

        # Number of merged states.
        SizeType num_of_merged_states() nogil

        # Number of merging states.
        SizeType num_of_merging_states() nogil

        # Reads values.
        # Each accessor takes the index of a unit of the dawg.
        BaseType child(BaseType index) nogil

        BaseType sibling(BaseType index) nogil

        ValueType value(BaseType index) nogil

        bint is_leaf(BaseType index) nogil

        UCharType label(BaseType index) nogil

        bint is_merging(BaseType index) nogil

        # Clears object pools.
        void Clear() nogil

        # Swaps dawgs.
        void Swap(Dawg *dawg) nogil

--------------------------------------------------------------------------------
/src/_dawg_builder.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
from _dawg cimport Dawg

# Cython declarations for dawgdic::DawgBuilder (lib/dawgdic/dawg-builder.h).
cdef extern from "../lib/dawgdic/dawg-builder.h" namespace "dawgdic":
    cdef cppclass DawgBuilder:

        # Only the zero-argument form is declared; the trailing comment
        # records an optional C++ parameter that is not exposed here.
        DawgBuilder() nogil #(SizeType initial_hash_table_size = DEFAULT_INITIAL_HASH_TABLE_SIZE)

        # Number of units.
        SizeType size() nogil

        # Number of transitions.
        SizeType num_of_transitions() nogil

        # Number of states.
        SizeType num_of_states() nogil

        # Number of merged transitions.
        SizeType num_of_merged_transitions() nogil

        # Number of merged states.
        SizeType num_of_merged_states() nogil

        # Number of merging states.
        SizeType num_of_merging_states() nogil

        # Initializes a builder.
        void Clear() nogil

        # Inserts a key.
        # The bint result reports whether the key was accepted.
        # NOTE(review): unlike most declarations in this file, Insert is not
        # marked nogil -- confirm whether that is intentional.
        bint Insert(CharType *key)
        bint Insert(CharType *key, ValueType value)
        bint Insert(CharType *key, SizeType length, ValueType value)

        # Finishes building a dawg.
        # The finished dawg is handed over through `dawg`; the bint result
        # reports success.
        bint Finish(Dawg *dawg)

--------------------------------------------------------------------------------
/src/_dictionary.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
from _dictionary_unit cimport DictionaryUnit
from iostream cimport istream, ostream

# Cython declarations for dawgdic::Dictionary (lib/dawgdic/dictionary.h).
cdef extern from "../lib/dawgdic/dictionary.h" namespace "dawgdic":
    cdef cppclass Dictionary:

        Dictionary() nogil

        DictionaryUnit *units() nogil
        SizeType size() nogil
        SizeType total_size() nogil
        SizeType file_size() nogil

        # Root index.
        BaseType root() nogil

        # Checks if a given index is related to the end of a key.
        bint has_value(BaseType index) nogil

        # Gets a value from a given index.
        ValueType value(BaseType index) nogil

        # Reads a dictionary from an input stream.
        # `except +` makes Cython translate C++ exceptions into Python ones.
        bint Read(istream *input) nogil except +

        # Writes a dictionary to an output stream.
        bint Write(ostream *output) nogil except +

        # Exact matching.
        bint Contains(CharType *key) nogil
        bint Contains(CharType *key, SizeType length) nogil

        # Exact matching.
        # The ValueType forms return the value directly; the bint forms
        # report success and store the value through `value`.
        ValueType Find(CharType *key) nogil
        ValueType Find(CharType *key, SizeType length) nogil
        bint Find(CharType *key, ValueType *value) nogil
        bint Find(CharType *key, SizeType length, ValueType *value) nogil

        # Follows a transition.
        # `index` is both input (current unit) and output (unit reached).
        bint Follow(CharType label, BaseType *index) nogil

        # Follows transitions.
        bint Follow(CharType *s, BaseType *index) nogil
        bint Follow(CharType *s, BaseType *index, SizeType *count) nogil

        # Follows transitions.
        bint Follow(CharType *s, SizeType length, BaseType *index) nogil
        bint Follow(CharType *s, SizeType length, BaseType *index, SizeType *count) nogil

        # Maps memory with its size.
        void Map(void *address) nogil
        void Map(void *address, SizeType size) nogil

        # Initializes a dictionary.
        void Clear() nogil

        # Swaps dictionaries.
        void Swap(Dictionary *dic) nogil
        # Shrinks a vector.
        void Shrink() nogil
--------------------------------------------------------------------------------
/src/_dictionary_builder.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType
from _dawg cimport Dawg
from _dictionary cimport Dictionary

# The C++ class name is used as the namespace string, so its static Build
# member can be declared as a plain function.
cdef extern from "../lib/dawgdic/dictionary-builder.h" namespace "dawgdic::DictionaryBuilder":
    # Builds `dic` from `dawg`; the bint result reports success.
    cdef bint Build (Dawg &dawg, Dictionary *dic) nogil


--------------------------------------------------------------------------------
/src/_dictionary_unit.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType

# Cython declarations for dawgdic::DictionaryUnit
# (lib/dawgdic/dictionary-unit.h).
cdef extern from "../lib/dawgdic/dictionary-unit.h" namespace "dawgdic":
    cdef cppclass DictionaryUnit:

        DictionaryUnit() nogil

        # Sets a flag to show that a unit has a leaf as a child.
        void set_has_leaf() nogil

        # Sets a value to a leaf unit.
        void set_value(ValueType value) nogil

        # Sets a label to a non-leaf unit.
        void set_label(UCharType label) nogil

        # Sets an offset to a non-leaf unit.
        bint set_offset(BaseType offset) nogil


        # Checks if a unit has a leaf as a child or not.
        bint has_leaf() nogil

        # Checks if a unit corresponds to a leaf or not.
        # NOTE(review): the comment above does not match the declaration
        # below, which returns a ValueType; presumably this reads the value
        # stored in a leaf unit -- confirm against dictionary-unit.h.
        ValueType value() nogil

        # Reads a label with a leaf flag from a non-leaf unit.
        BaseType label() nogil

        # Reads an offset to child units from a non-leaf unit.
        BaseType offset() nogil

--------------------------------------------------------------------------------
/src/_guide.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType
from _guide_unit cimport GuideUnit
from iostream cimport istream, ostream

# Cython declarations for dawgdic::Guide (lib/dawgdic/guide.h).
# NOTE(review): none of these members is marked nogil, unlike the
# Dictionary declarations -- confirm whether that is intentional.
cdef extern from "../lib/dawgdic/guide.h" namespace "dawgdic":
    cdef cppclass Guide:

        Guide()

        GuideUnit *units()
        SizeType size()
        SizeType total_size()
        SizeType file_size()

        # The root index.
        BaseType root()

        # Child / sibling label stored for the unit at `index`.
        UCharType child(BaseType index)
        UCharType sibling(BaseType index)

        # Reads a dictionary from an input stream.
        bint Read(istream *input)

        # Writes a dictionary to an output stream.
        bint Write(ostream *output)

        # Maps memory with its size.
        void Map(void *address)

        # Swaps Guides.
        void Swap(Guide *Guide)

        # Initializes a Guide.
        void Clear()
--------------------------------------------------------------------------------
/src/_guide_builder.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType
from _dawg cimport Dawg
from _dictionary cimport Dictionary
from _guide cimport Guide

# The C++ class name is used as the namespace string, so its static Build
# member can be declared as a plain function.
cdef extern from "../lib/dawgdic/guide-builder.h" namespace "dawgdic::GuideBuilder":
    # Builds `guide` from `dawg` and `dic`; the bint result reports success.
    cdef bint Build (Dawg &dawg, Dictionary &dic, Guide* guide) nogil


--------------------------------------------------------------------------------
/src/_guide_unit.pxd:
--------------------------------------------------------------------------------
from _base_types cimport BaseType, SizeType, ValueType, UCharType, CharType

# Cython declarations for dawgdic::GuideUnit (lib/dawgdic/guide-unit.h).
cdef extern from "../lib/dawgdic/guide-unit.h" namespace "dawgdic":
    cdef cppclass GuideUnit:
        GuideUnit() nogil

        # Accessors for the child and sibling labels of a unit.
        void set_child(UCharType child) nogil
        void set_sibling(UCharType sibling) nogil
        UCharType child() nogil
        UCharType sibling() nogil

--------------------------------------------------------------------------------
/src/b64_decode.pxd:
--------------------------------------------------------------------------------
from iostream cimport istream, ostream

# Cython declarations for base64::decoder (lib/b64/decode.h).
cdef extern from "../lib/b64/decode.h" namespace "base64":

    cdef cppclass decoder:
        decoder()
        decoder(int buffersize_in)

        # Buffer form: reads `length_in` bytes from `code_in` and writes to
        # `plaintext_out`.
        # NOTE(review): the int result is presumably the number of bytes
        # written -- confirm against decode.h.
        int decode(char* code_in, int length_in, char* plaintext_out)
        void init()

        # Stream form.
        void decode(istream istream_in, ostream ostream_in)
--------------------------------------------------------------------------------
/src/dawg.pyx:
--------------------------------------------------------------------------------
# cython: profile=False
# cython: embedsignature=True
from __future__ import unicode_literals
from libcpp.string cimport string
from libcpp.vector cimport vector
from iostream cimport stringstream, istream, ostream,
ifstream 7 | cimport iostream 8 | 9 | cimport _dawg 10 | from _dawg_builder cimport DawgBuilder 11 | from _dictionary cimport Dictionary 12 | from _guide cimport Guide 13 | from _completer cimport Completer 14 | from _base_types cimport BaseType, SizeType, CharType 15 | cimport _guide_builder 16 | cimport _dictionary_builder 17 | cimport b64_decode 18 | 19 | try: 20 | from collections.abc import Mapping 21 | except ImportError: 22 | # Python 2.7 23 | from collections import Mapping 24 | import struct 25 | import sys 26 | from binascii import b2a_base64 27 | 28 | 29 | class Error(Exception): 30 | pass 31 | 32 | 33 | cdef class DAWG: 34 | """ 35 | Base DAWG wrapper. 36 | """ 37 | cdef Dictionary dct 38 | cdef _dawg.Dawg dawg 39 | 40 | def __init__(self, arg=None, input_is_sorted=False): 41 | if arg is None: 42 | arg = [] 43 | if not input_is_sorted: 44 | arg = [ 45 | (key).encode('utf8') if isinstance(key, unicode) else key 46 | for key in arg 47 | ] 48 | arg.sort() 49 | self._build_from_iterable(arg) 50 | 51 | def __dealloc__(self): 52 | self.dct.Clear() 53 | self.dawg.Clear() 54 | 55 | def _build_from_iterable(self, iterable): 56 | cdef DawgBuilder dawg_builder 57 | cdef bytes b_key 58 | cdef int value 59 | 60 | for key in iterable: 61 | if isinstance(key, tuple) or isinstance(key, list): 62 | key, value = key 63 | if value < 0: 64 | raise ValueError("Negative values are not supported") 65 | else: 66 | value = 0 67 | 68 | if isinstance(key, unicode): 69 | b_key = (key).encode('utf8') 70 | else: 71 | b_key = key 72 | 73 | if not dawg_builder.Insert(b_key, len(b_key), value): 74 | raise Error("Can't insert key %r (with value %r)" % (b_key, value)) 75 | 76 | if not dawg_builder.Finish(&self.dawg): 77 | raise Error("dawg_builder.Finish error") 78 | 79 | if not _dictionary_builder.Build(self.dawg, &self.dct): 80 | raise Error("Can't build dictionary") 81 | 82 | def __contains__(self, key): 83 | if isinstance(key, unicode): 84 | return self.has_key(key) 85 | return 
self.b_has_key(key) 86 | 87 | cpdef bint has_key(self, unicode key) except -1: 88 | return self.b_has_key(key.encode('utf8')) 89 | 90 | cpdef bint b_has_key(self, bytes key) except -1: 91 | return self.dct.Contains(key, len(key)) 92 | 93 | cpdef bytes tobytes(self) except +: 94 | """ 95 | Return raw DAWG content as bytes. 96 | """ 97 | cdef stringstream stream 98 | self.dct.Write( &stream) 99 | cdef bytes res = stream.str() 100 | return res 101 | 102 | cpdef frombytes(self, bytes data): 103 | """ 104 | Load DAWG from bytes ``data``. 105 | 106 | FIXME: it seems there is a memory leak here (DAWG uses 3x memory 107 | when loaded using ``.frombytes`` compared to DAWG loaded 108 | using ``.load``). 109 | """ 110 | cdef string s_data = data 111 | cdef stringstream* stream = new stringstream(s_data) 112 | 113 | try: 114 | res = self.dct.Read( stream) 115 | 116 | if not res: 117 | self.dct.Clear() 118 | raise IOError("Invalid data format") 119 | 120 | return self 121 | finally: 122 | del stream 123 | 124 | def read(self, f): 125 | """ 126 | Load DAWG from a file-like object. 127 | 128 | FIXME: this method should'n read the whole stream. 129 | """ 130 | self.frombytes(f.read()) 131 | 132 | def write(self, f): 133 | """ 134 | Write DAWG to a file-like object. 135 | """ 136 | f.write(self.tobytes()) 137 | 138 | def load(self, path): 139 | """ 140 | Load DAWG from a file. 141 | """ 142 | if isinstance(path, unicode): 143 | path = path.encode(sys.getfilesystemencoding()) 144 | 145 | cdef ifstream stream 146 | stream.open(path, iostream.binary) 147 | if stream.fail(): 148 | raise IOError("It's not possible to read file stream") 149 | 150 | res = self.dct.Read( &stream) 151 | 152 | stream.close() 153 | 154 | if not res: 155 | self.dct.Clear() 156 | raise IOError("Invalid data format") 157 | 158 | return self 159 | 160 | def save(self, path): 161 | """ 162 | Save DAWG to a file. 
163 | """ 164 | with open(path, 'wb') as f: 165 | self.write(f) 166 | 167 | # pickling support 168 | def __reduce__(self): 169 | return self.__class__, tuple(), self.tobytes() 170 | 171 | def __setstate__(self, state): 172 | self.frombytes(state) 173 | 174 | # half-internal methods 175 | def _size(self): 176 | return self.dct.size() 177 | 178 | def _total_size(self): 179 | return self.dct.total_size() 180 | 181 | def _file_size(self): 182 | return self.dct.file_size() 183 | 184 | cdef bint _has_value(self, BaseType index): 185 | return self.dct.has_value(index) 186 | 187 | cdef list _similar_keys(self, unicode current_prefix, unicode key, BaseType cur_index, dict replace_chars): 188 | cdef BaseType next_index, index = cur_index 189 | cdef unicode prefix, u_replace_char, found_key 190 | cdef bytes b_step, b_replace_char 191 | cdef list res = [] 192 | cdef list extra_keys 193 | 194 | cdef int start_pos = len(current_prefix) 195 | cdef int end_pos = len(key) 196 | cdef int word_pos = start_pos 197 | 198 | while word_pos < end_pos: 199 | b_step = (key[word_pos].encode('utf8')) 200 | 201 | if b_step in replace_chars: 202 | next_index = index 203 | b_replace_char, u_replace_char = replace_chars[b_step] 204 | 205 | if self.dct.Follow(b_replace_char, &next_index): 206 | prefix = current_prefix + key[start_pos:word_pos] + u_replace_char 207 | extra_keys = self._similar_keys(prefix, key, next_index, replace_chars) 208 | res.extend(extra_keys) 209 | 210 | if not self.dct.Follow(b_step, &index): 211 | break 212 | word_pos += 1 213 | 214 | else: 215 | if self._has_value(index): 216 | found_key = current_prefix + key[start_pos:] 217 | res.insert(0, found_key) 218 | 219 | return res 220 | 221 | cpdef list similar_keys(self, unicode key, dict replaces): 222 | """ 223 | Return all variants of ``key`` in this DAWG according to 224 | ``replaces``. 
225 | 226 | ``replaces`` is an object obtained from 227 | ``DAWG.compile_replaces(mapping)`` where mapping is a dict 228 | that maps single-char unicode sitrings to another single-char 229 | unicode strings. 230 | 231 | This may be useful e.g. for handling single-character umlauts. 232 | """ 233 | return self._similar_keys("", key, self.dct.root(), replaces) 234 | 235 | cpdef list prefixes(self, unicode key): 236 | ''' 237 | Return a list with keys of this DAWG that are prefixes of the ``key``. 238 | ''' 239 | return [p.decode('utf8') for p in self.b_prefixes(key.encode('utf8'))] 240 | 241 | cpdef list b_prefixes(self, bytes b_key): 242 | cdef list res = [] 243 | cdef BaseType index = self.dct.root() 244 | cdef int pos = 1 245 | cdef CharType ch 246 | 247 | for ch in b_key: 248 | if not self.dct.Follow(ch, &index): 249 | break 250 | if self._has_value(index): 251 | res.append(b_key[:pos]) 252 | pos += 1 253 | 254 | return res 255 | 256 | def iterprefixes(self, unicode key): 257 | ''' 258 | Return a generator with keys of this DAWG that are prefixes of the ``key``. 
259 | ''' 260 | cdef BaseType index = self.dct.root() 261 | cdef bytes b_key = key.encode('utf8') 262 | cdef int pos = 1 263 | cdef CharType ch 264 | 265 | for ch in b_key: 266 | if not self.dct.Follow(ch, &index): 267 | return 268 | if self._has_value(index): 269 | yield b_key[:pos].decode('utf8') 270 | pos += 1 271 | 272 | @classmethod 273 | def compile_replaces(cls, replaces): 274 | 275 | for k,v in replaces.items(): 276 | if len(k) != 1 or len(v) != 1: 277 | raise ValueError("Keys and values must be single-char unicode strings.") 278 | 279 | return dict( 280 | ( 281 | k.encode('utf8'), 282 | (v.encode('utf8'), unicode(v)) 283 | ) 284 | for k, v in replaces.items() 285 | ) 286 | 287 | 288 | cdef void init_completer(Completer& completer, Dictionary& dic, Guide& guide): 289 | completer.set_dic(dic) 290 | completer.set_guide(guide) 291 | 292 | 293 | cdef class CompletionDAWG(DAWG): 294 | """ 295 | DAWG with key completion support. 296 | """ 297 | cdef Guide guide 298 | 299 | def __init__(self, arg=None, input_is_sorted=False): 300 | super(CompletionDAWG, self).__init__(arg, input_is_sorted) 301 | if not _guide_builder.Build(self.dawg, self.dct, &self.guide): 302 | raise Error("Error building completion information") 303 | 304 | def __dealloc__(self): 305 | self.guide.Clear() 306 | 307 | cpdef list keys(self, unicode prefix=""): 308 | cdef bytes b_prefix = prefix.encode('utf8') 309 | cdef BaseType index = self.dct.root() 310 | cdef list res = [] 311 | 312 | if not self.dct.Follow(b_prefix, &index): 313 | return res 314 | 315 | cdef Completer completer 316 | init_completer(completer, self.dct, self.guide) 317 | completer.Start(index, b_prefix) 318 | 319 | while completer.Next(): 320 | key = (completer.key()).decode('utf8') 321 | res.append(key) 322 | 323 | return res 324 | 325 | def iterkeys(self, unicode prefix=""): 326 | cdef bytes b_prefix = prefix.encode('utf8') 327 | cdef BaseType index = self.dct.root() 328 | 329 | if not self.dct.Follow(b_prefix, &index): 330 
| return 331 | 332 | cdef Completer completer 333 | init_completer(completer, self.dct, self.guide) 334 | completer.Start(index, b_prefix) 335 | 336 | while completer.Next(): 337 | key = (completer.key()).decode('utf8') 338 | yield key 339 | 340 | def has_keys_with_prefix(self, unicode prefix): 341 | cdef bytes b_prefix = prefix.encode('utf8') 342 | cdef BaseType index = self.dct.root() 343 | 344 | if not self.dct.Follow(b_prefix, &index): 345 | return False 346 | 347 | cdef Completer completer 348 | init_completer(completer, self.dct, self.guide) 349 | completer.Start(index, b_prefix) 350 | 351 | return completer.Next() 352 | 353 | cpdef bytes tobytes(self) except +: 354 | """ 355 | Return raw DAWG content as bytes. 356 | """ 357 | cdef stringstream stream 358 | self.dct.Write( &stream) 359 | self.guide.Write( &stream) 360 | cdef bytes res = stream.str() 361 | return res 362 | 363 | cpdef frombytes(self, bytes data): 364 | """ 365 | Load DAWG from bytes ``data``. 366 | 367 | FIXME: it seems there is memory leak here (DAWG uses 3x memory when 368 | loaded using frombytes vs load). 369 | """ 370 | cdef char* c_data = data 371 | cdef stringstream stream 372 | stream.write(c_data, len(data)) 373 | stream.seekg(0) 374 | 375 | res = self.dct.Read( &stream) 376 | if not res: 377 | self.dct.Clear() 378 | raise IOError("Invalid data format: can't load _dawg.Dictionary") 379 | 380 | res = self.guide.Read( &stream) 381 | if not res: 382 | self.guide.Clear() 383 | self.dct.Clear() 384 | raise IOError("Invalid data format: can't load _dawg.Guide") 385 | 386 | return self 387 | 388 | def load(self, path): 389 | """ 390 | Load DAWG from a file. 
391 | """ 392 | if isinstance(path, unicode): 393 | path = path.encode(sys.getfilesystemencoding()) 394 | 395 | cdef ifstream stream 396 | stream.open(path, iostream.binary) 397 | if stream.fail(): 398 | raise IOError("It's not possible to read file stream") 399 | 400 | try: 401 | res = self.dct.Read( &stream) 402 | if not res: 403 | self.dct.Clear() 404 | raise IOError("Invalid data format: can't load _dawg.Dictionary") 405 | 406 | res = self.guide.Read( &stream) 407 | if not res: 408 | self.guide.Clear() 409 | self.dct.Clear() 410 | raise IOError("Invalid data format: can't load _dawg.Guide") 411 | 412 | finally: 413 | stream.close() 414 | 415 | return self 416 | 417 | def _transitions(self): 418 | transitions = set() 419 | cdef BaseType index, prev_index, completer_index 420 | cdef char* key 421 | 422 | cdef Completer completer 423 | init_completer(completer, self.dct, self.guide) 424 | completer.Start(self.dct.root()) 425 | 426 | while completer.Next(): 427 | key = completer.key() 428 | 429 | index = self.dct.root() 430 | 431 | for i in range(completer.length()): 432 | prev_index = index 433 | self.dct.Follow(&(key[i]), 1, &index) 434 | transitions.add( 435 | (prev_index, key[i], index) 436 | ) 437 | 438 | return sorted(list(transitions)) 439 | 440 | 441 | # The following symbol is not allowed in utf8 so it is safe to use 442 | # as a separator between utf8-encoded string and binary payload. 443 | # It has drawbacks however: sorting of utf8-encoded keys changes: 444 | # ('foo' becomes greater than 'foox' because strings are compared as 445 | # 'foo\xff' and 'foox' and ord('\xff')==255 is greater than 446 | # ord('x')). 447 | # DEF PAYLOAD_SEPARATOR = b'\xff' 448 | 449 | # That's why chr(1) is used as separator by default: this is the lowest allowed 450 | # character and so it will preserve keys alphabetical order. 451 | # It is not strictly correct to use chr(1) as separator because chr(1) 452 | # is a valid UTF8 character. 
But I think in practice this won't be an issue: 453 | # such control character is very unlikely in text keys, and binary keys 454 | # are not supported anyway because dawgdic doesn't support keys containing 455 | # chr(0). 456 | cdef bytes PAYLOAD_SEPARATOR = b'\x01' 457 | 458 | DEF MAX_VALUE_SIZE = 32768 459 | 460 | cdef class BytesDAWG(CompletionDAWG): 461 | """ 462 | DAWG that is able to transparently store extra binary payload in keys; 463 | there may be several payloads for the same key. 464 | 465 | In other words, this class implements read-only DAWG-based 466 | {unicode -> list of bytes objects} mapping. 467 | """ 468 | 469 | cdef bytes _b_payload_separator 470 | cdef CharType _c_payload_separator 471 | cdef Completer* _completer 472 | 473 | def __init__(self, arg=None, input_is_sorted=False, bytes payload_separator=PAYLOAD_SEPARATOR): 474 | """ 475 | ``arg`` must be an iterable of tuples (unicode_key, bytes_payload). 476 | """ 477 | if arg is None: 478 | arg = [] 479 | 480 | self._b_payload_separator = payload_separator 481 | self._c_payload_separator = ord(payload_separator) 482 | 483 | keys = (self._raw_key(d[0], d[1]) for d in arg) 484 | super(BytesDAWG, self).__init__(keys, input_is_sorted) 485 | 486 | self._update_completer() 487 | 488 | def __dealloc__(self): 489 | if self._completer: 490 | del self._completer 491 | 492 | cpdef bytes _raw_key(self, unicode key, bytes payload): 493 | cdef bytes b_key = key.encode('utf8') 494 | 495 | if self._b_payload_separator in b_key: 496 | raise Error("Payload separator (%r) is found within utf8-encoded key ('%s')" % (self._b_payload_separator, key)) 497 | 498 | cdef bytes encoded_payload = b2a_base64(payload) 499 | return b_key + self._b_payload_separator + encoded_payload 500 | 501 | cdef _update_completer(self): 502 | if self._completer: 503 | del self._completer 504 | self._completer = new Completer(self.dct, self.guide) 505 | 506 | def load(self, path): 507 | res = super(BytesDAWG, self).load(path) 508 | 
self._update_completer() 509 | return res 510 | 511 | cpdef frombytes(self, bytes data): 512 | res = super(BytesDAWG, self).frombytes(data) 513 | self._update_completer() 514 | return res 515 | 516 | cpdef bint b_has_key(self, bytes key) except -1: 517 | cdef BaseType index 518 | return self._follow_key(key, &index) 519 | 520 | def __getitem__(self, key): 521 | res = self.get(key) 522 | if res is None: 523 | raise KeyError(key) 524 | return res 525 | 526 | cpdef get(self, key, default=None): 527 | """ 528 | Return a list of payloads (as byte objects) for a given key 529 | or ``default`` if the key is not found. 530 | """ 531 | if isinstance(key, unicode): 532 | res = self.get_value(key) 533 | else: 534 | res = self.b_get_value(key) 535 | 536 | if not res: 537 | return default 538 | return res 539 | 540 | cdef bint _follow_key(self, bytes key, BaseType* index): 541 | index[0] = self.dct.root() 542 | if not self.dct.Follow(key, len(key), index): 543 | return False 544 | return self.dct.Follow(self._c_payload_separator, index) 545 | 546 | cpdef list get_value(self, unicode key): 547 | return self.b_get_value(key.encode('utf8')) 548 | 549 | cdef list _value_for_index(self, BaseType index): 550 | 551 | # We want to use a shared Completer instance because allocating 552 | # a Completer makes this function (and thus __getitem__) 2x slower. 553 | # This may not be thread-safe; the GIL helps us, but we should be careful 554 | # not to accidentally switch to another thread by interacting 555 | # with the Python interpreter in any way (a switch happens 556 | # between bytecode instructions). 
557 | 558 | cdef int key_len 559 | cdef b64_decode.decoder b64_decoder 560 | cdef char[MAX_VALUE_SIZE] b64_decoder_storage 561 | cdef vector[string] results 562 | 563 | self._completer.Start(index) 564 | 565 | while self._completer.Next(): 566 | b64_decoder.init() 567 | key_len = b64_decoder.decode( 568 | self._completer.key(), 569 | self._completer.length(), 570 | b64_decoder_storage 571 | ) 572 | results.push_back(string(b64_decoder_storage, key_len)) 573 | 574 | return results 575 | 576 | cpdef list b_get_value(self, bytes key): 577 | cdef BaseType index 578 | if not self._follow_key(key, &index): 579 | return [] 580 | return self._value_for_index(index) 581 | 582 | cpdef list items(self, unicode prefix=""): 583 | cdef bytes b_prefix = prefix.encode('utf8') 584 | cdef bytes value 585 | cdef int i 586 | cdef list res = [] 587 | cdef char* raw_key 588 | cdef char* raw_value 589 | cdef int raw_value_len 590 | 591 | cdef BaseType index = self.dct.root() 592 | if not self.dct.Follow(b_prefix, &index): 593 | return res 594 | 595 | cdef int _len 596 | cdef b64_decode.decoder _b64_decoder 597 | cdef char[MAX_VALUE_SIZE] _b64_decoder_storage 598 | 599 | cdef Completer completer 600 | init_completer(completer, self.dct, self.guide) 601 | completer.Start(index, b_prefix) 602 | 603 | while completer.Next(): 604 | raw_key = completer.key() 605 | 606 | for i in range(0, completer.length()): 607 | if raw_key[i] == self._c_payload_separator: 608 | break 609 | 610 | raw_value = &(raw_key[i]) 611 | raw_value_len = completer.length() - i 612 | 613 | _b64_decoder.init() 614 | _len = _b64_decoder.decode(raw_value, raw_value_len, _b64_decoder_storage) 615 | value = _b64_decoder_storage[:_len] 616 | 617 | u_key = raw_key[:i].decode('utf8') 618 | res.append( 619 | (u_key, value) 620 | ) 621 | 622 | return res 623 | 624 | def iteritems(self, unicode prefix=""): 625 | cdef bytes b_prefix = prefix.encode('utf8') 626 | cdef bytes value 627 | cdef int i 628 | cdef char* raw_key 629 | cdef 
char* raw_value 630 | cdef int raw_value_len 631 | 632 | cdef BaseType index = self.dct.root() 633 | if not self.dct.Follow(b_prefix, &index): 634 | return 635 | 636 | cdef int _len 637 | cdef b64_decode.decoder _b64_decoder 638 | cdef char[MAX_VALUE_SIZE] _b64_decoder_storage 639 | 640 | cdef Completer completer 641 | init_completer(completer, self.dct, self.guide) 642 | completer.Start(index, b_prefix) 643 | 644 | while completer.Next(): 645 | raw_key = completer.key() 646 | 647 | for i in range(0, completer.length()): 648 | if raw_key[i] == self._c_payload_separator: 649 | break 650 | 651 | raw_value = &(raw_key[i]) 652 | raw_value_len = completer.length() - i 653 | 654 | _b64_decoder.init() 655 | _len = _b64_decoder.decode(raw_value, raw_value_len, _b64_decoder_storage) 656 | value = _b64_decoder_storage[:_len] 657 | 658 | u_key = raw_key[:i].decode('utf8') 659 | yield (u_key, value) 660 | 661 | cpdef list keys(self, unicode prefix=""): 662 | cdef bytes b_prefix = prefix.encode('utf8') 663 | cdef int i 664 | cdef list res = [] 665 | cdef char* raw_key 666 | 667 | cdef BaseType index = self.dct.root() 668 | if not self.dct.Follow(b_prefix, &index): 669 | return res 670 | 671 | cdef Completer completer 672 | init_completer(completer, self.dct, self.guide) 673 | completer.Start(index, b_prefix) 674 | 675 | while completer.Next(): 676 | raw_key = completer.key() 677 | 678 | for i in range(0, completer.length()): 679 | if raw_key[i] == self._c_payload_separator: 680 | break 681 | 682 | u_key = raw_key[:i].decode('utf8') 683 | res.append(u_key) 684 | return res 685 | 686 | def iterkeys(self, unicode prefix=""): 687 | cdef bytes b_prefix = prefix.encode('utf8') 688 | cdef int i 689 | cdef char* raw_key 690 | 691 | cdef BaseType index = self.dct.root() 692 | if not self.dct.Follow(b_prefix, &index): 693 | return 694 | 695 | cdef Completer completer 696 | init_completer(completer, self.dct, self.guide) 697 | completer.Start(index, b_prefix) 698 | 699 | while 
completer.Next(): 700 | raw_key = completer.key() 701 | 702 | for i in range(0, completer.length()): 703 | if raw_key[i] == self._c_payload_separator: 704 | break 705 | 706 | u_key = raw_key[:i].decode('utf8') 707 | yield u_key 708 | 709 | cdef bint _has_value(self, BaseType index): 710 | cdef BaseType _index = index 711 | return self.dct.Follow(self._c_payload_separator, &_index) 712 | 713 | cdef list _similar_items(self, unicode current_prefix, unicode key, BaseType cur_index, dict replace_chars): 714 | cdef BaseType next_index, index = cur_index 715 | cdef unicode prefix, u_replace_char, found_key 716 | cdef bytes b_step, b_replace_char 717 | cdef list res = [] 718 | cdef list extra_items, value 719 | 720 | cdef int start_pos = len(current_prefix) 721 | cdef int end_pos = len(key) 722 | cdef int word_pos = start_pos 723 | 724 | while word_pos < end_pos: 725 | b_step = (key[word_pos].encode('utf8')) 726 | 727 | if b_step in replace_chars: 728 | next_index = index 729 | b_replace_char, u_replace_char = replace_chars[b_step] 730 | 731 | if self.dct.Follow(b_replace_char, &next_index): 732 | prefix = current_prefix + key[start_pos:word_pos] + u_replace_char 733 | extra_items = self._similar_items(prefix, key, next_index, replace_chars) 734 | res.extend(extra_items) 735 | 736 | if not self.dct.Follow(b_step, &index): 737 | break 738 | word_pos += 1 739 | 740 | else: 741 | if self.dct.Follow(self._c_payload_separator, &index): 742 | found_key = current_prefix + key[start_pos:] 743 | value = self._value_for_index(index) 744 | res.insert(0, (found_key, value)) 745 | 746 | return res 747 | 748 | cpdef list similar_items(self, unicode key, dict replaces): 749 | """ 750 | Return a list of (key, value) tuples for all variants of ``key`` 751 | in this DAWG according to ``replaces``. 
752 | 753 | ``replaces`` is an object obtained from 754 | ``DAWG.compile_replaces(mapping)`` where mapping is a dict 755 | that maps single-char unicode strings to other single-char 756 | unicode strings. 757 | """ 758 | return self._similar_items("", key, self.dct.root(), replaces) 759 | 760 | cdef list _similar_item_values(self, int start_pos, unicode key, BaseType cur_index, dict replace_chars): 761 | cdef BaseType next_index, index = cur_index 762 | cdef unicode prefix, u_replace_char, found_key 763 | cdef bytes b_step, b_replace_char 764 | cdef list res = [] 765 | cdef list extra_items, value 766 | 767 | #cdef int start_pos = len(current_prefix) 768 | cdef int end_pos = len(key) 769 | cdef int word_pos = start_pos 770 | 771 | while word_pos < end_pos: 772 | b_step = (key[word_pos].encode('utf8')) 773 | 774 | if b_step in replace_chars: 775 | next_index = index 776 | b_replace_char, u_replace_char = replace_chars[b_step] 777 | 778 | if self.dct.Follow(b_replace_char, &next_index): 779 | extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars) 780 | res.extend(extra_items) 781 | 782 | if not self.dct.Follow(b_step, &index): 783 | break 784 | word_pos += 1 785 | 786 | else: 787 | if self.dct.Follow(self._c_payload_separator, &index): 788 | value = self._value_for_index(index) 789 | res.insert(0, value) 790 | 791 | return res 792 | 793 | cpdef list similar_item_values(self, unicode key, dict replaces): 794 | """ 795 | Return a list of values for all variants of the ``key`` 796 | in this DAWG according to ``replaces``. 797 | 798 | ``replaces`` is an object obtained from 799 | ``DAWG.compile_replaces(mapping)`` where mapping is a dict 800 | that maps single-char unicode strings to other single-char 801 | unicode strings. 
802 | """ 803 | return self._similar_item_values(0, key, self.dct.root(), replaces) 804 | 805 | 806 | 807 | cdef class RecordDAWG(BytesDAWG): 808 | """ 809 | DAWG that is able to transparently store binary payload in keys; 810 | there may be several payloads for the same key. 811 | 812 | The payload format must be defined at creation time using ``fmt`` 813 | constructor argument; it has the same meaning as ``fmt`` argument 814 | for functions from ``struct`` module; take a look at 815 | http://docs.python.org/library/struct.html#format-strings for the 816 | specification. 817 | 818 | In other words, this class implements read-only DAWG-based 819 | {unicode -> list of tuples} mapping where all tuples are of the 820 | same structure and may be packed with the same format string. 821 | """ 822 | cdef _struct 823 | 824 | def __init__(self, fmt, arg=None, input_is_sorted=False, bytes payload_separator=PAYLOAD_SEPARATOR): 825 | """ 826 | ``arg`` must be an iterable of tuples (unicode_key, data_tuple). 827 | data tuples will be converted to bytes with 828 | ``struct.pack(fmt, *data_tuple)``. 829 | 830 | Take a look at 831 | http://docs.python.org/library/struct.html#format-strings for the 832 | format string specification. 
833 | """ 834 | self._struct = struct.Struct(str(fmt)) 835 | 836 | if arg is None: 837 | arg = [] 838 | 839 | keys = ((d[0], self._struct.pack(*d[1])) for d in arg) 840 | super(RecordDAWG, self).__init__(keys, input_is_sorted, payload_separator) 841 | 842 | cdef list _value_for_index(self, BaseType index): 843 | cdef list value = BytesDAWG._value_for_index(self, index) 844 | return [self._struct.unpack(val) for val in value] 845 | 846 | cpdef list items(self, unicode prefix=""): 847 | cdef list items = BytesDAWG.items(self, prefix) 848 | return [(key, self._struct.unpack(val)) for (key, val) in items] 849 | 850 | def iteritems(self, unicode prefix=""): 851 | for key, val in BytesDAWG.iteritems(self, prefix): 852 | yield (key, self._struct.unpack(val)) 853 | 854 | 855 | def _iterable_from_argument(arg): 856 | if arg is None: 857 | arg = [] 858 | 859 | if isinstance(arg, Mapping): 860 | return ((key, arg[key]) for key in arg) 861 | else: 862 | return arg 863 | 864 | DEF LOOKUP_ERROR = -1 865 | 866 | cdef class IntDAWG(DAWG): 867 | """ 868 | Dict-like class based on DAWG. 869 | It can store integer values for unicode keys. 870 | """ 871 | def __init__(self, arg=None, input_is_sorted=False): 872 | """ 873 | ``arg`` must be an iterable of tuples (unicode_key, int_value) 874 | or a dict {unicode_key: int_value}. 875 | """ 876 | iterable = _iterable_from_argument(arg) 877 | super(IntDAWG, self).__init__(iterable, input_is_sorted) 878 | 879 | def __getitem__(self, key): 880 | cdef int res = self.get(key, LOOKUP_ERROR) 881 | if res == LOOKUP_ERROR: 882 | raise KeyError(key) 883 | return res 884 | 885 | cpdef get(self, key, default=None): 886 | """ 887 | Return value for the given key or ``default`` if the key is not found. 
888 | """ 889 | cdef int res 890 | 891 | if isinstance(key, unicode): 892 | res = self.get_value(key) 893 | else: 894 | res = self.b_get_value(key) 895 | 896 | if res == LOOKUP_ERROR: 897 | return default 898 | return res 899 | 900 | cpdef int get_value(self, unicode key): 901 | cdef bytes b_key = key.encode('utf8') 902 | return self.dct.Find(b_key) 903 | 904 | cpdef int b_get_value(self, bytes key): 905 | return self.dct.Find(key) 906 | 907 | 908 | # FIXME: code duplication. 909 | cdef class IntCompletionDAWG(CompletionDAWG): 910 | """ 911 | Dict-like class based on DAWG. 912 | It can store integer values for unicode keys and support key completion. 913 | """ 914 | 915 | def __init__(self, arg=None, input_is_sorted=False): 916 | """ 917 | ``arg`` must be an iterable of tuples (unicode_key, int_value) 918 | or a dict {unicode_key: int_value}. 919 | """ 920 | iterable = _iterable_from_argument(arg) 921 | super(IntCompletionDAWG, self).__init__(iterable, input_is_sorted) 922 | 923 | def __getitem__(self, key): 924 | cdef int res = self.get(key, LOOKUP_ERROR) 925 | if res == LOOKUP_ERROR: 926 | raise KeyError(key) 927 | return res 928 | 929 | cpdef get(self, key, default=None): 930 | """ 931 | Return value for the given key or ``default`` if the key is not found. 
932 | """ 933 | cdef int res 934 | 935 | if isinstance(key, unicode): 936 | res = self.get_value(key) 937 | else: 938 | res = self.b_get_value(key) 939 | 940 | if res == LOOKUP_ERROR: 941 | return default 942 | return res 943 | 944 | cpdef int get_value(self, unicode key): 945 | cdef bytes b_key = key.encode('utf8') 946 | return self.dct.Find(b_key) 947 | 948 | cpdef int b_get_value(self, bytes key): 949 | return self.dct.Find(key) 950 | 951 | cpdef list items(self, unicode prefix=""): 952 | cdef bytes b_prefix = prefix.encode('utf8') 953 | cdef BaseType index = self.dct.root() 954 | cdef list res = [] 955 | cdef int value 956 | 957 | if not self.dct.Follow(b_prefix, &index): 958 | return res 959 | 960 | cdef Completer completer 961 | init_completer(completer, self.dct, self.guide) 962 | completer.Start(index, b_prefix) 963 | 964 | while completer.Next(): 965 | key = (completer.key()).decode('utf8') 966 | value = completer.value() 967 | res.append((key, value)) 968 | 969 | return res 970 | 971 | def iteritems(self, unicode prefix=""): 972 | cdef bytes b_prefix = prefix.encode('utf8') 973 | cdef BaseType index = self.dct.root() 974 | cdef int value 975 | 976 | if not self.dct.Follow(b_prefix, &index): 977 | return 978 | 979 | cdef Completer completer 980 | init_completer(completer, self.dct, self.guide) 981 | completer.Start(index, b_prefix) 982 | 983 | while completer.Next(): 984 | key = (completer.key()).decode('utf8') 985 | value = completer.value() 986 | yield key, value 987 | -------------------------------------------------------------------------------- /src/iostream.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | from libcpp cimport bool 3 | 4 | cdef extern from "" namespace "std" nogil: 5 | cdef cppclass istream: 6 | istream() except + 7 | istream& read (char* s, int n) except + 8 | 9 | cdef cppclass ostream: 10 | ostream() except + 11 | ostream& write (char* s, int n) except + 
12 | 13 | cdef extern from "" namespace "std" nogil: 14 | cdef cppclass ifstream: 15 | ifstream() except + 16 | istream(char* filename) except + 17 | istream(char* filename, int mode) except + 18 | 19 | bool fail() except + 20 | 21 | void open(char* filename) except + 22 | void open(char* filename, int mode) except + 23 | void close() except + 24 | 25 | ifstream& read (char* s, int n) except + 26 | 27 | 28 | cdef extern from "" namespace "std": 29 | 30 | cdef cppclass stringstream: 31 | stringstream() 32 | stringstream(string s) 33 | stringstream(string s, int options) 34 | string str () 35 | stringstream& write (char* s, int n) 36 | stringstream& seekg (int pos) 37 | 38 | 39 | cdef extern from "" namespace "std::stringstream": 40 | 41 | # int in 42 | int out 43 | int binary 44 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import -------------------------------------------------------------------------------- /tests/test_dawg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | import pickle 4 | import tempfile 5 | from io import BytesIO 6 | 7 | import pytest 8 | import dawg 9 | 10 | def test_contains(): 11 | d = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3}) 12 | 13 | assert 'foo' in d 14 | assert 'bar' in d 15 | assert 'foobar' in d 16 | assert 'fo' not in d 17 | assert 'x' not in d 18 | 19 | assert b'foo' in d 20 | assert b'x' not in d 21 | 22 | 23 | class TestDAWG(object): 24 | 25 | def test_sorted_iterable(self): 26 | 27 | sorted_data = ['bar', 'foo', 'foobar'] 28 | contents = "\n".join(sorted_data).encode('utf8') 29 | with tempfile.NamedTemporaryFile() as f: 30 | f.write(contents) 31 | f.seek(0) 32 | 33 | words = (line.strip() for line 
in f) 34 | d = dawg.DAWG(words, input_is_sorted=True) 35 | 36 | assert 'bar' in d 37 | assert 'foo' in d 38 | 39 | def test_no_segfaults_on_invalid_file(self): 40 | d = dawg.DAWG() 41 | fd, path = tempfile.mkstemp() 42 | with open(path, 'w') as f: 43 | f.write('foo') 44 | 45 | with pytest.raises(IOError) as e: 46 | d.load(path) 47 | assert 'Invalid' in e.args[0] 48 | 49 | with open(path, 'rb') as f: 50 | with pytest.raises(IOError) as e: 51 | d.read(f) 52 | assert 'Invalid' in e.args[0] 53 | 54 | def test_no_segfaults_after_wrong_stream(self): 55 | d = dawg.DAWG() 56 | wrong_path = tempfile.mktemp() # file doesn't exist 57 | 58 | with pytest.raises(IOError): 59 | d.load(wrong_path) 60 | 61 | assert 'random-key' not in d # a segfault was possible here 62 | 63 | def test_build_errors(self): 64 | with pytest.raises(dawg.Error): 65 | data = [b'foo\x00bar', b'bar'] 66 | dawg.DAWG(data) 67 | 68 | def test_contains_with_null_bytes(self): 69 | d = dawg.DAWG(['foo']) 70 | assert b'foo' in d 71 | assert b'foo\x00bar' not in d 72 | 73 | def test_unicode_sorting(self): 74 | key1 = '\U00010345\U0001033f\U00010337\U00010330\U0001033d' 75 | key2 = '\uff72\uff9c\uff90\uff7b\uff9e\uff9c' 76 | 77 | # This apparently depends on Python version: 78 | # assert key1 < key2 79 | # assert key1.encode('utf8') > key2.encode('utf8') 80 | 81 | # Constructor should sort data according to utf8 values, 82 | # not according to unicode sorting rules. It will raise an exception 83 | # if data is sorted according to unicode rules. 
84 | dawg.DAWG([key1, key2]) 85 | 86 | 87 | 88 | class TestIntDAWG(object): 89 | 90 | IntDAWG = dawg.IntDAWG 91 | 92 | def dawg(self): 93 | payload = {'foo': 1, 'bar': 5, 'foobar': 3} 94 | d = self.IntDAWG(payload) 95 | return payload, d 96 | 97 | def test_getitem(self): 98 | payload, d = self.dawg() 99 | for key in payload: 100 | assert d[key] == payload[key] 101 | 102 | with pytest.raises(KeyError): 103 | d['fo'] 104 | 105 | 106 | def test_dumps_loads(self): 107 | payload, d = self.dawg() 108 | data = d.tobytes() 109 | 110 | d2 = self.IntDAWG() 111 | d2.frombytes(data) 112 | for key, value in payload.items(): 113 | assert key in d2 114 | assert d2[key] == value 115 | 116 | def test_dump_load(self): 117 | payload, _ = self.dawg() 118 | 119 | buf = BytesIO() 120 | self.IntDAWG(payload).write(buf) 121 | buf.seek(0) 122 | 123 | d = self.IntDAWG() 124 | d.read(buf) 125 | 126 | for key, value in payload.items(): 127 | assert key in d 128 | assert d[key] == value 129 | 130 | def test_pickling(self): 131 | payload, d = self.dawg() 132 | 133 | data = pickle.dumps(d) 134 | d2 = pickle.loads(data) 135 | 136 | for key, value in payload.items(): 137 | assert key in d2 138 | assert d[key] == value 139 | 140 | def test_int_value_ranges(self): 141 | for val in [0, 5, 2**16-1, 2**31-1]: 142 | d = self.IntDAWG({'f': val}) 143 | assert d['f'] == val 144 | 145 | with pytest.raises(ValueError): 146 | self.IntDAWG({'f': -1}) 147 | 148 | with pytest.raises(OverflowError): 149 | self.IntDAWG({'f': 2**32-1}) 150 | 151 | 152 | class TestIntCompletionDAWG(TestIntDAWG): 153 | IntDAWG = dawg.IntCompletionDAWG # checks that all tests for IntDAWG pass 154 | 155 | 156 | class TestCompletionDAWG(object): 157 | keys = ['f', 'bar', 'foo', 'foobar'] 158 | 159 | def dawg(self): 160 | return dawg.CompletionDAWG(self.keys) 161 | 162 | def empty_dawg(self): 163 | return dawg.CompletionDAWG() 164 | 165 | def test_contains(self): 166 | d = self.dawg() 167 | for key in self.keys: 168 | assert key in d 169 
| 170 | def test_keys(self): 171 | d = self.dawg() 172 | assert d.keys() == sorted(self.keys) 173 | 174 | def test_iterkeys(self): 175 | d = self.dawg() 176 | assert list(d.iterkeys()) == sorted(self.keys) 177 | assert list(d.iterkeys()) == d.keys() 178 | 179 | def test_prefixes(self): 180 | d = self.dawg() 181 | assert d.prefixes("foobarz") == ["f", "foo", "foobar"] 182 | assert d.prefixes("x") == [] 183 | assert d.prefixes("bar") == ["bar"] 184 | 185 | def test_b_prefixes(self): 186 | d = self.dawg() 187 | assert d.b_prefixes(b"foobarz") == [b"f", b"foo", b"foobar"] 188 | assert d.b_prefixes(b"x") == [] 189 | assert d.b_prefixes(b"bar") == [b"bar"] 190 | 191 | def test_iterprefixes(self): 192 | d = self.dawg() 193 | assert list(d.iterprefixes("foobarz")) == d.prefixes("foobarz") 194 | assert list(d.iterprefixes("x")) == d.prefixes("x") 195 | assert list(d.iterprefixes("bar")) == d.prefixes("bar") 196 | 197 | def test_completion(self): 198 | d = self.dawg() 199 | 200 | assert d.keys('z') == [] 201 | assert d.keys('b') == ['bar'] 202 | assert d.keys('foo') == ['foo', 'foobar'] 203 | 204 | def test_has_keys_with_prefix(self): 205 | assert self.empty_dawg().has_keys_with_prefix('') == False 206 | 207 | d = self.dawg() 208 | assert d.has_keys_with_prefix('') == True 209 | assert d.has_keys_with_prefix('b') == True 210 | assert d.has_keys_with_prefix('fo') == True 211 | assert d.has_keys_with_prefix('bo') == False 212 | 213 | def test_completion_dawg_saveload(self): 214 | buf = BytesIO() 215 | self.dawg().write(buf) 216 | buf.seek(0) 217 | 218 | d = self.empty_dawg() 219 | d.read(buf) 220 | 221 | for key in self.keys: 222 | assert key in d 223 | 224 | assert d.keys('foo') == ['foo', 'foobar'] 225 | assert d.keys('b') == ['bar'] 226 | assert d.keys('z') == [] 227 | 228 | def test_no_segfaults_on_invalid_file(self): 229 | d = self.dawg() 230 | fd, path = tempfile.mkstemp() 231 | with open(path, 'w') as f: 232 | f.write('foo') 233 | 234 | with pytest.raises(IOError) as e: 
235 | d.load(path) 236 | assert "can't load _dawg.Dictionary" in e.args[0] 237 | 238 | def test_no_segfaults_on_empty_dawg(self): 239 | d = dawg.CompletionDAWG([]) 240 | assert d.keys() == [] 241 | 242 | 243 | class TestIntCompletionDAWGComplete(TestCompletionDAWG): 244 | keys = ['f', 'bar', 'foo', 'foobar'] 245 | 246 | def dawg(self): 247 | return dawg.IntCompletionDAWG((k, len(k)) for k in self.keys) 248 | 249 | def empty_dawg(self): 250 | return dawg.IntCompletionDAWG() 251 | 252 | def test_no_segfaults_on_empty_dawg(self): 253 | d = dawg.IntCompletionDAWG([]) 254 | assert d.keys() == [] 255 | 256 | def test_items(self): 257 | d = self.dawg() 258 | items = d.items() 259 | assert isinstance(items, list) 260 | for key, value in items: 261 | assert len(key) == value 262 | 263 | def test_iteritems(self): 264 | d = self.dawg() 265 | for key, value in d.iteritems(): 266 | assert len(key) == value 267 | 268 | def test_items_prefix(self): 269 | d = self.dawg() 270 | assert d.items('fo') == [('foo', 3), ('foobar', 6)] 271 | -------------------------------------------------------------------------------- /tests/test_payload_dawg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | 4 | import pytest 5 | import dawg 6 | 7 | class TestBytesDAWG(object): 8 | 9 | DATA = ( 10 | ('foo', b'data3'), 11 | ('bar', b'data2'), 12 | ('foo', b'data1'), 13 | ('foobar', b'data4') 14 | ) 15 | 16 | DATA_KEYS = list(zip(*DATA))[0] 17 | 18 | def dawg(self, **kwargs): 19 | return dawg.BytesDAWG(self.DATA, **kwargs) 20 | 21 | def test_contains(self): 22 | d = self.dawg() 23 | for key, val in self.DATA: 24 | assert key in d 25 | 26 | assert 'food' not in d 27 | assert 'x' not in d 28 | assert 'fo' not in d 29 | 30 | 31 | def test_getitem(self): 32 | d = self.dawg() 33 | 34 | assert d['foo'] == [b'data1', b'data3'] 35 | assert d['bar'] == [b'data2'] 36 | assert d['foobar'] 
== [b'data4'] 37 | 38 | with pytest.raises(KeyError): 39 | d['f'] 40 | 41 | with pytest.raises(KeyError): 42 | d['food'] 43 | 44 | with pytest.raises(KeyError): 45 | d['foobarz'] 46 | 47 | with pytest.raises(KeyError): 48 | d['x'] 49 | 50 | def test_prefixes(self): 51 | d = self.dawg() 52 | assert d.prefixes("foobarz") == ["foo", "foobar"] 53 | assert d.prefixes("x") == [] 54 | assert d.prefixes("bar") == ["bar"] 55 | 56 | def test_keys(self): 57 | d = self.dawg() 58 | assert d.keys() == sorted(self.DATA_KEYS) 59 | 60 | def test_keys_ordering(self): 61 | data = [('foo', b'v1'), ('foobar', b'v2'), ('bar', b'v3')] 62 | 63 | d = dawg.BytesDAWG(data, payload_separator=b'\xff') 64 | assert d.keys() == ['bar', 'foobar', 'foo'] 65 | 66 | d2 = dawg.BytesDAWG(data, payload_separator=b'\x01') 67 | assert d2.keys() == ['bar', 'foo', 'foobar'] 68 | 69 | def test_iterkeys(self): 70 | d = self.dawg() 71 | assert list(d.iterkeys()) == d.keys() 72 | assert list(d.iterkeys()) == sorted(self.DATA_KEYS) 73 | 74 | def test_items(self): 75 | d = self.dawg() 76 | assert d.items() == sorted(self.DATA) 77 | 78 | def test_iteritems(self): 79 | d = self.dawg() 80 | assert list(d.iteritems()) == d.items() 81 | 82 | def test_build_error(self): 83 | with pytest.raises(dawg.Error): 84 | self.dawg(payload_separator=b'f') 85 | 86 | 87 | 88 | class TestRecordDAWG(object): 89 | 90 | STRUCTURED_DATA = ( 91 | ('foo', (3, 2, 256)), 92 | ('bar', (3, 1, 0)), 93 | ('foo', (3, 2, 1)), 94 | ('foobar', (6, 3, 0)) 95 | ) 96 | 97 | def dawg(self): 98 | return dawg.RecordDAWG(">3H", self.STRUCTURED_DATA) 99 | 100 | def test_record_getitem(self): 101 | d = self.dawg() 102 | assert d['foo'] == [(3, 2, 1), (3, 2, 256)] 103 | assert d['bar'] == [(3, 1, 0)] 104 | assert d['foobar'] == [(6, 3, 0)] 105 | 106 | def test_record_items(self): 107 | d = self.dawg() 108 | assert d.items() == sorted(self.STRUCTURED_DATA) 109 | 110 | def test_record_keys(self): 111 | d = self.dawg() 112 | assert d.keys() == ['bar', 'foo', 
'foo', 'foobar',] 113 | 114 | def test_record_iterkeys(self): 115 | d = self.dawg() 116 | assert list(d.iterkeys()) == d.keys() 117 | 118 | def test_record_iteritems(self): 119 | d = self.dawg() 120 | assert list(d.iteritems()) == d.items() 121 | 122 | def test_record_keys_prefix(self): 123 | d = self.dawg() 124 | assert d.keys('fo') == ['foo', 'foo', 'foobar'] 125 | assert d.keys('bar') == ['bar'] 126 | assert d.keys('barz') == [] 127 | 128 | def test_prefixes(self): 129 | d = self.dawg() 130 | assert d.prefixes("foobarz") == ["foo", "foobar"] 131 | assert d.prefixes("x") == [] 132 | assert d.prefixes("bar") == ["bar"] 133 | -------------------------------------------------------------------------------- /tests/test_prediction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | import pytest 4 | import dawg 5 | 6 | class TestPrediction(object): 7 | DATA = ['ЁЖИК', 'ЁЖИКЕ', 'ЁЖ', 'ДЕРЕВНЯ', 'ДЕРЁВНЯ', 'ЕМ', 'ОЗЕРА', 'ОЗЁРА', 'ОЗЕРО'] 8 | LENGTH_DATA = list(zip(DATA, ((len(w),) for w in DATA))) 9 | 10 | REPLACES = dawg.DAWG.compile_replaces({'Е': 'Ё'}) 11 | 12 | SUITE = [ 13 | ('УЖ', []), 14 | ('ЕМ', ['ЕМ']), 15 | ('ЁМ', []), 16 | ('ЁЖ', ['ЁЖ']), 17 | ('ЕЖ', ['ЁЖ']), 18 | ('ЁЖИК', ['ЁЖИК']), 19 | ('ЕЖИКЕ', ['ЁЖИКЕ']), 20 | ('ДЕРЕВНЯ', ['ДЕРЕВНЯ', 'ДЕРЁВНЯ']), 21 | ('ДЕРЁВНЯ', ['ДЕРЁВНЯ']), 22 | ('ОЗЕРА', ['ОЗЕРА', 'ОЗЁРА']), 23 | ('ОЗЕРО', ['ОЗЕРО']), 24 | ] 25 | 26 | SUITE_ITEMS = [ 27 | ( 28 | it[0], # key 29 | [ 30 | (w, [(len(w),)]) # item, value pair 31 | for w in it[1] 32 | ] 33 | ) 34 | for it in SUITE 35 | ] 36 | 37 | SUITE_VALUES = [ 38 | ( 39 | it[0], # key 40 | [[(len(w),)] for w in it[1]] 41 | ) 42 | for it in SUITE 43 | ] 44 | 45 | 46 | @pytest.mark.parametrize(("word", "prediction"), SUITE) 47 | def test_dawg_prediction(self, word, prediction): 48 | d = dawg.DAWG(self.DATA) 49 | assert d.similar_keys(word, self.REPLACES) == 
prediction 50 | 51 | @pytest.mark.parametrize(("word", "prediction"), SUITE) 52 | def test_record_dawg_prediction(self, word, prediction): 53 | d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA) 54 | assert d.similar_keys(word, self.REPLACES) == prediction 55 | 56 | @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS) 57 | def test_record_dawg_items(self, word, prediction): 58 | d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA) 59 | assert d.similar_items(word, self.REPLACES) == prediction 60 | 61 | @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES) 62 | def test_record_dawg_items_values(self, word, prediction): 63 | d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA) 64 | assert d.similar_item_values(word, self.REPLACES) == prediction 65 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py35,py35-locale,py36,py37,py38 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | commands= 8 | python setup.py install 9 | py.test [] 10 | 11 | [testenv:py35-locale] 12 | basepython = python3.5 13 | setenv = 14 | LC_ALL=C 15 | -------------------------------------------------------------------------------- /update_cpp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cython src/*.pyx src/*.pxd -a --cplus -2 3 | --------------------------------------------------------------------------------