├── .coveragerc
├── .gitignore
├── .hgignore
├── .hgtags
├── .travis.yml
├── CHANGES.rst
├── LICENSE
├── MANIFEST.in
├── README.rst
├── _prepare_dev_data.py
├── bench.ini
├── bench
    ├── __init__.py
    ├── speed.py
    └── utils.py
├── dawg_python
    ├── __init__.py
    ├── compat.py
    ├── dawgs.py
    ├── units.py
    └── wrapper.py
├── dev_data
    ├── large
    │   ├── bytes_dawg.dawg
    │   ├── dawg.dawg
    │   ├── int_dawg.dawg
    │   └── record_dawg.dawg
    ├── small
    │   ├── bytes.dawg
    │   ├── completion-empty.dawg
    │   ├── completion.dawg
    │   ├── int_completion_dawg.dawg
    │   ├── int_dawg.dawg
    │   ├── prediction-record.dawg
    │   ├── prediction.dawg
    │   └── record.dawg
    └── words100k.txt.zip
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── test_dawg.py
    ├── test_fuzzy.py
    ├── test_payload_dawg.py
    ├── test_prediction.py
    └── utils.py
└── tox.ini


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | build/
 2 | MANIFEST
 3 | dist/
 4 | DAWG_Python.egg-info/
 5 | 
 6 | ^stuff/
 7 | *.pyc
 8 | .tox/
 9 | *.orig
10 | 


--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
 1 | ^build
 2 | ^MANIFEST$
 3 | ^dist
 4 | egg-info/
 5 | \.so$
 6 | \.o$
 7 | \.lo$
 8 | 
 9 | \.svn
10 | \.cvsignore
11 | 
12 | ^src/.*\.html$
13 | 
14 | ^stuff/
15 | \.rej$
16 | \.pyc$
17 | ^.tox
18 | \.orig$
19 | \.prof$
20 | \.coverage$
21 | 


--------------------------------------------------------------------------------
/.hgtags:
--------------------------------------------------------------------------------
 1 | 94b9f5fce67517f39370ffecb122b9110e26e3bc 0.1
 2 | 51dca297194c692b1a31e4017138819bf13838f7 0.2
 3 | 8098a78df797f5e44a41a19c1eef356b37f9a685 0.3
 4 | b5416eb9b17bd2e08e4c388ac72771e57480fd61 0.3.1
 5 | c78b8b58ad02b221c383af80bca42a229db780a9 0.5
 6 | 91cc324f4faf742c1c804a1cc5ccf3a5bf8a26ca 0.5.1
 7 | 5e92658f5e6d68677f0218a71d3281d76614d4e0 0.6
 8 | fac5235bf297ae2fcceb36a7fa1c399b5e3543a3 0.7
 9 | 7458eede668c62fe00e6c12d8b064f14b8ddd4a1 0.7.1
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.4"
 4 |   - "3.3"
 5 |   - "3.2"
 6 |   - "2.7"
 7 |   - "2.6"
 8 |   - "pypy"
 9 | 
10 | install:
11 |   - if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install dawg; fi
12 |   - pip install coverage pytest-cov coveralls
13 |   - python setup.py install
14 | 
15 | # command to run tests, e.g. python setup.py test
16 | script:
17 |   - py.test --cov=dawg_python
18 | 
19 | after_success:
20 |   - coveralls
21 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Changes
 3 | =======
 4 | 
 5 | 0.7.2 (2015-04-18)
 6 | ------------------
 7 | 
 8 | - minor speedup;
 9 | - bitbucket mirror is no longer maintained.
10 | 
11 | 0.7.1 (2014-06-05)
12 | ------------------
13 | 
14 | - Switch to setuptools;
15 | - upload wheel to pypi;
16 | - check Python 3.4 compatibility.
17 | 
18 | 0.7 (2013-10-13)
19 | ----------------
20 | 
21 | IntDAWG and IntCompletionDAWG are implemented.
22 | 
23 | 0.6 (2013-03-23)
24 | ----------------
25 | 
26 | Use less shared state internally. This should fix thread-safety bugs and
27 | make iterkeys/iteritems reentrant.
28 | 
29 | 0.5.1 (2013-03-01)
30 | ------------------
31 | 
32 | Internal tweaks: memory usage is reduced; something is a bit faster,
33 | something is a bit slower.
34 | 
35 | 0.5 (2012-10-08)
36 | ----------------
37 | 
38 | Storage scheme is updated to match DAWG==0.5. This enables
39 | the alphabetical ordering of ``BytesDAWG`` and ``RecordDAWG`` items.
40 | 
41 | In order to read ``BytesDAWG`` or ``RecordDAWG`` created with
42 | versions of DAWG < 0.5 use ``payload_separator`` constructor argument::
43 | 
44 |     >>> BytesDAWG(payload_separator=b'\xff').load('old.dawg')
45 | 
46 | 
47 | 0.3.1 (2012-10-01)
48 | ------------------
49 | 
50 | Bug with empty DAWGs is fixed.
51 | 
52 | 0.3 (2012-09-26)
53 | ----------------
54 | 
55 | - ``iterkeys`` and ``iteritems`` methods.
56 | 
57 | 0.2 (2012-09-24)
58 | ----------------
59 | 
60 | ``prefixes`` support.
61 | 
62 | 0.1 (2012-09-20)
63 | ----------------
64 | 
65 | Initial release.
66 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) Mikhail Korobov, 2012
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is furnished
 8 | to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR
15 | A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
16 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
17 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
18 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include CHANGES.rst
3 | include LICENSE
4 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | DAWG-Python
  2 | ===========
  3 | 
  4 | .. image:: https://travis-ci.org/kmike/DAWG-Python.png?branch=master
  5 |     :target: https://travis-ci.org/kmike/DAWG-Python
  6 | .. image:: https://coveralls.io/repos/kmike/DAWG-Python/badge.png?branch=master
  7 |     :target: https://coveralls.io/r/kmike/DAWG-Python
  8 | 
  9 | 
 10 | This pure-python package provides read-only access for files
 11 | created by `dawgdic`_ C++ library and `DAWG`_ python package.
 12 | 
 13 | .. _dawgdic: https://code.google.com/p/dawgdic/
 14 | .. _DAWG: https://github.com/kmike/DAWG
 15 | 
 16 | This package is not capable of creating DAWGs. It works with DAWGs built by
 17 | `dawgdic`_ C++ library or `DAWG`_ Python extension module. The main purpose
 18 | of DAWG-Python is to provide access to DAWGs without requiring compiled
 19 | extensions. It is also quite fast under PyPy (see benchmarks).
 20 | 
 21 | Installation
 22 | ============
 23 | 
 24 | pip install DAWG-Python
 25 | 
 26 | Usage
 27 | =====
 28 | 
 29 | The aim of DAWG-Python is to be API- and binary-compatible
 30 | with `DAWG`_ when it is possible.
 31 | 
 32 | First, you have to create a dawg using DAWG_ module::
 33 | 
 34 |     import dawg
 35 |     d = dawg.DAWG(data)
 36 |     d.save('words.dawg')
 37 | 
 38 | And then this dawg can be loaded without requiring C extensions::
 39 | 
 40 |     import dawg_python
 41 |     d = dawg_python.DAWG().load('words.dawg')
 42 | 
 43 | Please consult `DAWG`_ docs for detailed usage. Some features
 44 | (like constructor parameters or ``save`` method) are intentionally
 45 | unsupported.
 46 | 
 47 | Benchmarks
 48 | ==========
 49 | 
 50 | Benchmark results (100k unicode words, integer values (lengths of the words),
 51 | PyPy 1.9, MacBook Air i5 1.8 GHz)::
 52 | 
 53 |     dict __getitem__ (hits):        11.090M ops/sec
 54 |     DAWG __getitem__ (hits):        not supported
 55 |     BytesDAWG __getitem__ (hits):   0.493M ops/sec
 56 |     RecordDAWG __getitem__ (hits):  0.376M ops/sec
 57 | 
 58 |     dict get() (hits):              10.127M ops/sec
 59 |     DAWG get() (hits):              not supported
 60 |     BytesDAWG get() (hits):         0.481M ops/sec
 61 |     RecordDAWG get() (hits):        0.402M ops/sec
 62 |     dict get() (misses):            14.885M ops/sec
 63 |     DAWG get() (misses):            not supported
 64 |     BytesDAWG get() (misses):       1.259M ops/sec
 65 |     RecordDAWG get() (misses):      1.337M ops/sec
 66 | 
 67 |     dict __contains__ (hits):           11.100M ops/sec
 68 |     DAWG __contains__ (hits):           1.317M ops/sec
 69 |     BytesDAWG __contains__ (hits):      1.107M ops/sec
 70 |     RecordDAWG __contains__ (hits):     1.095M ops/sec
 71 | 
 72 |     dict __contains__ (misses):         10.567M ops/sec
 73 |     DAWG __contains__ (misses):         1.902M ops/sec
 74 |     BytesDAWG __contains__ (misses):    1.873M ops/sec
 75 |     RecordDAWG __contains__ (misses):   1.862M ops/sec
 76 | 
 77 |     dict items():           44.401 ops/sec
 78 |     DAWG items():           not supported
 79 |     BytesDAWG items():      3.226 ops/sec
 80 |     RecordDAWG items():     2.987 ops/sec
 81 |     dict keys():            426.250 ops/sec
 82 |     DAWG keys():            not supported
 83 |     BytesDAWG keys():       6.050 ops/sec
 84 |     RecordDAWG keys():      6.363 ops/sec
 85 | 
 86 |     DAWG.prefixes (hits):    0.756M ops/sec
 87 |     DAWG.prefixes (mixed):   1.965M ops/sec
 88 |     DAWG.prefixes (misses):  1.773M ops/sec
 89 | 
 90 |     RecordDAWG.keys(prefix="xxx"), avg_len(res)==415:       1.429K ops/sec
 91 |     RecordDAWG.keys(prefix="xxxxx"), avg_len(res)==17:      36.994K ops/sec
 92 |     RecordDAWG.keys(prefix="xxxxxxxx"), avg_len(res)==3:    121.897K ops/sec
 93 |     RecordDAWG.keys(prefix="xxxxx..xx"), avg_len(res)==1.4: 265.015K ops/sec
 94 |     RecordDAWG.keys(prefix="xxx"), NON_EXISTING:            2450.898K ops/sec
 95 | 
 96 | Under CPython expect it to be about 50x slower.
 97 | Memory consumption of DAWG-Python should be the same as of `DAWG`_.
 98 | 
 99 | .. _marisa-trie: https://github.com/kmike/marisa-trie
100 | 
101 | Current limitations
102 | ===================
103 | 
104 | * This package is not capable of creating DAWGs;
105 | * all the limitations of `DAWG`_ apply.
106 | 
107 | Contributions are welcome!
108 | 
109 | 
110 | Contributing
111 | ============
112 | 
113 | Development happens at github: https://github.com/kmike/DAWG-Python
114 | Issue tracker: https://github.com/kmike/DAWG-Python/issues
115 | 
116 | Feel free to submit ideas, bugs or pull requests.
117 | 
118 | Running tests and benchmarks
119 | ----------------------------
120 | 
121 | Make sure `tox`_ is installed and run
122 | 
123 | ::
124 | 
125 |     $ tox
126 | 
127 | from the source checkout. Tests should pass under python 2.6, 2.7, 3.2, 3.3,
128 | 3.4 and PyPy >= 1.9.
129 | 
130 | In order to run benchmarks, type
131 | 
132 | ::
133 | 
134 |     $ tox -c bench.ini -e pypy
135 | 
136 | This runs benchmarks under PyPy (they are about 50x slower under CPython).
137 | 
138 | .. _tox: http://tox.testrun.org
139 | 
140 | Authors & Contributors
141 | ----------------------
142 | 
143 | * Mikhail Korobov <kmike84@gmail.com>
144 | 
145 | The algorithms are from `dawgdic`_ C++ library by Susumu Yata & contributors.
146 | 
147 | License
148 | =======
149 | 
150 | This package is licensed under MIT License.
151 | 


--------------------------------------------------------------------------------
/_prepare_dev_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Script for building test DAWGs.
 5 | """
 6 | from __future__ import absolute_import, unicode_literals
 7 | import dawg
 8 | import os
 9 | import sys
10 | import struct
11 | 
12 | sys.path.insert(0, os.path.dirname(__file__))
13 | 
14 | from bench.utils import words100k
15 | from tests.test_prediction import TestPrediction
16 | 
def create_dawg():
    """Build a plain ``dawg.DAWG`` from the ``words100k()`` word list."""
    return dawg.DAWG(words100k())
20 | 
def create_bytes_dawg():
    """
    Build a ``dawg.BytesDAWG`` mapping each word of ``words100k()`` to its
    length packed with the ``'<H'`` struct format.
    """
    fmt = str('<H')
    word_list = words100k()
    payloads = [struct.pack(fmt, len(w)) for w in word_list]
    return dawg.BytesDAWG(zip(word_list, payloads))
25 | 
def create_record_dawg():
    """
    Build a ``dawg.RecordDAWG`` (format ``'<H'``) mapping each word of
    ``words100k()`` to a one-element record holding its length.
    """
    word_list = words100k()
    records = [[len(w)] for w in word_list]
    return dawg.RecordDAWG(str('<H'), zip(word_list, records))
30 | 
def create_int_dawg():
    """Build a ``dawg.IntDAWG`` mapping each word of ``words100k()`` to its length."""
    word_list = words100k()
    lengths = [len(w) for w in word_list]
    return dawg.IntDAWG(zip(word_list, lengths))
35 | 
def create_int_completion_dawg():
    """
    Build a ``dawg.IntCompletionDAWG`` mapping each word of ``words100k()``
    to its length.
    """
    word_list = words100k()
    lengths = [len(w) for w in word_list]
    return dawg.IntCompletionDAWG(zip(word_list, lengths))
40 | 
def build_test_data():
    """
    Build every DAWG file under ``dev_data/small`` and ``dev_data/large``.

    The paths are relative, so the script has to be started from the
    repository root. Requires the ``dawg`` extension package.
    """

    # Completion DAWGs: a small key set and an empty one.
    dawg.CompletionDAWG(['f', 'bar', 'foo', 'foobar']).save('dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    # 'foo' is listed twice on purpose: one key with several payloads.
    bytes_data =  (
        ('foo', b'data1'),
        ('bar', b'data2'),
        ('foo', b'data3'),
        ('foobar', b'data4')
    )
    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')

    # Same duplicated-key layout, with three-number records as values.
    record_data = (
        ('foo',     (3, 2, 256)),
        ('bar',     (3, 1, 0)),
        ('foo',     (3, 2, 1)),
        ('foobar',  (6, 3, 0))
    )
    # str() is presumably there to pass a native string as the struct
    # format despite ``unicode_literals`` -- TODO confirm.
    dawg.RecordDAWG(str(">3H"), record_data).save('dev_data/small/record.dawg')

    # The same integer mapping is saved both as IntDAWG and IntCompletionDAWG.
    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save('dev_data/small/int_completion_dawg.dawg')

    # Data for the prediction tests: keys come from TestPrediction.DATA,
    # the record variant stores each key's length.
    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    dawg.RecordDAWG(str("=H"), [(k, (len(k),)) for k in TestPrediction.DATA]).save('dev_data/small/prediction-record.dawg')

    # Large DAWGs built from the 100k word list.
    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
    #create_int_completion_dawg().save('dev_data/large/int_completion_dawg.dawg')
74 | 
75 | 
# Rebuild all dev data files when executed as a script.
if __name__ == '__main__':
    build_test_data()
78 | 


--------------------------------------------------------------------------------
/bench.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py26,py27,py32,py33
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     pytest
 7 |     dawg
 8 | commands=
 9 |     python bench/speed.py
10 | 
11 | [testenv:pypy]
12 | deps =
13 |     pytest


--------------------------------------------------------------------------------
/bench/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import


--------------------------------------------------------------------------------
/bench/speed.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | from __future__ import absolute_import, unicode_literals, division
  4 | import os
  5 | import sys
  6 | import random
  7 | import string
  8 | import timeit
  9 | 
 10 | import dawg_python
 11 | 
 12 | sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
 13 | 
 14 | from utils import data_path, words100k
 15 | 
 16 | def random_words(num):
 17 |     russian = 'абвгдеёжзиклмнопрстуфхцчъыьэюя'
 18 |     alphabet = '%s%s' % (russian, string.ascii_letters)
 19 |     return [
 20 |         "".join([random.choice(alphabet) for x in range(random.randint(1,15))])
 21 |         for y in range(num)
 22 |     ]
 23 | 
 24 | def truncated_words(words):
 25 |     return [word[:3] for word in words]
 26 | 
 27 | def prefixes1k(words, prefix_len):
 28 |     words = [w for w in words if len(w) >= prefix_len]
 29 |     every_nth = int(len(words)/1000)
 30 |     _words = [w[:prefix_len] for w in words[::every_nth]]
 31 |     return _words[:1000]
 32 | 
# Benchmark inputs, built once when the module is loaded.
# WORDS100k: the word list returned by words100k().
WORDS100k = words100k()
# MIXED_WORDS100k: the same words cut to their first three characters.
MIXED_WORDS100k = truncated_words(WORDS100k)
# NON_WORDS100k: 100000 random strings (used for the "misses" benchmarks).
NON_WORDS100k = random_words(100000)
# PREFIXES_<n>_1k: up to 1000 prefixes of length <n> sampled from WORDS100k.
PREFIXES_3_1k = prefixes1k(WORDS100k, 3)
PREFIXES_5_1k = prefixes1k(WORDS100k, 5)
PREFIXES_8_1k = prefixes1k(WORDS100k, 8)
PREFIXES_15_1k = prefixes1k(WORDS100k, 15)
 40 | 
 41 | 
 42 | def format_result(key, value):
 43 |     print("%55s:    %s" % (key, value))
 44 | 
 45 | 
 46 | def bench(name, timer, descr='M ops/sec', op_count=0.1, repeats=3, runs=5):
 47 |     try:
 48 |         times = []
 49 |         for x in range(runs):
 50 |             times.append(timer.timeit(repeats))
 51 | 
 52 |         def op_time(time):
 53 |             return op_count*repeats / time
 54 | 
 55 |         val = "%0.3f%s" % (op_time(min(times)), descr)
 56 |         format_result(name, val)
 57 |     except (AttributeError, TypeError) as e:
 58 |         format_result(name, "not supported")
 59 |         #print(e)
 60 | 
 61 | def load_dawg():
 62 |     return dawg_python.DAWG().load(data_path('large', 'dawg.dawg'))
 63 | 
 64 | def load_bytes_dawg():
 65 |     return dawg_python.BytesDAWG().load(data_path('large', 'bytes_dawg.dawg'))
 66 | 
 67 | def load_record_dawg():
 68 |     return dawg_python.RecordDAWG(str('<H')).load(data_path('large', 'record_dawg.dawg'))
 69 | 
 70 | def load_int_dawg():
 71 |     return dawg_python.IntDAWG().load(data_path('large', 'int_dawg.dawg'))
 72 | 
def benchmark():
    """
    Run the whole benchmark suite, printing one line per measurement.

    First the generic mapping operations listed in ``tests`` are timed for
    every entry of ``structures`` (a plain ``dict`` and the DAWG classes
    returned by the ``load_*`` helpers). Then the DAWG-only operations
    (``prefixes`` and prefix-restricted ``keys``/``items``/``iterkeys``/
    ``iteritems``) are timed for every structure except ``dict``.
    Operations a structure does not provide show up as "not supported"
    (see ``bench``).
    """
    print('\n====== Benchmarks (100k unique unicode words) =======\n')

    # Each entry: (label, timed statement, unit suffix, op_count, repeats).
    # op_count is the number of operations one statement execution performs,
    # expressed in the unit of the suffix (0.1 -> 100k lookups, in millions).
    tests = [
        ('__getitem__ (hits)', "for word in WORDS100k: data[word]", 'M ops/sec', 0.1, 3),
        ('get() (hits)', "for word in WORDS100k: data.get(word)", 'M ops/sec', 0.1, 3),
        ('get() (misses)', "for word in NON_WORDS_10k: data.get(word)", 'M ops/sec', 0.01, 5),
        ('__contains__ (hits)', "for word in WORDS100k: word in data", 'M ops/sec', 0.1, 3),
        ('__contains__ (misses)', "for word in NON_WORDS100k: word in data", 'M ops/sec', 0.1, 3),
        ('items()', 'list(data.items())', ' ops/sec', 1, 1),
        ('keys()', 'list(data.keys())', ' ops/sec', 1, 1),
#        ('values()', 'list(data.values())', ' ops/sec', 1, 1),
    ]

    # Setup code shared by all timers; it pulls the inputs from __main__,
    # so this module has to be executed as a script.
    common_setup = """
from __main__ import load_dawg, load_bytes_dawg, load_record_dawg, load_int_dawg
from __main__ import WORDS100k, NON_WORDS100k, MIXED_WORDS100k
from __main__ import PREFIXES_3_1k, PREFIXES_5_1k, PREFIXES_8_1k, PREFIXES_15_1k
NON_WORDS_10k = NON_WORDS100k[:10000]
NON_WORDS_1k = ['ыва', 'xyz', 'соы', 'Axx', 'avы']*200
"""
    # Per-structure setup: the shared part plus a statement binding ``data``.
    dict_setup = common_setup + 'data = dict((word, len(word)) for word in WORDS100k);'
    dawg_setup = common_setup + 'data = load_dawg();'
    bytes_dawg_setup = common_setup + 'data = load_bytes_dawg();'
    record_dawg_setup = common_setup + 'data = load_record_dawg();'
    int_dawg_setup = common_setup + 'data = load_int_dawg();'

    structures = [
        ('dict', dict_setup),
        ('DAWG', dawg_setup),
        ('BytesDAWG', bytes_dawg_setup),
        ('RecordDAWG', record_dawg_setup),
        ('IntDAWG', int_dawg_setup),
    ]
    # Generic operations: every test against every structure.
    for test_name, test, descr, op_count, repeats in tests:
        for name, setup in structures:
            timer = timeit.Timer(test, setup)
            full_test_name = "%s %s" % (name, test_name)
            bench(full_test_name, timer, descr, op_count, repeats)


    # DAWG-specific benchmarks
    # structures[1:] skips the plain dict.
    for struct_name, setup in structures[1:]:
        # (label, name of the word list to iterate over)
        _bench_data = [
            ('hits', 'WORDS100k'),
            ('mixed', 'MIXED_WORDS100k'),
            ('misses', 'NON_WORDS100k'),
        ]

        for meth in ['prefixes']:
            for name, data in _bench_data:
                bench(
                    '%s.%s (%s)' % (struct_name, meth, name),
                    timeit.Timer(
                        "for word in %s:\n"
                        "   data.%s(word)" % (data, meth),
                        setup
                    ),
                    runs=3
                )

        # (prefix shape shown in the label, label note, name of the prefix list)
        _bench_data = [
            ('xxx', 'avg_len(res)==415', 'PREFIXES_3_1k'),
            ('xxxxx', 'avg_len(res)==17', 'PREFIXES_5_1k'),
            ('xxxxxxxx', 'avg_len(res)==3', 'PREFIXES_8_1k'),
            ('xxxxx..xx', 'avg_len(res)==1.4', 'PREFIXES_15_1k'),
            ('xxx', 'NON_EXISTING', 'NON_WORDS_1k'),
        ]
        for xxx, avg, data in _bench_data:
            # List-returning variants.
            for meth in ['keys', 'items']:
                bench(
                    '%s.%s(prefix="%s"), %s' % (struct_name, meth, xxx, avg),
                    timeit.Timer(
                        "for word in %s: data.%s(word)" % (data, meth),
                        setup
                    ),
                    'K ops/sec',
                    op_count=1,
                    runs=3
                )

            # Iterator variants; list() consumes them inside the timed code.
            for meth in ['iterkeys', 'iteritems']:
                bench(
                    '%s.%s(prefix="%s"), %s' % (struct_name, meth, xxx, avg),
                    timeit.Timer(
                        "for word in %s: list(data.%s(word))" % (data, meth),
                        setup
                    ),
                    'K ops/sec',
                    op_count=1,
                    runs=3
                )
165 | 
# Run the benchmarks when executed as a script (the timer setup code
# imports its inputs from ``__main__``).
if __name__ == '__main__':
    benchmark()
    #profiling()
    print('\n~~~~~~~~~~~~~~\n')


--------------------------------------------------------------------------------
/bench/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import
 3 | import os
 4 | import zipfile
 5 | 
# Location of the development data: the ``dev_data`` directory next to the
# directory that contains this file.
DEV_DATA_PATH = os.path.join(
    os.path.dirname(__file__),
    '..',
    'dev_data',
)
11 | 
def data_path(*args):
    """
    Join the given path components onto ``DEV_DATA_PATH`` and return
    the resulting path.
    """
    components = (DEV_DATA_PATH,) + args
    return os.path.join(*components)
17 | 
def words100k():
    """
    Return the word list stored in ``words100k.txt.zip`` in the dev data
    directory.

    The first member of the archive is read, decoded as UTF-8 and split
    into lines.
    """
    zf = zipfile.ZipFile(data_path('words100k.txt.zip'))
    # try/finally instead of ``with``: ZipFile is a context manager only
    # since Python 2.7, and the archive must be closed either way.
    try:
        txt = zf.read(zf.namelist()[0]).decode('utf8')
    finally:
        zf.close()
    return txt.splitlines()
23 | 


--------------------------------------------------------------------------------
/dawg_python/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from .wrapper import Dictionary
4 | from .dawgs import (DAWG, CompletionDAWG, BytesDAWG, RecordDAWG,
5 |                     IntDAWG, IntCompletionDAWG)
6 | 


--------------------------------------------------------------------------------
/dawg_python/compat.py:
--------------------------------------------------------------------------------
1 | import sys
# True when running under Python 3.
PY3 = sys.version_info[0] == 3

if not PY3:
    # Python 2: an element of a byte string is a 1-char str.
    int_from_byte = ord
else:
    def int_from_byte(b):
        """Return ``b`` unchanged: an element of ``bytes`` is already an int."""
        return b
9 | 


--------------------------------------------------------------------------------
/dawg_python/dawgs.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import, unicode_literals
  3 | 
  4 | import struct
  5 | from binascii import a2b_base64
  6 | 
  7 | from . import wrapper
  8 | from .compat import int_from_byte
  9 | 
 10 | class DAWG(object):
 11 |     """
 12 |     Base DAWG wrapper.
 13 |     """
 14 |     def __init__(self):
 15 |         self.dct = None
 16 | 
 17 |     def __contains__(self, key):
 18 |         if not isinstance(key, bytes):
 19 |             key = key.encode('utf8')
 20 |         return self.dct.contains(key)
 21 | 
 22 |     def load(self, path):
 23 |         """
 24 |         Loads DAWG from a file.
 25 |         """
 26 |         self.dct = wrapper.Dictionary.load(path)
 27 |         return self
 28 | 
 29 |     def _has_value(self, index):
 30 |         return self.dct.has_value(index)
 31 | 
 32 |     def _similar_keys(self, current_prefix, key, index, replace_chars):
 33 | 
 34 |         res = []
 35 |         start_pos = len(current_prefix)
 36 |         end_pos = len(key)
 37 |         word_pos = start_pos
 38 | 
 39 |         while word_pos < end_pos:
 40 |             b_step = key[word_pos].encode('utf8')
 41 | 
 42 |             if b_step in replace_chars:
 43 |                 next_index = index
 44 |                 b_replace_char, u_replace_char = replace_chars[b_step]
 45 | 
 46 |                 next_index = self.dct.follow_bytes(b_replace_char, next_index)
 47 | 
 48 |                 if next_index is not None:
 49 |                     prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
 50 |                     extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
 51 |                     res += extra_keys
 52 | 
 53 |             index = self.dct.follow_bytes(b_step, index)
 54 |             if index is None:
 55 |                 break
 56 |             word_pos += 1
 57 | 
 58 |         else:
 59 |             if self._has_value(index):
 60 |                 found_key = current_prefix + key[start_pos:]
 61 |                 res.insert(0, found_key)
 62 | 
 63 |         return res
 64 | 
 65 |     def similar_keys(self, key, replaces):
 66 |         """
 67 |         Returns all variants of ``key`` in this DAWG according to
 68 |         ``replaces``.
 69 | 
 70 |         ``replaces`` is an object obtained from
 71 |         ``DAWG.compile_replaces(mapping)`` where mapping is a dict
 72 |         that maps single-char unicode sitrings to another single-char
 73 |         unicode strings.
 74 | 
 75 |         This may be useful e.g. for handling single-character umlauts.
 76 |         """
 77 |         return self._similar_keys("", key, self.dct.ROOT, replaces)
 78 | 
 79 |     @classmethod
 80 |     def compile_replaces(cls, replaces):
 81 | 
 82 |         for k,v in replaces.items():
 83 |             if len(k) != 1 or len(v) != 1:
 84 |                 raise ValueError("Keys and values must be single-char unicode strings.")
 85 | 
 86 |         return dict(
 87 |             (
 88 |                 k.encode('utf8'),
 89 |                 (v.encode('utf8'), v)
 90 |             )
 91 |             for k, v in replaces.items()
 92 |         )
 93 | 
 94 |     def prefixes(self, key):
 95 |         '''
 96 |         Returns a list with keys of this DAWG that are prefixes of the ``key``.
 97 |         '''
 98 |         res = []
 99 |         index = self.dct.ROOT
100 |         if not isinstance(key, bytes):
101 |             key = key.encode('utf8')
102 | 
103 |         pos = 1
104 | 
105 |         for ch in key:
106 |             index = self.dct.follow_char(int_from_byte(ch), index)
107 |             if not index:
108 |                 break
109 | 
110 |             if self._has_value(index):
111 |                 res.append(key[:pos].decode('utf8'))
112 |             pos += 1
113 | 
114 |         return res
115 | 
116 | 
117 | 
class CompletionDAWG(DAWG):
    """
    DAWG that can also list its keys, optionally restricted to the keys
    starting with a given prefix.
    """

    def __init__(self):
        super(CompletionDAWG, self).__init__()
        # Set by ``load`` together with ``self.dct``.
        self.guide = None

    def keys(self, prefix=""):
        """
        Returns a list of the keys that start with ``prefix``.
        """
        b_prefix = prefix.encode('utf8')
        found = []

        start = self.dct.follow_bytes(b_prefix, self.dct.ROOT)
        if start is not None:
            completer = wrapper.Completer(self.dct, self.guide)
            completer.start(start, b_prefix)

            while completer.next():
                found.append(completer.key.decode('utf8'))

        return found

    def iterkeys(self, prefix=""):
        """
        Yields the keys that start with ``prefix`` one by one.
        """
        b_prefix = prefix.encode('utf8')
        start = self.dct.follow_bytes(b_prefix, self.dct.ROOT)

        if start is not None:
            completer = wrapper.Completer(self.dct, self.guide)
            completer.start(start, b_prefix)

            while completer.next():
                yield completer.key.decode('utf8')

    def load(self, path):
        """
        Reads the dictionary and then the guide from the file at ``path``;
        returns ``self``.
        """
        self.dct = wrapper.Dictionary()
        self.guide = wrapper.Guide()

        # Both parts are stored one after another in the same file.
        with open(path, 'rb') as stream:
            self.dct.read(stream)
            self.guide.read(stream)

        return self
169 | 
170 | 
# Default byte that separates a key from its payload inside a stored entry
# (see BytesDAWG); BytesDAWG accepts another one via ``payload_separator``.
PAYLOAD_SEPARATOR = b'\x01'
# NOTE(review): not referenced in the code visible here; presumably an upper
# bound for the payload size -- confirm against the rest of the module.
MAX_VALUE_SIZE = 32768
173 | 
class BytesDAWG(CompletionDAWG):
    """
    DAWG that is able to transparently store extra binary payload in keys;
    there may be several payloads for the same key.

    In other words, this class implements read-only DAWG-based
    {unicode -> list of bytes objects} mapping.

    Every stored entry is read as the UTF-8 key, followed by the payload
    separator, followed by the base64-encoded payload.
    """

    def __init__(self, payload_separator=PAYLOAD_SEPARATOR):
        # NOTE(review): the parent initializer is not called here; confirm
        # that this is intentional.
        self._payload_separator = payload_separator

    def __contains__(self, key):
        """Return True if ``key`` (unicode or UTF-8 bytes) has a payload."""
        if not isinstance(key, bytes):
            key = key.encode('utf8')
        return bool(self._follow_key(key))

    def __getitem__(self, key):
        """Return the list of payloads for ``key``; raise KeyError if absent."""
        res = self.get(key)
        if res is None:
            raise KeyError(key)
        return res

    def get(self, key, default=None):
        """
        Returns a list of payloads (as byte objects) for a given key
        or ``default`` if the key is not found.
        """
        if not isinstance(key, bytes):
            key = key.encode('utf8')

        return self.b_get_value(key) or default

    def _follow_key(self, b_key):
        """
        Follow ``b_key`` and then the payload separator.

        Returns the index reached after the separator, or False when either
        step cannot be followed.
        """
        index = self.dct.follow_bytes(b_key, self.dct.ROOT)
        if not index:
            return False

        index = self.dct.follow_bytes(self._payload_separator, index)
        if not index:
            return False

        return index

    def _value_for_index(self, index):
        """Return the decoded payloads reachable from ``index``."""
        res = []

        completer = wrapper.Completer(self.dct, self.guide)

        completer.start(index)
        while completer.next():
            # a2b_base64 doesn't support bytearray in python 2.6
            # so it is converted (and copied) to bytes
            b64_data = bytes(completer.key)
            res.append(a2b_base64(b64_data))

        return res

    def b_get_value(self, b_key):
        """Return the payload list for a bytes key ([] if it is missing)."""
        index = self._follow_key(b_key)
        if not index:
            return []
        return self._value_for_index(index)

    def _completer_for_prefix(self, prefix):
        """
        Return a Completer started at ``prefix`` (unicode or bytes),
        or None if ``prefix`` cannot be followed in the dictionary.
        """
        if not isinstance(prefix, bytes):
            prefix = prefix.encode('utf8')

        index = self.dct.ROOT
        if prefix:
            index = self.dct.follow_bytes(prefix, index)
            if not index:
                return None

        completer = wrapper.Completer(self.dct, self.guide)
        completer.start(index, prefix)
        return completer

    def _iter_keys(self, prefix):
        """Yield the unicode key of every stored entry starting with ``prefix``."""
        completer = self._completer_for_prefix(prefix)
        if completer is None:
            return

        while completer.next():
            payload_idx = completer.key.index(self._payload_separator)
            yield completer.key[:payload_idx].decode('utf8')

    def _iter_payload_items(self, prefix):
        """
        Yield (unicode key, payload bytes) for every stored entry starting
        with ``prefix``.

        Kept separate from ``iteritems`` so that subclasses overriding
        ``items``/``iteritems`` still receive raw payloads from this class.
        """
        completer = self._completer_for_prefix(prefix)
        if completer is None:
            return

        while completer.next():
            key, value = completer.key.split(self._payload_separator)
            # bytes() cast is a python 2.6 fix
            yield (key.decode('utf8'), a2b_base64(bytes(value)))

    def keys(self, prefix=""):
        """
        Return a list of keys that start with ``prefix``; a key appears
        once for each payload stored under it.
        """
        return list(self._iter_keys(prefix))

    def iterkeys(self, prefix=""):
        """Lazy version of ``keys``."""
        return self._iter_keys(prefix)

    def items(self, prefix=""):
        """
        Return a list of (key, payload) tuples for all entries whose key
        starts with ``prefix``.
        """
        return list(self._iter_payload_items(prefix))

    def iteritems(self, prefix=""):
        """Lazy version of ``items``."""
        return self._iter_payload_items(prefix)

    def _has_value(self, index):
        # Use the separator this instance was configured with rather than
        # the module default, so custom separators are honoured.
        return self.dct.follow_bytes(self._payload_separator, index)

    def _similar_items(self, current_prefix, key, index, replace_chars):
        """
        Collect (key, payloads) tuples for ``key`` and its replaced variants.

        ``current_prefix`` is the already matched (possibly substituted)
        beginning of the variant and ``index`` the dictionary position
        reached after it; matching resumes at ``key[len(current_prefix)]``.
        """
        res = []
        start_pos = len(current_prefix)
        end_pos = len(key)
        word_pos = start_pos

        while word_pos < end_pos:
            b_step = key[word_pos].encode('utf8')

            if b_step in replace_chars:
                b_replace_char, u_replace_char = replace_chars[b_step]

                # Branch: try the replacement char at this position.
                next_index = self.dct.follow_bytes(b_replace_char, index)
                if next_index:
                    prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
                    extra_items = self._similar_items(prefix, key, next_index, replace_chars)
                    res += extra_items

            index = self.dct.follow_bytes(b_step, index)
            if not index:
                break
            word_pos += 1

        else:
            # The whole key was followed without a break.
            index = self.dct.follow_bytes(self._payload_separator, index)
            if index:
                found_key = current_prefix + key[start_pos:]
                value = self._value_for_index(index)
                # The unreplaced variant goes first.
                res.insert(0, (found_key, value))

        return res

    def similar_items(self, key, replaces):
        """
        Returns a list of (key, value) tuples for all variants of ``key``
        in this DAWG according to ``replaces``.

        ``replaces`` is an object obtained from
        ``DAWG.compile_replaces(mapping)`` where mapping is a dict
        that maps single-char unicode strings to another single-char
        unicode strings.
        """
        return self._similar_items("", key, self.dct.ROOT, replaces)

    def _similar_item_values(self, start_pos, key, index, replace_chars):
        """
        Collect payload lists for ``key`` and its replaced variants.

        Matching resumes at ``key[start_pos]`` from dictionary position
        ``index``.
        """
        res = []
        end_pos = len(key)
        word_pos = start_pos

        while word_pos < end_pos:
            b_step = key[word_pos].encode('utf8')

            if b_step in replace_chars:
                b_replace_char, u_replace_char = replace_chars[b_step]

                # Branch: try the replacement char at this position.
                next_index = self.dct.follow_bytes(b_replace_char, index)
                if next_index:
                    extra_items = self._similar_item_values(word_pos+1, key, next_index, replace_chars)
                    res += extra_items

            index = self.dct.follow_bytes(b_step, index)
            if not index:
                break
            word_pos += 1

        else:
            # The whole key was followed without a break.
            index = self.dct.follow_bytes(self._payload_separator, index)
            if index:
                value = self._value_for_index(index)
                # The unreplaced variant goes first.
                res.insert(0, value)

        return res

    def similar_item_values(self, key, replaces):
        """
        Returns a list of values for all variants of the ``key``
        in this DAWG according to ``replaces``.

        ``replaces`` is an object obtained from
        ``DAWG.compile_replaces(mapping)`` where mapping is a dict
        that maps single-char unicode strings to another single-char
        unicode strings.
        """
        return self._similar_item_values(0, key, self.dct.ROOT, replaces)
413 | 
414 | 
class RecordDAWG(BytesDAWG):
    """
    BytesDAWG variant whose payloads are unpacked with a ``struct`` format,
    so every value is a tuple instead of a bytes object.
    """

    def __init__(self, fmt, payload_separator=PAYLOAD_SEPARATOR):
        super(RecordDAWG, self).__init__(payload_separator)
        # str() because the format may arrive as unicode
        self._struct = struct.Struct(str(fmt))
        self.fmt = fmt

    def _value_for_index(self, index):
        """Return the payloads at ``index`` unpacked into tuples."""
        raw_payloads = super(RecordDAWG, self)._value_for_index(index)
        return [self._struct.unpack(raw) for raw in raw_payloads]

    def items(self, prefix=""):
        """Return a list of (key, unpacked tuple) pairs for ``prefix``."""
        raw_items = super(RecordDAWG, self).items(prefix)
        return [
            (u_key, self._struct.unpack(raw))
            for (u_key, raw) in raw_items
        ]

    def iteritems(self, prefix=""):
        """Return a lazy iterable of (key, unpacked tuple) pairs."""
        raw_items = super(RecordDAWG, self).iteritems(prefix)
        return (
            (u_key, self._struct.unpack(raw))
            for (u_key, raw) in raw_items
        )
432 | 
433 | 
# Value treated as "key not found" by IntDAWG; it matches the -1 that
# wrapper.Dictionary.find() returns for a missing key.
LOOKUP_ERROR = -1
435 | 
class IntDAWG(DAWG):
    """
    Dict-like class based on DAWG.
    It can store integer values for unicode keys.
    """

    def __getitem__(self, key):
        """Return the value stored for ``key``; raise KeyError if absent."""
        value = self.get(key, LOOKUP_ERROR)
        if value != LOOKUP_ERROR:
            return value
        raise KeyError(key)

    def get(self, key, default=None):
        """
        Return value for the given key or ``default`` if the key is not found.
        """
        b_key = key if isinstance(key, bytes) else key.encode('utf8')
        found = self.b_get_value(b_key)
        return default if found == LOOKUP_ERROR else found

    def b_get_value(self, key):
        """Look up a bytes key directly in the underlying dictionary."""
        return self.dct.find(key)
460 | 
461 | 
class IntCompletionDAWG(CompletionDAWG, IntDAWG):
    """
    Dict-like class based on DAWG.
    It can store integer values for unicode keys and support key completion.
    """

    def items(self, prefix=""):
        """Return a list of (key, value) pairs for keys starting with ``prefix``."""
        b_prefix = prefix if isinstance(prefix, bytes) else prefix.encode('utf8')
        pairs = []
        start = self.dct.ROOT

        if b_prefix:
            start = self.dct.follow_bytes(b_prefix, start)
            if not start:
                # the prefix cannot be followed
                return pairs

        walker = wrapper.Completer(self.dct, self.guide)
        walker.start(start, b_prefix)

        while walker.next():
            u_key = walker.key.decode('utf8')
            pairs.append((u_key, walker.value()))

        return pairs

    def iteritems(self, prefix=""):
        """Lazily yield (key, value) pairs for keys starting with ``prefix``."""
        b_prefix = prefix if isinstance(prefix, bytes) else prefix.encode('utf8')
        start = self.dct.ROOT

        if b_prefix:
            start = self.dct.follow_bytes(b_prefix, start)
            if not start:
                # the prefix cannot be followed
                return

        walker = wrapper.Completer(self.dct, self.guide)
        walker.start(start, b_prefix)

        while walker.next():
            u_key = walker.key.decode('utf8')
            yield u_key, walker.value()
503 | 


--------------------------------------------------------------------------------
/dawg_python/units.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Unit of a dictionary
 4 | """
 5 | from __future__ import absolute_import
 6 | 
 7 | PRECISION_MASK = 0xFFFFFFFF
 8 | 
 9 | OFFSET_MAX = 1 << 21
10 | IS_LEAF_BIT = 1 << 31
11 | HAS_LEAF_BIT = 1 << 8
12 | EXTENSION_BIT = 1 << 9
13 | 
14 | 
15 | def has_leaf(base, _mask=HAS_LEAF_BIT):
16 |     """ Check if a unit has a leaf as a child or not. """
17 |     return bool(base & _mask)
18 | 
19 | 
20 | def value(base, _mask=~IS_LEAF_BIT & PRECISION_MASK):
21 |     """ Check if a unit corresponds to a leaf or not. """
22 |     return base & _mask
23 | 
24 | 
25 | def label(base, _mask=IS_LEAF_BIT | 0xFF):
26 |     """ Read a label with a leaf flag from a non-leaf unit. """
27 |     return base & _mask
28 | 
29 | 
30 | def offset(base):
31 |     """ Read an offset to child units from a non-leaf unit. """
32 |     return ((base >> 10) << ((base & EXTENSION_BIT) >> 6)) & PRECISION_MASK
33 | 


--------------------------------------------------------------------------------
/dawg_python/wrapper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import, unicode_literals
  3 | import struct
  4 | import array
  5 | 
  6 | from . import units
  7 | from .compat import int_from_byte
  8 | 
  9 | 
 10 | class Dictionary(object):
 11 |     """
 12 |     Dictionary class for retrieval and binary I/O.
 13 |     """
 14 |     def __init__(self):
 15 |         self._units = array.array(str("I"))
 16 | 
 17 |     ROOT = 0
 18 |     "Root index"
 19 | 
 20 |     def has_value(self, index):
 21 |         "Checks if a given index is related to the end of a key."
 22 |         return units.has_leaf(self._units[index])
 23 | 
 24 |     def value(self, index):
 25 |         "Gets a value from a given index."
 26 |         offset = units.offset(self._units[index])
 27 |         value_index = (index ^ offset) & units.PRECISION_MASK
 28 |         return units.value(self._units[value_index])
 29 | 
 30 |     def read(self, fp):
 31 |         "Reads a dictionary from an input stream."
 32 |         base_size = struct.unpack(str("=I"), fp.read(4))[0]
 33 |         self._units.fromfile(fp, base_size)
 34 | 
 35 |     def contains(self, key):
 36 |         "Exact matching."
 37 |         index = self.follow_bytes(key, self.ROOT)
 38 |         if index is None:
 39 |             return False
 40 |         return self.has_value(index)
 41 | 
 42 |     def find(self, key):
 43 |         "Exact matching (returns value)"
 44 |         index = self.follow_bytes(key, self.ROOT)
 45 |         if index is None:
 46 |             return -1
 47 |         if not self.has_value(index):
 48 |             return -1
 49 |         return self.value(index)
 50 | 
 51 |     def follow_char(self, label, index):
 52 |         "Follows a transition"
 53 |         offset = units.offset(self._units[index])
 54 |         next_index = (index ^ offset ^ label) & units.PRECISION_MASK
 55 | 
 56 |         if units.label(self._units[next_index]) != label:
 57 |             return None
 58 | 
 59 |         return next_index
 60 | 
 61 |     def follow_bytes(self, s, index):
 62 |         "Follows transitions."
 63 |         for ch in s:
 64 |             index = self.follow_char(int_from_byte(ch), index)
 65 |             if index is None:
 66 |                 return None
 67 | 
 68 |         return index
 69 | 
 70 |     @classmethod
 71 |     def load(cls, path):
 72 |         dawg = cls()
 73 |         with open(path, 'rb') as f:
 74 |             dawg.read(f)
 75 |         return dawg
 76 | 
 77 | 
 78 | class Guide(object):
 79 | 
 80 |     ROOT = 0
 81 | 
 82 |     def __init__(self):
 83 |         self._units = array.array(str("B"))
 84 | 
 85 |     def child(self, index):
 86 |         return self._units[index*2]
 87 | 
 88 |     def sibling(self, index):
 89 |         return self._units[index*2 + 1]
 90 | 
 91 |     def read(self, fp):
 92 |         base_size = struct.unpack(str("=I"), fp.read(4))[0]
 93 |         self._units.fromfile(fp, base_size*2)
 94 | 
 95 |     def size(self):
 96 |         return len(self._units)
 97 | 
 98 | 
 99 | class Completer(object):
100 | 
101 |     def __init__(self, dic=None, guide=None):
102 |         self._dic = dic
103 |         self._guide = guide
104 | 
105 |     def value(self):
106 |         return self._dic.value(self._last_index)
107 | 
108 |     def start(self, index, prefix=b""):
109 |         self.key = bytearray(prefix)
110 | 
111 |         if self._guide.size():
112 |             self._index_stack = [index]
113 |             self._last_index = self._dic.ROOT
114 |         else:
115 |             self._index_stack = []
116 | 
117 |     def next(self):
118 |         "Gets the next key"
119 | 
120 |         if not self._index_stack:
121 |             return False
122 | 
123 |         index = self._index_stack[-1]
124 | 
125 |         if self._last_index != self._dic.ROOT:
126 | 
127 |             child_label = self._guide.child(index)  # UCharType
128 | 
129 |             if child_label:
130 |                 # Follows a transition to the first child.
131 |                 index = self._follow(child_label, index)
132 |                 if index is None:
133 |                     return False
134 |             else:
135 |                 while True:
136 |                     sibling_label = self._guide.sibling(index)
137 |                     # Moves to the previous node.
138 |                     if len(self.key) > 0:
139 |                         self.key.pop()
140 |                         #self.key[-1] = 0
141 | 
142 |                     self._index_stack.pop()
143 |                     if not self._index_stack:
144 |                         return False
145 | 
146 |                     index = self._index_stack[-1]
147 |                     if sibling_label:
148 |                         # Follows a transition to the next sibling.
149 |                         index = self._follow(sibling_label, index)
150 |                         if index is None:
151 |                             return False
152 |                         break
153 | 
154 |         return self._find_terminal(index)
155 | 
156 |     def _follow(self, label, index):
157 |         next_index = self._dic.follow_char(label, index)
158 |         if next_index is None:
159 |             return None
160 | 
161 |         self.key.append(label)
162 |         self._index_stack.append(next_index)
163 |         return next_index
164 | 
165 |     def _find_terminal(self, index):
166 |         while not self._dic.has_value(index):
167 |             label = self._guide.child(index)
168 | 
169 |             index = self._dic.follow_char(label, index)
170 |             if index is None:
171 |                 return False
172 | 
173 |             self.key.append(label)
174 |             self._index_stack.append(index)
175 | 
176 |         self._last_index = index
177 |         return True
178 | 


--------------------------------------------------------------------------------
/dev_data/large/bytes_dawg.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/large/bytes_dawg.dawg


--------------------------------------------------------------------------------
/dev_data/large/dawg.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/large/dawg.dawg


--------------------------------------------------------------------------------
/dev_data/large/int_dawg.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/large/int_dawg.dawg


--------------------------------------------------------------------------------
/dev_data/large/record_dawg.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/large/record_dawg.dawg


--------------------------------------------------------------------------------
/dev_data/small/bytes.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/bytes.dawg


--------------------------------------------------------------------------------
/dev_data/small/completion-empty.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/completion-empty.dawg


--------------------------------------------------------------------------------
/dev_data/small/completion.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/completion.dawg


--------------------------------------------------------------------------------
/dev_data/small/int_completion_dawg.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/int_completion_dawg.dawg


--------------------------------------------------------------------------------
/dev_data/small/int_dawg.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/int_dawg.dawg


--------------------------------------------------------------------------------
/dev_data/small/prediction-record.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/prediction-record.dawg


--------------------------------------------------------------------------------
/dev_data/small/prediction.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/prediction.dawg


--------------------------------------------------------------------------------
/dev_data/small/record.dawg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/record.dawg


--------------------------------------------------------------------------------
/dev_data/words100k.txt.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/words100k.txt.zip


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | from setuptools import setup
 3 | 
 4 | setup(
 5 |     name="DAWG-Python",
 6 |     version="0.7.2",
 7 |     description="Pure-python reader for DAWGs (DAFSAs) created by dawgdic C++ library or DAWG Python extension.",
 8 |     long_description = open('README.rst').read() + "\n\n"+ open('CHANGES.rst').read(),
 9 |     author='Mikhail Korobov',
10 |     author_email='kmike84@gmail.com',
11 |     url='https://github.com/kmike/DAWG-Python/',
12 |     packages = ['dawg_python'],
13 | 
14 |     classifiers=[
15 |         'Development Status :: 4 - Beta',
16 |         'Intended Audience :: Developers',
17 |         'Intended Audience :: Science/Research',
18 |         'License :: OSI Approved :: MIT License',
19 |         'Programming Language :: Cython',
20 |         'Programming Language :: Python',
21 |         'Programming Language :: Python :: 2',
22 |         'Programming Language :: Python :: 2.6',
23 |         'Programming Language :: Python :: 2.7',
24 |         'Programming Language :: Python :: 3',
25 |         'Programming Language :: Python :: 3.2',
26 |         'Programming Language :: Python :: 3.3',
27 |         'Programming Language :: Python :: 3.4',
28 |         'Programming Language :: Python :: Implementation :: CPython',
29 |         'Programming Language :: Python :: Implementation :: PyPy',
30 |         'Topic :: Software Development :: Libraries :: Python Modules',
31 |         'Topic :: Scientific/Engineering :: Information Analysis',
32 |         'Topic :: Text Processing :: Linguistic',
33 |     ],
34 | )
35 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import


--------------------------------------------------------------------------------
/tests/test_dawg.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import, unicode_literals
  3 | import pickle
  4 | import tempfile
  5 | 
  6 | import pytest
  7 | import dawg_python
  8 | 
  9 | from .utils import data_path
 10 | 
 11 | def test_c_dawg_contains():
 12 |     dawg = pytest.importorskip("dawg")  # import dawg
 13 |     bin_dawg = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3})
 14 | 
 15 |     d = dawg_python.Dictionary()
 16 | 
 17 |     fd, path = tempfile.mkstemp()
 18 |     bin_dawg.save(path)
 19 | 
 20 |     with open(path, 'rb') as f:
 21 |         d.read(f)
 22 | 
 23 |     assert d.contains(b'foo')
 24 |     assert not d.contains(b'x')
 25 |     assert d.contains(b'foobar')
 26 |     assert d.contains(b'bar')
 27 | 
 28 | 
 29 | class TestCompletionDAWG(object):
 30 |     keys = ['f', 'bar', 'foo', 'foobar']
 31 | 
 32 |     def dawg(self):
 33 |         return dawg_python.CompletionDAWG().load(data_path('small', 'completion.dawg'))
 34 | 
 35 |     def test_contains(self):
 36 |         d = self.dawg()
 37 |         for key in self.keys:
 38 |             assert key in d
 39 | 
 40 |     def test_contains_bytes(self):
 41 |         d = self.dawg()
 42 |         for key in self.keys:
 43 |             assert key.encode('utf8') in d
 44 | 
 45 |     def test_keys(self):
 46 |         d = self.dawg()
 47 |         assert d.keys() == sorted(self.keys)
 48 | 
 49 |     def test_iterkeys(self):
 50 |         d = self.dawg()
 51 |         assert list(d.iterkeys()) == d.keys()
 52 | 
 53 |     def test_completion(self):
 54 |         d = self.dawg()
 55 | 
 56 |         assert d.keys('z') == []
 57 |         assert d.keys('b') == ['bar']
 58 |         assert d.keys('foo') == ['foo', 'foobar']
 59 | 
 60 |     def test_no_segfaults_on_invalid_file(self):
 61 |         d = self.dawg()
 62 |         fd, path = tempfile.mkstemp()
 63 |         with open(path, 'w') as f:
 64 |             f.write('foo')
 65 | 
 66 |         with pytest.raises(Exception) as e:
 67 |             d.load(path)
 68 | 
 69 |     def test_empty_dawg(self):
 70 |         d = dawg_python.CompletionDAWG().load(data_path('small', 'completion-empty.dawg'))
 71 |         assert d.keys() == []
 72 | 
 73 |     def test_prefixes(self):
 74 |         d = self.dawg()
 75 |         assert d.prefixes("foobarz") == ["f", "foo", "foobar"]
 76 |         assert d.prefixes("x") == []
 77 |         assert d.prefixes("bar") == ["bar"]
 78 | 
 79 | 
 80 | 
 81 | class TestIntDAWG(object):
 82 |     payload = {'foo': 1, 'bar': 5, 'foobar': 3}
 83 | 
 84 |     def dawg(self):
 85 |         return dawg_python.IntDAWG().load(data_path('small', 'int_dawg.dawg'))
 86 | 
 87 |     def test_getitem(self):
 88 |         d = self.dawg()
 89 |         for key in self.payload:
 90 |             assert d[key] == self.payload[key]
 91 | 
 92 |         with pytest.raises(KeyError):
 93 |             d['fo']
 94 | 
 95 |     def test_pickling(self):
 96 |         d = self.dawg()
 97 | 
 98 |         data = pickle.dumps(d)
 99 |         d2 = pickle.loads(data)
100 | 
101 |         for key, value in self.payload.items():
102 |             assert key in d2
103 |             assert d[key] == value
104 | 
105 | 
class TestIntCompletionDawg(TestIntDAWG):
    """Runs the IntDAWG tests on IntCompletionDAWG and adds completion checks."""

    def dawg(self):
        """Load a fresh IntCompletionDAWG from the fixture file."""
        fixture = data_path('small', 'int_completion_dawg.dawg')
        return dawg_python.IntCompletionDAWG().load(fixture)

    def test_completion_keys(self):
        expected = sorted(self.payload.keys())
        assert self.dawg().keys() == expected

    def test_completion_keys_with_prefix(self):
        cases = [
            ('fo', ['foo', 'foobar']),
            ('foo', ['foo', 'foobar']),
            ('foob', ['foobar']),
            ('z', []),
            ('b', ['bar']),
        ]
        for prefix, expected in cases:
            assert self.dawg().keys(prefix) == expected

    def test_completion_items(self):
        expected = sorted(self.payload.items(), key=lambda pair: pair[0])
        assert self.dawg().items() == expected
122 | 


--------------------------------------------------------------------------------
/tests/test_fuzzy.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import, unicode_literals
 3 | 
 4 | import dawg_python
 5 | 
 6 | from .utils import words100k, data_path
 7 | 
 8 | words = words100k()
 9 | dawg = dawg_python.Dictionary.load(data_path('large', 'int_dawg.dawg'))
10 | 
class TestDictionary(object):
    """Checks the large int dictionary against every word from ``words100k()``."""

    def test_contains(self):
        for word in words:
            b_word = word.encode('utf8')
            assert dawg.contains(b_word)

    def test_find(self):
        # The test expects each word's stored value to equal its length.
        for word in words:
            b_word = word.encode('utf8')
            assert dawg.find(b_word) == len(word)
20 | 


--------------------------------------------------------------------------------
/tests/test_payload_dawg.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import, unicode_literals
  3 | 
  4 | import pytest
  5 | import dawg_python
  6 | from .utils import data_path
  7 | 
  8 | class TestBytesDAWG(object):
  9 | 
 10 |     DATA = (
 11 |         ('foo', b'data1'),
 12 |         ('bar', b'data2'),
 13 |         ('foo', b'data3'),
 14 |         ('foobar', b'data4')
 15 |     )
 16 | 
 17 |     def dawg(self):
 18 |         return dawg_python.BytesDAWG().load(data_path("small", "bytes.dawg"))
 19 | 
 20 |     def test_contains(self):
 21 |         d = self.dawg()
 22 |         for key, val in self.DATA:
 23 |             assert key in d
 24 | 
 25 |         assert 'food' not in d
 26 |         assert 'x' not in d
 27 |         assert 'fo' not in d
 28 | 
 29 | 
 30 |     def test_getitem(self):
 31 |         d = self.dawg()
 32 | 
 33 |         assert d['foo'] == [b'data1', b'data3']
 34 |         assert d['bar'] == [b'data2']
 35 |         assert d['foobar'] == [b'data4']
 36 | 
 37 | 
 38 |     def test_getitem_missing(self):
 39 |         d = self.dawg()
 40 | 
 41 |         with pytest.raises(KeyError):
 42 |             d['x']
 43 | 
 44 |         with pytest.raises(KeyError):
 45 |             d['food']
 46 | 
 47 |         with pytest.raises(KeyError):
 48 |             d['foobarz']
 49 | 
 50 |         with pytest.raises(KeyError):
 51 |             d['f']
 52 | 
 53 |     def test_keys(self):
 54 |         d = self.dawg()
 55 |         assert d.keys() == ['bar', 'foo', 'foo', 'foobar']
 56 | 
 57 |     def test_iterkeys(self):
 58 |         d = self.dawg()
 59 |         assert list(d.iterkeys()) == d.keys()
 60 | 
 61 |     def test_key_completion(self):
 62 |         d = self.dawg()
 63 |         assert d.keys('fo') == ['foo', 'foo', 'foobar']
 64 | 
 65 |     def test_items(self):
 66 |         d = self.dawg()
 67 |         assert d.items() == sorted(self.DATA)
 68 | 
 69 |     def test_iteritems(self):
 70 |         d = self.dawg()
 71 |         assert list(d.iteritems('xxx')) == []
 72 |         assert list(d.iteritems('fo')) == d.items('fo')
 73 |         assert list(d.iteritems()) == d.items()
 74 | 
 75 |     def test_items_completion(self):
 76 |         d = self.dawg()
 77 |         assert d.items('foob') == [('foobar', b'data4')]
 78 | 
 79 |     def test_prefixes(self):
 80 |         d = self.dawg()
 81 |         assert d.prefixes("foobarz") == ["foo", "foobar"]
 82 |         assert d.prefixes("x") == []
 83 |         assert d.prefixes("bar") == ["bar"]
 84 | 
 85 | 
 86 | class TestRecordDAWG(object):
 87 | 
 88 |     STRUCTURED_DATA = (
 89 |         ('foo',     (3, 2, 256)),
 90 |         ('bar',     (3, 1, 0)),
 91 |         ('foo',     (3, 2, 1)),
 92 |         ('foobar',  (6, 3, 0))
 93 |     )
 94 | 
 95 |     def dawg(self):
 96 |         path = data_path("small", "record.dawg")
 97 |         return dawg_python.RecordDAWG(">3H").load(path)
 98 | 
 99 |     def test_getitem(self):
100 |         d = self.dawg()
101 |         assert d['foo'] == [(3, 2, 1), (3, 2, 256)]
102 |         assert d['bar'] == [(3, 1, 0)]
103 |         assert d['foobar'] == [(6, 3, 0)]
104 | 
105 |     def test_getitem_missing(self):
106 |         d = self.dawg()
107 | 
108 |         with pytest.raises(KeyError):
109 |             d['x']
110 | 
111 |         with pytest.raises(KeyError):
112 |             d['food']
113 | 
114 |         with pytest.raises(KeyError):
115 |             d['foobarz']
116 | 
117 |         with pytest.raises(KeyError):
118 |             d['f']
119 | 
120 |     def test_record_items(self):
121 |         d = self.dawg()
122 |         assert d.items() == sorted(self.STRUCTURED_DATA)
123 | 
124 |     def test_record_keys(self):
125 |         d = self.dawg()
126 |         assert d.keys() == ['bar', 'foo', 'foo', 'foobar',]
127 | 
128 |     def test_record_keys_prefix(self):
129 |         d = self.dawg()
130 |         assert d.keys('fo') == ['foo', 'foo', 'foobar']
131 |         assert d.keys('bar') == ['bar']
132 |         assert d.keys('barz') == []
133 | 
134 |     def test_prefixes(self):
135 |         d = self.dawg()
136 |         assert d.prefixes("foobarz") == ["foo", "foobar"]
137 |         assert d.prefixes("x") == []
138 |         assert d.prefixes("bar") == ["bar"]
139 | 


--------------------------------------------------------------------------------
/tests/test_prediction.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import, unicode_literals
 3 | import pytest
 4 | import dawg_python
 5 | 
 6 | from .utils import data_path
 7 | 
 8 | class TestPrediction(object):
 9 | 
10 |     REPLACES = dawg_python.DAWG.compile_replaces({'Е': 'Ё'})
11 | 
12 |     DATA = ['ЁЖИК', 'ЁЖИКЕ', 'ЁЖ', 'ДЕРЕВНЯ', 'ДЕРЁВНЯ', 'ЕМ', 'ОЗЕРА', 'ОЗЁРА', 'ОЗЕРО']
13 |     SUITE = [
14 |         ('УЖ', []),
15 |         ('ЕМ', ['ЕМ']),
16 |         ('ЁМ', []),
17 |         ('ЁЖ', ['ЁЖ']),
18 |         ('ЕЖ', ['ЁЖ']),
19 |         ('ЁЖИК', ['ЁЖИК']),
20 |         ('ЕЖИКЕ', ['ЁЖИКЕ']),
21 |         ('ДЕРЕВНЯ', ['ДЕРЕВНЯ', 'ДЕРЁВНЯ']),
22 |         ('ДЕРЁВНЯ', ['ДЕРЁВНЯ']),
23 |         ('ОЗЕРА', ['ОЗЕРА', 'ОЗЁРА']),
24 |         ('ОЗЕРО', ['ОЗЕРО']),
25 |     ]
26 | 
27 |     SUITE_ITEMS = [
28 |         (
29 |             it[0], # key
30 |             [
31 |                 (w, [(len(w),)]) # item, value pair
32 |                 for w in it[1]
33 |             ]
34 |         )
35 |         for it in SUITE
36 |     ]
37 | 
38 |     SUITE_VALUES = [
39 |         (
40 |             it[0], # key
41 |             [[(len(w),)] for w in it[1]]
42 |         )
43 |         for it in SUITE
44 |     ]
45 | 
46 |     def record_dawg(self):
47 |         path = data_path("small", "prediction-record.dawg")
48 |         return dawg_python.RecordDAWG(str("=H")).load(path)
49 | 
50 | 
51 | 
52 |     @pytest.mark.parametrize(("word", "prediction"), SUITE)
53 |     def test_dawg_prediction(self, word, prediction):
54 |         d = dawg_python.DAWG().load(data_path("small", "prediction.dawg"))
55 |         assert d.similar_keys(word, self.REPLACES) == prediction
56 | 
57 |     @pytest.mark.parametrize(("word", "prediction"), SUITE)
58 |     def test_record_dawg_prediction(self, word, prediction):
59 |         d = self.record_dawg()
60 |         assert d.similar_keys(word, self.REPLACES) == prediction
61 | 
62 |     @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS)
63 |     def test_record_dawg_items(self, word, prediction):
64 |         d = self.record_dawg()
65 |         assert d.similar_items(word, self.REPLACES) == prediction
66 | 
67 |     @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES)
68 |     def test_record_dawg_items_values(self, word, prediction):
69 |         d = self.record_dawg()
70 |         assert d.similar_item_values(word, self.REPLACES) == prediction
71 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import
 3 | import os
 4 | import zipfile
 5 | 
 6 | DEV_DATA_PATH = os.path.join(
 7 |     os.path.dirname(__file__),
 8 |     '..',
 9 |     'dev_data',
10 | )
11 | 
12 | def data_path(*args):
13 |     """
14 |     Returns a path to dev data
15 |     """
16 |     return os.path.join(DEV_DATA_PATH, *args)
17 | 
18 | def words100k():
19 |     zip_name = data_path('words100k.txt.zip')
20 |     zf = zipfile.ZipFile(zip_name)
21 |     txt = zf.open(zf.namelist()[0]).read().decode('utf8')
22 |     return txt.splitlines()
23 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py26,py27,py32,py33,py34,pypy
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     pytest
 7 |     dawg
 8 | commands=
 9 |     py.test []
10 | 
11 | [testenv:pypy]
12 | deps =
13 |     pytest
14 | 


--------------------------------------------------------------------------------