├── .coveragerc ├── .gitignore ├── .hgignore ├── .hgtags ├── .travis.yml ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── _prepare_dev_data.py ├── bench.ini ├── bench ├── __init__.py ├── speed.py └── utils.py ├── dawg_python ├── __init__.py ├── compat.py ├── dawgs.py ├── units.py └── wrapper.py ├── dev_data ├── large │ ├── bytes_dawg.dawg │ ├── dawg.dawg │ ├── int_dawg.dawg │ └── record_dawg.dawg ├── small │ ├── bytes.dawg │ ├── completion-empty.dawg │ ├── completion.dawg │ ├── int_completion_dawg.dawg │ ├── int_dawg.dawg │ ├── prediction-record.dawg │ ├── prediction.dawg │ └── record.dawg └── words100k.txt.zip ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_dawg.py ├── test_fuzzy.py ├── test_payload_dawg.py ├── test_prediction.py └── utils.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | MANIFEST 3 | dist/ 4 | DAWG_Python.egg-info/ 5 | 6 | ^stuff/ 7 | *.pyc 8 | .tox/ 9 | *.orig 10 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | ^build 2 | ^MANIFEST$ 3 | ^dist 4 | egg-info/ 5 | \.so$ 6 | \.o$ 7 | \.lo$ 8 | 9 | \.svn 10 | \.cvsignore 11 | 12 | ^src/.*\.html$ 13 | 14 | ^stuff/ 15 | \.rej$ 16 | \.pyc$ 17 | ^.tox 18 | \.orig$ 19 | \.prof$ 20 | \.coverage$ 21 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 94b9f5fce67517f39370ffecb122b9110e26e3bc 0.1 2 | 51dca297194c692b1a31e4017138819bf13838f7 0.2 3 | 8098a78df797f5e44a41a19c1eef356b37f9a685 0.3 4 | b5416eb9b17bd2e08e4c388ac72771e57480fd61 0.3.1 5 | 
c78b8b58ad02b221c383af80bca42a229db780a9 0.5 6 | 91cc324f4faf742c1c804a1cc5ccf3a5bf8a26ca 0.5.1 7 | 5e92658f5e6d68677f0218a71d3281d76614d4e0 0.6 8 | fac5235bf297ae2fcceb36a7fa1c399b5e3543a3 0.7 9 | 7458eede668c62fe00e6c12d8b064f14b8ddd4a1 0.7.1 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.4" 4 | - "3.3" 5 | - "3.2" 6 | - "2.7" 7 | - "2.6" 8 | - "pypy" 9 | 10 | install: 11 | - if [[ $TRAVIS_PYTHON_VERSION != 'pypy' ]]; then pip install dawg; fi 12 | - pip install coverage pytest-cov coveralls 13 | - python setup.py install 14 | 15 | # command to run tests, e.g. python setup.py test 16 | script: 17 | - py.test --cov=dawg_python 18 | 19 | after_success: 20 | - coveralls 21 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | 2 | Changes 3 | ======= 4 | 5 | 0.7.2 (2015-04-18) 6 | ------------------ 7 | 8 | - minor speedup; 9 | - bitbucket mirror is no longer maintained. 10 | 11 | 0.7.1 (2014-06-05) 12 | ------------------ 13 | 14 | - Switch to setuptools; 15 | - upload wheel to pypi; 16 | - check Python 3.4 compatibility. 17 | 18 | 0.7 (2013-10-13) 19 | ---------------- 20 | 21 | IntDAWG and IntCompletionDAWG are implemented. 22 | 23 | 0.6 (2013-03-23) 24 | ---------------- 25 | 26 | Use less shared state internally. This should fix thread-safety bugs and 27 | make iterkeys/iteritems reentrant. 28 | 29 | 0.5.1 (2013-03-01) 30 | ------------------ 31 | 32 | Internal tweaks: memory usage is reduced; something is a bit faster, 33 | something is a bit slower. 34 | 35 | 0.5 (2012-10-08) 36 | ---------------- 37 | 38 | Storage scheme is updated to match DAWG==0.5. This enables 39 | the alphabetical ordering of ``BytesDAWG`` and ``RecordDAWG`` items. 
40 | 41 | In order to read ``BytesDAWG`` or ``RecordDAWG`` created with 42 | versions of DAWG < 0.5 use ``payload_separator`` constructor argument:: 43 | 44 | >>> BytesDAWG(payload_separator=b'\xff').load('old.dawg') 45 | 46 | 47 | 0.3.1 (2012-10-01) 48 | ------------------ 49 | 50 | Bug with empty DAWGs is fixed. 51 | 52 | 0.3 (2012-09-26) 53 | ---------------- 54 | 55 | - ``iterkeys`` and ``iteritems`` methods. 56 | 57 | 0.2 (2012-09-24) 58 | ---------------- 59 | 60 | ``prefixes`` support. 61 | 62 | 0.1 (2012-09-20) 63 | ---------------- 64 | 65 | Initial release. 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Mikhail Korobov, 2012 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR 15 | A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 16 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 17 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 18 | OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGES.rst 3 | include LICENSE 4 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DAWG-Python 2 | =========== 3 | 4 | .. image:: https://travis-ci.org/kmike/DAWG-Python.png?branch=master 5 | :target: https://travis-ci.org/kmike/DAWG-Python 6 | .. image:: https://coveralls.io/repos/kmike/DAWG-Python/badge.png?branch=master 7 | :target: https://coveralls.io/r/kmike/DAWG-Python 8 | 9 | 10 | This pure-python package provides read-only access for files 11 | created by `dawgdic`_ C++ library and `DAWG`_ python package. 12 | 13 | .. _dawgdic: https://code.google.com/p/dawgdic/ 14 | .. _DAWG: https://github.com/kmike/DAWG 15 | 16 | This package is not capable of creating DAWGs. It works with DAWGs built by 17 | `dawgdic`_ C++ library or `DAWG`_ Python extension module. The main purpose 18 | of DAWG-Python is to provide an access to DAWGs without requiring compiled 19 | extensions. It is also quite fast under PyPy (see benchmarks). 20 | 21 | Installation 22 | ============ 23 | 24 | pip install DAWG-Python 25 | 26 | Usage 27 | ===== 28 | 29 | The aim of DAWG-Python is to be API- and binary-compatible 30 | with `DAWG`_ when it is possible. 31 | 32 | First, you have to create a dawg using DAWG_ module:: 33 | 34 | import dawg 35 | d = dawg.DAWG(data) 36 | d.save('words.dawg') 37 | 38 | And then this dawg can be loaded without requiring C extensions:: 39 | 40 | import dawg_python 41 | d = dawg_python.DAWG().load('words.dawg') 42 | 43 | Please consult `DAWG`_ docs for detailed usage. Some features 44 | (like constructor parameters or ``save`` method) are intentionally 45 | unsupported. 
46 | 47 | Benchmarks 48 | ========== 49 | 50 | Benchmark results (100k unicode words, integer values (lengths of the words), 51 | PyPy 1.9, macbook air i5 1.8 GHz):: 52 | 53 | dict __getitem__ (hits): 11.090M ops/sec 54 | DAWG __getitem__ (hits): not supported 55 | BytesDAWG __getitem__ (hits): 0.493M ops/sec 56 | RecordDAWG __getitem__ (hits): 0.376M ops/sec 57 | 58 | dict get() (hits): 10.127M ops/sec 59 | DAWG get() (hits): not supported 60 | BytesDAWG get() (hits): 0.481M ops/sec 61 | RecordDAWG get() (hits): 0.402M ops/sec 62 | dict get() (misses): 14.885M ops/sec 63 | DAWG get() (misses): not supported 64 | BytesDAWG get() (misses): 1.259M ops/sec 65 | RecordDAWG get() (misses): 1.337M ops/sec 66 | 67 | dict __contains__ (hits): 11.100M ops/sec 68 | DAWG __contains__ (hits): 1.317M ops/sec 69 | BytesDAWG __contains__ (hits): 1.107M ops/sec 70 | RecordDAWG __contains__ (hits): 1.095M ops/sec 71 | 72 | dict __contains__ (misses): 10.567M ops/sec 73 | DAWG __contains__ (misses): 1.902M ops/sec 74 | BytesDAWG __contains__ (misses): 1.873M ops/sec 75 | RecordDAWG __contains__ (misses): 1.862M ops/sec 76 | 77 | dict items(): 44.401 ops/sec 78 | DAWG items(): not supported 79 | BytesDAWG items(): 3.226 ops/sec 80 | RecordDAWG items(): 2.987 ops/sec 81 | dict keys(): 426.250 ops/sec 82 | DAWG keys(): not supported 83 | BytesDAWG keys(): 6.050 ops/sec 84 | RecordDAWG keys(): 6.363 ops/sec 85 | 86 | DAWG.prefixes (hits): 0.756M ops/sec 87 | DAWG.prefixes (mixed): 1.965M ops/sec 88 | DAWG.prefixes (misses): 1.773M ops/sec 89 | 90 | RecordDAWG.keys(prefix="xxx"), avg_len(res)==415: 1.429K ops/sec 91 | RecordDAWG.keys(prefix="xxxxx"), avg_len(res)==17: 36.994K ops/sec 92 | RecordDAWG.keys(prefix="xxxxxxxx"), avg_len(res)==3: 121.897K ops/sec 93 | RecordDAWG.keys(prefix="xxxxx..xx"), avg_len(res)==1.4: 265.015K ops/sec 94 | RecordDAWG.keys(prefix="xxx"), NON_EXISTING: 2450.898K ops/sec 95 | 96 | Under CPython expect it to be about 50x slower. 
97 | Memory consumption of DAWG-Python should be the same as of `DAWG`_. 98 | 99 | .. _marisa-trie: https://github.com/kmike/marisa-trie 100 | 101 | Current limitations 102 | =================== 103 | 104 | * This package is not capable of creating DAWGs; 105 | * all the limitations of `DAWG`_ apply. 106 | 107 | Contributions are welcome! 108 | 109 | 110 | Contributing 111 | ============ 112 | 113 | Development happens at github: https://github.com/kmike/DAWG-Python 114 | Issue tracker: https://github.com/kmike/DAWG-Python/issues 115 | 116 | Feel free to submit ideas, bugs or pull requests. 117 | 118 | Running tests and benchmarks 119 | ---------------------------- 120 | 121 | Make sure `tox`_ is installed and run 122 | 123 | :: 124 | 125 | $ tox 126 | 127 | from the source checkout. Tests should pass under python 2.6, 2.7, 3.2, 3.3, 128 | 3.4 and PyPy >= 1.9. 129 | 130 | In order to run benchmarks, type 131 | 132 | :: 133 | 134 | $ tox -c bench.ini -e pypy 135 | 136 | This runs benchmarks under PyPy (they are about 50x slower under CPython). 137 | 138 | .. _tox: http://tox.testrun.org 139 | 140 | Authors & Contributors 141 | ---------------------- 142 | 143 | * Mikhail Korobov 144 | 145 | The algorithms are from `dawgdic`_ C++ library by Susumu Yata & contributors. 146 | 147 | License 148 | ======= 149 | 150 | This package is licensed under MIT License. 151 | -------------------------------------------------------------------------------- /_prepare_dev_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Script for building test DAWGs. 
def create_dawg():
    """Build a plain ``dawg.DAWG`` from the word list returned by ``words100k()``."""
    return dawg.DAWG(words100k())
def prefixes1k(words, prefix_len):
    """
    Return up to 1000 prefixes of length ``prefix_len`` sampled from ``words``.

    Words shorter than ``prefix_len`` are skipped. The remaining words are
    sampled at a regular stride so that the prefixes are spread over the
    whole list, and the result is capped at 1000 items.
    """
    candidates = [w for w in words if len(w) >= prefix_len]
    # With fewer than 1000 candidates the integer stride would be 0, and a
    # zero slice step raises ValueError -- never step by less than 1.
    every_nth = max(1, len(candidates) // 1000)
    return [w[:prefix_len] for w in candidates[::every_nth]][:1000]
def _follow_key(self, b_key):
    """
    Follow ``b_key`` and then the payload separator, starting at the root.

    Returns the index reached after the payload separator, or ``False``
    when the key (or its payload branch) is not present.
    """
    index = self.dct.follow_bytes(b_key, self.dct.ROOT)
    # follow_bytes reports a failed transition with None. An empty key
    # consumes no transitions and legitimately stays at ROOT (index 0),
    # so a plain truthiness test would wrongly reject it.
    if index is None:
        return False

    index = self.dct.follow_bytes(self._payload_separator, index)
    if not index:
        return False

    return index
def items(self, prefix=""):
    """
    Return a list of ``(key, payload)`` tuples for keys starting with ``prefix``.

    Keys are decoded from UTF-8; payloads are the base64-decoded bytes
    stored after the payload separator.
    """
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')

    start = self.dct.ROOT
    if prefix:
        start = self.dct.follow_bytes(prefix, start)
        if not start:
            # Nothing in the DAWG begins with this prefix.
            return []

    walker = wrapper.Completer(self.dct, self.guide)
    walker.start(start, prefix)

    pairs = []
    while walker.next():
        raw_key, b64_payload = walker.key.split(self._payload_separator)
        # bytes() cast is a python 2.6 fix
        payload = a2b_base64(bytes(b64_payload))
        pairs.append((raw_key.decode('utf8'), payload))
    return pairs
def _has_value(self, index):
    """
    Follow this instance's payload separator from ``index``.

    Returns the index reached (truthy) when a payload branch exists below
    ``index``, otherwise the falsy result of ``follow_bytes``.
    """
    # Use the separator configured on the instance, not the module default,
    # so DAWGs loaded with a custom ``payload_separator`` are handled.
    return self.dct.follow_bytes(self._payload_separator, index)
def _similar_item_values(self, start_pos, key, index, replace_chars):
    """
    Collect values for ``key[start_pos:]`` and its variants below ``index``.

    At every position whose encoded character has an entry in
    ``replace_chars`` the replacement branch is explored recursively.
    The value for the unreplaced spelling, when present, is placed first.
    """
    follow = self.dct.follow_bytes
    found = []

    for pos in range(start_pos, len(key)):
        step = key[pos].encode('utf8')

        if step in replace_chars:
            # Entries are (bytes replacement, unicode replacement) pairs;
            # only the bytes form is needed to walk the DAWG here.
            replacement, _unused = replace_chars[step]
            branch = follow(replacement, index)
            if branch:
                found += self._similar_item_values(pos + 1, key, branch, replace_chars)

        index = follow(step, index)
        if not index:
            break
    else:
        # Every remaining character was followed: check for a payload.
        index = follow(self._payload_separator, index)
        if index:
            found.insert(0, self._value_for_index(index))

    return found
class RecordDAWG(BytesDAWG):
    """
    ``BytesDAWG`` variant whose payloads are unpacked with a ``struct`` format.

    Each stored payload is decoded into a tuple via ``struct.Struct(fmt)``.
    """

    def __init__(self, fmt, payload_separator=PAYLOAD_SEPARATOR):
        super(RecordDAWG, self).__init__(payload_separator)
        # str() keeps the format a native string for struct on Python 2.
        self._struct = struct.Struct(str(fmt))
        self.fmt = fmt

    def _value_for_index(self, index):
        """Return the payloads found at ``index`` as a list of unpacked tuples."""
        raw_payloads = super(RecordDAWG, self)._value_for_index(index)
        unpack = self._struct.unpack
        return [unpack(raw) for raw in raw_payloads]

    def items(self, prefix=""):
        """Return a list of ``(key, unpacked tuple)`` pairs for ``prefix``."""
        unpack = self._struct.unpack
        raw_items = super(RecordDAWG, self).items(prefix)
        return [(name, unpack(raw)) for name, raw in raw_items]

    def iteritems(self, prefix=""):
        """Return a generator of ``(key, unpacked tuple)`` pairs for ``prefix``."""
        raw_items = super(RecordDAWG, self).iteritems(prefix)
        return ((name, self._struct.unpack(raw)) for name, raw in raw_items)
def value(base, _mask=~IS_LEAF_BIT & PRECISION_MASK):
    """ Read a value from a unit: its 32-bit contents with the leaf flag bit cleared. """
    return base & _mask
""" 32 | return ((base >> 10) << ((base & EXTENSION_BIT) >> 6)) & PRECISION_MASK 33 | -------------------------------------------------------------------------------- /dawg_python/wrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | import struct 4 | import array 5 | 6 | from . import units 7 | from .compat import int_from_byte 8 | 9 | 10 | class Dictionary(object): 11 | """ 12 | Dictionary class for retrieval and binary I/O. 13 | """ 14 | def __init__(self): 15 | self._units = array.array(str("I")) 16 | 17 | ROOT = 0 18 | "Root index" 19 | 20 | def has_value(self, index): 21 | "Checks if a given index is related to the end of a key." 22 | return units.has_leaf(self._units[index]) 23 | 24 | def value(self, index): 25 | "Gets a value from a given index." 26 | offset = units.offset(self._units[index]) 27 | value_index = (index ^ offset) & units.PRECISION_MASK 28 | return units.value(self._units[value_index]) 29 | 30 | def read(self, fp): 31 | "Reads a dictionary from an input stream." 32 | base_size = struct.unpack(str("=I"), fp.read(4))[0] 33 | self._units.fromfile(fp, base_size) 34 | 35 | def contains(self, key): 36 | "Exact matching." 
class Completer(object):
    """
    Enumerates the keys stored below a starting index of a dictionary.

    After ``start()``, each successful ``next()`` call leaves the current
    key in ``self.key`` (a bytearray) and makes ``value()`` return the
    value stored for that key.

    NOTE(review): ``dic`` and ``guide`` are presumably ``Dictionary`` and
    ``Guide`` instances -- confirm against the callers.
    """

    def __init__(self, dic=None, guide=None):
        # dic must provide ROOT, has_value(), value() and follow_char();
        # guide must provide size(), child() and sibling().
        self._dic = dic
        self._guide = guide

    def value(self):
        """Return the dictionary value at the index of the last key found."""
        return self._dic.value(self._last_index)

    def start(self, index, prefix=b""):
        """
        Prepare to enumerate keys below ``index``.

        ``prefix`` becomes the initial content of ``self.key``; the bytes
        followed during traversal are appended to it.
        """
        self.key = bytearray(prefix)

        if self._guide.size():
            self._index_stack = [index]
            # ROOT marks "no key produced yet": the first next() call then
            # goes straight to _find_terminal without advancing first.
            self._last_index = self._dic.ROOT
        else:
            # Without guide units there is nothing to traverse, so next()
            # returns False immediately.
            self._index_stack = []

    def next(self):
        """
        Advance to the next key.

        Returns True when ``self.key`` holds a new key, and False when the
        traversal is exhausted or a transition cannot be followed.
        """

        if not self._index_stack:
            return False

        index = self._index_stack[-1]

        # Skipped on the first call after start(), where the starting index
        # itself must be examined before moving anywhere.
        if self._last_index != self._dic.ROOT:

            child_label = self._guide.child(index) # UCharType

            # A label of 0 is treated as "no such transition".
            if child_label:
                # Follows a transition to the first child.
                index = self._follow(child_label, index)
                if index is None:
                    return False
            else:
                # No child: climb towards the start index until some node
                # on the way has a sibling transition to take.
                while True:
                    # Read the sibling label of the node being left before
                    # it is popped from the stack.
                    sibling_label = self._guide.sibling(index)
                    # Moves to the previous node.
                    if len(self.key) > 0:
                        self.key.pop()
                        #self.key[-1] = 0

                    self._index_stack.pop()
                    if not self._index_stack:
                        return False

                    index = self._index_stack[-1]
                    if sibling_label:
                        # Follows a transition to the next sibling.
                        index = self._follow(sibling_label, index)
                        if index is None:
                            return False
                        break

        return self._find_terminal(index)

    def _follow(self, label, index):
        """
        Follow ``label`` from ``index``.

        On success the label is appended to ``self.key``, the new index is
        pushed on the stack and returned; on failure None is returned and
        no state is changed.
        """
        next_index = self._dic.follow_char(label, index)
        if next_index is None:
            return None

        self.key.append(label)
        self._index_stack.append(next_index)
        return next_index

    def _find_terminal(self, index):
        """
        Descend along first-child labels until a unit with a value is reached.

        Returns True and records the index in ``_last_index`` on success;
        returns False when a child transition cannot be followed.
        """
        while not self._dic.has_value(index):
            label = self._guide.child(index)

            index = self._dic.follow_char(label, index)
            if index is None:
                return False

            self.key.append(label)
            self._index_stack.append(index)

        self._last_index = index
        return True
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/large/dawg.dawg -------------------------------------------------------------------------------- /dev_data/large/int_dawg.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/large/int_dawg.dawg -------------------------------------------------------------------------------- /dev_data/large/record_dawg.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/large/record_dawg.dawg -------------------------------------------------------------------------------- /dev_data/small/bytes.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/bytes.dawg -------------------------------------------------------------------------------- /dev_data/small/completion-empty.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/completion-empty.dawg -------------------------------------------------------------------------------- /dev_data/small/completion.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/completion.dawg -------------------------------------------------------------------------------- /dev_data/small/int_completion_dawg.dawg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/int_completion_dawg.dawg -------------------------------------------------------------------------------- /dev_data/small/int_dawg.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/int_dawg.dawg -------------------------------------------------------------------------------- /dev_data/small/prediction-record.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/prediction-record.dawg -------------------------------------------------------------------------------- /dev_data/small/prediction.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/prediction.dawg -------------------------------------------------------------------------------- /dev_data/small/record.dawg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/small/record.dawg -------------------------------------------------------------------------------- /dev_data/words100k.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytries/DAWG-Python/e56241ec919b78735ff79014bf18d7fd1f8e08b9/dev_data/words100k.txt.zip -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 
# --------------------------------------------------------------------------------
# /setup.py:
# --------------------------------------------------------------------------------
#! /usr/bin/env python
"""Packaging script for DAWG-Python.

The long description shown on the package index is README.rst followed by
CHANGES.rst.
"""
import io
import os

from setuptools import setup

# Resolve the documentation files relative to this script so that the build
# does not depend on the current working directory.
_HERE = os.path.dirname(os.path.abspath(__file__))


def _read(filename):
    """Return the text of *filename*, located next to this script.

    The file is opened in a ``with`` block so the handle is closed
    deterministically, and decoded explicitly as UTF-8 instead of relying
    on the locale's default encoding.
    """
    with io.open(os.path.join(_HERE, filename), encoding='utf8') as f:
        return f.read()


setup(
    name="DAWG-Python",
    version="0.7.2",
    description="Pure-python reader for DAWGs (DAFSAs) created by dawgdic C++ library or DAWG Python extension.",
    long_description=_read('README.rst') + "\n\n" + _read('CHANGES.rst'),
    author='Mikhail Korobov',
    author_email='kmike84@gmail.com',
    url='https://github.com/kmike/DAWG-Python/',
    packages=['dawg_python'],

    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Cython',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing :: Linguistic',
    ],
)

# --------------------------------------------------------------------------------
# /tests/__init__.py:
# --------------------------------------------------------------------------------
# (package marker; its two original lines are kept verbatim below)
# # -*- coding: utf-8 -*-
# from __future__ import absolute_import
# --------------------------------------------------------------------------------
# /tests/test_dawg.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | import pickle 4 | import tempfile 5 | 6 | import pytest 7 | import dawg_python 8 | 9 | from .utils import data_path 10 | 11 | def test_c_dawg_contains(): 12 | dawg = pytest.importorskip("dawg") # import dawg 13 | bin_dawg = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3}) 14 | 15 | d = dawg_python.Dictionary() 16 | 17 | fd, path = tempfile.mkstemp() 18 | bin_dawg.save(path) 19 | 20 | with open(path, 'rb') as f: 21 | d.read(f) 22 | 23 | assert d.contains(b'foo') 24 | assert not d.contains(b'x') 25 | assert d.contains(b'foobar') 26 | assert d.contains(b'bar') 27 | 28 | 29 | class TestCompletionDAWG(object): 30 | keys = ['f', 'bar', 'foo', 'foobar'] 31 | 32 | def dawg(self): 33 | return dawg_python.CompletionDAWG().load(data_path('small', 'completion.dawg')) 34 | 35 | def test_contains(self): 36 | d = self.dawg() 37 | for key in self.keys: 38 | assert key in d 39 | 40 | def test_contains_bytes(self): 41 | d = self.dawg() 42 | for key in self.keys: 43 | assert key.encode('utf8') in d 44 | 45 | def test_keys(self): 46 | d = self.dawg() 47 | assert d.keys() == sorted(self.keys) 48 | 49 | def test_iterkeys(self): 50 | d = self.dawg() 51 | assert list(d.iterkeys()) == d.keys() 52 | 53 | def test_completion(self): 54 | d = self.dawg() 55 | 56 | assert d.keys('z') == [] 57 | assert d.keys('b') == ['bar'] 58 | assert d.keys('foo') == ['foo', 'foobar'] 59 | 60 | def test_no_segfaults_on_invalid_file(self): 61 | d = self.dawg() 62 | fd, path = tempfile.mkstemp() 63 | with open(path, 'w') as f: 64 | f.write('foo') 65 | 66 | with pytest.raises(Exception) as e: 67 | d.load(path) 68 | 69 | def test_empty_dawg(self): 70 | d = dawg_python.CompletionDAWG().load(data_path('small', 'completion-empty.dawg')) 71 | assert d.keys() == [] 72 | 73 | def test_prefixes(self): 74 | d = self.dawg() 75 | assert 
d.prefixes("foobarz") == ["f", "foo", "foobar"] 76 | assert d.prefixes("x") == [] 77 | assert d.prefixes("bar") == ["bar"] 78 | 79 | 80 | 81 | class TestIntDAWG(object): 82 | payload = {'foo': 1, 'bar': 5, 'foobar': 3} 83 | 84 | def dawg(self): 85 | return dawg_python.IntDAWG().load(data_path('small', 'int_dawg.dawg')) 86 | 87 | def test_getitem(self): 88 | d = self.dawg() 89 | for key in self.payload: 90 | assert d[key] == self.payload[key] 91 | 92 | with pytest.raises(KeyError): 93 | d['fo'] 94 | 95 | def test_pickling(self): 96 | d = self.dawg() 97 | 98 | data = pickle.dumps(d) 99 | d2 = pickle.loads(data) 100 | 101 | for key, value in self.payload.items(): 102 | assert key in d2 103 | assert d[key] == value 104 | 105 | 106 | class TestIntCompletionDawg(TestIntDAWG): 107 | def dawg(self): 108 | return dawg_python.IntCompletionDAWG().load(data_path('small', 'int_completion_dawg.dawg')) 109 | 110 | def test_completion_keys(self): 111 | assert self.dawg().keys() == sorted(self.payload.keys()) 112 | 113 | def test_completion_keys_with_prefix(self): 114 | assert self.dawg().keys('fo') == ['foo', 'foobar'] 115 | assert self.dawg().keys('foo') == ['foo', 'foobar'] 116 | assert self.dawg().keys('foob') == ['foobar'] 117 | assert self.dawg().keys('z') == [] 118 | assert self.dawg().keys('b') == ['bar'] 119 | 120 | def test_completion_items(self): 121 | assert self.dawg().items() == sorted(self.payload.items(), key=lambda r: r[0]) 122 | -------------------------------------------------------------------------------- /tests/test_fuzzy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | 4 | import dawg_python 5 | 6 | from .utils import words100k, data_path 7 | 8 | words = words100k() 9 | dawg = dawg_python.Dictionary.load(data_path('large', 'int_dawg.dawg')) 10 | 11 | class TestDictionary(object): 12 | 13 | def test_contains(self): 14 | for word in 
words: 15 | assert dawg.contains(word.encode('utf8')) 16 | 17 | def test_find(self): 18 | for word in words: 19 | assert dawg.find(word.encode('utf8')) == len(word) 20 | -------------------------------------------------------------------------------- /tests/test_payload_dawg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | 4 | import pytest 5 | import dawg_python 6 | from .utils import data_path 7 | 8 | class TestBytesDAWG(object): 9 | 10 | DATA = ( 11 | ('foo', b'data1'), 12 | ('bar', b'data2'), 13 | ('foo', b'data3'), 14 | ('foobar', b'data4') 15 | ) 16 | 17 | def dawg(self): 18 | return dawg_python.BytesDAWG().load(data_path("small", "bytes.dawg")) 19 | 20 | def test_contains(self): 21 | d = self.dawg() 22 | for key, val in self.DATA: 23 | assert key in d 24 | 25 | assert 'food' not in d 26 | assert 'x' not in d 27 | assert 'fo' not in d 28 | 29 | 30 | def test_getitem(self): 31 | d = self.dawg() 32 | 33 | assert d['foo'] == [b'data1', b'data3'] 34 | assert d['bar'] == [b'data2'] 35 | assert d['foobar'] == [b'data4'] 36 | 37 | 38 | def test_getitem_missing(self): 39 | d = self.dawg() 40 | 41 | with pytest.raises(KeyError): 42 | d['x'] 43 | 44 | with pytest.raises(KeyError): 45 | d['food'] 46 | 47 | with pytest.raises(KeyError): 48 | d['foobarz'] 49 | 50 | with pytest.raises(KeyError): 51 | d['f'] 52 | 53 | def test_keys(self): 54 | d = self.dawg() 55 | assert d.keys() == ['bar', 'foo', 'foo', 'foobar'] 56 | 57 | def test_iterkeys(self): 58 | d = self.dawg() 59 | assert list(d.iterkeys()) == d.keys() 60 | 61 | def test_key_completion(self): 62 | d = self.dawg() 63 | assert d.keys('fo') == ['foo', 'foo', 'foobar'] 64 | 65 | def test_items(self): 66 | d = self.dawg() 67 | assert d.items() == sorted(self.DATA) 68 | 69 | def test_iteritems(self): 70 | d = self.dawg() 71 | assert list(d.iteritems('xxx')) == [] 72 | assert 
list(d.iteritems('fo')) == d.items('fo') 73 | assert list(d.iteritems()) == d.items() 74 | 75 | def test_items_completion(self): 76 | d = self.dawg() 77 | assert d.items('foob') == [('foobar', b'data4')] 78 | 79 | def test_prefixes(self): 80 | d = self.dawg() 81 | assert d.prefixes("foobarz") == ["foo", "foobar"] 82 | assert d.prefixes("x") == [] 83 | assert d.prefixes("bar") == ["bar"] 84 | 85 | 86 | class TestRecordDAWG(object): 87 | 88 | STRUCTURED_DATA = ( 89 | ('foo', (3, 2, 256)), 90 | ('bar', (3, 1, 0)), 91 | ('foo', (3, 2, 1)), 92 | ('foobar', (6, 3, 0)) 93 | ) 94 | 95 | def dawg(self): 96 | path = data_path("small", "record.dawg") 97 | return dawg_python.RecordDAWG(">3H").load(path) 98 | 99 | def test_getitem(self): 100 | d = self.dawg() 101 | assert d['foo'] == [(3, 2, 1), (3, 2, 256)] 102 | assert d['bar'] == [(3, 1, 0)] 103 | assert d['foobar'] == [(6, 3, 0)] 104 | 105 | def test_getitem_missing(self): 106 | d = self.dawg() 107 | 108 | with pytest.raises(KeyError): 109 | d['x'] 110 | 111 | with pytest.raises(KeyError): 112 | d['food'] 113 | 114 | with pytest.raises(KeyError): 115 | d['foobarz'] 116 | 117 | with pytest.raises(KeyError): 118 | d['f'] 119 | 120 | def test_record_items(self): 121 | d = self.dawg() 122 | assert d.items() == sorted(self.STRUCTURED_DATA) 123 | 124 | def test_record_keys(self): 125 | d = self.dawg() 126 | assert d.keys() == ['bar', 'foo', 'foo', 'foobar',] 127 | 128 | def test_record_keys_prefix(self): 129 | d = self.dawg() 130 | assert d.keys('fo') == ['foo', 'foo', 'foobar'] 131 | assert d.keys('bar') == ['bar'] 132 | assert d.keys('barz') == [] 133 | 134 | def test_prefixes(self): 135 | d = self.dawg() 136 | assert d.prefixes("foobarz") == ["foo", "foobar"] 137 | assert d.prefixes("x") == [] 138 | assert d.prefixes("bar") == ["bar"] 139 | -------------------------------------------------------------------------------- /tests/test_prediction.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, unicode_literals 3 | import pytest 4 | import dawg_python 5 | 6 | from .utils import data_path 7 | 8 | class TestPrediction(object): 9 | 10 | REPLACES = dawg_python.DAWG.compile_replaces({'Е': 'Ё'}) 11 | 12 | DATA = ['ЁЖИК', 'ЁЖИКЕ', 'ЁЖ', 'ДЕРЕВНЯ', 'ДЕРЁВНЯ', 'ЕМ', 'ОЗЕРА', 'ОЗЁРА', 'ОЗЕРО'] 13 | SUITE = [ 14 | ('УЖ', []), 15 | ('ЕМ', ['ЕМ']), 16 | ('ЁМ', []), 17 | ('ЁЖ', ['ЁЖ']), 18 | ('ЕЖ', ['ЁЖ']), 19 | ('ЁЖИК', ['ЁЖИК']), 20 | ('ЕЖИКЕ', ['ЁЖИКЕ']), 21 | ('ДЕРЕВНЯ', ['ДЕРЕВНЯ', 'ДЕРЁВНЯ']), 22 | ('ДЕРЁВНЯ', ['ДЕРЁВНЯ']), 23 | ('ОЗЕРА', ['ОЗЕРА', 'ОЗЁРА']), 24 | ('ОЗЕРО', ['ОЗЕРО']), 25 | ] 26 | 27 | SUITE_ITEMS = [ 28 | ( 29 | it[0], # key 30 | [ 31 | (w, [(len(w),)]) # item, value pair 32 | for w in it[1] 33 | ] 34 | ) 35 | for it in SUITE 36 | ] 37 | 38 | SUITE_VALUES = [ 39 | ( 40 | it[0], # key 41 | [[(len(w),)] for w in it[1]] 42 | ) 43 | for it in SUITE 44 | ] 45 | 46 | def record_dawg(self): 47 | path = data_path("small", "prediction-record.dawg") 48 | return dawg_python.RecordDAWG(str("=H")).load(path) 49 | 50 | 51 | 52 | @pytest.mark.parametrize(("word", "prediction"), SUITE) 53 | def test_dawg_prediction(self, word, prediction): 54 | d = dawg_python.DAWG().load(data_path("small", "prediction.dawg")) 55 | assert d.similar_keys(word, self.REPLACES) == prediction 56 | 57 | @pytest.mark.parametrize(("word", "prediction"), SUITE) 58 | def test_record_dawg_prediction(self, word, prediction): 59 | d = self.record_dawg() 60 | assert d.similar_keys(word, self.REPLACES) == prediction 61 | 62 | @pytest.mark.parametrize(("word", "prediction"), SUITE_ITEMS) 63 | def test_record_dawg_items(self, word, prediction): 64 | d = self.record_dawg() 65 | assert d.similar_items(word, self.REPLACES) == prediction 66 | 67 | @pytest.mark.parametrize(("word", "prediction"), SUITE_VALUES) 68 | def test_record_dawg_items_values(self, word, prediction): 69 | d = self.record_dawg() 70 | assert 
d.similar_item_values(word, self.REPLACES) == prediction 71 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import os 4 | import zipfile 5 | 6 | DEV_DATA_PATH = os.path.join( 7 | os.path.dirname(__file__), 8 | '..', 9 | 'dev_data', 10 | ) 11 | 12 | def data_path(*args): 13 | """ 14 | Returns a path to dev data 15 | """ 16 | return os.path.join(DEV_DATA_PATH, *args) 17 | 18 | def words100k(): 19 | zip_name = data_path('words100k.txt.zip') 20 | zf = zipfile.ZipFile(zip_name) 21 | txt = zf.open(zf.namelist()[0]).read().decode('utf8') 22 | return txt.splitlines() 23 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26,py27,py32,py33,py34,pypy 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | dawg 8 | commands= 9 | py.test [] 10 | 11 | [testenv:pypy] 12 | deps = 13 | pytest 14 | --------------------------------------------------------------------------------