├── test
│   └── __init__.py
├── .gitignore
├── MANIFEST.in
├── README.md
├── revtok
│   ├── __init__.py
│   ├── tokenizer.py
│   └── subwords.py
├── LICENSE
└── setup.py

/test/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
*.egg-info/
dist/
build/
--------------------------------------------------------------------------------

/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include README.md
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# revtok

Reversible tokenization in Python.
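
## Usage

A minimal usage sketch (assuming installation via `pip install revtok`; the sample string is arbitrary):

```python
import revtok

tokens = revtok.tokenize('Hello, world!')            # tokens carry half-space markers
assert revtok.detokenize(tokens) == 'Hello, world!'  # lossless round trip
```

`revtok.SubwordTokenizer(text, max_size)` additionally builds a subword vocabulary of at most `max_size` entries over `text` and segments new strings against it.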
--------------------------------------------------------------------------------

/revtok/__init__.py:
--------------------------------------------------------------------------------
from .tokenizer import tokenize, detokenize
from .subwords import SubwordSegmenter, SubwordTokenizer
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Victor Zhong

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/revtok/tokenizer.py:
--------------------------------------------------------------------------------
import re
import sys
import unicodedata


HALF = ' '      # half-space marker; two adjacent markers detokenize to one real space
CAP = '\ue302'  # private-use marker placed before a letter that was decapitalized


def space_priority(char):
    # Rank Unicode major categories by how strongly they bind to neighboring
    # text: letters and marks highest, then numbers, symbols, punctuation,
    # separators, and control characters lowest.
    return {'L': 7, 'M': 7, 'N': 5, 'S': 3, 'P': 1,
            'Z': -1, 'C': -3}[unicodedata.category(char)[0]]


def tokenize(s, decap=False, split_punctuation=True):
    """Simple reversible tokenizer: each space is split into two HALF
    markers, one attached to each neighboring token, so that detokenize
    can reconstruct the input exactly."""
    toks = ['']
    current_cat = 0
    for c in s:
        cat = space_priority(c)
        if c == ' ':
            toks[-1] += HALF
            toks.append(HALF)
            current_cat = None
            continue
        elif current_cat is None:
            toks[-1] += c
        elif cat == current_cat and (cat > 2 or not split_punctuation):
            toks[-1] += c  # same category: extend the current token
        elif cat <= 0 and current_cat <= 0:
            toks.append(c)
        elif cat <= current_cat:
            toks[-1] += HALF
            toks.append(c)
        else:
            toks.append(HALF + c)
        current_cat = cat
    if toks[0] == '':
        toks = toks[1:]
    if current_cat is not None and current_cat > 0:
        toks[-1] += HALF
    if decap:
        toks = list(map(decapitalize, toks))
    return [sys.intern(tok) for tok in toks]


def decapitalize(tok):
    if len(tok) == 0:
        return tok
    pre, tok = (HALF, tok[1:]) if tok[0] == HALF else ('', tok)
    if len(tok) == 0:  # token was a bare HALF marker; nothing to decapitalize
        return pre
    if tok[0] == tok[0].lower():
        return pre + tok
    if tok[0] == tok[0].upper() and (len(tok) == 1 or tok[1] != tok[1].upper()):
        return CAP + pre + tok[0].lower() + tok[1:]
    return pre + tok


def detokenize(l):
    # Move CAP markers outside leading HALF markers so runs of HALF stay
    # contiguous, collapse each pair of HALF markers back into one space,
    # then re-capitalize the character after each CAP marker.
    text = ''.join(l).replace(CAP + HALF, HALF + CAP)
    text = re.sub(HALF + '+', lambda s: ' ' * (len(s.group(0)) // 2), text)
    return re.sub(CAP + '.', lambda s: s.group(0)[-1].upper(), text, flags=re.S)
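

# Illustrative round-trip check (added for exposition, not part of the
# original module): the library's central property is that detokenize
# inverts tokenize exactly, including capitalization when decap=True.
if __name__ == '__main__':
    sample = 'Hello, world! Reversible tokenization.'
    toks = tokenize(sample, decap=True)
    assert detokenize(toks) == sample
    print(toks)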
--------------------------------------------------------------------------------

/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='revtok',
    version='0.0.3',

    description='Reversible tokenization in Python.',
    long_description=long_description,

    # The project's main homepage.
    url='https://github.com/jekbradbury/revtok',

    # Author details
    author='James Bradbury',
    author_email='jekbradbury@gmail.com',

    license='MIT',

    # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[
        # 3 - Alpha, 4 - Beta, 5 - Production/Stable
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.5',
    ],

    keywords='text nlp',

    # The test package is named 'test' (singular), so exclude both spellings
    # to keep it out of the installed distribution.
    packages=find_packages(exclude=['contrib', 'docs', 'test', 'tests']),

    # revtok.subwords imports tqdm at package import time, so it is a hard
    # run-time dependency.
    install_requires=['tqdm'],

    # Optional dependency groups, installable as e.g. `pip install -e .[dev,test]`
    extras_require={
        'dev': ['check-manifest'],
        'test': ['coverage'],
    },

    package_data={},
    data_files=[],
    entry_points={},
)
--------------------------------------------------------------------------------

/revtok/subwords.py:
--------------------------------------------------------------------------------
# coding=utf-8
from .tokenizer import tokenize

from collections import defaultdict, Counter
from operator import attrgetter
import sys

from tqdm import tqdm


class keydefaultdict(defaultdict):
    """defaultdict whose factory receives the missing key as an argument."""
    def __missing__(self, key):
        ret = self[key] = self.default_factory(key)
        return ret


class Utterance:
    def __init__(self, text):
        self.text = text
        self.count = 0
        self.ngrams = set()  # every NGram that occurs in this utterance

    def overlaps(self, ngram1, ngram2):
        """Average number of overlapping occurrence pairs of ngram1 and
        ngram2 within this utterance."""
        inds1, inds2 = ngram1.utterances[self], ngram2.utterances[self]
        ret = 0
        for i1 in inds1:
            for i2 in inds2:
                # TODO verify all these exactly
                if i2 <= i1 <= i1 + ngram1.n <= i2 + ngram2.n:
                    ret += 1  # ngram1 lies entirely inside ngram2
                elif i1 <= i2 <= i2 + ngram2.n <= i1 + ngram1.n:
                    ret += 0  # ngram2 lies entirely inside ngram1
                elif i1 <= i2 < i1 + ngram1.n:
                    ret += 1  # partial overlap  # - (i2 - i1) / ngram1.n
                elif i2 <= i1 < i2 + ngram2.n:
                    ret += 1  # partial overlap  # (i1 - i2) / ngram1.n
        return ret / (len(inds1) * len(inds2))


class NGram:
    def __init__(self, text):
        self.n = len(text)
        self.text = text
        self.utterances = defaultdict(set)  # utterance -> set of start indices
        self._count = 0
        self.entropy = 0

    @property
    def count(self):
        return self._count

    @count.setter
    def count(self, value):
        # 'entropy' here is a savings score: occurrence count times the number
        # of characters saved by treating this ngram as one vocabulary entry.
        self._count = value
        self.entropy = self._count * (self.n - 1)

    def add(self, utterance, i):
        self.count += utterance.count
        self.utterances[utterance].add(i)
        utterance.ngrams.add(self)

    def __repr__(self):
        return "'{0}': {1}".format(self.text, self.count)


class NGrams:
    def __init__(self, counter):
        self.ngrams = keydefaultdict(NGram)
        utterances = keydefaultdict(Utterance)
        for text, count in counter.items():
            utterances[text].count = count
        for utterance in tqdm(utterances.values(), 'enumerating ngrams'):
            self.from_utterance(utterance)

    def from_utterance(self, utterance):
        # Register every substring of length >= 2 as an ngram occurrence.
        N = len(utterance.text)
        for i in range(N - 1):
            for n in range(2, N + 1 - i):
                self.ngrams[utterance.text[i:i+n]].add(utterance, i)
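

# Illustration (added for exposition, not executed): for a lone utterance
# 'aba' with count 1, NGrams.from_utterance registers the ngrams 'ab', 'aba',
# and 'ba'. Each NGram's 'entropy' is count * (n - 1), here 1, 2, and 1:
# roughly the characters saved per pass over the corpus if that ngram became
# a single vocabulary entry.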


class SubwordSegmenter:
    # TODO MAYBE allow segmentations like " aware " + "ness "
    def __init__(self, counter, max_size, force_python=False):
        self.cache = dict()
        if not force_python:
            # Prefer the Julia implementation (Revtok.jl) when available.
            try:
                import julia
                self.julia = julia.Julia()
                self.julia.using("Revtok")
                self.vocab = self.julia.buildvocab(counter, max_size)
                return
            except ImportError:
                print('For faster subwords, please install Julia 0.6, pyjulia, '
                      'and Revtok.jl. Falling back to Python implementation...')
            except Exception as e:
                print(e)
                print('For faster subwords, please install Revtok.jl. '
                      'Falling back to Python implementation...')
        # Seed the vocabulary with all single characters, most frequent first
        # (ties broken alphabetically for determinism).
        self.vocab = Counter(''.join(counter.keys())).most_common()
        self.vocab.sort(key=lambda tup: (-tup[1], tup[0]))
        self.vocab = dict(self.vocab)
        ngrams = list(NGrams(counter).ngrams.values())
        ngrams.sort(key=attrgetter('text'))
        key = attrgetter('entropy')
        # Greedily add the highest-scoring ngram to the vocabulary, then
        # discount the scores of overlapping ngrams so the same characters
        # are not counted twice.
        for i in tqdm(range(max_size - len(self.vocab)), 'building subword vocab'):
            ngrams.sort(key=key, reverse=True)
            best = ngrams[0]
            for utterance in best.utterances:
                seen = set([best])
                for ngram in utterance.ngrams:
                    if ngram not in seen:
                        ngram.count -= utterance.count * utterance.overlaps(ngram, best)
                        seen.add(ngram)
            self.vocab[best.text] = best.entropy
            ngrams = ngrams[1:]
        self.julia = None

    def __call__(self, utterance, use_julia=False):
        if self.julia is not None and use_julia:
            return self.julia.segment(utterance, self.vocab)
        if isinstance(utterance, list):
            return [tok for u in utterance for tok in self(u)]
        if utterance in self.vocab:
            return [utterance]
        if utterance in self.cache:
            return self.cache[utterance]
        # Dynamic program over cut points: segments[j] is the shortest list
        # of vocabulary pieces (or single characters) covering utterance[:j].
        i, segments = 0, {0: []}
        while True:
            for j in range(i + 1, len(utterance) + 1):
                potential_segment = utterance[i:j]
                if len(potential_segment) == 1 or potential_segment in self.vocab:
                    curlen = len(segments[j]) if j in segments else len(utterance) + 1
                    if len(segments[i]) + 1 < curlen:
                        segments[j] = segments[i] + [potential_segment]
            # Advance to the next reachable cut point, if any remain.
            inds = sorted(segments.keys())
            if inds.index(i) < len(inds) - 1:
                i = inds[inds.index(i) + 1]
            else:
                break
        ret = segments[len(utterance)]
        ret = [sys.intern(seg) for seg in ret]
        self.cache[utterance] = ret
        return ret
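

# Worked example (added for exposition, not executed): with a vocabulary
# containing 'ab' and 'cd', segmenting 'abcd' sweeps cut points left to
# right, recording the shortest segmentation reaching each index:
# segments = {0: [], 1: ['a'], 2: ['ab'], 3: ['ab', 'c'], 4: ['ab', 'cd']}.
# The result segments[4] == ['ab', 'cd'] is a fewest-pieces cover in which
# every piece is in the vocabulary or is a single character.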


class SubwordTokenizer:
    def __init__(self, text, max_size):
        # Note: training decapitalizes tokens, but __call__ currently does
        # not, so capitalized variants fall back to shorter pieces.
        corpus = tokenize(text, decap=True)
        self.segmenter = SubwordSegmenter(Counter(corpus), max_size)

    def __call__(self, text):
        segments = map(self.segmenter, tokenize(text))
        return [tok for word in segments for tok in word]


if __name__ == '__main__':
    # Smoke test, adapted from an older commented-out driver that called a
    # pre-class API (build_vocab/segment); the corpus and vocabulary size
    # here are arbitrary stand-ins.
    train = 'megabyte gigabyte terabyte'
    test = 'petabyte kilobyte'
    tokenizer = SubwordTokenizer(train, max_size=30)
    segments = tokenizer(test)
    print(segments)
    print(len(segments))
--------------------------------------------------------------------------------