├── test
│   └── __init__.py
├── .gitignore
├── MANIFEST.in
├── README.md
├── revtok
│   ├── __init__.py
│   ├── tokenizer.py
│   └── subwords.py
├── LICENSE
└── setup.py

/test/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
*.egg-info/
dist/
build/
--------------------------------------------------------------------------------

/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include README.md
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# revtok

Reversible tokenization in Python.
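
## Usage

A minimal usage sketch (assuming installation via `pip install revtok`; the sample string is arbitrary):

```python
import revtok

tokens = revtok.tokenize('Hello, world!')            # tokens carry half-space markers
assert revtok.detokenize(tokens) == 'Hello, world!'  # lossless round trip
```

`revtok.SubwordTokenizer(text, max_size)` additionally builds a subword vocabulary of at most `max_size` entries over `text` and segments new strings against it.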
--------------------------------------------------------------------------------

/revtok/__init__.py:
--------------------------------------------------------------------------------
from .tokenizer import tokenize, detokenize
from .subwords import SubwordSegmenter, SubwordTokenizer
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Victor Zhong

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/revtok/tokenizer.py:
--------------------------------------------------------------------------------
import re
import sys
import unicodedata


HALF = ' '      # half-space marker; two adjacent markers detokenize to one real space
CAP = '\ue302'  # private-use marker placed before a letter that was decapitalized


def space_priority(char):
    # Rank Unicode major categories by how strongly they bind to neighboring
    # text: letters and marks highest, then numbers, symbols, punctuation,
    # separators, and control characters lowest.
    return {'L': 7, 'M': 7, 'N': 5, 'S': 3, 'P': 1,
            'Z': -1, 'C': -3}[unicodedata.category(char)[0]]


def tokenize(s, decap=False, split_punctuation=True):
    """Simple reversible tokenizer: each space is split into two HALF
    markers, one attached to each neighboring token, so that detokenize
    can reconstruct the input exactly."""
    toks = ['']
    current_cat = 0
    for c in s:
        cat = space_priority(c)
        if c == ' ':
            toks[-1] += HALF
            toks.append(HALF)
            current_cat = None
            continue
        elif current_cat is None:
            toks[-1] += c
        elif cat == current_cat and (cat > 2 or not split_punctuation):
            toks[-1] += c  # same category: extend the current token
        elif cat <= 0 and current_cat <= 0:
            toks.append(c)
        elif cat <= current_cat:
            toks[-1] += HALF
            toks.append(c)
        else:
            toks.append(HALF + c)
        current_cat = cat
    if toks[0] == '':
        toks = toks[1:]
    if current_cat is not None and current_cat > 0:
        toks[-1] += HALF
    if decap:
        toks = list(map(decapitalize, toks))
    return [sys.intern(tok) for tok in toks]


def decapitalize(tok):
    if len(tok) == 0:
        return tok
    pre, tok = (HALF, tok[1:]) if tok[0] == HALF else ('', tok)
    if len(tok) == 0:  # token was a bare HALF marker; nothing to decapitalize
        return pre
    if tok[0] == tok[0].lower():
        return pre + tok
    if tok[0] == tok[0].upper() and (len(tok) == 1 or tok[1] != tok[1].upper()):
        return CAP + pre + tok[0].lower() + tok[1:]
    return pre + tok


def detokenize(l):
    # Move CAP markers outside leading HALF markers so runs of HALF stay
    # contiguous, collapse each pair of HALF markers back into one space,
    # then re-capitalize the character after each CAP marker.
    text = ''.join(l).replace(CAP + HALF, HALF + CAP)
    text = re.sub(HALF + '+', lambda s: ' ' * (len(s.group(0)) // 2), text)
    return re.sub(CAP + '.', lambda s: s.group(0)[-1].upper(), text, flags=re.S)
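

# Illustrative round-trip check (added for exposition, not part of the
# original module): the library's central property is that detokenize
# inverts tokenize exactly, including capitalization when decap=True.
if __name__ == '__main__':
    sample = 'Hello, world! Reversible tokenization.'
    toks = tokenize(sample, decap=True)
    assert detokenize(toks) == sample
    print(toks)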
--------------------------------------------------------------------------------

/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='revtok',
    version='0.0.3',

    description='Reversible tokenization in Python.',
    long_description=long_description,

    # The project's main homepage.
    url='https://github.com/jekbradbury/revtok',

    # Author details
    author='James Bradbury',
    author_email='jekbradbury@gmail.com',

    license='MIT',

    # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[
        # 3 - Alpha, 4 - Beta, 5 - Production/Stable
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.5',
    ],

    keywords='text nlp',

    # The test package is named 'test' (singular), so exclude both spellings
    # to keep it out of the installed distribution.
    packages=find_packages(exclude=['contrib', 'docs', 'test', 'tests']),

    # revtok.subwords imports tqdm at package import time, so it is a hard
    # run-time dependency.
    install_requires=['tqdm'],

    # Optional dependency groups, installable as e.g. `pip install -e .[dev,test]`
    extras_require={
        'dev': ['check-manifest'],
        'test': ['coverage'],
    },

    package_data={},
    data_files=[],
    entry_points={},
)
--------------------------------------------------------------------------------

/revtok/subwords.py:
--------------------------------------------------------------------------------
# coding=utf-8
from .tokenizer import tokenize

from collections import defaultdict, Counter
from operator import attrgetter
import sys

from tqdm import tqdm


class keydefaultdict(defaultdict):
    """defaultdict whose factory receives the missing key as an argument."""
    def __missing__(self, key):
        ret = self[key] = self.default_factory(key)
        return ret


class Utterance:
    def __init__(self, text):
        self.text = text
        self.count = 0
        self.ngrams = set()  # every NGram that occurs in this utterance

    def overlaps(self, ngram1, ngram2):
        """Average number of overlapping occurrence pairs of ngram1 and
        ngram2 within this utterance."""
        inds1, inds2 = ngram1.utterances[self], ngram2.utterances[self]
        ret = 0
        for i1 in inds1:
            for i2 in inds2:
                # TODO verify all these exactly
                if i2 <= i1 <= i1 + ngram1.n <= i2 + ngram2.n:
                    ret += 1  # ngram1 lies entirely inside ngram2
                elif i1 <= i2 <= i2 + ngram2.n <= i1 + ngram1.n:
                    ret += 0  # ngram2 lies entirely inside ngram1
                elif i1 <= i2 < i1 + ngram1.n:
                    ret += 1  # partial overlap  # - (i2 - i1) / ngram1.n
                elif i2 <= i1 < i2 + ngram2.n:
                    ret += 1  # partial overlap  # (i1 - i2) / ngram1.n
        return ret / (len(inds1) * len(inds2))


class NGram:
    def __init__(self, text):
        self.n = len(text)
        self.text = text
        self.utterances = defaultdict(set)  # utterance -> set of start indices
        self._count = 0
        self.entropy = 0

    @property
    def count(self):
        return self._count

    @count.setter
    def count(self, value):
        # 'entropy' here is a savings score: occurrence count times the number
        # of characters saved by treating this ngram as one vocabulary entry.
        self._count = value
        self.entropy = self._count * (self.n - 1)

    def add(self, utterance, i):
        self.count += utterance.count
        self.utterances[utterance].add(i)
        utterance.ngrams.add(self)

    def __repr__(self):
        return "'{0}': {1}".format(self.text, self.count)


class NGrams:
    def __init__(self, counter):
        self.ngrams = keydefaultdict(NGram)
        utterances = keydefaultdict(Utterance)
        for text, count in counter.items():
            utterances[text].count = count
        for utterance in tqdm(utterances.values(), 'enumerating ngrams'):
            self.from_utterance(utterance)

    def from_utterance(self, utterance):
        # Register every substring of length >= 2 as an ngram occurrence.
        N = len(utterance.text)
        for i in range(N - 1):
            for n in range(2, N + 1 - i):
                self.ngrams[utterance.text[i:i+n]].add(utterance, i)
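

# Illustration (added for exposition, not executed): for a lone utterance
# 'aba' with count 1, NGrams.from_utterance registers the ngrams 'ab', 'aba',
# and 'ba'. Each NGram's 'entropy' is count * (n - 1), here 1, 2, and 1:
# roughly the characters saved per pass over the corpus if that ngram became
# a single vocabulary entry.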


class SubwordSegmenter:
    # TODO MAYBE allow segmentations like " aware " + "ness "
    def __init__(self, counter, max_size, force_python=False):
        self.cache = dict()
        if not force_python:
            # Prefer the Julia implementation (Revtok.jl) when available.
            try:
                import julia
                self.julia = julia.Julia()
                self.julia.using("Revtok")
                self.vocab = self.julia.buildvocab(counter, max_size)
                return
            except ImportError:
                print('For faster subwords, please install Julia 0.6, pyjulia, '
                      'and Revtok.jl. Falling back to Python implementation...')
            except Exception as e:
                print(e)
                print('For faster subwords, please install Revtok.jl. '
                      'Falling back to Python implementation...')
        # Seed the vocabulary with all single characters, most frequent first
        # (ties broken alphabetically for determinism).
        self.vocab = Counter(''.join(counter.keys())).most_common()
        self.vocab.sort(key=lambda tup: (-tup[1], tup[0]))
        self.vocab = dict(self.vocab)
        ngrams = list(NGrams(counter).ngrams.values())
        ngrams.sort(key=attrgetter('text'))
        key = attrgetter('entropy')
        # Greedily add the highest-scoring ngram to the vocabulary, then
        # discount the scores of overlapping ngrams so the same characters
        # are not counted twice.
        for i in tqdm(range(max_size - len(self.vocab)), 'building subword vocab'):
            ngrams.sort(key=key, reverse=True)
            best = ngrams[0]
            for utterance in best.utterances:
                seen = set([best])
                for ngram in utterance.ngrams:
                    if ngram not in seen:
                        ngram.count -= utterance.count * utterance.overlaps(ngram, best)
                        seen.add(ngram)
            self.vocab[best.text] = best.entropy
            ngrams = ngrams[1:]
        self.julia = None

    def __call__(self, utterance, use_julia=False):
        if self.julia is not None and use_julia:
            return self.julia.segment(utterance, self.vocab)
        if isinstance(utterance, list):
            return [tok for u in utterance for tok in self(u)]
        if utterance in self.vocab:
            return [utterance]
        if utterance in self.cache:
            return self.cache[utterance]
        # Dynamic program over cut points: segments[j] is the shortest list
        # of vocabulary pieces (or single characters) covering utterance[:j].
        i, segments = 0, {0: []}
        while True:
            for j in range(i + 1, len(utterance) + 1):
                potential_segment = utterance[i:j]
                if len(potential_segment) == 1 or potential_segment in self.vocab:
                    curlen = len(segments[j]) if j in segments else len(utterance) + 1
                    if len(segments[i]) + 1 < curlen:
                        segments[j] = segments[i] + [potential_segment]
            # Advance to the next reachable cut point, if any remain.
            inds = sorted(segments.keys())
            if inds.index(i) < len(inds) - 1:
                i = inds[inds.index(i) + 1]
            else:
                break
        ret = segments[len(utterance)]
        ret = [sys.intern(seg) for seg in ret]
        self.cache[utterance] = ret
        return ret
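

# Worked example (added for exposition, not executed): with a vocabulary
# containing 'ab' and 'cd', segmenting 'abcd' sweeps cut points left to
# right, recording the shortest segmentation reaching each index:
# segments = {0: [], 1: ['a'], 2: ['ab'], 3: ['ab', 'c'], 4: ['ab', 'cd']}.
# The result segments[4] == ['ab', 'cd'] is a fewest-pieces cover in which
# every piece is in the vocabulary or is a single character.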


class SubwordTokenizer:
    def __init__(self, text, max_size):
        # Note: training decapitalizes tokens, but __call__ currently does
        # not, so capitalized variants fall back to shorter pieces.
        corpus = tokenize(text, decap=True)
        self.segmenter = SubwordSegmenter(Counter(corpus), max_size)

    def __call__(self, text):
        segments = map(self.segmenter, tokenize(text))
        return [tok for word in segments for tok in word]


if __name__ == '__main__':
    # Smoke test, adapted from an older commented-out driver that called a
    # pre-class API (build_vocab/segment); the corpus and vocabulary size
    # here are arbitrary stand-ins.
    train = 'megabyte gigabyte terabyte'
    test = 'petabyte kilobyte'
    tokenizer = SubwordTokenizer(train, max_size=30)
    segments = tokenizer(test)
    print(segments)
    print(len(segments))
--------------------------------------------------------------------------------