├── .travis.yml ├── CHANGES ├── COPYING ├── COPYING.ipadic ├── MANIFEST.in ├── README ├── aozora_html2txt.py ├── appveyor.yml ├── igo ├── __init__.py ├── dictionary.py ├── dictreader.py ├── ipadic │ ├── char.category │ ├── code2category │ ├── matrix.bin │ ├── word.ary.idx │ ├── word.dat │ ├── word.inf │ └── word2id ├── parse.py ├── tagger.py ├── tests │ └── test_parse.py └── trie.py ├── setup.cfg ├── setup.py ├── test.py └── tox.ini /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | matrix: 4 | include: 5 | - python: "2.7" 6 | - python: "3.4" 7 | - python: "3.5" 8 | - python: "3.6" 9 | - python: "3.7" 10 | dist: xenial 11 | sudo: required 12 | - python: "pypy" 13 | - python: "pypy3" 14 | install: 15 | - pip install tox-travis 16 | script: tox 17 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | Changelog for Igo-Python 2 | ======================== 3 | 2018-09-22 -- 1.0.0 4 | * dropped Python 3.3 from supported Python version. 5 | 6 | 2017-02-09 -- 0.9.9 7 | * fixed few small issues and add unit test. 8 | 9 | 2016-07-08 -- 0.9.8 10 | * fixed a surrogate pair character handling issue 11 | 12 | 2015-12-16 -- 0.9.7.1 13 | * Tagger can be pickled on Python2 env 14 | 15 | 2015-12-14 -- 0.9.7 16 | * include pre-built IPA dictionary 17 | 18 | 2015-07-03 -- 0.9.6 19 | * fixed an issue of dictionary size in mmap mode 20 | 21 | 2015-04-14 -- 0.9.5 22 | * use mmap to load files of a dictionary if it is possible 23 | 24 | 2013-07-05 -- 0.9.3 25 | * support characters in Supplementary Planes 26 | 27 | 2012-02-27 -- 0.9.2 28 | * apply changes in Igo(java) 0.4.4 29 | 30 | 2012-02-27 -- 0.9.1 31 | * synchronize with Igo(java) 0.4.5 32 | 33 | 2011-09-01 -- 0.9 34 | * compatibility with python3.2 35 | * update the code base to igo-0.4.3 36 | * add some enhances and clean up. 
37 | 38 | 2011-08-29 -- 0.4 39 | * add some enhances and clean up. 40 | 41 | 2010-11-28 -- 0.3a 42 | * fix bugs(if a phrase ends with 'た' will causes error) 43 | 44 | 2010-11-27 -- 0.3 45 | * drop mmap related code 46 | * reduce memory footprint 47 | 48 | 2010-11-27 -- 0.2 49 | * support Google App Engine(maybe) 50 | 51 | 2010-11-27 -- 0.1a 52 | * update package info only. 53 | 54 | 2010-11-25 -- 0.1 55 | * first release. 56 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010 Hideaki Takahashi 4 | (This software is based on Igo Java Version (c) Takeru Ohta 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /COPYING.ipadic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hideaki-t/igo-python/2144b91b4595d31d833bebaf9d99c223268316f4/COPYING.ipadic -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include COPYING CHANGES 2 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ================ 2 | Igo for Python 3 | ================ 4 | 5 | About 6 | ===== 7 | 8 | Igo_ is a Japanese morphological analyzer written in Java and Common Lisp. 9 | This software is Python port of Igo(Java version). 10 | 11 | .. _Igo: http://igo.osdn.jp/ 12 | 13 | Notice 14 | ====== 15 | 16 | Dictionary builder is not provided. You need to use Igo Java version to build the dictionary for Igo. 17 | From igo-python 0.9.7, pre-built `IPA dictionary (2.7.0-20070801)`__ is included for ease of use. 18 | 19 | __ https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM 20 | 21 | How To Use 22 | ========== 23 | 24 | You can use Igo Python easily:: 25 | 26 | >>> from igo.Tagger import Tagger 27 | >>> t = Tagger() # use bundled dictionary 28 | >>> for m in t.parse(u'すもももももももものうち'): 29 | ... print m.surface, m.feature 30 | ... 
# Convert an Aozora Bunko HTML file to plain text on stdout.
import lxml.html
import sys


# Force UTF-8 output regardless of the locale so the extracted Japanese
# text survives redirection/piping (Python 2 and 3 need different wrapping).
if sys.version_info[0] < 3:
    import codecs
    sys.stdout = codecs.lookup('utf-8').streamwriter(sys.stdout)
else:
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


# Parse the path given as the first CLI argument, or read HTML from stdin.
target = sys.argv[1] if len(sys.argv) > 1 else sys.stdin
r = lxml.html.parse(target).getroot()
print(r.text_content())
class ViterbiNode(object):
    """A single node in the lattice walked by the Viterbi algorithm."""
    __slots__ = ['cost', 'prev', 'word_id', 'start', 'length', 'left_id',
                 'right_id', 'isspace']

    def __init__(self, word_id, start, length, cost, left_id, right_id,
                 isspace):
        self.word_id = word_id    # word id in the dictionary
        self.start = start        # start position of the morpheme in the input
        self.length = length      # surface length in code units
        self.cost = cost          # total cost accumulated from the start node
        self.left_id = left_id    # left context id
        self.right_id = right_id  # right context id
        self.isspace = isspace    # whether the character category is SPACE
        self.prev = None          # link to the minimum-cost predecessor node

    @staticmethod
    def makeBOSEOS():
        """Return a fresh zeroed sentinel node used for both BOS and EOS."""
        return ViterbiNode(0, 0, 0, 0, 0, 0, False)

    def __repr__(self):
        attrs = {name: getattr(self, name) for name in ViterbiNode.__slots__}
        return str(attrs)
class Category:
    """One character-category record decoded from char.category."""

    __slots__ = ['id', 'length', 'invoke', 'group']

    def __init__(self, i, l, iv, g):
        # Flag fields are stored as 0/1 integers in the dictionary file.
        self.group = g == 1    # greedily group runs of same-category chars
        self.invoke = iv == 1  # always invoke unknown-word processing
        self.length = l        # max candidate length for unknown words
        self.id = i            # numeric category id
    def search(self, text, start, wdic, callback):
        """Generate unknown-word (out-of-dictionary) morpheme candidates
        beginning at ``text[start]`` and report each one to ``callback``
        through ``wdic.search_from_trie``.

        ``text`` is a sequence of UTF-16 code units; ``wdic`` is the WordDic
        holding the per-category pseudo entries.
        """
        category = self.category
        ch = text[start]
        ct = category.category(ch)
        length = len(text)

        # If the word dictionary already produced candidates at this position
        # and this category does not force invocation, propose nothing.
        if not callback.isempty() and not ct.invoke:
            return

        cid = ct.id
        isspace = cid == self.space_id
        # Propose prefixes of length 1..ct.length (capped at end of text),
        # stopping early when a category-incompatible character appears.
        limit = min(length, ct.length + start)
        for i in range(start + 1, limit):
            wdic.search_from_trie(cid, start, i - start, isspace, callback)
            if not category.is_compatible(ch, text[i]):
                return
        wdic.search_from_trie(cid, start, limit - start, isspace, callback)

        # Greedy grouping: for grouping categories, additionally propose the
        # longest run of compatible characters past the normal limit.
        if ct.group and limit < length:
            for i in range(limit, length):
                if not category.is_compatible(ch, text[i]):
                    wdic.search_from_trie(cid, start, i - start, isspace,
                                          callback)
                    return
            wdic.search_from_trie(cid, start, length - start, isspace,
                                  callback)
use_mmap) 183 | with self.wi_rd as r: 184 | wc = r.size() // (4 + 2 + 2 + 2) 185 | self.offsets = r.get_intarray(wc) 186 | """ dataOffsets[単語ID] = 単語の素性データの開始位置 """ 187 | self.left_ids = r.get_shortarray(wc) 188 | """ leftIds[単語ID] = 単語の左文脈ID """ 189 | self.right_ids = r.get_shortarray(wc) 190 | """ rightIds[単語ID] = 単語の右文脈ID """ 191 | self.costs = r.get_shortarray(wc) 192 | """ consts[単語ID] = 単語のコスト """ 193 | 194 | def release(self): 195 | del self.data 196 | del self.indices 197 | del self.offsets 198 | del self.left_ids 199 | del self.right_ids 200 | del self.costs 201 | self.trie.release() 202 | del self.trie 203 | if not self.splitted: 204 | self.wd_rd.release() 205 | del self.wd_rd 206 | self.wa_rd.release() 207 | del self.wa_rd 208 | self.wi_rd.release() 209 | del self.wi_rd 210 | 211 | def search(self, text, start, callback): 212 | costs = self.costs 213 | left_ids = self.left_ids 214 | right_ids = self.right_ids 215 | indices = self.indices 216 | 217 | def fn(start, offset, trieId): 218 | end = indices[trieId + 1] 219 | for i in range(indices[trieId], end): 220 | callback(ViterbiNode(i, start, offset, costs[i], left_ids[i], 221 | right_ids[i], False)) 222 | 223 | self.trie.commonprefix_search(text, start, fn) 224 | 225 | def search_from_trie(self, trie_id, start, length, isspace, callback): 226 | costs = self.costs 227 | left_ids = self.left_ids 228 | right_ids = self.right_ids 229 | end = self.indices[trie_id + 1] 230 | for i in range(self.indices[trie_id], end): 231 | callback(ViterbiNode(i, start, length, costs[i], left_ids[i], 232 | right_ids[i], isspace)) 233 | 234 | def word_data(self, word_id): 235 | return tobytes(self.data[self.offsets[word_id]:self.offsets[word_id + 236 | 1]]) 237 | -------------------------------------------------------------------------------- /igo/dictreader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | import array 4 | 
# Native byte order and the matching UTF-16 codec for this platform.
# BUG FIX: the big-endian branch previously also looked up 'UTF-16-LE',
# which would mis-decode every dictionary string on big-endian hosts.
if sys.byteorder == 'little':
    LE, UTF16Codec = True, codecs.lookup('UTF-16-LE')
else:
    LE, UTF16Codec = False, codecs.lookup('UTF-16-BE')

try:
    import mmap
    # struct sizes for the element types used by MMapedReader ('i'/'h'/'H')
    sizemap = {t: struct.calcsize(t) for t in 'ihH'}
    # mmap mode needs memoryview.cast and native little-endian data
    allow_mmap = hasattr(memoryview, 'cast') and LE
except ImportError:
    # platform without mmap support: always fall back to StandardReader
    allow_mmap = False

if hasattr(os, 'fstat'):

    def size(f):
        """Return the size in bytes of an open file via its descriptor."""
        return os.fstat(f.fileno()).st_size
else:

    def size(f):
        """Fallback for platforms lacking fstat: stat the file by name."""
        return os.stat(f.name).st_size


def nop(a):
    """No-op used in place of byteswap when data is already native order."""
    pass


def swap(a):
    """Byte-swap an array in place (endianness conversion)."""
    a.byteswap()
class MMapedReader:
    """
    Dictionary reader backed by mmap.

    Only reads dictionaries whose data sizes and byte order match the
    running platform; no endianness conversion is performed (see _get).
    """
    __slots__ = ['fd', 'mmap', 'view', 'pos']

    def __init__(self, path, bigendian=False):
        # NOTE(review): ``bigendian`` is accepted for interface parity with
        # StandardReader but appears unused here; DictReader only selects
        # this class when allow_mmap (native little-endian) holds — confirm.
        self.fd = os.open(path, os.O_RDONLY)
        self.mmap = mmap.mmap(self.fd, length=0, access=mmap.ACCESS_READ)
        self.view = memoryview(self.mmap)
        self.pos = 0  # current read offset in bytes

    def __enter__(self):
        return self

    def __exit__(self, et, ev, t):
        self.close()

    def _get(self, fmt, cnt):
        # need to support endian conversion?
        # also size of types must be native
        # Zero-copy: returns a typed memoryview over the next ``cnt`` items
        # of struct format ``fmt`` and advances the read position.
        t = self.pos + sizemap[fmt] * cnt
        view = memoryview(self.view[self.pos:t]).cast(fmt)
        self.pos = t
        return view

    def get_int(self):
        """Read one native int32 and advance."""
        v = self._get('i', 1)[0]
        return v

    def get_intarray(self, count=None):
        """Read ``count`` int32 values (whole remaining file when None)."""
        c = count if count is not None else (self.size() // 4)
        return self._get('i', c)

    def get_shortarray(self, count):
        """Read ``count`` int16 values."""
        return self._get('h', count)

    def get_chararray(self, count=None):
        """Read ``count`` uint16 (UTF-16 code unit) values."""
        c = count if count is not None else (self.size() // 2)
        return self._get('H', c)

    def size(self):
        """Total size of the mapped file in bytes."""
        return len(self.mmap)

    def close(self):
        # nothing to close, everything is mapped
        pass

    def release(self):
        """Release the memoryview and mapping, then close the descriptor."""
        self.view.release()
        self.mmap.close()
        os.close(self.fd)
def get_chararray_multi(filepaths, bigendian=False):
    """Concatenate several files of UTF-16 code units into a single array.

    Used only for the splitted-dictionary layout; no mmap variant exists.
    """
    result = array.array('H')
    for filepath in filepaths:
        with open(filepath, 'rb') as fp:
            result.fromfile(fp, size(fp) // 2)
    # Convert to native order when the files were written big-endian and
    # this machine is little-endian.
    if bigendian and LE:
        result.byteswap()
    return result
def main():
    """Command-line driver: read text line-by-line from stdin and print one
    morpheme per line as "surface<TAB>feature", followed by "EOS" after
    each input line.

    The dictionary directory comes from the IGO_DICT environment variable;
    when unset, Tagger falls back to the bundled IPA dictionary.
    """
    if sys.platform == 'cli':
        # IronPython: its standard streams already deal in unicode text
        i = sys.stdin
        o = sys.stdout
    elif sys.version_info[0] < 3:
        # Python 2: rewrap stdio so we read/write unicode in the locale encoding
        enc = locale.getpreferredencoding()
        i = io.open(sys.stdin.fileno(), encoding=enc, closefd=False)
        o = io.open(sys.stdout.fileno(), mode='w', encoding=enc, closefd=False)
    else:
        # just turn on universal newline mode to align python2
        i = io.TextIOWrapper(sys.stdin.buffer)
        o = sys.stdout

    with Tagger(os.getenv('IGO_DICT')) as tagger:
        for l in i:
            for m in tagger.parse(l):
                print(m.fmt('{surface}\t{feature}'), file=o)
            print('EOS', file=o)
class Morpheme:
    """A single morpheme produced by the analyzer."""

    __slots__ = ['surface', 'feature', 'start']

    def __init__(self, surface, feature, start):
        self.surface = surface  # surface form as it appeared in the text
        self.feature = feature  # comma-separated feature string (POS etc.)
        self.start = start      # start offset within the analyzed text

    def fmt(self, fmt="surface: {surface}, feature: {feature}, start={start}"):
        """Render this morpheme through a str.format template that may use
        the placeholders {surface}, {feature} and {start}."""
        return fmt.format(surface=self.surface,
                          feature=self.feature,
                          start=self.start)

    def __str__(self):
        return self.fmt()
    def __parse(self, text):
        """Run the Viterbi search over ``text`` (an array of UTF-16 code
        units) and return the head of the minimum-cost morpheme chain,
        linked start-to-end via ``prev``.
        """
        length = len(text)
        # nodes[i] holds the list of lattice nodes ending at position i;
        # None means no morpheme ends there (position unreachable so far).
        nodes = [None] * (length + 1)
        nodes[0] = Tagger.__BOS_NODES

        wdc = self.wdc
        unk = self.unk
        fn = MakeLattice(nodes, self.set_mincost_node)
        for i in range(0, length):
            if nodes[i] is not None:
                fn.set(i)
                wdc.search(text, i, fn)  # look up morphemes in the word dictionary
                unk.search(text, i, wdc, fn)  # look up unknown-word candidates

        # Attach a synthetic EOS node to the nodes ending at the last
        # position; its ``prev`` is the end of the minimum-cost path.
        cur = self.set_mincost_node(ViterbiNode.makeBOSEOS(),
                                    nodes[length]).prev

        # reverse the prev-linked chain so it runs from sentence start to end
        head = None
        while cur.prev:
            tmp = cur.prev
            cur.prev = head
            head = cur
            cur = tmp
        return head
class MakeLattice:
    """Callback object that receives candidate ViterbiNodes found at one
    text position and links them into the lattice built by the tagger."""

    __slots__ = ['nodes', 'i', 'prevs', 'empty', 'set_mincost_node']

    def __init__(self, nodes, set_mincost_node):
        self.nodes = nodes                        # nodes[i] = nodes ending at i
        self.set_mincost_node = set_mincost_node  # picks cheapest predecessor
        self.prevs = None
        self.empty = True
        self.i = 0

    def set(self, i):
        """Move to position ``i``: the nodes that end here become the
        predecessor set for every candidate reported next."""
        self.i = i
        self.prevs = self.nodes[i]
        self.nodes[i] = None
        self.empty = True

    def __call__(self, vn):
        """Register candidate morpheme ``vn`` starting at the current position."""
        self.empty = False
        end_pos = self.i + vn.length
        bucket = self.nodes[end_pos]
        if bucket is None:
            bucket = self.nodes[end_pos] = []
        if vn.isspace:
            # whitespace is transparent: carry the predecessors straight through
            bucket.extend(self.prevs)
        else:
            bucket.append(self.set_mincost_node(vn, self.prevs))

    def isempty(self):
        """True if no candidate has been reported since the last set()."""
        return self.empty
def base_id(nid):
    """Encode an id for storage in a BASE node, or decode a stored value.

    The mapping is its own inverse: ``base_id(base_id(x)) == x``.
    Stored values are negative; decoded ids are non-negative.
    """
    return -(nid + 1)
class KeyStream:
    """
    Treats a sequence of UTF-16 code units as a character stream.

    read() yields code units one at a time and returns chck_TERMINATE_CODE
    once the end of the key is reached.
    XXX: the class name is a misnomer (kept for compatibility).
    """
    __slots__ = ['s', 'cur', 'len']

    def __init__(self, key, start=0):
        self.s = key
        self.cur = start
        self.len = len(key)

    def startswith(self, prefix):
        """True if the unread remainder of the key begins with ``prefix``."""
        plen = len(prefix)
        if plen > self.len - self.cur:
            return False
        return self.s[self.cur:self.cur + plen] == prefix

    def rest(self):
        """The not-yet-consumed tail of the key."""
        return self.s[self.cur:]

    def read(self):
        """Next code unit, or chck_TERMINATE_CODE at end of stream."""
        if self.eos():
            return chck_TERMINATE_CODE
        pos = self.cur
        self.cur = pos + 1
        return self.s[pos]

    def eos(self):
        """Whether the stream is exhausted."""
        return self.cur == self.len
122 | """ 123 | begs = self.begs 124 | tail = self.tail 125 | lens = self.lens 126 | base = self.base 127 | chck = self.chck 128 | node = base[0] 129 | 130 | def exists(kin, node): 131 | node_id = base_id(node) 132 | beg = begs[node_id] 133 | s = tail[beg:beg + lens[node_id]] 134 | return kin.rest().equals(s) 135 | 136 | kin = KeyStream(key) 137 | code = kin.read() 138 | while 1: 139 | idx = node + code 140 | node = base[idx] 141 | if chck[idx] == code: 142 | if node >= 0: 143 | continue 144 | elif kin.eos() or exists(kin, node): 145 | return base_id(node) 146 | return -1 147 | 148 | # with, iterator 149 | def commonprefix_search(self, key, start, fn): 150 | """ 151 | common-prefix検索を行う 152 | 条件に一致するキーが見つかる度に、fn.call(...)メソッドが呼び出される 153 | 154 | @param key 検索対象のキー文字列 155 | @param start 検索対象となるキー文字列の最初の添字 156 | @param fn 一致を検出した場合に呼び出されるメソッドを定義したコールバック関数 157 | """ 158 | base = self.base 159 | chck = self.chck 160 | begs = self.begs 161 | tail = self.tail 162 | lens = self.lens 163 | node = base[0] 164 | offset = -1 165 | kin = KeyStream(key, start) 166 | 167 | def call_if_key_including(kin, node, start, offset, fn): 168 | node_id = base_id(node) 169 | l = lens[node_id] 170 | beg = begs[node_id] 171 | prefix = tail[beg:beg+l] 172 | if kin.startswith(prefix): 173 | fn(start, offset + l + 1, node_id) 174 | 175 | while 1: 176 | code = kin.read() 177 | offset += 1 178 | terminal_idx = node + chck_TERMINATE_CODE 179 | if chck[terminal_idx] == chck_TERMINATE_CODE: 180 | fn(start, offset, base_id(base[terminal_idx])) 181 | if code == chck_TERMINATE_CODE: 182 | return 183 | idx = node + code 184 | node = base[idx] 185 | if chck[idx] == code: 186 | if node >= 0: 187 | continue 188 | else: 189 | call_if_key_including(kin, node, start, offset, fn) 190 | return 191 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | 
[devpi:upload] 5 | formats = sdist.tgz,bdist_wheel 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import io 5 | from setuptools import setup 6 | 7 | setup( 8 | name='igo-python', 9 | version='1.0.0', 10 | description='Python port of Igo Japanese morphological analyzer', 11 | long_description=io.open('README', encoding='utf-8').read() + "\n\n" + 12 | io.open('CHANGES', encoding='utf-8').read(), 13 | author='Hideaki Takahashi', 14 | author_email='mymelo@gmail.com', 15 | url='https://github.com/hideaki-t/igo-python/', 16 | classifiers=[ 17 | 'Development Status :: 5 - Production/Stable', 18 | 'Intended Audience :: Developers', 19 | 'License :: OSI Approved :: MIT License', 20 | 'Natural Language :: Japanese', 21 | 'Operating System :: OS Independent', 22 | 'Operating System :: Microsoft :: Windows', 23 | 'Operating System :: POSIX :: Linux', 24 | 'Programming Language :: Python :: 2.7', 25 | 'Programming Language :: Python :: 3.4', 26 | 'Programming Language :: Python :: 3.5', 27 | 'Programming Language :: Python :: 3.6', 28 | 'Programming Language :: Python :: 3.7', 29 | 'Programming Language :: Python :: Implementation :: CPython', 30 | 'Programming Language :: Python :: Implementation :: IronPython', 31 | 'Programming Language :: Python :: Implementation :: PyPy', 32 | 'Topic :: Scientific/Engineering :: Information Analysis', 33 | 'Topic :: Software Development :: Libraries :: Python Modules', 34 | 'Topic :: Text Processing :: Linguistic', 35 | ], 36 | keywords=[ 37 | 'japanese', 38 | 'morphological analyzer', 39 | ], 40 | license='MIT', 41 | packages=['igo'], 42 | package_data={'igo': ['ipadic/*']}, 43 | entry_points={'console_scripts': ['igo = igo.parse:main']}) 44 | -------------------------------------------------------------------------------- /test.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | import igo.tagger 5 | 6 | if sys.version_info[0] < 3: 7 | u = lambda s: s.decode('utf-8') 8 | import codecs 9 | sys.stdout = codecs.lookup('utf-8').streamwriter(sys.stdout) 10 | else: 11 | u = str 12 | 13 | 14 | def pp(sf, ft, st): 15 | sys.stdout.write(u("%s: %s at %d\n") % (sf, ft, st)) 16 | 17 | 18 | t = igo.tagger.Tagger() 19 | #t = igo.tagger.Tagger('ipadic_gae', gae=True) 20 | for m in t.parse(u('私の名前は中野です。')): 21 | pp(m.surface, m.feature, m.start) 22 | print('\n') 23 | 24 | # t = igo.tagger.Tagger('ipadic') 25 | for m in t.parse(u('こんにちは世界')): 26 | pp(m.surface, m.feature, m.start) 27 | print('\n') 28 | 29 | # test if the dictionary exists 30 | try: 31 | os.symlink(os.path.join(os.getcwd(), 'ipadic'), 'igo/dic') 32 | if os.path.exists('igo/dic'): 33 | t = igo.tagger.Tagger() 34 | for m in t.parse(u('こんにちは世界')): 35 | pp(m.surface, m.feature, m.start) 36 | print('\n') 37 | os.remove('igo/dic') 38 | except: 39 | pass 40 | 41 | # contains a surrogate pair char 42 | for m in t.parse(u('おはようー😳こんにちはー美味しいご飯だよ')): 43 | pp(m.surface, m.feature, m.start) 44 | print('\n') 45 | 46 | # only surrogate pair char 47 | for m in t.parse(u('😳')): 48 | pp(m.surface, m.feature, m.start) 49 | print('\n') 50 | 51 | # multiple surrogate pair chars 52 | for m in t.parse(u('😳😳')): 53 | pp(m.surface, m.feature, m.start) 54 | print('\n') 55 | 56 | # starts with a surrogate pair char 57 | for m in t.parse(u('😳おはよう')): 58 | pp(m.surface, m.feature, m.start) 59 | print('\n') 60 | 61 | # end with a surrogate pair char 62 | for m in t.parse(u('おはよう😳')): 63 | pp(m.surface, m.feature, m.start) 64 | print('\n') 65 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py33, py34, py35, py36, py37, pypy, pypy3 3 | 4 
| [testenv] 5 | deps = 6 | pytest 7 | 8 | commands = 9 | py.test 10 | 11 | --------------------------------------------------------------------------------