├── .travis.yml ├── CHANGES ├── COPYING ├── COPYING.ipadic ├── MANIFEST.in ├── README ├── aozora_html2txt.py ├── appveyor.yml ├── igo ├── __init__.py ├── dictionary.py ├── dictreader.py ├── ipadic │ ├── char.category │ ├── code2category │ ├── matrix.bin │ ├── word.ary.idx │ ├── word.dat │ ├── word.inf │ └── word2id ├── parse.py ├── tagger.py ├── tests │ └── test_parse.py └── trie.py ├── setup.cfg ├── setup.py ├── test.py └── tox.ini /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | matrix: 4 | include: 5 | - python: "2.7" 6 | - python: "3.4" 7 | - python: "3.5" 8 | - python: "3.6" 9 | - python: "3.7" 10 | dist: xenial 11 | sudo: required 12 | - python: "pypy" 13 | - python: "pypy3" 14 | install: 15 | - pip install tox-travis 16 | script: tox 17 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | Changelog for Igo-Python 2 | ======================== 3 | 2018-09-22 -- 1.0.0 4 | * dropped Python 3.3 from supported Python version. 5 | 6 | 2017-02-09 -- 0.9.9 7 | * fixed few small issues and add unit test. 8 | 9 | 2016-07-08 -- 0.9.8 10 | * fixed a surrogate pair character handling issue 11 | 12 | 2015-12-16 -- 0.9.7.1 13 | * Tagger can be pickled on Python2 env 14 | 15 | 2015-12-14 -- 0.9.7 16 | * include pre-built IPA dictionary 17 | 18 | 2015-07-03 -- 0.9.6 19 | * fixed an issue of dictionary size in mmap mode 20 | 21 | 2015-04-14 -- 0.9.5 22 | * use mmap to load files of a dictionary if it is possible 23 | 24 | 2013-07-05 -- 0.9.3 25 | * support characters in Supplementary Planes 26 | 27 | 2012-02-27 -- 0.9.2 28 | * apply changes in Igo(java) 0.4.4 29 | 30 | 2012-02-27 -- 0.9.1 31 | * synchronize with Igo(java) 0.4.5 32 | 33 | 2011-09-01 -- 0.9 34 | * compatibility with python3.2 35 | * update the code base to igo-0.4.3 36 | * add some enhances and clean up. 
37 | 38 | 2011-08-29 -- 0.4 39 | * add some enhances and clean up. 40 | 41 | 2010-11-28 -- 0.3a 42 | * fix bugs(if a phrase ends with 'た' will causes error) 43 | 44 | 2010-11-27 -- 0.3 45 | * drop mmap related code 46 | * reduce memory footprint 47 | 48 | 2010-11-27 -- 0.2 49 | * support Google App Engine(maybe) 50 | 51 | 2010-11-27 -- 0.1a 52 | * update package info only. 53 | 54 | 2010-11-25 -- 0.1 55 | * first release. 56 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010 Hideaki Takahashi 4 | (This software is based on Igo Java Version (c) Takeru Ohta 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /COPYING.ipadic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hideaki-t/igo-python/2144b91b4595d31d833bebaf9d99c223268316f4/COPYING.ipadic -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include COPYING CHANGES 2 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ================ 2 | Igo for Python 3 | ================ 4 | 5 | About 6 | ===== 7 | 8 | Igo_ is a Japanese morphological analyzer written in Java and Common Lisp. 9 | This software is Python port of Igo(Java version). 10 | 11 | .. _Igo: http://igo.osdn.jp/ 12 | 13 | Notice 14 | ====== 15 | 16 | Dictionary builder is not provided. You need to use Igo Java version to build the dictionary for Igo. 17 | From igo-python 0.9.7, pre-built `IPA dictionary (2.7.0-20070801)`__ is included for ease of use. 18 | 19 | __ https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM 20 | 21 | How To Use 22 | ========== 23 | 24 | You can use Igo Python easily:: 25 | 26 | >>> from igo.Tagger import Tagger 27 | >>> t = Tagger() # use bundled dictionary 28 | >>> for m in t.parse(u'すもももももももものうち'): 29 | ... print m.surface, m.feature 30 | ... 
# Convert an Aozora Bunko HTML file to plain text on stdout.
import lxml.html
import sys


# Force UTF-8 output regardless of the locale so the extracted Japanese
# text survives redirection/piping (Python 2 and 3 need different wrapping).
if sys.version_info[0] < 3:
    import codecs
    sys.stdout = codecs.lookup('utf-8').streamwriter(sys.stdout)
else:
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


# Parse the path given as the first CLI argument, or read HTML from stdin.
target = sys.argv[1] if len(sys.argv) > 1 else sys.stdin
r = lxml.html.parse(target).getroot()
print(r.text_content())
class ViterbiNode(object):
    """A single node in the lattice walked by the Viterbi algorithm."""
    __slots__ = ['cost', 'prev', 'word_id', 'start', 'length', 'left_id',
                 'right_id', 'isspace']

    def __init__(self, word_id, start, length, cost, left_id, right_id,
                 isspace):
        self.word_id = word_id    # word id in the dictionary
        self.start = start        # start position of the morpheme in the input
        self.length = length      # surface length in code units
        self.cost = cost          # total cost accumulated from the start node
        self.left_id = left_id    # left context id
        self.right_id = right_id  # right context id
        self.isspace = isspace    # whether the character category is SPACE
        self.prev = None          # link to the minimum-cost predecessor node

    @staticmethod
    def makeBOSEOS():
        """Return a fresh zeroed sentinel node used for both BOS and EOS."""
        return ViterbiNode(0, 0, 0, 0, 0, 0, False)

    def __repr__(self):
        attrs = {name: getattr(self, name) for name in ViterbiNode.__slots__}
        return str(attrs)
class Category:
    """One character-category record decoded from char.category."""

    __slots__ = ['id', 'length', 'invoke', 'group']

    def __init__(self, i, l, iv, g):
        # Flag fields are stored as 0/1 integers in the dictionary file.
        self.group = g == 1    # greedily group runs of same-category chars
        self.invoke = iv == 1  # always invoke unknown-word processing
        self.length = l        # max candidate length for unknown words
        self.id = i            # numeric category id
    def search(self, text, start, wdic, callback):
        """Generate unknown-word (out-of-dictionary) morpheme candidates
        beginning at ``text[start]`` and report each one to ``callback``
        through ``wdic.search_from_trie``.

        ``text`` is a sequence of UTF-16 code units; ``wdic`` is the WordDic
        holding the per-category pseudo entries.
        """
        category = self.category
        ch = text[start]
        ct = category.category(ch)
        length = len(text)

        # If the word dictionary already produced candidates at this position
        # and this category does not force invocation, propose nothing.
        if not callback.isempty() and not ct.invoke:
            return

        cid = ct.id
        isspace = cid == self.space_id
        # Propose prefixes of length 1..ct.length (capped at end of text),
        # stopping early when a category-incompatible character appears.
        limit = min(length, ct.length + start)
        for i in range(start + 1, limit):
            wdic.search_from_trie(cid, start, i - start, isspace, callback)
            if not category.is_compatible(ch, text[i]):
                return
        wdic.search_from_trie(cid, start, limit - start, isspace, callback)

        # Greedy grouping: for grouping categories, additionally propose the
        # longest run of compatible characters past the normal limit.
        if ct.group and limit < length:
            for i in range(limit, length):
                if not category.is_compatible(ch, text[i]):
                    wdic.search_from_trie(cid, start, i - start, isspace,
                                          callback)
                    return
            wdic.search_from_trie(cid, start, length - start, isspace,
                                  callback)
use_mmap) 183 | with self.wi_rd as r: 184 | wc = r.size() // (4 + 2 + 2 + 2) 185 | self.offsets = r.get_intarray(wc) 186 | """ dataOffsets[単語ID] = 単語の素性データの開始位置 """ 187 | self.left_ids = r.get_shortarray(wc) 188 | """ leftIds[単語ID] = 単語の左文脈ID """ 189 | self.right_ids = r.get_shortarray(wc) 190 | """ rightIds[単語ID] = 単語の右文脈ID """ 191 | self.costs = r.get_shortarray(wc) 192 | """ consts[単語ID] = 単語のコスト """ 193 | 194 | def release(self): 195 | del self.data 196 | del self.indices 197 | del self.offsets 198 | del self.left_ids 199 | del self.right_ids 200 | del self.costs 201 | self.trie.release() 202 | del self.trie 203 | if not self.splitted: 204 | self.wd_rd.release() 205 | del self.wd_rd 206 | self.wa_rd.release() 207 | del self.wa_rd 208 | self.wi_rd.release() 209 | del self.wi_rd 210 | 211 | def search(self, text, start, callback): 212 | costs = self.costs 213 | left_ids = self.left_ids 214 | right_ids = self.right_ids 215 | indices = self.indices 216 | 217 | def fn(start, offset, trieId): 218 | end = indices[trieId + 1] 219 | for i in range(indices[trieId], end): 220 | callback(ViterbiNode(i, start, offset, costs[i], left_ids[i], 221 | right_ids[i], False)) 222 | 223 | self.trie.commonprefix_search(text, start, fn) 224 | 225 | def search_from_trie(self, trie_id, start, length, isspace, callback): 226 | costs = self.costs 227 | left_ids = self.left_ids 228 | right_ids = self.right_ids 229 | end = self.indices[trie_id + 1] 230 | for i in range(self.indices[trie_id], end): 231 | callback(ViterbiNode(i, start, length, costs[i], left_ids[i], 232 | right_ids[i], isspace)) 233 | 234 | def word_data(self, word_id): 235 | return tobytes(self.data[self.offsets[word_id]:self.offsets[word_id + 236 | 1]]) 237 | -------------------------------------------------------------------------------- /igo/dictreader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | import array 4 | 
# Native byte order and the matching UTF-16 codec for this platform.
# BUG FIX: the big-endian branch previously also looked up 'UTF-16-LE',
# which would mis-decode every dictionary string on big-endian hosts.
if sys.byteorder == 'little':
    LE, UTF16Codec = True, codecs.lookup('UTF-16-LE')
else:
    LE, UTF16Codec = False, codecs.lookup('UTF-16-BE')

try:
    import mmap
    # struct sizes for the element types used by MMapedReader ('i'/'h'/'H')
    sizemap = {t: struct.calcsize(t) for t in 'ihH'}
    # mmap mode needs memoryview.cast and native little-endian data
    allow_mmap = hasattr(memoryview, 'cast') and LE
except ImportError:
    # platform without mmap support: always fall back to StandardReader
    allow_mmap = False

if hasattr(os, 'fstat'):

    def size(f):
        """Return the size in bytes of an open file via its descriptor."""
        return os.fstat(f.fileno()).st_size
else:

    def size(f):
        """Fallback for platforms lacking fstat: stat the file by name."""
        return os.stat(f.name).st_size


def nop(a):
    """No-op used in place of byteswap when data is already native order."""
    pass


def swap(a):
    """Byte-swap an array in place (endianness conversion)."""
    a.byteswap()
class MMapedReader:
    """
    Dictionary reader backed by mmap.

    Only reads dictionaries whose data sizes and byte order match the
    running platform; no endianness conversion is performed (see _get).
    """
    __slots__ = ['fd', 'mmap', 'view', 'pos']

    def __init__(self, path, bigendian=False):
        # NOTE(review): ``bigendian`` is accepted for interface parity with
        # StandardReader but appears unused here; DictReader only selects
        # this class when allow_mmap (native little-endian) holds — confirm.
        self.fd = os.open(path, os.O_RDONLY)
        self.mmap = mmap.mmap(self.fd, length=0, access=mmap.ACCESS_READ)
        self.view = memoryview(self.mmap)
        self.pos = 0  # current read offset in bytes

    def __enter__(self):
        return self

    def __exit__(self, et, ev, t):
        self.close()

    def _get(self, fmt, cnt):
        # need to support endian conversion?
        # also size of types must be native
        # Zero-copy: returns a typed memoryview over the next ``cnt`` items
        # of struct format ``fmt`` and advances the read position.
        t = self.pos + sizemap[fmt] * cnt
        view = memoryview(self.view[self.pos:t]).cast(fmt)
        self.pos = t
        return view

    def get_int(self):
        """Read one native int32 and advance."""
        v = self._get('i', 1)[0]
        return v

    def get_intarray(self, count=None):
        """Read ``count`` int32 values (whole remaining file when None)."""
        c = count if count is not None else (self.size() // 4)
        return self._get('i', c)

    def get_shortarray(self, count):
        """Read ``count`` int16 values."""
        return self._get('h', count)

    def get_chararray(self, count=None):
        """Read ``count`` uint16 (UTF-16 code unit) values."""
        c = count if count is not None else (self.size() // 2)
        return self._get('H', c)

    def size(self):
        """Total size of the mapped file in bytes."""
        return len(self.mmap)

    def close(self):
        # nothing to close, everything is mapped
        pass

    def release(self):
        """Release the memoryview and mapping, then close the descriptor."""
        self.view.release()
        self.mmap.close()
        os.close(self.fd)
def get_chararray_multi(filepaths, bigendian=False):
    """Concatenate several files of UTF-16 code units into a single array.

    Used only for the splitted-dictionary layout; no mmap variant exists.
    """
    result = array.array('H')
    for filepath in filepaths:
        with open(filepath, 'rb') as fp:
            result.fromfile(fp, size(fp) // 2)
    # Convert to native order when the files were written big-endian and
    # this machine is little-endian.
    if bigendian and LE:
        result.byteswap()
    return result
def main():
    """Command-line driver: read text line-by-line from stdin and print one
    morpheme per line as "surface<TAB>feature", followed by "EOS" after
    each input line.

    The dictionary directory comes from the IGO_DICT environment variable;
    when unset, Tagger falls back to the bundled IPA dictionary.
    """
    if sys.platform == 'cli':
        # IronPython: its standard streams already deal in unicode text
        i = sys.stdin
        o = sys.stdout
    elif sys.version_info[0] < 3:
        # Python 2: rewrap stdio so we read/write unicode in the locale encoding
        enc = locale.getpreferredencoding()
        i = io.open(sys.stdin.fileno(), encoding=enc, closefd=False)
        o = io.open(sys.stdout.fileno(), mode='w', encoding=enc, closefd=False)
    else:
        # just turn on universal newline mode to align python2
        i = io.TextIOWrapper(sys.stdin.buffer)
        o = sys.stdout

    with Tagger(os.getenv('IGO_DICT')) as tagger:
        for l in i:
            for m in tagger.parse(l):
                print(m.fmt('{surface}\t{feature}'), file=o)
            print('EOS', file=o)
class Morpheme:
    """A single morpheme produced by the analyzer."""

    __slots__ = ['surface', 'feature', 'start']

    def __init__(self, surface, feature, start):
        self.surface = surface  # surface form as it appeared in the text
        self.feature = feature  # comma-separated feature string (POS etc.)
        self.start = start      # start offset within the analyzed text

    def fmt(self, fmt="surface: {surface}, feature: {feature}, start={start}"):
        """Render this morpheme through a str.format template that may use
        the placeholders {surface}, {feature} and {start}."""
        return fmt.format(surface=self.surface,
                          feature=self.feature,
                          start=self.start)

    def __str__(self):
        return self.fmt()
    def __parse(self, text):
        """Run the Viterbi search over ``text`` (an array of UTF-16 code
        units) and return the head of the minimum-cost morpheme chain,
        linked start-to-end via ``prev``.
        """
        length = len(text)
        # nodes[i] holds the list of lattice nodes ending at position i;
        # None means no morpheme ends there (position unreachable so far).
        nodes = [None] * (length + 1)
        nodes[0] = Tagger.__BOS_NODES

        wdc = self.wdc
        unk = self.unk
        fn = MakeLattice(nodes, self.set_mincost_node)
        for i in range(0, length):
            if nodes[i] is not None:
                fn.set(i)
                wdc.search(text, i, fn)  # look up morphemes in the word dictionary
                unk.search(text, i, wdc, fn)  # look up unknown-word candidates

        # Attach a synthetic EOS node to the nodes ending at the last
        # position; its ``prev`` is the end of the minimum-cost path.
        cur = self.set_mincost_node(ViterbiNode.makeBOSEOS(),
                                    nodes[length]).prev

        # reverse the prev-linked chain so it runs from sentence start to end
        head = None
        while cur.prev:
            tmp = cur.prev
            cur.prev = head
            head = cur
            cur = tmp
        return head
class MakeLattice:
    """Callback object that receives candidate ViterbiNodes found at one
    text position and links them into the lattice built by the tagger."""

    __slots__ = ['nodes', 'i', 'prevs', 'empty', 'set_mincost_node']

    def __init__(self, nodes, set_mincost_node):
        self.nodes = nodes                        # nodes[i] = nodes ending at i
        self.set_mincost_node = set_mincost_node  # picks cheapest predecessor
        self.prevs = None
        self.empty = True
        self.i = 0

    def set(self, i):
        """Move to position ``i``: the nodes that end here become the
        predecessor set for every candidate reported next."""
        self.i = i
        self.prevs = self.nodes[i]
        self.nodes[i] = None
        self.empty = True

    def __call__(self, vn):
        """Register candidate morpheme ``vn`` starting at the current position."""
        self.empty = False
        end_pos = self.i + vn.length
        bucket = self.nodes[end_pos]
        if bucket is None:
            bucket = self.nodes[end_pos] = []
        if vn.isspace:
            # whitespace is transparent: carry the predecessors straight through
            bucket.extend(self.prevs)
        else:
            bucket.append(self.set_mincost_node(vn, self.prevs))

    def isempty(self):
        """True if no candidate has been reported since the last set()."""
        return self.empty
def base_id(nid):
    """Encode an id for storage in a BASE node, or decode a stored value.

    The mapping is its own inverse: ``base_id(base_id(x)) == x``.
    Stored values are negative; decoded ids are non-negative.
    """
    return -(nid + 1)
class KeyStream:
    """
    Treats a sequence of UTF-16 code units as a character stream.

    read() yields code units one at a time and returns chck_TERMINATE_CODE
    once the end of the key is reached.
    XXX: the class name is a misnomer (kept for compatibility).
    """
    __slots__ = ['s', 'cur', 'len']

    def __init__(self, key, start=0):
        self.s = key
        self.cur = start
        self.len = len(key)

    def startswith(self, prefix):
        """True if the unread remainder of the key begins with ``prefix``."""
        plen = len(prefix)
        if plen > self.len - self.cur:
            return False
        return self.s[self.cur:self.cur + plen] == prefix

    def rest(self):
        """The not-yet-consumed tail of the key."""
        return self.s[self.cur:]

    def read(self):
        """Next code unit, or chck_TERMINATE_CODE at end of stream."""
        if self.eos():
            return chck_TERMINATE_CODE
        pos = self.cur
        self.cur = pos + 1
        return self.s[pos]

    def eos(self):
        """Whether the stream is exhausted."""
        return self.cur == self.len
122 | """ 123 | begs = self.begs 124 | tail = self.tail 125 | lens = self.lens 126 | base = self.base 127 | chck = self.chck 128 | node = base[0] 129 | 130 | def exists(kin, node): 131 | node_id = base_id(node) 132 | beg = begs[node_id] 133 | s = tail[beg:beg + lens[node_id]] 134 | return kin.rest().equals(s) 135 | 136 | kin = KeyStream(key) 137 | code = kin.read() 138 | while 1: 139 | idx = node + code 140 | node = base[idx] 141 | if chck[idx] == code: 142 | if node >= 0: 143 | continue 144 | elif kin.eos() or exists(kin, node): 145 | return base_id(node) 146 | return -1 147 | 148 | # with, iterator 149 | def commonprefix_search(self, key, start, fn): 150 | """ 151 | common-prefix検索を行う 152 | 条件に一致するキーが見つかる度に、fn.call(...)メソッドが呼び出される 153 | 154 | @param key 検索対象のキー文字列 155 | @param start 検索対象となるキー文字列の最初の添字 156 | @param fn 一致を検出した場合に呼び出されるメソッドを定義したコールバック関数 157 | """ 158 | base = self.base 159 | chck = self.chck 160 | begs = self.begs 161 | tail = self.tail 162 | lens = self.lens 163 | node = base[0] 164 | offset = -1 165 | kin = KeyStream(key, start) 166 | 167 | def call_if_key_including(kin, node, start, offset, fn): 168 | node_id = base_id(node) 169 | l = lens[node_id] 170 | beg = begs[node_id] 171 | prefix = tail[beg:beg+l] 172 | if kin.startswith(prefix): 173 | fn(start, offset + l + 1, node_id) 174 | 175 | while 1: 176 | code = kin.read() 177 | offset += 1 178 | terminal_idx = node + chck_TERMINATE_CODE 179 | if chck[terminal_idx] == chck_TERMINATE_CODE: 180 | fn(start, offset, base_id(base[terminal_idx])) 181 | if code == chck_TERMINATE_CODE: 182 | return 183 | idx = node + code 184 | node = base[idx] 185 | if chck[idx] == code: 186 | if node >= 0: 187 | continue 188 | else: 189 | call_if_key_including(kin, node, start, offset, fn) 190 | return 191 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | 
[devpi:upload] 5 | formats = sdist.tgz,bdist_wheel 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import io 5 | from setuptools import setup 6 | 7 | setup( 8 | name='igo-python', 9 | version='1.0.0', 10 | description='Python port of Igo Japanese morphological analyzer', 11 | long_description=io.open('README', encoding='utf-8').read() + "\n\n" + 12 | io.open('CHANGES', encoding='utf-8').read(), 13 | author='Hideaki Takahashi', 14 | author_email='mymelo@gmail.com', 15 | url='https://github.com/hideaki-t/igo-python/', 16 | classifiers=[ 17 | 'Development Status :: 5 - Production/Stable', 18 | 'Intended Audience :: Developers', 19 | 'License :: OSI Approved :: MIT License', 20 | 'Natural Language :: Japanese', 21 | 'Operating System :: OS Independent', 22 | 'Operating System :: Microsoft :: Windows', 23 | 'Operating System :: POSIX :: Linux', 24 | 'Programming Language :: Python :: 2.7', 25 | 'Programming Language :: Python :: 3.4', 26 | 'Programming Language :: Python :: 3.5', 27 | 'Programming Language :: Python :: 3.6', 28 | 'Programming Language :: Python :: 3.7', 29 | 'Programming Language :: Python :: Implementation :: CPython', 30 | 'Programming Language :: Python :: Implementation :: IronPython', 31 | 'Programming Language :: Python :: Implementation :: PyPy', 32 | 'Topic :: Scientific/Engineering :: Information Analysis', 33 | 'Topic :: Software Development :: Libraries :: Python Modules', 34 | 'Topic :: Text Processing :: Linguistic', 35 | ], 36 | keywords=[ 37 | 'japanese', 38 | 'morphological analyzer', 39 | ], 40 | license='MIT', 41 | packages=['igo'], 42 | package_data={'igo': ['ipadic/*']}, 43 | entry_points={'console_scripts': ['igo = igo.parse:main']}) 44 | -------------------------------------------------------------------------------- /test.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | import igo.tagger 5 | 6 | if sys.version_info[0] < 3: 7 | u = lambda s: s.decode('utf-8') 8 | import codecs 9 | sys.stdout = codecs.lookup('utf-8').streamwriter(sys.stdout) 10 | else: 11 | u = str 12 | 13 | 14 | def pp(sf, ft, st): 15 | sys.stdout.write(u("%s: %s at %d\n") % (sf, ft, st)) 16 | 17 | 18 | t = igo.tagger.Tagger() 19 | #t = igo.tagger.Tagger('ipadic_gae', gae=True) 20 | for m in t.parse(u('私の名前は中野です。')): 21 | pp(m.surface, m.feature, m.start) 22 | print('\n') 23 | 24 | # t = igo.tagger.Tagger('ipadic') 25 | for m in t.parse(u('こんにちは世界')): 26 | pp(m.surface, m.feature, m.start) 27 | print('\n') 28 | 29 | # test if the dictionary exists 30 | try: 31 | os.symlink(os.path.join(os.getcwd(), 'ipadic'), 'igo/dic') 32 | if os.path.exists('igo/dic'): 33 | t = igo.tagger.Tagger() 34 | for m in t.parse(u('こんにちは世界')): 35 | pp(m.surface, m.feature, m.start) 36 | print('\n') 37 | os.remove('igo/dic') 38 | except: 39 | pass 40 | 41 | # contains a surrogate pair char 42 | for m in t.parse(u('おはようー😳こんにちはー美味しいご飯だよ')): 43 | pp(m.surface, m.feature, m.start) 44 | print('\n') 45 | 46 | # only surrogate pair char 47 | for m in t.parse(u('😳')): 48 | pp(m.surface, m.feature, m.start) 49 | print('\n') 50 | 51 | # multiple surrogate pair chars 52 | for m in t.parse(u('😳😳')): 53 | pp(m.surface, m.feature, m.start) 54 | print('\n') 55 | 56 | # starts with a surrogate pair char 57 | for m in t.parse(u('😳おはよう')): 58 | pp(m.surface, m.feature, m.start) 59 | print('\n') 60 | 61 | # end with a surrogate pair char 62 | for m in t.parse(u('おはよう😳')): 63 | pp(m.surface, m.feature, m.start) 64 | print('\n') 65 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py33, py34, py35, py36, py37, pypy, pypy3 3 | 4 
| [testenv] 5 | deps = 6 | pytest 7 | 8 | commands = 9 | py.test 10 | 11 | --------------------------------------------------------------------------------