├── .gitignore ├── LICENSE ├── README.md ├── setup.py └── wordseg ├── __init__.py ├── freqitem.py ├── hashtree.py ├── probability.py ├── sequence.py └── wordseg.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Kaiqiang Dawn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChineseWordSegmentation 2 | Chinese word segmentation algorithm without corpus 3 | 4 | ## Usage 5 | ``` 6 | from wordseg import WordSegment 7 | doc = u'十四是十四四十是四十,十四不是四十,四十不是十四' 8 | ws = WordSegment(doc, max_word_len=2, min_aggregation=1, min_entropy=0.5) 9 | ws.segSentence(doc) 10 | ``` 11 | 12 | This will generate the words 13 | 14 | `十四 是 十四 四十 是 四十 , 十四 不是 四十 , 四十 不是 十四` 15 | 16 | In practice, `doc` should be a long document string so that reliable statistics can be collected. In that case, `min_aggregation` should be set much greater than 1 (e.g. 50) and `min_entropy` should also be set greater than 0.5 (e.g. 1.5). 17 | 18 | Note that both the input and the output of this function are unicode strings. 
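For a realistic corpus the call looks like this minimal sketch (illustrative only; `load_corpus()` is a placeholder for however you obtain a long unicode document, and the thresholds are the suggested values above):

```
from wordseg import WordSegment

doc = load_corpus()  # placeholder: a long unicode document string
ws = WordSegment(doc, max_word_len=5, min_aggregation=50, min_entropy=1.5)
print(' '.join(ws.segSentence(u'十四是十四四十是四十')))
```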
19 | 20 | `WordSegment.segSentence` has an optional argument `method`, whose value can be `WordSegment.L`, `WordSegment.S` or `WordSegment.ALL`: 21 | 22 | + `WordSegment.L`: if a long word is found that is a combination of several shorter words, return only the long word. 23 | + `WordSegment.S`: return only the shorter words. 24 | + `WordSegment.ALL`: return both the long word and the shorter ones. 25 | 26 | ## Reference 27 | 28 | Thanks to Matrix67's [article](http://www.matrix67.com/blog/archives/5044) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | import pkg_resources 7 | from setuptools import setup, find_packages 8 | import os 9 | import codecs 10 | import re 11 | import sys 12 | 13 | def read(*parts): 14 | path = os.path.join(os.path.dirname(__file__), *parts) 15 | with codecs.open(path, encoding='utf-8') as fobj: 16 | return fobj.read() 17 | 18 | def find_version(*file_paths): 19 | version_file = read(*file_paths) 20 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", 21 | version_file, re.M) 22 | if version_match: 23 | return version_match.group(1) 24 | raise RuntimeError("Unable to find version string.") 25 | 26 | setup( 27 | name='wordseg', 28 | version=find_version("wordseg", "__init__.py"), 29 | description='Chinese word segmentation algorithm without corpus', 30 | author='段凯强', 31 | author_email='', 32 | license='MIT', 33 | keywords='NLP,tokenizing,Chinese word segmentation', 34 | url='https://github.com/bung87/ChineseWordSegmentation', 35 | packages = find_packages(), 36 | package_dir={'wordseg': 'wordseg'}, 37 | classifiers=[ 38 | 'Intended Audience :: Developers', 39 | 'License :: OSI Approved :: MIT License', 40 | 'Operating System :: OS Independent', 41 | 'Natural Language :: Chinese (Simplified)', 42 | 'Natural Language :: Chinese (Traditional)', 43 | 'Programming Language :: Python', 44 | 'Programming Language :: Python :: 2', 45 | 'Programming Language :: Python :: 2.6', 46 | 'Programming Language :: Python :: 2.7', 47 | 'Programming Language :: Python :: 3', 48 | 'Programming Language :: Python :: 3.2', 49 | 'Programming Language :: Python :: 3.3', 50 | 'Programming Language :: Python :: 3.4', 51 | 'Topic :: Text Processing', 52 | 'Topic :: Text Processing :: Indexing', 53 | 'Topic :: Text Processing :: Linguistic', 54 | ] 55 | ) 56 | -------------------------------------------------------------------------------- /wordseg/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | 3 | from . import probability 4 | from . import sequence 5 | from . 
wordseg import WordSegment -------------------------------------------------------------------------------- /wordseg/freqitem.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | A simple frequent itemset mining algorithm implementation 5 | Author: 段凯强 6 | """ 7 | 8 | import itertools 9 | 10 | from wordseg.sequence import dedup 11 | from wordseg.hashtree import HashTree, sameNodes 12 | from functools import reduce 13 | 14 | class FreqItem(object): 15 | def __init__(self, transactions, sup_theta=.1): 16 | self.transactions = [sorted(t) for t in [x for x in transactions if x]] 17 | self.sup_theta = sup_theta*len(transactions) 18 | self.freqset = [] 19 | 20 | def filterCandidates(self, cand): 21 | """ 22 | Build a HashTree with the candidates cand, count their support over the transactions, and keep 23 | only those whose support is not lower than sup_theta 24 | """ 25 | hashtree = HashTree(cand) 26 | hashtree.count(self.transactions) 27 | return hashtree.getNodes(self.sup_theta) 28 | 29 | def freqOneSet(self): 30 | """ 31 | Generate frequent 1-item sets 32 | """ 33 | one_item_cand = set() 34 | for t in self.transactions: 35 | for w in t: 36 | one_item_cand.add(w) 37 | return sorted(self.filterCandidates([[i] for i in one_item_cand]), key=lambda i: i[0].name) 38 | 39 | def genNextCand(self, preItems): 40 | """ 41 | Generate the next candidates by dynamic programming 42 | Find ranges [i, j) such that the items in each range share the same prefix 43 | e.g., [1,2,3,4] and [1,2,3,5] have the same prefix, so they should be in the same range 44 | Then, generate 2-combinations within these ranges as the result 45 | """ 46 | res = [] 47 | i, j = 0, 0 48 | while i < len(preItems): 49 | if j < len(preItems) and sameNodes(preItems[j][:-1], preItems[i][:-1]): 50 | j += 1 51 | else: 52 | res += [pair[0] + [pair[1][-1]] for pair in itertools.combinations(preItems[i:j], 2)] 53 | i = j 54 | return [[i.name for i in items] for items in res] 55 | 56 | def genFreqItemSets(self): 57 | """ 58 | @return Frequent item sets with their frequency 59 | """ 60 | if self.freqset: return self.freqset 61 | cur = self.freqOneSet() 62 | freqKSet = [] 63 | while cur: 64 | freqKSet.append(cur) 65 | cur = self.filterCandidates(self.genNextCand(cur)) 66 | self.freqset = reduce(lambda res, x: res + x, freqKSet, []) 67 | name_freq_pairs = [[(i.name, i.val) for i in items] for items in self.freqset[::-1]] 68 | res = [list(zip(*items)) for items in name_freq_pairs] 69 | return [(list(pair[0]), pair[1][-1]) for pair in res] 70 | 71 | if __name__ == '__main__': 72 | transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7], [5,7,2]] 73 | freqItem = FreqItem(transactions, sup_theta=.3) 74 | print(freqItem.genFreqItemSets()) 75 | -------------------------------------------------------------------------------- /wordseg/hashtree.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | A simple implementation of Hash Tree 5 | Author: 段凯强 6 | """ 7 | from functools import reduce 8 | 9 | class HashTreeNode(object): 10 | def __init__(self, name=''): 11 | self.val = 0 12 | self.name = name 13 | self.level = 0 14 | self.children = {} 15 | 16 | def addBag(self, bag): 17 | """ 18 | Note that bag must be sorted 19 | """ 20 | if bag: 21 | node = self.children.get(bag[0], HashTreeNode(name=bag[0])) 22 | node.addBag(bag[1:]) 23 | self.children[bag[0]] = node 24 | self.level = len(bag) 25 | 26 | def count(self, transaction): 27 | """ 28 | count the child who 
matches the transaction, assuming that the current node already matches 29 | """ 30 | if self.level == 0: 31 | self.val += 1 32 | elif self.level == 1: 33 | for t in transaction: 34 | if t in self.children: self.children[t].val += 1 35 | else: 36 | for i in range(0, len(transaction)): 37 | t = transaction[i] 38 | if t in self.children: 39 | self.children[t].count(transaction[i:]) 40 | 41 | def get(self, theta): 42 | return [[c.name for c in items] for items in self.getNodes(theta)] 43 | """ 44 | if self.level == 0: 45 | return [[self.name]] if self.val >= theta else None 46 | else: 47 | children_res = [self.children[i].get(theta) for i in sorted(self.children.keys())] 48 | total = reduce(lambda res, x: res + x, filter(lambda x: x, children_res), []) 49 | return map(lambda c: [self.name] + c, total) 50 | """ 51 | 52 | def getNodes(self, theta): 53 | if self.level == 0: 54 | return [[self]] if self.val >= theta else None 55 | else: 56 | children_res = [self.children[i].getNodes(theta) for i in sorted(self.children.keys())] 57 | total = reduce(lambda res, x: res + x, [x for x in children_res if x], []) 58 | return [[self] + c for c in total] 59 | 60 | def __str__(self): 61 | return '(%s : %s)'%(self.name, '; '.join([str(i) for i in list(self.children.values())])) 62 | 63 | def sameNode(node1, node2): 64 | return node1.name == node2.name 65 | 66 | def sameNodes(nodes1, nodes2): 67 | func = lambda n: n.name 68 | return list(map(func, nodes1)) == list(map(func, nodes2)) 69 | 70 | 71 | 72 | class HashTree(object): 73 | """ 74 | Note that all bags must be sorted 75 | """ 76 | def __init__(self, bags): 77 | self.root = HashTreeNode() 78 | self.root.val = 0 79 | for b in bags: 80 | if b: self.root.addBag(b) 81 | 82 | def count(self, transactions): 83 | for t in transactions: self.root.count(t) 84 | 85 | def get(self, theta): 86 | res = [c[1:] for c in self.root.get(theta)] 87 | return [] if res == [[]] else res 88 | 89 | def getNodes(self, theta): 90 | res = [c[1:] for c in self.root.getNodes(theta)] 91 | return [] if res == [[]] else res 92 | 93 | def __str__(self): 94 | return str(self.root) 95 | 96 | if __name__ == '__main__': 97 | to_count = [[1,2], [2,4], [1,3], [1,5], [3,4], [2,7], [6,8]] 98 | tree = HashTree(to_count) 99 | transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7]] 100 | tree.count(transactions) 101 | print('Frequency with transactions', transactions) 102 | print(tree.get(2)) 103 | print(tree.get(1)) 104 | 105 | 106 | -------------------------------------------------------------------------------- /wordseg/probability.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | Algorithms about probability 5 | Author: 段凯强 6 | """ 7 | 8 | import math 9 | 10 | def entropyOfList(ls): 11 | """ 12 | Given a list of items, compute the entropy of the list 13 | The entropy is the sum of -p[i]*log(p[i]) over every unique element i in the list, where p[i] is its relative frequency 14 | """ 15 | elements = {} 16 | for e in ls: 17 | elements[e] = elements.get(e, 0) + 1 18 | length = float(len(ls)) 19 | # if length is 0, one side of the word is empty, which is fully determined, so the entropy should be 0 20 | return length and sum([-v/length*math.log(v/length) for v in list(elements.values())]) 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /wordseg/sequence.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | Algorithms for sequences 5 | Author: 段凯强 6 | """ 
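# Illustrative examples of the helpers below (dedup assumes its input is sorted):
#   dedup([1, 1, 2, 2, 3])                -> [1, 2, 3]
#   genSubstr('abc', 2)                   -> ['a', 'ab', 'b', 'bc', 'c']
#   genSubparts('abcd')                   -> [('a', 'bcd'), ('ab', 'cd'), ('abc', 'd')]
#   longestSubsequenceLength('abc', 'ac') -> 2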
7 | 8 | def dedup(ls): 9 | """ 10 | deduplicate the given SORTED list 11 | """ 12 | i, j = 0, 0 13 | while j < len(ls): 14 | if ls[j] == ls[i]: 15 | j += 1 16 | else: 17 | i += 1 18 | ls[i] = ls[j] 19 | return ls[:i + 1] 20 | 21 | def genSubstr(string, n): 22 | """ 23 | Generate all substrings of max length n for string 24 | """ 25 | length = len(string) 26 | res = [] 27 | for i in range(0, length): 28 | for j in range(i + 1, min(i + n + 1, length + 1)): 29 | res.append(string[i: j]) 30 | return res 31 | 32 | def genSubparts(string): 33 | """ 34 | Partition a string into all possible two parts, e.g. 35 | given "abcd", generate [("a", "bcd"), ("ab", "cd"), ("abc", "d")] 36 | For string of length 1, return empty list 37 | """ 38 | length = len(string) 39 | res = [] 40 | for i in range(1, length): 41 | res.append((string[0:i], string[i:])) 42 | return res 43 | 44 | def longestSubsequenceLength(s1, s2): 45 | n = len(s2) + 1 46 | cur = [0]*n 47 | next = [0]*n 48 | tmp = None 49 | for i in s1: 50 | for j in range(0, n): 51 | if j == 0: next[j] = 0 52 | else: next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j]) 53 | tmp = next 54 | next = cur 55 | cur = tmp 56 | return cur[n - 1] 57 | 58 | def longestSubsequence(s1, s2): 59 | n = len(s2) + 1 60 | cur = [0]*n 61 | next = [0]*n 62 | tmp = None 63 | __NONE, __UP, __LEFT, __NEW = 0, 1, 2, 3 64 | orientation = [[__NONE]*n] 65 | for i in s1: 66 | ori = [] 67 | for j in range(0, n): 68 | if j == 0: 69 | next[j] = 0 70 | ori.append(__NONE) 71 | else: 72 | next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j]) 73 | ori.append(__NEW if i == s2[j - 1] else (__LEFT if next[j - 1] > cur [j] else __UP)) 74 | orientation.append(ori) 75 | tmp = next 76 | next = cur 77 | cur = tmp 78 | i, j, res = len(s1), n - 1, '' 79 | ori = orientation[i][j] 80 | while ori != __NONE: 81 | if ori == __UP: i -= 1 82 | elif ori == __LEFT: j -= 1 83 | elif ori == __NEW: 84 | i -= 1 85 | j -= 1 86 | res += s2[j] 87 | ori = orientation[i][j] 88 | return res[::-1] 89 | 90 | -------------------------------------------------------------------------------- /wordseg/wordseg.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | Chinese word segmentation algorithm without corpus 5 | Author: 段凯强 6 | Reference: http://www.matrix67.com/blog/archives/5044 7 | """ 8 | 9 | import re 10 | 11 | from . probability import entropyOfList 12 | from . sequence import genSubparts, genSubstr 13 | 14 | 15 | 16 | 17 | def indexOfSortedSuffix(doc, max_word_len): 18 | """ 19 | Treat a suffix as an index where the suffix begins. 20 | Then sort these indexes by the suffixes. 
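For example, with doc='abab' and max_word_len=2 the returned index pairs are [(0, 1), (2, 3), (0, 2), (2, 4), (1, 2), (3, 4), (1, 3)], so pairs denoting equal substrings end up adjacent.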
21 | """ 22 | indexes = [] 23 | length = len(doc) 24 | for i in range(0, length): 25 | for j in range(i + 1, min(i + 1 + max_word_len, length + 1)): 26 | indexes.append((i, j)) 27 | return sorted(indexes, key=lambda i_j: doc[i_j[0]:i_j[1]]) 28 | 29 | 30 | class WordInfo(object): 31 | """ 32 | Store information of each word, including its freqency, left neighbors and right neighbors 33 | """ 34 | def __init__(self, text): 35 | super(WordInfo, self).__init__() 36 | self.text = text 37 | self.freq = 0.0 38 | self.left = [] 39 | self.right = [] 40 | self.aggregation = 0 41 | 42 | def update(self, left, right): 43 | """ 44 | Increase frequency of this word, then append left/right neighbors 45 | @param left a single character on the left side of this word 46 | @param right as left is, but on the right side 47 | """ 48 | self.freq += 1 49 | if left: self.left.append(left) 50 | if right: self.right.append(right) 51 | 52 | def compute(self, length): 53 | """ 54 | Compute frequency and entropy of this word 55 | @param length length of the document for training to get words 56 | """ 57 | self.freq /= length 58 | self.left = entropyOfList(self.left) 59 | self.right = entropyOfList(self.right) 60 | 61 | def computeAggregation(self, words_dict): 62 | """ 63 | Compute aggregation of this word 64 | @param words_dict frequency dict of all candidate words 65 | """ 66 | parts = genSubparts(self.text) 67 | if len(parts) > 0: 68 | self.aggregation = min([self.freq/words_dict[p1_p2[0]].freq/words_dict[p1_p2[1]].freq for p1_p2 in parts]) 69 | 70 | 71 | 72 | class WordSegment(object): 73 | 74 | """ 75 | Main class for Chinese word segmentation 76 | 1. Generate words from a long enough document 77 | 2. Do the segmentation work with the document 78 | """ 79 | 80 | # if a word is combination of other shorter words, then treat it as a long word 81 | L = 0 82 | # if a word is combination of other shorter words, then treat it as the set of shortest words 83 | S = 1 84 | # if a word contains other shorter words, then return all possible results 85 | ALL = 2 86 | 87 | def __init__(self, doc, max_word_len=5, min_freq=0.00005, min_entropy=2.0, min_aggregation=50): 88 | super(WordSegment, self).__init__() 89 | self.max_word_len = max_word_len 90 | self.min_freq = min_freq 91 | self.min_entropy = min_entropy 92 | self.min_aggregation = min_aggregation 93 | self.word_infos = self.genWords(doc) 94 | # Result infomations, i.e., average data of all words 95 | word_count = float(len(self.word_infos)) 96 | self.avg_len = sum([len(w.text) for w in self.word_infos])/word_count 97 | self.avg_freq = sum([w.freq for w in self.word_infos])/word_count 98 | self.avg_left_entropy = sum([w.left for w in self.word_infos])/word_count 99 | self.avg_right_entropy = sum([w.right for w in self.word_infos])/word_count 100 | self.avg_aggregation = sum([w.aggregation for w in self.word_infos])/word_count 101 | # Filter out the results satisfy all the requirements 102 | filter_func = lambda v: len(v.text) > 1 and v.aggregation > self.min_aggregation and\ 103 | v.freq > self.min_freq and v.left > self.min_entropy and v.right > self.min_entropy 104 | self.word_with_freq = [(w.text, w.freq) for w in list(filter(filter_func, self.word_infos))] 105 | self.words = [w[0] for w in self.word_with_freq] 106 | 107 | def genWords(self, doc): 108 | """ 109 | Generate all candidate words with their frequency/entropy/aggregation informations 110 | @param doc the document used for words generation 111 | """ 112 | pattern = 
re.compile('[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+') 113 | doc = re.sub(pattern, ' ', doc) 114 | suffix_indexes = indexOfSortedSuffix(doc, self.max_word_len) 115 | word_cands = {} 116 | # compute frequency and neighbors 117 | for suf in suffix_indexes: 118 | word = doc[suf[0]:suf[1]] 119 | if word not in word_cands: 120 | word_cands[word] = WordInfo(word) 121 | word_cands[word].update(doc[suf[0] - 1:suf[0]], doc[suf[1]:suf[1] + 1]) 122 | # compute probability and entropy 123 | length = len(doc) 124 | for k in word_cands: 125 | word_cands[k].compute(length) 126 | # compute aggregation of words whose length > 1 127 | values = sorted(list(word_cands.values()), key=lambda x: len(x.text)) 128 | for v in values: 129 | if len(v.text) == 1: continue 130 | v.computeAggregation(word_cands) 131 | return sorted(values, key=lambda v: v.freq, reverse=True) 132 | 133 | def segSentence(self, sentence, method=ALL): 134 | """ 135 | Segment a sentence with the words generated from a document 136 | @param sentence the sentence to be handled 137 | @param method segmentation method 138 | """ 139 | i = 0 140 | res = [] 141 | while i < len(sentence): 142 | if method == self.L or method == self.S: 143 | j_range = list(range(self.max_word_len, 0, -1)) if method == self.L else list(range(2, self.max_word_len + 1)) + [1] 144 | for j in j_range: 145 | if j == 1 or sentence[i:i + j] in self.words: 146 | res.append(sentence[i:i + j]) 147 | i += j 148 | break 149 | else: 150 | to_inc = 1 151 | for j in range(2, self.max_word_len + 1): 152 | if i + j <= len(sentence) and sentence[i:i + j] in self.words: 153 | res.append(sentence[i:i + j]) 154 | if to_inc == 1: to_inc = j 155 | if to_inc == 1: res.append(sentence[i]) 156 | i += to_inc 157 | return res 158 | 159 | 160 | if __name__ == '__main__': 161 | doc = '十四是十四四十是四十,,十四不是四十,,,,四十不是十四' 162 | ws = WordSegment(doc, max_word_len=2, min_aggregation=1.2, min_entropy=0.4) 163 | print(' '.join(['%s:%f'%w for w in ws.word_with_freq])) 164 | print(' '.join(ws.words)) 165 | print(' '.join(ws.segSentence(doc))) 166 | print('average len: ', ws.avg_len) 167 | print('average frequency: ', ws.avg_freq) 168 | print('average left entropy: ', ws.avg_left_entropy) 169 | print('average right entropy: ', ws.avg_right_entropy) 170 | print('average aggregation: ', ws.avg_aggregation) 171 | 172 | 173 | --------------------------------------------------------------------------------
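As a reading aid (not part of the repository files), here is a minimal sketch of how the two filtering statistics used in `wordseg/wordseg.py`, neighbor entropy and aggregation, can be computed by hand with the package's own helpers. The toy `doc` and `word` values are made up for illustration, and the frequencies here are computed directly from the toy text rather than via WordSegment's internal suffix indexing:

```
from wordseg.probability import entropyOfList
from wordseg.sequence import genSubparts

doc = u'十四是十四四十是四十'   # toy text of length 10; u'十四' occurs at offsets 0 and 3
word = u'十四'

# Right-neighbor entropy: the characters seen just after each occurrence are '是' and '四'.
print(entropyOfList([u'是', u'四']))   # log(2) ~= 0.693

# Aggregation: freq(word) / (freq(left part) * freq(right part)), minimized over all splits.
freq = lambda s: doc.count(s) / float(len(doc))
print(min(freq(a + b) / (freq(a) * freq(b)) for a, b in genSubparts(word)))   # 0.2 / (0.4 * 0.4) = 1.25
```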