├── .gitignore ├── LICENSE ├── README.md ├── setup.py └── wordseg ├── __init__.py ├── freqitem.py ├── hashtree.py ├── probability.py ├── sequence.py └── wordseg.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Kaiqiang Dawn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChineseWordSegmentation 2 | Chinese word segmentation algorithm without corpus 3 | 4 | ## Usage 5 | ``` 6 | from wordseg import WordSegment 7 | doc = u'十四是十四四十是四十,十四不是四十,四十不是十四' 8 | ws = WordSegment(doc, max_word_len=2, min_aggregation=1, min_entropy=0.5) 9 | ws.segSentence(doc) 10 | ``` 11 | 12 | This will generate the words 13 | 14 | `十四 是 十四 四十 是 四十 , 十四 不是 四十 , 四十 不是 十四` 15 | 16 | In practice, `doc` should be a long document string so that reliable statistics can be collected. In that case, `min_aggregation` should be set much greater than 1 (e.g. 50) and `min_entropy` should also be set greater than 0.5 (e.g. 1.5). 17 | 18 | Note that both the input and the output of this function are unicode strings. 
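For a realistic corpus the call looks like this minimal sketch (illustrative only; `load_corpus()` is a placeholder for however you obtain a long unicode document, and the thresholds are the suggested values above):

```
from wordseg import WordSegment

doc = load_corpus()  # placeholder: a long unicode document string
ws = WordSegment(doc, max_word_len=5, min_aggregation=50, min_entropy=1.5)
print(' '.join(ws.segSentence(u'十四是十四四十是四十')))
```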
19 | 20 | `WordSegment.segSentence` has an optional argument `method`, whose value can be `WordSegment.L`, `WordSegment.S` or `WordSegment.ALL`: 21 | 22 | + `WordSegment.L`: if a long word is found that is a combination of several shorter words, return only the long word. 23 | + `WordSegment.S`: return only the shorter words. 24 | + `WordSegment.ALL`: return both the long word and the shorter ones. 25 | 26 | ## Reference 27 | 28 | Thanks to Matrix67's [article](http://www.matrix67.com/blog/archives/5044) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | import pkg_resources 7 | from setuptools import setup, find_packages 8 | import os 9 | import codecs 10 | import re 11 | import sys 12 | 13 | def read(*parts): 14 | path = os.path.join(os.path.dirname(__file__), *parts) 15 | with codecs.open(path, encoding='utf-8') as fobj: 16 | return fobj.read() 17 | 18 | def find_version(*file_paths): 19 | version_file = read(*file_paths) 20 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", 21 | version_file, re.M) 22 | if version_match: 23 | return version_match.group(1) 24 | raise RuntimeError("Unable to find version string.") 25 | 26 | setup( 27 | name='wordseg', 28 | version=find_version("wordseg", "__init__.py"), 29 | description='Chinese word segmentation algorithm without corpus', 30 | author='段凯强', 31 | author_email='', 32 | license='MIT', 33 | keywords='NLP,tokenizing,Chinese word segmentation', 34 | url='https://github.com/bung87/ChineseWordSegmentation', 35 | packages = find_packages(), 36 | package_dir={'wordseg': 'wordseg'}, 37 | classifiers=[ 38 | 'Intended Audience :: Developers', 39 | 'License :: OSI Approved :: MIT License', 40 | 'Operating System :: OS Independent', 41 | 'Natural Language :: Chinese (Simplified)', 42 | 'Natural Language :: Chinese (Traditional)', 43 | 'Programming Language :: Python', 44 | 'Programming Language :: Python :: 2', 45 | 'Programming Language :: Python :: 2.6', 46 | 'Programming Language :: Python :: 2.7', 47 | 'Programming Language :: Python :: 3', 48 | 'Programming Language :: Python :: 3.2', 49 | 'Programming Language :: Python :: 3.3', 50 | 'Programming Language :: Python :: 3.4', 51 | 'Topic :: Text Processing', 52 | 'Topic :: Text Processing :: Indexing', 53 | 'Topic :: Text Processing :: Linguistic', 54 | ] 55 | ) 56 | -------------------------------------------------------------------------------- /wordseg/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | 3 | from . import probability 4 | from . import sequence 5 | from . 
wordseg import WordSegment -------------------------------------------------------------------------------- /wordseg/freqitem.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | A simple frequent itemset mining algorithm implementation 5 | Author: 段凯强 6 | """ 7 | 8 | import itertools 9 | 10 | from wordseg.sequence import dedup 11 | from wordseg.hashtree import HashTree, sameNodes 12 | from functools import reduce 13 | 14 | class FreqItem(object): 15 | def __init__(self, transactions, sup_theta=.1): 16 | self.transactions = [sorted(t) for t in [x for x in transactions if x]] 17 | self.sup_theta = sup_theta*len(transactions) 18 | self.freqset = [] 19 | 20 | def filterCandidates(self, cand): 21 | """ 22 | Build a HashTree with the candidates cand, count their support over the transactions, and keep 23 | only those whose support is not lower than sup_theta 24 | """ 25 | hashtree = HashTree(cand) 26 | hashtree.count(self.transactions) 27 | return hashtree.getNodes(self.sup_theta) 28 | 29 | def freqOneSet(self): 30 | """ 31 | Generate frequent 1-item sets 32 | """ 33 | one_item_cand = set() 34 | for t in self.transactions: 35 | for w in t: 36 | one_item_cand.add(w) 37 | return sorted(self.filterCandidates([[i] for i in one_item_cand]), key=lambda i: i[0].name) 38 | 39 | def genNextCand(self, preItems): 40 | """ 41 | Generate the next candidates by dynamic programming 42 | Find ranges [i, j) such that the items in each range share the same prefix 43 | e.g., [1,2,3,4] and [1,2,3,5] have the same prefix, so they should be in the same range 44 | Then, generate 2-combinations within these ranges as the result 45 | """ 46 | res = [] 47 | i, j = 0, 0 48 | while i < len(preItems): 49 | if j < len(preItems) and sameNodes(preItems[j][:-1], preItems[i][:-1]): 50 | j += 1 51 | else: 52 | res += [pair[0] + [pair[1][-1]] for pair in itertools.combinations(preItems[i:j], 2)] 53 | i = j 54 | return [[i.name for i in items] for items in res] 55 | 56 | def genFreqItemSets(self): 57 | """ 58 | @return Frequent item sets with their frequency 59 | """ 60 | if self.freqset: return self.freqset 61 | cur = self.freqOneSet() 62 | freqKSet = [] 63 | while cur: 64 | freqKSet.append(cur) 65 | cur = self.filterCandidates(self.genNextCand(cur)) 66 | self.freqset = reduce(lambda res, x: res + x, freqKSet, []) 67 | name_freq_pairs = [[(i.name, i.val) for i in items] for items in self.freqset[::-1]] 68 | res = [list(zip(*items)) for items in name_freq_pairs] 69 | return [(list(pair[0]), pair[1][-1]) for pair in res] 70 | 71 | if __name__ == '__main__': 72 | transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7], [5,7,2]] 73 | freqItem = FreqItem(transactions, sup_theta=.3) 74 | print(freqItem.genFreqItemSets()) 75 | -------------------------------------------------------------------------------- /wordseg/hashtree.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | A simple implementation of Hash Tree 5 | Author: 段凯强 6 | """ 7 | from functools import reduce 8 | 9 | class HashTreeNode(object): 10 | def __init__(self, name=''): 11 | self.val = 0 12 | self.name = name 13 | self.level = 0 14 | self.children = {} 15 | 16 | def addBag(self, bag): 17 | """ 18 | Note that bag must be sorted 19 | """ 20 | if bag: 21 | node = self.children.get(bag[0], HashTreeNode(name=bag[0])) 22 | node.addBag(bag[1:]) 23 | self.children[bag[0]] = node 24 | self.level = len(bag) 25 | 26 | def count(self, transaction): 27 | """ 28 | count the child who 
matches the transaction, assuming that the current node already matches 29 | """ 30 | if self.level == 0: 31 | self.val += 1 32 | elif self.level == 1: 33 | for t in transaction: 34 | if t in self.children: self.children[t].val += 1 35 | else: 36 | for i in range(0, len(transaction)): 37 | t = transaction[i] 38 | if t in self.children: 39 | self.children[t].count(transaction[i:]) 40 | 41 | def get(self, theta): 42 | return [[c.name for c in items] for items in self.getNodes(theta)] 43 | """ 44 | if self.level == 0: 45 | return [[self.name]] if self.val >= theta else None 46 | else: 47 | children_res = [self.children[i].get(theta) for i in sorted(self.children.keys())] 48 | total = reduce(lambda res, x: res + x, filter(lambda x: x, children_res), []) 49 | return map(lambda c: [self.name] + c, total) 50 | """ 51 | 52 | def getNodes(self, theta): 53 | if self.level == 0: 54 | return [[self]] if self.val >= theta else None 55 | else: 56 | children_res = [self.children[i].getNodes(theta) for i in sorted(self.children.keys())] 57 | total = reduce(lambda res, x: res + x, [x for x in children_res if x], []) 58 | return [[self] + c for c in total] 59 | 60 | def __str__(self): 61 | return '(%s : %s)'%(self.name, '; '.join([str(i) for i in list(self.children.values())])) 62 | 63 | def sameNode(node1, node2): 64 | return node1.name == node2.name 65 | 66 | def sameNodes(nodes1, nodes2): 67 | func = lambda n: n.name 68 | return list(map(func, nodes1)) == list(map(func, nodes2)) 69 | 70 | 71 | 72 | class HashTree(object): 73 | """ 74 | Note that all bags must be sorted 75 | """ 76 | def __init__(self, bags): 77 | self.root = HashTreeNode() 78 | self.root.val = 0 79 | for b in bags: 80 | if b: self.root.addBag(b) 81 | 82 | def count(self, transactions): 83 | for t in transactions: self.root.count(t) 84 | 85 | def get(self, theta): 86 | res = [c[1:] for c in self.root.get(theta)] 87 | return [] if res == [[]] else res 88 | 89 | def getNodes(self, theta): 90 | res = [c[1:] for c in self.root.getNodes(theta)] 91 | return [] if res == [[]] else res 92 | 93 | def __str__(self): 94 | return str(self.root) 95 | 96 | if __name__ == '__main__': 97 | to_count = [[1,2], [2,4], [1,3], [1,5], [3,4], [2,7], [6,8]] 98 | tree = HashTree(to_count) 99 | transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7]] 100 | tree.count(transactions) 101 | print('Frequency with transactions', transactions) 102 | print(tree.get(2)) 103 | print(tree.get(1)) 104 | 105 | 106 | -------------------------------------------------------------------------------- /wordseg/probability.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | Algorithms about probability 5 | Author: 段凯强 6 | """ 7 | 8 | import math 9 | 10 | def entropyOfList(ls): 11 | """ 12 | Given a list of items, compute the entropy of the list 13 | The entropy is the sum of -p[i]*log(p[i]) over every unique element i in the list, where p[i] is its relative frequency 14 | """ 15 | elements = {} 16 | for e in ls: 17 | elements[e] = elements.get(e, 0) + 1 18 | length = float(len(ls)) 19 | # if length is 0, one side of the word is empty, which is fully determined, so the entropy should be 0 20 | return length and sum([-v/length*math.log(v/length) for v in list(elements.values())]) 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /wordseg/sequence.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | Algorithms for sequences 5 | Author: 段凯强 6 | """ 
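# Illustrative examples of the helpers below (dedup assumes its input is sorted):
#   dedup([1, 1, 2, 2, 3])                -> [1, 2, 3]
#   genSubstr('abc', 2)                   -> ['a', 'ab', 'b', 'bc', 'c']
#   genSubparts('abcd')                   -> [('a', 'bcd'), ('ab', 'cd'), ('abc', 'd')]
#   longestSubsequenceLength('abc', 'ac') -> 2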
7 | 8 | def dedup(ls): 9 | """ 10 | deduplicate the given SORTED list 11 | """ 12 | i, j = 0, 0 13 | while j < len(ls): 14 | if ls[j] == ls[i]: 15 | j += 1 16 | else: 17 | i += 1 18 | ls[i] = ls[j] 19 | return ls[:i + 1] 20 | 21 | def genSubstr(string, n): 22 | """ 23 | Generate all substrings of max length n for string 24 | """ 25 | length = len(string) 26 | res = [] 27 | for i in range(0, length): 28 | for j in range(i + 1, min(i + n + 1, length + 1)): 29 | res.append(string[i: j]) 30 | return res 31 | 32 | def genSubparts(string): 33 | """ 34 | Partition a string into all possible two parts, e.g. 35 | given "abcd", generate [("a", "bcd"), ("ab", "cd"), ("abc", "d")] 36 | For string of length 1, return empty list 37 | """ 38 | length = len(string) 39 | res = [] 40 | for i in range(1, length): 41 | res.append((string[0:i], string[i:])) 42 | return res 43 | 44 | def longestSubsequenceLength(s1, s2): 45 | n = len(s2) + 1 46 | cur = [0]*n 47 | next = [0]*n 48 | tmp = None 49 | for i in s1: 50 | for j in range(0, n): 51 | if j == 0: next[j] = 0 52 | else: next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j]) 53 | tmp = next 54 | next = cur 55 | cur = tmp 56 | return cur[n - 1] 57 | 58 | def longestSubsequence(s1, s2): 59 | n = len(s2) + 1 60 | cur = [0]*n 61 | next = [0]*n 62 | tmp = None 63 | __NONE, __UP, __LEFT, __NEW = 0, 1, 2, 3 64 | orientation = [[__NONE]*n] 65 | for i in s1: 66 | ori = [] 67 | for j in range(0, n): 68 | if j == 0: 69 | next[j] = 0 70 | ori.append(__NONE) 71 | else: 72 | next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j]) 73 | ori.append(__NEW if i == s2[j - 1] else (__LEFT if next[j - 1] > cur [j] else __UP)) 74 | orientation.append(ori) 75 | tmp = next 76 | next = cur 77 | cur = tmp 78 | i, j, res = len(s1), n - 1, '' 79 | ori = orientation[i][j] 80 | while ori != __NONE: 81 | if ori == __UP: i -= 1 82 | elif ori == __LEFT: j -= 1 83 | elif ori == __NEW: 84 | i -= 1 85 | j -= 1 86 | res += s2[j] 87 | ori = orientation[i][j] 88 | return res[::-1] 89 | 90 | -------------------------------------------------------------------------------- /wordseg/wordseg.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | """ 4 | Chinese word segmentation algorithm without corpus 5 | Author: 段凯强 6 | Reference: http://www.matrix67.com/blog/archives/5044 7 | """ 8 | 9 | import re 10 | 11 | from . probability import entropyOfList 12 | from . sequence import genSubparts, genSubstr 13 | 14 | 15 | 16 | 17 | def indexOfSortedSuffix(doc, max_word_len): 18 | """ 19 | Treat a suffix as an index where the suffix begins. 20 | Then sort these indexes by the suffixes. 
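For example, with doc='abab' and max_word_len=2 the returned index pairs are [(0, 1), (2, 3), (0, 2), (2, 4), (1, 2), (3, 4), (1, 3)], so pairs denoting equal substrings end up adjacent.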
21 | """ 22 | indexes = [] 23 | length = len(doc) 24 | for i in range(0, length): 25 | for j in range(i + 1, min(i + 1 + max_word_len, length + 1)): 26 | indexes.append((i, j)) 27 | return sorted(indexes, key=lambda i_j: doc[i_j[0]:i_j[1]]) 28 | 29 | 30 | class WordInfo(object): 31 | """ 32 | Store information of each word, including its freqency, left neighbors and right neighbors 33 | """ 34 | def __init__(self, text): 35 | super(WordInfo, self).__init__() 36 | self.text = text 37 | self.freq = 0.0 38 | self.left = [] 39 | self.right = [] 40 | self.aggregation = 0 41 | 42 | def update(self, left, right): 43 | """ 44 | Increase frequency of this word, then append left/right neighbors 45 | @param left a single character on the left side of this word 46 | @param right as left is, but on the right side 47 | """ 48 | self.freq += 1 49 | if left: self.left.append(left) 50 | if right: self.right.append(right) 51 | 52 | def compute(self, length): 53 | """ 54 | Compute frequency and entropy of this word 55 | @param length length of the document for training to get words 56 | """ 57 | self.freq /= length 58 | self.left = entropyOfList(self.left) 59 | self.right = entropyOfList(self.right) 60 | 61 | def computeAggregation(self, words_dict): 62 | """ 63 | Compute aggregation of this word 64 | @param words_dict frequency dict of all candidate words 65 | """ 66 | parts = genSubparts(self.text) 67 | if len(parts) > 0: 68 | self.aggregation = min([self.freq/words_dict[p1_p2[0]].freq/words_dict[p1_p2[1]].freq for p1_p2 in parts]) 69 | 70 | 71 | 72 | class WordSegment(object): 73 | 74 | """ 75 | Main class for Chinese word segmentation 76 | 1. Generate words from a long enough document 77 | 2. Do the segmentation work with the document 78 | """ 79 | 80 | # if a word is combination of other shorter words, then treat it as a long word 81 | L = 0 82 | # if a word is combination of other shorter words, then treat it as the set of shortest words 83 | S = 1 84 | # if a word contains other shorter words, then return all possible results 85 | ALL = 2 86 | 87 | def __init__(self, doc, max_word_len=5, min_freq=0.00005, min_entropy=2.0, min_aggregation=50): 88 | super(WordSegment, self).__init__() 89 | self.max_word_len = max_word_len 90 | self.min_freq = min_freq 91 | self.min_entropy = min_entropy 92 | self.min_aggregation = min_aggregation 93 | self.word_infos = self.genWords(doc) 94 | # Result infomations, i.e., average data of all words 95 | word_count = float(len(self.word_infos)) 96 | self.avg_len = sum([len(w.text) for w in self.word_infos])/word_count 97 | self.avg_freq = sum([w.freq for w in self.word_infos])/word_count 98 | self.avg_left_entropy = sum([w.left for w in self.word_infos])/word_count 99 | self.avg_right_entropy = sum([w.right for w in self.word_infos])/word_count 100 | self.avg_aggregation = sum([w.aggregation for w in self.word_infos])/word_count 101 | # Filter out the results satisfy all the requirements 102 | filter_func = lambda v: len(v.text) > 1 and v.aggregation > self.min_aggregation and\ 103 | v.freq > self.min_freq and v.left > self.min_entropy and v.right > self.min_entropy 104 | self.word_with_freq = [(w.text, w.freq) for w in list(filter(filter_func, self.word_infos))] 105 | self.words = [w[0] for w in self.word_with_freq] 106 | 107 | def genWords(self, doc): 108 | """ 109 | Generate all candidate words with their frequency/entropy/aggregation informations 110 | @param doc the document used for words generation 111 | """ 112 | pattern = 
re.compile('[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+') 113 | doc = re.sub(pattern, ' ', doc) 114 | suffix_indexes = indexOfSortedSuffix(doc, self.max_word_len) 115 | word_cands = {} 116 | # compute frequency and neighbors 117 | for suf in suffix_indexes: 118 | word = doc[suf[0]:suf[1]] 119 | if word not in word_cands: 120 | word_cands[word] = WordInfo(word) 121 | word_cands[word].update(doc[suf[0] - 1:suf[0]], doc[suf[1]:suf[1] + 1]) 122 | # compute probability and entropy 123 | length = len(doc) 124 | for k in word_cands: 125 | word_cands[k].compute(length) 126 | # compute aggregation of words whose length > 1 127 | values = sorted(list(word_cands.values()), key=lambda x: len(x.text)) 128 | for v in values: 129 | if len(v.text) == 1: continue 130 | v.computeAggregation(word_cands) 131 | return sorted(values, key=lambda v: v.freq, reverse=True) 132 | 133 | def segSentence(self, sentence, method=ALL): 134 | """ 135 | Segment a sentence with the words generated from a document 136 | @param sentence the sentence to be handled 137 | @param method segmentation method 138 | """ 139 | i = 0 140 | res = [] 141 | while i < len(sentence): 142 | if method == self.L or method == self.S: 143 | j_range = list(range(self.max_word_len, 0, -1)) if method == self.L else list(range(2, self.max_word_len + 1)) + [1] 144 | for j in j_range: 145 | if j == 1 or sentence[i:i + j] in self.words: 146 | res.append(sentence[i:i + j]) 147 | i += j 148 | break 149 | else: 150 | to_inc = 1 151 | for j in range(2, self.max_word_len + 1): 152 | if i + j <= len(sentence) and sentence[i:i + j] in self.words: 153 | res.append(sentence[i:i + j]) 154 | if to_inc == 1: to_inc = j 155 | if to_inc == 1: res.append(sentence[i]) 156 | i += to_inc 157 | return res 158 | 159 | 160 | if __name__ == '__main__': 161 | doc = '十四是十四四十是四十,,十四不是四十,,,,四十不是十四' 162 | ws = WordSegment(doc, max_word_len=2, min_aggregation=1.2, min_entropy=0.4) 163 | print(' '.join(['%s:%f'%w for w in ws.word_with_freq])) 164 | print(' '.join(ws.words)) 165 | print(' '.join(ws.segSentence(doc))) 166 | print('average len: ', ws.avg_len) 167 | print('average frequency: ', ws.avg_freq) 168 | print('average left entropy: ', ws.avg_left_entropy) 169 | print('average right entropy: ', ws.avg_right_entropy) 170 | print('average aggregation: ', ws.avg_aggregation) 171 | 172 | 173 | --------------------------------------------------------------------------------
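As a reading aid (not part of the repository files), here is a minimal sketch of how the two filtering statistics used in `wordseg/wordseg.py`, neighbor entropy and aggregation, can be computed by hand with the package's own helpers. The toy `doc` and `word` values are made up for illustration, and the frequencies here are computed directly from the toy text rather than via WordSegment's internal suffix indexing:

```
from wordseg.probability import entropyOfList
from wordseg.sequence import genSubparts

doc = u'十四是十四四十是四十'   # toy text of length 10; u'十四' occurs at offsets 0 and 3
word = u'十四'

# Right-neighbor entropy: the characters seen just after each occurrence are '是' and '四'.
print(entropyOfList([u'是', u'四']))   # log(2) ~= 0.693

# Aggregation: freq(word) / (freq(left part) * freq(right part)), minimized over all splits.
freq = lambda s: doc.count(s) / float(len(doc))
print(min(freq(a + b) / (freq(a) * freq(b)) for a, b in genSubparts(word)))   # 0.2 / (0.4 * 0.4) = 1.25
```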