├── .gitignore ├── LICENSE ├── README.md ├── demo └── demo_segment.py ├── setup.py ├── test ├── __init__.py ├── data │ ├── test.ngram.txt │ └── test.txt ├── test_bigramtable.py ├── test_dict.py ├── test_enum.py ├── test_hmm.py ├── test_organization_recognition.py ├── test_person_recognition.py ├── test_place_recognition.py ├── test_role_tag.py ├── test_segment.py ├── test_traditionalChineseDict.py ├── test_trie.py ├── test_viterbi_segment.py └── test_wordnet.py └── yaya ├── __init__.py ├── collection ├── __init__.py ├── bigram.py ├── dict.py ├── hmm.py └── trie.py ├── common ├── __init__.py ├── enum.py ├── nature.py ├── nr.py ├── ns.py └── nt.py ├── config.py ├── const.py ├── dictionary ├── __init__.py ├── chinese_traditional_dict.py ├── org_dict.py ├── person_dict.py └── place_dict.py ├── recognition ├── __init__.py ├── organization_recognition.py ├── person_recognition.py ├── place_recognition.py └── recognition.py ├── seg ├── __init__.py ├── segment.py ├── viterbi.py └── wordnet.py └── utility ├── __init__.py ├── bytearray.py ├── chartype.py ├── persistence.py └── singleton.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.bin 2 | *.ya 3 | *.dat 4 | *.pyc 5 | /.idea 6 | /data 7 | /.project 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. 
For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "{}" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright 2015 TonyWang 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # YaYaNLP: Chinese Language Processing
2 | YaYaNLP is a Chinese natural language processing toolkit written in pure Python. The name comes from "牙牙学语" ("babbling", as a toddler learning to talk).
3 | YaYaNLP provides:
4 | - Chinese word segmentation
5 | - Part-of-speech tagging
6 | - Named entity recognition
7 |     * person name recognition
8 |     * place name recognition
9 |     * organization name recognition
10 | - Simplified/Traditional Chinese conversion
11 | 
12 | ## Project
13 | 
14 | Project home: [https://github.com/Tony-Wang/YaYaNLP](https://github.com/Tony-Wang/YaYaNLP)
15 | 
16 | Author's home page: [www.huangyong.me](http://www.huangyong.me)
17 | 
18 | ## Installation
19 | 
20 | ### Download the source package, unpack it, and run
21 | 
22 | ``` bash
23 | python setup.py install
24 | ```
25 | 
26 | ### Download the dictionary and model files
27 | 
28 | YaYaNLP uses dictionary data compatible with HanLP; compiled dictionaries are saved with the .ya extension.
29 | The data can be downloaded directly from the HanLP project: [data-for-1.2.4.zip](http://pan.baidu.com/s/1gd1vo8j)
30 | 
31 | ### Configure the data path
32 | 
33 | Point **yaya/config.py** at your own data directory:
34 | ``` python
35 | DATA_ROOT = "/your/data/path"
36 | ```
37 | 
38 | ## Features
39 | 
40 | ### Person name recognition
41 | 
42 | ```
43 | # Recognize person names
44 | text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
45 | terms = segment.seg(text)
46 | print_terms(terms)
47 | ```
48 | 
49 | ```
50 | 签约/vi
51 | 仪式/n
52 | 前/f
53 | ,/w
54 | 秦光荣/nr
55 | 、/w
56 | 李纪恒/nr
57 | 、/w
58 | 仇和/nr
59 | 等/udeng
60 | 一同/d
61 | 会见/v
62 | 了/ule
63 | 参加/v
64 | 签约/vi
65 | 的/ude1
66 | 企业家/nnt
67 | 。/w
68 | ```
69 | 
70 | 
71 | ### Disambiguation
72 | 
73 | ```
74 | # Resolve segmentation ambiguity
75 | text = u"龚学平等领导说,邓颖超生前杜绝超生"
76 | terms = segment.seg(text)
77 | print_terms(terms)
78 | ```
79 | 
80 | ```
81 | 龚学平/nr
82 | 等/udeng
83 | 领导/n
84 | 说/v
85 | ,/w
86 | 邓颖超/nr
87 | 生前/t
88 | 杜绝/v
89 | 超生/vi
90 | ```
91 | 
92 | ### Place name recognition
93 | 
94 | ```
95 | # Recognize place names
96 | text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
97 | terms = segment.seg(text)
98 | print_terms(terms)
99 | ```
100 | 
101 | ```
102 | 蓝翔/nt
103 | 给/p
104 | 宁夏/ns
105 | 固原市/ns
106 | 彭阳县/ns
107 | 红河镇/ns
108 | 黑牛沟村/ns
109 | 捐赠/v
110 | 了/ule
111 | 挖掘机/n
112 | ```
113 | 
114 | ### Organization name recognition
115 | 
116 | ```
117 | # Recognize organization names
118 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
119 | terms = segment.seg(text)
120 | print_terms(terms)
121 | ```
122 | 
123 | ```
124 | 济南杨铭宇餐饮管理有限公司/nt
125 | 是/vshi
126 | 由/p
127 | 杨先生/nr
128 | 创办/v
129 | 的/ude1
130 | 餐饮企业/nz
131 | ```
132 | 
133 | ### Simplified/Traditional conversion
134 | 
135 | ```
136 | # Simplified to Traditional
137 | text = u"以后等你当上皇后,就能买草莓庆祝了"
138 | print segment.simplified_to_traditional(text)
139 | ```
140 | 
141 | ```
142 | 以後等妳當上皇后,就能買士多啤梨慶祝了
143 | ```
144 | 
145 | ```
146 | # Traditional to Simplified
147 | text = u"用筆記簿型電腦寫程式HelloWorld"
148 | print segment.traditional_to_simplified(text)
149 | ```
150 | 
151 | ```
152 | 用笔记本电脑写程序HelloWorld
153 | ```
154 | 
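155 | ## Quick start
156 | 
157 | The feature snippets above assume `segment` has been imported and a `print_terms` helper is defined. A minimal complete script, adapted from `demo/demo_segment.py`, looks like this; each element of `terms` is a `(word, part-of-speech, offset)` tuple (see `test/test_segment.py`):
158 | 
159 | ``` python
160 | # coding=utf-8
161 | from yaya.seg import segment
162 | 
163 | text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
164 | for term in segment.seg(text):
165 |     print "%s/%s" % (term[0], term[1])  # word / part-of-speech tag
166 | ```
167 | 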
168 | ## Acknowledgements
169 | This project draws on the implementation ideas of [hankcs/HanLP](https://github.com/hankcs/HanLP/) and uses that project's dictionary and model files.
170 | 
171 | 
172 | ## License
173 | * Apache License Version 2.0
174 | * Any project, product, article, or other work that uses any part of YaYaNLP (its features, dictionaries, or models) must clearly credit YaYaNLP and link to this project page.
-------------------------------------------------------------------------------- /demo/demo_segment.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.seg import segment
3 | 
4 | __author__ = 'tony'
5 | 
6 | 
7 | def print_terms(terms):
8 |     for i, v in enumerate(terms):
9 |         print "%s/%s" % (v[0], v[1])
10 | 
11 | 
12 | def main():
13 | 
14 |     # Resolve segmentation ambiguity
15 |     text = u"龚学平、张晓辉等领导说,邓颖超生前杜绝超生"
16 |     terms = segment.seg(text)
17 |     print_terms(terms)
18 | 
19 |     # Recognize person names
20 |     text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
21 |     terms = segment.seg(text)
22 |     print_terms(terms)
23 | 
24 |     # Recognize place names
25 |     text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
26 |     terms = segment.seg(text)
27 |     print_terms(terms)
28 | 
29 |     # Recognize organization names
30 |     text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
31 |     terms = segment.seg(text)
32 |     print_terms(terms)
33 | 
34 |     # Simplified to Traditional
35 |     text = u"以后等你当上皇后,就能买草莓庆祝了"
36 |     print segment.simplified_to_traditional(text)
37 | 
38 |     # Traditional to Simplified
39 |     text = u"用筆記簿型電腦寫程式HelloWorld"
40 |     print segment.traditional_to_simplified(text)
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     main()
45 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | PACKAGE = "yaya"
4 | NAME = "YaYaNLP"
5 | DESCRIPTION = "YaYaNLP: Chinese Language Processing"
6 | AUTHOR = "tony huang"
7 | AUTHOR_EMAIL = "tony@huangyong.me"
8 | URL = "http://www.huangyong.me"
9 | 
10 | VERSION = __import__(PACKAGE).__version__
11 | 
12 | setup(
13 |     name=NAME,
14 |     version=VERSION,
15 |     description=DESCRIPTION,
16 |     author=AUTHOR,
17 |     author_email=AUTHOR_EMAIL,
18 |     license="Apache",
19 |     url=URL,
20 |     packages=find_packages(exclude=["test*", "data*"]),
21 |     classifiers=[
22 |         'Development Status :: 3 - Alpha',
23 |         'Intended Audience :: Developers',
24 |         'License :: OSI Approved :: Apache Software License',
25 |         'Programming Language :: Python',
26 |     ],
27 | 
28 |     zip_safe=False,
29 | )
30 | 
-------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tony-Wang/YaYaNLP/d75780290926877e55759fb64e1440f809d653ed/test/__init__.py --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- /test/data/test.ngram.txt: --------------------------------------------------------------------------------
1 | 中华@骨髓 6
2 | 中华@骨髓库 40
3 | 中华@魂 1
4 | 中华@鳖精 2
5 | 中华@鸟类 4
6 | 中华@鸟龙 7
7 | 中华@, 15
8 | 中华人民共和国@不可 1
9 | 中华人民共和国@与 2
10 | 中华人民共和国@中央 1
11 | 中华人民共和国@中央政府 1
12 | 中华人民共和国@主席 2
13 | 中华人民共和国@主席令 2
14 | 中华人民共和国@主管 1
-------------------------------------------------------------------------------- /test/data/test.txt: --------------------------------------------------------------------------------
1 | 一举 n 1
2 | 一举成名 n 1
3 | 一举成名天下知 n 1
4 | 成名 n 1
5 | 天下 n 1
6 | 法兰西 n 1
7 | 注册机 n 1
-------------------------------------------------------------------------------- /test/test_bigramtable.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 | from yaya.collection.bigram import *
4 | 
5 | __author__ = 'tony'
6 | 
7 | 
8 | class TestBiGramTable(TestCase):
9 |     def test_build(self):
10 |         filename = "./data/test.ngram.txt"
11 |         table = BiGramTable.build(filename)
12 |         self.assertEqual(table.get_bifreq(u"中华", u"鸟类"), 4)
13 |         self.assertEqual(table.get_bifreq(u"中华", u"鸟龙"), 7)
14 | 
15 |     def test_get_bifreq(self):
16 |         self.assertEqual(CoreBiGramTable().table.get_bifreq(u"中华", u"鸟类"), 4)
17 |         self.assertEqual(CoreBiGramTable().table.get_bifreq(u"中华", u"鸟龙"), 7)
-------------------------------------------------------------------------------- /test/test_dict.py: --------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from __future__ import absolute_import, unicode_literals
3 | import os
4 | from unittest import TestCase
5 | 
6 | from yaya.collection.dict import *
7 | import yaya.config
8 | from yaya.dictionary.person_dict import PersonDict
9 | 
10 | __author__ = 'tony'
11 | 
12 | 
13 | class TestDoubleArrayTrie(TestCase):
14 |     def test_fetch(self):
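        # Build a small double-array trie from a sorted key list and check that
        # exact_match_search returns a non-negative index for every inserted key.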
15 |         trie = DoubleArrayTrie()
16 |         words = []
17 |         words.append(u"一举")
18 |         words.append(u"一举一动")
19 |         words.append(u"一举成名")
20 |         words.append(u"一举成名天下知")
21 |         words.append(u"啊")
22 |         words.append(u"埃及")
23 |         words.append(u"阿拉伯")
24 |         words.append(u"阿拉伯人")
25 |         words.append(u"阿根廷")
26 |         words.append(u"阿胶")
27 |         words.sort()
28 |         trie.build(key=words, v=['一', '一', '一', '一', '二', '三', '四', '四', '四', '四'])
29 |         self.assertGreater(trie.exact_match_search(u"一举一动"), 0)
30 |         self.assertGreater(trie.exact_match_search(u"阿拉伯"), 0)
31 |         self.assertGreater(trie.exact_match_search(u"阿拉伯人"), 0)
32 | 
33 |     def test_build(self):
34 |         trie = DoubleArrayTrie()
35 |         words = []
36 |         words.append(u"一举 n 1")
37 |         words.append(u"一举一动 n 1")
38 |         words.append(u"一举成名 n 1")
39 |         words.append(u"一举成名天下知 n 1")
40 |         words.append(u"啊 n 1")
41 |         words.append(u"埃及 n 1")
42 |         words.append(u"阿拉伯 n 1")
43 |         words.append(u"阿拉伯人 n 1")
44 |         words.append(u"阿根廷 n 1")
45 |         words.append(u"阿胶 n 1")
46 |         words.sort()
47 |         trie = DoubleArrayTrie.load_from_list(words)
48 |         self.assertEqual(trie.get(u"一举")[1].nature, NATURE.n)
49 |         self.assertEqual(trie.get(u"一举一动")[1].nature, NATURE.n)
50 |         self.assertEqual(trie.get(u"一举成名")[1].nature, NATURE.n)
51 |         self.assertEqual(trie.get(u"一举成名天下知")[1].nature, NATURE.n)
52 |         self.assertEqual(trie.get(u"啊")[1].nature, NATURE.n)
53 |         self.assertEqual(trie.get(u"埃及")[1].nature, NATURE.n)
54 |         self.assertEqual(trie.get(u"阿拉伯")[1].nature, NATURE.n)
55 | 
56 |     def test_load_dict(self):
57 |         new_trie = DoubleArrayTrie.load_dict_file(os.path.join("data", "test.txt"))
58 |         self.assertGreater(new_trie.exact_match_search(u"注册机"), 0)
59 | 
60 |     def test_load_big(self):
61 |         trie = DoubleArrayTrie.load(yaya.config.CORE_DICT_NAME)
62 |         self.assertGreater(trie.exact_match_search(u"法兰西斯"), 0)
63 |         self.assertIsNotNone(trie.get(u"法兰西")[1].nature, u"核心字典里的value字段不应该为None")
64 | 
65 | 
66 |     def test_search(self):
67 |         trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
68 |         self.assertGreaterEqual(trie.exact_match_search(u"一举"), 0, u"词典中含有")
69 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名"), 0, u"词典中含有")
70 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名天下知"), 0, u"词典中含有")
71 |         search = trie.search(u"一举成名天下知", 0)
72 |         while search.next():
73 |             print(search.value)
74 | 
75 |     def test_searcher_generator(self):
76 |         trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
77 |         self.assertGreaterEqual(trie.exact_match_search(u"一举"), 0, u"词典中含有")
78 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名"), 0, u"词典中含有")
79 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名天下知"), 0, u"词典中含有")
80 |         search = trie.search(u"一举成名天下知", 0)
81 |         terms = []
82 |         for i, k, v in search.search_all_words():
83 |             terms.append((i, k, v))
84 |             self.assertEqual(v.nature, NATURE.n)
85 |             self.assertEqual(len(v), 1)
86 |             self.assertEqual(v.to_tuple()[1], 1)
87 |         self.assertEqual(len(terms), 5, u"搜索生成器,查找出所有词典里有的词")
88 | 
89 | 
90 | 
91 |     def test_custom_dict(self):
92 |         self.assertGreaterEqual(CustomDict().trie.exact_match_search(u"黄勇"), 0)
93 | 
94 |     def test_dat_transition(self):
95 |         trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
96 |         self.assertNotEqual(trie.transition(u"法兰西", 1), -1)
97 |         self.assertEqual(trie.transition(u"法兰东", 1), -1)
98 |         p = trie.transition(u"法兰", 1)
99 |         self.assertNotEqual(trie.transition(u"西", p), -1)
100 |         self.assertEqual(trie.transition(u"东", p), -1)
101 | 
102 |     def test_dat_output(self):
103 |         dat = DoubleArrayTrie()
104 |         dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"])
105 |         state = dat.transition(u'江河湖海', 1)
106 |         self.assertGreater(state, -1)
107 |         self.assertIsNotNone(dat.output(state))
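        # output() returns the Attribute stored at an accepting state, so it should
        # agree with what get() returns for the same key: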
self.assertEqual(dat.output(state), dat.get(u"江河湖海")[1]) 109 | 110 | # state = CoreDict().trie.transition(u"大海", 1) 111 | # self.assertGreater(state, -1) 112 | # self.assertEqual(CoreDict().trie.output(state), CoreDict().trie.get(u'大海')[1]) 113 | 114 | 115 | 116 | class TestAttribute(TestCase): 117 | def test_total_freq(self): 118 | text = "测试 n 10 nz 3 p 4" 119 | attr = Attribute(attr=text.split()[1:]) 120 | self.assertEqual(attr.total_frequency, 17) 121 | # self.assertEqual(attr.get_nature_frequency('n'), 10) 122 | self.assertEqual(attr.get_nature_frequency(NATURE.n), 10) 123 | self.assertEqual(attr.get_nature_frequency(NATURE.nz), 3) 124 | self.assertEqual(attr.get_nature_frequency(NATURE.p), 4) 125 | 126 | 127 | class TestAllDict(TestCase): 128 | def test_PersonDict(self): 129 | self.assertNotEqual(PersonDict().trie.exact_match_search(u"籍"), -1) 130 | -------------------------------------------------------------------------------- /test/test_enum.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from yaya.common.enum import Enum, EnumValue 4 | from yaya.common.nr import NR 5 | from yaya.common.nature import NATURE 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | class TestEnum(TestCase): 11 | def test_nr(self): 12 | self.assertEqual(NR.A.index, 14) 13 | 14 | def test_nature(self): 15 | self.assertEqual(NATURE.n.index, 13) 16 | 17 | def test_nature_key_to_index(self): 18 | self.assertEqual(type(NATURE.n), EnumValue) 19 | 20 | def test_nature_key_to_str(self): 21 | self.assertEqual(str(NATURE.n), 'n') 22 | 23 | def test_enum(self, ): 24 | E1 = Enum('a', 'b', enum_name='E1') 25 | self.assertTrue(str(E1.b) == 'b') 26 | self.assertEqual(E1['b'].index, 1) 27 | 28 | def test_demo(self): 29 | # char => int 30 | E1 = Enum('a', 'b', enum_name='E1') 31 | self.assertTrue(str(E1.b) == 'b') 32 | self.assertEqual(E1['b'].index, 1) 33 | self.assertTrue(str(E1[1]) == 'b' ) -------------------------------------------------------------------------------- /test/test_hmm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | from yaya.collection.hmm import PersonTranMatrix 4 | 5 | __author__ = 'tony' 6 | 7 | 8 | class TestHMMMatrix(TestCase): 9 | def test_load(self): 10 | self.assertIsNotNone(PersonTranMatrix().hmm, u"加载人名识别HMM转换矩阵") 11 | self.assertNotEqual(PersonTranMatrix().hmm.matrix.__len__(), 0) 12 | self.assertEqual(PersonTranMatrix().hmm.total_freq, 43938702) 13 | -------------------------------------------------------------------------------- /test/test_organization_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'tony' 3 | from unittest import TestCase 4 | 5 | from yaya.config import Config 6 | from yaya.recognition import person_recognition 7 | from yaya.recognition import place_recognition 8 | from yaya.recognition import organization_recognition 9 | from yaya.seg.viterbi import viterbi 10 | from yaya.seg.wordnet import WordNet, Vertex, gen_word_net, dump_vertexs 11 | from yaya.seg.segment import traditional_to_simplified 12 | 13 | class TestOrgRecognition(TestCase): 14 | def gen_word(self, text): 15 | self.text = text 16 | self.word_net = WordNet(self.text) 17 | # 粗分词网 18 | gen_word_net(self.text, self.word_net) 19 | # 维特比 20 | self.vertexs = viterbi(self.word_net.vertexs) 21 | self.word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 22 | 23 | def 
test_recognition_1_level(self): 24 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业" 25 | self.gen_word(text) 26 | # vertexs = persion_recognition.recognition(vertexs, word_net_optimum, word_net) 27 | # word_net_optimum = WordNet(text, vertexs=vertexs) 28 | organization_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 29 | vertexs = viterbi(self.word_net_optimum.vertexs) 30 | self.assertIn(Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1"), vertexs) 31 | 32 | def test_recognition_2_level(self): 33 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业" 34 | self.gen_word(text) 35 | person_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 36 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 37 | word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 38 | vertexs = organization_recognition.recognition(self.vertexs, word_net_optimum, self.word_net) 39 | # viterbi(word_net_optimum.vertexs) 40 | dump_vertexs(vertexs) 41 | self.assertIn(Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1"), vertexs) 42 | 43 | def test_organization_recognition(self): 44 | text = traditional_to_simplified(u"馬總統上午前往陸軍航空601旅,") 45 | Config.debug = True 46 | self.gen_word(text) 47 | person_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 48 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 49 | word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 50 | vertexs = organization_recognition.recognition(self.vertexs, word_net_optimum, self.word_net) 51 | dump_vertexs(vertexs) 52 | self.assertIn(Vertex(u"陆军航空601旅", attribute=u"nt 1"), vertexs) 53 | 54 | -------------------------------------------------------------------------------- /test/test_person_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.seg import segment 5 | from yaya.seg.viterbi import viterbi 6 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex 7 | from yaya.recognition import person_recognition 8 | 9 | __author__ = 'tony' 10 | 11 | 12 | class TestPersonRecognition(TestCase): 13 | def test_recognition(self): 14 | text = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂、张晓辉等一同会见了参加签约的企业家。" 15 | word_net = WordNet(text) 16 | 17 | # 粗分词网 18 | gen_word_net(text, word_net) 19 | 20 | # 维特比 21 | vertexs = viterbi(word_net.vertexs) 22 | word_net_optimum = WordNet(text, vertexs=vertexs) 23 | person_recognition.recognition(vertexs, word_net_optimum, word_net) 24 | vertexs = viterbi(word_net_optimum.vertexs) 25 | self.assertIn(Vertex(u"秦光荣", attribute=u"nr 1"), vertexs) 26 | self.assertIn(Vertex(u"李纪恒", attribute=u"nr 1"), vertexs) 27 | self.assertIn(Vertex(u"仇和", attribute=u"nr 1"), vertexs) 28 | self.assertIn(Vertex(u"王春桂", attribute=u"nr 1"), vertexs) 29 | self.assertIn(Vertex(u"张晓辉", attribute=u"nr 1"), vertexs) 30 | print(vertexs) 31 | 32 | def test_person_name_V_should_split_to_EL_DL(self): 33 | text = u"龚学平、张晓辉等领导说,邓颖超生前杜绝超生" 34 | vertexs = segment.seg_to_vertexs(text) 35 | terms = segment.vertexs_to_terms(vertexs, True) 36 | self.assertIn(u"龚学平", terms) 37 | self.assertIn(u"张晓辉", terms) 38 | self.assertIn(u"邓颖超", terms) 39 | 40 | -------------------------------------------------------------------------------- /test/test_place_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'tony' 3 | 4 | from unittest import TestCase 5 | 6 | from yaya.recognition import 
place_recognition 7 | from yaya.collection.dict import CustomDict, Attribute 8 | from yaya.seg import segment 9 | from yaya.seg.viterbi import viterbi 10 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex, combine_by_custom_dict 11 | 12 | 13 | class TestPlaceRecognition(TestCase): 14 | def setUp(self): 15 | self.text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机" 16 | self.word_net = WordNet(self.text) 17 | # 粗分词网 18 | gen_word_net(self.text, self.word_net) 19 | # 维特比 20 | self.vertexs = viterbi(self.word_net.vertexs) 21 | self.vertexs = combine_by_custom_dict(self.vertexs, CustomDict().trie) 22 | self.word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 23 | 24 | def test_recognition(self): 25 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 26 | vertexs = viterbi(self.word_net_optimum.vertexs) 27 | self.assertIn(Vertex(u"宁夏"), vertexs) 28 | self.assertIn(Vertex(u"固原市"), vertexs) 29 | self.assertIn(Vertex(u"彭阳县", attribute=u"ns 1"), vertexs) 30 | self.assertIn(Vertex(u"红河镇", attribute=u"ns 1"), vertexs) 31 | self.assertIn(Vertex(u"黑牛沟村", attribute=u"ns 1"), vertexs) 32 | -------------------------------------------------------------------------------- /test/test_role_tag.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.collection.dict import Attribute, DoubleArrayTrie 5 | from yaya.collection.hmm import PersonTranMatrix 6 | from yaya.common.nr import NR, NRPattern 7 | from yaya.const import * 8 | from yaya.recognition.person_recognition import role_tag 9 | from yaya.seg.viterbi import viterbi_roletag 10 | from yaya.seg.wordnet import new_tag_vertex, Vertex 11 | 12 | __author__ = 'tony' 13 | 14 | 15 | class TestRole_tag(TestCase): 16 | def test_role_tag(self): 17 | word_seg_list = [ 18 | new_tag_vertex(TAG_BIGIN), 19 | Vertex(u"秦", attribute=Attribute(u'n 1')), 20 | Vertex(u"光荣", attribute=Attribute(u'n 1')), 21 | Vertex(u"同志", attribute=Attribute(u'n 1')), 22 | new_tag_vertex(TAG_END), 23 | ] 24 | taglist = role_tag(word_seg_list) 25 | 26 | self.assertTrue(isinstance(taglist, list)) 27 | self.assertEqual(taglist[2].to_tuple(), (NR.Z, 29, NR.L, 2)) 28 | 29 | tag_index_list = viterbi_roletag(taglist, PersonTranMatrix().hmm) 30 | self.assertEqual(tag_index_list[0], NR.A, u"人名识别,第一个标识应该为TAG_BAGIN") 31 | self.assertEqual(tag_index_list[1], NR.B) 32 | self.assertEqual(tag_index_list[2], NR.Z) 33 | self.assertEqual(tag_index_list[3], NR.L) 34 | self.assertEqual(tag_index_list[4], NR.A) 35 | 36 | def test_NRPattern(self): 37 | """ 38 | 39 | 40 | """ 41 | trie = DoubleArrayTrie() 42 | NRPattern.sort() 43 | trie.build(key=NRPattern) 44 | self.assertTrue(trie.exact_match_search("BCD") != -1) 45 | self.assertTrue(trie.exact_match_search("BBCD") != -1) 46 | self.assertTrue(trie.exact_match_search("BG") != -1) 47 | self.assertTrue(trie.exact_match_search("DG") != -1) 48 | self.assertTrue(trie.exact_match_search("CD") == -1) 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /test/test_segment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.collection.dict import DoubleArrayTrie 5 | from yaya.seg import segment 6 | from yaya.seg.segment import traditional_seg 7 | from yaya.seg.wordnet import atom_seg, WordNet, gen_word_net, combine_by_custom_dict 8 | from 
yaya.utility.chartype import * 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | class TestAtomSegment(TestCase): 14 | def test_char_type(self): 15 | self.assertEqual(get('a'), CT_SINGLE) 16 | self.assertEqual(get('1'), CT_NUM) 17 | self.assertEqual(get(u'中'), CT_CHINESE) 18 | 19 | def test_atom_seg(self): 20 | text = '12341' 21 | node_list = atom_seg(text, 0, text.__len__()) 22 | self.assertEqual(node_list.__len__(), 1) 23 | self.assertEqual(node_list[0].pos, CT_NUM) 24 | text = '123.41' 25 | node_list = atom_seg(text, 0, text.__len__()) 26 | self.assertEqual(node_list.__len__(), 1) 27 | self.assertEqual(node_list[0].pos, CT_NUM) 28 | text = 'abc' 29 | node_list = atom_seg(text, 0, text.__len__()) 30 | self.assertEqual(node_list.__len__(), 1) 31 | self.assertEqual(node_list[0].pos, CT_SINGLE) 32 | 33 | 34 | class TestSegment(TestCase): 35 | def test_seg_find_nr(self): 36 | text = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂等一同会见了参加签约的企业家。" 37 | terms = segment.seg(text) 38 | self.assertIn((u"秦光荣", 'nr', 6), terms, u"测试是否找出人名") 39 | self.assertIn((u"李纪恒", 'nr', 10), terms, u"测试是否找出人名") 40 | self.assertIn((u"仇和", 'nr', 14), terms, u"测试是否找出人名") 41 | 42 | def test_combin_by_dict(self): 43 | dat = DoubleArrayTrie() 44 | dat.build([u"江", u"河", u"湖", "海"]) 45 | text = u"江河湖海" 46 | word_net = WordNet(text) 47 | gen_word_net(text, word_net, dat) 48 | vertexs = [v[0] for v in word_net.vertexs] 49 | self.assertEqual(len(word_net), 6, u"自定义字典分词") 50 | 51 | combin_dat = DoubleArrayTrie() 52 | combin_dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"]) 53 | vertexs = combine_by_custom_dict(vertexs, combin_dat) 54 | self.assertEqual(len(vertexs), 3, u"合并完成后应该只有前尾加中间词") 55 | 56 | def test_traditional_seg(self): 57 | text = u"記者羅吉訓/新竹報導 雙方合作的主要內容包括,希望能夠促成太陽能設備安裝維修人才培養;結合推廣教育由綠野集團引薦國外學生來臺就讀;與觀光及餐飲系合作觀光休閒產業,提供來臺遊客入住大華科大樂群會館,並導覽參訪張學良故居等臺灣各知名景點。 訂閱聯絡電話:02-23222722-814 瀏覽器建議使用IE 9.0以上版本 最佳觀看解析度1024x768 網站更新日期:2015/12/13 " 58 | print traditional_seg(text) 59 | -------------------------------------------------------------------------------- /test/test_traditionalChineseDict.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya import config 5 | from yaya.dictionary.chinese_traditional_dict import TraditionalChineseDict, SimplifiedChineseDict 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | class TestTraditionalChineseDict(TestCase): 11 | def test_convert_simplified_to_traditional(self): 12 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(u"用筆記簿型電腦寫程式HelloWorld") 13 | self.assertEqual(simplified, u"用笔记本电脑写程序HelloWorld") 14 | 15 | def test_convert_traditional_to_simplified(self): 16 | config.Config.debug = True 17 | traditional = SimplifiedChineseDict().convert_simplified_to_traditional(u"用笔记本电脑写程序HelloWorld") 18 | self.assertEqual(traditional, u"用筆記簿型電腦寫程式HelloWorld") 19 | 20 | def test_traditional_chinese_dict_search_all_words(self): 21 | searcher = TraditionalChineseDict().trie.search(u"用筆記簿型電腦寫程式HelloWorld") 22 | for i, k, v in searcher.search_all_words(): 23 | print i, k, v 24 | 25 | def test_demo1(self): 26 | text = u"記者羅吉訓/新竹報導 雙方合作的主要內容包括,希望能夠促成太陽能設備安裝維修人才培養;" \ 27 | u"結合推廣教育由綠野集團引薦國外學生來臺就讀;與觀光及餐飲系合作觀光休閒產業," \ 28 | u"提供來臺遊客入住大華科大樂群會館,並導覽參訪張學良故居等臺灣各知名景點。 " \ 29 | u"訂閱聯絡電話:02-23222722-814 瀏覽器建議使用IE 9.0以上版本 最佳觀看解析度1024x768 " \ 30 | u"網站更新日期:2015/12/13 " 31 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text) 32 | print(simplified) 33 | text = u"媒體詢問對目前選戰看法?朱立倫說最重要是要把沉默的大眾喚出來," \ 34 | 
u"為了台灣安定、兩岸和平及經濟發展,拜託大家在最後關頭全力團結及共同支持。 " \ 35 | u"今晚黨內重量級人士到齊,媒體詢問等於是最高規格的選戰會議," \ 36 | u"是否會向總統當面拜託總統夫人周美青出來?朱立倫馬上向身旁的馬總統說," \ 37 | u"「對呀,請馬學長拜託周學姐出來輔選」,總統笑著說「我一定轉達」。 " \ 38 | u"朱立倫表示,今晚餐敘不是輔選會報,但不管是馬總統、吳副總統、王金平及行政院長毛治國," \ 39 | u"大家都是同心協力,求團結勝選 。 他強調,最近到各地陸續見到好多民眾展現熱情," \ 40 | u"希望最後一個月不斷加溫,直到明年1月16日勝選。1041217 這裡有個好粉絲團,需要你關注!" 41 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text) 42 | print(simplified) 43 | -------------------------------------------------------------------------------- /test/test_trie.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /test/test_viterbi_segment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.collection.hmm import OrgTranMatrix 5 | from yaya.common.nt import NT 6 | from yaya.seg.segment import vertexs_to_terms 7 | from yaya.seg.viterbi import * 8 | from yaya.seg.wordnet import * 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | class TestViterbiSegment(TestCase): 14 | def test_viterbi(self): 15 | text = u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" 16 | # text = u"商品23和服务" 17 | word_net = WordNet(text) 18 | gen_word_net(text, word_net) 19 | vertex_list = vertexs_to_terms(viterbi(word_net.vertexs), True) 20 | self.assertTrue(u"工信处" in vertex_list) 21 | self.assertTrue(u"女" in vertex_list) 22 | self.assertTrue(u"干事" in vertex_list) 23 | self.assertTrue(u"每月" in vertex_list) 24 | self.assertTrue(u"经过" in vertex_list) 25 | self.assertTrue(u"下属" in vertex_list) 26 | self.assertTrue(u"科室" in vertex_list) 27 | self.assertTrue(u"都" in vertex_list) 28 | self.assertTrue(u"要" in vertex_list) 29 | self.assertTrue(u"亲口" in vertex_list) 30 | self.assertTrue(u"交代" in vertex_list) 31 | self.assertTrue(u"24" in vertex_list) 32 | self.assertTrue(u"口" in vertex_list) 33 | self.assertTrue(u"交换机" in vertex_list) 34 | self.assertTrue(u"等" in vertex_list) 35 | self.assertTrue(u"技术性" in vertex_list) 36 | self.assertTrue(u"器件" in vertex_list) 37 | self.assertTrue(u"的" in vertex_list) 38 | self.assertTrue(u"安装" in vertex_list) 39 | self.assertTrue(u"工作" in vertex_list) 40 | 41 | def test_custom_dict(self): 42 | text = u"黄勇今天来上班了" 43 | word_net = WordNet(text) 44 | gen_word_net(text, word_net) 45 | vertex_list = viterbi(word_net.vertexs) 46 | vertex_list = combine_by_custom_dict(vertex_list) 47 | self.assertEqual(vertex_list[1].real_word, u"黄勇") 48 | 49 | 50 | class TestViterbi(TestCase): 51 | def test_computer(self): 52 | node_list = [] 53 | node_list.append(Attribute((NT.S, 19800))) 54 | node_list.append(Attribute((NT.K, 1000, NT.D, 1000))) 55 | node_list.append(Attribute((NT.C, 1000, NT.B, 1000))) 56 | node_list.append(Attribute((NT.M, 1000))) 57 | node_list.append(Attribute((NT.P, 12, NT.D, 1))) 58 | node_list.append(Attribute((NT.B, 19800))) 59 | tag_list = viterbi_standard(node_list, hmm=OrgTranMatrix().hmm) 60 | self.assertEquals(6, len(tag_list)) 61 | self.assertEqual(NT.K, tag_list[1]) 62 | self.assertEqual(NT.C, tag_list[2]) 63 | self.assertEqual(NT.M, tag_list[3]) 64 | self.assertEqual(NT.D, tag_list[4]) 65 | -------------------------------------------------------------------------------- /test/test_wordnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from __future__ import absolute_import 3 | from unittest import TestCase 4 | 5 | from 
yaya.const import TAG_BIGIN, TAG_END 6 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex, new_tag_vertex 7 | 8 | __author__ = 'tony' 9 | 10 | 11 | class TestWordNet(TestCase): 12 | def test_gen_word_net(self): 13 | text = u"一举成名天下知" 14 | word_net = WordNet(text) 15 | gen_word_net(text, word_net) 16 | self.assertEqual(word_net.vertexs.__len__(), text.__len__() + 2) 17 | # 一举 一举成名 18 | # 举 19 | # 成 成名 20 | # 名 21 | # 天 天下 22 | # 下 23 | # 知 24 | self.assertEqual(word_net.vertexs[1].__len__(), 2) 25 | self.assertEqual(word_net.vertexs[2].__len__(), 1) 26 | self.assertEqual(word_net.vertexs[3].__len__(), 2) 27 | self.assertEqual(word_net.vertexs[4].__len__(), 1) 28 | self.assertEqual(word_net.vertexs[5].__len__(), 2) 29 | self.assertEqual(word_net.vertexs[6].__len__(), 1) 30 | self.assertEqual(word_net.vertexs[7].__len__(), 1) 31 | 32 | def test_gen_word_net_include_num(self): 33 | text = u"123456" 34 | word_net = WordNet(text) 35 | gen_word_net(text, word_net) 36 | self.assertEqual(word_net.vertexs.__len__(), 6 + 2) 37 | self.assertTrue([] not in word_net.vertexs, u"原始词网,不能可能有空节点") 38 | 39 | def test_vector(self): 40 | v1 = Vertex("test", attribute="nr 1") 41 | v2 = Vertex("test", attribute="nr 1") 42 | v3 = Vertex("test", attribute="nr1 1") 43 | self.assertEqual(v1, v2) 44 | self.assertNotEqual(v1, v3) 45 | self.assertIn(v1, [v2]) 46 | self.assertNotIn(v1, [v3]) 47 | 48 | def test_tag_vector_real_word_len_should_eq_0(self): 49 | # 标识词的real_word不能为空,否则在字典里无法表示 50 | self.assertEqual(new_tag_vertex(TAG_BIGIN).real_word, chr(32)) 51 | self.assertEqual(new_tag_vertex(TAG_END).real_word, chr(32)) 52 | 53 | def test_word_net_insert(self): 54 | text = u"1234567890" 55 | word_net_all = WordNet(text) 56 | for i, c in enumerate(text): 57 | word_net_all.add(i + 1, Vertex(c)) 58 | -------------------------------------------------------------------------------- /yaya/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | __version__ = "0.1.1" 3 | -------------------------------------------------------------------------------- /yaya/collection/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/collection/bigram.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import absolute_import 3 | 4 | import time 5 | 6 | from yaya import config 7 | from yaya.collection.dict import CoreDict 8 | from yaya.const import logger 9 | from yaya.utility.singleton import singleton 10 | 11 | __author__ = 'tony' 12 | 13 | 14 | class BiGramTable: 15 | def __init__(self): 16 | self.start = [] 17 | self.pair = [] 18 | 19 | def get_bifreq(self, pre_word, next_word): 20 | pre_word_id = pre_word if type(pre_word) is int else CoreDict().trie.exact_match_search(pre_word) 21 | if pre_word_id == -1: 22 | return 0 23 | next_word_id = next_word if type(next_word) is int else CoreDict().trie.exact_match_search(next_word) 24 | if next_word_id == -1: 25 | return 0 26 | index = binary_search(self.pair, self.start[pre_word_id], 27 | self.start[pre_word_id + 1] - self.start[pre_word_id], 28 | next_word_id) 29 | if index < 0: 30 | return 0 31 | index <<= 1 32 | return self.pair[index + 1] 33 | 34 | @staticmethod 35 | def load(filename=config.CORE_BIGRAM_NAME): 36 | start = time.time() 37 | logger.info(u"开始加载核心二元语法词表") 38 | import os 39 | if 
os.path.exists(filename + config.DICT_BIN_EXT):
40 |             table = BiGramTable.load_bin(filename + config.DICT_BIN_EXT)
41 |         else:
42 |             table = BiGramTable.build(filename)
43 |             import cPickle as Pickle
44 |             with open(filename + config.DICT_BIN_EXT, 'w') as f:
45 |                 Pickle.dump(table, f)
46 |         logger.info(u"加载核心二元语法词表完毕,耗时%s", time.time() - start)
47 |         return table
48 | 
49 |     @staticmethod
50 |     def load_bin(filename):
51 |         import cPickle as Pickle
52 |         with open(filename, 'r') as f:
53 |             bigram = Pickle.load(f)
54 |             f.close()
55 |         return bigram
56 | 
57 |     @staticmethod
58 |     def build(filename):
59 |         import codecs
60 |         f = codecs.open(filename, 'r', 'utf-8')
61 |         pre_word_map = {}
62 |         max_word_id = CoreDict().trie.word_size()
63 |         total = 0
64 |         while True:
65 |             line = f.readline()
66 |             if not line:
67 |                 break
68 |             params = line.split()
69 |             if params.__len__() != 2:
70 |                 continue
71 |             two_word = params[0].split('@', 2)
72 |             if two_word.__len__() != 2:
73 |                 continue
74 | 
75 |             pre_word_id = CoreDict().trie.exact_match_search(two_word[0])
76 |             if pre_word_id == -1:
77 |                 continue
78 |             next_word_id = CoreDict().trie.exact_match_search(two_word[1])
79 |             if next_word_id == -1:
80 |                 continue
81 |             if pre_word_id not in pre_word_map:
82 |                 pre_word_map[pre_word_id] = {}
83 |             next_word_map = pre_word_map.get(pre_word_id)
84 |             next_word_map[next_word_id] = int(params[1])
85 |             total += 2
86 |         f.close()
87 | 
88 |         table = BiGramTable()
89 |         table.start = [0] * (max_word_id + 1)
90 |         table.pair = [0] * total
91 |         offset = 0
92 |         for i in range(max_word_id):
93 |             next_word_map = pre_word_map.get(i, None)
94 |             if next_word_map is not None:
95 |                 key_list = next_word_map.keys()
96 |                 key_list.sort()
97 |                 for k in key_list:
98 |                     index = offset << 1
99 |                     table.pair[index] = k
100 |                     table.pair[index + 1] = next_word_map[k]
101 |                     offset += 1
102 |             table.start[i + 1] = offset
103 |         return table
104 | 
105 | 
106 | def binary_search(a, from_index, length, key):
107 |     low = from_index
108 |     high = from_index + length - 1
109 |     while low <= high:
110 |         mid = (low + high) >> 1
111 |         mid_val = a[mid << 1]
112 |         if mid_val < key:
113 |             low = mid + 1
114 |         elif mid_val > key:
115 |             high = mid - 1
116 |         else:
117 |             return mid
118 |     return -(low + 1)
119 | 
120 | 
121 | @singleton
122 | class CoreBiGramTable:
123 |     def __init__(self):
124 |         self.table = BiGramTable.load()
125 | 
126 | 
127 | CORE_BIG_RAM_TABLE = CoreBiGramTable()
-------------------------------------------------------------------------------- /yaya/collection/dict.py: --------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | try:
3 |     import cPickle as Pickle
4 | except:
5 |     import pickle as Pickle
6 | try:
7 |     range = xrange
8 | except NameError:
9 |     pass
10 | 
11 | from collections import OrderedDict
12 | 
13 | from yaya.const import *
14 | from yaya import config
15 | from yaya.utility.singleton import singleton
16 | from yaya.common.nature import NATURE
17 | 
18 | ATTRIBUTE_MAIN_NATURE_INDEX = 0
19 | 
20 | 
21 | class Node(object):
22 |     def __init__(self, code=0, depth=0, left=0, right=0):
23 |         self.code = code
24 |         self.depth = depth
25 |         self.left = left
26 |         self.right = right
27 | 
28 | 
29 | class Attribute(object):
30 |     def __init__(self, attr, cls=NATURE):
31 |         self.cls = cls
32 |         self.total = 0
33 |         if not isinstance(attr, tuple):
34 |             self.data = ()
35 |             if attr is not None:
36 |                 attr = attr if isinstance(attr, list) else attr.split(' ')
37 |                 nature = []
38 |                 for i in range(0, attr.__len__(), 2):
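                    # `attr` alternates part-of-speech tag and frequency:
                    # [tag0, freq0, tag1, freq1, ...], so walk it in pairs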
nature.append(cls[attr[i]]) 40 | nature.append(int(attr[i + 1])) 41 | self.total += int(attr[i + 1]) 42 | self.data = tuple(nature) 43 | else: 44 | self.data = attr 45 | for i in range(len(self.data)): 46 | if i % 2 == 1: 47 | self.total += self.data[i] 48 | 49 | def to_tuple(self): 50 | return self.data 51 | 52 | def __str__(self): 53 | return ' '.join([str(x) for x in self.data]) 54 | 55 | def __repr__(self): 56 | return u"Attribute(%s)" % self.__str__() 57 | 58 | def __len__(self): 59 | return len(self.data) / 2 60 | 61 | def __eq__(self, other): 62 | return str(self) == str(other) 63 | 64 | def get_nature_frequency(self, nature): 65 | try: 66 | return self.data[self.data.index(nature) + 1] 67 | except: 68 | return 0 69 | 70 | @property 71 | def natures(self): 72 | for i in range(0, len(self.data), 2): 73 | yield i / 2, self.data[i], self.data[i + 1] 74 | # return self.data 75 | 76 | @property 77 | def nature(self): 78 | if self.data.__len__() != 0: 79 | return self.data[ATTRIBUTE_MAIN_NATURE_INDEX] 80 | else: 81 | return None 82 | 83 | @property 84 | def total_frequency(self): 85 | return self.total 86 | 87 | 88 | class FastArray: 89 | def __init__(self, default_value=0): 90 | self.default_value = 0 91 | self.data = {} 92 | self._max_key = 0 93 | pass 94 | 95 | def __getitem__(self, item): 96 | return self.data.get(item, 0) 97 | 98 | def __setitem__(self, key, value): 99 | self.data[key] = value 100 | self._max_key = max(self._max_key, key) 101 | 102 | @property 103 | def max_key(self): 104 | return self._max_key 105 | 106 | def extend(self, size): 107 | pass 108 | 109 | 110 | class DoubleArrayTrie: 111 | def __init__(self, enum_cls=NATURE): 112 | self.alloc_size = 0 113 | self.check = [] 114 | self.base = [] 115 | self.enum_cls = enum_cls 116 | self.used = [] 117 | self.size = 0 118 | self.key = [] 119 | self.key_size = 0 120 | self.length = None 121 | self.value = [] 122 | self.v = None 123 | self.progress = 0 124 | self.next_check_pos = 0 125 | self.error_ = 0 126 | 127 | def word_size(self): 128 | if self.v is None: 129 | return 0 130 | else: 131 | return self.v.__len__() 132 | 133 | def resize(self, newsize): 134 | offsize = newsize - self.alloc_size 135 | self.base.extend([0] * offsize) 136 | self.check.extend([0] * offsize) 137 | self.used.extend([0] * offsize) 138 | self.alloc_size = newsize 139 | 140 | def fetch(self, parent, siblings): 141 | if self.error_ < 0: 142 | return 0 143 | prev = 0 144 | for i in range(parent.left, parent.right): 145 | if parent.depth > (self.length[i] if self.length is not None else self.key[i].__len__()): 146 | continue 147 | tmp = self.key[i] 148 | cur = 0 149 | if (self.length[i] if self.length is not None else tmp.__len__()) != parent.depth: 150 | cur = ord(tmp[parent.depth]) + 1 151 | 152 | # 检测是不是字典序 153 | if prev > cur: 154 | return 0 155 | 156 | if cur != prev or siblings.__len__() is 0: 157 | tmp_node = Node(depth=parent.depth + 1, code=cur, left=i, right=0) 158 | if siblings.__len__() != 0: 159 | siblings[-1].right = i 160 | siblings.append(tmp_node) 161 | prev = cur 162 | 163 | if siblings.__len__() != 0: 164 | siblings[-1].right = parent.right 165 | 166 | return siblings.__len__() 167 | 168 | def insert(self, siblings): 169 | if self.error_ < 0: 170 | return 0 171 | 172 | begin = 0 173 | pos = (siblings[0].code + 1 if (siblings[0].code + 1 > self.next_check_pos) else self.next_check_pos) - 1 174 | nonzero_num = 0 175 | first = 0 176 | 177 | if self.alloc_size <= pos: 178 | self.resize(pos + 1) 179 | 180 | while 1: 181 | pos += 1 182 | 
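            # linear probe: keep advancing `pos` (growing the arrays on demand)
            # until a base offset is found whose required slots are all free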
183 | if self.alloc_size <= pos: 184 | self.resize(pos + 1) 185 | 186 | if self.check[pos] != 0: 187 | nonzero_num += 1 188 | continue 189 | elif first is 0: 190 | self.next_check_pos = pos 191 | first = 1 192 | 193 | begin = pos - siblings[0].code 194 | 195 | if self.alloc_size <= (begin + siblings[-1].code): 196 | if 1.05 > 1.0 * self.key_size / (self.progress + 1): 197 | l = 1.05 198 | else: 199 | l = 1.0 * self.key_size / (self.progress + 1) 200 | self.resize(int(self.alloc_size * l)) 201 | 202 | if self.used[begin]: 203 | continue 204 | 205 | find = True 206 | for i in range(siblings.__len__()): 207 | if self.check[begin + siblings[i].code] != 0: 208 | find = False 209 | break 210 | if not find: 211 | continue 212 | break 213 | 214 | if 1.0 * nonzero_num / (pos - self.next_check_pos + 1) >= 0.95: 215 | self.next_check_pos = pos 216 | 217 | self.used[begin] = True 218 | self.size = self.size if (self.size > begin + siblings[-1].code + 1) else \ 219 | begin + siblings[-1].code + 1 220 | 221 | for i in range(siblings.__len__()): 222 | self.check[begin + siblings[i].code] = begin 223 | 224 | for i in range(siblings.__len__()): 225 | new_siblings = [] 226 | 227 | if self.fetch(siblings[i], new_siblings) is 0: 228 | self.base[begin + siblings[i].code] = -self.value[siblings[i].left] - 1 if ( 229 | self.value is not None) else (-siblings[i].left - 1) 230 | 231 | if self.value is not None and -self.value[siblings[i].left] - 1 >= 0: 232 | self.error_ = -2 233 | return 0 234 | 235 | self.progress += 1 236 | else: 237 | h = self.insert(new_siblings) 238 | self.base[begin + siblings[i].code] = h 239 | 240 | return begin 241 | 242 | def build(self, key=None, length=None, key_size=None, v=None): 243 | if key is None: 244 | return 0 245 | if key_size is not None and key_size > key.__len__(): 246 | return 0 247 | self.key = key 248 | self.length = length 249 | self.key_size = key_size if key_size is not None else key.__len__() 250 | self.value = None 251 | self.v = v if v is not None else key 252 | self.progress = 0 253 | 254 | self.resize(65536 * 32) 255 | 256 | self.base[0] = 1 257 | self.next_check_pos = 0 258 | 259 | root_node = Node(left=0, right=self.key_size, depth=0, code=0) 260 | 261 | siblings = [] 262 | self.fetch(root_node, siblings) 263 | self.insert(siblings) 264 | 265 | self.key = None 266 | 267 | return self.error_ 268 | 269 | def exact_match_search(self, key, pos=0, keylen=0, nodepos=0): 270 | if key is None: 271 | return -1 272 | if keylen <= 0: 273 | keylen = key.__len__() 274 | if nodepos <= 0: 275 | nodepos = 0 276 | 277 | result = -1 278 | b = self.base[nodepos] 279 | 280 | for i in range(pos, keylen): 281 | p = b + ord(key[i]) + 1 282 | if b == self.check[p]: 283 | b = self.base[p] 284 | else: 285 | return result 286 | 287 | p = b 288 | n = self.base[p] 289 | if b == self.check[p] and n < 0: 290 | result = -n - 1 291 | return result 292 | 293 | def get(self, word): 294 | index = self.exact_match_search(word) 295 | if index >= 0: 296 | return index, self.get_attr(self.v[index]) 297 | else: 298 | return index, None 299 | 300 | def get_attr(self, value): 301 | if isinstance(value, unicode) or isinstance(value, str): 302 | return Attribute(value.split(chr(32))[1:], cls=self.enum_cls) 303 | elif isinstance(value, list): 304 | return Attribute(value[1:], cls=self.enum_cls) 305 | raise Exception("异常的字典值类型:%s" % type(value)) 306 | 307 | def transition(self, path, state_from): 308 | b = state_from 309 | for i in range(len(path)): 310 | p = b + ord(path[i]) + 1 311 | if b == 
self.check[p]: 312 | b = self.base[p] 313 | else: 314 | return -1 315 | p = b 316 | return p 317 | 318 | def output(self, state): 319 | if state < 0: 320 | return None 321 | n = self.base[state] 322 | if state == self.check[state] and n < 0: 323 | return self.get_attr(self.v[-n - 1]) 324 | return None 325 | 326 | def dump(self): 327 | for i in range(self.size): 328 | print("i: %s [%s,%s]" % (i, self.base[i], self.check[i])) 329 | 330 | def compress(self): 331 | last = self.alloc_size - 1 332 | while self.used[last] == 0: 333 | last -= 1 334 | self.base = self.base[:last + 1] 335 | self.check = self.check[:last + 1] 336 | self.alloc_size = len(self.base) 337 | 338 | @staticmethod 339 | def save_to_ya(trie, filename): 340 | # trie.compress() 341 | import cPickle as Pickle 342 | with open(filename, 'wb') as f: 343 | Pickle.dump(trie, f, protocol=Pickle.HIGHEST_PROTOCOL) 344 | f.close() 345 | 346 | @staticmethod 347 | def save_to_yaf(trie, filename): 348 | pass 349 | 350 | @staticmethod 351 | def load_bin(filename): 352 | with open(filename, 'rb') as f: 353 | trie = Pickle.load(f) 354 | return trie 355 | 356 | @staticmethod 357 | def load_dict_file(filenames, key_func=None, value_func=None, enum_cls=NATURE): 358 | import codecs 359 | k, v, dict_list = [], [], [] 360 | if not isinstance(filenames, list): 361 | filenames = [filenames] 362 | 363 | for filename in filenames: 364 | with codecs.open(filename, 'rb', 'utf-8') as f: 365 | dict_list += f.read().splitlines() 366 | 367 | return DoubleArrayTrie.load_from_list(dict_list, key_func, value_func, enum_cls) 368 | 369 | @staticmethod 370 | def load_from_list(dict_list, key_func=None, value_func=None, enum_cls=NATURE): 371 | key_func = key_func or (lambda i: i.split()[0]) 372 | value_func = value_func or (lambda i: i) 373 | # sort 374 | dict_map = {} 375 | for i in dict_list: 376 | try: 377 | i = i.replace('\t', chr(32)) 378 | dict_map[key_func(i)] = value_func(i) # 此处需要解开成列表,viterbi会直接用到 379 | except: 380 | logger.error(u"字典项:[ %s ]格式异常。" % i) 381 | continue 382 | dict_map = OrderedDict(sorted(dict_map.items())) 383 | trie = DoubleArrayTrie(enum_cls=enum_cls) 384 | trie.build(key=dict_map.keys(), v=dict_map.values()) 385 | return trie 386 | 387 | def search(self, key, offset=0): 388 | return Searcher(self, key, offset) 389 | 390 | @staticmethod 391 | def load(filenames, key_func=None, value_func=None, 392 | dict_bin_ext=config.DICT_BIN_EXT, enum_cls=NATURE): 393 | import os 394 | # 考虑用户自定义宝典输入为列表的情况 395 | filename = filenames[0] if type(filenames) is list else filenames 396 | if config.Config.use_dict_cache and os.path.exists(filename + dict_bin_ext): 397 | return DoubleArrayTrie.load_bin(filename + dict_bin_ext) 398 | trie = DoubleArrayTrie.load_dict_file(filenames, key_func, value_func, enum_cls) 399 | DoubleArrayTrie.save_to_ya(trie, filename + dict_bin_ext) 400 | return trie 401 | 402 | @staticmethod 403 | def buildcoredictsearcher(key, offset=0): 404 | return DoubleArrayTrie().load(config.CORE_DICT_NAME).search(key, offset) 405 | 406 | 407 | class Searcher: 408 | def __init__(self, trie, chararray, offset=0): 409 | # key的起点 410 | self.begin = 0 411 | # key的长度 412 | self.length = 0 413 | # key的字典序坐标 414 | self.index = 0 415 | self.key = None 416 | 417 | # key对应的value 418 | self.value = None 419 | 420 | # 传入的字符数组 421 | self.code_array = [ord(c) for c in chararray] 422 | 423 | self.char_array = chararray 424 | 425 | # 上一个node位置 426 | self.trie = trie 427 | self.last = trie.base[0] 428 | 429 | # charArray的长度,效率起见,开个变量 430 | self.array_length = 
407 | class Searcher: 408 | def __init__(self, trie, chararray, offset=0): 409 | # start offset of the key 410 | self.begin = 0 411 | # length of the key 412 | self.length = 0 413 | # lexicographic index of the key 414 | self.index = 0 415 | self.key = None 416 | 417 | # value associated with the key 418 | self.value = None 419 | 420 | # the character array being searched 421 | self.code_array = [ord(c) for c in chararray] 422 | 423 | self.char_array = chararray 424 | 425 | # position of the previous node 426 | self.trie = trie 427 | self.last = trie.base[0] 428 | 429 | # length of charArray, kept in a variable for efficiency 430 | self.array_length = len(chararray) 431 | 432 | # index of the previous character 433 | self.i = offset - 1 434 | # A trick: if the text length is 0, calling next() would otherwise run out of bounds. 435 | self.begin = -1 if (self.array_length == 0) else offset 436 | 437 | # Whether there was a hit. False means the search is over; otherwise read the public members for the details of the match. 438 | def next(self): 439 | b = self.last 440 | while 1: 441 | self.i += 1 442 | if self.i == self.array_length: # pointer reached the end; move the start forward one, restart, reset the state 443 | self.begin += 1 444 | if self.begin == self.array_length: 445 | break 446 | self.i = self.begin 447 | b = self.trie.base[0] 448 | 449 | p = b + self.code_array[self.i] + 1 # state transition p = base[char[i-1]] + char[i] + 1 450 | if b == self.trie.check[p]: # base[char[i-1]] == check[base[char[i-1]] + char[i] + 1] 451 | b = self.trie.base[p] # transition succeeded 452 | else: 453 | self.i = self.begin # transition failed; move the start forward one, restart, reset the state 454 | self.begin += 1 455 | if self.begin == self.array_length: 456 | break 457 | b = self.trie.base[0] 458 | continue 459 | p = b 460 | n = self.trie.base[p] 461 | if b == self.trie.check[p] and n < 0: # base[p] == check[p] && base[p] < 0: found a word 462 | self.length = self.i - self.begin + 1 463 | self.index = -n - 1 464 | self.key = self.char_array[self.begin:self.begin + self.length] 465 | self.value = self.trie.get_attr(self.trie.v[self.index]) 466 | self.last = b 467 | return True 468 | return False 469 | 470 | def search_all_words(self): 471 | b = self.last 472 | while 1: 473 | self.i += 1 474 | if self.i == self.array_length: # pointer reached the end; move the start forward one, restart, reset the state 475 | self.begin += 1 476 | if self.begin == self.array_length: 477 | break 478 | self.i = self.begin 479 | b = self.trie.base[0] 480 | 481 | p = b + self.code_array[self.i] + 1 # state transition p = base[char[i-1]] + char[i] + 1 482 | if b == self.trie.check[p]: # base[char[i-1]] == check[base[char[i-1]] + char[i] + 1] 483 | b = self.trie.base[p] # transition succeeded 484 | else: 485 | self.i = self.begin # transition failed; move the start forward one, restart, reset the state 486 | self.begin += 1 487 | if self.begin == self.array_length: 488 | break 489 | b = self.trie.base[0] 490 | continue 491 | p = b 492 | n = self.trie.base[p] 493 | if b == self.trie.check[p] and n < 0: # base[p] == check[p] && base[p] < 0: found a word 494 | self.length = self.i - self.begin + 1 495 | self.index = -n - 1 496 | self.key = self.char_array[self.begin:self.begin + self.length] 497 | self.value = self.trie.get_attr(self.trie.v[self.index]) 498 | self.last = b 499 | yield self.begin, self.key, self.value 500 | return 501 | 502 | 503 | 504 | 505 | # def seek(self,index): 506 | # self.i = index -1 507 | # self.begin = index 508 | # self.last = self.trie.base[0] 509 | 510 | 511 | # class MaxSearcher: 512 | # def __init__(self, trie, chararray, offset=0): 513 | # self.searcher = trie.search(chararray) 514 | # self.textbegin = 0 515 | # self.textend = 0 516 | # 517 | # def next(self): 518 | # prekey = None 519 | # preindex = None 520 | # prebegin = None 521 | # preend = None 522 | # 523 | # while self.searcher.next(): 524 | # if prekey == None or prekey == self.searcher.key[:len(prekey)] : 525 | # prekey = self.searcher.key 526 | # preindex = self.searcher.index 527 | # prebegin = self.searcher.begin 528 | # preend = self.searcher.begin+self.searcher.length 529 | # continue 530 | # else: 531 | # self.key = prekey 532 | # self.value = self.searcher.trie.v[preindex] 533 | # self.textbegin = prebegin 534 | # self.textend = preend 535 | # # move the start to just past the found word 536 | # self.searcher.seek(self.textend) 537 | # return True 538 | # return False 539 | 540 | 541 | 542 |
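# Illustrative sketch (not part of the source file): Searcher scans a text for
# every dictionary word. next() advances to the next hit and exposes it through
# the public members; search_all_words() yields (begin, key, value) triples.
#
#   trie = DoubleArrayTrie.load_from_list([u'ab n 1', u'bc n 1'])
#   for begin, key, value in trie.search(u'abc').search_all_words():
#       print(begin, key)  # finds u'ab' at 0 and u'bc' at 1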
543 | @singleton 544 | class CoreDict: 545 | def __init__(self): 546 | self.trie = DoubleArrayTrie.load(config.CORE_DICT_NAME) 547 | 548 | 549 | def __split_id_attribute(item): 550 | index = item[0] 551 | value = item[1] 552 | if isinstance(value, str): 553 | value = value.split() 554 | if isinstance(value, list): 555 | value = value[1:] 556 | return index, value 557 | 558 | 559 | PERSON_WORD_ID, PERSON_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PEOPLE)) 560 | PLACE_WORD_ID, PLACE_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PLACE)) 561 | ORG_WORD_ID, ORG_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_GROUP)) 562 | PROPER_WORD_ID, PROPER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PROPER)) 563 | TIME_WORD_ID, TIME_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_TIME)) 564 | NUMBER_WORD_ID, NUMBER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_NUMBER)) 565 | CLUSTER_WORD_ID, CLUSTER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_CLUSTER)) 566 | 567 | @singleton 568 | class CustomDict: 569 | def __init__(self): 570 | self.trie = DoubleArrayTrie.load(config.CUSTOM_DICT_NAME) 571 | -------------------------------------------------------------------------------- /yaya/collection/hmm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | import math 4 | 5 | from yaya.common.ns import NS 6 | from yaya import config 7 | from yaya.common.nr import NR 8 | from yaya.common.nt import NT 9 | from yaya.utility.singleton import singleton 10 | 11 | __author__ = 'tony' 12 | 13 | 14 | class HMMMatrix: 15 | def __init__(self): 16 | self.matrix = [] 17 | self.total = None 18 | self.total_freq = 0 19 | self.start_prob = None 20 | self.trans_prob = None 21 | 22 | def get_total_freq(self, nature): 23 | return self.total[nature.index] 24 | 25 | @staticmethod 26 | def load(filename, cls): 27 | with open(filename, 'r') as f: 28 | flist = f.read().splitlines() 29 | labels = flist[0].split(',')[1:] 30 | ord_array = [0] * len(labels) 31 | ord_max = 0 32 | for i in range(len(ord_array)): 33 | ord_array[i] = cls[labels[i]].index 34 | ord_max = max(ord_max, ord_array[i]) 35 | # ord_max is the largest enum index; +1 gives the table size 36 | ord_max += 1 37 | hmm = HMMMatrix() 38 | hmm.matrix = [[0 for col in range(ord_max)] for row in range(ord_max)] 39 | for row in flist[1:]: 40 | params = row.split(',') 41 | cur_ord = cls[params[0]].index 42 | for i in range(len(ord_array)): 43 | hmm.matrix[cur_ord][ord_array[i]] = int(params[1 + i]) 44 | 45 | hmm.total = [0] * ord_max 46 | for j in range(ord_max): 47 | hmm.total[j] = 0 48 | for i in range(ord_max): 49 | hmm.total[j] += hmm.matrix[i][j] 50 | 51 | for j in range(ord_max): 52 | hmm.total[j] += hmm.matrix[j][j] 53 | 54 | for j in range(ord_max): 55 | hmm.total_freq += hmm.total[j] 56 | 57 | # compute the HMM quadruple 58 | states = ord_array 59 | hmm.start_prob = [0] * ord_max 60 | for s in ord_array: 61 | freq = hmm.total[s] + 1e-8 62 | hmm.start_prob[s] = -math.log(freq / hmm.total_freq) 63 | 64 | hmm.trans_prob = [[0 for col in range(ord_max)] for row in range(ord_max)] 65 | for f in ord_array: 66 | for t in ord_array: 67 | freq = hmm.matrix[f][t] + 1e-8 68 | hmm.trans_prob[f][t] = -math.log(freq / hmm.total_freq) 69 | return hmm 70 | 71 | 72 | @singleton 73 | class PersonTranMatrix: 74 | def __init__(self): 75 | self.hmm = HMMMatrix.load(config.PERSON_TR_PATH, NR) 76 | 77 | 78 | @singleton 79 | class OrgTranMatrix: 80 | def __init__(self): 81 | self.hmm = HMMMatrix.load(config.ORG_TR_PATH, NT) 82 | 83 | @singleton 84 | class PlaceTranMatrix: 85 | def __init__(self): 86 | self.hmm = HMMMatrix.load(config.PLACE_TR_PATH, NS) 87 |
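# Illustrative sketch (not part of the source file): start_prob and trans_prob
# hold negative log probabilities, so a role-tag viterbi decoder scores a tag
# sequence by adding costs; the smaller the summed cost, the likelier the sequence.
#
#   hmm = PersonTranMatrix().hmm
#   cost = hmm.start_prob[NR.B.index] + hmm.trans_prob[NR.B.index][NR.C.index]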
-------------------------------------------------------------------------------- /yaya/collection/trie.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | __all__ = ['Trie', 'StringTrie', 'SortedTrie', 'SortedStringTrie', 'Node'] 3 | 4 | import sys 5 | from copy import copy 6 | from operator import itemgetter 7 | try: from collections.abc import MutableMapping 8 | except ImportError: from collections import MutableMapping 9 | # Python 3 interoperability 10 | PY3 = sys.version_info[0] == 3 11 | if PY3: 12 | def itervalues(d): 13 | return d.values() 14 | 15 | 16 | def iteritems(d): 17 | return d.items() 18 | else: 19 | def itervalues(d): 20 | return d.itervalues() 21 | 22 | 23 | def iteritems(d): 24 | return d.iteritems() 25 | 26 | 27 | # Singleton sentinel - works with pickling 28 | class NULL(object): 29 | pass 30 | 31 | 32 | class Node(object): 33 | '''Trie node class. 34 | 35 | Subclasses may extend it to replace :attr:`ChildrenFactory` with a different 36 | mapping class (e.g. `sorteddict `_). 37 | 38 | :ivar value: The value of the key corresponding to this node or :const:`NULL` 39 | if there is no such key. 40 | :ivar children: A ``{key-part : child-node}`` mapping. 41 | ''' 42 | __slots__ = ('value', 'children') 43 | 44 | #: A callable for creating a new :attr:`children` mapping. 45 | ChildrenFactory = dict 46 | 47 | def __init__(self, value=NULL): 48 | self.value = value 49 | self.children = self.ChildrenFactory() 50 | 51 | def numkeys(self): 52 | '''Return the number of keys in the subtree rooted at this node.''' 53 | return (int(self.value is not NULL) + 54 | sum(child.numkeys() for child in itervalues(self.children))) 55 | 56 | def __repr__(self): 57 | return '(%s, {%s})' % ( 58 | self.value is NULL and 'NULL' or repr(self.value), 59 | ', '.join('%r: %r' % t for t in iteritems(self.children))) 60 | 61 | def __copy__(self): 62 | clone = self.__class__(self.value) 63 | clone_children = clone.children 64 | for key, child in iteritems(self.children): 65 | clone_children[key] = child.__copy__() 66 | return clone 67 | 68 | def __getstate__(self): 69 | return (self.value, self.children) 70 | 71 | def __setstate__(self, state): 72 | self.value, self.children = state 73 | 74 | 75 | class Trie(MutableMapping): 76 | '''Base trie class. 77 | 78 | As with regular dicts, keys are not necessarily returned sorted. Use 79 | :class:`SortedTrie` if sorting is required. 80 | ''' 81 | 82 | #: Callable for forming a key from its parts. 83 | KeyFactory = tuple 84 | 85 | #: Callable for creating new trie nodes. 86 | NodeFactory = Node 87 | 88 | def __init__(self, *args, **kwargs): 89 | '''Create a new trie. 90 | 91 | Parameters are the same as ``dict()``. 92 | ''' 93 | self._root = self.NodeFactory() 94 | 95 | self.update(*args, **kwargs) 96 | 97 | @classmethod 98 | def fromkeys(cls, iterable, value=None): 99 | '''Create a new trie with keys from ``iterable`` and values set to ``value``. 100 | 101 | Parameters are the same as ``dict.fromkeys()``. 102 | ''' 103 | d = cls() 104 | for key in iterable: 105 | d[key] = value 106 | return d 107 | 108 | # ----- trie-specific methods ----------------------------------------------- 109 | 110 | def longest_prefix(self, key, default=NULL): 111 | '''Return the longest key in this trie that is a prefix of ``key``.
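For example, in a trie holding the keys ``an`` and ``ant``, ``longest_prefix('antonym')`` is ``'ant'``.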
112 | 113 | If the trie doesn't contain any prefix of ``key``: 114 | - if ``default`` is given, return it 115 | - otherwise raise ``KeyError`` 116 | ''' 117 | try: 118 | return self.longest_prefix_item(key)[0] 119 | except KeyError: 120 | if default is not NULL: 121 | return default 122 | raise 123 | 124 | def longest_prefix_value(self, key, default=NULL): 125 | '''Return the value associated with the longest key in this trie that is 126 | a prefix of ``key``. 127 | 128 | If the trie doesn't contain any prefix of ``key``: 129 | - if ``default`` is given, return it 130 | - otherwise raise ``KeyError`` 131 | ''' 132 | current = self._root 133 | longest_prefix_value = NULL 134 | for part in key: 135 | current = current.children.get(part) 136 | if current is None: 137 | break 138 | value = current.value 139 | if value is not NULL: 140 | longest_prefix_value = value 141 | if longest_prefix_value is not NULL: 142 | return longest_prefix_value 143 | elif default is not NULL: 144 | return default 145 | else: 146 | raise KeyError 147 | 148 | def longest_prefix_item(self, key, default=NULL): 149 | '''Return the item (``(key,value)`` tuple) associated with the longest 150 | key in this trie that is a prefix of ``key``. 151 | 152 | If the trie doesn't contain any prefix of ``key``: 153 | - if ``default`` is given, return it 154 | - otherwise raise ``KeyError`` 155 | ''' 156 | prefix = [] 157 | append = prefix.append 158 | current = self._root 159 | longest_prefix_value = NULL 160 | max_non_null_index = -1 161 | for i, part in enumerate(key): 162 | current = current.children.get(part) 163 | if current is None: 164 | break 165 | append(part) 166 | value = current.value 167 | if value is not NULL: 168 | longest_prefix_value = value 169 | max_non_null_index = i 170 | if longest_prefix_value is not NULL: 171 | del prefix[max_non_null_index + 1:] 172 | return (self.KeyFactory(prefix), longest_prefix_value) 173 | elif default is not NULL: 174 | return default 175 | else: 176 | raise KeyError 177 | 178 | def iter_prefixes(self, key): 179 | 'Return an iterator over the keys of this trie that are prefixes of ``key``.' 180 | key_factory = self.KeyFactory 181 | prefix = [] 182 | append = prefix.append 183 | node = self._root 184 | for part in key: 185 | node = node.children.get(part) 186 | if node is None: 187 | break 188 | append(part) 189 | if node.value is not NULL: 190 | yield key_factory(prefix) 191 | 192 | def iter_prefix_values(self, key): 193 | '''Return an iterator over the values of this trie that are associated 194 | with keys that are prefixes of ``key``. 195 | ''' 196 | node = self._root 197 | for part in key: 198 | node = node.children.get(part) 199 | if node is None: 200 | break 201 | if node.value is not NULL: 202 | yield node.value 203 | 204 | def iter_prefix_items(self, key): 205 | '''Return an iterator over the items (``(key,value)`` tuples) of this 206 | trie that are associated with keys that are prefixes of ``key``. 207 | ''' 208 | key_factory = self.KeyFactory 209 | prefix = [] 210 | append = prefix.append 211 | node = self._root 212 | for part in key: 213 | node = node.children.get(part) 214 | if node is None: 215 | break 216 | append(part) 217 | if node.value is not NULL: 218 | yield (key_factory(prefix), node.value) 219 | 220 | # ----- extended mapping API methods ---------------------------------------- 221 | 222 | def keys(self, prefix=None): 223 | '''Return a list of this trie's keys. 224 | 225 | :param prefix: If not None, return only the keys prefixed by ``prefix``. 
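Example (illustrative):

>>> SortedStringTrie(an=1, ant=2, all=3).keys(prefix='an')
['an', 'ant']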
226 | ''' 227 | return list(self.iterkeys(prefix)) 228 | 229 | def values(self, prefix=None): 230 | '''Return a list of this trie's values. 231 | 232 | :param prefix: If not None, return only the values associated with keys 233 | prefixed by ``prefix``. 234 | ''' 235 | return list(self.itervalues(prefix)) 236 | 237 | def items(self, prefix=None): 238 | '''Return a list of this trie's items (``(key,value)`` tuples). 239 | 240 | :param prefix: If not None, return only the items associated with keys 241 | prefixed by ``prefix``. 242 | ''' 243 | return list(self.iteritems(prefix)) 244 | 245 | def iterkeys(self, prefix=None): 246 | '''Return an iterator over this trie's keys. 247 | 248 | :param prefix: If not None, yield only the keys prefixed by ``prefix``. 249 | ''' 250 | return (key for key, value in self.iteritems(prefix)) 251 | 252 | def itervalues(self, prefix=None): 253 | '''Return an iterator over this trie's values. 254 | 255 | :param prefix: If not None, yield only the values associated with keys 256 | prefixed by ``prefix``. 257 | ''' 258 | 259 | def generator(node, NULL=NULL): 260 | if node.value is not NULL: 261 | yield node.value 262 | for part, child in iteritems(node.children): 263 | for subresult in generator(child): 264 | yield subresult 265 | 266 | if prefix is None: 267 | node = self._root 268 | else: 269 | node = self._find(prefix) 270 | if node is None: 271 | node = self.NodeFactory() 272 | return generator(node) 273 | 274 | def iteritems(self, prefix=None): 275 | '''Return an iterator over this trie's items (``(key,value)`` tuples). 276 | 277 | :param prefix: If not None, yield only the items associated with keys 278 | prefixed by ``prefix``. 279 | ''' 280 | parts = [] 281 | append = parts.append 282 | 283 | def generator(node, key_factory=self.KeyFactory, parts=parts, 284 | append=append, NULL=NULL): 285 | if node.value is not NULL: 286 | yield (key_factory(parts), node.value) 287 | for part, child in iteritems(node.children): 288 | append(part) 289 | for subresult in generator(child): 290 | yield subresult 291 | del parts[-1] 292 | 293 | node = self._root 294 | if prefix is not None: 295 | for part in prefix: 296 | append(part) 297 | node = node.children.get(part) 298 | if node is None: 299 | node = self.NodeFactory() 300 | break 301 | return generator(node) 302 | 303 | # ----- original mapping API methods ---------------------------------------- 304 | 305 | def __len__(self): 306 | return self._root.numkeys() 307 | 308 | def __iter__(self): 309 | return self.iterkeys() 310 | 311 | def __contains__(self, key): 312 | node = self._find(key) 313 | return node is not None and node.value is not NULL 314 | 315 | def has_key(self, key): 316 | return key in self 317 | 318 | def __getitem__(self, key): 319 | node = self._find(key) 320 | if node is None or node.value is NULL: 321 | raise KeyError 322 | return node.value 323 | 324 | def __setitem__(self, key, value): 325 | node = self._root 326 | Node = self.NodeFactory 327 | for part in key: 328 | next = node.children.get(part) 329 | if next is None: 330 | node = node.children.setdefault(part, Node()) 331 | else: 332 | node = next 333 | node.value = value 334 | 335 | def __delitem__(self, key): 336 | nodes_parts = [] 337 | append = nodes_parts.append 338 | node = self._root 339 | for part in key: 340 | append((node, part)) 341 | node = node.children.get(part) 342 | if node is None: 343 | break 344 | if node is None or node.value is NULL: 345 | raise KeyError 346 | node.value = NULL 347 | pop = nodes_parts.pop 348 | while 
node.value is NULL and not node.children and nodes_parts: 349 | node, part = pop() 350 | del node.children[part] 351 | 352 | def clear(self): 353 | self._root.children.clear() 354 | 355 | def copy(self): 356 | clone = copy(super(Trie, self)) 357 | clone._root = copy(self._root) 358 | return clone 359 | 360 | def __repr__(self): 361 | return '%s({%s})' % ( 362 | self.__class__.__name__, 363 | ', '.join('%r: %r' % t for t in self.iteritems())) 364 | 365 | def _find(self, key): 366 | node = self._root 367 | for part in key: 368 | node = node.children.get(part) 369 | if node is None: 370 | break 371 | return node 372 | 373 | 374 | class StringTrie(Trie): 375 | '''A :class:`Trie` more appropriate for string keys.''' 376 | KeyFactory = ''.join 377 | 378 | 379 | # XXX: quick & dirty sorted dict 380 | # currently only iteritems() (for Python 2) or items() (for Python 3) has to be 381 | # overridden. However this is an implementation detail that may change in the future 382 | class _SortedDict(dict): 383 | if PY3: 384 | def items(self): 385 | return iter(sorted(dict.items(self), key=itemgetter(0))) 386 | else: 387 | def iteritems(self): 388 | return iter(sorted(dict.iteritems(self), key=itemgetter(0))) 389 | 390 | 391 | class _SortedNode(Node): 392 | ChildrenFactory = _SortedDict 393 | 394 | 395 | class SortedTrie(Trie): 396 | '''A :class:`Trie` that returns its keys (and associated values/items) sorted. 397 | 398 | .. note:: 399 | This implementation does not keep the keys sorted internally; instead it 400 | sorts them every time a method returning a list or iterator (e.g. 401 | :meth:`keys`) is called. In cases where a trie is relatively stable 402 | (few inserts/deletes) and is iterated often, it is probably more efficient 403 | to use a :attr:`NodeFactory` based on a sorted dict such as 404 | `sorteddict `_. 405 | ''' 406 | NodeFactory = _SortedNode 407 | 408 | 409 | class SortedStringTrie(SortedTrie, StringTrie): 410 | 'A :class:`Trie` that is both a :class:`StringTrie` and a :class:`SortedTrie`.' 411 | 412 | 413 | if __name__ == '__main__': 414 | import doctest 415 | 416 | doctest.testmod() 417 | -------------------------------------------------------------------------------- /yaya/common/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/common/enum.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | class EnumException(Exception): 3 | """ Base class for all exceptions in this module. """ 4 | 5 | def __init__(self, *args, **kwargs): 6 | if self.__class__ is EnumException: 7 | raise NotImplementedError( 8 | "%s is an abstract base class" % self.__class__.__name__) 9 | super(EnumException, self).__init__(*args, **kwargs) 10 | 11 | 12 | class EnumEmptyError(AssertionError, EnumException): 13 | """ Raised when attempting to create an empty enumeration. """ 14 | 15 | def __str__(self): 16 | return "Enumerations cannot be empty" 17 | 18 | 19 | class EnumBadKeyError(TypeError, EnumException): 20 | """ Raised when creating an Enum with non-string keys. """ 21 | 22 | def __init__(self, key): 23 | self.key = key 24 | 25 | def __str__(self): 26 | return "Enumeration keys must be strings: %(key)r" % vars(self) 27 | 28 | 29 | class EnumImmutableError(TypeError, EnumException): 30 | """ Raised when attempting to modify an Enum.
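For example, both ``enum_instance.new_key = 1`` and ``del enum_instance[0]`` raise this error.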
""" 31 | 32 | def __init__(self, *args): 33 | self.args = args 34 | 35 | def __str__(self): 36 | return "Enumeration does not allow modification" 37 | 38 | 39 | def _comparator(func): 40 | """ Decorator for EnumValue rich comparison methods. """ 41 | 42 | def comparator_wrapper(self, other): 43 | try: 44 | assert self.enumtype == other.enumtype 45 | result = func(self.index, other.index) 46 | except (AssertionError, AttributeError): 47 | result = NotImplemented 48 | 49 | return result 50 | 51 | comparator_wrapper.__name__ = func.__name__ 52 | comparator_wrapper.__doc__ = getattr(float, func.__name__).__doc__ 53 | return comparator_wrapper 54 | 55 | 56 | class EnumValue(object): 57 | """ A specific value of an enumerated type. """ 58 | 59 | def __init__(self, enumtype, index, key): 60 | """ Set up a new instance. """ 61 | self._enumtype = enumtype.enum_name 62 | self._index = index 63 | self._key = key 64 | 65 | @property 66 | def enumtype(self): 67 | return self._enumtype 68 | 69 | @property 70 | def key(self): 71 | return self._key 72 | 73 | def __str__(self): 74 | return str(self.key) 75 | 76 | @property 77 | def index(self): 78 | return self._index 79 | 80 | def __repr__(self): 81 | return "EnumValue(%(_enumtype)r, %(_index)r, %(_key)r)" % vars(self) 82 | 83 | def __hash__(self): 84 | return hash(self._index) 85 | 86 | @_comparator 87 | def __eq__(self, other): 88 | return self == other 89 | 90 | @_comparator 91 | def __ne__(self, other): 92 | return self != other 93 | 94 | @_comparator 95 | def __lt__(self, other): 96 | return self < other 97 | 98 | @_comparator 99 | def __le__(self, other): 100 | return self <= other 101 | 102 | @_comparator 103 | def __gt__(self, other): 104 | return self > other 105 | 106 | @_comparator 107 | def __ge__(self, other): 108 | return self >= other 109 | 110 | 111 | class Enum(object): 112 | """ Enumerated type. """ 113 | 114 | def __init__(self, *keys, **kwargs): 115 | """ Create an enumeration instance. 
""" 116 | 117 | value_type = kwargs.get('value_type', EnumValue) 118 | enum_name = kwargs.get('enum_name', None) 119 | assert enum_name is not None 120 | self.__dict__['enum_name'] = enum_name 121 | if not keys: 122 | raise EnumEmptyError() 123 | 124 | keys = tuple(keys) 125 | values = [None] * len(keys) 126 | 127 | for i, key in enumerate(keys): 128 | value = value_type(self, i, key) 129 | values[i] = value 130 | try: 131 | super(Enum, self).__setattr__(key, value) 132 | except TypeError: 133 | raise EnumBadKeyError(key) 134 | 135 | self.__dict__['_keys'] = keys 136 | self.__dict__['_values'] = values 137 | 138 | def __setattr__(self, name, value): 139 | raise EnumImmutableError(name) 140 | 141 | def __delattr__(self, name): 142 | raise EnumImmutableError(name) 143 | 144 | def __len__(self): 145 | return len(self._values) 146 | 147 | def __getitem__(self, index): 148 | # tony 添加,添加从字符型枚举名到变量值的转换 149 | if isinstance(index, str) or isinstance(index, unicode) : 150 | return self.__getattribute__(index) 151 | else: 152 | return self._values[index] 153 | 154 | def __setitem__(self, index, value): 155 | raise EnumImmutableError(index) 156 | 157 | def __delitem__(self, index): 158 | raise EnumImmutableError(index) 159 | 160 | def __iter__(self): 161 | return iter(self._values) 162 | 163 | def __contains__(self, value): 164 | is_member = False 165 | if isinstance(value, basestring): 166 | is_member = (value in self._keys) 167 | else: 168 | is_member = (value in self._values) 169 | return is_member 170 | -------------------------------------------------------------------------------- /yaya/common/nature.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.common.enum import Enum 3 | 4 | __author__ = 'tony' 5 | # 词性 6 | 7 | NATURE = Enum( 8 | "bg", # u"区别语素", 9 | "mg", # u"数语素", 10 | "nl", # u"名词性惯用语", 11 | "nx", # u"字母专名", 12 | "qg", # u"量词语素", 13 | "ud", # u"助词", 14 | "uj", # u"助词", 15 | "uz", # u"着", 16 | "ug", # u"过", 17 | "ul", # u"连词", 18 | "uv", # u"连词", 19 | "yg", # u"语气语素", 20 | "zg", # u"状态词", 21 | "n", # u"名词", 22 | "nr", # u"人名", 23 | "nrj", # u"日语人名", 24 | "nrf", # u"音译人名", 25 | "nr1", # u"复姓", 26 | "nr2", # u"蒙古姓名", 27 | "ns", # u"地名", 28 | "nsf", # u"音译地名", 29 | "nt", # u"机构团体名", 30 | "ntc", # u"公司名", 31 | "ntcf", # u"工厂", 32 | "ntcb", # u"银行", 33 | "ntch", # u"酒店宾馆", 34 | "nto", # u"政府机构", 35 | "ntu", # u"大学", 36 | "nts", # u"中小学", 37 | "nth", # u"医院", 38 | "nh", # u"医药疾病等健康相关名词", 39 | "nhm", # u"药品", 40 | "nhd", # u"疾病", 41 | "nn", # u"工作相关名词", 42 | "nnt", # u"职务职称", 43 | "nnd", # u"职业", 44 | "ng", # u"名词性语素", 45 | "nf", # u"食品", 46 | "ni", # u"机构相关", 47 | "nit", # u"教育相关机构", 48 | "nic", # u"下属机构", 49 | "nis", # u"机构后缀", 50 | "nm", # u"物品名", 51 | "nmc", # u"化学品名", 52 | "nb", # u"生物名", 53 | "nba", # u"动物名", 54 | "nbc", # u"动物纲目", 55 | "nbp", # u"植物名", 56 | "nz", # u"其他专名", 57 | "g", # u"学术词汇", 58 | "gm", # u"数学相关词汇", 59 | "gp", # u"物理相关词汇", 60 | "gc", # u"化学相关词汇", 61 | "gb", # u"生物相关词汇", 62 | "gbc", # u"生物类别", 63 | "gg", # u"地理地质相关词汇", 64 | "gi", # u"计算机相关词汇", 65 | "j", # u"简称略语", 66 | "i", # u"成语", 67 | "l", # u"习用语", 68 | "t", # u"时间词", 69 | "tg", # u"时间词性语素", 70 | "s", # u"处所词", 71 | "f", # u"方位词", 72 | "v", # u"动词", 73 | "vd", # u"副动词", 74 | "vn", # u"名动词", 75 | "vshi", # u"动词", 76 | "vyou", # u"动词", 77 | "vf", # u"趋向动词", 78 | "vx", # u"形式动词", 79 | "vi", # u"不及物动词", 80 | "vl", # u"动词性惯用语", 81 | "vg", # u"动词性语素", 82 | "a", # u"形容词", 83 | "ad", # u"副形词", 84 | "an", # u"名形词", 85 | "ag", # u"形容词性语素", 86 | "al", # 
u"形容词性惯用语", 87 | "b", # u"区别词", 88 | "bl", # u"区别词性惯用语", 89 | "z", # u"状态词", 90 | "r", # u"代词", 91 | "rr", # u"人称代词", 92 | "rz", # u"指示代词", 93 | "rzt", # u"时间指示代词", 94 | "rzs", # u"处所指示代词", 95 | "rzv", # u"谓词性指示代词", 96 | "ry", # u"疑问代词", 97 | "ryt", # u"时间疑问代词", 98 | "rys", # u"处所疑问代词", 99 | "ryv", # u"谓词性疑问代词", 100 | "rg", # u"代词性语素", 101 | "Rg", # u"古汉语代词性语素", 102 | "m", # u"数词", 103 | "mq", # u"数量词", 104 | "Mg", # u"甲乙丙丁之类的数词", 105 | "q", # u"量词", 106 | "qv", # u"动量词", 107 | "qt", # u"时量词", 108 | "d", # u"副词", 109 | "dg", # u"辄", 110 | "dl", # u"连语", 111 | "p", # u"介词", 112 | "pba", # u"介词", 113 | "pbei", # u"介词", 114 | "c", # u"连词", 115 | "cc", # u"并列连词", 116 | "u", # u"助词", 117 | "uzhe", # u"着", 118 | "ule", # u"了 ", 119 | "uguo", # u"过", 120 | "ude1", # u"的 ", 121 | "ude2", # u"地", 122 | "ude3", # u"得", 123 | "usuo", # u"所", 124 | "udeng", # u"等 ", 125 | "uyy", # u"一样 ", 126 | "udh", # u"的话", 127 | "uls", # u"来讲 ", 128 | "uzhi", # u"之", 129 | "ulian", # u"连 ", 130 | "e", # u"叹词", 131 | "y", # u"语气词", 132 | "o", # u"拟声词", 133 | "h", # u"前缀", 134 | "k", # u"后缀", 135 | "x", # u"字符串", 136 | "xx", # u"非语素字", 137 | "xu", # u"网址", 138 | "w", # u"标点符号", 139 | "wkz", # u"左括号", 140 | "wky", # u"右括号", 141 | "wyz", # u"左引号", 142 | "wyy", # u"右引号", 143 | "wj", # u"句号", 144 | "ww", # u"问号", 145 | "wt", # u"叹号", 146 | "wd", # u"逗号", 147 | "wf", # u"分号", 148 | "wn", # u"顿号", 149 | "wm", # u"冒号", 150 | "ws", # u"省略号", 151 | "wp", # u"破折号", 152 | "wb", # u"百分号千分号", 153 | "wh", # u"单位符号", 154 | "end", # u"仅用于始", 155 | "begin", # u"仅用于终" 156 | enum_name="NATURE" # 如果不指定,enum持久化时按id来判断类型的相等 157 | ) 158 | 159 | -------------------------------------------------------------------------------- /yaya/common/nr.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | from yaya.common.enum import Enum 4 | 5 | __author__ = 'tony' 6 | 7 | NR = Enum( 8 | 9 | # Pf 姓氏 【张】华平先生 10 | 11 | 'B', 12 | 13 | # Pm 双名的首字 张【华】平先生 14 | 15 | 'C', 16 | 17 | # Pt 双名的末字 张华【平】先生 18 | 19 | 'D', 20 | 21 | # Ps 单名 张【浩】说:“我是一个好人” 22 | 23 | 'E', 24 | 25 | # Ppf 前缀 【老】刘、【小】李 26 | 27 | 'F', 28 | 29 | # Plf 后缀 王【总】、刘【老】、肖【氏】、吴【妈】、叶【帅】 30 | 31 | 'G', 32 | 33 | # Pp 人名的上文 又【来到】于洪洋的家。 34 | 35 | 'K', 36 | 37 | # Pn 人名的下文 新华社记者黄文【摄】 38 | 39 | 'L', 40 | 41 | # Ppn 两个中国人名之间的成分 编剧邵钧林【和】稽道青说 42 | 43 | 'M', 44 | 45 | # Ppf 人名的上文和姓成词 这里【有关】天培的壮烈 46 | 47 | 'U', 48 | 49 | # Pnw 三字人名的末字和下文成词 龚学平等领导, 邓颖【超生】前 50 | 51 | 'V', 52 | 53 | # Pfm 姓与双名的首字成词 【王国】维、 54 | 55 | 'X', 56 | 57 | # Pfs 姓与单名成词 【高峰】、【汪洋】 58 | 59 | 'Y', 60 | 61 | # Pmt 双名本身成词 张【朝阳】 62 | 63 | 'Z', 64 | 65 | # Po 以上之外其他的角色 66 | 67 | 'A', 68 | 69 | # 句子的开头 70 | 71 | 'S', 72 | enum_name="NR" 73 | 74 | ) 75 | 76 | NRPattern = [ 77 | 'BBCD', 78 | 'BBE', 79 | 'BBZ', 80 | 'BCD', 81 | 'BEE', 82 | 'BE', 83 | 'BC', 84 | 'BEC', 85 | 'BG', 86 | 'DG', 87 | 'EG', 88 | 'BXD', 89 | 'BZ', 90 | 'EE', 91 | 'FE', 92 | 'FC', 93 | 'FB', 94 | 'FG', 95 | 'Y', 96 | 'XD', 97 | 'GD', 98 | ] -------------------------------------------------------------------------------- /yaya/common/ns.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | from yaya.common.enum import Enum 4 | 5 | __author__ = 'tony' 6 | 7 | NS = Enum( 8 | 'A', # 地名的上文 我【来到】中关园 9 | 'B', # 地名的下文刘家村/和/下岸村/相邻 10 | 'C', # 中国地名的第一个字 11 | 'D', # 中国地名的第二个字 12 | 'E', # 中国地名的第三个字 13 | 'G', # 其他整个的地名 14 | 'H', # 中国地名的后缀海/淀区 15 | 'X', # 连接词刘家村/和/下岸村/相邻 16 | 'Z', # 其它非地名成分 17 
| 'S', # 句子的开头 18 | enum_name="NS" 19 | ) 20 | 21 | NSPattern = [ 22 | "CH", 23 | "CDH", 24 | "CDEH", 25 | "GH" 26 | ] 27 | -------------------------------------------------------------------------------- /yaya/common/nt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | from yaya.common.enum import Enum 4 | 5 | __author__ = 'tony' 6 | 7 | NT = Enum( 8 | 'A', # 上文 [参与]亚太经合组织的活动 9 | 'B', # 下文 中央电视台[报道] 10 | 'X', # 连接词 北京电视台[和]天津电视台 11 | 'C', # 特征词的一般性前缀 北京[电影]学院 12 | 'F', # 特征词的译名性前缀 美国[摩托罗拉]公司 13 | 'G', # 特征词的地名性前缀 交通银行[北京]分行 14 | 'H', # 特征词的机构名前缀 [中共中央]顾问委员会 15 | 'I', # 特征词的特殊性前缀 [华谊]医院 16 | 'J', # 特征词的简称性前缀 [巴]政府 17 | 'K', # 整个机构 [麦当劳] 18 | 'L', # 方位词 19 | 'M', # 数词 公交集团[五]分公司 20 | 'P', # 单字碎片 21 | 'W', # 符号 22 | 'D', # 机构名的特征词 国务院侨务[办公室] 23 | 'Z', # 非机构名成份 24 | 'S', # 句子的开头 25 | enum_name="NT" 26 | ) 27 | 28 | NTPattern = [ 29 | "CCCCCCCCD", 30 | "CCCCCCCD", 31 | "CCCCCCD", 32 | "CCCCCCGD", 33 | "CCCCCCICCCCD", 34 | "CCCCCCPD", 35 | "CCCCCD", 36 | "CCCCCDD", 37 | "CCCCCGCCD", 38 | "CCCCCICCCCCD", 39 | "CCCCCPCCD", 40 | "CCCCCWDWD", 41 | "CCCCD", 42 | "CCCCDCCD", 43 | "CCCCDCD", 44 | "CCCCDD", 45 | "CCCCID", 46 | "CCCCPCD", 47 | "CCCD", 48 | "CCCDCCCD", 49 | "CCCDCCD", 50 | "CCCDCD", 51 | "CCCDD", 52 | "CCCDICFPD", 53 | "CCCFCFFCD", 54 | "CCCGD", 55 | "CCCGID", 56 | "CCCGJCD", 57 | "CCCID", 58 | "CCCJCCD", 59 | "CCCJD", 60 | "CCCLGCD", 61 | "CCCMD", 62 | "CCCPCCCD", 63 | "CCCPCCD", 64 | "CCCPCD", 65 | "CCCPD", 66 | "CCD", 67 | "CCDCCCCCCD", 68 | "CCDCCCCD", 69 | "CCDCCCD", 70 | "CCDCCCDD", 71 | "CCDCCD", 72 | "CCDCD", 73 | "CCDCDD", 74 | "CCDCGCD", 75 | "CCDCGCDID", 76 | "CCDCGCDPD", 77 | "CCDCGGDD", 78 | "CCDCID", 79 | "CCDCJCCD", 80 | "CCDCJCCDD", 81 | "CCDD", 82 | "CCDDD", 83 | "CCDFIDGD", 84 | "CCDGCCD", 85 | "CCDICD", 86 | "CCDID", 87 | "CCDJCD", 88 | "CCDPCD", 89 | "CCDPJD", 90 | "CCFCCD", 91 | "CCFD", 92 | "CCGCCCD", 93 | "CCGCCD", 94 | "CCGCD", 95 | "CCGCDCD", 96 | "CCGCDCMD", 97 | "CCGD", 98 | "CCGGCD", 99 | "CCGID", 100 | "CCGIDD", 101 | "CCGJD", 102 | "CCGWGWD", 103 | "CCICCD", 104 | "CCICD", 105 | "CCICIFD", 106 | "CCICJPD", 107 | "CCID", 108 | "CCIDCD", 109 | "CCIDD", 110 | "CCIID", 111 | "CCJCCCD", 112 | "CCJCCD", 113 | "CCJCD", 114 | "CCJCFD", 115 | "CCJD", 116 | "CCJID", 117 | "CCJJMJD", 118 | "CCKID", 119 | "CCLD", 120 | "CCMD", 121 | "CCMMPDCD", 122 | "CCPCCD", 123 | "CCPCD", 124 | "CCPD", 125 | "CCPDCD", 126 | "CCPPD", 127 | "CCWCWD", 128 | "CCWGWCCD", 129 | "CCWGWD", 130 | "CD", 131 | "CDCCCCCCD", 132 | "CDCCCCD", 133 | "CDCCCD", 134 | "CDCCD", 135 | "CDCCDD", 136 | "CDCCJD", 137 | "CDCD", 138 | "CDCDD", 139 | "CDCGD", 140 | "CDCGPCCD", 141 | "CDCJD", 142 | "CDCLD", 143 | "CDCWIWD", 144 | "CDD", 145 | "CDDCCD", 146 | "CDDCCDD", 147 | "CDDCD", 148 | "CDDD", 149 | "CDFD", 150 | "CDFPCCD", 151 | "CDGCD", 152 | "CDGCICD", 153 | "CDGD", 154 | "CDICD", 155 | "CDID", 156 | "CDILLCCD", 157 | "CDJCCD", 158 | "CDJCD", 159 | "CDJD", 160 | "CDJLD", 161 | "CDLGCD", 162 | "CDLJD", 163 | "CDMCD", 164 | "CDPCCCCD", 165 | "CDPCCD", 166 | "CDPD", 167 | "CDPPD", 168 | "CFCCD", 169 | "CFCPD", 170 | "CFD", 171 | "CFPD", 172 | "CGCCCD", 173 | "CGCCD", 174 | "CGCD", 175 | "CGCDCD", 176 | "CGCDD", 177 | "CGD", 178 | "CGDCD", 179 | "CGDD", 180 | "CGDDCCD", 181 | "CGDDD", 182 | "CGDDID", 183 | "CGDJD", 184 | "CGDMD", 185 | "CGFD", 186 | "CGGCCCD", 187 | "CGGCCD", 188 | "CGGCD", 189 | "CGGD", 190 | "CGGGD", 191 | "CGGGDD", 192 | "CGGICD", 193 | "CGGJD", 194 | "CGICD", 195 | "CGID", 196 | 
"CGIJD", 197 | "CGJD", 198 | "CGMD", 199 | "CGPJD", 200 | "CICCCCD", 201 | "CICCD", 202 | "CICD", 203 | "CICDCD", 204 | "CICDD", 205 | "CICWGWD", 206 | "CID", 207 | "CIDD", 208 | "CIGCD", 209 | "CIGD", 210 | "CIID", 211 | "CILCD", 212 | "CIMD", 213 | "CJCCCCCD", 214 | "CJCCCD", 215 | "CJCCCDD", 216 | "CJCCD", 217 | "CJCCMD", 218 | "CJCD", 219 | "CJCDD", 220 | "CJCGCCD", 221 | "CJCGPJD", 222 | "CJCMD", 223 | "CJCPCCCD", 224 | "CJCPD", 225 | "CJD", 226 | "CJDCCCCD", 227 | "CJDCCJD", 228 | "CJDCD", 229 | "CJDD", 230 | "CJDFD", 231 | "CJDPD", 232 | "CJFCD", 233 | "CJFD", 234 | "CJGD", 235 | "CJGLD", 236 | "CJGPCJD", 237 | "CJID", 238 | "CJJCCD", 239 | "CJJD", 240 | "CJJJD", 241 | "CJJLD", 242 | "CJKD", 243 | "CJLCCD", 244 | "CJMCD", 245 | "CJMD", 246 | "CJPD", 247 | "CJWCCWCGJD", 248 | "CJWD", 249 | "CJWPMWCGD", 250 | "CKCD", 251 | "CKD", 252 | "CKJCDCD", 253 | "CKJPD", 254 | "CLCCCD", 255 | "CLCCD", 256 | "CLCCGCD", 257 | "CLCD", 258 | "CLD", 259 | "CLDFD", 260 | "CLID", 261 | "CLPCD", 262 | "CMCD", 263 | "CMCDD", 264 | "CMCGD", 265 | "CMD", 266 | "CMDCD", 267 | "CMDD", 268 | "CMMD", 269 | "CMMDCCD", 270 | "CMPD", 271 | "CPCCCCCCCD", 272 | "CPCCCCD", 273 | "CPCCCD", 274 | "CPCCD", 275 | "CPCD", 276 | "CPCDD", 277 | "CPCPD", 278 | "CPD", 279 | "CPDCCD", 280 | "CPDCD", 281 | "CPDD", 282 | "CPDGD", 283 | "CPDWGWD", 284 | "CPGCD", 285 | "CPGD", 286 | "CPID", 287 | "CPJCD", 288 | "CPJD", 289 | "CPJPD", 290 | "CPMD", 291 | "CPPD", 292 | "CWCD", 293 | "CWCGWCCD", 294 | "CWCWD", 295 | "CWDWDD", 296 | "CWGWCCD", 297 | "CWGWCD", 298 | "CWPWD", 299 | "DCCCCCD", 300 | "DCCCCD", 301 | "DCCCCDCCD", 302 | "DCCCD", 303 | "DCCD", 304 | "DCD", 305 | "DCDD", 306 | "DCGCD", 307 | "DCJD", 308 | "DCPD", 309 | "DD", 310 | "DDCCD", 311 | "DDCD", 312 | "DDD", 313 | "DDICCD", 314 | "DFD", 315 | "DGCCD", 316 | "DGCD", 317 | "DGD", 318 | "DGDCD", 319 | "DGDD", 320 | "DGDPD", 321 | "DGGD", 322 | "DICCCD", 323 | "DICD", 324 | "DID", 325 | "DIICD", 326 | "DJCCD", 327 | "DJCD", 328 | "DJD", 329 | "DLCCD", 330 | "DLCD", 331 | "DLD", 332 | "DMCD", 333 | "DMD", 334 | "DMMCD", 335 | "DPD", 336 | "DPMMCCD", 337 | "FCCCCCD", 338 | "FCCCCD", 339 | "FCCCD", 340 | "FCCCPCD", 341 | "FCCD", 342 | "FCCGD", 343 | "FCCID", 344 | "FCCPD", 345 | "FCCWGWD", 346 | "FCD", 347 | "FCDCD", 348 | "FCDD", 349 | "FCDFD", 350 | "FCFCD", 351 | "FCFPD", 352 | "FCGCCD", 353 | "FCGCD", 354 | "FCGD", 355 | "FCID", 356 | "FCIJJD", 357 | "FCJCD", 358 | "FCJD", 359 | "FCPD", 360 | "FCPGCD", 361 | "FCWGWD", 362 | "FD", 363 | "FDCD", 364 | "FDD", 365 | "FDFD", 366 | "FDGCCD", 367 | "FDID", 368 | "FDLCD", 369 | "FFCCD", 370 | "FFCD", 371 | "FFCKFCCD", 372 | "FFCLLD", 373 | "FFD", 374 | "FFFD", 375 | "FFGCCD", 376 | "FFGD", 377 | "FFJCD", 378 | "FFJD", 379 | "FFJPCD", 380 | "FFPD", 381 | "FGCCD", 382 | "FGCD", 383 | "FGCGCGCJCD", 384 | "FGD", 385 | "FGDD", 386 | "FGFD", 387 | "FGJCCD", 388 | "FICCD", 389 | "FICD", 390 | "FICDD", 391 | "FICGD", 392 | "FICID", 393 | "FID", 394 | "FIDCD", 395 | "FIDD", 396 | "FIFPD", 397 | "FIID", 398 | "FIJCD", 399 | "FIJD", 400 | "FJCCD", 401 | "FJCD", 402 | "FJCDD", 403 | "FJD", 404 | "FJDCD", 405 | "FJDD", 406 | "FJGD", 407 | "FJJCCD", 408 | "FJJCD", 409 | "FJJCLCD", 410 | "FJJD", 411 | "FJJJCCD", 412 | "FJJJD", 413 | "FJJJICCD", 414 | "FJJLJLCD", 415 | "FJPJD", 416 | "FKCD", 417 | "FKCJD", 418 | "FLD", 419 | "FLPCD", 420 | "FMD", 421 | "FPCCCD", 422 | "FPCD", 423 | "FPD", 424 | "FPFD", 425 | "FPFDD", 426 | "FPID", 427 | "FPJCCD", 428 | "FPJCD", 429 | "FPPCD", 430 | "FPPD", 431 | "FPPDLD", 432 | "FWCCCWCD", 433 | "FWCCCWD", 
434 | "FWDWD", 435 | "FWFD", 436 | "FWFWCCCWD", 437 | "FWGJCD", 438 | "FWGWCD", 439 | "GCCCCCCCD", 440 | "GCCCCCCD", 441 | "GCCCCCD", 442 | "GCCCCCDCD", 443 | "GCCCCCDD", 444 | "GCCCCD", 445 | "GCCCCDCCD", 446 | "GCCCCDD", 447 | "GCCCCGD", 448 | "GCCCCJD", 449 | "GCCCCPD", 450 | "GCCCCWDWD", 451 | "GCCCD", 452 | "GCCCDCCCD", 453 | "GCCCDCCCDD", 454 | "GCCCDCCD", 455 | "GCCCDCD", 456 | "GCCCDD", 457 | "GCCCDDJD", 458 | "GCCCDID", 459 | "GCCCDMCD", 460 | "GCCCDPD", 461 | "GCCCDWGCDWD", 462 | "GCCCFCD", 463 | "GCCCGD", 464 | "GCCCICD", 465 | "GCCCID", 466 | "GCCCJCD", 467 | "GCCCJD", 468 | "GCCCJGD", 469 | "GCCCLD", 470 | "GCCCMD", 471 | "GCCCPCCD", 472 | "GCCCWDWD", 473 | "GCCD", 474 | "GCCDCCCCD", 475 | "GCCDCCCD", 476 | "GCCDCCCDCD", 477 | "GCCDCCD", 478 | "GCCDCD", 479 | "GCCDCID", 480 | "GCCDCJCD", 481 | "GCCDCPCD", 482 | "GCCDD", 483 | "GCCDDCCCD", 484 | "GCCDDCCD", 485 | "GCCDDD", 486 | "GCCDFD", 487 | "GCCDGCCD", 488 | "GCCDGD", 489 | "GCCDGGDCD", 490 | "GCCDID", 491 | "GCCDJCD", 492 | "GCCDJD", 493 | "GCCDLDD", 494 | "GCCDLJCD", 495 | "GCCDMJD", 496 | "GCCDMJMMCD", 497 | "GCCDMJMMD", 498 | "GCCDMMD", 499 | "GCCDPD", 500 | "GCCFCD", 501 | "GCCFDD", 502 | "GCCFJPD", 503 | "GCCFPD", 504 | "GCCGCCCD", 505 | "GCCGCCD", 506 | "GCCGCD", 507 | "GCCGCDD", 508 | "GCCGD", 509 | "GCCGGCGD", 510 | "GCCGGDD", 511 | "GCCICCDCCD", 512 | "GCCICD", 513 | "GCCID", 514 | "GCCIDD", 515 | "GCCJCCCD", 516 | "GCCJCCCID", 517 | "GCCJCCD", 518 | "GCCJCD", 519 | "GCCJCJD", 520 | "GCCJD", 521 | "GCCJICD", 522 | "GCCJID", 523 | "GCCJPCD", 524 | "GCCJPD", 525 | "GCCKD", 526 | "GCCLCCD", 527 | "GCCLCD", 528 | "GCCLCGCD", 529 | "GCCLD", 530 | "GCCMCD", 531 | "GCCMD", 532 | "GCCMPD", 533 | "GCCPCCCCD", 534 | "GCCPCCCID", 535 | "GCCPCCD", 536 | "GCCPCD", 537 | "GCCPD", 538 | "GCCPDD", 539 | "GCCPFWCJD", 540 | "GCCPJD", 541 | "GCCWCCWCD", 542 | "GCCWCDWCD", 543 | "GCCWDWCCD", 544 | "GCCWDWD", 545 | "GCD", 546 | "GCDCCCCD", 547 | "GCDCCCCPD", 548 | "GCDCCCD", 549 | "GCDCCD", 550 | "GCDCCDCD", 551 | "GCDCCDD", 552 | "GCDCCDID", 553 | "GCDCCJCD", 554 | "GCDCCJD", 555 | "GCDCD", 556 | "GCDCDD", 557 | "GCDCDICD", 558 | "GCDCGCD", 559 | "GCDCGD", 560 | "GCDCGMCD", 561 | "GCDCID", 562 | "GCDCJCD", 563 | "GCDCJD", 564 | "GCDCLDD", 565 | "GCDCMCD", 566 | "GCDCMD", 567 | "GCDCMDCD", 568 | "GCDCMDD", 569 | "GCDCMDID", 570 | "GCDCPD", 571 | "GCDD", 572 | "GCDDCD", 573 | "GCDDD", 574 | "GCDDMCD", 575 | "GCDFD", 576 | "GCDFGCD", 577 | "GCDFWFD", 578 | "GCDGCCCCCD", 579 | "GCDGCCD", 580 | "GCDGCD", 581 | "GCDGD", 582 | "GCDGDD", 583 | "GCDGGD", 584 | "GCDGLCCD", 585 | "GCDGLJPCD", 586 | "GCDICCCCD", 587 | "GCDICCD", 588 | "GCDICD", 589 | "GCDID", 590 | "GCDIDD", 591 | "GCDJCCD", 592 | "GCDJCD", 593 | "GCDJCDGPD", 594 | "GCDJD", 595 | "GCDJJD", 596 | "GCDKCDCD", 597 | "GCDLCCCD", 598 | "GCDLD", 599 | "GCDLGCCCCD", 600 | "GCDLGCD", 601 | "GCDLPD", 602 | "GCDMCD", 603 | "GCDMCDD", 604 | "GCDMD", 605 | "GCDMDD", 606 | "GCDMJD", 607 | "GCDPCD", 608 | "GCDPD", 609 | "GCDWFWD", 610 | "GCDWGWCD", 611 | "GCDWGWD", 612 | "GCFCCD", 613 | "GCFCCJFGDD", 614 | "GCFCD", 615 | "GCFD", 616 | "GCFDD", 617 | "GCFFD", 618 | "GCFID", 619 | "GCFJCCD", 620 | "GCFPCD", 621 | "GCFPD", 622 | "GCFWGCCD", 623 | "GCFWGCCDD", 624 | "GCFWGJCD", 625 | "GCGCCCD", 626 | "GCGCCD", 627 | "GCGCD", 628 | "GCGCID", 629 | "GCGCLD", 630 | "GCGCPPCCD", 631 | "GCGD", 632 | "GCGDD", 633 | "GCGGCD", 634 | "GCGGCGD", 635 | "GCGGD", 636 | "GCGICD", 637 | "GCGID", 638 | "GCGJCCD", 639 | "GCGPCCD", 640 | "GCICCCCD", 641 | "GCICCCD", 642 | "GCICCD", 643 | "GCICD", 644 | "GCICDD", 
645 | "GCID", 646 | "GCIDD", 647 | "GCIDID", 648 | "GCIFCCD", 649 | "GCIID", 650 | "GCIJCD", 651 | "GCIJD", 652 | "GCIJICD", 653 | "GCIPCD", 654 | "GCIPD", 655 | "GCIWGIIWD", 656 | "GCJCCCCD", 657 | "GCJCCCD", 658 | "GCJCCD", 659 | "GCJCD", 660 | "GCJCGD", 661 | "GCJCID", 662 | "GCJCIID", 663 | "GCJCPD", 664 | "GCJD", 665 | "GCJDCCD", 666 | "GCJDCD", 667 | "GCJDD", 668 | "GCJDID", 669 | "GCJFD", 670 | "GCJGD", 671 | "GCJICD", 672 | "GCJID", 673 | "GCJJCCD", 674 | "GCJJCD", 675 | "GCJJD", 676 | "GCJJGD", 677 | "GCJKCD", 678 | "GCJLCCD", 679 | "GCJMD", 680 | "GCJPCCGJLFD", 681 | "GCJPD", 682 | "GCJWCCJCD", 683 | "GCKCCD", 684 | "GCKD", 685 | "GCLCCCD", 686 | "GCLCCD", 687 | "GCLCD", 688 | "GCLD", 689 | "GCLDD", 690 | "GCLGGCD", 691 | "GCMCCD", 692 | "GCMCD", 693 | "GCMD", 694 | "GCMDD", 695 | "GCMPCD", 696 | "GCMPMD", 697 | "GCPCCCCD", 698 | "GCPCCCD", 699 | "GCPCCD", 700 | "GCPCCDD", 701 | "GCPCD", 702 | "GCPCDD", 703 | "GCPCKCD", 704 | "GCPD", 705 | "GCPDCCD", 706 | "GCPDD", 707 | "GCPFD", 708 | "GCPICCCD", 709 | "GCPJCCD", 710 | "GCPJCD", 711 | "GCPJD", 712 | "GCPJDCD", 713 | "GCPJJCD", 714 | "GCPJJDD", 715 | "GCPJPD", 716 | "GCPPCCD", 717 | "GCPPD", 718 | "GCPPPD", 719 | "GCWCWCJD", 720 | "GCWCWD", 721 | "GCWDWCDD", 722 | "GCWDWD", 723 | "GCWGWDD", 724 | "GD", 725 | "GDCCCCCCD", 726 | "GDCCCCCD", 727 | "GDCCCCD", 728 | "GDCCCCPD", 729 | "GDCCCD", 730 | "GDCCCDD", 731 | "GDCCCGCCD", 732 | "GDCCCJCD", 733 | "GDCCCJD", 734 | "GDCCCJDCD", 735 | "GDCCD", 736 | "GDCCDCD", 737 | "GDCCDCDD", 738 | "GDCCDD", 739 | "GDCCID", 740 | "GDCCJD", 741 | "GDCCPCD", 742 | "GDCD", 743 | "GDCDCCD", 744 | "GDCDCD", 745 | "GDCDD", 746 | "GDCDICD", 747 | "GDCDPD", 748 | "GDCFD", 749 | "GDCGCCD", 750 | "GDCGD", 751 | "GDCGPPCCD", 752 | "GDCID", 753 | "GDCIDD", 754 | "GDCJCCD", 755 | "GDCJD", 756 | "GDCLD", 757 | "GDCMD", 758 | "GDCPD", 759 | "GDCPID", 760 | "GDCPJD", 761 | "GDD", 762 | "GDDCCCCD", 763 | "GDDCCCD", 764 | "GDDCCD", 765 | "GDDCD", 766 | "GDDCDD", 767 | "GDDCFD", 768 | "GDDCFDCD", 769 | "GDDCMD", 770 | "GDDD", 771 | "GDDDCD", 772 | "GDDID", 773 | "GDDPPD", 774 | "GDDPPLD", 775 | "GDFCCD", 776 | "GDFCD", 777 | "GDFD", 778 | "GDFFD", 779 | "GDFGD", 780 | "GDGCCCD", 781 | "GDGCCD", 782 | "GDGCD", 783 | "GDGD", 784 | "GDGDCD", 785 | "GDGDD", 786 | "GDGDFID", 787 | "GDGJCCD", 788 | "GDGMD", 789 | "GDICCD", 790 | "GDICD", 791 | "GDID", 792 | "GDIDCD", 793 | "GDIDD", 794 | "GDIGCD", 795 | "GDIID", 796 | "GDIPCD", 797 | "GDJCCCD", 798 | "GDJCCD", 799 | "GDJCD", 800 | "GDJD", 801 | "GDJICD", 802 | "GDJJD", 803 | "GDJJJD", 804 | "GDJPCD", 805 | "GDJPDD", 806 | "GDLCCCCCD", 807 | "GDLCID", 808 | "GDLD", 809 | "GDLJD", 810 | "GDLJDD", 811 | "GDMCD", 812 | "GDMD", 813 | "GDMDCD", 814 | "GDMDD", 815 | "GDMJD", 816 | "GDMJMMD", 817 | "GDMPD", 818 | "GDPCCCCCD", 819 | "GDPCCD", 820 | "GDPCD", 821 | "GDPD", 822 | "GDPGCD", 823 | "GDPID", 824 | "GDPJCD", 825 | "GDPJD", 826 | "GDPPD", 827 | "GDPPJD", 828 | "GDWDWCCD", 829 | "GDWDWCCDD", 830 | "GDWDWD", 831 | "GDWFWD", 832 | "GDWGWD", 833 | "GFCCCCCD", 834 | "GFCCCCD", 835 | "GFCCCCJD", 836 | "GFCCCD", 837 | "GFCCCID", 838 | "GFCCD", 839 | "GFCCDD", 840 | "GFCCFCD", 841 | "GFCCPD", 842 | "GFCCPGD", 843 | "GFCD", 844 | "GFCDCD", 845 | "GFCDD", 846 | "GFCID", 847 | "GFCJCD", 848 | "GFCJD", 849 | "GFCPCCD", 850 | "GFCPCD", 851 | "GFCPD", 852 | "GFCPJD", 853 | "GFCPJPD", 854 | "GFD", 855 | "GFDCCCD", 856 | "GFDCD", 857 | "GFDD", 858 | "GFFCCD", 859 | "GFFCD", 860 | "GFFD", 861 | "GFFPCGCD", 862 | "GFGCD", 863 | "GFGCID", 864 | "GFGD", 865 | "GFGJCD", 866 | "GFICCD", 867 | 
"GFICD", 868 | "GFID", 869 | "GFIICD", 870 | "GFJCCCD", 871 | "GFJCCD", 872 | "GFJCD", 873 | "GFJCDCD", 874 | "GFJD", 875 | "GFJJCCD", 876 | "GFJJD", 877 | "GFJJJCCD", 878 | "GFJJLJCLCD", 879 | "GFLD", 880 | "GFLPD", 881 | "GFMCD", 882 | "GFPCD", 883 | "GFPD", 884 | "GFPJCD", 885 | "GFPJD", 886 | "GFPJPD", 887 | "GFPPCCCD", 888 | "GFPPD", 889 | "GFWCJCPCCCWCCD", 890 | "GFWGWCD", 891 | "GGCCCCCD", 892 | "GGCCCCD", 893 | "GGCCCD", 894 | "GGCCCICD", 895 | "GGCCCID", 896 | "GGCCCWDWD", 897 | "GGCCD", 898 | "GGCCDCD", 899 | "GGCCDD", 900 | "GGCCGCD", 901 | "GGCCGD", 902 | "GGCCGJD", 903 | "GGCCJCD", 904 | "GGCCJD", 905 | "GGCD", 906 | "GGCDCCCCCD", 907 | "GGCDCCD", 908 | "GGCDCD", 909 | "GGCDD", 910 | "GGCDJD", 911 | "GGCFCCFCPD", 912 | "GGCFD", 913 | "GGCFJD", 914 | "GGCGCCCD", 915 | "GGCGCD", 916 | "GGCGD", 917 | "GGCGGD", 918 | "GGCICLCD", 919 | "GGCID", 920 | "GGCIJCD", 921 | "GGCJCCD", 922 | "GGCJCD", 923 | "GGCJD", 924 | "GGCJDDCD", 925 | "GGCJJCCD", 926 | "GGCJJD", 927 | "GGCJPCICCCD", 928 | "GGCJPD", 929 | "GGCLCD", 930 | "GGCLD", 931 | "GGCMD", 932 | "GGCPCCD", 933 | "GGCPCD", 934 | "GGCPD", 935 | "GGD", 936 | "GGDCCCD", 937 | "GGDCCD", 938 | "GGDCD", 939 | "GGDD", 940 | "GGDDCCD", 941 | "GGDDCD", 942 | "GGDDD", 943 | "GGDFCD", 944 | "GGDFD", 945 | "GGDGD", 946 | "GGDID", 947 | "GGDJCD", 948 | "GGDJD", 949 | "GGDJJD", 950 | "GGDPPJD", 951 | "GGFCCCD", 952 | "GGFCCD", 953 | "GGFCD", 954 | "GGFD", 955 | "GGFDD", 956 | "GGFFCD", 957 | "GGFFD", 958 | "GGFFDCD", 959 | "GGFFDD", 960 | "GGFGD", 961 | "GGFJCCD", 962 | "GGFJD", 963 | "GGFJDD", 964 | "GGFJJD", 965 | "GGFLD", 966 | "GGFPCFPCD", 967 | "GGGCCCCD", 968 | "GGGCCCD", 969 | "GGGCCD", 970 | "GGGCD", 971 | "GGGCDD", 972 | "GGGCGCD", 973 | "GGGCGD", 974 | "GGGCID", 975 | "GGGCJD", 976 | "GGGD", 977 | "GGGDCD", 978 | "GGGDD", 979 | "GGGFD", 980 | "GGGGCD", 981 | "GGGGD", 982 | "GGGGFJD", 983 | "GGGGICD", 984 | "GGGGJD", 985 | "GGGGJPD", 986 | "GGGGLD", 987 | "GGGGPCD", 988 | "GGGGPPD", 989 | "GGGICD", 990 | "GGGID", 991 | "GGGIDID", 992 | "GGGIGCJD", 993 | "GGGIJD", 994 | "GGGJCD", 995 | "GGGJD", 996 | "GGGJJCJD", 997 | "GGGJJD", 998 | "GGGJPCCD", 999 | "GGGLD", 1000 | "GGGMD", 1001 | "GGGPJD", 1002 | "GGGWICWD", 1003 | "GGICCCCD", 1004 | "GGICCCD", 1005 | "GGICCD", 1006 | "GGICCGD", 1007 | "GGICCLD", 1008 | "GGICCPCCD", 1009 | "GGICD", 1010 | "GGICGCCCD", 1011 | "GGICID", 1012 | "GGICJD", 1013 | "GGID", 1014 | "GGIDCD", 1015 | "GGIDD", 1016 | "GGIFD", 1017 | "GGIFJCD", 1018 | "GGIFPD", 1019 | "GGIGCCD", 1020 | "GGIGD", 1021 | "GGIICD", 1022 | "GGIID", 1023 | "GGIIPID", 1024 | "GGIJCCD", 1025 | "GGIJD", 1026 | "GGIPCD", 1027 | "GGIPD", 1028 | "GGIPDD", 1029 | "GGJCCCD", 1030 | "GGJCCD", 1031 | "GGJCCPCJCCD", 1032 | "GGJCD", 1033 | "GGJCWDWD", 1034 | "GGJD", 1035 | "GGJGCCCD", 1036 | "GGJGCCD", 1037 | "GGJGD", 1038 | "GGJJD", 1039 | "GGJJPCD", 1040 | "GGJLD", 1041 | "GGJPD", 1042 | "GGJPDD", 1043 | "GGKD", 1044 | "GGKGD", 1045 | "GGLCCCD", 1046 | "GGLCD", 1047 | "GGLCDD", 1048 | "GGLCJD", 1049 | "GGLCPD", 1050 | "GGLD", 1051 | "GGLFD", 1052 | "GGLID", 1053 | "GGLJD", 1054 | "GGLLFD", 1055 | "GGLPD", 1056 | "GGMCD", 1057 | "GGMCDD", 1058 | "GGMD", 1059 | "GGMJCD", 1060 | "GGMLD", 1061 | "GGMPCCD", 1062 | "GGPCCCD", 1063 | "GGPCCD", 1064 | "GGPCD", 1065 | "GGPCJCD", 1066 | "GGPD", 1067 | "GGPFD", 1068 | "GGPICD", 1069 | "GGPJCCCCD", 1070 | "GGPJCD", 1071 | "GGPJCDD", 1072 | "GGPJD", 1073 | "GGPLD", 1074 | "GGPPCCD", 1075 | "GGPPCD", 1076 | "GGPPD", 1077 | "GGPPJJD", 1078 | "GGPPPCD", 1079 | "GGWPCGWPJD", 1080 | "GICCCCCCD", 1081 | "GICCCCCD", 
1082 | "GICCCCD", 1083 | "GICCCD", 1084 | "GICCCDD", 1085 | "GICCCJCD", 1086 | "GICCD", 1087 | "GICCDD", 1088 | "GICCJD", 1089 | "GICCLDD", 1090 | "GICCPD", 1091 | "GICD", 1092 | "GICDCCCCD", 1093 | "GICDCCD", 1094 | "GICDCD", 1095 | "GICDD", 1096 | "GICDLPD", 1097 | "GICDWCCWD", 1098 | "GICGCCCCD", 1099 | "GICGCCD", 1100 | "GICGCJICD", 1101 | "GICGD", 1102 | "GICGGD", 1103 | "GICGMMD", 1104 | "GICGPCJD", 1105 | "GICICCD", 1106 | "GICICD", 1107 | "GICID", 1108 | "GICIGD", 1109 | "GICIID", 1110 | "GICJCCD", 1111 | "GICJCD", 1112 | "GICJD", 1113 | "GICPCCCCD", 1114 | "GICPD", 1115 | "GICPICD", 1116 | "GICPJD", 1117 | "GID", 1118 | "GIDCCCJCD", 1119 | "GIDCCD", 1120 | "GIDCD", 1121 | "GIDD", 1122 | "GIDDD", 1123 | "GIDICCD", 1124 | "GIDID", 1125 | "GIDLPCD", 1126 | "GIFCCD", 1127 | "GIFD", 1128 | "GIFICD", 1129 | "GIFWFD", 1130 | "GIGCCD", 1131 | "GIGCD", 1132 | "GIGCGCD", 1133 | "GIGCJD", 1134 | "GIGCPD", 1135 | "GIGD", 1136 | "GIGGD", 1137 | "GIGICD", 1138 | "GIGID", 1139 | "GIGJPCD", 1140 | "GIICCCCD", 1141 | "GIICCD", 1142 | "GIICD", 1143 | "GIID", 1144 | "GIIGD", 1145 | "GIIID", 1146 | "GIIJCCCD", 1147 | "GIIJCD", 1148 | "GIJCCCCCD", 1149 | "GIJCCCCD", 1150 | "GIJCCCD", 1151 | "GIJCCD", 1152 | "GIJCD", 1153 | "GIJCPD", 1154 | "GIJD", 1155 | "GIJDD", 1156 | "GIJID", 1157 | "GIJJCCD", 1158 | "GIJJCD", 1159 | "GIJLD", 1160 | "GIJPD", 1161 | "GIJPDCD", 1162 | "GIKD", 1163 | "GILCCCCDD", 1164 | "GILCCD", 1165 | "GILCD", 1166 | "GILD", 1167 | "GILID", 1168 | "GILPMD", 1169 | "GIMCCD", 1170 | "GIMCD", 1171 | "GIMD", 1172 | "GIMJCD", 1173 | "GIMJD", 1174 | "GIMPCCD", 1175 | "GIPCCCCD", 1176 | "GIPCCCD", 1177 | "GIPCCD", 1178 | "GIPCD", 1179 | "GIPCMD", 1180 | "GIPD", 1181 | "GIPDCD", 1182 | "GIPDD", 1183 | "GIPICD", 1184 | "GIPJCCD", 1185 | "GIPJCD", 1186 | "GIPPCD", 1187 | "GIPPD", 1188 | "GIWDCCWCD", 1189 | "GIWDWD", 1190 | "GIWGWCD", 1191 | "GJCCCCCD", 1192 | "GJCCCCD", 1193 | "GJCCCD", 1194 | "GJCCCDCDCD", 1195 | "GJCCCDD", 1196 | "GJCCD", 1197 | "GJCCDCD", 1198 | "GJCCDD", 1199 | "GJCCFD", 1200 | "GJCCGJPD", 1201 | "GJCCICCD", 1202 | "GJCCJCD", 1203 | "GJCCJD", 1204 | "GJCD", 1205 | "GJCDCCD", 1206 | "GJCDCJCCD", 1207 | "GJCDD", 1208 | "GJCDJCD", 1209 | "GJCDPD", 1210 | "GJCGCD", 1211 | "GJCGD", 1212 | "GJCGPJCCD", 1213 | "GJCICCCD", 1214 | "GJCICD", 1215 | "GJCID", 1216 | "GJCJCCD", 1217 | "GJCJCD", 1218 | "GJCJD", 1219 | "GJCJJCCCCD", 1220 | "GJCJJCD", 1221 | "GJCJPD", 1222 | "GJCJPPCD", 1223 | "GJCLD", 1224 | "GJCLJCCCD", 1225 | "GJCMD", 1226 | "GJCPD", 1227 | "GJCPJD", 1228 | "GJCPPD", 1229 | "GJD", 1230 | "GJDCCCD", 1231 | "GJDCCD", 1232 | "GJDCD", 1233 | "GJDD", 1234 | "GJDICD", 1235 | "GJDID", 1236 | "GJDLCD", 1237 | "GJDPCD", 1238 | "GJFCCD", 1239 | "GJFCD", 1240 | "GJFD", 1241 | "GJFFD", 1242 | "GJFGD", 1243 | "GJFICD", 1244 | "GJGCD", 1245 | "GJGD", 1246 | "GJGPCD", 1247 | "GJICCCD", 1248 | "GJICCD", 1249 | "GJICD", 1250 | "GJID", 1251 | "GJIID", 1252 | "GJJCCCD", 1253 | "GJJCCD", 1254 | "GJJCCDD", 1255 | "GJJCD", 1256 | "GJJCJCCCD", 1257 | "GJJCJCCD", 1258 | "GJJCPCD", 1259 | "GJJD", 1260 | "GJJDCD", 1261 | "GJJDD", 1262 | "GJJFCCD", 1263 | "GJJFD", 1264 | "GJJGD", 1265 | "GJJJCD", 1266 | "GJJJD", 1267 | "GJJJICD", 1268 | "GJJJJCCD", 1269 | "GJJJJD", 1270 | "GJJPCCCD", 1271 | "GJJPCCD", 1272 | "GJJPCID", 1273 | "GJJPPD", 1274 | "GJLCCCCD", 1275 | "GJLCD", 1276 | "GJLCDD", 1277 | "GJLD", 1278 | "GJMCCD", 1279 | "GJMD", 1280 | "GJPCCCCD", 1281 | "GJPCCCD", 1282 | "GJPCCD", 1283 | "GJPCD", 1284 | "GJPCDD", 1285 | "GJPCJCD", 1286 | "GJPCLCD", 1287 | "GJPCMD", 1288 | "GJPD", 1289 | 
"GJPDD", 1290 | "GJPGCCD", 1291 | "GJPGD", 1292 | "GJPICCD", 1293 | "GJPICD", 1294 | "GJPICDD", 1295 | "GJPJCCD", 1296 | "GJPJD", 1297 | "GJPJPD", 1298 | "GJPLCD", 1299 | "GJPPJD", 1300 | "GKCCCD", 1301 | "GKCCD", 1302 | "GKCCPD", 1303 | "GKCD", 1304 | "GKCDCD", 1305 | "GKCDD", 1306 | "GKCDJCD", 1307 | "GKCJCD", 1308 | "GKCMD", 1309 | "GKD", 1310 | "GKDD", 1311 | "GKJJD", 1312 | "GLCCCCCCD", 1313 | "GLCCCCD", 1314 | "GLCCCD", 1315 | "GLCCD", 1316 | "GLCCDD", 1317 | "GLCCJCCCD", 1318 | "GLCCJCCD", 1319 | "GLCD", 1320 | "GLCDD", 1321 | "GLCDGCCD", 1322 | "GLCGCJCD", 1323 | "GLCGD", 1324 | "GLCGDD", 1325 | "GLCJD", 1326 | "GLCJJCCCCCD", 1327 | "GLCLD", 1328 | "GLCMD", 1329 | "GLCPCCD", 1330 | "GLCPD", 1331 | "GLD", 1332 | "GLDCD", 1333 | "GLDCMD", 1334 | "GLDCMDCD", 1335 | "GLDCMDD", 1336 | "GLDD", 1337 | "GLDDCKCD", 1338 | "GLFCD", 1339 | "GLFCFD", 1340 | "GLFGCD", 1341 | "GLGCD", 1342 | "GLGD", 1343 | "GLGPJD", 1344 | "GLICCD", 1345 | "GLICD", 1346 | "GLID", 1347 | "GLJCCCD", 1348 | "GLJCCD", 1349 | "GLJCD", 1350 | "GLJCICCD", 1351 | "GLJD", 1352 | "GLJFCD", 1353 | "GLJGD", 1354 | "GLJICCD", 1355 | "GLJID", 1356 | "GLJJD", 1357 | "GLJPCCD", 1358 | "GLJPCICD", 1359 | "GLJPJCCD", 1360 | "GLJWGWCD", 1361 | "GLLCCCD", 1362 | "GLLCID", 1363 | "GLPCCCD", 1364 | "GLPCCD", 1365 | "GLPCD", 1366 | "GLPCDD", 1367 | "GLPCPCCD", 1368 | "GLPD", 1369 | "GLPDD", 1370 | "GLPGCD", 1371 | "GLPJD", 1372 | "GLPLJCCCD", 1373 | "GLPLJCD", 1374 | "GLPPCCCCD", 1375 | "GLPPCCD", 1376 | "GLPPCD", 1377 | "GMCCCCD", 1378 | "GMCCCD", 1379 | "GMCCD", 1380 | "GMCCID", 1381 | "GMCD", 1382 | "GMCDCCCD", 1383 | "GMCDCCD", 1384 | "GMCDCD", 1385 | "GMCDD", 1386 | "GMCDMCD", 1387 | "GMCGD", 1388 | "GMCJCD", 1389 | "GMCMD", 1390 | "GMCMJD", 1391 | "GMD", 1392 | "GMDCD", 1393 | "GMDD", 1394 | "GMDICD", 1395 | "GMDID", 1396 | "GMGJCD", 1397 | "GMGJJD", 1398 | "GMICD", 1399 | "GMID", 1400 | "GMIPJCCD", 1401 | "GMJCCD", 1402 | "GMJCD", 1403 | "GMJD", 1404 | "GMJDD", 1405 | "GMJICCCD", 1406 | "GMJMJFCD", 1407 | "GMJPCD", 1408 | "GMJPLCCD", 1409 | "GMLD", 1410 | "GMLDCD", 1411 | "GMLGCD", 1412 | "GMLID", 1413 | "GMLLD", 1414 | "GMMCCCD", 1415 | "GMMD", 1416 | "GMMGD", 1417 | "GMMLCCD", 1418 | "GMMPCD", 1419 | "GMMPD", 1420 | "GMPCCD", 1421 | "GMPCD", 1422 | "GMPD", 1423 | "GMPDCD", 1424 | "GMPDD", 1425 | "GMPJCD", 1426 | "GPCCCCCCD", 1427 | "GPCCCCD", 1428 | "GPCCCCID", 1429 | "GPCCCD", 1430 | "GPCCD", 1431 | "GPCCDCCD", 1432 | "GPCCDD", 1433 | "GPCCDDD", 1434 | "GPCD", 1435 | "GPCDCCD", 1436 | "GPCDCD", 1437 | "GPCDD", 1438 | "GPCFDCCD", 1439 | "GPCFDD", 1440 | "GPCGD", 1441 | "GPCICCD", 1442 | "GPCID", 1443 | "GPCIJD", 1444 | "GPCJCCCD", 1445 | "GPCJCCD", 1446 | "GPCJCD", 1447 | "GPCPID", 1448 | "GPCWDWCD", 1449 | "GPD", 1450 | "GPDCCD", 1451 | "GPDCD", 1452 | "GPDD", 1453 | "GPFCCD", 1454 | "GPFCD", 1455 | "GPFD", 1456 | "GPFFCD", 1457 | "GPGCCCD", 1458 | "GPGD", 1459 | "GPGJCJCCCCD", 1460 | "GPGPJD", 1461 | "GPICCCCD", 1462 | "GPICCCD", 1463 | "GPICCD", 1464 | "GPICD", 1465 | "GPID", 1466 | "GPIDCD", 1467 | "GPIDD", 1468 | "GPJCCCCCD", 1469 | "GPJCCCD", 1470 | "GPJCCD", 1471 | "GPJCD", 1472 | "GPJCDD", 1473 | "GPJCJCCD", 1474 | "GPJD", 1475 | "GPJDCCD", 1476 | "GPJDCD", 1477 | "GPJDD", 1478 | "GPJFICD", 1479 | "GPJFID", 1480 | "GPJGD", 1481 | "GPJJCCD", 1482 | "GPJJCD", 1483 | "GPJLCD", 1484 | "GPJWDWD", 1485 | "GPLCWCWCWD", 1486 | "GPLD", 1487 | "GPLJCCD", 1488 | "GPMJCGD", 1489 | "GPMMD", 1490 | "GPMPCCD", 1491 | "GPPCCCCD", 1492 | "GPPCCCD", 1493 | "GPPCCD", 1494 | "GPPCD", 1495 | "GPPCDCCD", 1496 | "GPPCDD", 1497 | "GPPCLD", 
1498 | "GPPD", 1499 | "GPPDCD", 1500 | "GPPDCDD", 1501 | "GPPDD", 1502 | "GPPGCD", 1503 | "GPPICCD", 1504 | "GPPID", 1505 | "GPPJCD", 1506 | "GPPJD", 1507 | "GPPJDD", 1508 | "GPPJJCCCCD", 1509 | "GPPLD", 1510 | "GPPPCCD", 1511 | "GPPPCKCCD", 1512 | "GPPPPCCD", 1513 | "GWCPWD", 1514 | "GWCWCCCD", 1515 | "GWCWCD", 1516 | "GWCWD", 1517 | "GWCWPJCD", 1518 | "GWD", 1519 | "GWFCD", 1520 | "GWGCCCD", 1521 | "GWGCCD", 1522 | "GWGCCWCD", 1523 | "GWGCD", 1524 | "GWGCWD", 1525 | "GWGD", 1526 | "GWGID", 1527 | "GWGWCCCCD", 1528 | "GWGWCCCD", 1529 | "GWGWCD", 1530 | "GWGWICD", 1531 | "GWGWLCD", 1532 | "GWICD", 1533 | "GWICWD", 1534 | "GWIWD", 1535 | "GWJWD", 1536 | "GWLJWCD", 1537 | "GWPD", 1538 | "GWPJD", 1539 | "ICCCCCCD", 1540 | "ICCCCCD", 1541 | "ICCCCD", 1542 | "ICCCCDD", 1543 | "ICCCD", 1544 | "ICCD", 1545 | "ICCDCCD", 1546 | "ICCDCD", 1547 | "ICCDD", 1548 | "ICCGCCD", 1549 | "ICCGCIPD", 1550 | "ICCGD", 1551 | "ICCJD", 1552 | "ICCPD", 1553 | "ICCWDWCD", 1554 | "ICD", 1555 | "ICDD", 1556 | "ICDID", 1557 | "ICFD", 1558 | "ICGCCCD", 1559 | "ICGCD", 1560 | "ICGFD", 1561 | "ICGGCD", 1562 | "ICGLCMD", 1563 | "ICICD", 1564 | "ICID", 1565 | "ICIGD", 1566 | "ICJCD", 1567 | "ICJD", 1568 | "ICJJD", 1569 | "ICLJCD", 1570 | "ICMCCCCD", 1571 | "ICMD", 1572 | "ICPCD", 1573 | "ICPD", 1574 | "ICPPD", 1575 | "ICWGWCD", 1576 | "ICWGWD", 1577 | "ICWGWDCD", 1578 | "ID", 1579 | "IDCCCCD", 1580 | "IDCCCD", 1581 | "IDCCD", 1582 | "IDCCGJID", 1583 | "IDCCICD", 1584 | "IDCCICDID", 1585 | "IDCD", 1586 | "IDCDCD", 1587 | "IDCDD", 1588 | "IDCFCD", 1589 | "IDCGD", 1590 | "IDCICD", 1591 | "IDCID", 1592 | "IDCJD", 1593 | "IDCPCCCCCCD", 1594 | "IDD", 1595 | "IDGCCCD", 1596 | "IDGCD", 1597 | "IDID", 1598 | "IDIDD", 1599 | "IDJCD", 1600 | "IDKCD", 1601 | "IDPD", 1602 | "IDWCWCCDD", 1603 | "IFD", 1604 | "IFWGWCD", 1605 | "IGCCCD", 1606 | "IGCCCDD", 1607 | "IGCCD", 1608 | "IGCD", 1609 | "IGCDCD", 1610 | "IGCDD", 1611 | "IGCGCCD", 1612 | "IGCGCD", 1613 | "IGCID", 1614 | "IGCJD", 1615 | "IGCPD", 1616 | "IGCWJWD", 1617 | "IGD", 1618 | "IGDD", 1619 | "IGFCCD", 1620 | "IGFCD", 1621 | "IGFD", 1622 | "IGGCD", 1623 | "IGID", 1624 | "IGJD", 1625 | "IGLCD", 1626 | "IGLD", 1627 | "IGPCD", 1628 | "IGPCDD", 1629 | "IICCCD", 1630 | "IICCD", 1631 | "IICD", 1632 | "IICGD", 1633 | "IID", 1634 | "IIGD", 1635 | "IIGJCJCD", 1636 | "IIIGCD", 1637 | "IIPCD", 1638 | "IJCCCCD", 1639 | "IJCCCD", 1640 | "IJCCD", 1641 | "IJCD", 1642 | "IJD", 1643 | "IJDCCD", 1644 | "IJGCD", 1645 | "IJGD", 1646 | "IJJCD", 1647 | "IJJD", 1648 | "IJJJCD", 1649 | "IJPCDD", 1650 | "IJWCFIWGD", 1651 | "IJWCFWD", 1652 | "IJWCPWGD", 1653 | "IKCCCD", 1654 | "ILCD", 1655 | "ILD", 1656 | "ILPCD", 1657 | "ILPMD", 1658 | "IMCCD", 1659 | "IMCD", 1660 | "IMD", 1661 | "IMPD", 1662 | "IPCCCD", 1663 | "IPCCD", 1664 | "IPCCID", 1665 | "IPCCJD", 1666 | "IPCD", 1667 | "IPCID", 1668 | "IPCJD", 1669 | "IPCPD", 1670 | "IPD", 1671 | "IPFCD", 1672 | "IPID", 1673 | "IPIJD", 1674 | "IPJCGD", 1675 | "IPJD", 1676 | "IPPCD", 1677 | "JCCCCCCD", 1678 | "JCCCCCD", 1679 | "JCCCCD", 1680 | "JCCCD", 1681 | "JCCCJCD", 1682 | "JCCD", 1683 | "JCCID", 1684 | "JCCJD", 1685 | "JCCMCD", 1686 | "JCD", 1687 | "JCDCCD", 1688 | "JCDCD", 1689 | "JCDD", 1690 | "JCDID", 1691 | "JCFCD", 1692 | "JCGCCCCD", 1693 | "JCGCCCD", 1694 | "JCGCCD", 1695 | "JCGCD", 1696 | "JCGD", 1697 | "JCGJGD", 1698 | "JCICCCD", 1699 | "JCID", 1700 | "JCIDD", 1701 | "JCJCCCD", 1702 | "JCJCCD", 1703 | "JCJCD", 1704 | "JCJD", 1705 | "JCJDD", 1706 | "JCJFD", 1707 | "JCJJPCD", 1708 | "JCJPID", 1709 | "JCJWGWD", 1710 | "JCLD", 1711 | "JCMD", 1712 | 
"JCMPD", 1713 | "JCPJCID", 1714 | "JCPJJCD", 1715 | "JCPPCCCD", 1716 | "JD", 1717 | "JDCD", 1718 | "JDCMD", 1719 | "JDD", 1720 | "JDGD", 1721 | "JDID", 1722 | "JDJD", 1723 | "JDMD", 1724 | "JFCD", 1725 | "JFD", 1726 | "JGCCCD", 1727 | "JGCD", 1728 | "JGD", 1729 | "JGDCJD", 1730 | "JGGD", 1731 | "JGPD", 1732 | "JICCCD", 1733 | "JICD", 1734 | "JID", 1735 | "JIDD", 1736 | "JIID", 1737 | "JIJD", 1738 | "JILD", 1739 | "JJCCCD", 1740 | "JJCCD", 1741 | "JJCCPGD", 1742 | "JJCD", 1743 | "JJD", 1744 | "JJDCJD", 1745 | "JJDD", 1746 | "JJGCCD", 1747 | "JJGD", 1748 | "JJICD", 1749 | "JJID", 1750 | "JJJCCCD", 1751 | "JJJCD", 1752 | "JJJCFCCCD", 1753 | "JJJD", 1754 | "JJJGD", 1755 | "JJMCID", 1756 | "JJPCD", 1757 | "JJPD", 1758 | "JJPPJLCD", 1759 | "JJWFWCCJJD", 1760 | "JJWGWCD", 1761 | "JJWGWCDD", 1762 | "JKCD", 1763 | "JKD", 1764 | "JLCCD", 1765 | "JLCCDD", 1766 | "JLCCJD", 1767 | "JLCD", 1768 | "JLCDD", 1769 | "JLCMD", 1770 | "JLCMDD", 1771 | "JLD", 1772 | "JLDD", 1773 | "JLGCJD", 1774 | "JLGJCCCJD", 1775 | "JLJD", 1776 | "JMCD", 1777 | "JMD", 1778 | "JMJD", 1779 | "JMPD", 1780 | "JPCCD", 1781 | "JPCD", 1782 | "JPCMD", 1783 | "JPCMDPD", 1784 | "JPD", 1785 | "JPDCCCD", 1786 | "JPDD", 1787 | "JPDGCD", 1788 | "JPFCCD", 1789 | "JPFD", 1790 | "JPICD", 1791 | "JPID", 1792 | "JPIID", 1793 | "JPJD", 1794 | "JPJJCCCFPCD", 1795 | "JPMD", 1796 | "JPMDCCD", 1797 | "JPMDD", 1798 | "JPPJD", 1799 | "JPPJLCD", 1800 | "KCCCCCD", 1801 | "KCCCCD", 1802 | "KCCCCDCD", 1803 | "KCCCD", 1804 | "KCCCDCD", 1805 | "KCCCDD", 1806 | "KCCCDDCCCD", 1807 | "KCCCGD", 1808 | "KCCD", 1809 | "KCCDCCD", 1810 | "KCCDCD", 1811 | "KCCJD", 1812 | "KCCJDID", 1813 | "KCCPD", 1814 | "KCD", 1815 | "KCDCCCCD", 1816 | "KCDCCD", 1817 | "KCDCD", 1818 | "KCDD", 1819 | "KCDICD", 1820 | "KCDJD", 1821 | "KCGCCCD", 1822 | "KCGCCCDD", 1823 | "KCGCCD", 1824 | "KCGCD", 1825 | "KCGD", 1826 | "KCGGGD", 1827 | "KCICD", 1828 | "KCID", 1829 | "KCIDCD", 1830 | "KCJCD", 1831 | "KCJD", 1832 | "KCKCD", 1833 | "KCMD", 1834 | "KCMDCD", 1835 | "KCPD", 1836 | "KCWGWD", 1837 | "KD", 1838 | "KDCCCD", 1839 | "KDCD", 1840 | "KDD", 1841 | "KDICD", 1842 | "KDLCCPD", 1843 | "KFCD", 1844 | "KFCDD", 1845 | "KFD", 1846 | "KFWFD", 1847 | "KGCCCD", 1848 | "KGCCD", 1849 | "KGCD", 1850 | "KGCDCCD", 1851 | "KGD", 1852 | "KGDD", 1853 | "KGGD", 1854 | "KGJPD", 1855 | "KICCD", 1856 | "KICD", 1857 | "KICDD", 1858 | "KID", 1859 | "KIDCCD", 1860 | "KIDJCD", 1861 | "KIGID", 1862 | "KIMCD", 1863 | "KIMD", 1864 | "KIWGWD", 1865 | "KJCCD", 1866 | "KJCD", 1867 | "KJD", 1868 | "KJDD", 1869 | "KJICCD", 1870 | "KJJD", 1871 | "KJJDCD", 1872 | "KJJJD", 1873 | "KJPD", 1874 | "KLCCD", 1875 | "KLD", 1876 | "KMCCJCCD", 1877 | "KMCD", 1878 | "KMCDD", 1879 | "KMD", 1880 | "KMDCD", 1881 | "KMDD", 1882 | "KMMD", 1883 | "KMMMD", 1884 | "KPCCCD", 1885 | "KPCCD", 1886 | "KPCD", 1887 | "KPD", 1888 | "KPDD", 1889 | "LCCCCD", 1890 | "LCCCD", 1891 | "LCCD", 1892 | "LCCDD", 1893 | "LCCDJCCD", 1894 | "LCCGD", 1895 | "LCCGID", 1896 | "LCCID", 1897 | "LCCPCD", 1898 | "LCCWGWD", 1899 | "LCD", 1900 | "LCDCCD", 1901 | "LCDCD", 1902 | "LCDCDD", 1903 | "LCDCDIGCD", 1904 | "LCDD", 1905 | "LCDFD", 1906 | "LCDGDD", 1907 | "LCDGID", 1908 | "LCDID", 1909 | "LCDLD", 1910 | "LCDLDCD", 1911 | "LCDLDD", 1912 | "LCDMCDD", 1913 | "LCDPD", 1914 | "LCGD", 1915 | "LCGDD", 1916 | "LCICCWGWD", 1917 | "LCID", 1918 | "LCIGD", 1919 | "LCJCD", 1920 | "LCJD", 1921 | "LCLD", 1922 | "LCMCCD", 1923 | "LCMCDD", 1924 | "LCMCID", 1925 | "LCMCMD", 1926 | "LCMD", 1927 | "LCMJCICD", 1928 | "LCMJD", 1929 | "LCPCJCD", 1930 | "LCPD", 1931 | "LCPMD", 1932 | 
"LCPPCD", 1933 | "LD", 1934 | "LDCCD", 1935 | "LDCD", 1936 | "LDCLCD", 1937 | "LDCLCDCD", 1938 | "LDCPD", 1939 | "LDD", 1940 | "LDDD", 1941 | "LDLCCCCD", 1942 | "LFCD", 1943 | "LFCFD", 1944 | "LFD", 1945 | "LFPPPCCD", 1946 | "LGCD", 1947 | "LGD", 1948 | "LGGCCCD", 1949 | "LGGCD", 1950 | "LGJCD", 1951 | "LGJLCD", 1952 | "LGJLD", 1953 | "LICCCD", 1954 | "LICCD", 1955 | "LICD", 1956 | "LICLD", 1957 | "LID", 1958 | "LIGD", 1959 | "LIPCCCD", 1960 | "LIWGWCCCD", 1961 | "LJCCCCD", 1962 | "LJCCCCWGWD", 1963 | "LJCCCD", 1964 | "LJCCD", 1965 | "LJCCDCCCD", 1966 | "LJCCDCCD", 1967 | "LJCCDCD", 1968 | "LJCCDID", 1969 | "LJCCDJCD", 1970 | "LJCD", 1971 | "LJCDD", 1972 | "LJCGD", 1973 | "LJCJJD", 1974 | "LJCWCWJWCWJD", 1975 | "LJD", 1976 | "LJDCCD", 1977 | "LJDCD", 1978 | "LJDD", 1979 | "LJDJPD", 1980 | "LJDJPDD", 1981 | "LJDJPDID", 1982 | "LJDJPMDD", 1983 | "LJFJJCLCD", 1984 | "LJGD", 1985 | "LJID", 1986 | "LJJCD", 1987 | "LJJD", 1988 | "LJLD", 1989 | "LJMD", 1990 | "LJPCD", 1991 | "LKCD", 1992 | "LLCD", 1993 | "LLD", 1994 | "LLPD", 1995 | "LMCCFCCD", 1996 | "LMCD", 1997 | "LMD", 1998 | "LMID", 1999 | "LPCCCCCD", 2000 | "LPCCCD", 2001 | "LPCCD", 2002 | "LPCD", 2003 | "LPCDD", 2004 | "LPCFPPD", 2005 | "LPCGCCCD", 2006 | "LPCGCCD", 2007 | "LPCGCCDCCD", 2008 | "LPCGD", 2009 | "LPCGDDPD", 2010 | "LPD", 2011 | "LPDD", 2012 | "LPDDD", 2013 | "LPICD", 2014 | "LPID", 2015 | "LPJD", 2016 | "LPMDCCD", 2017 | "LPPJD", 2018 | "MCCCD", 2019 | "MCCD", 2020 | "MCCPD", 2021 | "MCD", 2022 | "MCDCCD", 2023 | "MCDCCDCD", 2024 | "MCDCCDD", 2025 | "MCDCD", 2026 | "MCDCGD", 2027 | "MCDD", 2028 | "MCDFD", 2029 | "MCDFDD", 2030 | "MCDLCD", 2031 | "MCDPPD", 2032 | "MCGCD", 2033 | "MCICD", 2034 | "MCID", 2035 | "MCIDWGWD", 2036 | "MCJD", 2037 | "MCLD", 2038 | "MCPD", 2039 | "MD", 2040 | "MDD", 2041 | "MFD", 2042 | "MGD", 2043 | "MGJD", 2044 | "MGJJD", 2045 | "MICCD", 2046 | "MICD", 2047 | "MID", 2048 | "MIDCCD", 2049 | "MJCCD", 2050 | "MJCD", 2051 | "MJD", 2052 | "MJDD", 2053 | "MLCD", 2054 | "MLD", 2055 | "MLGD", 2056 | "MLGGD", 2057 | "MMCCD", 2058 | "MMCD", 2059 | "MMD", 2060 | "MMMD", 2061 | "MMPD", 2062 | "MPCCD", 2063 | "MPCD", 2064 | "MPD", 2065 | "MPDCD", 2066 | "MPJPD", 2067 | "MPPD", 2068 | "PCCCCCCD", 2069 | "PCCCCCD", 2070 | "PCCCCD", 2071 | "PCCCD", 2072 | "PCCCDD", 2073 | "PCCD", 2074 | "PCCDD", 2075 | "PCCGJGD", 2076 | "PCCID", 2077 | "PCCIDD", 2078 | "PCD", 2079 | "PCDCD", 2080 | "PCDCJCD", 2081 | "PCDD", 2082 | "PCDFCCCD", 2083 | "PCDID", 2084 | "PCGCCD", 2085 | "PCGCD", 2086 | "PCGD", 2087 | "PCID", 2088 | "PCJCD", 2089 | "PCJGD", 2090 | "PCPCCD", 2091 | "PCPD", 2092 | "PD", 2093 | "PDCCD", 2094 | "PDD", 2095 | "PDDD", 2096 | "PFCCD", 2097 | "PFCDD", 2098 | "PFCJCD", 2099 | "PFD", 2100 | "PFFCD", 2101 | "PFPCD", 2102 | "PGCD", 2103 | "PGCJD", 2104 | "PGD", 2105 | "PGDCICD", 2106 | "PGJD", 2107 | "PICCD", 2108 | "PICD", 2109 | "PICDD", 2110 | "PID", 2111 | "PIFD", 2112 | "PIJCCD", 2113 | "PIJD", 2114 | "PJCCCDD", 2115 | "PJCCD", 2116 | "PJCD", 2117 | "PJD", 2118 | "PJDCD", 2119 | "PJDD", 2120 | "PJFD", 2121 | "PJGD", 2122 | "PJICCCPCD", 2123 | "PJID", 2124 | "PJJD", 2125 | "PJJDD", 2126 | "PJJPD", 2127 | "PJLPCD", 2128 | "PJPCD", 2129 | "PJPD", 2130 | "PLD", 2131 | "PLPCD", 2132 | "PMJCD", 2133 | "PPCCCDCD", 2134 | "PPCD", 2135 | "PPCJCCD", 2136 | "PPD", 2137 | "PPDCD", 2138 | "PPFCCD", 2139 | "PPFCD", 2140 | "PPGCID", 2141 | "PPGD", 2142 | "PPGJCCD", 2143 | "PPICCD", 2144 | "PPIGD", 2145 | "PPJCD", 2146 | "PPJD", 2147 | "PPJJD", 2148 | "PPMD", 2149 | "PPPCPD", 2150 | "PPPD", 2151 | "PPPWGWCCD", 2152 | 
"CCCCDID", 2153 | "CCCDFGD", 2154 | "CCCDGCD", 2155 | "CCCDGDD", 2156 | "CCCDWD", 2157 | "CCCGCCD", 2158 | "CCCGCD", 2159 | "CCCWCWD", 2160 | "CCCWGWCCD", 2161 | "CCCWGWCCDWD", 2162 | "CCCWGWD", 2163 | "CCDDGCD", 2164 | "CCDPCCD", 2165 | "CCDWD", 2166 | "CCFGCCCCCD", 2167 | "CCFGFCCCD", 2168 | "CCFPCD", 2169 | "CCGDD", 2170 | "CCGGCCD", 2171 | "CCIDGD", 2172 | "CCKD", 2173 | "CCMIDGCD", 2174 | "CCWD", 2175 | "CCWGWCCCD", 2176 | "CCWGWCD", 2177 | "CCWGWDD", 2178 | "CDWGWDGD", 2179 | "CFCCGWD", 2180 | "CFCD", 2181 | "CFCWGWD", 2182 | "CFGFGFGFGJID", 2183 | "CFJD", 2184 | "CFWGWCCDGCD", 2185 | "CFWGWCJCD", 2186 | "CGCCCCD", 2187 | "CGCCID", 2188 | "CGCCJCCCD", 2189 | "CGCDCCD", 2190 | "CGCFCCD", 2191 | "CGCGCD", 2192 | "CGCID", 2193 | "CGFCCD", 2194 | "CGFCD", 2195 | "CGFDID", 2196 | "CGGCICD", 2197 | "CGGJPD", 2198 | "CGICDGCD", 2199 | "CGICDID", 2200 | "CGIID", 2201 | "CGJCCCD", 2202 | "CGJCCD", 2203 | "CGJCD", 2204 | "CGJCDGD", 2205 | "CGJCDWD", 2206 | "CGJCJCD", 2207 | "CGJDD", 2208 | "CGJDDCCD", 2209 | "CGJGCD", 2210 | "CGJID", 2211 | "CGLCCD", 2212 | "CGPCCD", 2213 | "CGPCD", 2214 | "CGPD", 2215 | "CGPFCCD", 2216 | "CGPICD", 2217 | "CGPID", 2218 | "CGPJCDD", 2219 | "CGPJJJCD", 2220 | "CICCDGD", 2221 | "CICFJGD", 2222 | "CICGFID", 2223 | "CIDCD", 2224 | "CIDGD", 2225 | "CIFID", 2226 | "CIGCCD", 2227 | "CIGMCD", 2228 | "CIICCD", 2229 | "CIICD", 2230 | "CIJCWGWCD", 2231 | "CIJD", 2232 | "CIJWD", 2233 | "CIPCCD", 2234 | "CJCCDFD", 2235 | "CJCGD", 2236 | "CJCID", 2237 | "CJCWCCCD", 2238 | "CJCWGWD", 2239 | "CJGCCCD", 2240 | "CJICD", 2241 | "CJIDD", 2242 | "CJJCD", 2243 | "CJWGCD", 2244 | "CJWGWID", 2245 | "CPCCDGJD", 2246 | "CPCDCCD", 2247 | "CPDFCD", 2248 | "CPGID", 2249 | "CPICD", 2250 | "CPIWGWD", 2251 | "CPJGD", 2252 | "CPPCD", 2253 | "CPWGWDGD", 2254 | "D", 2255 | "FCCCCCCCD", 2256 | "FCCCCGD", 2257 | "FCCCDGD", 2258 | "FCCCWGWD", 2259 | "FCCDD", 2260 | "FCCDFCGD", 2261 | "FCCDGD", 2262 | "FCCDIPD", 2263 | "FCCDWGWD", 2264 | "FCCPCD", 2265 | "FCCWGWDD", 2266 | "FCDGD", 2267 | "FCDWD", 2268 | "FCDWGD", 2269 | "FCFWGWD", 2270 | "FCICCD", 2271 | "FCICDGD", 2272 | "FCIWGWDD", 2273 | "FCPCD", 2274 | "FCPCPD", 2275 | "FCPDGD", 2276 | "FCPPGD", 2277 | "FCWGWCD", 2278 | "FCWGWDD", 2279 | "FDDD", 2280 | "FDGD", 2281 | "FDGJCCD", 2282 | "FDWGWD", 2283 | "FFCCWGWD", 2284 | "FFFFD", 2285 | "FFFFFWWFD", 2286 | "FFFFWWD", 2287 | "FFFWD", 2288 | "FFFWWD", 2289 | "FFFWWFD", 2290 | "FFWWD", 2291 | "FGFPCCD", 2292 | "FGJWGWD", 2293 | "FICCCD", 2294 | "FICDGD", 2295 | "FICGWD", 2296 | "FICJD", 2297 | "FIICD", 2298 | "FIWGWCDD", 2299 | "FIWGWD", 2300 | "FIWGWDD", 2301 | "FJCCDD", 2302 | "FJGPCD", 2303 | "FJID", 2304 | "FJJGD", 2305 | "FMJD", 2306 | "FPCCD", 2307 | "FPCDD", 2308 | "FPDD", 2309 | "FPIDGD", 2310 | "FPWCWD", 2311 | "FWFWFD", 2312 | "FWGCD", 2313 | "FWGWCCD", 2314 | "FWGWCDGCD", 2315 | "FWGWCDGD", 2316 | "FWGWGD", 2317 | "FWJD", 2318 | "GCCCCCCDCD", 2319 | "GCCCCCDGD", 2320 | "GCCCCCID", 2321 | "GCCCCCKFD", 2322 | "GCCCCDCD", 2323 | "GCCCCDGCD", 2324 | "GCCCCDGCIJD", 2325 | "GCCCCDGDGDDDD", 2326 | "GCCCCDWFCCD", 2327 | "GCCCCDWGD", 2328 | "GCCCCFCCCCD", 2329 | "GCCCCID", 2330 | "GCCCDCPD", 2331 | "GCCCDDGCD", 2332 | "GCCCDDGD", 2333 | "GCCCDFCD", 2334 | "GCCCDGD", 2335 | "GCCCDGID", 2336 | "GCCCDICD", 2337 | "GCCCDMD", 2338 | "GCCCDWGCDWFCCD", 2339 | "GCCCDWGD", 2340 | "GCCCDWGWD", 2341 | "GCCCDWID", 2342 | "GCCCGPD", 2343 | "GCCCIJD", 2344 | "GCCCJCCD", 2345 | "GCCCJJCD", 2346 | "GCCCMCD", 2347 | "GCCCWD", 2348 | "GCCDCCMD", 2349 | "GCCDDWD", 2350 | "GCCDFCCD", 2351 | "GCCDGCD", 2352 | 
"GCCDGCGD", 2353 | "GCCDGDGCD", 2354 | "GCCDGJD", 2355 | "GCCDPPCD", 2356 | "GCCDWD", 2357 | "GCCFCCD", 2358 | "GCCFID", 2359 | "GCCFJCD", 2360 | "GCCFWCWCD", 2361 | "GCCGDCD", 2362 | "GCCGFD", 2363 | "GCCGFICD", 2364 | "GCCGID", 2365 | "GCCGIID", 2366 | "GCCICCD", 2367 | "GCCICDCD", 2368 | "GCCICWDD", 2369 | "GCCIDWDCD", 2370 | "GCCIID", 2371 | "GCCIJD", 2372 | "GCCJCDD", 2373 | "GCCJCGCD", 2374 | "GCCJDD", 2375 | "GCCJIDCD", 2376 | "GCCKDGD", 2377 | "GCCMJCD", 2378 | "GCCMJJCD", 2379 | "GCCWD", 2380 | "GCDCCCDGD", 2381 | "GCDCWDWD", 2382 | "GCDDDD", 2383 | "GCDDJCD", 2384 | "GCDFCD", 2385 | "GCDFID", 2386 | "GCDFJD", 2387 | "GCDGCGD", 2388 | "GCDGGGCD", 2389 | "GCDGIID", 2390 | "GCDIID", 2391 | "GCDKD", 2392 | "GCDMDFD", 2393 | "GCDPGD", 2394 | "GCDWD", 2395 | "GCDWDWD", 2396 | "GCFCCCD", 2397 | "GCFCCCDGD", 2398 | "GCFCDICD", 2399 | "GCFCDWGD", 2400 | "GCFCIFD", 2401 | "GCFCJD", 2402 | "GCFDDCID", 2403 | "GCFFJD", 2404 | "GCFGJPCD", 2405 | "GCFICD", 2406 | "GCFIDFD", 2407 | "GCFJD", 2408 | "GCFJDD", 2409 | "GCFJPD", 2410 | "GCFPCCCD", 2411 | "GCFPDD", 2412 | "GCFPID", 2413 | "GCGCCCCD", 2414 | "GCGCCCID", 2415 | "GCGCCCIDD", 2416 | "GCGCCDD", 2417 | "GCGCCDFD", 2418 | "GCGCCID", 2419 | "GCGCCJCD", 2420 | "GCGCCPD", 2421 | "GCGCDCCCD", 2422 | "GCGCDCD", 2423 | "GCGCDCID", 2424 | "GCGCDD", 2425 | "GCGCFCCD", 2426 | "GCGCFCD", 2427 | "GCGCFGCD", 2428 | "GCGCGCCD", 2429 | "GCGCGCD", 2430 | "GCGCGCPCCD", 2431 | "GCGCGD", 2432 | "GCGCGID", 2433 | "GCGCGPD", 2434 | "GCGCICCCD", 2435 | "GCGCICDDFCCCD", 2436 | "GCGCIDD", 2437 | "GCGCIID", 2438 | "GCGCJCCD", 2439 | "GCGCJD", 2440 | "GCGCJGWD", 2441 | "GCGCJJD", 2442 | "GCGCLCCD", 2443 | "GCGCPCCD", 2444 | "GCGCPCCID", 2445 | "GCGCPCD", 2446 | "GCGCPCJCCD", 2447 | "GCGDCCICCD", 2448 | "GCGDCD", 2449 | "GCGDIMD", 2450 | "GCGFCCD", 2451 | "GCGFCD", 2452 | "GCGFCJD", 2453 | "GCGFCMJD", 2454 | "GCGFD", 2455 | "GCGFDD", 2456 | "GCGFFCD", 2457 | "GCGFFD", 2458 | "GCGFID", 2459 | "GCGFIDD", 2460 | "GCGFJD", 2461 | "GCGGCCD", 2462 | "GCGGGJCD", 2463 | "GCGGJCID", 2464 | "GCGGJCJD", 2465 | "GCGICCCD", 2466 | "GCGICCD", 2467 | "GCGICCJD", 2468 | "GCGICDMD", 2469 | "GCGICICCD", 2470 | "GCGICJCDD", 2471 | "GCGICJD", 2472 | "GCGICJJD", 2473 | "GCGIDCGD", 2474 | "GCGIDD", 2475 | "GCGIDGD", 2476 | "GCGIGCCD", 2477 | "GCGIICD", 2478 | "GCGIID", 2479 | "GCGIMCCD", 2480 | "GCGIMJD", 2481 | "GCGIPCCD", 2482 | "GCGIPD", 2483 | "GCGJCCCCDD", 2484 | "GCGJCCCD", 2485 | "GCGJCCDD", 2486 | "GCGJCD", 2487 | "GCGJCID", 2488 | "GCGJD", 2489 | "GCGJDD", 2490 | "GCGJGICD", 2491 | "GCGJICD", 2492 | "GCGJID", 2493 | "GCGJIFCD", 2494 | "GCGJJCD", 2495 | "GCGJPCCD", 2496 | "GCGJPCD", 2497 | "GCGKCD", 2498 | "GCGKD", 2499 | "GCGLCDCCD", 2500 | "GCGLCJD", 2501 | "GCGLGCCD", 2502 | "GCGLGPCCID", 2503 | "GCGLIPJD", 2504 | "GCGLJJID", 2505 | "GCGMCD", 2506 | "GCGMD", 2507 | "GCGPCCCCCD", 2508 | "GCGPCCCD", 2509 | "GCGPCD", 2510 | "GCGPCFCCD", 2511 | "GCGPCID", 2512 | "GCGPCPD", 2513 | "GCGPD", 2514 | "GCGPFCD", 2515 | "GCGPGCD", 2516 | "GCGPIID", 2517 | "GCGPJCCD", 2518 | "GCGPJCD", 2519 | "GCGPJD", 2520 | "GCGPJGCD", 2521 | "GCGPJID", 2522 | "GCGPLICD", 2523 | "GCGPLID", 2524 | "GCGPPCCD", 2525 | "GCGPPCD", 2526 | "GCGPPD", 2527 | "GCGPPID", 2528 | "GCGPPJD", 2529 | "GCGWPFCD", 2530 | "GCICCCDD", 2531 | "GCICCDFD", 2532 | "GCICCJD", 2533 | "GCICCWDWDCGD", 2534 | "GCICDFCD", 2535 | "GCICPD", 2536 | "GCIDCD", 2537 | "GCIDCGD", 2538 | "GCIDDGD", 2539 | "GCIDPCCD", 2540 | "GCIICD", 2541 | "GCIJCCD", 2542 | "GCIJCCDMD", 2543 | "GCIJCID", 2544 | "GCIKD", 2545 | "GCIPCCD", 2546 | "GCIPCPD", 
2547 | "GCJCCCCCD", 2548 | "GCJCCDCD", 2549 | "GCJCCDGD", 2550 | "GCJCCDMD", 2551 | "GCJCCICD", 2552 | "GCJCDD", 2553 | "GCJCICD", 2554 | "GCJCKDD", 2555 | "GCJDCDCD", 2556 | "GCJDDCD", 2557 | "GCJGCD", 2558 | "GCJICCCD", 2559 | "GCJICGD", 2560 | "GCJIDCD", 2561 | "GCJIDD", 2562 | "GCJJCDD", 2563 | "GCJJCJCD", 2564 | "GCJJDD", 2565 | "GCJMCID", 2566 | "GCJPCCCD", 2567 | "GCJPCCD", 2568 | "GCJPCD", 2569 | "GCJPCDMD", 2570 | "GCJPID", 2571 | "GCJPJD", 2572 | "GCJWCPWD", 2573 | "GCKCCCD", 2574 | "GCKCD", 2575 | "GCKDGD", 2576 | "GCKGD", 2577 | "GCKICD", 2578 | "GCKJCCD", 2579 | "GCKPD", 2580 | "GCLCID", 2581 | "GCLGIJCD", 2582 | "GCLID", 2583 | "GCMCCDFD", 2584 | "GCMCCKGD", 2585 | "GCMCJCCD", 2586 | "GCMCPD", 2587 | "GCMDCGCD", 2588 | "GCMFCDGD", 2589 | "GCMID", 2590 | "GCMJCD", 2591 | "GCMJCDD", 2592 | "GCMJCID", 2593 | "GCMJID", 2594 | "GCMJPCCCCD", 2595 | "GCMKD", 2596 | "GCMKGD", 2597 | "GCMPCCD", 2598 | "GCMPJD", 2599 | "GCMPPCCD", 2600 | "GCPCCCMD", 2601 | "GCPCCDCD", 2602 | "GCPCCDMD", 2603 | "GCPCCDWD", 2604 | "GCPCCWGCWD", 2605 | "GCPCDCD", 2606 | "GCPCDGD", 2607 | "GCPCDWD", 2608 | "GCPCICDWGD", 2609 | "GCPCIICFD", 2610 | "GCPCJCFD", 2611 | "GCPCJD", 2612 | "GCPDGD", 2613 | "GCPGGCD", 2614 | "GCPICCCDGD", 2615 | "GCPICCD", 2616 | "GCPICD", 2617 | "GCPICID", 2618 | "GCPID", 2619 | "GCPIJCCD", 2620 | "GCPJCDD", 2621 | "GCPJPDD", 2622 | "GCPKD", 2623 | "GCPMCCD", 2624 | "GCPMJCD", 2625 | "GCPPCD", 2626 | "GCPPID", 2627 | "GCPPWCWID", 2628 | "GCPWCWCD", 2629 | "GCPWDWDCCD", 2630 | "GCWDWDCCD", 2631 | "GCWGWCCD", 2632 | "GCWGWD", 2633 | "GCWGWJD", 2634 | "GCWJCCD", 2635 | "GDCCCCFCD", 2636 | "GDCCCFCPD", 2637 | "GDCCPCCD", 2638 | "GDCDGCD", 2639 | "GDCDJD", 2640 | "GDCKGCD", 2641 | "GDDGCD", 2642 | "GDDGD", 2643 | "GDDMD", 2644 | "GDICCCD", 2645 | "GDIPD", 2646 | "GDJCICD", 2647 | "GDLCCD", 2648 | "GFCCCCCCD", 2649 | "GFCCCCFD", 2650 | "GFCCCDD", 2651 | "GFCCCDDD", 2652 | "GFCCCDFCD", 2653 | "GFCCCDFDD", 2654 | "GFCCCDGCD", 2655 | "GFCCCDGD", 2656 | "GFCCCDID", 2657 | "GFCCDCCD", 2658 | "GFCCDCD", 2659 | "GFCCDCFD", 2660 | "GFCCDDD", 2661 | "GFCCDFCD", 2662 | "GFCCDFDMD", 2663 | "GFCCDFFCD", 2664 | "GFCCDFFD", 2665 | "GFCCDFGD", 2666 | "GFCCDGCCD", 2667 | "GFCCDGCD", 2668 | "GFCCDGD", 2669 | "GFCCDGGID", 2670 | "GFCCDGICDJICD", 2671 | "GFCCDID", 2672 | "GFCCDLD", 2673 | "GFCCDMD", 2674 | "GFCCDWCD", 2675 | "GFCCDWD", 2676 | "GFCCDWFDD", 2677 | "GFCCDWGCD", 2678 | "GFCCDWGD", 2679 | "GFCCID", 2680 | "GFCCJD", 2681 | "GFCCPCD", 2682 | "GFCDDCCCD", 2683 | "GFCDFCCD", 2684 | "GFCDFCD", 2685 | "GFCDGCD", 2686 | "GFCDGD", 2687 | "GFCDGDD", 2688 | "GFCDGFCD", 2689 | "GFCDGGCD", 2690 | "GFCDGGD", 2691 | "GFCDGPD", 2692 | "GFCDID", 2693 | "GFCDMIDMD", 2694 | "GFCDWCD", 2695 | "GFCDWD", 2696 | "GFCDWGD", 2697 | "GFCFCD", 2698 | "GFCGCD", 2699 | "GFCGD", 2700 | "GFCICD", 2701 | "GFCIDCGD", 2702 | "GFCIDWD", 2703 | "GFCJCCCD", 2704 | "GFCJCCD", 2705 | "GFCJCCDD", 2706 | "GFCJCJD", 2707 | "GFCJDD", 2708 | "GFCJID", 2709 | "GFCKD", 2710 | "GFCLCD", 2711 | "GFCMCCD", 2712 | "GFCMJCDWD", 2713 | "GFCPDGD", 2714 | "GFCPPCD", 2715 | "GFCWCD", 2716 | "GFCWCWCD", 2717 | "GFCWFWFCCD", 2718 | "GFCWGWD", 2719 | "GFDCCD", 2720 | "GFDCDCDD", 2721 | "GFDCDD", 2722 | "GFDCDGD", 2723 | "GFDCID", 2724 | "GFDDCCD", 2725 | "GFDDCGD", 2726 | "GFDDD", 2727 | "GFDDGD", 2728 | "GFDDPD", 2729 | "GFDGCD", 2730 | "GFDGD", 2731 | "GFDICD", 2732 | "GFDICPCD", 2733 | "GFDID", 2734 | "GFDJPCD", 2735 | "GFDWD", 2736 | "GFDWDWD", 2737 | "GFFCCCD", 2738 | "GFFCJD", 2739 | "GFFDD", 2740 | "GFFJJDGD", 2741 | "GFFPDGD", 2742 | 
"GFGCCCDD", 2743 | "GFGCCD", 2744 | "GFGCCDGPD", 2745 | "GFGFICD", 2746 | "GFGMPD", 2747 | "GFICCDCD", 2748 | "GFICCDD", 2749 | "GFICDCD", 2750 | "GFICDCJD", 2751 | "GFICDD", 2752 | "GFICDGD", 2753 | "GFICJD", 2754 | "GFICKD", 2755 | "GFIDD", 2756 | "GFIDFGD", 2757 | "GFIDGCD", 2758 | "GFIDGD", 2759 | "GFIDPCPCD", 2760 | "GFIGD", 2761 | "GFIID", 2762 | "GFIIDFCD", 2763 | "GFIIGD", 2764 | "GFIJCCD", 2765 | "GFIJD", 2766 | "GFJCCCCD", 2767 | "GFJCCDD", 2768 | "GFJCDD", 2769 | "GFJCDGD", 2770 | "GFJCDWD", 2771 | "GFJCJD", 2772 | "GFJDD", 2773 | "GFJDGCD", 2774 | "GFJDGFCD", 2775 | "GFJDWD", 2776 | "GFJDWFICGD", 2777 | "GFJFD", 2778 | "GFJICD", 2779 | "GFJICDGD", 2780 | "GFJID", 2781 | "GFJJCD", 2782 | "GFJJDWGD", 2783 | "GFKD", 2784 | "GFKDGD", 2785 | "GFLCD", 2786 | "GFMJCD", 2787 | "GFPCCCD", 2788 | "GFPCCD", 2789 | "GFPCDCD", 2790 | "GFPCDD", 2791 | "GFPCJD", 2792 | "GFPDCD", 2793 | "GFPDD", 2794 | "GFPDID", 2795 | "GFPICD", 2796 | "GFPIJD", 2797 | "GFPJIDD", 2798 | "GFPKD", 2799 | "GFPPCCD", 2800 | "GFPPCD", 2801 | "GFWCWID", 2802 | "GFWDWD", 2803 | "GFWJD", 2804 | "GGCCCCCCD", 2805 | "GGCCCCJCD", 2806 | "GGCCCDD", 2807 | "GGCCCDDD", 2808 | "GGCCCDGD", 2809 | "GGCCCDWGCD", 2810 | "GGCCDCDGD", 2811 | "GGCCDFGCD", 2812 | "GGCCDGD", 2813 | "GGCCDGDCD", 2814 | "GGCCDID", 2815 | "GGCCDMCD", 2816 | "GGCCDWGD", 2817 | "GGCCFCD", 2818 | "GGCCFD", 2819 | "GGCCGCCD", 2820 | "GGCCICDD", 2821 | "GGCCID", 2822 | "GGCCJCCDD", 2823 | "GGCCLCD", 2824 | "GGCCPCD", 2825 | "GGCCPJD", 2826 | "GGCDCCDGD", 2827 | "GGCDCDD", 2828 | "GGCDCDGD", 2829 | "GGCDCGD", 2830 | "GGCDDCCD", 2831 | "GGCDGD", 2832 | "GGCDGPGCD", 2833 | "GGCDID", 2834 | "GGCDMD", 2835 | "GGCFCCD", 2836 | "GGCFCD", 2837 | "GGCFID", 2838 | "GGCGCCCCD", 2839 | "GGCGCCD", 2840 | "GGCGCGCCD", 2841 | "GGCGCGCD", 2842 | "GGCGCGFCD", 2843 | "GGCGDGCD", 2844 | "GGCGFD", 2845 | "GGCGFID", 2846 | "GGCGGCD", 2847 | "GGCGGGD", 2848 | "GGCGGJD", 2849 | "GGCGICCD", 2850 | "GGCGIICD", 2851 | "GGCGILICD", 2852 | "GGCGJID", 2853 | "GGCGJIJCD", 2854 | "GGCGPCCD", 2855 | "GGCGPCD", 2856 | "GGCGPJCCD", 2857 | "GGCGPJCD", 2858 | "GGCGPJD", 2859 | "GGCGPPD", 2860 | "GGCICCD", 2861 | "GGCICCID", 2862 | "GGCICD", 2863 | "GGCIDD", 2864 | "GGCIICD", 2865 | "GGCIIJD", 2866 | "GGCIPCICCD", 2867 | "GGCIPD", 2868 | "GGCJCDD", 2869 | "GGCJCJD", 2870 | "GGCJDD", 2871 | "GGCJID", 2872 | "GGCKLCD", 2873 | "GGCLCCD", 2874 | "GGCMCIJD", 2875 | "GGCMID", 2876 | "GGCPCCCD", 2877 | "GGCPCCJCCCWD", 2878 | "GGCPCDD", 2879 | "GGCPDD", 2880 | "GGCPGGCID", 2881 | "GGCPICD", 2882 | "GGCPICDD", 2883 | "GGCPID", 2884 | "GGCPJCD", 2885 | "GGCPPCCD", 2886 | "GGCPPD", 2887 | "GGCWDWCCDGCD", 2888 | "GGCWGD", 2889 | "GGDCDCCD", 2890 | "GGDCJD", 2891 | "GGDDFD", 2892 | "GGDGCCGCD", 2893 | "GGDGCDGD", 2894 | "GGDLGD", 2895 | "GGFCCCCD", 2896 | "GGFCDID", 2897 | "GGFCFCDD", 2898 | "GGFCID", 2899 | "GGFCJD", 2900 | "GGFCMCCD", 2901 | "GGFDCD", 2902 | "GGFDDD", 2903 | "GGFICD", 2904 | "GGFICDD", 2905 | "GGFID", 2906 | "GGFJCD", 2907 | "GGFJID", 2908 | "GGFJMD", 2909 | "GGFKID", 2910 | "GGFMJDD", 2911 | "GGFPCD", 2912 | "GGFPD", 2913 | "GGFWID", 2914 | "GGGCCCDGD", 2915 | "GGGCCCICD", 2916 | "GGGCCDGD", 2917 | "GGGCCID", 2918 | "GGGCGGD", 2919 | "GGGCJCD", 2920 | "GGGCPD", 2921 | "GGGCPFCPCD", 2922 | "GGGCPJD", 2923 | "GGGFCCD", 2924 | "GGGFCCID", 2925 | "GGGFCD", 2926 | "GGGFCJD", 2927 | "GGGFID", 2928 | "GGGGPJD", 2929 | "GGGICCD", 2930 | "GGGICJD", 2931 | "GGGIDGID", 2932 | "GGGIICD", 2933 | "GGGJCCD", 2934 | "GGGJGID", 2935 | "GGGKCD", 2936 | "GGGKDJD", 2937 | "GGGLJCD", 2938 | "GGGMCD", 2939 | 
"GGGPCD", 2940 | "GGGPFIDWD", 2941 | "GGGPIICD", 2942 | "GGGPIPD", 2943 | "GGGPPID", 2944 | "GGICCGCD", 2945 | "GGICCID", 2946 | "GGICDD", 2947 | "GGICFID", 2948 | "GGICJCD", 2949 | "GGICJDD", 2950 | "GGICPCCID", 2951 | "GGICPD", 2952 | "GGIDID", 2953 | "GGIDWGD", 2954 | "GGIFCCD", 2955 | "GGIFCD", 2956 | "GGIFCJD", 2957 | "GGIFICD", 2958 | "GGIFID", 2959 | "GGIFIDDD", 2960 | "GGIFJD", 2961 | "GGIFMID", 2962 | "GGIGPFD", 2963 | "GGIICCD", 2964 | "GGIJCD", 2965 | "GGIJCID", 2966 | "GGIJDD", 2967 | "GGIJICD", 2968 | "GGIPCCD", 2969 | "GGIPDCCD", 2970 | "GGIPICD", 2971 | "GGIPMICD", 2972 | "GGJCCCCCD", 2973 | "GGJCCCCD", 2974 | "GGJCCICD", 2975 | "GGJCDD", 2976 | "GGJCGCD", 2977 | "GGJCICCD", 2978 | "GGJCICD", 2979 | "GGJGCD", 2980 | "GGJGCICD", 2981 | "GGJGCLCGCD", 2982 | "GGJICCD", 2983 | "GGJICJD", 2984 | "GGJICPCCD", 2985 | "GGJID", 2986 | "GGJIID", 2987 | "GGJJCD", 2988 | "GGJJCDD", 2989 | "GGJJCKD", 2990 | "GGJJID", 2991 | "GGJMID", 2992 | "GGJPCCCCD", 2993 | "GGJPCCD", 2994 | "GGJPCD", 2995 | "GGJPCJCD", 2996 | "GGJPCJPJCD", 2997 | "GGJPID", 2998 | "GGJPJD", 2999 | "GGKCCCD", 3000 | "GGKCD", 3001 | "GGKDD", 3002 | "GGLCCD", 3003 | "GGLCCPJD", 3004 | "GGLFCCCD", 3005 | "GGLGCJD", 3006 | "GGLGFID", 3007 | "GGLGPCD", 3008 | "GGLJCCD", 3009 | "GGLJCID", 3010 | "GGMFJD", 3011 | "GGMJCDGD", 3012 | "GGMPJD", 3013 | "GGPCCDD", 3014 | "GGPCDD", 3015 | "GGPCICD", 3016 | "GGPCID", 3017 | "GGPFCCD", 3018 | "GGPFCD", 3019 | "GGPFCID", 3020 | "GGPFJD", 3021 | "GGPGCD", 3022 | "GGPGID", 3023 | "GGPICFCD", 3024 | "GGPID", 3025 | "GGPIDD", 3026 | "GGPIID", 3027 | "GGPJCCCD", 3028 | "GGPJCCD", 3029 | "GGPJCCID", 3030 | "GGPJCJMD", 3031 | "GGPJID", 3032 | "GGPJKCCD", 3033 | "GGPJPCD", 3034 | "GGPPCID", 3035 | "GGPPDD", 3036 | "GGPPFCCD", 3037 | "GGPPICD", 3038 | "GGPPJCD", 3039 | "GGWCJD", 3040 | "GGWGWID", 3041 | "GGWIWCCD", 3042 | "GICCCCCCCD", 3043 | "GICCCDGD", 3044 | "GICCDDD", 3045 | "GICCDGCD", 3046 | "GICCDGD", 3047 | "GICCDWD", 3048 | "GICCDWGD", 3049 | "GICCFCCD", 3050 | "GICCICCD", 3051 | "GICCICD", 3052 | "GICCID", 3053 | "GICCJCCD", 3054 | "GICDCCCD", 3055 | "GICDDWGD", 3056 | "GICDGD", 3057 | "GICDGJCD", 3058 | "GICDID", 3059 | "GICDWD", 3060 | "GICFD", 3061 | "GICFID", 3062 | "GICGCD", 3063 | "GICICCCCCCCCPD", 3064 | "GICICDDGD", 3065 | "GICICDFD", 3066 | "GICIDGD", 3067 | "GICIFD", 3068 | "GICIIFID", 3069 | "GICJDD", 3070 | "GICJDGD", 3071 | "GICJJD", 3072 | "GICKD", 3073 | "GICPCD", 3074 | "GICPID", 3075 | "GICPIDD", 3076 | "GICWCWCWD", 3077 | "GIDCDD", 3078 | "GIDDCD", 3079 | "GIDDGD", 3080 | "GIDDWGD", 3081 | "GIDFCD", 3082 | "GIDGDCD", 3083 | "GIDJJD", 3084 | "GIFCCCD", 3085 | "GIFCD", 3086 | "GIFCJD", 3087 | "GIFFFWFWD", 3088 | "GIFGD", 3089 | "GIFICCCD", 3090 | "GIFID", 3091 | "GIFIDCD", 3092 | "GIFJD", 3093 | "GIFPD", 3094 | "GIFPDCD", 3095 | "GIGCCDMD", 3096 | "GIGGCD", 3097 | "GIGJD", 3098 | "GIGMD", 3099 | "GIICCCD", 3100 | "GIICCDD", 3101 | "GIICCDGD", 3102 | "GIICCDMCD", 3103 | "GIICDD", 3104 | "GIICID", 3105 | "GIIDFCD", 3106 | "GIIDGD", 3107 | "GIIDJCD", 3108 | "GIIFICD", 3109 | "GIIICCD", 3110 | "GIIJD", 3111 | "GIIPCD", 3112 | "GIIPD", 3113 | "GIJCCDD", 3114 | "GIJCCICD", 3115 | "GIJCCJD", 3116 | "GIJCDCD", 3117 | "GIJCDGD", 3118 | "GIJCDWCFD", 3119 | "GIJDCD", 3120 | "GIJICCD", 3121 | "GIJICD", 3122 | "GIJICDGD", 3123 | "GIJIDD", 3124 | "GIJJD", 3125 | "GIJJICJD", 3126 | "GIJPCD", 3127 | "GIJPID", 3128 | "GILGCD", 3129 | "GIMCID", 3130 | "GIMCPD", 3131 | "GIPCCCCCD", 3132 | "GIPCCCDGD", 3133 | "GIPCCDD", 3134 | "GIPDCCCCD", 3135 | "GIPDGD", 3136 | "GIPDWCCD", 3137 | 
"GIPFD", 3138 | "GIPID", 3139 | "GIPJCDD", 3140 | "GIPJD", 3141 | "GIWGFWDGD", 3142 | "GIWGWDD", 3143 | "GJCCCCCDCD", 3144 | "GJCCCCCDD", 3145 | "GJCCCCDD", 3146 | "GJCCCDCD", 3147 | "GJCCCDGCD", 3148 | "GJCCCDGD", 3149 | "GJCCCDLD", 3150 | "GJCCCDWD", 3151 | "GJCCCPD", 3152 | "GJCCDFDD", 3153 | "GJCCID", 3154 | "GJCCPD", 3155 | "GJCCWKWCD", 3156 | "GJCDMCCD", 3157 | "GJCICCD", 3158 | "GJCIWGWCCD", 3159 | "GJCJCDID", 3160 | "GJCKD", 3161 | "GJCLCCCD", 3162 | "GJCMIGD", 3163 | "GJCMWD", 3164 | "GJCPDCCD", 3165 | "GJDCDCD", 3166 | "GJFCCCD", 3167 | "GJFCCDWGCGD", 3168 | "GJFCDCD", 3169 | "GJFCDD", 3170 | "GJFCDWFCD", 3171 | "GJFID", 3172 | "GJGCCCCD", 3173 | "GJGCCCD", 3174 | "GJGCCD", 3175 | "GJGCCDD", 3176 | "GJGCCDID", 3177 | "GJGCMJD", 3178 | "GJGFJCD", 3179 | "GJGJCD", 3180 | "GJGJCJCD", 3181 | "GJGJCKDGD", 3182 | "GJGMCCD", 3183 | "GJGPCCCD", 3184 | "GJGPD", 3185 | "GJICCCCD", 3186 | "GJICCDCD", 3187 | "GJICDD", 3188 | "GJICJD", 3189 | "GJIDD", 3190 | "GJIDWCCCWD", 3191 | "GJIICD", 3192 | "GJIIID", 3193 | "GJIJCCD", 3194 | "GJIJCD", 3195 | "GJIJD", 3196 | "GJIPD", 3197 | "GJJCCDGD", 3198 | "GJJCDCCD", 3199 | "GJJCDCD", 3200 | "GJJCDD", 3201 | "GJJCID", 3202 | "GJJCPD", 3203 | "GJJDDCCD", 3204 | "GJJGCCCCD", 3205 | "GJJICCCCD", 3206 | "GJJICCDCD", 3207 | "GJJICD", 3208 | "GJJID", 3209 | "GJJIPD", 3210 | "GJJPD", 3211 | "GJKJCD", 3212 | "GJKPD", 3213 | "GJMCCCCD", 3214 | "GJMCCCD", 3215 | "GJMCD", 3216 | "GJMICD", 3217 | "GJMJCCCCD", 3218 | "GJMJCD", 3219 | "GJMPCDD", 3220 | "GJPCCCDGD", 3221 | "GJPCCDD", 3222 | "GJPCCDGD", 3223 | "GJPCCDID", 3224 | "GJPCID", 3225 | "GJPDCCD", 3226 | "GJPDDGD", 3227 | "GJPDGD", 3228 | "GJPID", 3229 | "GJPIDID", 3230 | "GJPJCD", 3231 | "GJPJMJCD", 3232 | "GJPPCCD", 3233 | "GJWGWCD", 3234 | "GKCCCCD", 3235 | "GKCCDCD", 3236 | "GKCCDD", 3237 | "GKCCDWD", 3238 | "GKCDDICD", 3239 | "GKCID", 3240 | "GKCJDDWD", 3241 | "GKCWWD", 3242 | "GKDCCD", 3243 | "GKDID", 3244 | "GKGDGCDD", 3245 | "GKGPDD", 3246 | "GKICD", 3247 | "GKJCCDGCD", 3248 | "GKJCCID", 3249 | "GKJCD", 3250 | "GKJD", 3251 | "GKJID", 3252 | "GKJIPCD", 3253 | "GKPCDFJD", 3254 | "GKPCJD", 3255 | "GKWD", 3256 | "GLCCCCCID", 3257 | "GLCCCDD", 3258 | "GLCDCGCD", 3259 | "GLCICD", 3260 | "GLCID", 3261 | "GLCJID", 3262 | "GLCMCICD", 3263 | "GLCPCCDGD", 3264 | "GLCWGWDGCD", 3265 | "GLFCCD", 3266 | "GLFD", 3267 | "GLFJJCD", 3268 | "GLFKCCCD", 3269 | "GLGCCD", 3270 | "GLGCDDFCD", 3271 | "GLGJCCD", 3272 | "GLICCCCD", 3273 | "GLICCCD", 3274 | "GLICCDD", 3275 | "GLIFD", 3276 | "GLIID", 3277 | "GLIJD", 3278 | "GLIWDWD", 3279 | "GLJCCPCJD", 3280 | "GLJCDD", 3281 | "GLJCID", 3282 | "GLJDCD", 3283 | "GLJGCCD", 3284 | "GLJGCDGD", 3285 | "GLKCIID", 3286 | "GLMCCDD", 3287 | "GLMCD", 3288 | "GLPCCCCD", 3289 | "GLPCCDD", 3290 | "GLPCCDDFD", 3291 | "GLPICD", 3292 | "GLPID", 3293 | "GLPJCD", 3294 | "GLPPCCDD", 3295 | "GLPWIWD", 3296 | "GMCCCDD", 3297 | "GMCCDCCCD", 3298 | "GMCCDCMD", 3299 | "GMCCDMD", 3300 | "GMCCDWD", 3301 | "GMCID", 3302 | "GMCIID", 3303 | "GMCKCD", 3304 | "GMDPDID", 3305 | "GMFCCD", 3306 | "GMFCD", 3307 | "GMFCDCDGD", 3308 | "GMFICD", 3309 | "GMGCD", 3310 | "GMGCDCD", 3311 | "GMICID", 3312 | "GMIID", 3313 | "GMJCCCD", 3314 | "GMMCDGDD", 3315 | "GMMICCD", 3316 | "GMPCCCD", 3317 | "GMPDGCD", 3318 | "GMPFD", 3319 | "GMPICD", 3320 | "GMPID", 3321 | "GMPMCCCD", 3322 | "GPCCCCCD", 3323 | "GPCCCCDGD", 3324 | "GPCCCDGD", 3325 | "GPCCCDLD", 3326 | "GPCCCID", 3327 | "GPCCCPD", 3328 | "GPCCDGD", 3329 | "GPCCDWD", 3330 | "GPCCGD", 3331 | "GPCCICD", 3332 | "GPCCID", 3333 | "GPCCJD", 3334 | "GPCCWCWCD", 3335 | 
"GPCDCJD", 3336 | "GPCDDCD", 3337 | "GPCDGFCD", 3338 | "GPCDIID", 3339 | "GPCFCCCDGD", 3340 | "GPCFD", 3341 | "GPCGCD", 3342 | "GPCICD", 3343 | "GPCIDD", 3344 | "GPCIDWCWD", 3345 | "GPCIID", 3346 | "GPCIJCD", 3347 | "GPCIPD", 3348 | "GPCJD", 3349 | "GPCKD", 3350 | "GPCPCD", 3351 | "GPCPCDD", 3352 | "GPCPD", 3353 | "GPCWWD", 3354 | "GPFCCDWD", 3355 | "GPFCCWD", 3356 | "GPFCDD", 3357 | "GPFCWFWD", 3358 | "GPFDD", 3359 | "GPFICD", 3360 | "GPFIGGPCD", 3361 | "GPFJCD", 3362 | "GPFJD", 3363 | "GPFJDCD", 3364 | "GPFPCDCD", 3365 | "GPFPD", 3366 | "GPGCCCCD", 3367 | "GPGCCD", 3368 | "GPGCD", 3369 | "GPGCDWGD", 3370 | "GPGCGCD", 3371 | "GPGCIICD", 3372 | "GPGCPCCD", 3373 | "GPGFFCD", 3374 | "GPGICCD", 3375 | "GPGICD", 3376 | "GPGID", 3377 | "GPGJCD", 3378 | "GPGJD", 3379 | "GPGPCGD", 3380 | "GPGPDD", 3381 | "GPGPGJCCD", 3382 | "GPGPICD", 3383 | "GPICCCCDGD", 3384 | "GPICDD", 3385 | "GPICDGD", 3386 | "GPICICD", 3387 | "GPIDFGD", 3388 | "GPIICD", 3389 | "GPIICDGD", 3390 | "GPIID", 3391 | "GPIJCCD", 3392 | "GPIJCD", 3393 | "GPIJCDD", 3394 | "GPIPCCCD", 3395 | "GPIPCCD", 3396 | "GPJCCCCD", 3397 | "GPJCCDD", 3398 | "GPJCDCPD", 3399 | "GPJCDFD", 3400 | "GPJCDGCD", 3401 | "GPJCDGD", 3402 | "GPJCID", 3403 | "GPJCIDD", 3404 | "GPJCJCD", 3405 | "GPJCPCCD", 3406 | "GPJFCCD", 3407 | "GPJFIDD", 3408 | "GPJICD", 3409 | "GPJID", 3410 | "GPJPCD", 3411 | "GPJPD", 3412 | "GPKD", 3413 | "GPLCCCCD", 3414 | "GPLCCD", 3415 | "GPLCD", 3416 | "GPLICD", 3417 | "GPLID", 3418 | "GPMCCCD", 3419 | "GPMCCD", 3420 | "GPMCD", 3421 | "GPMD", 3422 | "GPMDJCD", 3423 | "GPPCCDD", 3424 | "GPPCCDWD", 3425 | "GPPCDWCCCD", 3426 | "GPPDDD", 3427 | "GPPFD", 3428 | "GPPGCCCD", 3429 | "GPPGD", 3430 | "GPPGDGD", 3431 | "GPPICD", 3432 | "GPPIID", 3433 | "GPPIJD", 3434 | "GPPJCCD", 3435 | "GPPJCDFCD", 3436 | "GPPJDCD", 3437 | "GPPMJCD", 3438 | "GPPPCD", 3439 | "GPPPICD", 3440 | "GPWCCD", 3441 | "GPWGWCDGD", 3442 | "GWCCCCD", 3443 | "GWCWJD", 3444 | "GWGPPD", 3445 | "GWGWCCD", 3446 | "GWICCD", 3447 | "GWJD", 3448 | "GWPCPWD", 3449 | "ICCCDFPCCFGCCD", 3450 | "ICCCDGD", 3451 | "ICCCDGJD", 3452 | "ICCCID", 3453 | "ICCCWGWD", 3454 | "ICCCWGWDGD", 3455 | "ICCFJCWGFJCCD", 3456 | "ICCICCD", 3457 | "ICCWGWD", 3458 | "ICDCCCCD", 3459 | "ICDCDCD", 3460 | "ICDGD", 3461 | "ICDWGD", 3462 | "ICFCD", 3463 | "ICFCJCD", 3464 | "ICFDID", 3465 | "ICGD", 3466 | "ICGGD", 3467 | "ICICCD", 3468 | "ICIDWID", 3469 | "ICIWD", 3470 | "ICJCCD", 3471 | "ICJWGWCCD", 3472 | "ICWGWDD", 3473 | "IDDGCD", 3474 | "IDGD", 3475 | "IFCD", 3476 | "IFCICCDGD", 3477 | "IFGD", 3478 | "IFICCD", 3479 | "IFIDWGD", 3480 | "IFKD", 3481 | "IGCCCCDCCD", 3482 | "IGFGJCGDD", 3483 | "IGGDFCD", 3484 | "IGGPCGCD", 3485 | "IGICCD", 3486 | "IIGCD", 3487 | "IIICD", 3488 | "IIID", 3489 | "IIJD", 3490 | "IIWGWCD", 3491 | "IIWGWD", 3492 | "IJCCDGD", 3493 | "IJCCWGWD", 3494 | "IJCDCCD", 3495 | "IJCDD", 3496 | "IJDD", 3497 | "IJGCCD", 3498 | "IJPCCD", 3499 | "IJPCD", 3500 | "IKCD", 3501 | "IMCCCCD", 3502 | "IPCCCCD", 3503 | "IPCGD", 3504 | "IPFD", 3505 | "IPJCD", 3506 | "IPPD", 3507 | "IPPDCD", 3508 | "IPPJCD", 3509 | "IPWGWCCD", 3510 | "IPWGWCD", 3511 | "IWGMFCCDGD", 3512 | "IWGWCCD", 3513 | "IWGWCD", 3514 | "IWGWD", 3515 | "IWGWID", 3516 | "JCCCWFWD", 3517 | "JCCGD", 3518 | "JCCPD", 3519 | "JCCWGWD", 3520 | "JCDCWMWMWCCCD", 3521 | "JCDDWD", 3522 | "JCGDWCPWD", 3523 | "JCIDGD", 3524 | "JCIWGWD", 3525 | "JCJGCD", 3526 | "JCPCD", 3527 | "JCWGWCDWD", 3528 | "JDGCCD", 3529 | "JDWGWCD", 3530 | "JFCCD", 3531 | "JFCCWGWDD", 3532 | "JFID", 3533 | "JFJCCD", 3534 | "JGCCD", 3535 | "JGCCGCD", 3536 | "JGCDFCD", 3537 | 
"JGCFCCCD", 3538 | "JGFD", 3539 | "JGICD", 3540 | "JGID", 3541 | "JGJDCD", 3542 | "JGMCCD", 3543 | "JGPCCD", 3544 | "JICCD", 3545 | "JIGPCD", 3546 | "JIICD", 3547 | "JIPCD", 3548 | "JJCCCDGD", 3549 | "JJCCJD", 3550 | "JJCGD", 3551 | "JJCID", 3552 | "JJCWGWCCDFGCD", 3553 | "JJCWGWD", 3554 | "JJLWGWCD", 3555 | "JJPJJD", 3556 | "JJPPCD", 3557 | "JPCCCCWGWD", 3558 | "JPCCCD", 3559 | "JPIWGWID", 3560 | "JPJJD", 3561 | "JPWWD", 3562 | "JWWFWCD", 3563 | "KCCCCDD", 3564 | "KCCDGD", 3565 | "KCDGCD", 3566 | "KCDGD", 3567 | "KCFCD", 3568 | "KCICCD", 3569 | "KCJCCCDGD", 3570 | "KFCCD", 3571 | "KGCDCCCD", 3572 | "KGCDD", 3573 | "KGCDGD", 3574 | "KGCGCD", 3575 | "KGGCDID", 3576 | "KIDJCCD", 3577 | "KIICD", 3578 | "KJGCD", 3579 | "KLPCCD", 3580 | "KPFCCD", 3581 | "KPGCD", 3582 | "KPICD", 3583 | "KPKICD", 3584 | "LCCCDGCD", 3585 | "LCGFCD", 3586 | "LCWGWCFDD", 3587 | "LCWGWICD", 3588 | "LFCCD", 3589 | "LFCWGWCD", 3590 | "LFJD", 3591 | "LGCCCD", 3592 | "LGCCD", 3593 | "LGCICD", 3594 | "LGCPJCD", 3595 | "LGFCCD", 3596 | "LGFCD", 3597 | "LGFD", 3598 | "LGFICD", 3599 | "LGFID", 3600 | "LGFJD", 3601 | "LGFPD", 3602 | "LGICCDGCD", 3603 | "LGICD", 3604 | "LGID", 3605 | "LGIICD", 3606 | "LGJCCD", 3607 | "LGJCID", 3608 | "LGLIDD", 3609 | "LGPCCD", 3610 | "LGPCD", 3611 | "LGPDDWD", 3612 | "LGPJPD", 3613 | "LIJD", 3614 | "LIWGWD", 3615 | "LJFCCD", 3616 | "LJFGCD", 3617 | "LKGPD", 3618 | "LKPJCCD", 3619 | "LMPJIPCCD", 3620 | "LPCGDGD", 3621 | "LPPWCCD", 3622 | "MCCID", 3623 | "MCCWGWD", 3624 | "MCJCCD", 3625 | "MCWD", 3626 | "MJCWGWD", 3627 | "MPCFCD", 3628 | "MPCGD", 3629 | "PCCGGCCCCD", 3630 | "PCCKD", 3631 | "PCDDDCCD", 3632 | "PCJD", 3633 | "PCJJDD", 3634 | "PCLGCCD", 3635 | "PCWGWCD", 3636 | "PCWGWID", 3637 | "PCWGWPDWD", 3638 | "PFCCCWGWD", 3639 | "PFCCJICCD", 3640 | "PFCD", 3641 | "PFCPCDCD", 3642 | "PFCWGWCD", 3643 | "PFID", 3644 | "PFPID", 3645 | "PGCCD", 3646 | "PGCCDGD", 3647 | "PGCID", 3648 | "PGCJCJD", 3649 | "PGFCD", 3650 | "PGJCCD", 3651 | "PGJICD", 3652 | "PGPCCCJD", 3653 | "PGPCCD", 3654 | "PGPD", 3655 | "PGPID", 3656 | "PIGFCD", 3657 | "PIWGWCCD", 3658 | "PJCCCD", 3659 | "PJCDD", 3660 | "PJCWGWDID", 3661 | "PJDFFD", 3662 | "PLDWGWCDCGD", 3663 | "PPCCCDD", 3664 | "PPCCD", 3665 | "PPCCDGD", 3666 | "PPCCGWD", 3667 | "PPCCWGWD", 3668 | "PPCID", 3669 | "PPCPWCCCWCDD", 3670 | "PPCWGWD", 3671 | "PPDD", 3672 | "PPFWGWCD", 3673 | "PPICD", 3674 | "PPIDD", 3675 | "PPLWGWCCD", 3676 | "PPWGWCD", 3677 | "PPWGWD", 3678 | "PWCCCD", 3679 | "PWCWCD", 3680 | "PWFWCD", 3681 | "PWWJWGWD", 3682 | "WCCCD", 3683 | "WCJWD", 3684 | "WFFPCWJD", 3685 | "WFPWPWCCD", 3686 | "WFWD", 3687 | "WGCJDWFCCCD", 3688 | "WJCCD", 3689 | "WPWJD", 3690 | "WWIWWCWD", 3691 | ] 3692 | -------------------------------------------------------------------------------- /yaya/config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'tony' 3 | import os 4 | 5 | DICT_BIN_EXT = '.ya' 6 | DICT_BIN_REVERSE_EXT = '.reverse.ya' 7 | DATA_ROOT = "/home/tony/MyProject/YaYaNLP/data" 8 | 9 | CUSTOM_DICT_NAME = [os.path.join(DATA_ROOT, "dictionary", "custom", f) for f in [ 10 | u"CustomDictionary.txt", 11 | u"上海地名.txt", 12 | u"人名词典.txt", 13 | u"全国地名大全.txt", 14 | u"机构名词典.txt", 15 | u"现代汉语补充词库.txt"]] 16 | 17 | CORE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "CoreNatureDictionary.txt") 18 | CORE_BIGRAM_NAME = os.path.join(DATA_ROOT, "dictionary", "CoreNatureDictionary.ngram.txt") 19 | CORE_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "person", "CoreNatureDictionary.tr.txt") 20 | 21 | 
CHAR_TYPE_PATH = os.path.join(DATA_ROOT, "dictionary", "other", "CharType.dat.yes") 22 | 23 | PERSON_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "person", "nr.tr.txt") 24 | PERSON_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "person", "nr.txt") 25 | 26 | ORG_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "organization", "nt.tr.txt") 27 | ORG_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "organization", "nt.txt") 28 | 29 | PLACE_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "place", "ns.tr.txt") 30 | PLACE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "place", "ns.txt") 31 | 32 | TRADITIONAL_CHINESE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "tc", "TraditionalChinese.txt") 33 | 34 | # Global configuration 35 | class _Config: 36 | # Whether to prefer the cached binary dictionaries 37 | use_dict_cache = True 38 | 39 | # Whether to use the custom user dictionaries 40 | use_custom_dict = True 41 | 42 | # Chinese person name recognition 43 | name_recognize = True 44 | 45 | # Place name recognition 46 | place_recognize = True 47 | 48 | # Organization name recognition 49 | org_recognize = True 50 | 51 | debug = True 52 | 53 | 54 | Config = _Config() 55 | -------------------------------------------------------------------------------- /yaya/const.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | 4 | __author__ = 'tony' 5 | 6 | logger = logging.getLogger("YaYaNLP") 7 | 8 | # Arithmetic constants 9 | DOUBLE_MAX = 1.7976931348623157e+308 10 | 11 | # Predefined tag constants 12 | TAG_PLACE = u"未##地" 13 | TAG_BIGIN = u"始##始" 14 | TAG_OTHER = u"未##它" 15 | TAG_GROUP = u"未##团" 16 | TAG_NUMBER = u"未##数" 17 | TAG_QUANTIFIER = u"未##量" 18 | TAG_PROPER = u"未##专" 19 | TAG_TIME = u"未##时" 20 | TAG_CLUSTER = u"未##串" 21 | TAG_END = u"末##末" 22 | TAG_PEOPLE = u"未##人" 23 | 24 | # Total word frequency of the core dictionary 25 | MAX_FREQUENCY = 25146057 26 | SMOOTHING_FACTOR = 1.0 / MAX_FREQUENCY + 0.00001 27 | SMOOTHING_PARAM = 0.1 28 | -------------------------------------------------------------------------------- /yaya/dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/dictionary/chinese_traditional_dict.py: -------------------------------------------------------------------------------- 1 | from yaya.collection.dict import DoubleArrayTrie 2 | from yaya import config 3 | from yaya.utility.singleton import singleton 4 | 5 | __author__ = 'tony' 6 | 7 | 8 | class ChinseTraditionalBaseDict: 9 | def convert_key_to_value(self, text): 10 | search = self.trie.search(text) 11 | wordnet = [None] * search.array_length 12 | lennet = [0] * search.array_length 13 | for i, k, v in search.search_all_words(): 14 | if len(v[1]) > lennet[i]: 15 | wordnet[i] = v[1] 16 | lennet[i] = len(k) 17 | offset = 0 18 | valuetext = [] 19 | while offset < search.array_length: 20 | if wordnet[offset] is None: 21 | valuetext.append(search.char_array[offset]) 22 | offset += 1 23 | else: 24 | valuetext.append(wordnet[offset]) 25 | offset += lennet[offset] 26 | return "".join(valuetext) 27 | 28 | 29 | @singleton 30 | class SimplifiedChineseDict(ChinseTraditionalBaseDict): 31 | def __init__(self): 32 | self.trie = DoubleArrayTrie.load(config.TRADITIONAL_CHINESE_DICT_NAME, 33 | lambda i: i[i.find(u'=') + 1:], 34 | lambda i: i.split('=')[::-1], 35 | dict_bin_ext=config.DICT_BIN_REVERSE_EXT) 36 | self.trie.get_attr = lambda v: v 37 | 38 | def convert_simplified_to_traditional(self, text): 39 | return self.convert_key_to_value(text) 40 | 41 | 42 | @singleton 43 | class 
TraditionalChineseDict(ChinseTraditionalBaseDict): 44 | def __init__(self): 45 | self.trie = DoubleArrayTrie.load(config.TRADITIONAL_CHINESE_DICT_NAME, 46 | lambda i: i[:i.find(u'=')], 47 | lambda i: i.split('=')) 48 | self.trie.get_attr = lambda v: v 49 | 50 | def convert_traditional_to_simplified(self, text): 51 | return self.convert_key_to_value(text) 52 | -------------------------------------------------------------------------------- /yaya/dictionary/org_dict.py: -------------------------------------------------------------------------------- 1 | from yaya import config 2 | from yaya.collection.dict import DoubleArrayTrie 3 | from yaya.collection.hmm import HMMMatrix 4 | from yaya.common.nt import NTPattern, NT 5 | from yaya.utility.singleton import singleton 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | @singleton 11 | class OrgDict: 12 | def __init__(self): 13 | self.trie = DoubleArrayTrie.load(config.ORG_DICT_NAME, enum_cls=NT) 14 | self.matrix = HMMMatrix.load(config.ORG_TR_PATH, NT) 15 | 16 | 17 | @singleton 18 | class NTPatternDict: 19 | def __init__(self): 20 | self.trie = DoubleArrayTrie() 21 | NTPattern.sort() 22 | self.trie.build(key=NTPattern) 23 | -------------------------------------------------------------------------------- /yaya/dictionary/person_dict.py: -------------------------------------------------------------------------------- 1 | from yaya import config 2 | from yaya.collection.dict import DoubleArrayTrie 3 | from yaya.collection.hmm import HMMMatrix 4 | from yaya.common.nr import NRPattern, NR 5 | from yaya.utility.singleton import singleton 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | @singleton 11 | class PersonDict: 12 | def __init__(self): 13 | self.trie = DoubleArrayTrie.load(config.PERSON_DICT_NAME, enum_cls=NR) 14 | self.matrix = HMMMatrix.load(config.PERSON_TR_PATH, NR) 15 | 16 | 17 | @singleton 18 | class NRPatternDict: 19 | def __init__(self): 20 | self.trie = DoubleArrayTrie() 21 | NRPattern.sort() 22 | self.trie.build(key=NRPattern) -------------------------------------------------------------------------------- /yaya/dictionary/place_dict.py: -------------------------------------------------------------------------------- 1 | from yaya.common.ns import NS, NSPattern 2 | from yaya import config 3 | from yaya.collection.dict import DoubleArrayTrie 4 | from yaya.collection.hmm import HMMMatrix 5 | from yaya.utility.singleton import singleton 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | @singleton 11 | class PlaceDict: 12 | def __init__(self): 13 | self.trie = DoubleArrayTrie.load(config.PLACE_DICT_NAME, enum_cls=NS) 14 | self.matrix = HMMMatrix.load(config.PLACE_TR_PATH, NS) 15 | 16 | 17 | @singleton 18 | class NSPatternDict: 19 | def __init__(self): 20 | self.trie = DoubleArrayTrie() 21 | NSPattern.sort() 22 | self.trie.build(key=NSPattern) 23 | -------------------------------------------------------------------------------- /yaya/recognition/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | 3 | -------------------------------------------------------------------------------- /yaya/recognition/organization_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Attribute, ORG_ATTRIBUTE 3 | from yaya.collection.hmm import OrgTranMatrix 4 | from yaya.common.nature import NATURE 5 | from yaya.common.nt import NT 6 | from yaya.dictionary.org_dict import NTPatternDict, OrgDict 7 | from 
yaya.recognition.recognition import role_viterbi 8 | from yaya.seg.viterbi import viterbi_standard 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | def recognition(vertexs, wordnet_optimum, wordnet_all): 14 | # Tag the organization roles, then run one Viterbi pass over the role tags 15 | return role_viterbi(vertexs, wordnet_optimum, 16 | hmm=OrgTranMatrix().hmm, 17 | trie=NTPatternDict().trie, 18 | recognition_attr=ORG_ATTRIBUTE, 19 | tag_func=role_tag, 20 | viterbi_fun=viterbi_standard 21 | ) 22 | 23 | def role_tag(word_seg_list): 24 | tag_index_list = [] 25 | for vertex in word_seg_list: 26 | nature = vertex.nature 27 | if nature == NATURE.nz: 28 | if vertex.attribute.total_frequency <= 1000: 29 | tag_index_list.append(Attribute([str(NT.F), 1000], cls=NT)) 30 | else: 31 | break 32 | continue 33 | elif nature in [NATURE.ni, 34 | NATURE.nic, 35 | NATURE.nis, 36 | NATURE.nit]: 37 | tag_index_list.append(Attribute([str(NT.K), 1000, str(NT.D), 1000], cls=NT)) 38 | continue 39 | elif nature == NATURE.m: 40 | tag_index_list.append(Attribute([str(NT.M), 1000], cls=NT)) 41 | continue 42 | 43 | index, value = OrgDict().trie.get(vertex.word) 44 | if value is None: 45 | value = Attribute([str(NT.Z), OrgDict().matrix.get_total_freq(NT.Z)], cls=NT) 46 | 47 | tag_index_list.append(value) 48 | 49 | return tag_index_list 50 | --------------------------------------------------------------------------------
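The NT role tags produced by role_tag above are joined into a single string and matched against the pattern trie (presumably built from the long NTPattern list earlier in this dump, which does contain an entry "GPJD"). A minimal illustration of that matching step, with hypothetical tags:

# Illustrative only: role_viterbi (see recognition.py below) concatenates
# one tag per vertex, e.g. tag_str = "GPJDW" for five vertexes, and scans
# it with a Searcher over NTPatternDict().trie. A hit on the pattern
# "GPJD" starting at vertex 0 merges vertexes 0..3 into a single
# organization vertex carrying ORG_ATTRIBUTE.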
/yaya/recognition/person_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Attribute, PERSON_ATTRIBUTE 3 | from yaya.collection.hmm import PersonTranMatrix 4 | from yaya.common.nature import NATURE 5 | from yaya.common.nr import NR 6 | from yaya.dictionary.person_dict import PersonDict, NRPatternDict 7 | from yaya.recognition.recognition import role_viterbi 8 | from yaya.seg.wordnet import Vertex 9 | 10 | __author__ = 'tony' 11 | 12 | def recognition(vertexs, wordnet_optimum, wordnet_all): 13 | return role_viterbi(vertexs, wordnet_optimum, 14 | hmm=PersonTranMatrix().hmm, 15 | trie=NRPatternDict().trie, 16 | recognition_attr=PERSON_ATTRIBUTE, 17 | tag_func=role_tag 18 | ) 19 | 20 | 21 | def role_tag(word_seg_list): 22 | tag_index_list = [] 23 | for vertex in word_seg_list: 24 | if vertex.nature == NATURE.nr and vertex.attribute.total_frequency <= 1000: 25 | if len(vertex.real_word) == 2: 26 | tag_index_list.append(Attribute(attr=(NR.X, 1, NR.G, 1), cls=NR)) 27 | continue 28 | 29 | index, value = PersonDict().trie.get(vertex.real_word) 30 | 31 | if value is None: 32 | value = Attribute([str(NR.A), PersonDict().matrix.get_total_freq(NR.A)], cls=NR) 33 | 34 | tag_index_list.append(value) 35 | return tag_index_list 36 | 37 | 38 | def parse_pattern(tag_str, vertexs, wordnet_optimum, wordnet_all): 39 | new_tag_list = [] 40 | new_vertexs = [] 41 | for i, t in enumerate(tag_str): 42 | if t == str(NR.U): 43 | new_tag_list.append(str(NR.K)) 44 | new_tag_list.append(str(NR.B)) 45 | word_K = vertexs[i].real_word[:-1] 46 | word_B = vertexs[i].real_word[-1] 47 | new_vertexs.append(Vertex(word_K)) 48 | new_vertexs.append(Vertex(word_B)) 49 | elif t == str(NR.V): 50 | if tag_str[i - 1] == str(NR.B): 51 | new_tag_list.append(str(NR.E)) 52 | else: 53 | new_tag_list.append(str(NR.D)) 54 | new_tag_list.append(str(NR.L)) 55 | word_ED = vertexs[i].real_word[:-1] 56 | word_L = vertexs[i].real_word[-1] 57 | new_vertexs.append(Vertex(word_ED)) 58 | new_vertexs.append(Vertex(word_L)) 59 | else: 60 | new_tag_list.append(t) 61 | new_vertexs.append(vertexs[i]) 62 | return "".join(new_tag_list), new_vertexs 63 | --------------------------------------------------------------------------------
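A worked illustration of the U/V rewriting in parse_pattern above (the tags and the word are hypothetical):

# Given tag_str = "KUL" where vertexs[1].real_word == u"XY" is tagged U
# (a K role and a B role fused inside one word), parse_pattern returns
# tag_str = "KKBL": u"XY" is split into Vertex(u"X") tagged K and
# Vertex(u"Y") tagged B, so an NR pattern can now match at the boundary
# inside the original word. The V branch splits an E/D role plus an L
# role the same way.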
/yaya/recognition/place_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Attribute, PLACE_ATTRIBUTE 3 | from yaya.collection.hmm import PlaceTranMatrix 4 | from yaya.common.nature import NATURE 5 | from yaya.common.ns import NS 6 | from yaya.dictionary.place_dict import NSPatternDict, PlaceDict 7 | from yaya.recognition.recognition import role_viterbi 8 | 9 | __author__ = 'tony' 10 | 11 | 12 | def recognition(vertexs, wordnet_optimum, wordnet_all): 13 | return role_viterbi(vertexs, wordnet_optimum, 14 | hmm=PlaceTranMatrix().hmm, 15 | trie=NSPatternDict().trie, 16 | recognition_attr=PLACE_ATTRIBUTE, 17 | tag_func=role_tag 18 | ) 19 | 20 | 21 | def role_tag(word_seg_list): 22 | tag_index_list = [] 23 | for vertex in word_seg_list: 24 | if vertex.nature == NATURE.ns and vertex.attribute.total_frequency <= 1000: 25 | if len(vertex.real_word) < 3: 26 | tag_index_list.append(Attribute("%s 1 %s 1" % (NS.H, NS.G), NS)) 27 | continue 28 | index, value = PlaceDict().trie.get(vertex.real_word) 29 | if value is None: 30 | value = Attribute([str(NS.Z), PlaceDict().matrix.get_total_freq(NS.Z)], cls=NS) 31 | tag_index_list.append(value) 32 | return tag_index_list 33 | -------------------------------------------------------------------------------- /yaya/recognition/recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Searcher 3 | from yaya.seg.viterbi import viterbi, viterbi_template 4 | from yaya.seg.wordnet import Vertex 5 | from yaya.config import Config 6 | __author__ = 'tony' 7 | 8 | 9 | def role_viterbi(vertexs, wordnet_optimum, hmm, trie, recognition_attr, tag_func, viterbi_fun=viterbi_template): 10 | tag_list = tag_func(vertexs) 11 | if Config.debug: 12 | sb = [] 13 | for i, tag in enumerate(tag_list): 14 | sb.append(u"[ %s %s ]" % (vertexs[i].real_word, tag)) 15 | print u"role observations: %s" % u"".join(sb) 16 | 17 | tag_list = viterbi_fun(tag_list, hmm) 18 | if Config.debug: 19 | sb = [] 20 | for i, tag in enumerate(tag_list): 21 | sb.append(u"%s/%s" % (vertexs[i].real_word, tag)) 22 | print(u"role tagging: [%s]" % u", ".join(sb)) 23 | 24 | tag_str = [str(x) for x in tag_list] 25 | tag_str = ''.join(tag_str) 26 | search = Searcher(trie, tag_str) 27 | vertexs_offset = [0] * len(vertexs) 28 | offset = 1 29 | # skip the head and tail sentinel vertexes 30 | for i, v in enumerate(vertexs[1:-1]): 31 | vertexs_offset[i + 1] = offset 32 | offset += len(vertexs[i + 1].real_word) 33 | while search.next(): 34 | name_str = "" 35 | for i in range(search.begin, search.begin + len(search.key)): 36 | name_str += vertexs[i].real_word 37 | 38 | # insert the merged entity into the word net 39 | vertex = Vertex(name_str, attribute=recognition_attr) 40 | wordnet_optimum.add(vertexs_offset[search.begin], vertex) 41 | vertexs = viterbi(wordnet_optimum.vertexs) 42 | return vertexs 43 | --------------------------------------------------------------------------------
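A worked example of the offset bookkeeping in role_viterbi (the vertexes are hypothetical): with real words [u"始##始", u"北京", u"大学", u"末##末"], the loop over vertexs[1:-1] produces vertexs_offset == [0, 1, 3, 0]:

# offset starts at 1 because line 0 of a WordNet holds the head sentinel;
# u"北京" begins at line 1, and u"大学" at line 1 + len(u"北京") == 3.
# A pattern hit with search.begin == 2 therefore adds the merged vertex
# at line vertexs_offset[2] == 3 of wordnet_optimum.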
/yaya/seg/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/seg/segment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.dictionary.chinese_traditional_dict import SimplifiedChineseDict, TraditionalChineseDict 3 | from yaya.recognition import place_recognition 4 | from yaya.config import Config 5 | from yaya.recognition import person_recognition 6 | from yaya.recognition import organization_recognition 7 | from yaya.seg.viterbi import viterbi 8 | from yaya.seg.wordnet import WordNet, gen_word_net, combine_by_custom_dict 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | def vertexs_to_terms(vertexs, word_only=False): 14 | terms = [] 15 | offset = 0 16 | if word_only: 17 | terms = [v.real_word for v in vertexs] 18 | else: 19 | for v in vertexs[1:-1]: 20 | terms.append((v.real_word, str(v.nature), offset)) 21 | offset += len(v.real_word) 22 | return terms 23 | 24 | 25 | def seg_to_vertexs(text): 26 | word_net = WordNet(text) 27 | 28 | # build the coarse word net 29 | gen_word_net(text, word_net) 30 | 31 | if Config.debug: 32 | print(u"coarse word net:") 33 | print(unicode(word_net)) 34 | 35 | # first Viterbi pass 36 | vertexs = viterbi(word_net.vertexs) 37 | if Config.use_custom_dict: 38 | vertexs = combine_by_custom_dict(vertexs) 39 | word_net_optimum = WordNet(text, vertexs=vertexs) 40 | 41 | if Config.name_recognize: 42 | person_recognition.recognition(vertexs, word_net_optimum, word_net) 43 | 44 | if Config.place_recognize: 45 | place_recognition.recognition(vertexs, word_net_optimum, word_net) 46 | 47 | if Config.debug: 48 | print(u"word net after person and place recognition:") 49 | print(unicode(word_net_optimum)) 50 | 51 | vertexs = viterbi(word_net_optimum.vertexs) 52 | 53 | if Config.org_recognize: 54 | word_net_optimum = WordNet(text, vertexs=vertexs) 55 | vertexs = organization_recognition.recognition(vertexs, word_net_optimum, word_net) 56 | 57 | if Config.debug: 58 | print(u"word net after organization recognition:") 59 | print(unicode(word_net_optimum)) 60 | return vertexs 61 | 62 | 63 | def seg(text): 64 | return vertexs_to_terms(seg_to_vertexs(text)) 65 | 66 | 67 | def traditional_seg(text): 68 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text) 69 | return seg(simplified) 70 | 71 | def simplified_to_traditional(text): 72 | return SimplifiedChineseDict().convert_simplified_to_traditional(text) 73 | 74 | 75 | def traditional_to_simplified(text): 76 | return TraditionalChineseDict().convert_traditional_to_simplified(text) 77 | --------------------------------------------------------------------------------
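A quick smoke test of the pipeline above, in the spirit of demo/demo_segment.py (the sample sentence is arbitrary, and the printed natures depend on the dictionaries under config.DATA_ROOT):

# coding=utf-8
from yaya.seg.segment import seg

if __name__ == '__main__':
    # seg() returns (word, nature, offset) triples, per vertexs_to_terms
    for word, nature, offset in seg(u"我们去北京大学"):
        print word, nature, offset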
/yaya/seg/viterbi.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | from __future__ import unicode_literals 3 | import math 4 | 5 | from yaya.const import DOUBLE_MAX 6 | from yaya.config import Config 7 | 8 | __author__ = 'tony' 9 | 10 | 11 | class Viterbi: 12 | @staticmethod 13 | def computer(obs, states, start_p, trans_p, emit_p): 14 | # all scores are costs (negative log probabilities), so smaller is better 15 | max_states_value = 0 16 | for s in states: 17 | max_states_value = max(max_states_value, s) 18 | max_states_value += 1 19 | 20 | # V[t][y] is the minimal cost of any path ending in state y at time t 21 | V = [[DOUBLE_MAX for col in range(max_states_value)] for row in range(len(obs))] 22 | path = {} 23 | 24 | for y in states: 25 | V[0][y] = start_p[y] + emit_p[y][obs[0]] 26 | path[y] = [y] 27 | 28 | for t in range(1, len(obs)): 29 | new_path = {} 30 | for y in states: 31 | prob = DOUBLE_MAX 32 | state = y 33 | for y0 in states: 34 | nprob = V[t - 1][y0] + trans_p[y0][y] + emit_p[y][obs[t]] 35 | if nprob < prob: 36 | prob = nprob 37 | state = y0 38 | V[t][y] = prob 39 | new_path[y] = path[state] + [y] 40 | path = new_path 41 | 42 | prob = DOUBLE_MAX 43 | state = states[0] 44 | for y in states: 45 | if V[-1][y] < prob: 46 | prob = V[-1][y] 47 | state = y 48 | return path[state] 49 | 50 | 51 | def viterbi(vertexs): 52 | for v in vertexs[1]: 53 | v.update_from(vertexs[0][0]) 54 | for i in range(1, len(vertexs) - 1): 55 | node_array = vertexs[i] 56 | if node_array is None: 57 | continue 58 | for node in node_array: 59 | if node.vertex_from is None: 60 | continue 61 | for node_to in vertexs[i + len(node.real_word)]: 62 | node_to.update_from(node) 63 | vertex_from = vertexs[-1][0] 64 | vertex_list = [] 65 | while vertex_from is not None: 66 | vertex_list.insert(0, vertex_from) 67 | vertex_from = vertex_from.vertex_from 68 | return vertex_list 69 | 70 | def viterbi_roletag(roletaglist, hmm): 71 | _length = len(roletaglist) 72 | taglist = [] 73 | # seed with the nature of the first element's first tag 74 | _pre_nature = roletaglist[0].nature 75 | _perfect_nature = _pre_nature 76 | taglist.append(_pre_nature) 77 | for i in xrange(1, _length): 78 | perfect_cost = DOUBLE_MAX 79 | item = roletaglist[i] 80 | for _, nature, freq in item.natures: 81 | _now = hmm.trans_prob[_pre_nature.index][nature.index] - math.log((item.get_nature_frequency(nature)+1e-8) / hmm.get_total_freq(nature)) 82 | if perfect_cost > _now: 83 | perfect_cost = _now 84 | _perfect_nature = nature 85 | _pre_nature = _perfect_nature 86 | taglist.append(_pre_nature) 87 | return taglist 88 | 89 | def viterbi_template(node_list, hmm, init_cost=DOUBLE_MAX): 90 | node_count = len(node_list) 91 | taglist = [] 92 | # seed with the nature of the first element's first tag 93 | _pre_nature = node_list[0].nature 94 | _perfect_nature = _pre_nature 95 | taglist.append(_pre_nature) 96 | for i, cur_node in enumerate(node_list[1:]): 97 | perfect_cost = init_cost 98 | for j, vertex, freq in cur_node.natures: 99 | _now = hmm.trans_prob[_pre_nature.index][vertex.index] - math.log( 100 | (cur_node.get_nature_frequency(vertex) + 1e-8) / hmm.get_total_freq(vertex)) 101 | if perfect_cost > _now: 102 | perfect_cost = _now 103 | _perfect_nature = vertex 104 | _pre_nature = _perfect_nature 105 | taglist.append(_pre_nature) 106 | return taglist 107 | 108 | 109 | def viterbi_standard(node_list, hmm, init_cost=DOUBLE_MAX): 110 | node_count = len(node_list) 111 | taglist = [] 112 | # seed with the nature of the first element's first tag 113 | route_cost = [] 114 | _pre_nature = node_list[0].nature 115 | _perfect_nature = _pre_nature 116 | taglist.append(_pre_nature) 117 | 118 | # costs for the second element 119 | current_line = node_list[1] 120 | for i, vertex, freq in current_line.natures: 121 | _now = hmm.trans_prob[_pre_nature.index][vertex.index] - math.log( 122 | (current_line.get_nature_frequency(vertex) + 1e-8) / hmm.get_total_freq(vertex)) 123 | route_cost.append(_now) 124 | pre_line = current_line 125 | 126 | # elements from the third one onward 127 | for i, current_line in enumerate(node_list[2:]): 128 | new_route_cost = [] 129 | perfect_pre_nature = None 130 | perfect_cost = init_cost 131 | for k, cur_nature, cur_freq in current_line.natures: 132 | new_route_cost.append(init_cost) 133 | for j, pre_nature, pre_freq in pre_line.natures: 134 | assert j < len(route_cost) 135 | 136 | _now = route_cost[j] + hmm.trans_prob[pre_nature.index][cur_nature.index] - math.log( 137 | (current_line.get_nature_frequency(cur_nature) + 1e-8) / hmm.get_total_freq(cur_nature)) 138 | 139 | if new_route_cost[k] > _now: 140 | new_route_cost[k] = _now 141 | if perfect_cost > _now: 142 | perfect_cost = _now 143 | perfect_pre_nature = pre_nature 144 | 145 | pre_line = current_line 146 | route_cost = new_route_cost 147 | if Config.debug: 148 | print new_route_cost 149 | taglist.append(perfect_pre_nature) 150 | taglist.append(cur_nature) 151 | return taglist 152 | 153 | --------------------------------------------------------------------------------
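A toy check for Viterbi.computer above; every number is an illustrative cost (a negative log probability), so smaller is better:

from yaya.seg.viterbi import Viterbi

obs = [0, 1, 0]
states = [0, 1]
start_p = [0.7, 1.2]
trans_p = [[0.5, 1.6], [1.4, 0.6]]
emit_p = [[0.4, 1.8], [1.5, 0.5]]
# the cheapest path stays in state 0 throughout
print Viterbi.computer(obs, states, start_p, trans_p, emit_p)  # [0, 0, 0]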

--------------------------------------------------------------------------------
/yaya/seg/wordnet.py:
--------------------------------------------------------------------------------
# coding=utf-8
from __future__ import absolute_import
import math
import copy

from yaya.collection.dict import *
from yaya.common.nature import NATURE
from yaya.utility.chartype import *
from yaya.collection.bigram import CORE_BIG_RAM_TABLE
from yaya.const import *

__author__ = 'tony'


class AtomNode:
    def __init__(self, word, pos):
        self.word = word
        self.pos = pos

    def __str__(self):
        return "AtomNode{ word='%s', nature='%s' }" % (self.word, self.pos)


class Vertex:
    def __init__(self, real_word, *args, **kwargs):
        if 'attribute' in kwargs:
            attribute = kwargs.get('attribute')
        else:
            index, attribute = CoreDict().trie.get(real_word)
        self.attribute = attribute if isinstance(attribute, Attribute) else Attribute(attribute)

        self.word_id = kwargs.get('word_id', -1)
        self.real_word = real_word
        word = kwargs.get('word', None)
        self.word = word if word is not None else self.compile_real_word(self.real_word, self.attribute)
        self.vertex_from = None
        self.weight = 0

    def __unicode__(self):
        return u"%s/%s" % (self.real_word, self.word)

    def __repr__(self):
        return u"Vertex(%(real_word)r, %(attribute)r )" % vars(self)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.real_word == other.real_word and self.nature == other.nature

    @property
    def nature(self):
        return self.attribute.nature

    @nature.setter
    def nature(self, value):
        self.attribute.nature = value

    def update_from(self, vertex_from):
        # Relax this vertex: keep whichever predecessor yields the smaller
        # accumulated weight
        weight = vertex_from.weight + Vertex.calc_weight(vertex_from, self)
        if self.vertex_from is None or self.weight > weight:
            self.vertex_from = vertex_from
            self.weight = weight

    @staticmethod
    def calc_weight(vertex_p, vertex_n):
        freq = vertex_p.attribute.total_frequency
        if freq == 0:
            freq = 1
        two_word_freq = CORE_BIG_RAM_TABLE.table.get_bifreq(vertex_p.word_id, vertex_n.word_id)
        value = -math.log(SMOOTHING_PARAM * freq / MAX_FREQUENCY + (1 - SMOOTHING_PARAM) *
                          ((1 - SMOOTHING_FACTOR) * two_word_freq / freq + SMOOTHING_FACTOR))
        if value < 0:
            value = -value
        return value

    def compile_real_word(self, real_word, attribute):
        if len(attribute) >= 1:
            if attribute.nature in [NATURE.nr,
                                    NATURE.nr1,
                                    NATURE.nr2,
                                    NATURE.nrf,
                                    NATURE.nrj]:
                self.word_id = PERSON_WORD_ID
                return TAG_PEOPLE
            elif attribute.nature in [NATURE.ns, NATURE.nsf]:
                self.word_id = PLACE_WORD_ID
                return TAG_PLACE
            elif attribute.nature in [NATURE.nz, NATURE.nx]:
                self.word_id = PROPER_WORD_ID
                return TAG_PROPER
            elif attribute.nature in [NATURE.nt,
                                      NATURE.ntc,
                                      NATURE.ntcf,
                                      NATURE.ntcb,
                                      NATURE.ntch,
                                      NATURE.nto,
                                      NATURE.ntu,
                                      NATURE.nts,
                                      NATURE.nth,
                                      NATURE.nit]:
                self.word_id = PLACE_WORD_ID
                return TAG_GROUP
            elif attribute.nature in [NATURE.m, NATURE.mq]:
                self.word_id = NUMBER_WORD_ID
                return TAG_NUMBER
            elif attribute.nature == NATURE.x:
                self.word_id = CLUSTER_WORD_ID
                return TAG_CLUSTER
            elif attribute.nature in [NATURE.t]:
                self.word_id = TIME_WORD_ID
                return TAG_TIME
        return real_word
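

# calc_weight in symbols (restating the expression above): for an edge from
# vertex A to vertex B,
#
#   weight(A, B) = -log( lam * f(A) / MAX_FREQUENCY
#                        + (1 - lam) * ((1 - mu) * f(A, B) / f(A) + mu) )
#
# where f(A) is A's unigram frequency, f(A, B) the bigram frequency from
# CORE_BIG_RAM_TABLE, and lam/mu stand for SMOOTHING_PARAM and
# SMOOTHING_FACTOR in yaya.const. Lower weight means a more plausible
# transition, which is why viterbi() keeps the minimal accumulated weight.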


def atom_seg(text, begin, end):
    node_list = []
    offset = begin
    pre_type = get(text[offset])
    offset += 1
    while offset < end:
        cur_type = get(text[offset])
        if cur_type != pre_type:
            # Handle floating point numbers: after a '.', keep consuming digits
            if text[offset] == '.' and pre_type == CT_NUM:
                offset += 1
                while offset < end:
                    cur_type = get(text[offset])
                    if cur_type != CT_NUM:
                        break
                    else:
                        offset += 1
            node_list.append(AtomNode(text[begin:offset], pre_type))
            begin = offset
            pre_type = cur_type
        offset += 1

    if offset == end:
        node_list.append(AtomNode(text[begin:offset], pre_type))

    return node_list


def combine_by_custom_dict(vertexs, dat=CustomDict().trie):
    for i in range(len(vertexs)):
        state = 1
        if vertexs[i] is None:
            continue
        state = dat.transition(vertexs[i].real_word, state)
        value = None
        if state > 0:
            start = i
            to = i + 1
            end = -1
            for to in range(to, len(vertexs)):
                state = dat.transition(vertexs[to].real_word, state)
                if state < 0:
                    break
                output = dat.output(state)
                if output is not None:
                    value = output
                    end = to + 1

        if value is not None:
            word = ""
            for j in range(start, end):
                word += vertexs[j].real_word
                vertexs[j] = None
            vertexs[i] = Vertex(real_word=word, attribute=value)

    # todo: consider supporting a dynamic user dictionary
    return [v for v in vertexs if v is not None]


def dump_vertexs(vertexs):
    logger.info("=" * 30)
    for i, v in enumerate(vertexs):
        logger.info("[%d] %s %s %s" % (i, v.real_word, v.word, v.nature))


class WordNet:
    def __init__(self, text=None, vertexs=None):
        self.vertexs = [[] for i in range(len(text) + 2)]
        self.size = 2
        if vertexs is not None:
            i = 1
            for v in vertexs[1:-1]:
                v.vertex_from = None
                self.vertexs[i] = [v]
                i += len(v.real_word)
            self.vertexs[0] = [vertexs[0]]
            self.vertexs[-1] = [vertexs[-1]]
        else:
            self.vertexs[0] = [new_tag_vertex(TAG_BIGIN)]
            self.vertexs[-1] = [new_tag_vertex(TAG_END)]

    def get_first(self, line):
        if len(self.vertexs[line]) > 0:
            return self.vertexs[line][0]
        else:
            return None

    def get(self, line, word_length=None):
        if word_length is None:
            return self.vertexs[line]
        for v in self.vertexs[line]:
            if len(v.real_word) == word_length:
                return v
        return None

    def add(self, line, vertex):
        for v in self.vertexs[line]:
            if len(v.real_word) == len(vertex.real_word):
                return
        if len(self.vertexs[line]) == 0:
            self.vertexs[line] = [vertex]
        else:
            self.vertexs[line].append(vertex)
        self.size += 1

    def insert(self, line, vertex, word_net):
        self.add(line, vertex)
        # Keep the lattice connected to the left of the new vertex ...
        for l in range(line - 1, 1, -1):
            if self.get(l, 1) is None:
                first = word_net.get_first(l)
                if first is None:
                    return
                self.vertexs[l].append(copy.deepcopy(first))
                self.size += 1
                if len(self.vertexs[l]) > 1:
                    break
            else:
                break
        # ... and to the right
        l = line + len(vertex.real_word)
        if len(self.get(l)) == 0:
            target_line = word_net.get(l)
            if target_line is None or len(target_line) == 0:
                return
            self.vertexs[l] = copy.deepcopy(target_line)
            self.size += len(self.vertexs[l])

        for l in range(l, len(self.vertexs)):
            if len(self.get(l)) == 0:
                first = word_net.get_first(l)
                if first is None:
                    break
                self.vertexs[l].append(copy.deepcopy(first))
                self.size += 1
                if len(self.vertexs[l]) > 1:
                    break
            else:
                break
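
    # Row convention used throughout this class (a sketch inferred from the
    # code above, not an authoritative spec): for a text of n characters,
    # self.vertexs has n + 2 rows. Row 0 holds the begin tag, row i holds
    # the vertices whose real_word starts at character i - 1, and row n + 1
    # holds the end tag. E.g. for the two-character text "AB" read as one word:
    #
    #   row 0: [begin-tag vertex]
    #   row 1: [Vertex("AB")]     # covers characters 0..1
    #   row 2: []                 # no word starts at character 1
    #   row 3: [end-tag vertex]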

    def add_atoms(self, line, atom_list):
        offset = 0
        for atom_node in atom_list:
            word = atom_node.word
            nature = NATURE.n
            if atom_node.pos in [CT_INDEX, CT_NUM]:
                nature = NATURE.m
                word = TAG_NUMBER
            elif atom_node.pos in [CT_DELIMITER]:
                nature = NATURE.w
            elif atom_node.pos in [CT_LETTER, CT_SINGLE]:
                nature = NATURE.nx
                word = TAG_CLUSTER
            self.add(line + offset, Vertex(word=word,
                                           real_word=atom_node.word,
                                           attribute=Attribute([str(nature), '1']),
                                           word_id=-1))
            # Advance past this atom so the next one lands on its own row
            offset += len(atom_node.word)

    def __len__(self):
        return len(self.vertexs)

    def __unicode__(self):
        sb = []
        sb.append("=" * 30)
        for i, vl in enumerate(self.vertexs):
            sb.append(u"[%d]:[%s]" % (i, u",".join([v.real_word for v in vl])))
        sb.append("=" * 30)
        return u"\n".join(sb)


def gen_word_net(text, word_net, dat=CoreDict().trie):
    # First add every core-dictionary word found in the text
    searcher = dat.buildcoredictsearcher(text)
    while searcher.next():
        word_net.add(searcher.begin + 1, Vertex(real_word=searcher.key,
                                                attribute=searcher.value,
                                                word_id=searcher.index))
    # Then fill the empty rows with atomic nodes. A while loop is required:
    # the index must jump past multi-character words, and reassigning the
    # variable of a "for i in range(...)" loop has no effect in Python.
    i = 0
    while i < len(word_net.vertexs):
        if len(word_net.vertexs[i]) == 0:
            j = i + 1
            for j in range(i + 1, len(word_net.vertexs) - 1):
                if len(word_net.vertexs[j]) != 0:
                    break
            word_net.add_atoms(i, atom_seg(text, i - 1, j - 1))
            i = j
        else:
            i += len(word_net.vertexs[i][-1].real_word)


def new_tag_vertex(tag):
    word_id, attribute = CoreDict().trie.get(tag)
    if word_id > 0:
        vertex = Vertex(chr(32), attribute=attribute, word=tag, word_id=word_id)
        return vertex
    else:
        logger.error(u"从核心字典加载%s信息时出错", tag)
        import sys
        sys.exit(-1)
--------------------------------------------------------------------------------
/yaya/utility/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'tony'
--------------------------------------------------------------------------------
/yaya/utility/bytearray.py:
--------------------------------------------------------------------------------
import struct
from io import FileIO

__author__ = 'tony'


class ByteArray:
    @staticmethod
    def load_from_file(filename):
        f = FileIO(filename, 'rb')
        data = f.readall()
        f.close()
        return ByteArray(data)

    def __init__(self, data):
        self.data = data
        self.offset = 0

    def has_more(self):
        return self.offset < len(self.data)

    def next_ushort(self):
        # '!H': big-endian unsigned short, so values above 0x7FFF stay positive
        data = struct.unpack_from('!H', self.data, self.offset)
        self.offset += 2
        return data[0]

    def next_uchar(self):
        data = struct.unpack_from('!B', self.data, self.offset)
        self.offset += 1
        return data[0]
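

# A minimal usage sketch (the file name is illustrative). The char-type
# table loader in yaya/utility/chartype.py consumes records of exactly this
# shape: two unsigned shorts (range begin/end) followed by one unsigned
# char (the type code):
#
#   ba = ByteArray.load_from_file('data/chartype.bin')
#   while ba.has_more():
#       begin, end, type_code = ba.next_ushort(), ba.next_ushort(), ba.next_uchar()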
--------------------------------------------------------------------------------
/yaya/utility/chartype.py:
--------------------------------------------------------------------------------
# coding=utf-8
import time

from yaya import config
from yaya.const import logger
from yaya.utility.bytearray import ByteArray

__author__ = 'tony'

CT_SINGLE = 5                 # single-byte character
CT_DELIMITER = CT_SINGLE + 1  # delimiter: "!,.?()[]{}+=
CT_CHINESE = CT_SINGLE + 2    # Chinese character
CT_LETTER = CT_SINGLE + 3     # letter
CT_NUM = CT_SINGLE + 4        # digit
CT_INDEX = CT_SINGLE + 5      # ordinal/index character
CT_OTHER = CT_SINGLE + 12     # other

# One type code per UTF-16 code point; filled in by __init__() below
char_type = [0] * 65536


def __init__():
    logger.info("字符类型对应表开始加载 %s", config.CHAR_TYPE_PATH)
    start = time.time()
    byte_array = ByteArray.load_from_file(config.CHAR_TYPE_PATH)
    if byte_array is None:
        import sys
        logger.error("字符类型对应表加载失败:" + config.CHAR_TYPE_PATH)
        sys.exit(-1)
    else:
        while byte_array.has_more():
            b = byte_array.next_ushort()
            e = byte_array.next_ushort()
            t = byte_array.next_uchar()
            for i in range(b, e + 1):
                char_type[i] = t
        logger.info("字符类型对应表加载成功,耗时 %s s", (time.time() - start))


def get(c):
    # Accept either a single character or an integer code point
    if type(c) is not int:
        return char_type[ord(c)]
    else:
        return char_type[c]


__init__()
--------------------------------------------------------------------------------
/yaya/utility/persistence.py:
--------------------------------------------------------------------------------
__author__ = 'tony'
--------------------------------------------------------------------------------
/yaya/utility/singleton.py:
--------------------------------------------------------------------------------
__author__ = 'tony'


def singleton(class_):
    instances = {}

    def get_instance(*args, **kwargs):
        if class_ not in instances:
            instances[class_] = class_(*args, **kwargs)
        return instances[class_]

    return get_instance
--------------------------------------------------------------------------------