├── .gitignore
├── LICENSE
├── README.md
├── demo
│   └── demo_segment.py
├── setup.py
├── test
│   ├── __init__.py
│   ├── data
│   │   ├── test.ngram.txt
│   │   └── test.txt
│   ├── test_bigramtable.py
│   ├── test_dict.py
│   ├── test_enum.py
│   ├── test_hmm.py
│   ├── test_organization_recognition.py
│   ├── test_person_recognition.py
│   ├── test_place_recognition.py
│   ├── test_role_tag.py
│   ├── test_segment.py
│   ├── test_traditionalChineseDict.py
│   ├── test_trie.py
│   ├── test_viterbi_segment.py
│   └── test_wordnet.py
└── yaya
    ├── __init__.py
    ├── collection
    │   ├── __init__.py
    │   ├── bigram.py
    │   ├── dict.py
    │   ├── hmm.py
    │   └── trie.py
    ├── common
    │   ├── __init__.py
    │   ├── enum.py
    │   ├── nature.py
    │   ├── nr.py
    │   ├── ns.py
    │   └── nt.py
    ├── config.py
    ├── const.py
    ├── dictionary
    │   ├── __init__.py
    │   ├── chinese_traditional_dict.py
    │   ├── org_dict.py
    │   ├── person_dict.py
    │   └── place_dict.py
    ├── recognition
    │   ├── __init__.py
    │   ├── organization_recognition.py
    │   ├── person_recognition.py
    │   ├── place_recognition.py
    │   └── recognition.py
    ├── seg
    │   ├── __init__.py
    │   ├── segment.py
    │   ├── viterbi.py
    │   └── wordnet.py
    └── utility
        ├── __init__.py
        ├── bytearray.py
        ├── chartype.py
        ├── persistence.py
        └── singleton.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.bin
2 | *.ya
3 | *.dat
4 | *.pyc
5 | /.idea
6 | /data
7 | /.project
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction, and
10 | distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
13 | owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all other entities
16 | that control, are controlled by, or are under common control with that entity.
17 | For the purposes of this definition, "control" means (i) the power, direct or
18 | indirect, to cause the direction or management of such entity, whether by
19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
20 | outstanding shares, or (iii) beneficial ownership of such entity.
21 |
22 | "You" (or "Your") shall mean an individual or Legal Entity exercising
23 | permissions granted by this License.
24 |
25 | "Source" form shall mean the preferred form for making modifications, including
26 | but not limited to software source code, documentation source, and configuration
27 | files.
28 |
29 | "Object" form shall mean any form resulting from mechanical transformation or
30 | translation of a Source form, including but not limited to compiled object code,
31 | generated documentation, and conversions to other media types.
32 |
33 | "Work" shall mean the work of authorship, whether in Source or Object form, made
34 | available under the License, as indicated by a copyright notice that is included
35 | in or attached to the work (an example is provided in the Appendix below).
36 |
37 | "Derivative Works" shall mean any work, whether in Source or Object form, that
38 | is based on (or derived from) the Work and for which the editorial revisions,
39 | annotations, elaborations, or other modifications represent, as a whole, an
40 | original work of authorship. For the purposes of this License, Derivative Works
41 | shall not include works that remain separable from, or merely link (or bind by
42 | name) to the interfaces of, the Work and Derivative Works thereof.
43 |
44 | "Contribution" shall mean any work of authorship, including the original version
45 | of the Work and any modifications or additions to that Work or Derivative Works
46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work
47 | by the copyright owner or by an individual or Legal Entity authorized to submit
48 | on behalf of the copyright owner. For the purposes of this definition,
49 | "submitted" means any form of electronic, verbal, or written communication sent
50 | to the Licensor or its representatives, including but not limited to
51 | communication on electronic mailing lists, source code control systems, and
52 | issue tracking systems that are managed by, or on behalf of, the Licensor for
53 | the purpose of discussing and improving the Work, but excluding communication
54 | that is conspicuously marked or otherwise designated in writing by the copyright
55 | owner as "Not a Contribution."
56 |
57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf
58 | of whom a Contribution has been received by Licensor and subsequently
59 | incorporated within the Work.
60 |
61 | 2. Grant of Copyright License.
62 |
63 | Subject to the terms and conditions of this License, each Contributor hereby
64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
65 | irrevocable copyright license to reproduce, prepare Derivative Works of,
66 | publicly display, publicly perform, sublicense, and distribute the Work and such
67 | Derivative Works in Source or Object form.
68 |
69 | 3. Grant of Patent License.
70 |
71 | Subject to the terms and conditions of this License, each Contributor hereby
72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
73 | irrevocable (except as stated in this section) patent license to make, have
74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where
75 | such license applies only to those patent claims licensable by such Contributor
76 | that are necessarily infringed by their Contribution(s) alone or by combination
77 | of their Contribution(s) with the Work to which such Contribution(s) was
78 | submitted. If You institute patent litigation against any entity (including a
79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
80 | Contribution incorporated within the Work constitutes direct or contributory
81 | patent infringement, then any patent licenses granted to You under this License
82 | for that Work shall terminate as of the date such litigation is filed.
83 |
84 | 4. Redistribution.
85 |
86 | You may reproduce and distribute copies of the Work or Derivative Works thereof
87 | in any medium, with or without modifications, and in Source or Object form,
88 | provided that You meet the following conditions:
89 |
90 | You must give any other recipients of the Work or Derivative Works a copy of
91 | this License; and
92 | You must cause any modified files to carry prominent notices stating that You
93 | changed the files; and
94 | You must retain, in the Source form of any Derivative Works that You distribute,
95 | all copyright, patent, trademark, and attribution notices from the Source form
96 | of the Work, excluding those notices that do not pertain to any part of the
97 | Derivative Works; and
98 | If the Work includes a "NOTICE" text file as part of its distribution, then any
99 | Derivative Works that You distribute must include a readable copy of the
100 | attribution notices contained within such NOTICE file, excluding those notices
101 | that do not pertain to any part of the Derivative Works, in at least one of the
102 | following places: within a NOTICE text file distributed as part of the
103 | Derivative Works; within the Source form or documentation, if provided along
104 | with the Derivative Works; or, within a display generated by the Derivative
105 | Works, if and wherever such third-party notices normally appear. The contents of
106 | the NOTICE file are for informational purposes only and do not modify the
107 | License. You may add Your own attribution notices within Derivative Works that
108 | You distribute, alongside or as an addendum to the NOTICE text from the Work,
109 | provided that such additional attribution notices cannot be construed as
110 | modifying the License.
111 | You may add Your own copyright statement to Your modifications and may provide
112 | additional or different license terms and conditions for use, reproduction, or
113 | distribution of Your modifications, or for any such Derivative Works as a whole,
114 | provided Your use, reproduction, and distribution of the Work otherwise complies
115 | with the conditions stated in this License.
116 |
117 | 5. Submission of Contributions.
118 |
119 | Unless You explicitly state otherwise, any Contribution intentionally submitted
120 | for inclusion in the Work by You to the Licensor shall be under the terms and
121 | conditions of this License, without any additional terms or conditions.
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of
123 | any separate license agreement you may have executed with Licensor regarding
124 | such Contributions.
125 |
126 | 6. Trademarks.
127 |
128 | This License does not grant permission to use the trade names, trademarks,
129 | service marks, or product names of the Licensor, except as required for
130 | reasonable and customary use in describing the origin of the Work and
131 | reproducing the content of the NOTICE file.
132 |
133 | 7. Disclaimer of Warranty.
134 |
135 | Unless required by applicable law or agreed to in writing, Licensor provides the
136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
138 | including, without limitation, any warranties or conditions of TITLE,
139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
140 | solely responsible for determining the appropriateness of using or
141 | redistributing the Work and assume any risks associated with Your exercise of
142 | permissions under this License.
143 |
144 | 8. Limitation of Liability.
145 |
146 | In no event and under no legal theory, whether in tort (including negligence),
147 | contract, or otherwise, unless required by applicable law (such as deliberate
148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be
149 | liable to You for damages, including any direct, indirect, special, incidental,
150 | or consequential damages of any character arising as a result of this License or
151 | out of the use or inability to use the Work (including but not limited to
152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or
153 | any and all other commercial damages or losses), even if such Contributor has
154 | been advised of the possibility of such damages.
155 |
156 | 9. Accepting Warranty or Additional Liability.
157 |
158 | While redistributing the Work or Derivative Works thereof, You may choose to
159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or
160 | other liability obligations and/or rights consistent with this License. However,
161 | in accepting such obligations, You may act only on Your own behalf and on Your
162 | sole responsibility, not on behalf of any other Contributor, and only if You
163 | agree to indemnify, defend, and hold each Contributor harmless for any liability
164 | incurred by, or claims asserted against, such Contributor by reason of your
165 | accepting any such warranty or additional liability.
166 |
167 | END OF TERMS AND CONDITIONS
168 |
169 | APPENDIX: How to apply the Apache License to your work
170 |
171 | To apply the Apache License to your work, attach the following boilerplate
172 | notice, with the fields enclosed by brackets "{}" replaced with your own
173 | identifying information. (Don't include the brackets!) The text should be
174 | enclosed in the appropriate comment syntax for the file format. We also
175 | recommend that a file or class name and description of purpose be included on
176 | the same "printed page" as the copyright notice for easier identification within
177 | third-party archives.
178 |
179 | Copyright 2015 TonyWang
180 |
181 | Licensed under the Apache License, Version 2.0 (the "License");
182 | you may not use this file except in compliance with the License.
183 | You may obtain a copy of the License at
184 |
185 | http://www.apache.org/licenses/LICENSE-2.0
186 |
187 | Unless required by applicable law or agreed to in writing, software
188 | distributed under the License is distributed on an "AS IS" BASIS,
189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 | See the License for the specific language governing permissions and
191 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # YaYaNLP: Chinese Language Processing
2 | YaYaNLP is a Chinese natural language processing package written in pure Python; its name comes from "牙牙学语" (a toddler learning to talk).
3 | YaYaNLP provides the following features:
4 | - Chinese word segmentation
5 | - Part-of-speech tagging
6 | - Named entity recognition
7 |     * Person name recognition
8 |     * Place name recognition
9 |     * Organization name recognition
10 | - Simplified/Traditional Chinese conversion
11 |
12 | ## Project
13 |
14 | Project home: [https://github.com/Tony-Wang/YaYaNLP](https://github.com/Tony-Wang/YaYaNLP)
15 |
16 | My home page: [www.huangyong.me](http://www.huangyong.me)
17 |
18 | ## Installation
19 |
20 | ### Download the source package, unpack it, and run
21 |
22 | ``` bash
23 | python setup.py install
24 | ```
25 |
26 | ### Download the dictionary and model files
27 |
28 | YaYaNLP uses dictionary data compatible with HanLP; the compiled dictionaries are stored with the .ya extension.
29 | They can be downloaded directly from the HanLP project: [data-for-1.2.4.zip](http://pan.baidu.com/s/1gd1vo8j)
30 |
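YaYaNLP compiles the text dictionaries into these .ya caches by itself on first use. As a rough sketch of what happens under the hood (class and constant names are taken from yaya/collection/dict.py and yaya/config.py; the exact dictionary paths depend on your data layout):

``` python
from yaya import config
from yaya.collection.dict import DoubleArrayTrie

# The first call builds the core text dictionary into a double-array trie
# and pickles it next to the source file with the .ya extension; later
# calls load that cache directly.
trie = DoubleArrayTrie.load(config.CORE_DICT_NAME)
print trie.exact_match_search(u"法兰西")  # index >= 0 when the word is present
```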
31 | ### Configure the data path
32 |
33 | Set your own data path in **yaya/config.py**:
34 | ``` python
35 | DATA_ROOT = "/your/data/path"
36 | ```
37 |
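The feature examples below assume the small driver from demo/demo_segment.py, i.e. a `segment` import and a `print_terms` helper:

``` python
# coding=utf-8
from yaya.seg import segment

def print_terms(terms):
    for i, v in enumerate(terms):
        print "%s/%s" % (v[0], v[1])
```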
38 | ## Features
39 |
40 | ### Person name recognition
41 |
42 | ```
43 | # Recognize person names
44 | text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
45 | terms = segment.seg(text)
46 | print_terms(terms)
47 | ```
48 |
49 | ```
50 | 签约/vi
51 | 仪式/n
52 | 前/f
53 | ,/w
54 | 秦光荣/nr
55 | 、/w
56 | 李纪恒/nr
57 | 、/w
58 | 仇和/nr
59 | 等/udeng
60 | 一同/d
61 | 会见/v
62 | 了/ule
63 | 参加/v
64 | 签约/vi
65 | 的/ude1
66 | 企业家/nnt
67 | 。/w
68 | ```
69 |
70 |
71 | ### Ambiguous word resolution
72 |
73 | ```
74 | # Resolve ambiguous words
75 | text = u"龚学平等领导说,邓颖超生前杜绝超生"
76 | terms = segment.seg(text)
77 | print_terms(terms)
78 | ```
79 |
80 | ```
81 | 龚学平/nr
82 | 等/udeng
83 | 领导/n
84 | 说/v
85 | ,/w
86 | 邓颖超/nr
87 | 生前/t
88 | 杜绝/v
89 | 超生/vi
90 | ```
91 |
92 | ### Place name recognition
93 |
94 | ```
95 | # Recognize place names
96 | text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
97 | terms = segment.seg(text)
98 | print_terms(terms)
99 | ```
100 |
101 | ```
102 | 蓝翔/nt
103 | 给/p
104 | 宁夏/ns
105 | 固原市/ns
106 | 彭阳县/ns
107 | 红河镇/ns
108 | 黑牛沟村/ns
109 | 捐赠/v
110 | 了/ule
111 | 挖掘机/n
112 | ```
113 |
114 | ### Organization name recognition
115 |
116 | ```
117 | # Recognize organization names
118 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
119 | terms = segment.seg(text)
120 | print_terms(terms)
121 | ```
122 |
123 | ```
124 | 济南杨铭宇餐饮管理有限公司/nt
125 | 是/vshi
126 | 由/p
127 | 杨先生/nr
128 | 创办/v
129 | 的/ude1
130 | 餐饮企业/nz
131 | ```
132 |
133 | ### Simplified/Traditional conversion
134 |
135 | ```
136 | # Simplified to traditional
137 | text = u"以后等你当上皇后,就能买草莓庆祝了"
138 | print segment.simplified_to_traditional(text)
139 | ```
140 |
141 | ```
142 | 以後等妳當上皇后,就能買士多啤梨慶祝了
143 | ```
144 |
145 | ```
146 | # Traditional to simplified
147 | text = u"用筆記簿型電腦寫程式HelloWorld"
148 | print segment.traditional_to_simplified(text)
149 | ```
150 |
151 | ```
152 | 用笔记本电脑写程序HelloWorld
153 | ```
154 |
155 | ## Acknowledgements
156 | This project follows the approach of [hankcs/HanLP](https://github.com/hankcs/HanLP/) and uses that project's dictionary and model files.
157 |
158 |
159 | ## License
160 | * Apache License Version 2.0
161 | * Any project, product, article, or other work that uses all or part of YaYaNLP's functionality, dictionaries, or models must explicitly credit YaYaNLP and link to this project's home page.
162 |
--------------------------------------------------------------------------------
/demo/demo_segment.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.seg import segment
3 |
4 | __author__ = 'tony'
5 |
6 |
7 | def print_terms(terms):
8 | for i, v in enumerate(terms):
9 | print "%s/%s" % (v[0], v[1])
10 |
11 |
12 | def main():
13 |
14 | # Resolve ambiguous words
15 | text = u"龚学平、张晓辉等领导说,邓颖超生前杜绝超生"
16 | terms = segment.seg(text)
17 | print_terms(terms)
18 |
19 | # Recognize person names
20 | text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
21 | terms = segment.seg(text)
22 | print_terms(terms)
23 |
24 | # Recognize place names
25 | text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
26 | terms = segment.seg(text)
27 | print_terms(terms)
28 |
29 | # Recognize organization names
30 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
31 | terms = segment.seg(text)
32 | print_terms(terms)
33 |
34 | # Simplified-to-traditional conversion
35 | text = u"以后等你当上皇后,就能买草莓庆祝了"
36 | print segment.simplified_to_traditional(text)
37 |
38 | # Traditional-to-simplified conversion
39 | text = u"用筆記簿型電腦寫程式HelloWorld"
40 | print segment.traditional_to_simplified(text)
41 |
42 |
43 | if __name__ == '__main__':
44 | main()
45 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | PACKAGE = "yaya"
4 | NAME = "YaYaNLP"
5 | DESCRIPTION = "YaYaNLP: Chinese Language Processing"
6 | AUTHOR = "tony huang"
7 | AUTHOR_EMAIL = "tony@huangyong.me"
8 | URL = "http://www.huangyong.me"
9 |
10 | VERSION = __import__(PACKAGE).__version__
11 |
12 | setup(
13 | name=NAME,
14 | version=VERSION,
15 | description=DESCRIPTION,
16 | author=AUTHOR,
17 | author_email=AUTHOR_EMAIL,
18 | license="Apache",
19 | url=URL,
20 | packages=find_packages(exclude=["test*", "data*"]),
21 | classifiers=[
22 | 'Development Status :: 3 - Alpha',
23 | 'Intended Audience :: Developers',
24 | 'License :: OSI Approved :: Apache Software License',
25 | 'Programming Language :: Python',
26 | ],
27 |
28 | zip_safe=False,
29 | )
30 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tony-Wang/YaYaNLP/d75780290926877e55759fb64e1440f809d653ed/test/__init__.py
--------------------------------------------------------------------------------
/test/data/test.ngram.txt:
--------------------------------------------------------------------------------
1 | 中华@骨髓 6
2 | 中华@骨髓库 40
3 | 中华@魂 1
4 | 中华@鳖精 2
5 | 中华@鸟类 4
6 | 中华@鸟龙 7
7 | 中华@, 15
8 | 中华人民共和国@不可 1
9 | 中华人民共和国@与 2
10 | 中华人民共和国@中央 1
11 | 中华人民共和国@中央政府 1
12 | 中华人民共和国@主席 2
13 | 中华人民共和国@主席令 2
14 | 中华人民共和国@主管 1
--------------------------------------------------------------------------------
/test/data/test.txt:
--------------------------------------------------------------------------------
1 | 一举 n 1
2 | 一举成名 n 1
3 | 一举成名天下知 n 1
4 | 成名 n 1
5 | 天下 n 1
6 | 法兰西 n 1
7 | 注册机 n 1
--------------------------------------------------------------------------------
/test/test_bigramtable.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 | from yaya.collection.bigram import *
4 |
5 | __author__ = 'tony'
6 |
7 |
8 | class TestBiGramTable(TestCase):
9 | def test_build(self):
10 | filename = "./data/test.ngram.txt"
11 | table = BiGramTable.build(filename)
12 | self.assertEqual(table.get_bifreq(u"中华", u"鸟类"), 4)
13 | self.assertEqual(table.get_bifreq(u"中华", u"鸟龙"), 7)
14 |
15 | def test_get_bifreq(self):
16 | self.assertEqual(CoreBiGramTable().table.get_bifreq(u"中华", u"鸟类"), 4)
17 | self.assertEqual(CoreBiGramTable().table.get_bifreq(u"中华", u"鸟龙"), 7)
18 |
--------------------------------------------------------------------------------
/test/test_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from __future__ import absolute_import, unicode_literals
3 | import os
4 | from unittest import TestCase
5 |
6 | from yaya.collection.dict import *
7 | import yaya.config
8 | from yaya.dictionary.person_dict import PersonDict
9 |
10 | __author__ = 'tony'
11 |
12 |
13 | class TestDoubleArrayTrie(TestCase):
14 | def test_fetch(self):
15 | trie = DoubleArrayTrie()
16 | words = []
17 | words.append(u"一举")
18 | words.append(u"一举一动")
19 | words.append(u"一举成名")
20 | words.append(u"一举成名天下知")
21 | words.append(u"啊")
22 | words.append(u"埃及")
23 | words.append(u"阿拉伯")
24 | words.append(u"阿拉伯人")
25 | words.append(u"阿根廷")
26 | words.append(u"阿胶")
27 | words.sort()
28 | trie.build(key=words, v=['一', '一', '一', '一', '二', '三', '四', '四', '四', '四'])
29 | self.assertGreater(trie.exact_match_search(u"一举一动"), 0)
30 | self.assertGreater(trie.exact_match_search(u"阿拉伯"), 0)
31 | self.assertGreater(trie.exact_match_search(u"阿拉伯人"), 0)
32 |
33 | def test_build(self):
34 | trie = DoubleArrayTrie()
35 | words = []
36 | words.append(u"一举 n 1")
37 | words.append(u"一举一动 n 1")
38 | words.append(u"一举成名 n 1")
39 | words.append(u"一举成名天下知 n 1")
40 | words.append(u"啊 n 1")
41 | words.append(u"埃及 n 1")
42 | words.append(u"阿拉伯 n 1")
43 | words.append(u"阿拉伯人 n 1")
44 | words.append(u"阿根廷 n 1")
45 | words.append(u"阿胶 n 1")
46 | words.sort()
47 | trie = DoubleArrayTrie.load_from_list(words)
48 | self.assertEqual(trie.get(u"一举")[1].nature, NATURE.n)
49 | self.assertEqual(trie.get(u"一举一动")[1].nature, NATURE.n)
50 | self.assertEqual(trie.get(u"一举成名")[1].nature, NATURE.n)
51 | self.assertEqual(trie.get(u"一举成名天下知")[1].nature, NATURE.n)
52 | self.assertEqual(trie.get(u"啊")[1].nature, NATURE.n)
53 | self.assertEqual(trie.get(u"埃及")[1].nature, NATURE.n)
54 | self.assertEqual(trie.get(u"阿拉伯")[1].nature, NATURE.n)
55 |
56 | def test_load_dict(self):
57 | new_trie = DoubleArrayTrie.load_dict_file(os.path.join("data", "test.txt"))
58 | self.assertGreater(new_trie.exact_match_search(u"注册机"), 0)
59 |
60 | def test_load_big(self):
61 | trie = DoubleArrayTrie.load(yaya.config.CORE_DICT_NAME)
62 | self.assertGreater(trie.exact_match_search(u"法兰西斯"), 0)
63 | self.assertIsNotNone(trie.get(u"法兰西")[1].nature, u"核心字典里的value字段不应该None")
64 |
65 |
66 | def test_search(self):
67 | trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
68 | self.assertGreaterEqual(trie.exact_match_search(u"一举"), 0, u"词典中含有")
69 | self.assertGreaterEqual(trie.exact_match_search(u"一举成名"), 0, u"词典中含有")
70 | self.assertGreaterEqual(trie.exact_match_search(u"一举成名天下知"), 0, u"词典中含有")
71 | search = trie.search(u"一举成名天下知", 0)
72 | while search.next():
73 | print(search.value)
74 |
75 | def test_searcher_generator(self):
76 | trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
77 | self.assertGreaterEqual(trie.exact_match_search(u"一举"), 0, u"词典中含有")
78 | self.assertGreaterEqual(trie.exact_match_search(u"一举成名"), 0, u"词典中含有")
79 | self.assertGreaterEqual(trie.exact_match_search(u"一举成名天下知"), 0, u"词典中含有")
80 | search = trie.search(u"一举成名天下知", 0)
81 | terms = []
82 | for i, k, v in search.search_all_words():
83 | terms.append((i, k, v))
84 | self.assertEqual(v.nature, NATURE.n)
85 | self.assertEqual(len(v), 1)
86 | self.assertEqual(v.to_tuple()[1], 1)
87 | self.assertEqual(len(terms), 5, u"搜索生成器,查找出所有词典里有的词")
88 |
89 |
90 |
91 | def test_custom_dict(self):
92 | self.assertGreaterEqual(CustomDict().trie.exact_match_search(u"黄勇"), 0)
93 |
94 | def test_dat_transition(self):
95 | trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
96 | self.assertNotEqual(trie.transition(u"法兰西", 1), -1)
97 | self.assertEqual(trie.transition(u"法兰东", 1), -1)
98 | p = trie.transition(u"法兰", 1)
99 | self.assertNotEqual(trie.transition(u"西", p), -1)
100 | self.assertEqual(trie.transition(u"东", p), -1)
101 |
102 | def test_dat_output(self):
103 | dat = DoubleArrayTrie()
104 | dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"])
105 | state = dat.transition(u'江河湖海', 1)
106 | self.assertGreater(state, -1)
107 | self.assertIsNotNone(dat.output(state))
108 | self.assertEqual(dat.output(state), dat.get(u"江河湖海")[1])
109 |
110 | # state = CoreDict().trie.transition(u"大海", 1)
111 | # self.assertGreater(state, -1)
112 | # self.assertEqual(CoreDict().trie.output(state), CoreDict().trie.get(u'大海')[1])
113 |
114 |
115 |
116 | class TestAttribute(TestCase):
117 | def test_total_freq(self):
118 | text = "测试 n 10 nz 3 p 4"
119 | attr = Attribute(attr=text.split()[1:])
120 | self.assertEqual(attr.total_frequency, 17)
121 | # self.assertEqual(attr.get_nature_frequency('n'), 10)
122 | self.assertEqual(attr.get_nature_frequency(NATURE.n), 10)
123 | self.assertEqual(attr.get_nature_frequency(NATURE.nz), 3)
124 | self.assertEqual(attr.get_nature_frequency(NATURE.p), 4)
125 |
126 |
127 | class TestAllDict(TestCase):
128 | def test_PersonDict(self):
129 | self.assertNotEqual(PersonDict().trie.exact_match_search(u"籍"), -1)
130 |
--------------------------------------------------------------------------------
/test/test_enum.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from yaya.common.enum import Enum, EnumValue
4 | from yaya.common.nr import NR
5 | from yaya.common.nature import NATURE
6 |
7 | __author__ = 'tony'
8 |
9 |
10 | class TestEnum(TestCase):
11 | def test_nr(self):
12 | self.assertEqual(NR.A.index, 14)
13 |
14 | def test_nature(self):
15 | self.assertEqual(NATURE.n.index, 13)
16 |
17 | def test_nature_key_to_index(self):
18 | self.assertEqual(type(NATURE.n), EnumValue)
19 |
20 | def test_nature_key_to_str(self):
21 | self.assertEqual(str(NATURE.n), 'n')
22 |
23 | def test_enum(self, ):
24 | E1 = Enum('a', 'b', enum_name='E1')
25 | self.assertTrue(str(E1.b) == 'b')
26 | self.assertEqual(E1['b'].index, 1)
27 |
28 | def test_demo(self):
29 | # char => int
30 | E1 = Enum('a', 'b', enum_name='E1')
31 | self.assertTrue(str(E1.b) == 'b')
32 | self.assertEqual(E1['b'].index, 1)
33 | self.assertTrue(str(E1[1]) == 'b')
--------------------------------------------------------------------------------
/test/test_hmm.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 | from yaya.collection.hmm import PersonTranMatrix
4 |
5 | __author__ = 'tony'
6 |
7 |
8 | class TestHMMMatrix(TestCase):
9 | def test_load(self):
10 | self.assertIsNotNone(PersonTranMatrix().hmm, u"加载人名识别HMM转换矩阵")
11 | self.assertNotEqual(PersonTranMatrix().hmm.matrix.__len__(), 0)
12 | self.assertEqual(PersonTranMatrix().hmm.total_freq, 43938702)
13 |
--------------------------------------------------------------------------------
/test/test_organization_recognition.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | __author__ = 'tony'
3 | from unittest import TestCase
4 |
5 | from yaya.config import Config
6 | from yaya.recognition import person_recognition
7 | from yaya.recognition import place_recognition
8 | from yaya.recognition import organization_recognition
9 | from yaya.seg.viterbi import viterbi
10 | from yaya.seg.wordnet import WordNet, Vertex, gen_word_net, dump_vertexs
11 | from yaya.seg.segment import traditional_to_simplified
12 |
13 | class TestOrgRecognition(TestCase):
14 | def gen_word(self, text):
15 | self.text = text
16 | self.word_net = WordNet(self.text)
17 | # coarse segmentation lattice
18 | gen_word_net(self.text, self.word_net)
19 | # Viterbi decoding
20 | self.vertexs = viterbi(self.word_net.vertexs)
21 | self.word_net_optimum = WordNet(self.text, vertexs=self.vertexs)
22 |
23 | def test_recognition_1_level(self):
24 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
25 | self.gen_word(text)
26 | # vertexs = persion_recognition.recognition(vertexs, word_net_optimum, word_net)
27 | # word_net_optimum = WordNet(text, vertexs=vertexs)
28 | organization_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
29 | vertexs = viterbi(self.word_net_optimum.vertexs)
30 | self.assertIn(Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1"), vertexs)
31 |
32 | def test_recognition_2_level(self):
33 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
34 | self.gen_word(text)
35 | person_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
36 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
37 | word_net_optimum = WordNet(self.text, vertexs=self.vertexs)
38 | vertexs = organization_recognition.recognition(self.vertexs, word_net_optimum, self.word_net)
39 | # viterbi(word_net_optimum.vertexs)
40 | dump_vertexs(vertexs)
41 | self.assertIn(Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1"), vertexs)
42 |
43 | def test_organization_recognition(self):
44 | text = traditional_to_simplified(u"馬總統上午前往陸軍航空601旅,")
45 | Config.debug = True
46 | self.gen_word(text)
47 | person_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
48 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
49 | word_net_optimum = WordNet(self.text, vertexs=self.vertexs)
50 | vertexs = organization_recognition.recognition(self.vertexs, word_net_optimum, self.word_net)
51 | dump_vertexs(vertexs)
52 | self.assertIn(Vertex(u"陆军航空601旅", attribute=u"nt 1"), vertexs)
53 |
54 |
--------------------------------------------------------------------------------
/test/test_person_recognition.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from yaya.seg import segment
5 | from yaya.seg.viterbi import viterbi
6 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex
7 | from yaya.recognition import person_recognition
8 |
9 | __author__ = 'tony'
10 |
11 |
12 | class TestPersonRecognition(TestCase):
13 | def test_recognition(self):
14 | text = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂、张晓辉等一同会见了参加签约的企业家。"
15 | word_net = WordNet(text)
16 |
17 | # coarse segmentation lattice
18 | gen_word_net(text, word_net)
19 |
20 | # Viterbi decoding
21 | vertexs = viterbi(word_net.vertexs)
22 | word_net_optimum = WordNet(text, vertexs=vertexs)
23 | person_recognition.recognition(vertexs, word_net_optimum, word_net)
24 | vertexs = viterbi(word_net_optimum.vertexs)
25 | self.assertIn(Vertex(u"秦光荣", attribute=u"nr 1"), vertexs)
26 | self.assertIn(Vertex(u"李纪恒", attribute=u"nr 1"), vertexs)
27 | self.assertIn(Vertex(u"仇和", attribute=u"nr 1"), vertexs)
28 | self.assertIn(Vertex(u"王春桂", attribute=u"nr 1"), vertexs)
29 | self.assertIn(Vertex(u"张晓辉", attribute=u"nr 1"), vertexs)
30 | print(vertexs)
31 |
32 | def test_person_name_V_should_split_to_EL_DL(self):
33 | text = u"龚学平、张晓辉等领导说,邓颖超生前杜绝超生"
34 | vertexs = segment.seg_to_vertexs(text)
35 | terms = segment.vertexs_to_terms(vertexs, True)
36 | self.assertIn(u"龚学平", terms)
37 | self.assertIn(u"张晓辉", terms)
38 | self.assertIn(u"邓颖超", terms)
39 |
40 |
--------------------------------------------------------------------------------
/test/test_place_recognition.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | __author__ = 'tony'
3 |
4 | from unittest import TestCase
5 |
6 | from yaya.recognition import place_recognition
7 | from yaya.collection.dict import CustomDict, Attribute
8 | from yaya.seg import segment
9 | from yaya.seg.viterbi import viterbi
10 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex, combine_by_custom_dict
11 |
12 |
13 | class TestPlaceRecognition(TestCase):
14 | def setUp(self):
15 | self.text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
16 | self.word_net = WordNet(self.text)
17 | # coarse segmentation lattice
18 | gen_word_net(self.text, self.word_net)
19 | # Viterbi decoding
20 | self.vertexs = viterbi(self.word_net.vertexs)
21 | self.vertexs = combine_by_custom_dict(self.vertexs, CustomDict().trie)
22 | self.word_net_optimum = WordNet(self.text, vertexs=self.vertexs)
23 |
24 | def test_recognition(self):
25 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net)
26 | vertexs = viterbi(self.word_net_optimum.vertexs)
27 | self.assertIn(Vertex(u"宁夏"), vertexs)
28 | self.assertIn(Vertex(u"固原市"), vertexs)
29 | self.assertIn(Vertex(u"彭阳县", attribute=u"ns 1"), vertexs)
30 | self.assertIn(Vertex(u"红河镇", attribute=u"ns 1"), vertexs)
31 | self.assertIn(Vertex(u"黑牛沟村", attribute=u"ns 1"), vertexs)
32 |
--------------------------------------------------------------------------------
/test/test_role_tag.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from yaya.collection.dict import Attribute, DoubleArrayTrie
5 | from yaya.collection.hmm import PersonTranMatrix
6 | from yaya.common.nr import NR, NRPattern
7 | from yaya.const import *
8 | from yaya.recognition.person_recognition import role_tag
9 | from yaya.seg.viterbi import viterbi_roletag
10 | from yaya.seg.wordnet import new_tag_vertex, Vertex
11 |
12 | __author__ = 'tony'
13 |
14 |
15 | class TestRole_tag(TestCase):
16 | def test_role_tag(self):
17 | word_seg_list = [
18 | new_tag_vertex(TAG_BIGIN),
19 | Vertex(u"秦", attribute=Attribute(u'n 1')),
20 | Vertex(u"光荣", attribute=Attribute(u'n 1')),
21 | Vertex(u"同志", attribute=Attribute(u'n 1')),
22 | new_tag_vertex(TAG_END),
23 | ]
24 | taglist = role_tag(word_seg_list)
25 |
26 | self.assertTrue(isinstance(taglist, list))
27 | self.assertEqual(taglist[2].to_tuple(), (NR.Z, 29, NR.L, 2))
28 |
29 | tag_index_list = viterbi_roletag(taglist, PersonTranMatrix().hmm)
30 | self.assertEqual(tag_index_list[0], NR.A, u"人名识别,第一个标识应该为TAG_BIGIN")
31 | self.assertEqual(tag_index_list[1], NR.B)
32 | self.assertEqual(tag_index_list[2], NR.Z)
33 | self.assertEqual(tag_index_list[3], NR.L)
34 | self.assertEqual(tag_index_list[4], NR.A)
35 |
36 | def test_NRPattern(self):
37 | """NRPattern lists the role-sequence patterns that may form a person name:
38 | patterns such as BCD, BBCD, BG and DG must be found in the pattern trie,
39 | while CD alone must not match.
40 | """
41 | trie = DoubleArrayTrie()
42 | NRPattern.sort()
43 | trie.build(key=NRPattern)
44 | self.assertTrue(trie.exact_match_search("BCD") != -1)
45 | self.assertTrue(trie.exact_match_search("BBCD") != -1)
46 | self.assertTrue(trie.exact_match_search("BG") != -1)
47 | self.assertTrue(trie.exact_match_search("DG") != -1)
48 | self.assertTrue(trie.exact_match_search("CD") == -1)
49 |
--------------------------------------------------------------------------------
/test/test_segment.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from yaya.collection.dict import DoubleArrayTrie
5 | from yaya.seg import segment
6 | from yaya.seg.segment import traditional_seg
7 | from yaya.seg.wordnet import atom_seg, WordNet, gen_word_net, combine_by_custom_dict
8 | from yaya.utility.chartype import *
9 |
10 | __author__ = 'tony'
11 |
12 |
13 | class TestAtomSegment(TestCase):
14 | def test_char_type(self):
15 | self.assertEqual(get('a'), CT_SINGLE)
16 | self.assertEqual(get('1'), CT_NUM)
17 | self.assertEqual(get(u'中'), CT_CHINESE)
18 |
19 | def test_atom_seg(self):
20 | text = '12341'
21 | node_list = atom_seg(text, 0, text.__len__())
22 | self.assertEqual(node_list.__len__(), 1)
23 | self.assertEqual(node_list[0].pos, CT_NUM)
24 | text = '123.41'
25 | node_list = atom_seg(text, 0, text.__len__())
26 | self.assertEqual(node_list.__len__(), 1)
27 | self.assertEqual(node_list[0].pos, CT_NUM)
28 | text = 'abc'
29 | node_list = atom_seg(text, 0, text.__len__())
30 | self.assertEqual(node_list.__len__(), 1)
31 | self.assertEqual(node_list[0].pos, CT_SINGLE)
32 |
33 |
34 | class TestSegment(TestCase):
35 | def test_seg_find_nr(self):
36 | text = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂等一同会见了参加签约的企业家。"
37 | terms = segment.seg(text)
38 | self.assertIn((u"秦光荣", 'nr', 6), terms, u"测试是否找出人名")
39 | self.assertIn((u"李纪恒", 'nr', 10), terms, u"测试是否找出人名")
40 | self.assertIn((u"仇和", 'nr', 14), terms, u"测试是否找出人名")
41 |
42 | def test_combin_by_dict(self):
43 | dat = DoubleArrayTrie()
44 | dat.build(sorted([u"江", u"河", u"湖", u"海"]))
45 | text = u"江河湖海"
46 | word_net = WordNet(text)
47 | gen_word_net(text, word_net, dat)
48 | vertexs = [v[0] for v in word_net.vertexs]
49 | self.assertEqual(len(word_net), 6, u"自定义字典分词")
50 |
51 | combin_dat = DoubleArrayTrie()
52 | combin_dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"])
53 | vertexs = combine_by_custom_dict(vertexs, combin_dat)
54 | self.assertEqual(len(vertexs), 3, u"合并完成后应该只有前尾加中间词")
55 |
56 | def test_traditional_seg(self):
57 | text = u"記者羅吉訓/新竹報導 雙方合作的主要內容包括,希望能夠促成太陽能設備安裝維修人才培養;結合推廣教育由綠野集團引薦國外學生來臺就讀;與觀光及餐飲系合作觀光休閒產業,提供來臺遊客入住大華科大樂群會館,並導覽參訪張學良故居等臺灣各知名景點。 訂閱聯絡電話:02-23222722-814 瀏覽器建議使用IE 9.0以上版本 最佳觀看解析度1024x768 網站更新日期:2015/12/13 "
58 | print traditional_seg(text)
59 |
--------------------------------------------------------------------------------
/test/test_traditionalChineseDict.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from yaya import config
5 | from yaya.dictionary.chinese_traditional_dict import TraditionalChineseDict, SimplifiedChineseDict
6 |
7 | __author__ = 'tony'
8 |
9 |
10 | class TestTraditionalChineseDict(TestCase):
11 | def test_convert_simplified_to_traditional(self):
12 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(u"用筆記簿型電腦寫程式HelloWorld")
13 | self.assertEqual(simplified, u"用笔记本电脑写程序HelloWorld")
14 |
15 | def test_convert_traditional_to_simplified(self):
16 | config.Config.debug = True
17 | traditional = SimplifiedChineseDict().convert_simplified_to_traditional(u"用笔记本电脑写程序HelloWorld")
18 | self.assertEqual(traditional, u"用筆記簿型電腦寫程式HelloWorld")
19 |
20 | def test_traditional_chinese_dict_search_all_words(self):
21 | searcher = TraditionalChineseDict().trie.search(u"用筆記簿型電腦寫程式HelloWorld")
22 | for i, k, v in searcher.search_all_words():
23 | print i, k, v
24 |
25 | def test_demo1(self):
26 | text = u"記者羅吉訓/新竹報導 雙方合作的主要內容包括,希望能夠促成太陽能設備安裝維修人才培養;" \
27 | u"結合推廣教育由綠野集團引薦國外學生來臺就讀;與觀光及餐飲系合作觀光休閒產業," \
28 | u"提供來臺遊客入住大華科大樂群會館,並導覽參訪張學良故居等臺灣各知名景點。 " \
29 | u"訂閱聯絡電話:02-23222722-814 瀏覽器建議使用IE 9.0以上版本 最佳觀看解析度1024x768 " \
30 | u"網站更新日期:2015/12/13 "
31 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text)
32 | print(simplified)
33 | text = u"媒體詢問對目前選戰看法?朱立倫說最重要是要把沉默的大眾喚出來," \
34 | u"為了台灣安定、兩岸和平及經濟發展,拜託大家在最後關頭全力團結及共同支持。 " \
35 | u"今晚黨內重量級人士到齊,媒體詢問等於是最高規格的選戰會議," \
36 | u"是否會向總統當面拜託總統夫人周美青出來?朱立倫馬上向身旁的馬總統說," \
37 | u"「對呀,請馬學長拜託周學姐出來輔選」,總統笑著說「我一定轉達」。 " \
38 | u"朱立倫表示,今晚餐敘不是輔選會報,但不管是馬總統、吳副總統、王金平及行政院長毛治國," \
39 | u"大家都是同心協力,求團結勝選 。 他強調,最近到各地陸續見到好多民眾展現熱情," \
40 | u"希望最後一個月不斷加溫,直到明年1月16日勝選。1041217 這裡有個好粉絲團,需要你關注!"
41 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text)
42 | print(simplified)
43 |
--------------------------------------------------------------------------------
/test/test_trie.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
--------------------------------------------------------------------------------
/test/test_viterbi_segment.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from yaya.collection.hmm import OrgTranMatrix
5 | from yaya.common.nt import NT
6 | from yaya.seg.segment import vertexs_to_terms
7 | from yaya.seg.viterbi import *
8 | from yaya.seg.wordnet import *
9 |
10 | __author__ = 'tony'
11 |
12 |
13 | class TestViterbiSegment(TestCase):
14 | def test_viterbi(self):
15 | text = u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
16 | # text = u"商品23和服务"
17 | word_net = WordNet(text)
18 | gen_word_net(text, word_net)
19 | vertex_list = vertexs_to_terms(viterbi(word_net.vertexs), True)
20 | self.assertTrue(u"工信处" in vertex_list)
21 | self.assertTrue(u"女" in vertex_list)
22 | self.assertTrue(u"干事" in vertex_list)
23 | self.assertTrue(u"每月" in vertex_list)
24 | self.assertTrue(u"经过" in vertex_list)
25 | self.assertTrue(u"下属" in vertex_list)
26 | self.assertTrue(u"科室" in vertex_list)
27 | self.assertTrue(u"都" in vertex_list)
28 | self.assertTrue(u"要" in vertex_list)
29 | self.assertTrue(u"亲口" in vertex_list)
30 | self.assertTrue(u"交代" in vertex_list)
31 | self.assertTrue(u"24" in vertex_list)
32 | self.assertTrue(u"口" in vertex_list)
33 | self.assertTrue(u"交换机" in vertex_list)
34 | self.assertTrue(u"等" in vertex_list)
35 | self.assertTrue(u"技术性" in vertex_list)
36 | self.assertTrue(u"器件" in vertex_list)
37 | self.assertTrue(u"的" in vertex_list)
38 | self.assertTrue(u"安装" in vertex_list)
39 | self.assertTrue(u"工作" in vertex_list)
40 |
41 | def test_custom_dict(self):
42 | text = u"黄勇今天来上班了"
43 | word_net = WordNet(text)
44 | gen_word_net(text, word_net)
45 | vertex_list = viterbi(word_net.vertexs)
46 | vertex_list = combine_by_custom_dict(vertex_list)
47 | self.assertEqual(vertex_list[1].real_word, u"黄勇")
48 |
49 |
50 | class TestViterbi(TestCase):
51 | def test_computer(self):
52 | node_list = []
53 | node_list.append(Attribute((NT.S, 19800)))
54 | node_list.append(Attribute((NT.K, 1000, NT.D, 1000)))
55 | node_list.append(Attribute((NT.C, 1000, NT.B, 1000)))
56 | node_list.append(Attribute((NT.M, 1000)))
57 | node_list.append(Attribute((NT.P, 12, NT.D, 1)))
58 | node_list.append(Attribute((NT.B, 19800)))
59 | tag_list = viterbi_standard(node_list, hmm=OrgTranMatrix().hmm)
60 | self.assertEqual(6, len(tag_list))
61 | self.assertEqual(NT.K, tag_list[1])
62 | self.assertEqual(NT.C, tag_list[2])
63 | self.assertEqual(NT.M, tag_list[3])
64 | self.assertEqual(NT.D, tag_list[4])
65 |
--------------------------------------------------------------------------------
/test/test_wordnet.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from __future__ import absolute_import
3 | from unittest import TestCase
4 |
5 | from yaya.const import TAG_BIGIN, TAG_END
6 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex, new_tag_vertex
7 |
8 | __author__ = 'tony'
9 |
10 |
11 | class TestWordNet(TestCase):
12 | def test_gen_word_net(self):
13 | text = u"一举成名天下知"
14 | word_net = WordNet(text)
15 | gen_word_net(text, word_net)
16 | self.assertEqual(word_net.vertexs.__len__(), text.__len__() + 2)
17 | # 一举 一举成名
18 | # 举
19 | # 成 成名
20 | # 名
21 | # 天 天下
22 | # 下
23 | # 知
24 | self.assertEqual(word_net.vertexs[1].__len__(), 2)
25 | self.assertEqual(word_net.vertexs[2].__len__(), 1)
26 | self.assertEqual(word_net.vertexs[3].__len__(), 2)
27 | self.assertEqual(word_net.vertexs[4].__len__(), 1)
28 | self.assertEqual(word_net.vertexs[5].__len__(), 2)
29 | self.assertEqual(word_net.vertexs[6].__len__(), 1)
30 | self.assertEqual(word_net.vertexs[7].__len__(), 1)
31 |
32 | def test_gen_word_net_include_num(self):
33 | text = u"123456"
34 | word_net = WordNet(text)
35 | gen_word_net(text, word_net)
36 | self.assertEqual(word_net.vertexs.__len__(), 6 + 2)
37 | self.assertTrue([] not in word_net.vertexs, u"原始词网,不可能有空节点")
38 |
39 | def test_vector(self):
40 | v1 = Vertex("test", attribute="nr 1")
41 | v2 = Vertex("test", attribute="nr 1")
42 | v3 = Vertex("test", attribute="nr1 1")
43 | self.assertEqual(v1, v2)
44 | self.assertNotEqual(v1, v3)
45 | self.assertIn(v1, [v2])
46 | self.assertNotIn(v1, [v3])
47 |
48 | def test_tag_vector_real_word_len_should_eq_0(self):
49 | # a tag vertex's real_word must not be empty; otherwise it cannot be represented in the dictionary
50 | self.assertEqual(new_tag_vertex(TAG_BIGIN).real_word, chr(32))
51 | self.assertEqual(new_tag_vertex(TAG_END).real_word, chr(32))
52 |
53 | def test_word_net_insert(self):
54 | text = u"1234567890"
55 | word_net_all = WordNet(text)
56 | for i, c in enumerate(text):
57 | word_net_all.add(i + 1, Vertex(c))
58 |
--------------------------------------------------------------------------------
/yaya/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 | __version__ = "0.1.1"
3 |
--------------------------------------------------------------------------------
/yaya/collection/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
--------------------------------------------------------------------------------
/yaya/collection/bigram.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import absolute_import
3 |
4 | import time
5 |
6 | from yaya import config
7 | from yaya.collection.dict import CoreDict
8 | from yaya.const import logger
9 | from yaya.utility.singleton import singleton
10 |
11 | __author__ = 'tony'
12 |
13 |
14 | class BiGramTable:
15 | def __init__(self):
16 | self.start = []
17 | self.pair = []
18 |
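# Storage layout (a CSR-style table, mirroring the HanLP structure this
# module is modeled on): start[i] .. start[i+1] delimit the slice of `pair`
# belonging to the pre-word with dictionary id i, and `pair` interleaves
# (next_word_id, frequency) entries, so logical entry k sits at pair[2*k]
# and pair[2*k + 1] -- hence the shifts in get_bifreq below.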
19 | def get_bifreq(self, pre_word, next_word):
20 | pre_word_id = pre_word if type(pre_word) is int else CoreDict().trie.exact_match_search(pre_word)
21 | if pre_word_id == -1:
22 | return 0
23 | next_word_id = next_word if type(next_word) is int else CoreDict().trie.exact_match_search(next_word)
24 | if next_word_id == -1:
25 | return 0
26 | index = binary_search(self.pair, self.start[pre_word_id],
27 | self.start[pre_word_id + 1] - self.start[pre_word_id],
28 | next_word_id)
29 | if index < 0:
30 | return 0
31 | index <<= 1
32 | return self.pair[index + 1]
33 |
34 | @staticmethod
35 | def load(filename=config.CORE_BIGRAM_NAME):
36 | start = time.time()
37 | logger.info(u"开始加载核心二元语法词表")
38 | import os
39 | if os.path.exists(filename + config.DICT_BIN_EXT):
40 | table = BiGramTable.load_bin(filename + config.DICT_BIN_EXT)
41 | else:
42 | table = BiGramTable.build(filename)
43 | import cPickle as Pickle
44 | with open(filename + config.DICT_BIN_EXT, 'wb') as f:
45 | Pickle.dump(table, f)
46 | logger.info(u"加载核心二元语法词表完毕,耗时%s", time.time() - start)
47 | return table
48 |
49 | @staticmethod
50 | def load_bin(filename):
51 | import cPickle as Pickle
52 | with open(filename, 'rb') as f:
53 | bigram = Pickle.load(f)
54 | f.close()
55 | return bigram
56 |
57 | @staticmethod
58 | def build(filename):
59 | import codecs
60 | f = codecs.open(filename, 'r', 'utf-8')
61 | pre_word_map = {}
62 | max_word_id = CoreDict().trie.word_size()
63 | total = 0
64 | while True:
65 | line = f.readline()
66 | if not line:
67 | break
68 | params = line.split()
69 | if params.__len__() != 2:
70 | continue
71 | two_word = params[0].split('@', 1)  # maxsplit=1: at most two fields
72 | if two_word.__len__() != 2:
73 | continue
74 |
75 | pre_word_id = CoreDict().trie.exact_match_search(two_word[0])
76 | if pre_word_id == -1:
77 | continue
78 | next_word_id = CoreDict().trie.exact_match_search(two_word[1])
79 | if next_word_id == -1:
80 | continue
81 | if pre_word_id not in pre_word_map:
82 | pre_word_map[pre_word_id] = {}
83 | next_word_map = pre_word_map.get(pre_word_id)
84 | next_word_map[next_word_id] = int(params[1])
85 | total += 2
86 | f.close()
87 |
88 | table = BiGramTable()
89 | table.start = [0] * (max_word_id + 1)
90 | table.pair = [0] * total
91 | offset = 0
92 | for i in range(max_word_id):
93 | next_word_map = pre_word_map.get(i, None)
94 | if next_word_map is not None:
95 | key_list = next_word_map.keys()
96 | key_list.sort()
97 | for k in key_list:
98 | index = offset << 1
99 | table.pair[index] = k
100 | table.pair[index + 1] = next_word_map[k]
101 | offset += 1
102 | table.start[i + 1] = offset
103 | return table
104 |
105 |
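# Binary search over the even-numbered (key) slots of the interleaved
# `pair` array: `from_index` and `length` are logical pair indices, which
# is why the probe reads a[mid << 1]. Returns the logical index on a hit
# and -(insertion point) - 1 on a miss, matching the semantics of
# java.util.Arrays.binarySearch.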
106 | def binary_search(a, from_index, length, key):
107 | low = from_index
108 | high = from_index + length - 1
109 | while low <= high:
110 | mid = (low + high) >> 1
111 | mid_val = a[mid << 1]
112 | if mid_val < key:
113 | low = mid + 1
114 | elif mid_val > key:
115 | high = mid - 1
116 | else:
117 | return mid
118 | return -(low + 1)
119 |
120 |
121 | @singleton
122 | class CoreBiGramTable:
123 | def __init__(self):
124 | self.table = BiGramTable.load()
125 |
126 |
127 | CORE_BIG_RAM_TABLE = CoreBiGramTable()
128 |
--------------------------------------------------------------------------------
/yaya/collection/dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | try:
3 | import cPickle as Pickle
4 | except ImportError:
5 | import pickle as Pickle
6 | try:
7 | range = xrange  # prefer the lazy xrange on Python 2
8 | except NameError:
9 | pass
10 |
11 | from collections import OrderedDict
12 |
13 | from yaya.const import *
14 | from yaya import config
15 | from yaya.utility.singleton import singleton
16 | from yaya.common.nature import NATURE
17 |
18 | ATTRIBUTE_MAIN_NATURE_INDEX = 0
19 |
20 |
21 | class Node(object):
22 | def __init__(self, code=0, depth=0, left=0, right=0):
23 | self.code = code
24 | self.depth = depth
25 | self.left = left
26 | self.right = right
27 |
28 |
29 | class Attribute(object):
30 | def __init__(self, attr, cls=NATURE):
31 | self.cls = cls
32 | self.total = 0
33 | if not isinstance(attr, tuple):
34 | self.data = ()
35 | if attr is not None:
36 | attr = attr if isinstance(attr, list) else attr.split(' ')
37 | nature = []
38 | for i in range(0, attr.__len__(), 2):
39 | nature.append(cls[attr[i]])
40 | nature.append(int(attr[i + 1]))
41 | self.total += int(attr[i + 1])
42 | self.data = tuple(nature)
43 | else:
44 | self.data = attr
45 | for i in range(len(self.data)):
46 | if i % 2 == 1:
47 | self.total += self.data[i]
48 |
49 | def to_tuple(self):
50 | return self.data
51 |
52 | def __str__(self):
53 | return ' '.join([str(x) for x in self.data])
54 |
55 | def __repr__(self):
56 | return u"Attribute(%s)" % self.__str__()
57 |
58 | def __len__(self):
59 | return len(self.data) // 2
60 |
61 | def __eq__(self, other):
62 | return str(self) == str(other)
63 |
64 | def get_nature_frequency(self, nature):
65 | try:
66 | return self.data[self.data.index(nature) + 1]
67 | except:
68 | return 0
69 |
70 | @property
71 | def natures(self):
72 | for i in range(0, len(self.data), 2):
73 | yield i // 2, self.data[i], self.data[i + 1]
74 | # return self.data
75 |
76 | @property
77 | def nature(self):
78 | if self.data.__len__() != 0:
79 | return self.data[ATTRIBUTE_MAIN_NATURE_INDEX]
80 | else:
81 | return None
82 |
83 | @property
84 | def total_frequency(self):
85 | return self.total
86 |
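# Example of Attribute (cf. test/test_dict.py, TestAttribute):
# Attribute(u"测试 n 10 nz 3 p 4".split()[1:]) stores
# data == (NATURE.n, 10, NATURE.nz, 3, NATURE.p, 4), so total_frequency
# is 17 and .nature -- the first tag listed -- is NATURE.n.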
87 |
88 | class FastArray:
89 | def __init__(self, default_value=0):
90 | self.default_value = default_value
91 | self.data = {}
92 | self._max_key = 0
94 |
95 | def __getitem__(self, item):
96 | return self.data.get(item, self.default_value)
97 |
98 | def __setitem__(self, key, value):
99 | self.data[key] = value
100 | self._max_key = max(self._max_key, key)
101 |
102 | @property
103 | def max_key(self):
104 | return self._max_key
105 |
106 | def extend(self, size):
107 | pass
108 |
109 |
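# A double-array trie in Aoe's base/check scheme: keys must arrive in
# code-point order (fetch() rejects out-of-order input), build() packs each
# sibling group recursively via insert(), and `v` keeps the value attached
# to every key in dictionary order.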
110 | class DoubleArrayTrie:
111 | def __init__(self, enum_cls=NATURE):
112 | self.alloc_size = 0
113 | self.check = []
114 | self.base = []
115 | self.enum_cls = enum_cls
116 | self.used = []
117 | self.size = 0
118 | self.key = []
119 | self.key_size = 0
120 | self.length = None
121 | self.value = []
122 | self.v = None
123 | self.progress = 0
124 | self.next_check_pos = 0
125 | self.error_ = 0
126 |
127 | def word_size(self):
128 | if self.v is None:
129 | return 0
130 | else:
131 | return self.v.__len__()
132 |
133 | def resize(self, newsize):
134 | offsize = newsize - self.alloc_size
135 | self.base.extend([0] * offsize)
136 | self.check.extend([0] * offsize)
137 | self.used.extend([0] * offsize)
138 | self.alloc_size = newsize
139 |
140 | def fetch(self, parent, siblings):
141 | if self.error_ < 0:
142 | return 0
143 | prev = 0
144 | for i in range(parent.left, parent.right):
145 | if parent.depth > (self.length[i] if self.length is not None else self.key[i].__len__()):
146 | continue
147 | tmp = self.key[i]
148 | cur = 0
149 | if (self.length[i] if self.length is not None else tmp.__len__()) != parent.depth:
150 | cur = ord(tmp[parent.depth]) + 1
151 |
152 | # check that the keys arrive in lexicographic order
153 | if prev > cur:
154 | return 0
155 |
156 | if cur != prev or siblings.__len__() == 0:
157 | tmp_node = Node(depth=parent.depth + 1, code=cur, left=i, right=0)
158 | if siblings.__len__() != 0:
159 | siblings[-1].right = i
160 | siblings.append(tmp_node)
161 | prev = cur
162 |
163 | if siblings.__len__() != 0:
164 | siblings[-1].right = parent.right
165 |
166 | return siblings.__len__()
167 |
168 | def insert(self, siblings):
169 | if self.error_ < 0:
170 | return 0
171 |
172 | begin = 0
173 | pos = (siblings[0].code + 1 if (siblings[0].code + 1 > self.next_check_pos) else self.next_check_pos) - 1
174 | nonzero_num = 0
175 | first = 0
176 |
177 | if self.alloc_size <= pos:
178 | self.resize(pos + 1)
179 |
180 | while 1:
181 | pos += 1
182 |
183 | if self.alloc_size <= pos:
184 | self.resize(pos + 1)
185 |
186 | if self.check[pos] != 0:
187 | nonzero_num += 1
188 | continue
189 | elif first == 0:
190 | self.next_check_pos = pos
191 | first = 1
192 |
193 | begin = pos - siblings[0].code
194 |
195 | if self.alloc_size <= (begin + siblings[-1].code):
196 | if 1.05 > 1.0 * self.key_size / (self.progress + 1):
197 | l = 1.05
198 | else:
199 | l = 1.0 * self.key_size / (self.progress + 1)
200 | self.resize(int(self.alloc_size * l))
201 |
202 | if self.used[begin]:
203 | continue
204 |
205 | find = True
206 | for i in range(siblings.__len__()):
207 | if self.check[begin + siblings[i].code] != 0:
208 | find = False
209 | break
210 | if not find:
211 | continue
212 | break
213 |
214 | if 1.0 * nonzero_num / (pos - self.next_check_pos + 1) >= 0.95:
215 | self.next_check_pos = pos
216 |
217 | self.used[begin] = True
218 | self.size = self.size if (self.size > begin + siblings[-1].code + 1) else \
219 | begin + siblings[-1].code + 1
220 |
221 | for i in range(siblings.__len__()):
222 | self.check[begin + siblings[i].code] = begin
223 |
224 | for i in range(siblings.__len__()):
225 | new_siblings = []
226 |
227 | if self.fetch(siblings[i], new_siblings) == 0:
228 | self.base[begin + siblings[i].code] = -self.value[siblings[i].left] - 1 if (
229 | self.value is not None) else (-siblings[i].left - 1)
230 |
231 | if self.value is not None and -self.value[siblings[i].left] - 1 >= 0:
232 | self.error_ = -2
233 | return 0
234 |
235 | self.progress += 1
236 | else:
237 | h = self.insert(new_siblings)
238 | self.base[begin + siblings[i].code] = h
239 |
240 | return begin
241 |
242 | def build(self, key=None, length=None, key_size=None, v=None):
243 | if key is None:
244 | return 0
245 | if key_size is not None and key_size > key.__len__():
246 | return 0
247 | self.key = key
248 | self.length = length
249 | self.key_size = key_size if key_size is not None else key.__len__()
250 | self.value = None
251 | self.v = v if v is not None else key
252 | self.progress = 0
253 |
254 | self.resize(65536 * 32)
255 |
256 | self.base[0] = 1
257 | self.next_check_pos = 0
258 |
259 | root_node = Node(left=0, right=self.key_size, depth=0, code=0)
260 |
261 | siblings = []
262 | self.fetch(root_node, siblings)
263 | self.insert(siblings)
264 |
265 | self.key = None
266 |
267 | return self.error_
268 |
269 | def exact_match_search(self, key, pos=0, keylen=0, nodepos=0):
270 | if key is None:
271 | return -1
272 | if keylen <= 0:
273 | keylen = key.__len__()
274 | if nodepos <= 0:
275 | nodepos = 0
276 |
277 | result = -1
278 | b = self.base[nodepos]
279 |
280 | for i in range(pos, keylen):
281 | p = b + ord(key[i]) + 1
282 | if b == self.check[p]:
283 | b = self.base[p]
284 | else:
285 | return result
286 |
287 | p = b
288 | n = self.base[p]
289 | if b == self.check[p] and n < 0:
290 | result = -n - 1
291 | return result
292 |
293 | def get(self, word):
294 | index = self.exact_match_search(word)
295 | if index >= 0:
296 | return index, self.get_attr(self.v[index])
297 | else:
298 | return index, None
299 |
300 | def get_attr(self, value):
301 | if isinstance(value, unicode) or isinstance(value, str):
302 | return Attribute(value.split(chr(32))[1:], cls=self.enum_cls)
303 | elif isinstance(value, list):
304 | return Attribute(value[1:], cls=self.enum_cls)
305 | raise Exception("异常的字典值类型:%s" % type(value))
306 |
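# State transitions follow the double-array invariant used throughout this
# class: from a state whose base value is b, the move on character c goes
# to p = b + ord(c) + 1 and is valid only when check[p] == b; a word ends
# at b when check[b] == b and base[b] < 0, which encodes index -base[b] - 1
# into `v`.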
307 | def transition(self, path, state_from):
308 | b = state_from
309 | for i in range(len(path)):
310 | p = b + ord(path[i]) + 1
311 | if b == self.check[p]:
312 | b = self.base[p]
313 | else:
314 | return -1
315 | p = b
316 | return p
317 |
318 | def output(self, state):
319 | if state < 0:
320 | return None
321 | n = self.base[state]
322 | if state == self.check[state] and n < 0:
323 | return self.get_attr(self.v[-n - 1])
324 | return None
325 |
326 | def dump(self):
327 | for i in range(self.size):
328 | print("i: %s [%s,%s]" % (i, self.base[i], self.check[i]))
329 |
330 | def compress(self):
331 | last = self.alloc_size - 1
332 | while self.used[last] == 0:
333 | last -= 1
334 | self.base = self.base[:last + 1]
335 | self.check = self.check[:last + 1]
336 | self.alloc_size = len(self.base)
337 |
338 | @staticmethod
339 | def save_to_ya(trie, filename):
340 | # trie.compress()
342 | with open(filename, 'wb') as f:
343 | Pickle.dump(trie, f, protocol=Pickle.HIGHEST_PROTOCOL)
345 |
346 | @staticmethod
347 | def save_to_yaf(trie, filename):
348 | pass
349 |
350 | @staticmethod
351 | def load_bin(filename):
352 | with open(filename, 'rb') as f:
353 | trie = Pickle.load(f)
354 | return trie
355 |
356 | @staticmethod
357 | def load_dict_file(filenames, key_func=None, value_func=None, enum_cls=NATURE):
358 | import codecs
359 | k, v, dict_list = [], [], []
360 | if not isinstance(filenames, list):
361 | filenames = [filenames]
362 |
363 | for filename in filenames:
364 | with codecs.open(filename, 'rb', 'utf-8') as f:
365 | dict_list += f.read().splitlines()
366 |
367 | return DoubleArrayTrie.load_from_list(dict_list, key_func, value_func, enum_cls)
368 |
369 | @staticmethod
370 | def load_from_list(dict_list, key_func=None, value_func=None, enum_cls=NATURE):
371 | key_func = key_func or (lambda i: i.split()[0])
372 | value_func = value_func or (lambda i: i)
373 | # sort
374 | dict_map = {}
375 |         for i in dict_list:
376 |             try:
377 |                 i = i.replace('\t', chr(32))
378 |                 dict_map[key_func(i)] = value_func(i)  # must stay unpacked as a list here; viterbi uses it directly
379 |             except Exception:
380 |                 logger.error(u"Malformed dictionary entry: [ %s ]" % i)
381 |                 continue
382 | dict_map = OrderedDict(sorted(dict_map.items()))
383 | trie = DoubleArrayTrie(enum_cls=enum_cls)
384 | trie.build(key=dict_map.keys(), v=dict_map.values())
385 | return trie
386 |
387 | def search(self, key, offset=0):
388 | return Searcher(self, key, offset)
389 |
390 | @staticmethod
391 | def load(filenames, key_func=None, value_func=None,
392 | dict_bin_ext=config.DICT_BIN_EXT, enum_cls=NATURE):
393 | import os
394 |         # handle the case where the user-defined dictionary is passed in as a list of files
395 |         filename = filenames[0] if isinstance(filenames, list) else filenames
396 | if config.Config.use_dict_cache and os.path.exists(filename + dict_bin_ext):
397 | return DoubleArrayTrie.load_bin(filename + dict_bin_ext)
398 | trie = DoubleArrayTrie.load_dict_file(filenames, key_func, value_func, enum_cls)
399 | DoubleArrayTrie.save_to_ya(trie, filename + dict_bin_ext)
400 | return trie
401 |
402 | @staticmethod
403 | def buildcoredictsearcher(key, offset=0):
404 |         return DoubleArrayTrie.load(config.CORE_DICT_NAME).search(key, offset)
405 |
406 |
407 | class Searcher:
408 |     def __init__(self, trie, chararray, offset=0):
409 |         # start offset of the current key
410 |         self.begin = 0
411 |         # length of the current key
412 |         self.length = 0
413 |         # lexicographic index of the current key
414 |         self.index = 0
415 |         self.key = None
416 | 
417 |         # value associated with the current key
418 |         self.value = None
419 | 
420 |         # the character array passed in, pre-converted to code points
421 |         self.code_array = [ord(c) for c in chararray]
422 | 
423 |         self.char_array = chararray
424 | 
425 |         # position of the previous node
426 |         self.trie = trie
427 |         self.last = trie.base[0]
428 | 
429 |         # length of char_array, cached in a variable for efficiency
430 |         self.array_length = len(chararray)
431 | 
432 |         # index of the previous character
433 |         self.i = offset - 1
434 |         # A trick: if the text length is 0, calling next() would otherwise run out of bounds
435 |         self.begin = -1 if self.array_length == 0 else offset
436 | 
437 |     # Whether there is a hit. False means the search is over; otherwise read
438 |     # the public members for the details of the hit.
438 |     def next(self):
439 |         b = self.last
440 |         while True:
441 |             self.i += 1
442 |             if self.i == self.array_length:  # pointer hit the end; advance the start by one, restart with state reset
443 |                 self.begin += 1
444 |                 if self.begin == self.array_length:
445 |                     break
446 |                 self.i = self.begin
447 |                 b = self.trie.base[0]
448 | 
449 |             p = b + self.code_array[self.i] + 1  # state transition: p = base[char[i-1]] + char[i] + 1
450 |             if b == self.trie.check[p]:  # base[char[i-1]] == check[base[char[i-1]] + char[i] + 1]
451 |                 b = self.trie.base[p]  # transition succeeded
452 |             else:
453 |                 self.i = self.begin  # transition failed; advance the start by one, restart with state reset
454 |                 self.begin += 1
455 |                 if self.begin == self.array_length:
456 |                     break
457 |                 b = self.trie.base[0]
458 |                 continue
459 |             p = b
460 |             n = self.trie.base[p]
461 |             if b == self.trie.check[p] and n < 0:  # base[p] == check[p] && base[p] < 0: found a word
462 |                 self.length = self.i - self.begin + 1
463 |                 self.index = -n - 1
464 |                 self.key = self.char_array[self.begin:self.begin + self.length]
465 |                 self.value = self.trie.get_attr(self.trie.v[self.index])
466 |                 self.last = b
467 |                 return True
468 |         return False
469 |
470 |     def search_all_words(self):
471 |         b = self.last
472 |         while True:
473 |             self.i += 1
474 |             if self.i == self.array_length:  # pointer hit the end; advance the start by one, restart with state reset
475 |                 self.begin += 1
476 |                 if self.begin == self.array_length:
477 |                     break
478 |                 self.i = self.begin
479 |                 b = self.trie.base[0]
480 | 
481 |             p = b + self.code_array[self.i] + 1  # state transition: p = base[char[i-1]] + char[i] + 1
482 |             if b == self.trie.check[p]:  # base[char[i-1]] == check[base[char[i-1]] + char[i] + 1]
483 |                 b = self.trie.base[p]  # transition succeeded
484 |             else:
485 |                 self.i = self.begin  # transition failed; advance the start by one, restart with state reset
486 |                 self.begin += 1
487 |                 if self.begin == self.array_length:
488 |                     break
489 |                 b = self.trie.base[0]
490 |                 continue
491 |             p = b
492 |             n = self.trie.base[p]
493 |             if b == self.trie.check[p] and n < 0:  # base[p] == check[p] && base[p] < 0: found a word
494 |                 self.length = self.i - self.begin + 1
495 |                 self.index = -n - 1
496 |                 self.key = self.char_array[self.begin:self.begin + self.length]
497 |                 self.value = self.trie.get_attr(self.trie.v[self.index])
498 |                 self.last = b
499 |                 yield self.begin, self.key, self.value
500 |         return
501 |
502 |
503 |
504 |
505 | # def seek(self,index):
506 | # self.i = index -1
507 | # self.begin = index
508 | # self.last = self.trie.base[0]
509 |
510 |
511 | # class MaxSearcher:
512 | # def __init__(self, trie, chararray, offset=0):
513 | # self.searcher = trie.search(chararray)
514 | # self.textbegin = 0
515 | # self.textend = 0
516 | #
517 | # def next(self):
518 | # prekey = None
519 | # preindex = None
520 | # prebegin = None
521 | # preend = None
522 | #
523 | # while self.searcher.next():
524 | # if prekey == None or prekey == self.searcher.key[:len(prekey)] :
525 | # prekey = self.searcher.key
526 | # preindex = self.searcher.index
527 | # prebegin = self.searcher.begin
528 | # preend = self.searcher.begin+self.searcher.length
529 | # continue
530 | # else:
531 | # self.key = prekey
532 | # self.value = self.searcher.trie.v[preindex]
533 | # self.textbegin = prebegin
534 | # self.textend = preend
535 | #             # need to move the start to just after the word that was found
536 | # self.searcher.seek(self.textend)
537 | # return True
538 | # return False
539 |
540 |
541 |
542 |
543 | @singleton
544 | class CoreDict:
545 | def __init__(self):
546 | self.trie = DoubleArrayTrie.load(config.CORE_DICT_NAME)
547 |
548 |
549 | def __split_id_attribute(item):
550 | index = item[0]
551 | value = item[1]
552 | if isinstance(value, str):
553 | value = value.split()
554 | if isinstance(value, list):
555 | value = value[1:]
556 | return index, value
557 |
558 |
559 | PERSON_WORD_ID, PERSON_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PEOPLE))
560 | PLACE_WORD_ID, PLACE_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PLACE))
561 | ORG_WORD_ID, ORG_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_GROUP))
562 | PROPER_WORD_ID, PROPER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PROPER))
563 | TIME_WORD_ID, TIME_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_TIME))
564 | NUMBER_WORD_ID, NUMBER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_NUMBER))
565 | CLUSTER_WORD_ID, CLUSTER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_CLUSTER))
566 |
567 | @singleton
568 | class CustomDict:
569 | def __init__(self):
570 | self.trie = DoubleArrayTrie.load(config.CUSTOM_DICT_NAME)
571 |
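572 | 
573 | # A minimal usage sketch (not part of the original sources): load_from_list
574 | # above accepts "word tag freq ..." entries, so a tiny trie plus a Searcher
575 | # scan could look like this; the sample entries are made up for illustration.
576 | #
577 | #     trie = DoubleArrayTrie.load_from_list([u"中国 ns 100", u"中国人 n 50"])
578 | #     searcher = trie.search(u"我是中国人")
579 | #     while searcher.next():
580 | #         print(searcher.begin, searcher.key, searcher.value)
581 | #     # yields the overlapping hits u"中国" and u"中国人"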
--------------------------------------------------------------------------------
/yaya/collection/hmm.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals
3 | import math
4 |
5 | from yaya.common.ns import NS
6 | from yaya import config
7 | from yaya.common.nr import NR
8 | from yaya.common.nt import NT
9 | from yaya.utility.singleton import singleton
10 |
11 | __author__ = 'tony'
12 |
13 |
14 | class HMMMatrix:
15 | def __init__(self):
16 | self.matrix = []
17 | self.total = None
18 | self.total_freq = 0
19 | self.start_prob = None
20 | self.trans_prob = None
21 |
22 | def get_total_freq(self, nature):
23 | return self.total[nature.index]
24 |
25 | @staticmethod
26 | def load(filename, cls):
27 | with open(filename, 'r') as f:
28 | flist = f.read().splitlines()
29 | labels = flist[0].split(',')[1:]
30 |             ord_array = [0] * len(labels)
31 |             ord_max = 0
32 |             for i in range(len(ord_array)):
33 |                 ord_array[i] = cls[labels[i]].index
34 |                 ord_max = max(ord_max, ord_array[i])
35 |             # the largest enum ordinal, plus one, gives the matrix dimension
36 |             ord_max += 1
37 | hmm = HMMMatrix()
38 | hmm.matrix = [[0 for col in range(ord_max)] for row in range(ord_max)]
39 | for row in flist[1:]:
40 | params = row.split(',')
41 | cur_ord = cls[params[0]].index
42 |                 for i in range(len(ord_array)):
43 | hmm.matrix[cur_ord][ord_array[i]] = int(params[1 + i])
44 |
45 |         hmm.total = [0] * ord_max
46 |         for j in range(ord_max):
47 |             for i in range(ord_max):
48 |                 hmm.total[j] += hmm.matrix[i][j]
50 |
51 | for j in range(ord_max):
52 | hmm.total[j] += hmm.matrix[j][j]
53 |
54 | for j in range(ord_max):
55 | hmm.total_freq += hmm.total[j]
56 |
57 |         # compute the HMM quadruple: states plus start and transition probabilities
58 |         states = ord_array
59 |         hmm.start_prob = [0.0] * ord_max
60 | for s in ord_array:
61 | freq = hmm.total[s] + 1e-8
62 | hmm.start_prob[s] = -math.log(freq / hmm.total_freq)
63 |
64 | hmm.trans_prob = [[0 for col in range(ord_max)] for row in range(ord_max)]
65 | for f in ord_array:
66 | for t in ord_array:
67 | freq = hmm.matrix[f][t] + 1e-8
68 | hmm.trans_prob[f][t] = -math.log(freq / hmm.total_freq)
69 | return hmm
70 |
71 |
72 | @singleton
73 | class PersonTranMatrix:
74 | def __init__(self):
75 | self.hmm = HMMMatrix.load(config.PERSON_TR_PATH, NR)
76 |
77 |
78 | @singleton
79 | class OrgTranMatrix:
80 | def __init__(self):
81 | self.hmm = HMMMatrix.load(config.ORG_TR_PATH, NT)
82 |
83 | @singleton
84 | class PlaceTranMatrix:
85 | def __init__(self):
86 | self.hmm = HMMMatrix.load(config.PLACE_TR_PATH, NS)
87 |
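88 | 
89 | # Sketch of the matrix file format that load() above parses (inferred from the
90 | # parsing code, not from project docs): the header row lists the tag labels,
91 | # and each following row gives one tag's transition counts, e.g. for two NR tags:
92 | #
93 | #     ,A,B
94 | #     A,10,5
95 | #     B,3,7
96 | #
97 | #     hmm = HMMMatrix.load("nr.tr.txt", NR)  # "nr.tr.txt" is a hypothetical path
98 | #     cost = hmm.trans_prob[NR.A.index][NR.B.index]  # -log of the A -> B probability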
--------------------------------------------------------------------------------
/yaya/collection/trie.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 | __all__ = ['Trie', 'StringTrie', 'SortedTrie', 'SortedStringTrie', 'Node']
3 |
4 | import sys
5 | from copy import copy
6 | from operator import itemgetter
7 | try:  # MutableMapping moved to collections.abc in Python 3.3
8 |     from collections.abc import MutableMapping
9 | except ImportError:
10 |     from collections import MutableMapping
8 |
9 | # Python 3 interoperability
10 | PY3 = sys.version_info[0] == 3
11 | if PY3:
12 | def itervalues(d):
13 | return d.values()
14 |
15 |
16 | def iteritems(d):
17 | return d.items()
18 | else:
19 | def itervalues(d):
20 | return d.itervalues()
21 |
22 |
23 | def iteritems(d):
24 | return d.iteritems()
25 |
26 |
27 | # Singleton sentinel - works with pickling
28 | class NULL(object):
29 | pass
30 |
31 |
32 | class Node(object):
33 | '''Trie node class.
34 |
35 | Subclasses may extend it to replace :attr:`ChildrenFactory` with a different
36 |     mapping class (e.g. ``sorteddict``).
37 |
38 | :ivar value: The value of the key corresponding to this node or :const:`NULL`
39 | if there is no such key.
40 | :ivar children: A ``{key-part : child-node}`` mapping.
41 | '''
42 | __slots__ = ('value', 'children')
43 |
44 | #: A callable for creating a new :attr:`children` mapping.
45 | ChildrenFactory = dict
46 |
47 | def __init__(self, value=NULL):
48 | self.value = value
49 | self.children = self.ChildrenFactory()
50 |
51 | def numkeys(self):
52 | '''Return the number of keys in the subtree rooted at this node.'''
53 | return (int(self.value is not NULL) +
54 | sum(child.numkeys() for child in itervalues(self.children)))
55 |
56 | def __repr__(self):
57 | return '(%s, {%s})' % (
58 | self.value is NULL and 'NULL' or repr(self.value),
59 | ', '.join('%r: %r' % t for t in iteritems(self.children)))
60 |
61 | def __copy__(self):
62 | clone = self.__class__(self.value)
63 | clone_children = clone.children
64 | for key, child in iteritems(self.children):
65 | clone_children[key] = child.__copy__()
66 | return clone
67 |
68 | def __getstate__(self):
69 | return (self.value, self.children)
70 |
71 | def __setstate__(self, state):
72 | self.value, self.children = state
73 |
74 |
75 | class Trie(MutableMapping):
76 | '''Base trie class.
77 |
78 | As with regular dicts, keys are not necessarily returned sorted. Use
79 | :class:`SortedTrie` if sorting is required.
80 | '''
81 |
82 | #: Callable for forming a key from its parts.
83 | KeyFactory = tuple
84 |
85 | #: Callable for creating new trie nodes.
86 | NodeFactory = Node
87 |
88 | def __init__(self, *args, **kwargs):
89 | '''Create a new trie.
90 |
91 | Parameters are the same with ``dict()``.
92 | '''
93 | self._root = self.NodeFactory()
94 |
95 | self.update(*args, **kwargs)
96 |
97 | @classmethod
98 | def fromkeys(cls, iterable, value=None):
99 | '''Create a new trie with keys from ``iterable`` and values set to ``value``.
100 |
101 | Parameters are the same with ``dict.fromkeys()``.
102 | '''
103 | d = cls()
104 | for key in iterable:
105 | d[key] = value
106 | return d
107 |
108 | # ----- trie-specific methods -----------------------------------------------
109 |
110 | def longest_prefix(self, key, default=NULL):
111 | '''Return the longest key in this trie that is a prefix of ``key``.
112 |
113 | If the trie doesn't contain any prefix of ``key``:
114 | - if ``default`` is given, return it
115 | - otherwise raise ``KeyError``
116 | '''
117 | try:
118 | return self.longest_prefix_item(key)[0]
119 | except KeyError:
120 | if default is not NULL:
121 | return default
122 | raise
123 |
124 | def longest_prefix_value(self, key, default=NULL):
125 | '''Return the value associated with the longest key in this trie that is
126 | a prefix of ``key``.
127 |
128 | If the trie doesn't contain any prefix of ``key``:
129 | - if ``default`` is given, return it
130 | - otherwise raise ``KeyError``
131 | '''
132 | current = self._root
133 | longest_prefix_value = NULL
134 | for part in key:
135 | current = current.children.get(part)
136 | if current is None:
137 | break
138 | value = current.value
139 | if value is not NULL:
140 | longest_prefix_value = value
141 | if longest_prefix_value is not NULL:
142 | return longest_prefix_value
143 | elif default is not NULL:
144 | return default
145 | else:
146 | raise KeyError
147 |
148 | def longest_prefix_item(self, key, default=NULL):
149 | '''Return the item (``(key,value)`` tuple) associated with the longest
150 | key in this trie that is a prefix of ``key``.
151 |
152 | If the trie doesn't contain any prefix of ``key``:
153 | - if ``default`` is given, return it
154 | - otherwise raise ``KeyError``
155 | '''
156 | prefix = []
157 | append = prefix.append
158 | current = self._root
159 | longest_prefix_value = NULL
160 | max_non_null_index = -1
161 | for i, part in enumerate(key):
162 | current = current.children.get(part)
163 | if current is None:
164 | break
165 | append(part)
166 | value = current.value
167 | if value is not NULL:
168 | longest_prefix_value = value
169 | max_non_null_index = i
170 | if longest_prefix_value is not NULL:
171 | del prefix[max_non_null_index + 1:]
172 | return (self.KeyFactory(prefix), longest_prefix_value)
173 | elif default is not NULL:
174 | return default
175 | else:
176 | raise KeyError
177 |
178 | def iter_prefixes(self, key):
179 | 'Return an iterator over the keys of this trie that are prefixes of ``key``.'
180 | key_factory = self.KeyFactory
181 | prefix = []
182 | append = prefix.append
183 | node = self._root
184 | for part in key:
185 | node = node.children.get(part)
186 | if node is None:
187 | break
188 | append(part)
189 | if node.value is not NULL:
190 | yield key_factory(prefix)
191 |
192 | def iter_prefix_values(self, key):
193 | '''Return an iterator over the values of this trie that are associated
194 | with keys that are prefixes of ``key``.
195 | '''
196 | node = self._root
197 | for part in key:
198 | node = node.children.get(part)
199 | if node is None:
200 | break
201 | if node.value is not NULL:
202 | yield node.value
203 |
204 | def iter_prefix_items(self, key):
205 | '''Return an iterator over the items (``(key,value)`` tuples) of this
206 | trie that are associated with keys that are prefixes of ``key``.
207 | '''
208 | key_factory = self.KeyFactory
209 | prefix = []
210 | append = prefix.append
211 | node = self._root
212 | for part in key:
213 | node = node.children.get(part)
214 | if node is None:
215 | break
216 | append(part)
217 | if node.value is not NULL:
218 | yield (key_factory(prefix), node.value)
219 |
220 | # ----- extended mapping API methods ----------------------------------------
221 |
222 | def keys(self, prefix=None):
223 | '''Return a list of this trie's keys.
224 |
225 | :param prefix: If not None, return only the keys prefixed by ``prefix``.
226 | '''
227 | return list(self.iterkeys(prefix))
228 |
229 | def values(self, prefix=None):
230 | '''Return a list of this trie's values.
231 |
232 | :param prefix: If not None, return only the values associated with keys
233 | prefixed by ``prefix``.
234 | '''
235 | return list(self.itervalues(prefix))
236 |
237 | def items(self, prefix=None):
238 | '''Return a list of this trie's items (``(key,value)`` tuples).
239 |
240 | :param prefix: If not None, return only the items associated with keys
241 | prefixed by ``prefix``.
242 | '''
243 | return list(self.iteritems(prefix))
244 |
245 | def iterkeys(self, prefix=None):
246 | '''Return an iterator over this trie's keys.
247 |
248 | :param prefix: If not None, yield only the keys prefixed by ``prefix``.
249 | '''
250 | return (key for key, value in self.iteritems(prefix))
251 |
252 | def itervalues(self, prefix=None):
253 | '''Return an iterator over this trie's values.
254 |
255 | :param prefix: If not None, yield only the values associated with keys
256 | prefixed by ``prefix``.
257 | '''
258 |
259 | def generator(node, NULL=NULL):
260 | if node.value is not NULL:
261 | yield node.value
262 | for part, child in iteritems(node.children):
263 | for subresult in generator(child):
264 | yield subresult
265 |
266 | if prefix is None:
267 | node = self._root
268 | else:
269 | node = self._find(prefix)
270 | if node is None:
271 | node = self.NodeFactory()
272 | return generator(node)
273 |
274 | def iteritems(self, prefix=None):
275 | '''Return an iterator over this trie's items (``(key,value)`` tuples).
276 |
277 | :param prefix: If not None, yield only the items associated with keys
278 | prefixed by ``prefix``.
279 | '''
280 | parts = []
281 | append = parts.append
282 |
283 | def generator(node, key_factory=self.KeyFactory, parts=parts,
284 | append=append, NULL=NULL):
285 | if node.value is not NULL:
286 | yield (key_factory(parts), node.value)
287 | for part, child in iteritems(node.children):
288 | append(part)
289 | for subresult in generator(child):
290 | yield subresult
291 | del parts[-1]
292 |
293 | node = self._root
294 | if prefix is not None:
295 | for part in prefix:
296 | append(part)
297 | node = node.children.get(part)
298 | if node is None:
299 | node = self.NodeFactory()
300 | break
301 | return generator(node)
302 |
303 | # ----- original mapping API methods ----------------------------------------
304 |
305 | def __len__(self):
306 | return self._root.numkeys()
307 |
308 | def __iter__(self):
309 | return self.iterkeys()
310 |
311 | def __contains__(self, key):
312 | node = self._find(key)
313 | return node is not None and node.value is not NULL
314 |
315 | def has_key(self, key):
316 | return key in self
317 |
318 | def __getitem__(self, key):
319 | node = self._find(key)
320 | if node is None or node.value is NULL:
321 | raise KeyError
322 | return node.value
323 |
324 | def __setitem__(self, key, value):
325 | node = self._root
326 | Node = self.NodeFactory
327 | for part in key:
328 | next = node.children.get(part)
329 | if next is None:
330 | node = node.children.setdefault(part, Node())
331 | else:
332 | node = next
333 | node.value = value
334 |
335 | def __delitem__(self, key):
336 | nodes_parts = []
337 | append = nodes_parts.append
338 | node = self._root
339 | for part in key:
340 | append((node, part))
341 | node = node.children.get(part)
342 | if node is None:
343 | break
344 | if node is None or node.value is NULL:
345 | raise KeyError
346 | node.value = NULL
347 | pop = nodes_parts.pop
348 | while node.value is NULL and not node.children and nodes_parts:
349 | node, part = pop()
350 | del node.children[part]
351 |
352 | def clear(self):
353 | self._root.children.clear()
354 |
355 | def copy(self):
356 | clone = copy(super(Trie, self))
357 | clone._root = copy(self._root)
358 | return clone
359 |
360 | def __repr__(self):
361 | return '%s({%s})' % (
362 | self.__class__.__name__,
363 | ', '.join('%r: %r' % t for t in self.iteritems()))
364 |
365 | def _find(self, key):
366 | node = self._root
367 | for part in key:
368 | node = node.children.get(part)
369 | if node is None:
370 | break
371 | return node
372 |
373 |
374 | class StringTrie(Trie):
375 |     '''A :class:`Trie` more appropriate for string keys.'''
376 | KeyFactory = ''.join
377 |
378 |
379 | # XXX: quick & dirty sorted dict
380 | # Currently only iteritems() (for Python 2) or items() (for Python 3) has to be
381 | # overridden. However, this is an implementation detail that may change in the future.
382 | class _SortedDict(dict):
383 | if PY3:
384 | def items(self):
385 | return iter(sorted(dict.items(self), key=itemgetter(0)))
386 | else:
387 | def iteritems(self):
388 | return iter(sorted(dict.iteritems(self), key=itemgetter(0)))
389 |
390 |
391 | class _SortedNode(Node):
392 | ChildrenFactory = _SortedDict
393 |
394 |
395 | class SortedTrie(Trie):
396 | '''A :class:`Trie` that returns its keys (and associated values/items) sorted.
397 |
398 | .. note::
399 | This implementation does not keep the keys sorted internally; instead it
400 | sorts them every time a method returning a list or iterator (e.g.
401 | :meth:`keys`) is called. In cases where a trie is relatively stable
402 | (few inserts/deletes) and is iterated often, it is probably more efficient
403 | to use a :attr:`NodeFactory` based on a sorted dict such as
404 | `sorteddict `_.
405 | '''
406 | NodeFactory = _SortedNode
407 |
408 |
409 | class SortedStringTrie(SortedTrie, StringTrie):
410 | 'A :class:`Trie` that is both a :class:`StringTrie` and a :class:`SortedTrie`.'
411 |
412 |
413 | if __name__ == '__main__':
414 | import doctest
415 |
416 | doctest.testmod()
417 |
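418 | 
419 | # Example usage (a sketch of the API defined above, in doctest style):
420 | #
421 | #     >>> t = SortedStringTrie(an=0, ant=1, all=2, allot=3)
422 | #     >>> t.keys(prefix='al')
423 | #     ['all', 'allot']
424 | #     >>> t.longest_prefix('antonym')
425 | #     'ant'
426 | #     >>> list(t.iter_prefixes('allotment'))
427 | #     ['all', 'allot']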
--------------------------------------------------------------------------------
/yaya/common/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
--------------------------------------------------------------------------------
/yaya/common/enum.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | class EnumException(Exception):
3 | """ Base class for all exceptions in this module. """
4 |
5 |     def __init__(self, *args, **kwargs):
6 |         if self.__class__ is EnumException:
7 |             class_name = self.__class__.__name__
8 |             raise NotImplementedError(
9 |                 "%(class_name)s is an abstract base class" % vars())
10 |         super(EnumException, self).__init__(*args, **kwargs)
10 |
11 |
12 | class EnumEmptyError(AssertionError, EnumException):
13 | """ Raised when attempting to create an empty enumeration. """
14 |
15 | def __str__(self):
16 | return "Enumerations cannot be empty"
17 |
18 |
19 | class EnumBadKeyError(TypeError, EnumException):
20 | """ Raised when creating an Enum with non-string keys. """
21 |
22 | def __init__(self, key):
23 | self.key = key
24 |
25 | def __str__(self):
26 | return "Enumeration keys must be strings: %(key)r" % vars(self)
27 |
28 |
29 | class EnumImmutableError(TypeError, EnumException):
30 | """ Raised when attempting to modify an Enum. """
31 |
32 | def __init__(self, *args):
33 | self.args = args
34 |
35 | def __str__(self):
36 | return "Enumeration does not allow modification"
37 |
38 |
39 | def _comparator(func):
40 | """ Decorator for EnumValue rich comparison methods. """
41 |
42 | def comparator_wrapper(self, other):
43 | try:
44 | assert self.enumtype == other.enumtype
45 | result = func(self.index, other.index)
46 | except (AssertionError, AttributeError):
47 | result = NotImplemented
48 |
49 | return result
50 |
51 | comparator_wrapper.__name__ = func.__name__
52 | comparator_wrapper.__doc__ = getattr(float, func.__name__).__doc__
53 | return comparator_wrapper
54 |
55 |
56 | class EnumValue(object):
57 | """ A specific value of an enumerated type. """
58 |
59 | def __init__(self, enumtype, index, key):
60 | """ Set up a new instance. """
61 | self._enumtype = enumtype.enum_name
62 | self._index = index
63 | self._key = key
64 |
65 | @property
66 | def enumtype(self):
67 | return self._enumtype
68 |
69 | @property
70 | def key(self):
71 | return self._key
72 |
73 | def __str__(self):
74 | return str(self.key)
75 |
76 | @property
77 | def index(self):
78 | return self._index
79 |
80 | def __repr__(self):
81 | return "EnumValue(%(_enumtype)r, %(_index)r, %(_key)r)" % vars(self)
82 |
83 | def __hash__(self):
84 | return hash(self._index)
85 |
86 | @_comparator
87 | def __eq__(self, other):
88 | return self == other
89 |
90 | @_comparator
91 | def __ne__(self, other):
92 | return self != other
93 |
94 | @_comparator
95 | def __lt__(self, other):
96 | return self < other
97 |
98 | @_comparator
99 | def __le__(self, other):
100 | return self <= other
101 |
102 | @_comparator
103 | def __gt__(self, other):
104 | return self > other
105 |
106 | @_comparator
107 | def __ge__(self, other):
108 | return self >= other
109 |
110 |
111 | class Enum(object):
112 | """ Enumerated type. """
113 |
114 | def __init__(self, *keys, **kwargs):
115 | """ Create an enumeration instance. """
116 |
117 | value_type = kwargs.get('value_type', EnumValue)
118 | enum_name = kwargs.get('enum_name', None)
119 | assert enum_name is not None
120 | self.__dict__['enum_name'] = enum_name
121 | if not keys:
122 | raise EnumEmptyError()
123 |
124 | keys = tuple(keys)
125 | values = [None] * len(keys)
126 |
127 | for i, key in enumerate(keys):
128 | value = value_type(self, i, key)
129 | values[i] = value
130 | try:
131 | super(Enum, self).__setattr__(key, value)
132 | except TypeError:
133 | raise EnumBadKeyError(key)
134 |
135 | self.__dict__['_keys'] = keys
136 | self.__dict__['_values'] = values
137 |
138 | def __setattr__(self, name, value):
139 | raise EnumImmutableError(name)
140 |
141 | def __delattr__(self, name):
142 | raise EnumImmutableError(name)
143 |
144 | def __len__(self):
145 | return len(self._values)
146 |
147 | def __getitem__(self, index):
148 |         # added by tony: also allow lookup by the enum key as a string, not just by ordinal index
149 |         if isinstance(index, (str, unicode)):
150 | return self.__getattribute__(index)
151 | else:
152 | return self._values[index]
153 |
154 | def __setitem__(self, index, value):
155 | raise EnumImmutableError(index)
156 |
157 | def __delitem__(self, index):
158 | raise EnumImmutableError(index)
159 |
160 | def __iter__(self):
161 | return iter(self._values)
162 |
163 | def __contains__(self, value):
164 | is_member = False
165 | if isinstance(value, basestring):
166 | is_member = (value in self._keys)
167 | else:
168 | is_member = (value in self._values)
169 | return is_member
170 |
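171 | 
172 | # Example usage (a sketch): each key becomes an attribute holding an EnumValue
173 | # with its ordinal index, and __getitem__ also accepts the key as a string.
174 | #
175 | #     >>> COLOR = Enum('red', 'green', enum_name="COLOR")
176 | #     >>> COLOR.red.index
177 | #     0
178 | #     >>> COLOR['green'] is COLOR.green
179 | #     True
180 | #     >>> 'red' in COLOR
181 | #     True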
--------------------------------------------------------------------------------
/yaya/common/nature.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.common.enum import Enum
3 |
4 | __author__ = 'tony'
5 | # part-of-speech tag set
6 | 
7 | NATURE = Enum(
8 |     "bg",  # distinguishing-word morpheme
9 |     "mg",  # numeral morpheme
10 |     "nl",  # noun idiom
11 |     "nx",  # alphabetic proper noun
12 |     "qg",  # classifier morpheme
13 |     "ud",  # particle
14 |     "uj",  # particle
15 |     "uz",  # particle 着 (zhe)
16 |     "ug",  # particle 过 (guo)
17 |     "ul",  # conjunction
18 |     "uv",  # conjunction
19 |     "yg",  # modal morpheme
20 |     "zg",  # status word
21 |     "n",  # noun
22 |     "nr",  # person name
23 |     "nrj",  # Japanese person name
24 |     "nrf",  # transliterated person name
25 |     "nr1",  # compound surname
26 |     "nr2",  # Mongolian name
27 |     "ns",  # place name
28 |     "nsf",  # transliterated place name
29 |     "nt",  # organization name
30 |     "ntc",  # company name
31 |     "ntcf",  # factory
32 |     "ntcb",  # bank
33 |     "ntch",  # hotel
34 |     "nto",  # government agency
35 |     "ntu",  # university
36 |     "nts",  # primary or secondary school
37 |     "nth",  # hospital
38 |     "nh",  # health-related noun (medicine, disease, etc.)
39 |     "nhm",  # medicine
40 |     "nhd",  # disease
41 |     "nn",  # work-related noun
42 |     "nnt",  # job title
43 |     "nnd",  # occupation
44 |     "ng",  # noun morpheme
45 |     "nf",  # food
46 |     "ni",  # organization-related noun
47 |     "nit",  # educational institution
48 |     "nic",  # subordinate organization
49 |     "nis",  # organization-name suffix
50 |     "nm",  # item name
51 |     "nmc",  # chemical name
52 |     "nb",  # organism name
53 |     "nba",  # animal name
54 |     "nbc",  # animal class or order
55 |     "nbp",  # plant name
56 |     "nz",  # other proper noun
57 |     "g",  # academic term
58 |     "gm",  # mathematics term
59 |     "gp",  # physics term
60 |     "gc",  # chemistry term
61 |     "gb",  # biology term
62 |     "gbc",  # biological category
63 |     "gg",  # geography/geology term
64 |     "gi",  # computing term
65 |     "j",  # abbreviation
66 |     "i",  # idiom (chengyu)
67 |     "l",  # common expression
68 |     "t",  # time word
69 |     "tg",  # time-word morpheme
70 |     "s",  # locative word
71 |     "f",  # directional word
72 |     "v",  # verb
73 |     "vd",  # adverbial verb
74 |     "vn",  # nominal verb
75 |     "vshi",  # the verb 是 (to be)
76 |     "vyou",  # the verb 有 (to have)
77 |     "vf",  # directional verb
78 |     "vx",  # formal (dummy) verb
79 |     "vi",  # intransitive verb
80 |     "vl",  # verb idiom
81 |     "vg",  # verb morpheme
82 |     "a",  # adjective
83 |     "ad",  # adverbial adjective
84 |     "an",  # nominal adjective
85 |     "ag",  # adjective morpheme
86 |     "al",  # adjective idiom
87 |     "b",  # distinguishing word
88 |     "bl",  # distinguishing-word idiom
89 |     "z",  # status word
90 |     "r",  # pronoun
91 |     "rr",  # personal pronoun
92 |     "rz",  # demonstrative pronoun
93 |     "rzt",  # temporal demonstrative pronoun
94 |     "rzs",  # locative demonstrative pronoun
95 |     "rzv",  # predicative demonstrative pronoun
96 |     "ry",  # interrogative pronoun
97 |     "ryt",  # temporal interrogative pronoun
98 |     "rys",  # locative interrogative pronoun
99 |     "ryv",  # predicative interrogative pronoun
100 |     "rg",  # pronoun morpheme
101 |     "Rg",  # classical Chinese pronoun morpheme
102 |     "m",  # numeral
103 |     "mq",  # numeral-classifier compound
104 |     "Mg",  # numerals such as 甲乙丙丁
105 |     "q",  # classifier
106 |     "qv",  # verbal classifier
107 |     "qt",  # temporal classifier
108 |     "d",  # adverb
109 |     "dg",  # adverb morpheme, e.g. 辄
110 |     "dl",  # adverbial phrase
111 |     "p",  # preposition
112 |     "pba",  # the preposition 把 (ba)
113 |     "pbei",  # the preposition 被 (bei)
114 |     "c",  # conjunction
115 |     "cc",  # coordinating conjunction
116 |     "u",  # particle
117 |     "uzhe",  # the particle 着
118 |     "ule",  # the particle 了
119 |     "uguo",  # the particle 过
120 |     "ude1",  # the particle 的
121 |     "ude2",  # the particle 地
122 |     "ude3",  # the particle 得
123 |     "usuo",  # the particle 所
124 |     "udeng",  # the particle 等
125 |     "uyy",  # the particle 一样
126 |     "udh",  # the particle 的话
127 |     "uls",  # the particle 来讲
128 |     "uzhi",  # the particle 之
129 |     "ulian",  # the particle 连
130 |     "e",  # interjection
131 |     "y",  # modal particle
132 |     "o",  # onomatopoeia
133 |     "h",  # prefix
134 |     "k",  # suffix
135 |     "x",  # string
136 |     "xx",  # non-morpheme character
137 |     "xu",  # URL
138 |     "w",  # punctuation
139 |     "wkz",  # left bracket
140 |     "wky",  # right bracket
141 |     "wyz",  # left quotation mark
142 |     "wyy",  # right quotation mark
143 |     "wj",  # full stop
144 |     "ww",  # question mark
145 |     "wt",  # exclamation mark
146 |     "wd",  # comma
147 |     "wf",  # semicolon
148 |     "wn",  # enumeration comma (、)
149 |     "wm",  # colon
150 |     "ws",  # ellipsis
151 |     "wp",  # dash
152 |     "wb",  # percent or permille sign
153 |     "wh",  # unit symbol
154 |     "end",  # used only for the sentence-end marker
155 |     "begin",  # used only for the sentence-begin marker
156 |     enum_name="NATURE"  # if omitted, persisted enums would compare types by object id
157 | )
158 |
159 |
--------------------------------------------------------------------------------
/yaya/common/nr.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals
3 | from yaya.common.enum import Enum
4 |
5 | __author__ = 'tony'
6 |
7 | NR = Enum(
8 | 
9 |     # Pf: surname, e.g. 【张】华平先生
10 | 
11 |     'B',
12 | 
13 |     # Pm: first character of a two-character given name, e.g. 张【华】平先生
14 | 
15 |     'C',
16 | 
17 |     # Pt: last character of a two-character given name, e.g. 张华【平】先生
18 | 
19 |     'D',
20 | 
21 |     # Ps: single-character given name, e.g. 张【浩】说:“我是一个好人”
22 | 
23 |     'E',
24 | 
25 |     # Ppf: prefix, e.g. 【老】刘、【小】李
26 | 
27 |     'F',
28 | 
29 |     # Plf: suffix, e.g. 王【总】、刘【老】、肖【氏】、吴【妈】、叶【帅】
30 | 
31 |     'G',
32 | 
33 |     # Pp: context preceding a person name, e.g. 又【来到】于洪洋的家。
34 | 
35 |     'K',
36 | 
37 |     # Pn: context following a person name, e.g. 新华社记者黄文【摄】
38 | 
39 |     'L',
40 | 
41 |     # Ppn: component between two Chinese person names, e.g. 编剧邵钧林【和】稽道青说
42 | 
43 |     'M',
44 | 
45 |     # Ppf: preceding context fused with the surname into one word, e.g. 这里【有关】天培的壮烈
46 | 
47 |     'U',
48 | 
49 |     # Pnw: last character of a three-character name fused with the following context, e.g. 龚学平等领导, 邓颖【超生】前
50 | 
51 |     'V',
52 | 
53 |     # Pfm: surname fused with the first character of the given name, e.g. 【王国】维
54 | 
55 |     'X',
56 | 
57 |     # Pfs: surname fused with a single-character given name, e.g. 【高峰】、【汪洋】
58 | 
59 |     'Y',
60 | 
61 |     # Pmt: two-character given name forming a word by itself, e.g. 张【朝阳】
62 | 
63 |     'Z',
64 | 
65 |     # Po: any role other than the above
66 | 
67 |     'A',
68 | 
69 |     # beginning of a sentence
70 | 
71 |     'S',
72 |     enum_name="NR"
73 | 
74 | )
75 |
76 | NRPattern = [
77 | 'BBCD',
78 | 'BBE',
79 | 'BBZ',
80 | 'BCD',
81 | 'BEE',
82 | 'BE',
83 | 'BC',
84 | 'BEC',
85 | 'BG',
86 | 'DG',
87 | 'EG',
88 | 'BXD',
89 | 'BZ',
90 | 'EE',
91 | 'FE',
92 | 'FC',
93 | 'FB',
94 | 'FG',
95 | 'Y',
96 | 'XD',
97 | 'GD',
98 | ]
--------------------------------------------------------------------------------
/yaya/common/ns.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals
3 | from yaya.common.enum import Enum
4 |
5 | __author__ = 'tony'
6 |
7 | NS = Enum(
8 |     'A',  # context preceding a place name, e.g. 我【来到】中关园
9 |     'B',  # context following a place name, e.g. 刘家村/和/下岸村/相邻
10 |     'C',  # first character of a Chinese place name
11 |     'D',  # second character of a Chinese place name
12 |     'E',  # third character of a Chinese place name
13 |     'G',  # a whole place name of some other kind
14 |     'H',  # suffix of a Chinese place name, e.g. 海/淀区
15 |     'X',  # connective, e.g. 刘家村/和/下岸村/相邻
16 |     'Z',  # other, non-place-name component
17 |     'S',  # beginning of a sentence
18 |     enum_name="NS"
19 | )
20 |
21 | NSPattern = [
22 | "CH",
23 | "CDH",
24 | "CDEH",
25 | "GH"
26 | ]
27 |
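28 | 
29 | # How to read these patterns (by analogy with the role comments above): e.g.
30 | # "CDH" = first + second character of a Chinese place name + suffix, as in
31 | # 海 + 淀 + 区.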
--------------------------------------------------------------------------------
/yaya/common/nt.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals
3 | from yaya.common.enum import Enum
4 |
5 | __author__ = 'tony'
6 |
7 | NT = Enum(
8 |     'A',  # preceding context, e.g. [参与]亚太经合组织的活动
9 |     'B',  # following context, e.g. 中央电视台[报道]
10 |     'X',  # connective, e.g. 北京电视台[和]天津电视台
11 |     'C',  # generic prefix of the keyword, e.g. 北京[电影]学院
12 |     'F',  # transliterated prefix of the keyword, e.g. 美国[摩托罗拉]公司
13 |     'G',  # place-name prefix of the keyword, e.g. 交通银行[北京]分行
14 |     'H',  # organization-name prefix of the keyword, e.g. [中共中央]顾问委员会
15 |     'I',  # special prefix of the keyword, e.g. [华谊]医院
16 |     'J',  # abbreviated prefix of the keyword, e.g. [巴]政府
17 |     'K',  # a whole organization, e.g. [麦当劳]
18 |     'L',  # directional word
19 |     'M',  # numeral, e.g. 公交集团[五]分公司
20 |     'P',  # single-character fragment
21 |     'W',  # symbol
22 |     'D',  # feature word (suffix) of the organization name, e.g. 国务院侨务[办公室]
23 |     'Z',  # non-organization component
24 |     'S',  # beginning of a sentence
25 |     enum_name="NT"
26 | )
27 |
28 | NTPattern = [
29 | "CCCCCCCCD",
30 | "CCCCCCCD",
31 | "CCCCCCD",
32 | "CCCCCCGD",
33 | "CCCCCCICCCCD",
34 | "CCCCCCPD",
35 | "CCCCCD",
36 | "CCCCCDD",
37 | "CCCCCGCCD",
38 | "CCCCCICCCCCD",
39 | "CCCCCPCCD",
40 | "CCCCCWDWD",
41 | "CCCCD",
42 | "CCCCDCCD",
43 | "CCCCDCD",
44 | "CCCCDD",
45 | "CCCCID",
46 | "CCCCPCD",
47 | "CCCD",
48 | "CCCDCCCD",
49 | "CCCDCCD",
50 | "CCCDCD",
51 | "CCCDD",
52 | "CCCDICFPD",
53 | "CCCFCFFCD",
54 | "CCCGD",
55 | "CCCGID",
56 | "CCCGJCD",
57 | "CCCID",
58 | "CCCJCCD",
59 | "CCCJD",
60 | "CCCLGCD",
61 | "CCCMD",
62 | "CCCPCCCD",
63 | "CCCPCCD",
64 | "CCCPCD",
65 | "CCCPD",
66 | "CCD",
67 | "CCDCCCCCCD",
68 | "CCDCCCCD",
69 | "CCDCCCD",
70 | "CCDCCCDD",
71 | "CCDCCD",
72 | "CCDCD",
73 | "CCDCDD",
74 | "CCDCGCD",
75 | "CCDCGCDID",
76 | "CCDCGCDPD",
77 | "CCDCGGDD",
78 | "CCDCID",
79 | "CCDCJCCD",
80 | "CCDCJCCDD",
81 | "CCDD",
82 | "CCDDD",
83 | "CCDFIDGD",
84 | "CCDGCCD",
85 | "CCDICD",
86 | "CCDID",
87 | "CCDJCD",
88 | "CCDPCD",
89 | "CCDPJD",
90 | "CCFCCD",
91 | "CCFD",
92 | "CCGCCCD",
93 | "CCGCCD",
94 | "CCGCD",
95 | "CCGCDCD",
96 | "CCGCDCMD",
97 | "CCGD",
98 | "CCGGCD",
99 | "CCGID",
100 | "CCGIDD",
101 | "CCGJD",
102 | "CCGWGWD",
103 | "CCICCD",
104 | "CCICD",
105 | "CCICIFD",
106 | "CCICJPD",
107 | "CCID",
108 | "CCIDCD",
109 | "CCIDD",
110 | "CCIID",
111 | "CCJCCCD",
112 | "CCJCCD",
113 | "CCJCD",
114 | "CCJCFD",
115 | "CCJD",
116 | "CCJID",
117 | "CCJJMJD",
118 | "CCKID",
119 | "CCLD",
120 | "CCMD",
121 | "CCMMPDCD",
122 | "CCPCCD",
123 | "CCPCD",
124 | "CCPD",
125 | "CCPDCD",
126 | "CCPPD",
127 | "CCWCWD",
128 | "CCWGWCCD",
129 | "CCWGWD",
130 | "CD",
131 | "CDCCCCCCD",
132 | "CDCCCCD",
133 | "CDCCCD",
134 | "CDCCD",
135 | "CDCCDD",
136 | "CDCCJD",
137 | "CDCD",
138 | "CDCDD",
139 | "CDCGD",
140 | "CDCGPCCD",
141 | "CDCJD",
142 | "CDCLD",
143 | "CDCWIWD",
144 | "CDD",
145 | "CDDCCD",
146 | "CDDCCDD",
147 | "CDDCD",
148 | "CDDD",
149 | "CDFD",
150 | "CDFPCCD",
151 | "CDGCD",
152 | "CDGCICD",
153 | "CDGD",
154 | "CDICD",
155 | "CDID",
156 | "CDILLCCD",
157 | "CDJCCD",
158 | "CDJCD",
159 | "CDJD",
160 | "CDJLD",
161 | "CDLGCD",
162 | "CDLJD",
163 | "CDMCD",
164 | "CDPCCCCD",
165 | "CDPCCD",
166 | "CDPD",
167 | "CDPPD",
168 | "CFCCD",
169 | "CFCPD",
170 | "CFD",
171 | "CFPD",
172 | "CGCCCD",
173 | "CGCCD",
174 | "CGCD",
175 | "CGCDCD",
176 | "CGCDD",
177 | "CGD",
178 | "CGDCD",
179 | "CGDD",
180 | "CGDDCCD",
181 | "CGDDD",
182 | "CGDDID",
183 | "CGDJD",
184 | "CGDMD",
185 | "CGFD",
186 | "CGGCCCD",
187 | "CGGCCD",
188 | "CGGCD",
189 | "CGGD",
190 | "CGGGD",
191 | "CGGGDD",
192 | "CGGICD",
193 | "CGGJD",
194 | "CGICD",
195 | "CGID",
196 | "CGIJD",
197 | "CGJD",
198 | "CGMD",
199 | "CGPJD",
200 | "CICCCCD",
201 | "CICCD",
202 | "CICD",
203 | "CICDCD",
204 | "CICDD",
205 | "CICWGWD",
206 | "CID",
207 | "CIDD",
208 | "CIGCD",
209 | "CIGD",
210 | "CIID",
211 | "CILCD",
212 | "CIMD",
213 | "CJCCCCCD",
214 | "CJCCCD",
215 | "CJCCCDD",
216 | "CJCCD",
217 | "CJCCMD",
218 | "CJCD",
219 | "CJCDD",
220 | "CJCGCCD",
221 | "CJCGPJD",
222 | "CJCMD",
223 | "CJCPCCCD",
224 | "CJCPD",
225 | "CJD",
226 | "CJDCCCCD",
227 | "CJDCCJD",
228 | "CJDCD",
229 | "CJDD",
230 | "CJDFD",
231 | "CJDPD",
232 | "CJFCD",
233 | "CJFD",
234 | "CJGD",
235 | "CJGLD",
236 | "CJGPCJD",
237 | "CJID",
238 | "CJJCCD",
239 | "CJJD",
240 | "CJJJD",
241 | "CJJLD",
242 | "CJKD",
243 | "CJLCCD",
244 | "CJMCD",
245 | "CJMD",
246 | "CJPD",
247 | "CJWCCWCGJD",
248 | "CJWD",
249 | "CJWPMWCGD",
250 | "CKCD",
251 | "CKD",
252 | "CKJCDCD",
253 | "CKJPD",
254 | "CLCCCD",
255 | "CLCCD",
256 | "CLCCGCD",
257 | "CLCD",
258 | "CLD",
259 | "CLDFD",
260 | "CLID",
261 | "CLPCD",
262 | "CMCD",
263 | "CMCDD",
264 | "CMCGD",
265 | "CMD",
266 | "CMDCD",
267 | "CMDD",
268 | "CMMD",
269 | "CMMDCCD",
270 | "CMPD",
271 | "CPCCCCCCCD",
272 | "CPCCCCD",
273 | "CPCCCD",
274 | "CPCCD",
275 | "CPCD",
276 | "CPCDD",
277 | "CPCPD",
278 | "CPD",
279 | "CPDCCD",
280 | "CPDCD",
281 | "CPDD",
282 | "CPDGD",
283 | "CPDWGWD",
284 | "CPGCD",
285 | "CPGD",
286 | "CPID",
287 | "CPJCD",
288 | "CPJD",
289 | "CPJPD",
290 | "CPMD",
291 | "CPPD",
292 | "CWCD",
293 | "CWCGWCCD",
294 | "CWCWD",
295 | "CWDWDD",
296 | "CWGWCCD",
297 | "CWGWCD",
298 | "CWPWD",
299 | "DCCCCCD",
300 | "DCCCCD",
301 | "DCCCCDCCD",
302 | "DCCCD",
303 | "DCCD",
304 | "DCD",
305 | "DCDD",
306 | "DCGCD",
307 | "DCJD",
308 | "DCPD",
309 | "DD",
310 | "DDCCD",
311 | "DDCD",
312 | "DDD",
313 | "DDICCD",
314 | "DFD",
315 | "DGCCD",
316 | "DGCD",
317 | "DGD",
318 | "DGDCD",
319 | "DGDD",
320 | "DGDPD",
321 | "DGGD",
322 | "DICCCD",
323 | "DICD",
324 | "DID",
325 | "DIICD",
326 | "DJCCD",
327 | "DJCD",
328 | "DJD",
329 | "DLCCD",
330 | "DLCD",
331 | "DLD",
332 | "DMCD",
333 | "DMD",
334 | "DMMCD",
335 | "DPD",
336 | "DPMMCCD",
337 | "FCCCCCD",
338 | "FCCCCD",
339 | "FCCCD",
340 | "FCCCPCD",
341 | "FCCD",
342 | "FCCGD",
343 | "FCCID",
344 | "FCCPD",
345 | "FCCWGWD",
346 | "FCD",
347 | "FCDCD",
348 | "FCDD",
349 | "FCDFD",
350 | "FCFCD",
351 | "FCFPD",
352 | "FCGCCD",
353 | "FCGCD",
354 | "FCGD",
355 | "FCID",
356 | "FCIJJD",
357 | "FCJCD",
358 | "FCJD",
359 | "FCPD",
360 | "FCPGCD",
361 | "FCWGWD",
362 | "FD",
363 | "FDCD",
364 | "FDD",
365 | "FDFD",
366 | "FDGCCD",
367 | "FDID",
368 | "FDLCD",
369 | "FFCCD",
370 | "FFCD",
371 | "FFCKFCCD",
372 | "FFCLLD",
373 | "FFD",
374 | "FFFD",
375 | "FFGCCD",
376 | "FFGD",
377 | "FFJCD",
378 | "FFJD",
379 | "FFJPCD",
380 | "FFPD",
381 | "FGCCD",
382 | "FGCD",
383 | "FGCGCGCJCD",
384 | "FGD",
385 | "FGDD",
386 | "FGFD",
387 | "FGJCCD",
388 | "FICCD",
389 | "FICD",
390 | "FICDD",
391 | "FICGD",
392 | "FICID",
393 | "FID",
394 | "FIDCD",
395 | "FIDD",
396 | "FIFPD",
397 | "FIID",
398 | "FIJCD",
399 | "FIJD",
400 | "FJCCD",
401 | "FJCD",
402 | "FJCDD",
403 | "FJD",
404 | "FJDCD",
405 | "FJDD",
406 | "FJGD",
407 | "FJJCCD",
408 | "FJJCD",
409 | "FJJCLCD",
410 | "FJJD",
411 | "FJJJCCD",
412 | "FJJJD",
413 | "FJJJICCD",
414 | "FJJLJLCD",
415 | "FJPJD",
416 | "FKCD",
417 | "FKCJD",
418 | "FLD",
419 | "FLPCD",
420 | "FMD",
421 | "FPCCCD",
422 | "FPCD",
423 | "FPD",
424 | "FPFD",
425 | "FPFDD",
426 | "FPID",
427 | "FPJCCD",
428 | "FPJCD",
429 | "FPPCD",
430 | "FPPD",
431 | "FPPDLD",
432 | "FWCCCWCD",
433 | "FWCCCWD",
434 | "FWDWD",
435 | "FWFD",
436 | "FWFWCCCWD",
437 | "FWGJCD",
438 | "FWGWCD",
439 | "GCCCCCCCD",
440 | "GCCCCCCD",
441 | "GCCCCCD",
442 | "GCCCCCDCD",
443 | "GCCCCCDD",
444 | "GCCCCD",
445 | "GCCCCDCCD",
446 | "GCCCCDD",
447 | "GCCCCGD",
448 | "GCCCCJD",
449 | "GCCCCPD",
450 | "GCCCCWDWD",
451 | "GCCCD",
452 | "GCCCDCCCD",
453 | "GCCCDCCCDD",
454 | "GCCCDCCD",
455 | "GCCCDCD",
456 | "GCCCDD",
457 | "GCCCDDJD",
458 | "GCCCDID",
459 | "GCCCDMCD",
460 | "GCCCDPD",
461 | "GCCCDWGCDWD",
462 | "GCCCFCD",
463 | "GCCCGD",
464 | "GCCCICD",
465 | "GCCCID",
466 | "GCCCJCD",
467 | "GCCCJD",
468 | "GCCCJGD",
469 | "GCCCLD",
470 | "GCCCMD",
471 | "GCCCPCCD",
472 | "GCCCWDWD",
473 | "GCCD",
474 | "GCCDCCCCD",
475 | "GCCDCCCD",
476 | "GCCDCCCDCD",
477 | "GCCDCCD",
478 | "GCCDCD",
479 | "GCCDCID",
480 | "GCCDCJCD",
481 | "GCCDCPCD",
482 | "GCCDD",
483 | "GCCDDCCCD",
484 | "GCCDDCCD",
485 | "GCCDDD",
486 | "GCCDFD",
487 | "GCCDGCCD",
488 | "GCCDGD",
489 | "GCCDGGDCD",
490 | "GCCDID",
491 | "GCCDJCD",
492 | "GCCDJD",
493 | "GCCDLDD",
494 | "GCCDLJCD",
495 | "GCCDMJD",
496 | "GCCDMJMMCD",
497 | "GCCDMJMMD",
498 | "GCCDMMD",
499 | "GCCDPD",
500 | "GCCFCD",
501 | "GCCFDD",
502 | "GCCFJPD",
503 | "GCCFPD",
504 | "GCCGCCCD",
505 | "GCCGCCD",
506 | "GCCGCD",
507 | "GCCGCDD",
508 | "GCCGD",
509 | "GCCGGCGD",
510 | "GCCGGDD",
511 | "GCCICCDCCD",
512 | "GCCICD",
513 | "GCCID",
514 | "GCCIDD",
515 | "GCCJCCCD",
516 | "GCCJCCCID",
517 | "GCCJCCD",
518 | "GCCJCD",
519 | "GCCJCJD",
520 | "GCCJD",
521 | "GCCJICD",
522 | "GCCJID",
523 | "GCCJPCD",
524 | "GCCJPD",
525 | "GCCKD",
526 | "GCCLCCD",
527 | "GCCLCD",
528 | "GCCLCGCD",
529 | "GCCLD",
530 | "GCCMCD",
531 | "GCCMD",
532 | "GCCMPD",
533 | "GCCPCCCCD",
534 | "GCCPCCCID",
535 | "GCCPCCD",
536 | "GCCPCD",
537 | "GCCPD",
538 | "GCCPDD",
539 | "GCCPFWCJD",
540 | "GCCPJD",
541 | "GCCWCCWCD",
542 | "GCCWCDWCD",
543 | "GCCWDWCCD",
544 | "GCCWDWD",
545 | "GCD",
546 | "GCDCCCCD",
547 | "GCDCCCCPD",
548 | "GCDCCCD",
549 | "GCDCCD",
550 | "GCDCCDCD",
551 | "GCDCCDD",
552 | "GCDCCDID",
553 | "GCDCCJCD",
554 | "GCDCCJD",
555 | "GCDCD",
556 | "GCDCDD",
557 | "GCDCDICD",
558 | "GCDCGCD",
559 | "GCDCGD",
560 | "GCDCGMCD",
561 | "GCDCID",
562 | "GCDCJCD",
563 | "GCDCJD",
564 | "GCDCLDD",
565 | "GCDCMCD",
566 | "GCDCMD",
567 | "GCDCMDCD",
568 | "GCDCMDD",
569 | "GCDCMDID",
570 | "GCDCPD",
571 | "GCDD",
572 | "GCDDCD",
573 | "GCDDD",
574 | "GCDDMCD",
575 | "GCDFD",
576 | "GCDFGCD",
577 | "GCDFWFD",
578 | "GCDGCCCCCD",
579 | "GCDGCCD",
580 | "GCDGCD",
581 | "GCDGD",
582 | "GCDGDD",
583 | "GCDGGD",
584 | "GCDGLCCD",
585 | "GCDGLJPCD",
586 | "GCDICCCCD",
587 | "GCDICCD",
588 | "GCDICD",
589 | "GCDID",
590 | "GCDIDD",
591 | "GCDJCCD",
592 | "GCDJCD",
593 | "GCDJCDGPD",
594 | "GCDJD",
595 | "GCDJJD",
596 | "GCDKCDCD",
597 | "GCDLCCCD",
598 | "GCDLD",
599 | "GCDLGCCCCD",
600 | "GCDLGCD",
601 | "GCDLPD",
602 | "GCDMCD",
603 | "GCDMCDD",
604 | "GCDMD",
605 | "GCDMDD",
606 | "GCDMJD",
607 | "GCDPCD",
608 | "GCDPD",
609 | "GCDWFWD",
610 | "GCDWGWCD",
611 | "GCDWGWD",
612 | "GCFCCD",
613 | "GCFCCJFGDD",
614 | "GCFCD",
615 | "GCFD",
616 | "GCFDD",
617 | "GCFFD",
618 | "GCFID",
619 | "GCFJCCD",
620 | "GCFPCD",
621 | "GCFPD",
622 | "GCFWGCCD",
623 | "GCFWGCCDD",
624 | "GCFWGJCD",
625 | "GCGCCCD",
626 | "GCGCCD",
627 | "GCGCD",
628 | "GCGCID",
629 | "GCGCLD",
630 | "GCGCPPCCD",
631 | "GCGD",
632 | "GCGDD",
633 | "GCGGCD",
634 | "GCGGCGD",
635 | "GCGGD",
636 | "GCGICD",
637 | "GCGID",
638 | "GCGJCCD",
639 | "GCGPCCD",
640 | "GCICCCCD",
641 | "GCICCCD",
642 | "GCICCD",
643 | "GCICD",
644 | "GCICDD",
645 | "GCID",
646 | "GCIDD",
647 | "GCIDID",
648 | "GCIFCCD",
649 | "GCIID",
650 | "GCIJCD",
651 | "GCIJD",
652 | "GCIJICD",
653 | "GCIPCD",
654 | "GCIPD",
655 | "GCIWGIIWD",
656 | "GCJCCCCD",
657 | "GCJCCCD",
658 | "GCJCCD",
659 | "GCJCD",
660 | "GCJCGD",
661 | "GCJCID",
662 | "GCJCIID",
663 | "GCJCPD",
664 | "GCJD",
665 | "GCJDCCD",
666 | "GCJDCD",
667 | "GCJDD",
668 | "GCJDID",
669 | "GCJFD",
670 | "GCJGD",
671 | "GCJICD",
672 | "GCJID",
673 | "GCJJCCD",
674 | "GCJJCD",
675 | "GCJJD",
676 | "GCJJGD",
677 | "GCJKCD",
678 | "GCJLCCD",
679 | "GCJMD",
680 | "GCJPCCGJLFD",
681 | "GCJPD",
682 | "GCJWCCJCD",
683 | "GCKCCD",
684 | "GCKD",
685 | "GCLCCCD",
686 | "GCLCCD",
687 | "GCLCD",
688 | "GCLD",
689 | "GCLDD",
690 | "GCLGGCD",
691 | "GCMCCD",
692 | "GCMCD",
693 | "GCMD",
694 | "GCMDD",
695 | "GCMPCD",
696 | "GCMPMD",
697 | "GCPCCCCD",
698 | "GCPCCCD",
699 | "GCPCCD",
700 | "GCPCCDD",
701 | "GCPCD",
702 | "GCPCDD",
703 | "GCPCKCD",
704 | "GCPD",
705 | "GCPDCCD",
706 | "GCPDD",
707 | "GCPFD",
708 | "GCPICCCD",
709 | "GCPJCCD",
710 | "GCPJCD",
711 | "GCPJD",
712 | "GCPJDCD",
713 | "GCPJJCD",
714 | "GCPJJDD",
715 | "GCPJPD",
716 | "GCPPCCD",
717 | "GCPPD",
718 | "GCPPPD",
719 | "GCWCWCJD",
720 | "GCWCWD",
721 | "GCWDWCDD",
722 | "GCWDWD",
723 | "GCWGWDD",
724 | "GD",
725 | "GDCCCCCCD",
726 | "GDCCCCCD",
727 | "GDCCCCD",
728 | "GDCCCCPD",
729 | "GDCCCD",
730 | "GDCCCDD",
731 | "GDCCCGCCD",
732 | "GDCCCJCD",
733 | "GDCCCJD",
734 | "GDCCCJDCD",
735 | "GDCCD",
736 | "GDCCDCD",
737 | "GDCCDCDD",
738 | "GDCCDD",
739 | "GDCCID",
740 | "GDCCJD",
741 | "GDCCPCD",
742 | "GDCD",
743 | "GDCDCCD",
744 | "GDCDCD",
745 | "GDCDD",
746 | "GDCDICD",
747 | "GDCDPD",
748 | "GDCFD",
749 | "GDCGCCD",
750 | "GDCGD",
751 | "GDCGPPCCD",
752 | "GDCID",
753 | "GDCIDD",
754 | "GDCJCCD",
755 | "GDCJD",
756 | "GDCLD",
757 | "GDCMD",
758 | "GDCPD",
759 | "GDCPID",
760 | "GDCPJD",
761 | "GDD",
762 | "GDDCCCCD",
763 | "GDDCCCD",
764 | "GDDCCD",
765 | "GDDCD",
766 | "GDDCDD",
767 | "GDDCFD",
768 | "GDDCFDCD",
769 | "GDDCMD",
770 | "GDDD",
771 | "GDDDCD",
772 | "GDDID",
773 | "GDDPPD",
774 | "GDDPPLD",
775 | "GDFCCD",
776 | "GDFCD",
777 | "GDFD",
778 | "GDFFD",
779 | "GDFGD",
780 | "GDGCCCD",
781 | "GDGCCD",
782 | "GDGCD",
783 | "GDGD",
784 | "GDGDCD",
785 | "GDGDD",
786 | "GDGDFID",
787 | "GDGJCCD",
788 | "GDGMD",
789 | "GDICCD",
790 | "GDICD",
791 | "GDID",
792 | "GDIDCD",
793 | "GDIDD",
794 | "GDIGCD",
795 | "GDIID",
796 | "GDIPCD",
797 | "GDJCCCD",
798 | "GDJCCD",
799 | "GDJCD",
800 | "GDJD",
801 | "GDJICD",
802 | "GDJJD",
803 | "GDJJJD",
804 | "GDJPCD",
805 | "GDJPDD",
806 | "GDLCCCCCD",
807 | "GDLCID",
808 | "GDLD",
809 | "GDLJD",
810 | "GDLJDD",
811 | "GDMCD",
812 | "GDMD",
813 | "GDMDCD",
814 | "GDMDD",
815 | "GDMJD",
816 | "GDMJMMD",
817 | "GDMPD",
818 | "GDPCCCCCD",
819 | "GDPCCD",
820 | "GDPCD",
821 | "GDPD",
822 | "GDPGCD",
823 | "GDPID",
824 | "GDPJCD",
825 | "GDPJD",
826 | "GDPPD",
827 | "GDPPJD",
828 | "GDWDWCCD",
829 | "GDWDWCCDD",
830 | "GDWDWD",
831 | "GDWFWD",
832 | "GDWGWD",
833 | "GFCCCCCD",
834 | "GFCCCCD",
835 | "GFCCCCJD",
836 | "GFCCCD",
837 | "GFCCCID",
838 | "GFCCD",
839 | "GFCCDD",
840 | "GFCCFCD",
841 | "GFCCPD",
842 | "GFCCPGD",
843 | "GFCD",
844 | "GFCDCD",
845 | "GFCDD",
846 | "GFCID",
847 | "GFCJCD",
848 | "GFCJD",
849 | "GFCPCCD",
850 | "GFCPCD",
851 | "GFCPD",
852 | "GFCPJD",
853 | "GFCPJPD",
854 | "GFD",
855 | "GFDCCCD",
856 | "GFDCD",
857 | "GFDD",
858 | "GFFCCD",
859 | "GFFCD",
860 | "GFFD",
861 | "GFFPCGCD",
862 | "GFGCD",
863 | "GFGCID",
864 | "GFGD",
865 | "GFGJCD",
866 | "GFICCD",
867 | "GFICD",
868 | "GFID",
869 | "GFIICD",
870 | "GFJCCCD",
871 | "GFJCCD",
872 | "GFJCD",
873 | "GFJCDCD",
874 | "GFJD",
875 | "GFJJCCD",
876 | "GFJJD",
877 | "GFJJJCCD",
878 | "GFJJLJCLCD",
879 | "GFLD",
880 | "GFLPD",
881 | "GFMCD",
882 | "GFPCD",
883 | "GFPD",
884 | "GFPJCD",
885 | "GFPJD",
886 | "GFPJPD",
887 | "GFPPCCCD",
888 | "GFPPD",
889 | "GFWCJCPCCCWCCD",
890 | "GFWGWCD",
891 | "GGCCCCCD",
892 | "GGCCCCD",
893 | "GGCCCD",
894 | "GGCCCICD",
895 | "GGCCCID",
896 | "GGCCCWDWD",
897 | "GGCCD",
898 | "GGCCDCD",
899 | "GGCCDD",
900 | "GGCCGCD",
901 | "GGCCGD",
902 | "GGCCGJD",
903 | "GGCCJCD",
904 | "GGCCJD",
905 | "GGCD",
906 | "GGCDCCCCCD",
907 | "GGCDCCD",
908 | "GGCDCD",
909 | "GGCDD",
910 | "GGCDJD",
911 | "GGCFCCFCPD",
912 | "GGCFD",
913 | "GGCFJD",
914 | "GGCGCCCD",
915 | "GGCGCD",
916 | "GGCGD",
917 | "GGCGGD",
918 | "GGCICLCD",
919 | "GGCID",
920 | "GGCIJCD",
921 | "GGCJCCD",
922 | "GGCJCD",
923 | "GGCJD",
924 | "GGCJDDCD",
925 | "GGCJJCCD",
926 | "GGCJJD",
927 | "GGCJPCICCCD",
928 | "GGCJPD",
929 | "GGCLCD",
930 | "GGCLD",
931 | "GGCMD",
932 | "GGCPCCD",
933 | "GGCPCD",
934 | "GGCPD",
935 | "GGD",
936 | "GGDCCCD",
937 | "GGDCCD",
938 | "GGDCD",
939 | "GGDD",
940 | "GGDDCCD",
941 | "GGDDCD",
942 | "GGDDD",
943 | "GGDFCD",
944 | "GGDFD",
945 | "GGDGD",
946 | "GGDID",
947 | "GGDJCD",
948 | "GGDJD",
949 | "GGDJJD",
950 | "GGDPPJD",
951 | "GGFCCCD",
952 | "GGFCCD",
953 | "GGFCD",
954 | "GGFD",
955 | "GGFDD",
956 | "GGFFCD",
957 | "GGFFD",
958 | "GGFFDCD",
959 | "GGFFDD",
960 | "GGFGD",
961 | "GGFJCCD",
962 | "GGFJD",
963 | "GGFJDD",
964 | "GGFJJD",
965 | "GGFLD",
966 | "GGFPCFPCD",
967 | "GGGCCCCD",
968 | "GGGCCCD",
969 | "GGGCCD",
970 | "GGGCD",
971 | "GGGCDD",
972 | "GGGCGCD",
973 | "GGGCGD",
974 | "GGGCID",
975 | "GGGCJD",
976 | "GGGD",
977 | "GGGDCD",
978 | "GGGDD",
979 | "GGGFD",
980 | "GGGGCD",
981 | "GGGGD",
982 | "GGGGFJD",
983 | "GGGGICD",
984 | "GGGGJD",
985 | "GGGGJPD",
986 | "GGGGLD",
987 | "GGGGPCD",
988 | "GGGGPPD",
989 | "GGGICD",
990 | "GGGID",
991 | "GGGIDID",
992 | "GGGIGCJD",
993 | "GGGIJD",
994 | "GGGJCD",
995 | "GGGJD",
996 | "GGGJJCJD",
997 | "GGGJJD",
998 | "GGGJPCCD",
999 | "GGGLD",
1000 | "GGGMD",
1001 | "GGGPJD",
1002 | "GGGWICWD",
1003 | "GGICCCCD",
1004 | "GGICCCD",
1005 | "GGICCD",
1006 | "GGICCGD",
1007 | "GGICCLD",
1008 | "GGICCPCCD",
1009 | "GGICD",
1010 | "GGICGCCCD",
1011 | "GGICID",
1012 | "GGICJD",
1013 | "GGID",
1014 | "GGIDCD",
1015 | "GGIDD",
1016 | "GGIFD",
1017 | "GGIFJCD",
1018 | "GGIFPD",
1019 | "GGIGCCD",
1020 | "GGIGD",
1021 | "GGIICD",
1022 | "GGIID",
1023 | "GGIIPID",
1024 | "GGIJCCD",
1025 | "GGIJD",
1026 | "GGIPCD",
1027 | "GGIPD",
1028 | "GGIPDD",
1029 | "GGJCCCD",
1030 | "GGJCCD",
1031 | "GGJCCPCJCCD",
1032 | "GGJCD",
1033 | "GGJCWDWD",
1034 | "GGJD",
1035 | "GGJGCCCD",
1036 | "GGJGCCD",
1037 | "GGJGD",
1038 | "GGJJD",
1039 | "GGJJPCD",
1040 | "GGJLD",
1041 | "GGJPD",
1042 | "GGJPDD",
1043 | "GGKD",
1044 | "GGKGD",
1045 | "GGLCCCD",
1046 | "GGLCD",
1047 | "GGLCDD",
1048 | "GGLCJD",
1049 | "GGLCPD",
1050 | "GGLD",
1051 | "GGLFD",
1052 | "GGLID",
1053 | "GGLJD",
1054 | "GGLLFD",
1055 | "GGLPD",
1056 | "GGMCD",
1057 | "GGMCDD",
1058 | "GGMD",
1059 | "GGMJCD",
1060 | "GGMLD",
1061 | "GGMPCCD",
1062 | "GGPCCCD",
1063 | "GGPCCD",
1064 | "GGPCD",
1065 | "GGPCJCD",
1066 | "GGPD",
1067 | "GGPFD",
1068 | "GGPICD",
1069 | "GGPJCCCCD",
1070 | "GGPJCD",
1071 | "GGPJCDD",
1072 | "GGPJD",
1073 | "GGPLD",
1074 | "GGPPCCD",
1075 | "GGPPCD",
1076 | "GGPPD",
1077 | "GGPPJJD",
1078 | "GGPPPCD",
1079 | "GGWPCGWPJD",
1080 | "GICCCCCCD",
1081 | "GICCCCCD",
1082 | "GICCCCD",
1083 | "GICCCD",
1084 | "GICCCDD",
1085 | "GICCCJCD",
1086 | "GICCD",
1087 | "GICCDD",
1088 | "GICCJD",
1089 | "GICCLDD",
1090 | "GICCPD",
1091 | "GICD",
1092 | "GICDCCCCD",
1093 | "GICDCCD",
1094 | "GICDCD",
1095 | "GICDD",
1096 | "GICDLPD",
1097 | "GICDWCCWD",
1098 | "GICGCCCCD",
1099 | "GICGCCD",
1100 | "GICGCJICD",
1101 | "GICGD",
1102 | "GICGGD",
1103 | "GICGMMD",
1104 | "GICGPCJD",
1105 | "GICICCD",
1106 | "GICICD",
1107 | "GICID",
1108 | "GICIGD",
1109 | "GICIID",
1110 | "GICJCCD",
1111 | "GICJCD",
1112 | "GICJD",
1113 | "GICPCCCCD",
1114 | "GICPD",
1115 | "GICPICD",
1116 | "GICPJD",
1117 | "GID",
1118 | "GIDCCCJCD",
1119 | "GIDCCD",
1120 | "GIDCD",
1121 | "GIDD",
1122 | "GIDDD",
1123 | "GIDICCD",
1124 | "GIDID",
1125 | "GIDLPCD",
1126 | "GIFCCD",
1127 | "GIFD",
1128 | "GIFICD",
1129 | "GIFWFD",
1130 | "GIGCCD",
1131 | "GIGCD",
1132 | "GIGCGCD",
1133 | "GIGCJD",
1134 | "GIGCPD",
1135 | "GIGD",
1136 | "GIGGD",
1137 | "GIGICD",
1138 | "GIGID",
1139 | "GIGJPCD",
1140 | "GIICCCCD",
1141 | "GIICCD",
1142 | "GIICD",
1143 | "GIID",
1144 | "GIIGD",
1145 | "GIIID",
1146 | "GIIJCCCD",
1147 | "GIIJCD",
1148 | "GIJCCCCCD",
1149 | "GIJCCCCD",
1150 | "GIJCCCD",
1151 | "GIJCCD",
1152 | "GIJCD",
1153 | "GIJCPD",
1154 | "GIJD",
1155 | "GIJDD",
1156 | "GIJID",
1157 | "GIJJCCD",
1158 | "GIJJCD",
1159 | "GIJLD",
1160 | "GIJPD",
1161 | "GIJPDCD",
1162 | "GIKD",
1163 | "GILCCCCDD",
1164 | "GILCCD",
1165 | "GILCD",
1166 | "GILD",
1167 | "GILID",
1168 | "GILPMD",
1169 | "GIMCCD",
1170 | "GIMCD",
1171 | "GIMD",
1172 | "GIMJCD",
1173 | "GIMJD",
1174 | "GIMPCCD",
1175 | "GIPCCCCD",
1176 | "GIPCCCD",
1177 | "GIPCCD",
1178 | "GIPCD",
1179 | "GIPCMD",
1180 | "GIPD",
1181 | "GIPDCD",
1182 | "GIPDD",
1183 | "GIPICD",
1184 | "GIPJCCD",
1185 | "GIPJCD",
1186 | "GIPPCD",
1187 | "GIPPD",
1188 | "GIWDCCWCD",
1189 | "GIWDWD",
1190 | "GIWGWCD",
1191 | "GJCCCCCD",
1192 | "GJCCCCD",
1193 | "GJCCCD",
1194 | "GJCCCDCDCD",
1195 | "GJCCCDD",
1196 | "GJCCD",
1197 | "GJCCDCD",
1198 | "GJCCDD",
1199 | "GJCCFD",
1200 | "GJCCGJPD",
1201 | "GJCCICCD",
1202 | "GJCCJCD",
1203 | "GJCCJD",
1204 | "GJCD",
1205 | "GJCDCCD",
1206 | "GJCDCJCCD",
1207 | "GJCDD",
1208 | "GJCDJCD",
1209 | "GJCDPD",
1210 | "GJCGCD",
1211 | "GJCGD",
1212 | "GJCGPJCCD",
1213 | "GJCICCCD",
1214 | "GJCICD",
1215 | "GJCID",
1216 | "GJCJCCD",
1217 | "GJCJCD",
1218 | "GJCJD",
1219 | "GJCJJCCCCD",
1220 | "GJCJJCD",
1221 | "GJCJPD",
1222 | "GJCJPPCD",
1223 | "GJCLD",
1224 | "GJCLJCCCD",
1225 | "GJCMD",
1226 | "GJCPD",
1227 | "GJCPJD",
1228 | "GJCPPD",
1229 | "GJD",
1230 | "GJDCCCD",
1231 | "GJDCCD",
1232 | "GJDCD",
1233 | "GJDD",
1234 | "GJDICD",
1235 | "GJDID",
1236 | "GJDLCD",
1237 | "GJDPCD",
1238 | "GJFCCD",
1239 | "GJFCD",
1240 | "GJFD",
1241 | "GJFFD",
1242 | "GJFGD",
1243 | "GJFICD",
1244 | "GJGCD",
1245 | "GJGD",
1246 | "GJGPCD",
1247 | "GJICCCD",
1248 | "GJICCD",
1249 | "GJICD",
1250 | "GJID",
1251 | "GJIID",
1252 | "GJJCCCD",
1253 | "GJJCCD",
1254 | "GJJCCDD",
1255 | "GJJCD",
1256 | "GJJCJCCCD",
1257 | "GJJCJCCD",
1258 | "GJJCPCD",
1259 | "GJJD",
1260 | "GJJDCD",
1261 | "GJJDD",
1262 | "GJJFCCD",
1263 | "GJJFD",
1264 | "GJJGD",
1265 | "GJJJCD",
1266 | "GJJJD",
1267 | "GJJJICD",
1268 | "GJJJJCCD",
1269 | "GJJJJD",
1270 | "GJJPCCCD",
1271 | "GJJPCCD",
1272 | "GJJPCID",
1273 | "GJJPPD",
1274 | "GJLCCCCD",
1275 | "GJLCD",
1276 | "GJLCDD",
1277 | "GJLD",
1278 | "GJMCCD",
1279 | "GJMD",
1280 | "GJPCCCCD",
1281 | "GJPCCCD",
1282 | "GJPCCD",
1283 | "GJPCD",
1284 | "GJPCDD",
1285 | "GJPCJCD",
1286 | "GJPCLCD",
1287 | "GJPCMD",
1288 | "GJPD",
1289 | "GJPDD",
1290 | "GJPGCCD",
1291 | "GJPGD",
1292 | "GJPICCD",
1293 | "GJPICD",
1294 | "GJPICDD",
1295 | "GJPJCCD",
1296 | "GJPJD",
1297 | "GJPJPD",
1298 | "GJPLCD",
1299 | "GJPPJD",
1300 | "GKCCCD",
1301 | "GKCCD",
1302 | "GKCCPD",
1303 | "GKCD",
1304 | "GKCDCD",
1305 | "GKCDD",
1306 | "GKCDJCD",
1307 | "GKCJCD",
1308 | "GKCMD",
1309 | "GKD",
1310 | "GKDD",
1311 | "GKJJD",
1312 | "GLCCCCCCD",
1313 | "GLCCCCD",
1314 | "GLCCCD",
1315 | "GLCCD",
1316 | "GLCCDD",
1317 | "GLCCJCCCD",
1318 | "GLCCJCCD",
1319 | "GLCD",
1320 | "GLCDD",
1321 | "GLCDGCCD",
1322 | "GLCGCJCD",
1323 | "GLCGD",
1324 | "GLCGDD",
1325 | "GLCJD",
1326 | "GLCJJCCCCCD",
1327 | "GLCLD",
1328 | "GLCMD",
1329 | "GLCPCCD",
1330 | "GLCPD",
1331 | "GLD",
1332 | "GLDCD",
1333 | "GLDCMD",
1334 | "GLDCMDCD",
1335 | "GLDCMDD",
1336 | "GLDD",
1337 | "GLDDCKCD",
1338 | "GLFCD",
1339 | "GLFCFD",
1340 | "GLFGCD",
1341 | "GLGCD",
1342 | "GLGD",
1343 | "GLGPJD",
1344 | "GLICCD",
1345 | "GLICD",
1346 | "GLID",
1347 | "GLJCCCD",
1348 | "GLJCCD",
1349 | "GLJCD",
1350 | "GLJCICCD",
1351 | "GLJD",
1352 | "GLJFCD",
1353 | "GLJGD",
1354 | "GLJICCD",
1355 | "GLJID",
1356 | "GLJJD",
1357 | "GLJPCCD",
1358 | "GLJPCICD",
1359 | "GLJPJCCD",
1360 | "GLJWGWCD",
1361 | "GLLCCCD",
1362 | "GLLCID",
1363 | "GLPCCCD",
1364 | "GLPCCD",
1365 | "GLPCD",
1366 | "GLPCDD",
1367 | "GLPCPCCD",
1368 | "GLPD",
1369 | "GLPDD",
1370 | "GLPGCD",
1371 | "GLPJD",
1372 | "GLPLJCCCD",
1373 | "GLPLJCD",
1374 | "GLPPCCCCD",
1375 | "GLPPCCD",
1376 | "GLPPCD",
1377 | "GMCCCCD",
1378 | "GMCCCD",
1379 | "GMCCD",
1380 | "GMCCID",
1381 | "GMCD",
1382 | "GMCDCCCD",
1383 | "GMCDCCD",
1384 | "GMCDCD",
1385 | "GMCDD",
1386 | "GMCDMCD",
1387 | "GMCGD",
1388 | "GMCJCD",
1389 | "GMCMD",
1390 | "GMCMJD",
1391 | "GMD",
1392 | "GMDCD",
1393 | "GMDD",
1394 | "GMDICD",
1395 | "GMDID",
1396 | "GMGJCD",
1397 | "GMGJJD",
1398 | "GMICD",
1399 | "GMID",
1400 | "GMIPJCCD",
1401 | "GMJCCD",
1402 | "GMJCD",
1403 | "GMJD",
1404 | "GMJDD",
1405 | "GMJICCCD",
1406 | "GMJMJFCD",
1407 | "GMJPCD",
1408 | "GMJPLCCD",
1409 | "GMLD",
1410 | "GMLDCD",
1411 | "GMLGCD",
1412 | "GMLID",
1413 | "GMLLD",
1414 | "GMMCCCD",
1415 | "GMMD",
1416 | "GMMGD",
1417 | "GMMLCCD",
1418 | "GMMPCD",
1419 | "GMMPD",
1420 | "GMPCCD",
1421 | "GMPCD",
1422 | "GMPD",
1423 | "GMPDCD",
1424 | "GMPDD",
1425 | "GMPJCD",
1426 | "GPCCCCCCD",
1427 | "GPCCCCD",
1428 | "GPCCCCID",
1429 | "GPCCCD",
1430 | "GPCCD",
1431 | "GPCCDCCD",
1432 | "GPCCDD",
1433 | "GPCCDDD",
1434 | "GPCD",
1435 | "GPCDCCD",
1436 | "GPCDCD",
1437 | "GPCDD",
1438 | "GPCFDCCD",
1439 | "GPCFDD",
1440 | "GPCGD",
1441 | "GPCICCD",
1442 | "GPCID",
1443 | "GPCIJD",
1444 | "GPCJCCCD",
1445 | "GPCJCCD",
1446 | "GPCJCD",
1447 | "GPCPID",
1448 | "GPCWDWCD",
1449 | "GPD",
1450 | "GPDCCD",
1451 | "GPDCD",
1452 | "GPDD",
1453 | "GPFCCD",
1454 | "GPFCD",
1455 | "GPFD",
1456 | "GPFFCD",
1457 | "GPGCCCD",
1458 | "GPGD",
1459 | "GPGJCJCCCCD",
1460 | "GPGPJD",
1461 | "GPICCCCD",
1462 | "GPICCCD",
1463 | "GPICCD",
1464 | "GPICD",
1465 | "GPID",
1466 | "GPIDCD",
1467 | "GPIDD",
1468 | "GPJCCCCCD",
1469 | "GPJCCCD",
1470 | "GPJCCD",
1471 | "GPJCD",
1472 | "GPJCDD",
1473 | "GPJCJCCD",
1474 | "GPJD",
1475 | "GPJDCCD",
1476 | "GPJDCD",
1477 | "GPJDD",
1478 | "GPJFICD",
1479 | "GPJFID",
1480 | "GPJGD",
1481 | "GPJJCCD",
1482 | "GPJJCD",
1483 | "GPJLCD",
1484 | "GPJWDWD",
1485 | "GPLCWCWCWD",
1486 | "GPLD",
1487 | "GPLJCCD",
1488 | "GPMJCGD",
1489 | "GPMMD",
1490 | "GPMPCCD",
1491 | "GPPCCCCD",
1492 | "GPPCCCD",
1493 | "GPPCCD",
1494 | "GPPCD",
1495 | "GPPCDCCD",
1496 | "GPPCDD",
1497 | "GPPCLD",
1498 | "GPPD",
1499 | "GPPDCD",
1500 | "GPPDCDD",
1501 | "GPPDD",
1502 | "GPPGCD",
1503 | "GPPICCD",
1504 | "GPPID",
1505 | "GPPJCD",
1506 | "GPPJD",
1507 | "GPPJDD",
1508 | "GPPJJCCCCD",
1509 | "GPPLD",
1510 | "GPPPCCD",
1511 | "GPPPCKCCD",
1512 | "GPPPPCCD",
1513 | "GWCPWD",
1514 | "GWCWCCCD",
1515 | "GWCWCD",
1516 | "GWCWD",
1517 | "GWCWPJCD",
1518 | "GWD",
1519 | "GWFCD",
1520 | "GWGCCCD",
1521 | "GWGCCD",
1522 | "GWGCCWCD",
1523 | "GWGCD",
1524 | "GWGCWD",
1525 | "GWGD",
1526 | "GWGID",
1527 | "GWGWCCCCD",
1528 | "GWGWCCCD",
1529 | "GWGWCD",
1530 | "GWGWICD",
1531 | "GWGWLCD",
1532 | "GWICD",
1533 | "GWICWD",
1534 | "GWIWD",
1535 | "GWJWD",
1536 | "GWLJWCD",
1537 | "GWPD",
1538 | "GWPJD",
1539 | "ICCCCCCD",
1540 | "ICCCCCD",
1541 | "ICCCCD",
1542 | "ICCCCDD",
1543 | "ICCCD",
1544 | "ICCD",
1545 | "ICCDCCD",
1546 | "ICCDCD",
1547 | "ICCDD",
1548 | "ICCGCCD",
1549 | "ICCGCIPD",
1550 | "ICCGD",
1551 | "ICCJD",
1552 | "ICCPD",
1553 | "ICCWDWCD",
1554 | "ICD",
1555 | "ICDD",
1556 | "ICDID",
1557 | "ICFD",
1558 | "ICGCCCD",
1559 | "ICGCD",
1560 | "ICGFD",
1561 | "ICGGCD",
1562 | "ICGLCMD",
1563 | "ICICD",
1564 | "ICID",
1565 | "ICIGD",
1566 | "ICJCD",
1567 | "ICJD",
1568 | "ICJJD",
1569 | "ICLJCD",
1570 | "ICMCCCCD",
1571 | "ICMD",
1572 | "ICPCD",
1573 | "ICPD",
1574 | "ICPPD",
1575 | "ICWGWCD",
1576 | "ICWGWD",
1577 | "ICWGWDCD",
1578 | "ID",
1579 | "IDCCCCD",
1580 | "IDCCCD",
1581 | "IDCCD",
1582 | "IDCCGJID",
1583 | "IDCCICD",
1584 | "IDCCICDID",
1585 | "IDCD",
1586 | "IDCDCD",
1587 | "IDCDD",
1588 | "IDCFCD",
1589 | "IDCGD",
1590 | "IDCICD",
1591 | "IDCID",
1592 | "IDCJD",
1593 | "IDCPCCCCCCD",
1594 | "IDD",
1595 | "IDGCCCD",
1596 | "IDGCD",
1597 | "IDID",
1598 | "IDIDD",
1599 | "IDJCD",
1600 | "IDKCD",
1601 | "IDPD",
1602 | "IDWCWCCDD",
1603 | "IFD",
1604 | "IFWGWCD",
1605 | "IGCCCD",
1606 | "IGCCCDD",
1607 | "IGCCD",
1608 | "IGCD",
1609 | "IGCDCD",
1610 | "IGCDD",
1611 | "IGCGCCD",
1612 | "IGCGCD",
1613 | "IGCID",
1614 | "IGCJD",
1615 | "IGCPD",
1616 | "IGCWJWD",
1617 | "IGD",
1618 | "IGDD",
1619 | "IGFCCD",
1620 | "IGFCD",
1621 | "IGFD",
1622 | "IGGCD",
1623 | "IGID",
1624 | "IGJD",
1625 | "IGLCD",
1626 | "IGLD",
1627 | "IGPCD",
1628 | "IGPCDD",
1629 | "IICCCD",
1630 | "IICCD",
1631 | "IICD",
1632 | "IICGD",
1633 | "IID",
1634 | "IIGD",
1635 | "IIGJCJCD",
1636 | "IIIGCD",
1637 | "IIPCD",
1638 | "IJCCCCD",
1639 | "IJCCCD",
1640 | "IJCCD",
1641 | "IJCD",
1642 | "IJD",
1643 | "IJDCCD",
1644 | "IJGCD",
1645 | "IJGD",
1646 | "IJJCD",
1647 | "IJJD",
1648 | "IJJJCD",
1649 | "IJPCDD",
1650 | "IJWCFIWGD",
1651 | "IJWCFWD",
1652 | "IJWCPWGD",
1653 | "IKCCCD",
1654 | "ILCD",
1655 | "ILD",
1656 | "ILPCD",
1657 | "ILPMD",
1658 | "IMCCD",
1659 | "IMCD",
1660 | "IMD",
1661 | "IMPD",
1662 | "IPCCCD",
1663 | "IPCCD",
1664 | "IPCCID",
1665 | "IPCCJD",
1666 | "IPCD",
1667 | "IPCID",
1668 | "IPCJD",
1669 | "IPCPD",
1670 | "IPD",
1671 | "IPFCD",
1672 | "IPID",
1673 | "IPIJD",
1674 | "IPJCGD",
1675 | "IPJD",
1676 | "IPPCD",
1677 | "JCCCCCCD",
1678 | "JCCCCCD",
1679 | "JCCCCD",
1680 | "JCCCD",
1681 | "JCCCJCD",
1682 | "JCCD",
1683 | "JCCID",
1684 | "JCCJD",
1685 | "JCCMCD",
1686 | "JCD",
1687 | "JCDCCD",
1688 | "JCDCD",
1689 | "JCDD",
1690 | "JCDID",
1691 | "JCFCD",
1692 | "JCGCCCCD",
1693 | "JCGCCCD",
1694 | "JCGCCD",
1695 | "JCGCD",
1696 | "JCGD",
1697 | "JCGJGD",
1698 | "JCICCCD",
1699 | "JCID",
1700 | "JCIDD",
1701 | "JCJCCCD",
1702 | "JCJCCD",
1703 | "JCJCD",
1704 | "JCJD",
1705 | "JCJDD",
1706 | "JCJFD",
1707 | "JCJJPCD",
1708 | "JCJPID",
1709 | "JCJWGWD",
1710 | "JCLD",
1711 | "JCMD",
1712 | "JCMPD",
1713 | "JCPJCID",
1714 | "JCPJJCD",
1715 | "JCPPCCCD",
1716 | "JD",
1717 | "JDCD",
1718 | "JDCMD",
1719 | "JDD",
1720 | "JDGD",
1721 | "JDID",
1722 | "JDJD",
1723 | "JDMD",
1724 | "JFCD",
1725 | "JFD",
1726 | "JGCCCD",
1727 | "JGCD",
1728 | "JGD",
1729 | "JGDCJD",
1730 | "JGGD",
1731 | "JGPD",
1732 | "JICCCD",
1733 | "JICD",
1734 | "JID",
1735 | "JIDD",
1736 | "JIID",
1737 | "JIJD",
1738 | "JILD",
1739 | "JJCCCD",
1740 | "JJCCD",
1741 | "JJCCPGD",
1742 | "JJCD",
1743 | "JJD",
1744 | "JJDCJD",
1745 | "JJDD",
1746 | "JJGCCD",
1747 | "JJGD",
1748 | "JJICD",
1749 | "JJID",
1750 | "JJJCCCD",
1751 | "JJJCD",
1752 | "JJJCFCCCD",
1753 | "JJJD",
1754 | "JJJGD",
1755 | "JJMCID",
1756 | "JJPCD",
1757 | "JJPD",
1758 | "JJPPJLCD",
1759 | "JJWFWCCJJD",
1760 | "JJWGWCD",
1761 | "JJWGWCDD",
1762 | "JKCD",
1763 | "JKD",
1764 | "JLCCD",
1765 | "JLCCDD",
1766 | "JLCCJD",
1767 | "JLCD",
1768 | "JLCDD",
1769 | "JLCMD",
1770 | "JLCMDD",
1771 | "JLD",
1772 | "JLDD",
1773 | "JLGCJD",
1774 | "JLGJCCCJD",
1775 | "JLJD",
1776 | "JMCD",
1777 | "JMD",
1778 | "JMJD",
1779 | "JMPD",
1780 | "JPCCD",
1781 | "JPCD",
1782 | "JPCMD",
1783 | "JPCMDPD",
1784 | "JPD",
1785 | "JPDCCCD",
1786 | "JPDD",
1787 | "JPDGCD",
1788 | "JPFCCD",
1789 | "JPFD",
1790 | "JPICD",
1791 | "JPID",
1792 | "JPIID",
1793 | "JPJD",
1794 | "JPJJCCCFPCD",
1795 | "JPMD",
1796 | "JPMDCCD",
1797 | "JPMDD",
1798 | "JPPJD",
1799 | "JPPJLCD",
1800 | "KCCCCCD",
1801 | "KCCCCD",
1802 | "KCCCCDCD",
1803 | "KCCCD",
1804 | "KCCCDCD",
1805 | "KCCCDD",
1806 | "KCCCDDCCCD",
1807 | "KCCCGD",
1808 | "KCCD",
1809 | "KCCDCCD",
1810 | "KCCDCD",
1811 | "KCCJD",
1812 | "KCCJDID",
1813 | "KCCPD",
1814 | "KCD",
1815 | "KCDCCCCD",
1816 | "KCDCCD",
1817 | "KCDCD",
1818 | "KCDD",
1819 | "KCDICD",
1820 | "KCDJD",
1821 | "KCGCCCD",
1822 | "KCGCCCDD",
1823 | "KCGCCD",
1824 | "KCGCD",
1825 | "KCGD",
1826 | "KCGGGD",
1827 | "KCICD",
1828 | "KCID",
1829 | "KCIDCD",
1830 | "KCJCD",
1831 | "KCJD",
1832 | "KCKCD",
1833 | "KCMD",
1834 | "KCMDCD",
1835 | "KCPD",
1836 | "KCWGWD",
1837 | "KD",
1838 | "KDCCCD",
1839 | "KDCD",
1840 | "KDD",
1841 | "KDICD",
1842 | "KDLCCPD",
1843 | "KFCD",
1844 | "KFCDD",
1845 | "KFD",
1846 | "KFWFD",
1847 | "KGCCCD",
1848 | "KGCCD",
1849 | "KGCD",
1850 | "KGCDCCD",
1851 | "KGD",
1852 | "KGDD",
1853 | "KGGD",
1854 | "KGJPD",
1855 | "KICCD",
1856 | "KICD",
1857 | "KICDD",
1858 | "KID",
1859 | "KIDCCD",
1860 | "KIDJCD",
1861 | "KIGID",
1862 | "KIMCD",
1863 | "KIMD",
1864 | "KIWGWD",
1865 | "KJCCD",
1866 | "KJCD",
1867 | "KJD",
1868 | "KJDD",
1869 | "KJICCD",
1870 | "KJJD",
1871 | "KJJDCD",
1872 | "KJJJD",
1873 | "KJPD",
1874 | "KLCCD",
1875 | "KLD",
1876 | "KMCCJCCD",
1877 | "KMCD",
1878 | "KMCDD",
1879 | "KMD",
1880 | "KMDCD",
1881 | "KMDD",
1882 | "KMMD",
1883 | "KMMMD",
1884 | "KPCCCD",
1885 | "KPCCD",
1886 | "KPCD",
1887 | "KPD",
1888 | "KPDD",
1889 | "LCCCCD",
1890 | "LCCCD",
1891 | "LCCD",
1892 | "LCCDD",
1893 | "LCCDJCCD",
1894 | "LCCGD",
1895 | "LCCGID",
1896 | "LCCID",
1897 | "LCCPCD",
1898 | "LCCWGWD",
1899 | "LCD",
1900 | "LCDCCD",
1901 | "LCDCD",
1902 | "LCDCDD",
1903 | "LCDCDIGCD",
1904 | "LCDD",
1905 | "LCDFD",
1906 | "LCDGDD",
1907 | "LCDGID",
1908 | "LCDID",
1909 | "LCDLD",
1910 | "LCDLDCD",
1911 | "LCDLDD",
1912 | "LCDMCDD",
1913 | "LCDPD",
1914 | "LCGD",
1915 | "LCGDD",
1916 | "LCICCWGWD",
1917 | "LCID",
1918 | "LCIGD",
1919 | "LCJCD",
1920 | "LCJD",
1921 | "LCLD",
1922 | "LCMCCD",
1923 | "LCMCDD",
1924 | "LCMCID",
1925 | "LCMCMD",
1926 | "LCMD",
1927 | "LCMJCICD",
1928 | "LCMJD",
1929 | "LCPCJCD",
1930 | "LCPD",
1931 | "LCPMD",
1932 | "LCPPCD",
1933 | "LD",
1934 | "LDCCD",
1935 | "LDCD",
1936 | "LDCLCD",
1937 | "LDCLCDCD",
1938 | "LDCPD",
1939 | "LDD",
1940 | "LDDD",
1941 | "LDLCCCCD",
1942 | "LFCD",
1943 | "LFCFD",
1944 | "LFD",
1945 | "LFPPPCCD",
1946 | "LGCD",
1947 | "LGD",
1948 | "LGGCCCD",
1949 | "LGGCD",
1950 | "LGJCD",
1951 | "LGJLCD",
1952 | "LGJLD",
1953 | "LICCCD",
1954 | "LICCD",
1955 | "LICD",
1956 | "LICLD",
1957 | "LID",
1958 | "LIGD",
1959 | "LIPCCCD",
1960 | "LIWGWCCCD",
1961 | "LJCCCCD",
1962 | "LJCCCCWGWD",
1963 | "LJCCCD",
1964 | "LJCCD",
1965 | "LJCCDCCCD",
1966 | "LJCCDCCD",
1967 | "LJCCDCD",
1968 | "LJCCDID",
1969 | "LJCCDJCD",
1970 | "LJCD",
1971 | "LJCDD",
1972 | "LJCGD",
1973 | "LJCJJD",
1974 | "LJCWCWJWCWJD",
1975 | "LJD",
1976 | "LJDCCD",
1977 | "LJDCD",
1978 | "LJDD",
1979 | "LJDJPD",
1980 | "LJDJPDD",
1981 | "LJDJPDID",
1982 | "LJDJPMDD",
1983 | "LJFJJCLCD",
1984 | "LJGD",
1985 | "LJID",
1986 | "LJJCD",
1987 | "LJJD",
1988 | "LJLD",
1989 | "LJMD",
1990 | "LJPCD",
1991 | "LKCD",
1992 | "LLCD",
1993 | "LLD",
1994 | "LLPD",
1995 | "LMCCFCCD",
1996 | "LMCD",
1997 | "LMD",
1998 | "LMID",
1999 | "LPCCCCCD",
2000 | "LPCCCD",
2001 | "LPCCD",
2002 | "LPCD",
2003 | "LPCDD",
2004 | "LPCFPPD",
2005 | "LPCGCCCD",
2006 | "LPCGCCD",
2007 | "LPCGCCDCCD",
2008 | "LPCGD",
2009 | "LPCGDDPD",
2010 | "LPD",
2011 | "LPDD",
2012 | "LPDDD",
2013 | "LPICD",
2014 | "LPID",
2015 | "LPJD",
2016 | "LPMDCCD",
2017 | "LPPJD",
2018 | "MCCCD",
2019 | "MCCD",
2020 | "MCCPD",
2021 | "MCD",
2022 | "MCDCCD",
2023 | "MCDCCDCD",
2024 | "MCDCCDD",
2025 | "MCDCD",
2026 | "MCDCGD",
2027 | "MCDD",
2028 | "MCDFD",
2029 | "MCDFDD",
2030 | "MCDLCD",
2031 | "MCDPPD",
2032 | "MCGCD",
2033 | "MCICD",
2034 | "MCID",
2035 | "MCIDWGWD",
2036 | "MCJD",
2037 | "MCLD",
2038 | "MCPD",
2039 | "MD",
2040 | "MDD",
2041 | "MFD",
2042 | "MGD",
2043 | "MGJD",
2044 | "MGJJD",
2045 | "MICCD",
2046 | "MICD",
2047 | "MID",
2048 | "MIDCCD",
2049 | "MJCCD",
2050 | "MJCD",
2051 | "MJD",
2052 | "MJDD",
2053 | "MLCD",
2054 | "MLD",
2055 | "MLGD",
2056 | "MLGGD",
2057 | "MMCCD",
2058 | "MMCD",
2059 | "MMD",
2060 | "MMMD",
2061 | "MMPD",
2062 | "MPCCD",
2063 | "MPCD",
2064 | "MPD",
2065 | "MPDCD",
2066 | "MPJPD",
2067 | "MPPD",
2068 | "PCCCCCCD",
2069 | "PCCCCCD",
2070 | "PCCCCD",
2071 | "PCCCD",
2072 | "PCCCDD",
2073 | "PCCD",
2074 | "PCCDD",
2075 | "PCCGJGD",
2076 | "PCCID",
2077 | "PCCIDD",
2078 | "PCD",
2079 | "PCDCD",
2080 | "PCDCJCD",
2081 | "PCDD",
2082 | "PCDFCCCD",
2083 | "PCDID",
2084 | "PCGCCD",
2085 | "PCGCD",
2086 | "PCGD",
2087 | "PCID",
2088 | "PCJCD",
2089 | "PCJGD",
2090 | "PCPCCD",
2091 | "PCPD",
2092 | "PD",
2093 | "PDCCD",
2094 | "PDD",
2095 | "PDDD",
2096 | "PFCCD",
2097 | "PFCDD",
2098 | "PFCJCD",
2099 | "PFD",
2100 | "PFFCD",
2101 | "PFPCD",
2102 | "PGCD",
2103 | "PGCJD",
2104 | "PGD",
2105 | "PGDCICD",
2106 | "PGJD",
2107 | "PICCD",
2108 | "PICD",
2109 | "PICDD",
2110 | "PID",
2111 | "PIFD",
2112 | "PIJCCD",
2113 | "PIJD",
2114 | "PJCCCDD",
2115 | "PJCCD",
2116 | "PJCD",
2117 | "PJD",
2118 | "PJDCD",
2119 | "PJDD",
2120 | "PJFD",
2121 | "PJGD",
2122 | "PJICCCPCD",
2123 | "PJID",
2124 | "PJJD",
2125 | "PJJDD",
2126 | "PJJPD",
2127 | "PJLPCD",
2128 | "PJPCD",
2129 | "PJPD",
2130 | "PLD",
2131 | "PLPCD",
2132 | "PMJCD",
2133 | "PPCCCDCD",
2134 | "PPCD",
2135 | "PPCJCCD",
2136 | "PPD",
2137 | "PPDCD",
2138 | "PPFCCD",
2139 | "PPFCD",
2140 | "PPGCID",
2141 | "PPGD",
2142 | "PPGJCCD",
2143 | "PPICCD",
2144 | "PPIGD",
2145 | "PPJCD",
2146 | "PPJD",
2147 | "PPJJD",
2148 | "PPMD",
2149 | "PPPCPD",
2150 | "PPPD",
2151 | "PPPWGWCCD",
2152 | "CCCCDID",
2153 | "CCCDFGD",
2154 | "CCCDGCD",
2155 | "CCCDGDD",
2156 | "CCCDWD",
2157 | "CCCGCCD",
2158 | "CCCGCD",
2159 | "CCCWCWD",
2160 | "CCCWGWCCD",
2161 | "CCCWGWCCDWD",
2162 | "CCCWGWD",
2163 | "CCDDGCD",
2164 | "CCDPCCD",
2165 | "CCDWD",
2166 | "CCFGCCCCCD",
2167 | "CCFGFCCCD",
2168 | "CCFPCD",
2169 | "CCGDD",
2170 | "CCGGCCD",
2171 | "CCIDGD",
2172 | "CCKD",
2173 | "CCMIDGCD",
2174 | "CCWD",
2175 | "CCWGWCCCD",
2176 | "CCWGWCD",
2177 | "CCWGWDD",
2178 | "CDWGWDGD",
2179 | "CFCCGWD",
2180 | "CFCD",
2181 | "CFCWGWD",
2182 | "CFGFGFGFGJID",
2183 | "CFJD",
2184 | "CFWGWCCDGCD",
2185 | "CFWGWCJCD",
2186 | "CGCCCCD",
2187 | "CGCCID",
2188 | "CGCCJCCCD",
2189 | "CGCDCCD",
2190 | "CGCFCCD",
2191 | "CGCGCD",
2192 | "CGCID",
2193 | "CGFCCD",
2194 | "CGFCD",
2195 | "CGFDID",
2196 | "CGGCICD",
2197 | "CGGJPD",
2198 | "CGICDGCD",
2199 | "CGICDID",
2200 | "CGIID",
2201 | "CGJCCCD",
2202 | "CGJCCD",
2203 | "CGJCD",
2204 | "CGJCDGD",
2205 | "CGJCDWD",
2206 | "CGJCJCD",
2207 | "CGJDD",
2208 | "CGJDDCCD",
2209 | "CGJGCD",
2210 | "CGJID",
2211 | "CGLCCD",
2212 | "CGPCCD",
2213 | "CGPCD",
2214 | "CGPD",
2215 | "CGPFCCD",
2216 | "CGPICD",
2217 | "CGPID",
2218 | "CGPJCDD",
2219 | "CGPJJJCD",
2220 | "CICCDGD",
2221 | "CICFJGD",
2222 | "CICGFID",
2223 | "CIDCD",
2224 | "CIDGD",
2225 | "CIFID",
2226 | "CIGCCD",
2227 | "CIGMCD",
2228 | "CIICCD",
2229 | "CIICD",
2230 | "CIJCWGWCD",
2231 | "CIJD",
2232 | "CIJWD",
2233 | "CIPCCD",
2234 | "CJCCDFD",
2235 | "CJCGD",
2236 | "CJCID",
2237 | "CJCWCCCD",
2238 | "CJCWGWD",
2239 | "CJGCCCD",
2240 | "CJICD",
2241 | "CJIDD",
2242 | "CJJCD",
2243 | "CJWGCD",
2244 | "CJWGWID",
2245 | "CPCCDGJD",
2246 | "CPCDCCD",
2247 | "CPDFCD",
2248 | "CPGID",
2249 | "CPICD",
2250 | "CPIWGWD",
2251 | "CPJGD",
2252 | "CPPCD",
2253 | "CPWGWDGD",
2254 | "D",
2255 | "FCCCCCCCD",
2256 | "FCCCCGD",
2257 | "FCCCDGD",
2258 | "FCCCWGWD",
2259 | "FCCDD",
2260 | "FCCDFCGD",
2261 | "FCCDGD",
2262 | "FCCDIPD",
2263 | "FCCDWGWD",
2264 | "FCCPCD",
2265 | "FCCWGWDD",
2266 | "FCDGD",
2267 | "FCDWD",
2268 | "FCDWGD",
2269 | "FCFWGWD",
2270 | "FCICCD",
2271 | "FCICDGD",
2272 | "FCIWGWDD",
2273 | "FCPCD",
2274 | "FCPCPD",
2275 | "FCPDGD",
2276 | "FCPPGD",
2277 | "FCWGWCD",
2278 | "FCWGWDD",
2279 | "FDDD",
2280 | "FDGD",
2281 | "FDGJCCD",
2282 | "FDWGWD",
2283 | "FFCCWGWD",
2284 | "FFFFD",
2285 | "FFFFFWWFD",
2286 | "FFFFWWD",
2287 | "FFFWD",
2288 | "FFFWWD",
2289 | "FFFWWFD",
2290 | "FFWWD",
2291 | "FGFPCCD",
2292 | "FGJWGWD",
2293 | "FICCCD",
2294 | "FICDGD",
2295 | "FICGWD",
2296 | "FICJD",
2297 | "FIICD",
2298 | "FIWGWCDD",
2299 | "FIWGWD",
2300 | "FIWGWDD",
2301 | "FJCCDD",
2302 | "FJGPCD",
2303 | "FJID",
2304 | "FJJGD",
2305 | "FMJD",
2306 | "FPCCD",
2307 | "FPCDD",
2308 | "FPDD",
2309 | "FPIDGD",
2310 | "FPWCWD",
2311 | "FWFWFD",
2312 | "FWGCD",
2313 | "FWGWCCD",
2314 | "FWGWCDGCD",
2315 | "FWGWCDGD",
2316 | "FWGWGD",
2317 | "FWJD",
2318 | "GCCCCCCDCD",
2319 | "GCCCCCDGD",
2320 | "GCCCCCID",
2321 | "GCCCCCKFD",
2322 | "GCCCCDCD",
2323 | "GCCCCDGCD",
2324 | "GCCCCDGCIJD",
2325 | "GCCCCDGDGDDDD",
2326 | "GCCCCDWFCCD",
2327 | "GCCCCDWGD",
2328 | "GCCCCFCCCCD",
2329 | "GCCCCID",
2330 | "GCCCDCPD",
2331 | "GCCCDDGCD",
2332 | "GCCCDDGD",
2333 | "GCCCDFCD",
2334 | "GCCCDGD",
2335 | "GCCCDGID",
2336 | "GCCCDICD",
2337 | "GCCCDMD",
2338 | "GCCCDWGCDWFCCD",
2339 | "GCCCDWGD",
2340 | "GCCCDWGWD",
2341 | "GCCCDWID",
2342 | "GCCCGPD",
2343 | "GCCCIJD",
2344 | "GCCCJCCD",
2345 | "GCCCJJCD",
2346 | "GCCCMCD",
2347 | "GCCCWD",
2348 | "GCCDCCMD",
2349 | "GCCDDWD",
2350 | "GCCDFCCD",
2351 | "GCCDGCD",
2352 | "GCCDGCGD",
2353 | "GCCDGDGCD",
2354 | "GCCDGJD",
2355 | "GCCDPPCD",
2356 | "GCCDWD",
2357 | "GCCFCCD",
2358 | "GCCFID",
2359 | "GCCFJCD",
2360 | "GCCFWCWCD",
2361 | "GCCGDCD",
2362 | "GCCGFD",
2363 | "GCCGFICD",
2364 | "GCCGID",
2365 | "GCCGIID",
2366 | "GCCICCD",
2367 | "GCCICDCD",
2368 | "GCCICWDD",
2369 | "GCCIDWDCD",
2370 | "GCCIID",
2371 | "GCCIJD",
2372 | "GCCJCDD",
2373 | "GCCJCGCD",
2374 | "GCCJDD",
2375 | "GCCJIDCD",
2376 | "GCCKDGD",
2377 | "GCCMJCD",
2378 | "GCCMJJCD",
2379 | "GCCWD",
2380 | "GCDCCCDGD",
2381 | "GCDCWDWD",
2382 | "GCDDDD",
2383 | "GCDDJCD",
2384 | "GCDFCD",
2385 | "GCDFID",
2386 | "GCDFJD",
2387 | "GCDGCGD",
2388 | "GCDGGGCD",
2389 | "GCDGIID",
2390 | "GCDIID",
2391 | "GCDKD",
2392 | "GCDMDFD",
2393 | "GCDPGD",
2394 | "GCDWD",
2395 | "GCDWDWD",
2396 | "GCFCCCD",
2397 | "GCFCCCDGD",
2398 | "GCFCDICD",
2399 | "GCFCDWGD",
2400 | "GCFCIFD",
2401 | "GCFCJD",
2402 | "GCFDDCID",
2403 | "GCFFJD",
2404 | "GCFGJPCD",
2405 | "GCFICD",
2406 | "GCFIDFD",
2407 | "GCFJD",
2408 | "GCFJDD",
2409 | "GCFJPD",
2410 | "GCFPCCCD",
2411 | "GCFPDD",
2412 | "GCFPID",
2413 | "GCGCCCCD",
2414 | "GCGCCCID",
2415 | "GCGCCCIDD",
2416 | "GCGCCDD",
2417 | "GCGCCDFD",
2418 | "GCGCCID",
2419 | "GCGCCJCD",
2420 | "GCGCCPD",
2421 | "GCGCDCCCD",
2422 | "GCGCDCD",
2423 | "GCGCDCID",
2424 | "GCGCDD",
2425 | "GCGCFCCD",
2426 | "GCGCFCD",
2427 | "GCGCFGCD",
2428 | "GCGCGCCD",
2429 | "GCGCGCD",
2430 | "GCGCGCPCCD",
2431 | "GCGCGD",
2432 | "GCGCGID",
2433 | "GCGCGPD",
2434 | "GCGCICCCD",
2435 | "GCGCICDDFCCCD",
2436 | "GCGCIDD",
2437 | "GCGCIID",
2438 | "GCGCJCCD",
2439 | "GCGCJD",
2440 | "GCGCJGWD",
2441 | "GCGCJJD",
2442 | "GCGCLCCD",
2443 | "GCGCPCCD",
2444 | "GCGCPCCID",
2445 | "GCGCPCD",
2446 | "GCGCPCJCCD",
2447 | "GCGDCCICCD",
2448 | "GCGDCD",
2449 | "GCGDIMD",
2450 | "GCGFCCD",
2451 | "GCGFCD",
2452 | "GCGFCJD",
2453 | "GCGFCMJD",
2454 | "GCGFD",
2455 | "GCGFDD",
2456 | "GCGFFCD",
2457 | "GCGFFD",
2458 | "GCGFID",
2459 | "GCGFIDD",
2460 | "GCGFJD",
2461 | "GCGGCCD",
2462 | "GCGGGJCD",
2463 | "GCGGJCID",
2464 | "GCGGJCJD",
2465 | "GCGICCCD",
2466 | "GCGICCD",
2467 | "GCGICCJD",
2468 | "GCGICDMD",
2469 | "GCGICICCD",
2470 | "GCGICJCDD",
2471 | "GCGICJD",
2472 | "GCGICJJD",
2473 | "GCGIDCGD",
2474 | "GCGIDD",
2475 | "GCGIDGD",
2476 | "GCGIGCCD",
2477 | "GCGIICD",
2478 | "GCGIID",
2479 | "GCGIMCCD",
2480 | "GCGIMJD",
2481 | "GCGIPCCD",
2482 | "GCGIPD",
2483 | "GCGJCCCCDD",
2484 | "GCGJCCCD",
2485 | "GCGJCCDD",
2486 | "GCGJCD",
2487 | "GCGJCID",
2488 | "GCGJD",
2489 | "GCGJDD",
2490 | "GCGJGICD",
2491 | "GCGJICD",
2492 | "GCGJID",
2493 | "GCGJIFCD",
2494 | "GCGJJCD",
2495 | "GCGJPCCD",
2496 | "GCGJPCD",
2497 | "GCGKCD",
2498 | "GCGKD",
2499 | "GCGLCDCCD",
2500 | "GCGLCJD",
2501 | "GCGLGCCD",
2502 | "GCGLGPCCID",
2503 | "GCGLIPJD",
2504 | "GCGLJJID",
2505 | "GCGMCD",
2506 | "GCGMD",
2507 | "GCGPCCCCCD",
2508 | "GCGPCCCD",
2509 | "GCGPCD",
2510 | "GCGPCFCCD",
2511 | "GCGPCID",
2512 | "GCGPCPD",
2513 | "GCGPD",
2514 | "GCGPFCD",
2515 | "GCGPGCD",
2516 | "GCGPIID",
2517 | "GCGPJCCD",
2518 | "GCGPJCD",
2519 | "GCGPJD",
2520 | "GCGPJGCD",
2521 | "GCGPJID",
2522 | "GCGPLICD",
2523 | "GCGPLID",
2524 | "GCGPPCCD",
2525 | "GCGPPCD",
2526 | "GCGPPD",
2527 | "GCGPPID",
2528 | "GCGPPJD",
2529 | "GCGWPFCD",
2530 | "GCICCCDD",
2531 | "GCICCDFD",
2532 | "GCICCJD",
2533 | "GCICCWDWDCGD",
2534 | "GCICDFCD",
2535 | "GCICPD",
2536 | "GCIDCD",
2537 | "GCIDCGD",
2538 | "GCIDDGD",
2539 | "GCIDPCCD",
2540 | "GCIICD",
2541 | "GCIJCCD",
2542 | "GCIJCCDMD",
2543 | "GCIJCID",
2544 | "GCIKD",
2545 | "GCIPCCD",
2546 | "GCIPCPD",
2547 | "GCJCCCCCD",
2548 | "GCJCCDCD",
2549 | "GCJCCDGD",
2550 | "GCJCCDMD",
2551 | "GCJCCICD",
2552 | "GCJCDD",
2553 | "GCJCICD",
2554 | "GCJCKDD",
2555 | "GCJDCDCD",
2556 | "GCJDDCD",
2557 | "GCJGCD",
2558 | "GCJICCCD",
2559 | "GCJICGD",
2560 | "GCJIDCD",
2561 | "GCJIDD",
2562 | "GCJJCDD",
2563 | "GCJJCJCD",
2564 | "GCJJDD",
2565 | "GCJMCID",
2566 | "GCJPCCCD",
2567 | "GCJPCCD",
2568 | "GCJPCD",
2569 | "GCJPCDMD",
2570 | "GCJPID",
2571 | "GCJPJD",
2572 | "GCJWCPWD",
2573 | "GCKCCCD",
2574 | "GCKCD",
2575 | "GCKDGD",
2576 | "GCKGD",
2577 | "GCKICD",
2578 | "GCKJCCD",
2579 | "GCKPD",
2580 | "GCLCID",
2581 | "GCLGIJCD",
2582 | "GCLID",
2583 | "GCMCCDFD",
2584 | "GCMCCKGD",
2585 | "GCMCJCCD",
2586 | "GCMCPD",
2587 | "GCMDCGCD",
2588 | "GCMFCDGD",
2589 | "GCMID",
2590 | "GCMJCD",
2591 | "GCMJCDD",
2592 | "GCMJCID",
2593 | "GCMJID",
2594 | "GCMJPCCCCD",
2595 | "GCMKD",
2596 | "GCMKGD",
2597 | "GCMPCCD",
2598 | "GCMPJD",
2599 | "GCMPPCCD",
2600 | "GCPCCCMD",
2601 | "GCPCCDCD",
2602 | "GCPCCDMD",
2603 | "GCPCCDWD",
2604 | "GCPCCWGCWD",
2605 | "GCPCDCD",
2606 | "GCPCDGD",
2607 | "GCPCDWD",
2608 | "GCPCICDWGD",
2609 | "GCPCIICFD",
2610 | "GCPCJCFD",
2611 | "GCPCJD",
2612 | "GCPDGD",
2613 | "GCPGGCD",
2614 | "GCPICCCDGD",
2615 | "GCPICCD",
2616 | "GCPICD",
2617 | "GCPICID",
2618 | "GCPID",
2619 | "GCPIJCCD",
2620 | "GCPJCDD",
2621 | "GCPJPDD",
2622 | "GCPKD",
2623 | "GCPMCCD",
2624 | "GCPMJCD",
2625 | "GCPPCD",
2626 | "GCPPID",
2627 | "GCPPWCWID",
2628 | "GCPWCWCD",
2629 | "GCPWDWDCCD",
2630 | "GCWDWDCCD",
2631 | "GCWGWCCD",
2632 | "GCWGWD",
2633 | "GCWGWJD",
2634 | "GCWJCCD",
2635 | "GDCCCCFCD",
2636 | "GDCCCFCPD",
2637 | "GDCCPCCD",
2638 | "GDCDGCD",
2639 | "GDCDJD",
2640 | "GDCKGCD",
2641 | "GDDGCD",
2642 | "GDDGD",
2643 | "GDDMD",
2644 | "GDICCCD",
2645 | "GDIPD",
2646 | "GDJCICD",
2647 | "GDLCCD",
2648 | "GFCCCCCCD",
2649 | "GFCCCCFD",
2650 | "GFCCCDD",
2651 | "GFCCCDDD",
2652 | "GFCCCDFCD",
2653 | "GFCCCDFDD",
2654 | "GFCCCDGCD",
2655 | "GFCCCDGD",
2656 | "GFCCCDID",
2657 | "GFCCDCCD",
2658 | "GFCCDCD",
2659 | "GFCCDCFD",
2660 | "GFCCDDD",
2661 | "GFCCDFCD",
2662 | "GFCCDFDMD",
2663 | "GFCCDFFCD",
2664 | "GFCCDFFD",
2665 | "GFCCDFGD",
2666 | "GFCCDGCCD",
2667 | "GFCCDGCD",
2668 | "GFCCDGD",
2669 | "GFCCDGGID",
2670 | "GFCCDGICDJICD",
2671 | "GFCCDID",
2672 | "GFCCDLD",
2673 | "GFCCDMD",
2674 | "GFCCDWCD",
2675 | "GFCCDWD",
2676 | "GFCCDWFDD",
2677 | "GFCCDWGCD",
2678 | "GFCCDWGD",
2679 | "GFCCID",
2680 | "GFCCJD",
2681 | "GFCCPCD",
2682 | "GFCDDCCCD",
2683 | "GFCDFCCD",
2684 | "GFCDFCD",
2685 | "GFCDGCD",
2686 | "GFCDGD",
2687 | "GFCDGDD",
2688 | "GFCDGFCD",
2689 | "GFCDGGCD",
2690 | "GFCDGGD",
2691 | "GFCDGPD",
2692 | "GFCDID",
2693 | "GFCDMIDMD",
2694 | "GFCDWCD",
2695 | "GFCDWD",
2696 | "GFCDWGD",
2697 | "GFCFCD",
2698 | "GFCGCD",
2699 | "GFCGD",
2700 | "GFCICD",
2701 | "GFCIDCGD",
2702 | "GFCIDWD",
2703 | "GFCJCCCD",
2704 | "GFCJCCD",
2705 | "GFCJCCDD",
2706 | "GFCJCJD",
2707 | "GFCJDD",
2708 | "GFCJID",
2709 | "GFCKD",
2710 | "GFCLCD",
2711 | "GFCMCCD",
2712 | "GFCMJCDWD",
2713 | "GFCPDGD",
2714 | "GFCPPCD",
2715 | "GFCWCD",
2716 | "GFCWCWCD",
2717 | "GFCWFWFCCD",
2718 | "GFCWGWD",
2719 | "GFDCCD",
2720 | "GFDCDCDD",
2721 | "GFDCDD",
2722 | "GFDCDGD",
2723 | "GFDCID",
2724 | "GFDDCCD",
2725 | "GFDDCGD",
2726 | "GFDDD",
2727 | "GFDDGD",
2728 | "GFDDPD",
2729 | "GFDGCD",
2730 | "GFDGD",
2731 | "GFDICD",
2732 | "GFDICPCD",
2733 | "GFDID",
2734 | "GFDJPCD",
2735 | "GFDWD",
2736 | "GFDWDWD",
2737 | "GFFCCCD",
2738 | "GFFCJD",
2739 | "GFFDD",
2740 | "GFFJJDGD",
2741 | "GFFPDGD",
2742 | "GFGCCCDD",
2743 | "GFGCCD",
2744 | "GFGCCDGPD",
2745 | "GFGFICD",
2746 | "GFGMPD",
2747 | "GFICCDCD",
2748 | "GFICCDD",
2749 | "GFICDCD",
2750 | "GFICDCJD",
2751 | "GFICDD",
2752 | "GFICDGD",
2753 | "GFICJD",
2754 | "GFICKD",
2755 | "GFIDD",
2756 | "GFIDFGD",
2757 | "GFIDGCD",
2758 | "GFIDGD",
2759 | "GFIDPCPCD",
2760 | "GFIGD",
2761 | "GFIID",
2762 | "GFIIDFCD",
2763 | "GFIIGD",
2764 | "GFIJCCD",
2765 | "GFIJD",
2766 | "GFJCCCCD",
2767 | "GFJCCDD",
2768 | "GFJCDD",
2769 | "GFJCDGD",
2770 | "GFJCDWD",
2771 | "GFJCJD",
2772 | "GFJDD",
2773 | "GFJDGCD",
2774 | "GFJDGFCD",
2775 | "GFJDWD",
2776 | "GFJDWFICGD",
2777 | "GFJFD",
2778 | "GFJICD",
2779 | "GFJICDGD",
2780 | "GFJID",
2781 | "GFJJCD",
2782 | "GFJJDWGD",
2783 | "GFKD",
2784 | "GFKDGD",
2785 | "GFLCD",
2786 | "GFMJCD",
2787 | "GFPCCCD",
2788 | "GFPCCD",
2789 | "GFPCDCD",
2790 | "GFPCDD",
2791 | "GFPCJD",
2792 | "GFPDCD",
2793 | "GFPDD",
2794 | "GFPDID",
2795 | "GFPICD",
2796 | "GFPIJD",
2797 | "GFPJIDD",
2798 | "GFPKD",
2799 | "GFPPCCD",
2800 | "GFPPCD",
2801 | "GFWCWID",
2802 | "GFWDWD",
2803 | "GFWJD",
2804 | "GGCCCCCCD",
2805 | "GGCCCCJCD",
2806 | "GGCCCDD",
2807 | "GGCCCDDD",
2808 | "GGCCCDGD",
2809 | "GGCCCDWGCD",
2810 | "GGCCDCDGD",
2811 | "GGCCDFGCD",
2812 | "GGCCDGD",
2813 | "GGCCDGDCD",
2814 | "GGCCDID",
2815 | "GGCCDMCD",
2816 | "GGCCDWGD",
2817 | "GGCCFCD",
2818 | "GGCCFD",
2819 | "GGCCGCCD",
2820 | "GGCCICDD",
2821 | "GGCCID",
2822 | "GGCCJCCDD",
2823 | "GGCCLCD",
2824 | "GGCCPCD",
2825 | "GGCCPJD",
2826 | "GGCDCCDGD",
2827 | "GGCDCDD",
2828 | "GGCDCDGD",
2829 | "GGCDCGD",
2830 | "GGCDDCCD",
2831 | "GGCDGD",
2832 | "GGCDGPGCD",
2833 | "GGCDID",
2834 | "GGCDMD",
2835 | "GGCFCCD",
2836 | "GGCFCD",
2837 | "GGCFID",
2838 | "GGCGCCCCD",
2839 | "GGCGCCD",
2840 | "GGCGCGCCD",
2841 | "GGCGCGCD",
2842 | "GGCGCGFCD",
2843 | "GGCGDGCD",
2844 | "GGCGFD",
2845 | "GGCGFID",
2846 | "GGCGGCD",
2847 | "GGCGGGD",
2848 | "GGCGGJD",
2849 | "GGCGICCD",
2850 | "GGCGIICD",
2851 | "GGCGILICD",
2852 | "GGCGJID",
2853 | "GGCGJIJCD",
2854 | "GGCGPCCD",
2855 | "GGCGPCD",
2856 | "GGCGPJCCD",
2857 | "GGCGPJCD",
2858 | "GGCGPJD",
2859 | "GGCGPPD",
2860 | "GGCICCD",
2861 | "GGCICCID",
2862 | "GGCICD",
2863 | "GGCIDD",
2864 | "GGCIICD",
2865 | "GGCIIJD",
2866 | "GGCIPCICCD",
2867 | "GGCIPD",
2868 | "GGCJCDD",
2869 | "GGCJCJD",
2870 | "GGCJDD",
2871 | "GGCJID",
2872 | "GGCKLCD",
2873 | "GGCLCCD",
2874 | "GGCMCIJD",
2875 | "GGCMID",
2876 | "GGCPCCCD",
2877 | "GGCPCCJCCCWD",
2878 | "GGCPCDD",
2879 | "GGCPDD",
2880 | "GGCPGGCID",
2881 | "GGCPICD",
2882 | "GGCPICDD",
2883 | "GGCPID",
2884 | "GGCPJCD",
2885 | "GGCPPCCD",
2886 | "GGCPPD",
2887 | "GGCWDWCCDGCD",
2888 | "GGCWGD",
2889 | "GGDCDCCD",
2890 | "GGDCJD",
2891 | "GGDDFD",
2892 | "GGDGCCGCD",
2893 | "GGDGCDGD",
2894 | "GGDLGD",
2895 | "GGFCCCCD",
2896 | "GGFCDID",
2897 | "GGFCFCDD",
2898 | "GGFCID",
2899 | "GGFCJD",
2900 | "GGFCMCCD",
2901 | "GGFDCD",
2902 | "GGFDDD",
2903 | "GGFICD",
2904 | "GGFICDD",
2905 | "GGFID",
2906 | "GGFJCD",
2907 | "GGFJID",
2908 | "GGFJMD",
2909 | "GGFKID",
2910 | "GGFMJDD",
2911 | "GGFPCD",
2912 | "GGFPD",
2913 | "GGFWID",
2914 | "GGGCCCDGD",
2915 | "GGGCCCICD",
2916 | "GGGCCDGD",
2917 | "GGGCCID",
2918 | "GGGCGGD",
2919 | "GGGCJCD",
2920 | "GGGCPD",
2921 | "GGGCPFCPCD",
2922 | "GGGCPJD",
2923 | "GGGFCCD",
2924 | "GGGFCCID",
2925 | "GGGFCD",
2926 | "GGGFCJD",
2927 | "GGGFID",
2928 | "GGGGPJD",
2929 | "GGGICCD",
2930 | "GGGICJD",
2931 | "GGGIDGID",
2932 | "GGGIICD",
2933 | "GGGJCCD",
2934 | "GGGJGID",
2935 | "GGGKCD",
2936 | "GGGKDJD",
2937 | "GGGLJCD",
2938 | "GGGMCD",
2939 | "GGGPCD",
2940 | "GGGPFIDWD",
2941 | "GGGPIICD",
2942 | "GGGPIPD",
2943 | "GGGPPID",
2944 | "GGICCGCD",
2945 | "GGICCID",
2946 | "GGICDD",
2947 | "GGICFID",
2948 | "GGICJCD",
2949 | "GGICJDD",
2950 | "GGICPCCID",
2951 | "GGICPD",
2952 | "GGIDID",
2953 | "GGIDWGD",
2954 | "GGIFCCD",
2955 | "GGIFCD",
2956 | "GGIFCJD",
2957 | "GGIFICD",
2958 | "GGIFID",
2959 | "GGIFIDDD",
2960 | "GGIFJD",
2961 | "GGIFMID",
2962 | "GGIGPFD",
2963 | "GGIICCD",
2964 | "GGIJCD",
2965 | "GGIJCID",
2966 | "GGIJDD",
2967 | "GGIJICD",
2968 | "GGIPCCD",
2969 | "GGIPDCCD",
2970 | "GGIPICD",
2971 | "GGIPMICD",
2972 | "GGJCCCCCD",
2973 | "GGJCCCCD",
2974 | "GGJCCICD",
2975 | "GGJCDD",
2976 | "GGJCGCD",
2977 | "GGJCICCD",
2978 | "GGJCICD",
2979 | "GGJGCD",
2980 | "GGJGCICD",
2981 | "GGJGCLCGCD",
2982 | "GGJICCD",
2983 | "GGJICJD",
2984 | "GGJICPCCD",
2985 | "GGJID",
2986 | "GGJIID",
2987 | "GGJJCD",
2988 | "GGJJCDD",
2989 | "GGJJCKD",
2990 | "GGJJID",
2991 | "GGJMID",
2992 | "GGJPCCCCD",
2993 | "GGJPCCD",
2994 | "GGJPCD",
2995 | "GGJPCJCD",
2996 | "GGJPCJPJCD",
2997 | "GGJPID",
2998 | "GGJPJD",
2999 | "GGKCCCD",
3000 | "GGKCD",
3001 | "GGKDD",
3002 | "GGLCCD",
3003 | "GGLCCPJD",
3004 | "GGLFCCCD",
3005 | "GGLGCJD",
3006 | "GGLGFID",
3007 | "GGLGPCD",
3008 | "GGLJCCD",
3009 | "GGLJCID",
3010 | "GGMFJD",
3011 | "GGMJCDGD",
3012 | "GGMPJD",
3013 | "GGPCCDD",
3014 | "GGPCDD",
3015 | "GGPCICD",
3016 | "GGPCID",
3017 | "GGPFCCD",
3018 | "GGPFCD",
3019 | "GGPFCID",
3020 | "GGPFJD",
3021 | "GGPGCD",
3022 | "GGPGID",
3023 | "GGPICFCD",
3024 | "GGPID",
3025 | "GGPIDD",
3026 | "GGPIID",
3027 | "GGPJCCCD",
3028 | "GGPJCCD",
3029 | "GGPJCCID",
3030 | "GGPJCJMD",
3031 | "GGPJID",
3032 | "GGPJKCCD",
3033 | "GGPJPCD",
3034 | "GGPPCID",
3035 | "GGPPDD",
3036 | "GGPPFCCD",
3037 | "GGPPICD",
3038 | "GGPPJCD",
3039 | "GGWCJD",
3040 | "GGWGWID",
3041 | "GGWIWCCD",
3042 | "GICCCCCCCD",
3043 | "GICCCDGD",
3044 | "GICCDDD",
3045 | "GICCDGCD",
3046 | "GICCDGD",
3047 | "GICCDWD",
3048 | "GICCDWGD",
3049 | "GICCFCCD",
3050 | "GICCICCD",
3051 | "GICCICD",
3052 | "GICCID",
3053 | "GICCJCCD",
3054 | "GICDCCCD",
3055 | "GICDDWGD",
3056 | "GICDGD",
3057 | "GICDGJCD",
3058 | "GICDID",
3059 | "GICDWD",
3060 | "GICFD",
3061 | "GICFID",
3062 | "GICGCD",
3063 | "GICICCCCCCCCPD",
3064 | "GICICDDGD",
3065 | "GICICDFD",
3066 | "GICIDGD",
3067 | "GICIFD",
3068 | "GICIIFID",
3069 | "GICJDD",
3070 | "GICJDGD",
3071 | "GICJJD",
3072 | "GICKD",
3073 | "GICPCD",
3074 | "GICPID",
3075 | "GICPIDD",
3076 | "GICWCWCWD",
3077 | "GIDCDD",
3078 | "GIDDCD",
3079 | "GIDDGD",
3080 | "GIDDWGD",
3081 | "GIDFCD",
3082 | "GIDGDCD",
3083 | "GIDJJD",
3084 | "GIFCCCD",
3085 | "GIFCD",
3086 | "GIFCJD",
3087 | "GIFFFWFWD",
3088 | "GIFGD",
3089 | "GIFICCCD",
3090 | "GIFID",
3091 | "GIFIDCD",
3092 | "GIFJD",
3093 | "GIFPD",
3094 | "GIFPDCD",
3095 | "GIGCCDMD",
3096 | "GIGGCD",
3097 | "GIGJD",
3098 | "GIGMD",
3099 | "GIICCCD",
3100 | "GIICCDD",
3101 | "GIICCDGD",
3102 | "GIICCDMCD",
3103 | "GIICDD",
3104 | "GIICID",
3105 | "GIIDFCD",
3106 | "GIIDGD",
3107 | "GIIDJCD",
3108 | "GIIFICD",
3109 | "GIIICCD",
3110 | "GIIJD",
3111 | "GIIPCD",
3112 | "GIIPD",
3113 | "GIJCCDD",
3114 | "GIJCCICD",
3115 | "GIJCCJD",
3116 | "GIJCDCD",
3117 | "GIJCDGD",
3118 | "GIJCDWCFD",
3119 | "GIJDCD",
3120 | "GIJICCD",
3121 | "GIJICD",
3122 | "GIJICDGD",
3123 | "GIJIDD",
3124 | "GIJJD",
3125 | "GIJJICJD",
3126 | "GIJPCD",
3127 | "GIJPID",
3128 | "GILGCD",
3129 | "GIMCID",
3130 | "GIMCPD",
3131 | "GIPCCCCCD",
3132 | "GIPCCCDGD",
3133 | "GIPCCDD",
3134 | "GIPDCCCCD",
3135 | "GIPDGD",
3136 | "GIPDWCCD",
3137 | "GIPFD",
3138 | "GIPID",
3139 | "GIPJCDD",
3140 | "GIPJD",
3141 | "GIWGFWDGD",
3142 | "GIWGWDD",
3143 | "GJCCCCCDCD",
3144 | "GJCCCCCDD",
3145 | "GJCCCCDD",
3146 | "GJCCCDCD",
3147 | "GJCCCDGCD",
3148 | "GJCCCDGD",
3149 | "GJCCCDLD",
3150 | "GJCCCDWD",
3151 | "GJCCCPD",
3152 | "GJCCDFDD",
3153 | "GJCCID",
3154 | "GJCCPD",
3155 | "GJCCWKWCD",
3156 | "GJCDMCCD",
3157 | "GJCICCD",
3158 | "GJCIWGWCCD",
3159 | "GJCJCDID",
3160 | "GJCKD",
3161 | "GJCLCCCD",
3162 | "GJCMIGD",
3163 | "GJCMWD",
3164 | "GJCPDCCD",
3165 | "GJDCDCD",
3166 | "GJFCCCD",
3167 | "GJFCCDWGCGD",
3168 | "GJFCDCD",
3169 | "GJFCDD",
3170 | "GJFCDWFCD",
3171 | "GJFID",
3172 | "GJGCCCCD",
3173 | "GJGCCCD",
3174 | "GJGCCD",
3175 | "GJGCCDD",
3176 | "GJGCCDID",
3177 | "GJGCMJD",
3178 | "GJGFJCD",
3179 | "GJGJCD",
3180 | "GJGJCJCD",
3181 | "GJGJCKDGD",
3182 | "GJGMCCD",
3183 | "GJGPCCCD",
3184 | "GJGPD",
3185 | "GJICCCCD",
3186 | "GJICCDCD",
3187 | "GJICDD",
3188 | "GJICJD",
3189 | "GJIDD",
3190 | "GJIDWCCCWD",
3191 | "GJIICD",
3192 | "GJIIID",
3193 | "GJIJCCD",
3194 | "GJIJCD",
3195 | "GJIJD",
3196 | "GJIPD",
3197 | "GJJCCDGD",
3198 | "GJJCDCCD",
3199 | "GJJCDCD",
3200 | "GJJCDD",
3201 | "GJJCID",
3202 | "GJJCPD",
3203 | "GJJDDCCD",
3204 | "GJJGCCCCD",
3205 | "GJJICCCCD",
3206 | "GJJICCDCD",
3207 | "GJJICD",
3208 | "GJJID",
3209 | "GJJIPD",
3210 | "GJJPD",
3211 | "GJKJCD",
3212 | "GJKPD",
3213 | "GJMCCCCD",
3214 | "GJMCCCD",
3215 | "GJMCD",
3216 | "GJMICD",
3217 | "GJMJCCCCD",
3218 | "GJMJCD",
3219 | "GJMPCDD",
3220 | "GJPCCCDGD",
3221 | "GJPCCDD",
3222 | "GJPCCDGD",
3223 | "GJPCCDID",
3224 | "GJPCID",
3225 | "GJPDCCD",
3226 | "GJPDDGD",
3227 | "GJPDGD",
3228 | "GJPID",
3229 | "GJPIDID",
3230 | "GJPJCD",
3231 | "GJPJMJCD",
3232 | "GJPPCCD",
3233 | "GJWGWCD",
3234 | "GKCCCCD",
3235 | "GKCCDCD",
3236 | "GKCCDD",
3237 | "GKCCDWD",
3238 | "GKCDDICD",
3239 | "GKCID",
3240 | "GKCJDDWD",
3241 | "GKCWWD",
3242 | "GKDCCD",
3243 | "GKDID",
3244 | "GKGDGCDD",
3245 | "GKGPDD",
3246 | "GKICD",
3247 | "GKJCCDGCD",
3248 | "GKJCCID",
3249 | "GKJCD",
3250 | "GKJD",
3251 | "GKJID",
3252 | "GKJIPCD",
3253 | "GKPCDFJD",
3254 | "GKPCJD",
3255 | "GKWD",
3256 | "GLCCCCCID",
3257 | "GLCCCDD",
3258 | "GLCDCGCD",
3259 | "GLCICD",
3260 | "GLCID",
3261 | "GLCJID",
3262 | "GLCMCICD",
3263 | "GLCPCCDGD",
3264 | "GLCWGWDGCD",
3265 | "GLFCCD",
3266 | "GLFD",
3267 | "GLFJJCD",
3268 | "GLFKCCCD",
3269 | "GLGCCD",
3270 | "GLGCDDFCD",
3271 | "GLGJCCD",
3272 | "GLICCCCD",
3273 | "GLICCCD",
3274 | "GLICCDD",
3275 | "GLIFD",
3276 | "GLIID",
3277 | "GLIJD",
3278 | "GLIWDWD",
3279 | "GLJCCPCJD",
3280 | "GLJCDD",
3281 | "GLJCID",
3282 | "GLJDCD",
3283 | "GLJGCCD",
3284 | "GLJGCDGD",
3285 | "GLKCIID",
3286 | "GLMCCDD",
3287 | "GLMCD",
3288 | "GLPCCCCD",
3289 | "GLPCCDD",
3290 | "GLPCCDDFD",
3291 | "GLPICD",
3292 | "GLPID",
3293 | "GLPJCD",
3294 | "GLPPCCDD",
3295 | "GLPWIWD",
3296 | "GMCCCDD",
3297 | "GMCCDCCCD",
3298 | "GMCCDCMD",
3299 | "GMCCDMD",
3300 | "GMCCDWD",
3301 | "GMCID",
3302 | "GMCIID",
3303 | "GMCKCD",
3304 | "GMDPDID",
3305 | "GMFCCD",
3306 | "GMFCD",
3307 | "GMFCDCDGD",
3308 | "GMFICD",
3309 | "GMGCD",
3310 | "GMGCDCD",
3311 | "GMICID",
3312 | "GMIID",
3313 | "GMJCCCD",
3314 | "GMMCDGDD",
3315 | "GMMICCD",
3316 | "GMPCCCD",
3317 | "GMPDGCD",
3318 | "GMPFD",
3319 | "GMPICD",
3320 | "GMPID",
3321 | "GMPMCCCD",
3322 | "GPCCCCCD",
3323 | "GPCCCCDGD",
3324 | "GPCCCDGD",
3325 | "GPCCCDLD",
3326 | "GPCCCID",
3327 | "GPCCCPD",
3328 | "GPCCDGD",
3329 | "GPCCDWD",
3330 | "GPCCGD",
3331 | "GPCCICD",
3332 | "GPCCID",
3333 | "GPCCJD",
3334 | "GPCCWCWCD",
3335 | "GPCDCJD",
3336 | "GPCDDCD",
3337 | "GPCDGFCD",
3338 | "GPCDIID",
3339 | "GPCFCCCDGD",
3340 | "GPCFD",
3341 | "GPCGCD",
3342 | "GPCICD",
3343 | "GPCIDD",
3344 | "GPCIDWCWD",
3345 | "GPCIID",
3346 | "GPCIJCD",
3347 | "GPCIPD",
3348 | "GPCJD",
3349 | "GPCKD",
3350 | "GPCPCD",
3351 | "GPCPCDD",
3352 | "GPCPD",
3353 | "GPCWWD",
3354 | "GPFCCDWD",
3355 | "GPFCCWD",
3356 | "GPFCDD",
3357 | "GPFCWFWD",
3358 | "GPFDD",
3359 | "GPFICD",
3360 | "GPFIGGPCD",
3361 | "GPFJCD",
3362 | "GPFJD",
3363 | "GPFJDCD",
3364 | "GPFPCDCD",
3365 | "GPFPD",
3366 | "GPGCCCCD",
3367 | "GPGCCD",
3368 | "GPGCD",
3369 | "GPGCDWGD",
3370 | "GPGCGCD",
3371 | "GPGCIICD",
3372 | "GPGCPCCD",
3373 | "GPGFFCD",
3374 | "GPGICCD",
3375 | "GPGICD",
3376 | "GPGID",
3377 | "GPGJCD",
3378 | "GPGJD",
3379 | "GPGPCGD",
3380 | "GPGPDD",
3381 | "GPGPGJCCD",
3382 | "GPGPICD",
3383 | "GPICCCCDGD",
3384 | "GPICDD",
3385 | "GPICDGD",
3386 | "GPICICD",
3387 | "GPIDFGD",
3388 | "GPIICD",
3389 | "GPIICDGD",
3390 | "GPIID",
3391 | "GPIJCCD",
3392 | "GPIJCD",
3393 | "GPIJCDD",
3394 | "GPIPCCCD",
3395 | "GPIPCCD",
3396 | "GPJCCCCD",
3397 | "GPJCCDD",
3398 | "GPJCDCPD",
3399 | "GPJCDFD",
3400 | "GPJCDGCD",
3401 | "GPJCDGD",
3402 | "GPJCID",
3403 | "GPJCIDD",
3404 | "GPJCJCD",
3405 | "GPJCPCCD",
3406 | "GPJFCCD",
3407 | "GPJFIDD",
3408 | "GPJICD",
3409 | "GPJID",
3410 | "GPJPCD",
3411 | "GPJPD",
3412 | "GPKD",
3413 | "GPLCCCCD",
3414 | "GPLCCD",
3415 | "GPLCD",
3416 | "GPLICD",
3417 | "GPLID",
3418 | "GPMCCCD",
3419 | "GPMCCD",
3420 | "GPMCD",
3421 | "GPMD",
3422 | "GPMDJCD",
3423 | "GPPCCDD",
3424 | "GPPCCDWD",
3425 | "GPPCDWCCCD",
3426 | "GPPDDD",
3427 | "GPPFD",
3428 | "GPPGCCCD",
3429 | "GPPGD",
3430 | "GPPGDGD",
3431 | "GPPICD",
3432 | "GPPIID",
3433 | "GPPIJD",
3434 | "GPPJCCD",
3435 | "GPPJCDFCD",
3436 | "GPPJDCD",
3437 | "GPPMJCD",
3438 | "GPPPCD",
3439 | "GPPPICD",
3440 | "GPWCCD",
3441 | "GPWGWCDGD",
3442 | "GWCCCCD",
3443 | "GWCWJD",
3444 | "GWGPPD",
3445 | "GWGWCCD",
3446 | "GWICCD",
3447 | "GWJD",
3448 | "GWPCPWD",
3449 | "ICCCDFPCCFGCCD",
3450 | "ICCCDGD",
3451 | "ICCCDGJD",
3452 | "ICCCID",
3453 | "ICCCWGWD",
3454 | "ICCCWGWDGD",
3455 | "ICCFJCWGFJCCD",
3456 | "ICCICCD",
3457 | "ICCWGWD",
3458 | "ICDCCCCD",
3459 | "ICDCDCD",
3460 | "ICDGD",
3461 | "ICDWGD",
3462 | "ICFCD",
3463 | "ICFCJCD",
3464 | "ICFDID",
3465 | "ICGD",
3466 | "ICGGD",
3467 | "ICICCD",
3468 | "ICIDWID",
3469 | "ICIWD",
3470 | "ICJCCD",
3471 | "ICJWGWCCD",
3472 | "ICWGWDD",
3473 | "IDDGCD",
3474 | "IDGD",
3475 | "IFCD",
3476 | "IFCICCDGD",
3477 | "IFGD",
3478 | "IFICCD",
3479 | "IFIDWGD",
3480 | "IFKD",
3481 | "IGCCCCDCCD",
3482 | "IGFGJCGDD",
3483 | "IGGDFCD",
3484 | "IGGPCGCD",
3485 | "IGICCD",
3486 | "IIGCD",
3487 | "IIICD",
3488 | "IIID",
3489 | "IIJD",
3490 | "IIWGWCD",
3491 | "IIWGWD",
3492 | "IJCCDGD",
3493 | "IJCCWGWD",
3494 | "IJCDCCD",
3495 | "IJCDD",
3496 | "IJDD",
3497 | "IJGCCD",
3498 | "IJPCCD",
3499 | "IJPCD",
3500 | "IKCD",
3501 | "IMCCCCD",
3502 | "IPCCCCD",
3503 | "IPCGD",
3504 | "IPFD",
3505 | "IPJCD",
3506 | "IPPD",
3507 | "IPPDCD",
3508 | "IPPJCD",
3509 | "IPWGWCCD",
3510 | "IPWGWCD",
3511 | "IWGMFCCDGD",
3512 | "IWGWCCD",
3513 | "IWGWCD",
3514 | "IWGWD",
3515 | "IWGWID",
3516 | "JCCCWFWD",
3517 | "JCCGD",
3518 | "JCCPD",
3519 | "JCCWGWD",
3520 | "JCDCWMWMWCCCD",
3521 | "JCDDWD",
3522 | "JCGDWCPWD",
3523 | "JCIDGD",
3524 | "JCIWGWD",
3525 | "JCJGCD",
3526 | "JCPCD",
3527 | "JCWGWCDWD",
3528 | "JDGCCD",
3529 | "JDWGWCD",
3530 | "JFCCD",
3531 | "JFCCWGWDD",
3532 | "JFID",
3533 | "JFJCCD",
3534 | "JGCCD",
3535 | "JGCCGCD",
3536 | "JGCDFCD",
3537 | "JGCFCCCD",
3538 | "JGFD",
3539 | "JGICD",
3540 | "JGID",
3541 | "JGJDCD",
3542 | "JGMCCD",
3543 | "JGPCCD",
3544 | "JICCD",
3545 | "JIGPCD",
3546 | "JIICD",
3547 | "JIPCD",
3548 | "JJCCCDGD",
3549 | "JJCCJD",
3550 | "JJCGD",
3551 | "JJCID",
3552 | "JJCWGWCCDFGCD",
3553 | "JJCWGWD",
3554 | "JJLWGWCD",
3555 | "JJPJJD",
3556 | "JJPPCD",
3557 | "JPCCCCWGWD",
3558 | "JPCCCD",
3559 | "JPIWGWID",
3560 | "JPJJD",
3561 | "JPWWD",
3562 | "JWWFWCD",
3563 | "KCCCCDD",
3564 | "KCCDGD",
3565 | "KCDGCD",
3566 | "KCDGD",
3567 | "KCFCD",
3568 | "KCICCD",
3569 | "KCJCCCDGD",
3570 | "KFCCD",
3571 | "KGCDCCCD",
3572 | "KGCDD",
3573 | "KGCDGD",
3574 | "KGCGCD",
3575 | "KGGCDID",
3576 | "KIDJCCD",
3577 | "KIICD",
3578 | "KJGCD",
3579 | "KLPCCD",
3580 | "KPFCCD",
3581 | "KPGCD",
3582 | "KPICD",
3583 | "KPKICD",
3584 | "LCCCDGCD",
3585 | "LCGFCD",
3586 | "LCWGWCFDD",
3587 | "LCWGWICD",
3588 | "LFCCD",
3589 | "LFCWGWCD",
3590 | "LFJD",
3591 | "LGCCCD",
3592 | "LGCCD",
3593 | "LGCICD",
3594 | "LGCPJCD",
3595 | "LGFCCD",
3596 | "LGFCD",
3597 | "LGFD",
3598 | "LGFICD",
3599 | "LGFID",
3600 | "LGFJD",
3601 | "LGFPD",
3602 | "LGICCDGCD",
3603 | "LGICD",
3604 | "LGID",
3605 | "LGIICD",
3606 | "LGJCCD",
3607 | "LGJCID",
3608 | "LGLIDD",
3609 | "LGPCCD",
3610 | "LGPCD",
3611 | "LGPDDWD",
3612 | "LGPJPD",
3613 | "LIJD",
3614 | "LIWGWD",
3615 | "LJFCCD",
3616 | "LJFGCD",
3617 | "LKGPD",
3618 | "LKPJCCD",
3619 | "LMPJIPCCD",
3620 | "LPCGDGD",
3621 | "LPPWCCD",
3622 | "MCCID",
3623 | "MCCWGWD",
3624 | "MCJCCD",
3625 | "MCWD",
3626 | "MJCWGWD",
3627 | "MPCFCD",
3628 | "MPCGD",
3629 | "PCCGGCCCCD",
3630 | "PCCKD",
3631 | "PCDDDCCD",
3632 | "PCJD",
3633 | "PCJJDD",
3634 | "PCLGCCD",
3635 | "PCWGWCD",
3636 | "PCWGWID",
3637 | "PCWGWPDWD",
3638 | "PFCCCWGWD",
3639 | "PFCCJICCD",
3640 | "PFCD",
3641 | "PFCPCDCD",
3642 | "PFCWGWCD",
3643 | "PFID",
3644 | "PFPID",
3645 | "PGCCD",
3646 | "PGCCDGD",
3647 | "PGCID",
3648 | "PGCJCJD",
3649 | "PGFCD",
3650 | "PGJCCD",
3651 | "PGJICD",
3652 | "PGPCCCJD",
3653 | "PGPCCD",
3654 | "PGPD",
3655 | "PGPID",
3656 | "PIGFCD",
3657 | "PIWGWCCD",
3658 | "PJCCCD",
3659 | "PJCDD",
3660 | "PJCWGWDID",
3661 | "PJDFFD",
3662 | "PLDWGWCDCGD",
3663 | "PPCCCDD",
3664 | "PPCCD",
3665 | "PPCCDGD",
3666 | "PPCCGWD",
3667 | "PPCCWGWD",
3668 | "PPCID",
3669 | "PPCPWCCCWCDD",
3670 | "PPCWGWD",
3671 | "PPDD",
3672 | "PPFWGWCD",
3673 | "PPICD",
3674 | "PPIDD",
3675 | "PPLWGWCCD",
3676 | "PPWGWCD",
3677 | "PPWGWD",
3678 | "PWCCCD",
3679 | "PWCWCD",
3680 | "PWFWCD",
3681 | "PWWJWGWD",
3682 | "WCCCD",
3683 | "WCJWD",
3684 | "WFFPCWJD",
3685 | "WFPWPWCCD",
3686 | "WFWD",
3687 | "WGCJDWFCCCD",
3688 | "WJCCD",
3689 | "WPWJD",
3690 | "WWIWWCWD",
3691 | ]
3692 |
--------------------------------------------------------------------------------
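The string list that closes above is a role-pattern table (presumably the tail of NTPattern in yaya/common/nt.py, which precedes config.py in the tree): each letter is the string form of an NT role tag, and a whole string such as "GKCD" describes one admissible sequence of roles for an organization name. A minimal sketch of how such a list is consumed, using DoubleArrayTrie and Searcher from yaya.collection.dict as the files below do; the two-pattern list and the sample tag string are invented for illustration:

    from yaya.collection.dict import DoubleArrayTrie, Searcher

    patterns = ["GKCD", "KCD"]       # tiny stand-in for the full pattern list
    patterns.sort()                  # the trie is built over sorted keys
    trie = DoubleArrayTrie()
    trie.build(key=patterns)

    search = Searcher(trie, "GKCD")  # a role string produced by Viterbi tagging
    while search.next():
        print("%d %s" % (search.begin, search.key))  # match offset and pattern hit
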
/yaya/config.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | __author__ = 'tony'
3 | import os
4 |
5 | DICT_BIN_EXT = '.ya'
6 | DICT_BIN_REVERSE_EXT = '.reverse.ya'
7 | DATA_ROOT = "/home/tony/MyProject/YaYaNLP/data"
8 |
9 | CUSTOM_DICT_NAME = [os.path.join(DATA_ROOT, "dictionary", "custom", f) for f in [
10 | u"CustomDictionary.txt",
11 | u"上海地名.txt",
12 | u"人名词典.txt",
13 | u"全国地名大全.txt",
14 | u"机构名词典.txt",
15 | u"现代汉语补充词库.txt"]]
16 |
17 | CORE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "CoreNatureDictionary.txt")
18 | CORE_BIGRAM_NAME = os.path.join(DATA_ROOT, "dictionary", "CoreNatureDictionary.ngram.txt")
19 | CORE_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "person", "CoreNatureDictionary.tr.txt")
20 |
21 | CHAR_TYPE_PATH = os.path.join(DATA_ROOT, "dictionary", "other", "CharType.dat.yes")
22 |
23 | PERSON_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "person", "nr.tr.txt")
24 | PERSON_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "person", "nr.txt")
25 |
26 | ORG_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "organization", "nt.tr.txt")
27 | ORG_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "organization", "nt.txt")
28 |
29 | PLACE_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "place", "ns.tr.txt")
30 | PLACE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "place", "ns.txt")
31 |
32 | TRADITIONAL_CHINESE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "tc", "TraditionalChinese.txt")
33 |
34 | # Global configuration
35 | class _Config:
36 | # Prefer the cached binary dictionary over re-parsing the text source
37 | use_dict_cache = True
38 |
39 | # Whether to apply the custom (user) dictionaries
40 | use_custom_dict = True
41 |
42 | # Chinese person-name recognition
43 | name_recognize = True
44 |
45 | # Place-name recognition
46 | place_recognize = True
47 |
48 | # Organization-name recognition
49 | org_recognize = True
50 |
51 | debug = True
52 |
53 |
54 | Config = _Config()
55 |
--------------------------------------------------------------------------------
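Note that DATA_ROOT is a hard-coded absolute path and every dictionary path above is derived from it at import time, so it has to be adjusted before anything imports the dictionaries. The Config flags, by contrast, are read where they are used and can be flipped at runtime; a minimal sketch (the particular values are just examples):

    from yaya.config import Config

    Config.debug = False             # silence the lattice dumps printed during seg()
    Config.use_custom_dict = True    # merge the user dictionaries listed above
    Config.org_recognize = False     # e.g. skip organization recognition for speed
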
/yaya/const.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 |
4 | __author__ = 'tony'
5 |
6 | logger = logging.getLogger("YaYaNLP")
7 |
8 | # Numeric constants
9 | DOUBLE_MAX = 1.7976931348623157e+308
10 |
11 | # Predefined tag constants (kept as Chinese equivalence-class words to match the dictionary data)
12 | TAG_PLACE = u"未##地"
13 | TAG_BIGIN = u"始##始"
14 | TAG_OTHER = u"未##它"
15 | TAG_GROUP = u"未##团"
16 | TAG_NUMBER = u"未##数"
17 | TAG_QUANTIFIER = u"未##量"
18 | TAG_PROPER = u"未##专"
19 | TAG_TIME = u"未##时"
20 | TAG_CLUSTER = u"未##串"
21 | TAG_END = u"末##末"
22 | TAG_PEOPLE = u"未##人"
23 |
24 | # Total corpus word frequency
25 | MAX_FREQUENCY = 25146057
26 | SMOOTHING_FACTOR = 1.0 / MAX_FREQUENCY + 0.00001
27 | SMOOTHING_PARAM = 0.1
28 |
--------------------------------------------------------------------------------
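DOUBLE_MAX doubles as the "infinite cost" sentinel in yaya/seg/viterbi.py below, and the module exposes a bare logging.getLogger("YaYaNLP") logger, so a host application opts into its output with the standard library alone. A minimal sketch (the debug message is illustrative):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("YaYaNLP").debug(u"dictionary cache loaded")
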
/yaya/dictionary/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
--------------------------------------------------------------------------------
/yaya/dictionary/chinese_traditional_dict.py:
--------------------------------------------------------------------------------
1 | from yaya.collection.dict import DoubleArrayTrie
2 | from yaya import config
3 | from yaya.utility.singleton import singleton
4 |
5 | __author__ = 'tony'
6 |
7 |
8 | class ChinseTraditionalBaseDict:
9 | def convert_key_to_value(self, text):
10 | search = self.trie.search(text)
11 | wordnet = [None] * search.array_length
12 | lennet = [0] * search.array_length
13 | for i, k, v in search.search_all_words():
14 |             if len(k) > lennet[i]:  # keep the longest dictionary match at each offset
15 | wordnet[i] = v[1]
16 | lennet[i] = len(k)
17 | offset = 0
18 | valuetext = []
19 | while offset < search.array_length:
20 | if wordnet[offset] is None:
21 | valuetext.append(search.char_array[offset])
22 | offset += 1
23 | else:
24 | valuetext.append(wordnet[offset])
25 | offset += lennet[offset]
26 | return "".join(valuetext)
27 |
28 |
29 | @singleton
30 | class SimplifiedChineseDict(ChinseTraditionalBaseDict):
31 | def __init__(self):
32 | self.trie = DoubleArrayTrie.load(config.TRADITIONAL_CHINESE_DICT_NAME,
33 | lambda i: i[i.find(u'=') + 1:],
34 | lambda i: i.split('=')[::-1],
35 | dict_bin_ext=config.DICT_BIN_REVERSE_EXT)
36 | self.trie.get_attr = lambda v: v
37 |
38 | def convert_simplified_to_traditional(self, text):
39 | return self.convert_key_to_value(text)
40 |
41 |
42 | @singleton
43 | class TraditionalChineseDict(ChinseTraditionalBaseDict):
44 | def __init__(self):
45 | self.trie = DoubleArrayTrie.load(config.TRADITIONAL_CHINESE_DICT_NAME,
46 | lambda i: i[:i.find(u'=')],
47 | lambda i: i.split('='))
48 | self.trie.get_attr = lambda v: v
49 |
50 | def convert_traditional_to_simplified(self, text):
51 | return self.convert_key_to_value(text)
52 |
--------------------------------------------------------------------------------
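Both converters are singletons over the same TraditionalChinese.txt table, read in opposite directions: one keys on the text after '=', the other on the text before it, and convert_key_to_value then performs a greedy longest-match replacement over the input. A minimal usage sketch via the thin wrappers in yaya/seg/segment.py further below; the sample strings are illustrative:

    # -*- coding: utf-8 -*-
    from yaya.seg.segment import simplified_to_traditional, traditional_to_simplified

    print(simplified_to_traditional(u"电脑"))   # expect the traditional form
    print(traditional_to_simplified(u"電腦"))   # expect the simplified form
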
/yaya/dictionary/org_dict.py:
--------------------------------------------------------------------------------
1 | from yaya import config
2 | from yaya.collection.dict import DoubleArrayTrie
3 | from yaya.collection.hmm import HMMMatrix
4 | from yaya.common.nt import NTPattern, NT
5 | from yaya.utility.singleton import singleton
6 |
7 | __author__ = 'tony'
8 |
9 |
10 | @singleton
11 | class OrgDict:
12 | def __init__(self):
13 | self.trie = DoubleArrayTrie.load(config.ORG_DICT_NAME, enum_cls=NT)
14 | self.matrix = HMMMatrix.load(config.ORG_TR_PATH, NT)
15 |
16 |
17 | @singleton
18 | class NTPatternDict:
19 | def __init__(self):
20 | self.trie = DoubleArrayTrie()
21 | NTPattern.sort()
22 | self.trie.build(key=NTPattern)
23 |
--------------------------------------------------------------------------------
/yaya/dictionary/person_dict.py:
--------------------------------------------------------------------------------
1 | from yaya import config
2 | from yaya.collection.dict import DoubleArrayTrie
3 | from yaya.collection.hmm import HMMMatrix
4 | from yaya.common.nr import NRPattern, NR
5 | from yaya.utility.singleton import singleton
6 |
7 | __author__ = 'tony'
8 |
9 |
10 | @singleton
11 | class PersonDict:
12 | def __init__(self):
13 | self.trie = DoubleArrayTrie.load(config.PERSON_DICT_NAME, enum_cls=NR)
14 | self.matrix = HMMMatrix.load(config.PERSON_TR_PATH, NR)
15 |
16 |
17 | @singleton
18 | class NRPatternDict:
19 | def __init__(self):
20 | self.trie = DoubleArrayTrie()
21 | NRPattern.sort()
22 | self.trie.build(key=NRPattern)
--------------------------------------------------------------------------------
/yaya/dictionary/place_dict.py:
--------------------------------------------------------------------------------
1 | from yaya.common.ns import NS, NSPattern
2 | from yaya import config
3 | from yaya.collection.dict import DoubleArrayTrie
4 | from yaya.collection.hmm import HMMMatrix
5 | from yaya.utility.singleton import singleton
6 |
7 | __author__ = 'tony'
8 |
9 |
10 | @singleton
11 | class PlaceDict:
12 | def __init__(self):
13 | self.trie = DoubleArrayTrie.load(config.PLACE_DICT_NAME, enum_cls=NS)
14 | self.matrix = HMMMatrix.load(config.PLACE_TR_PATH, NS)
15 |
16 |
17 | @singleton
18 | class NSPatternDict:
19 | def __init__(self):
20 | self.trie = DoubleArrayTrie()
21 | NSPattern.sort()
22 | self.trie.build(key=NSPattern)
23 |
--------------------------------------------------------------------------------
/yaya/recognition/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
3 |
--------------------------------------------------------------------------------
/yaya/recognition/organization_recognition.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.collection.dict import Attribute, ORG_ATTRIBUTE
3 | from yaya.collection.hmm import OrgTranMatrix
4 | from yaya.common.nature import NATURE
5 | from yaya.common.nt import NT
6 | from yaya.dictionary.org_dict import NTPatternDict, OrgDict
7 | from yaya.recognition.recognition import role_viterbi
8 | from yaya.seg.viterbi import viterbi_standard
9 |
10 | __author__ = 'tony'
11 |
12 |
13 | def recognition(vertexs, wordnet_optimum, wordnet_all):
14 |     # tag the roles, then run one Viterbi pass over them
15 | return role_viterbi(vertexs, wordnet_optimum,
16 | hmm=OrgTranMatrix().hmm,
17 | trie=NTPatternDict().trie,
18 | recognition_attr=ORG_ATTRIBUTE,
19 | tag_func=role_tag,
20 | viterbi_fun=viterbi_standard
21 | )
22 |
23 | def role_tag(word_seg_list):
24 | tag_index_list = []
25 | for vertex in word_seg_list:
26 |         nature = vertex.nature
27 |         if nature == NATURE.nz:
28 |             if vertex.attribute.total_frequency <= 1000:
29 |                 tag_index_list.append(Attribute([str(NT.F), 1000], cls=NT))
30 |                 continue
31 |             # a frequent nz word falls through to the dictionary lookup below;
32 |             # breaking out here would truncate tag_index_list and misalign it with vertexs
33 | elif nature in [NATURE.ni,
34 | NATURE.nic,
35 | NATURE.nis,
36 | NATURE.nit]:
37 | tag_index_list.append(Attribute([str(NT.K), 1000, str(NT.D), 1000], cls=NT))
38 | continue
39 | elif nature == NATURE.m:
40 | tag_index_list.append(Attribute([str(NT.M), 1000], cls=NT))
41 | continue
42 |
43 | index, value = OrgDict().trie.get(vertex.word)
44 | if value is None:
45 | value = Attribute([str(NT.Z), OrgDict().matrix.get_total_freq(NT.Z)], cls=NT)
46 | # else:
47 | # # if not isinstance(value, list):
48 | # # value = value.split()
49 | # # value = Attribute(value[1:], cls=NT)
50 |
51 | tag_index_list.append(value)
52 |
53 | return tag_index_list
54 |
--------------------------------------------------------------------------------
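The flow in this file: role_tag emits exactly one Attribute of NT role candidates per vertex (F for rare nz words, K plus D for ni-type institution words, M for numerals, a dictionary value or the Z fallback otherwise); role_viterbi then picks one role per vertex, joins their string forms, and scans the result against the NTPattern trie. A minimal sketch of that hand-off, assuming NT.G and NT.C exist as the pattern strings suggest and that str() of a role yields its bare letter, as the pattern matching requires:

    from yaya.common.nt import NT

    tags = [NT.G, NT.K, NT.C, NT.D]          # pretend Viterbi output, one role per word
    tag_str = ''.join(str(t) for t in tags)  # -> "GKCD", a key in the NTPattern list above
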
/yaya/recognition/person_recognition.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.collection.dict import Attribute, PERSON_ATTRIBUTE
3 | from yaya.collection.hmm import PersonTranMatrix
4 | from yaya.common.nature import NATURE
5 | from yaya.common.nr import NR
6 | from yaya.dictionary.person_dict import PersonDict, NRPatternDict
7 | from yaya.recognition.recognition import role_viterbi
8 | from yaya.seg.wordnet import Vertex
9 |
10 | __author__ = 'tony'
11 |
12 | def recognition(vertexs, wordnet_optimum, wordnet_all):
13 | return role_viterbi(vertexs, wordnet_optimum,
14 | hmm=PersonTranMatrix().hmm,
15 | trie=NRPatternDict().trie,
16 | recognition_attr=PERSON_ATTRIBUTE,
17 | tag_func=role_tag
18 | )
19 | # # Tag the roles, then run one Viterbi pass
20 | # tag_list = role_tag(vertexs)
21 | # if Config.debug:
22 | # sb = []
23 | # for i, v in enumerate(vertexs):
24 | # sb.append("[%s %s]" % (unicode(vertexs[i]), tag_list[i].nature))
25 | # print u"Person-name role observations: %s" % "".join(sb)
26 | # tag_list = viterbi_template(tag_list, PersonTranMatrix().hmm)
27 | #
28 | #
29 | # tag_str = [str(x) for x in tag_list]
30 | # tag_str = ''.join(tag_str)
31 | #
32 | # # Handle the special fused roles V and U
33 | # tag_str, vertexs = parse_pattern(tag_str, vertexs, None, None)
34 | #
35 | # search = Searcher(NRPatternDict().trie, tag_str)
36 | # vertexs_offset = [0 for i in range(len(vertexs))]
37 | # offset = 0
38 | # vertexs_offset[1] = 1
39 | # for i in range(2, len(vertexs) - 2):
40 | # vertexs_offset[i] = vertexs_offset[i - 1] + len(vertexs[i - 1].real_word)
41 | # while search.next():
42 | # name_str = ""
43 | # for i in range(search.begin, search.begin + len(search.key)):
44 | # name_str += vertexs[i].real_word
45 | #
46 | # # Insert into the word lattice
47 | # vertex = Vertex(name_str, attribute="nr 1")
48 | # wordnet_optimum.insert(vertexs_offset[search.begin], vertex, wordnet_all)
49 |
50 |
51 |
52 | def role_tag(word_seg_list):
53 | tag_index_list = []
54 | for vertex in word_seg_list:
55 | if vertex.nature == NATURE.nr and vertex.attribute.total_frequency <= 1000:
56 |         if len(vertex.real_word) == 2:
57 | tag_index_list.append(Attribute(attr=(NR.X, 1, NR.G, 1), cls=NR))
58 | continue
59 |
60 | index, value = PersonDict().trie.get(vertex.real_word)
61 |
62 | if value is None:
63 | value = Attribute([str(NR.A), PersonDict().matrix.get_total_freq(NR.A)], cls=NR)
64 |
65 | tag_index_list.append(value)
66 | return tag_index_list
67 |
68 |
69 | def parse_pattern(tag_str, vertexs, wordnet_optimum, wordnet_all):
70 | new_tag_list = []
71 | new_vertexs = []
72 | for i, t in enumerate(tag_str):
73 | if t == str(NR.U):
74 | new_tag_list.append(str(NR.K))
75 | new_tag_list.append(str(NR.B))
76 | word_K = vertexs[i].real_word[:-1]
77 | word_B = vertexs[i].real_word[-1]
78 | new_vertexs.append(Vertex(word_K))
79 | new_vertexs.append(Vertex(word_B))
80 | elif t == str(NR.V):
81 | if tag_str[i - 1] == str(NR.B):
82 | new_tag_list.append(str(NR.E))
83 | else:
84 | new_tag_list.append(str(NR.D))
85 | new_tag_list.append(str(NR.L))
86 | word_ED = vertexs[i].real_word[:-1]
87 | word_L = vertexs[i].real_word[-1]
88 | new_vertexs.append(Vertex(word_ED))
89 | new_vertexs.append(Vertex(word_L))
90 | else:
91 | new_tag_list.append(t)
92 | new_vertexs.append(vertexs[i])
93 | return "".join(new_tag_list), new_vertexs
94 |
--------------------------------------------------------------------------------
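parse_pattern above unfuses the two compound person-name roles before pattern matching: a word tagged U is split into K (everything but its last character) plus B (the last character), and a word tagged V becomes E or D (depending on whether a B precedes) plus L. A minimal sketch, assuming Vertex(word) populates real_word and that the role letters stringify as in the patterns; the tag string and words are invented:

    # -*- coding: utf-8 -*-
    from yaya.recognition.person_recognition import parse_pattern
    from yaya.seg.wordnet import Vertex

    vertexs = [Vertex(u"在"), Vertex(u"张三")]        # second word carries role U
    tag_str, vertexs = parse_pattern("AU", vertexs, None, None)
    # tag_str == "AKB"; u"张三" is now split into Vertex(u"张") + Vertex(u"三")
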
/yaya/recognition/place_recognition.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.collection.dict import Attribute, PLACE_ATTRIBUTE
3 | from yaya.collection.hmm import PlaceTranMatrix
4 | from yaya.common.nature import NATURE
5 | from yaya.common.ns import NS
6 | from yaya.dictionary.place_dict import NSPatternDict, PlaceDict
7 | from yaya.recognition.recognition import role_viterbi
8 |
9 | __author__ = 'tony'
10 |
11 |
12 |
13 |
14 |
15 | def recognition(vertexs, wordnet_optimum, wordnet_all):
16 | return role_viterbi(vertexs, wordnet_optimum,
17 | hmm=PlaceTranMatrix().hmm,
18 | trie=NSPatternDict().trie,
19 | recognition_attr=PLACE_ATTRIBUTE,
20 | tag_func=role_tag
21 | )
22 |
23 | # # Tag the roles, then run one Viterbi pass
24 | # tag_list = viterbi_template(role_tag(vertexs), PlaceTranMatrix().hmm)
25 | # tag_str = [str(x) for x in tag_list]
26 | # tag_str = ''.join(tag_str)
27 | # search = Searcher(NSPatternDict().trie, tag_str)
28 | # vertexs_offset = [0] * len(vertexs)
29 | # offset = 0
30 | # for i in range(1, len(vertexs) - 2):
31 | # vertexs_offset[i] = offset
32 | # offset += len(vertexs[i].real_word)
33 | # while search.next():
34 | # name_str = ""
35 | # for i in range(search.begin, search.begin + len(search.key)):
36 | # name_str += vertexs[i].real_word
37 | #
38 | # # Insert into the word lattice
39 | # vertex = Vertex(name_str, attribute="ns 1")
40 | # wordnet_optimum.insert(vertexs_offset[search.begin + 1], vertex, wordnet_all)
41 |
42 |
43 | def role_tag(word_seg_list):
44 | tag_index_list = []
45 | for vertex in word_seg_list:
46 |         if vertex.nature == NATURE.ns and vertex.attribute.total_frequency <= 1000:
47 |             if len(vertex.real_word) < 3:
48 | tag_index_list.append(Attribute("%s 1 %s 1" % (NS.H, NS.G), NS))
49 | continue
50 | index, value = PlaceDict().trie.get(vertex.real_word)
51 | if value is None:
52 | value = Attribute([str(NS.Z), PlaceDict().matrix.get_total_freq(NS.Z)], cls=NS)
53 | # else:
54 | # if not isinstance(value, list):
55 | # value = value.split()
56 | # value = Attribute(value[1:], cls=NS)
57 | tag_index_list.append(value)
58 | return tag_index_list
59 |
--------------------------------------------------------------------------------
/yaya/recognition/recognition.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.collection.dict import Searcher
3 | from yaya.seg.viterbi import viterbi, viterbi_template
4 | from yaya.seg.wordnet import Vertex
5 | from yaya.config import Config
6 | __author__ = 'tony'
7 |
8 |
9 | def role_viterbi(vertexs, wordnet_optimum, hmm, trie, recognition_attr, tag_func, viterbi_fun=viterbi_template):
10 | tag_list = tag_func(vertexs)
11 | if Config.debug:
12 | sb = []
13 | for i, tag in enumerate(tag_list):
14 | sb.append(u"[ %s %s ]" % (vertexs[i].real_word, tag))
15 |         print(u"Role observations: %s" % u"".join(sb))
16 |
17 | tag_list = viterbi_fun(tag_list, hmm)
18 | if Config.debug:
19 | sb = []
20 | for i, tag in enumerate(tag_list):
21 | sb.append(u"%s/%s" % (vertexs[i].real_word, tag))
22 |         print(u"Role tagging: [%s]" % u", ".join(sb))
23 |
24 | tag_str = [str(x) for x in tag_list]
25 | tag_str = ''.join(tag_str)
26 | search = Searcher(trie, tag_str)
27 | vertexs_offset = [0] * len(vertexs)
28 | offset = 1
29 |     # skip the head and tail sentinel vertexes
30 | for i, v in enumerate(vertexs[1:-1]):
31 | vertexs_offset[i + 1] = offset
32 | offset += len(vertexs[i + 1].real_word)
33 | while search.next():
34 | name_str = ""
35 | for i in range(search.begin, search.begin + len(search.key)):
36 | name_str += vertexs[i].real_word
37 |
38 |     # insert the recognized entity into the word lattice
39 | vertex = Vertex(name_str, attribute=recognition_attr)
40 | wordnet_optimum.add(vertexs_offset[search.begin], vertex)
41 | vertexs = viterbi(wordnet_optimum.vertexs)
42 | return vertexs
43 |
--------------------------------------------------------------------------------
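The offset table in role_viterbi maps a vertex index to its character position in the original text: the head and tail sentinels are skipped, and counting starts at 1 because row 0 of the word lattice apparently belongs to the begin sentinel. A minimal sketch of just that bookkeeping with plain strings (the words are invented):

    # -*- coding: utf-8 -*-
    words = [u"begin", u"我", u"爱", u"北京", u"end"]   # head, three words, tail
    offsets = [0] * len(words)
    offset = 1
    for i, w in enumerate(words[1:-1]):
        offsets[i + 1] = offset
        offset += len(w)
    # offsets == [0, 1, 2, 3, 0]
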
/yaya/seg/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
--------------------------------------------------------------------------------
/yaya/seg/segment.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.dictionary.chinese_traditional_dict import SimplifiedChineseDict, TraditionalChineseDict
3 | from yaya.recognition import place_recognition
4 | from yaya.config import Config
5 | from yaya.recognition import person_recognition
6 | from yaya.recognition import organization_recognition
7 | from yaya.seg.viterbi import viterbi
8 | from yaya.seg.wordnet import WordNet, gen_word_net, combine_by_custom_dict
9 |
10 | __author__ = 'tony'
11 |
12 |
13 | def vertexs_to_terms(vertexs, word_only=False):
14 | terms = []
15 | offset = 0
16 | if word_only:
17 |         terms = [v.real_word for v in vertexs[1:-1]]  # skip the head/tail sentinels, as in the branch below
18 | else:
19 | for v in vertexs[1:-1]:
20 | terms.append((v.real_word, str(v.nature), offset))
21 | offset += len(v.real_word)
22 | return terms
23 |
24 |
25 | # def combin_by_dict(vertexs, dat):
26 | # for i, start_v in enumerate(vertexs):
27 | # # skip head and skip combined word
28 | # if i == 0 or start_v is None:
29 | # continue
30 | # state = dat.transition(start_v.real_word, 1)
31 | # if state > 0:
32 | # start = i
33 | # end = -1
34 | # value = None
35 | # for j, end_v in enumerate(vertexs[i + 1:-1]):
36 | # state = dat.transition(end_v.real_word, state)
37 | # if state < 0:
38 | # break
39 | # value = dat.output(state)
40 | # end = j + 1
41 | #
42 | # if value is not None:
43 | # for k in range(start, end + i + 1):
44 | # vertexs[k] = None
45 | # vertexs[i] = Vertex(value, attribute=value)
46 | #
47 | # return [v for v in vertexs if v is not None]
48 |
49 |
50 | def seg_to_vertexs(text):
51 | word_net = WordNet(text)
52 |
53 |     # build the coarse segmentation lattice
54 | gen_word_net(text, word_net)
55 |
56 | if Config.debug:
57 |         print(u"Coarse segmentation lattice:")
58 | print(unicode(word_net))
59 |
60 |     # Viterbi decode
61 | vertexs = viterbi(word_net.vertexs)
62 | if Config.use_custom_dict:
63 | vertexs = combine_by_custom_dict(vertexs)
64 | word_net_optimum = WordNet(text, vertexs=vertexs)
65 |
66 | if Config.name_recognize:
67 | person_recognition.recognition(vertexs, word_net_optimum, word_net)
68 |
69 | if Config.place_recognize:
70 | place_recognition.recognition(vertexs, word_net_optimum, word_net)
71 |
72 | if Config.debug:
73 |         print(u"word net after person/place recognition:")
74 | print(unicode(word_net_optimum))
75 |
76 | vertexs = viterbi(word_net_optimum.vertexs)
77 |
78 | if Config.org_recognize:
79 | word_net_optimum = WordNet(text, vertexs=vertexs)
80 | vertexs = organization_recognition.recognition(vertexs, word_net_optimum, word_net)
81 |
82 | if Config.debug:
83 |         print(u"word net after organization recognition:")
84 | print(unicode(word_net_optimum))
85 | return vertexs
86 |
87 |
88 | def seg(text):
89 | return vertexs_to_terms(seg_to_vertexs(text))
90 |
91 |
92 | def traditional_seg(text):
93 |     simplified = TraditionalChineseDict().convert_traditional_to_simplified(text)
94 |     return seg(simplified)
95 |
96 |
97 | def simplified_to_traditional(text):
98 |     return SimplifiedChineseDict().convert_simplified_to_traditional(text)
99 |
100 |
101 | def traditional_to_simplified(text):
102 |     return TraditionalChineseDict().convert_traditional_to_simplified(text)
103 |
--------------------------------------------------------------------------------
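A minimal usage sketch of the entry points defined above, assuming the dictionary data files are installed where yaya.config expects them (the sample sentences are arbitrary):

    # coding=utf-8
    from yaya.seg.segment import seg, traditional_to_simplified

    # seg() returns (word, nature, character offset) triples
    for word, nature, offset in seg(u"小明住在北京"):
        print u"%s\t%s\t%d" % (word, nature, offset)

    # script conversion round-trip
    print traditional_to_simplified(u"中華人民共和國")
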
/yaya/seg/viterbi.py:
--------------------------------------------------------------------------------
1 | # -*- encoding:utf-8 -*-
2 | from __future__ import unicode_literals
3 | import math
4 |
5 | from yaya.const import DOUBLE_MAX
6 | from yaya.config import Config
7 |
8 | __author__ = 'tony'
9 |
10 |
11 | class Viterbi:
12 |     @staticmethod
13 |     def computer(obs, states, start_p, trans_p, emit_p):
14 |         # costs are negative log probabilities, so smaller is better
15 |         max_states_value = 0
16 |         for s in states:
17 |             max_states_value = max(max_states_value, s)
18 |         max_states_value += 1
19 |         # V[t][y]: best cost of a path ending in state y at time t
20 |         V = [[0] * max_states_value for row in range(len(obs))]
21 |         # path[y]: best state sequence found so far that ends in state y
22 |         path = [[0] * len(obs) for row in range(max_states_value)]
23 |         for y in states:
24 |             V[0][y] = start_p[y] + emit_p[y][obs[0]]
25 |             path[y][0] = y
26 |         for t in range(1, len(obs)):
27 |             new_path = [[0] * len(obs) for row in range(max_states_value)]
28 |             for y in states:
29 |                 prob = DOUBLE_MAX
30 |                 state = 0
31 |                 for y0 in states:
32 |                     nprob = V[t - 1][y0] + trans_p[y0][y] + emit_p[y][obs[t]]
33 |                     if nprob < prob:
34 |                         prob = nprob
35 |                         state = y0
36 |                 V[t][y] = prob
37 |                 new_path[y][0:t] = path[state][0:t]
38 |                 new_path[y][t] = y
39 |             path = new_path
40 |         prob = DOUBLE_MAX
41 |         state = 0
42 |         for y in states:
43 |             if V[-1][y] < prob:
44 |                 prob = V[-1][y]
45 |                 state = y
46 |         return path[state]
47 |
48 |
49 | def viterbi(vertexs):
50 | for v in vertexs[1]:
51 | v.update_from(vertexs[0][0])
52 | for i in range(1, vertexs.__len__() - 1):
53 | node_array = vertexs[i]
54 | if node_array is None:
55 | continue
56 | for node in node_array:
57 | if node.vertex_from is None:
58 | continue
59 | for node_to in vertexs[i + len(node.real_word)]:
60 | node_to.update_from(node)
61 | vertex_from = vertexs[-1][0]
62 | vertex_list = []
63 | while vertex_from is not None:
64 | vertex_list.insert(0, vertex_from)
65 | vertex_from = vertex_from.vertex_from
66 | return vertex_list
67 |
68 | def viterbi_roletag(roletaglist, hmm):
69 | _length = len(roletaglist)
70 | taglist = []
71 |     # take the nature of the first node's first tag
72 | _pre_nature = roletaglist[0].nature
73 | _perfect_nature = _pre_nature
74 | taglist.append(_pre_nature)
75 | for i in xrange(1, _length):
76 | perfect_cost = DOUBLE_MAX
77 | item = roletaglist[i]
78 |         for j, nature, freq in item.natures:  # renamed so it cannot shadow the outer index i
79 | _now = hmm.trans_prob[_pre_nature.index][nature.index] - math.log((item.get_nature_frequency(nature)+1e-8) / hmm.get_total_freq(nature))
80 | if perfect_cost > _now:
81 | perfect_cost = _now
82 | _perfect_nature = nature
83 | _pre_nature = _perfect_nature
84 | taglist.append(_pre_nature)
85 | return taglist
86 |
87 | def viterbi_template(node_list, hmm, init_cost=DOUBLE_MAX):
88 | node_count = len(node_list)
89 | taglist = []
90 |     # take the nature of the first node's first tag
91 | _pre_nature = node_list[0].nature
92 | _perfect_nature = _pre_nature
93 | taglist.append(_pre_nature)
94 | for i, cur_node in enumerate(node_list[1:]):
95 | perfect_cost = init_cost
96 | for j, vertex, freq in cur_node.natures:
97 | _now = hmm.trans_prob[_pre_nature.index][vertex.index] - math.log(
98 | (cur_node.get_nature_frequency(vertex) + 1e-8) / hmm.get_total_freq(vertex))
99 | if perfect_cost > _now:
100 | perfect_cost = _now
101 | _perfect_nature = vertex
102 | _pre_nature = _perfect_nature
103 | taglist.append(_pre_nature)
104 | return taglist
105 |
106 |
107 | def viterbi_standard(node_list, hmm, init_cost=DOUBLE_MAX):
108 |     taglist = []
109 |     # take the nature of the first node's first tag
110 |     _pre_nature = node_list[0].nature
111 |     taglist.append(_pre_nature)
112 |     route_cost = []
113 |
114 |     # cost of reaching each candidate tag of the second node
115 |     current_line = node_list[1]
116 |     for i, vertex, freq in current_line.natures:
117 |         _now = hmm.trans_prob[_pre_nature.index][vertex.index] - math.log(
118 |             (current_line.get_nature_frequency(vertex) + 1e-8) / hmm.get_total_freq(vertex))
119 |         route_cost.append(_now)
120 |     pre_line = current_line
121 |
122 |     # extend the routes over the remaining nodes
123 |     for current_line in node_list[2:]:
124 |         new_route_cost = []
125 |         perfect_pre_nature = None
126 |         perfect_cur_nature = None
127 |         perfect_cost = init_cost
128 |         for k, cur_nature, cur_freq in current_line.natures:
129 |             new_route_cost.append(init_cost)
130 |             for j, pre_nature, pre_freq in pre_line.natures:
131 |                 assert j < len(route_cost)
132 |
133 |                 _now = route_cost[j] + hmm.trans_prob[pre_nature.index][cur_nature.index] - math.log(
134 |                     (current_line.get_nature_frequency(cur_nature) + 1e-8) / hmm.get_total_freq(cur_nature))
135 |
136 |                 if new_route_cost[k] > _now:
137 |                     new_route_cost[k] = _now
138 |                 if perfect_cost > _now:
139 |                     perfect_cost = _now
140 |                     perfect_pre_nature = pre_nature
141 |                     perfect_cur_nature = cur_nature
142 |
143 |         pre_line = current_line
144 |         route_cost = new_route_cost
145 |         if Config.debug:
146 |             print new_route_cost
147 |     # append the best tags of the last two columns (not the loop leftovers)
148 |     taglist.append(perfect_pre_nature)
149 |     taglist.append(perfect_cur_nature)
150 |     return taglist
151 |
--------------------------------------------------------------------------------
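Viterbi.computer minimizes summed costs, so start_p, trans_p and emit_p are expected to hold negative log probabilities (smaller is better). A self-contained toy run under that convention, with two hidden states and made-up probabilities (the classic rainy/sunny example):

    import math
    from yaya.seg.viterbi import Viterbi

    def nlog(p):
        return -math.log(p)

    states = [0, 1]                               # 0=Rainy, 1=Sunny
    start_p = [nlog(0.6), nlog(0.4)]
    trans_p = [[nlog(0.7), nlog(0.3)],
               [nlog(0.4), nlog(0.6)]]
    emit_p = [[nlog(0.1), nlog(0.4), nlog(0.5)],  # emit_p[state][observation]
              [nlog(0.6), nlog(0.3), nlog(0.1)]]
    obs = [0, 1, 2]                               # walk, shop, clean

    print Viterbi.computer(obs, states, start_p, trans_p, emit_p)  # -> [1, 0, 0]
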
/yaya/seg/wordnet.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import absolute_import
3 | import math
4 | import copy
5 |
6 | from yaya.collection.dict import *
7 | from yaya.common.nature import NATURE
8 | from yaya.utility.chartype import *
9 | from yaya.collection.bigram import CORE_BIG_RAM_TABLE
10 | from yaya.const import *
11 |
12 | __author__ = 'tony'
13 |
14 |
15 | class AtomNode:
16 | def __init__(self, word, pos):
17 | self.word = word
18 | self.pos = pos
19 |
20 | def __str__(self):
21 | return "AtomNode{ word='%s', nature='%s' }" % (self.word, self.pos)
22 |
23 |
24 | class Vertex:
25 | def __init__(self, real_word, *args, **kwargs):
26 |         if 'attribute' in kwargs:
27 | attribute = kwargs.get('attribute')
28 | else:
29 | index, attribute = CoreDict().trie.get(real_word)
30 | self.attribute = attribute if isinstance(attribute, Attribute) else Attribute(attribute)
31 |
32 | self.word_id = kwargs.get('word_id', -1)
33 | self.real_word = real_word
34 | word = kwargs.get('word', None)
35 | self.word = word if word is not None else self.compile_real_word(self.real_word, self.attribute)
36 | self.vertex_from = None
37 | self.weight = 0
38 |
39 | def __unicode__(self):
40 | return u"%s/%s" % (self.real_word, self.word)
41 |
42 | def __repr__(self):
43 | return u"Vertex(%(real_word)r, %(attribute)r )" % vars(self)
44 |
45 | def __eq__(self, other):
46 | if type(self) != type(other):
47 | return False
48 | return self.real_word == other.real_word and self.nature == other.nature
49 |
50 | @property
51 | def nature(self):
52 | return self.attribute.nature
53 |
54 |
55 | @nature.setter
56 | def nature(self, value):
57 | self.attribute.nature = value
58 |
59 | def update_from(self, vertex_from):
60 |         weight = vertex_from.weight + Vertex.calc_weight(vertex_from, self)
61 | if self.vertex_from is None or self.weight > weight:
62 | self.vertex_from = vertex_from
63 | self.weight = weight
64 |
65 | @staticmethod
66 |     def calc_weight(vertex_p, vertex_n):
67 | freq = vertex_p.attribute.total_frequency
68 | if freq == 0:
69 | freq = 1
70 | two_word_freq = CORE_BIG_RAM_TABLE.table.get_bifreq(vertex_p.word_id, vertex_n.word_id)
71 | value = -math.log(SMOOTHING_PARAM * freq / MAX_FREQUENCY + (1 - SMOOTHING_PARAM) *
72 | ((1 - SMOOTHING_FACTOR) * two_word_freq / freq + SMOOTHING_FACTOR))
73 | if value < 0:
74 | value = -value
75 | return value
76 |
77 | def compile_real_word(self, real_word, attribute):
78 |         if len(attribute) >= 1:
79 | if attribute.nature in [NATURE.nr,
80 | NATURE.nr1,
81 | NATURE.nr2,
82 | NATURE.nrf,
83 | NATURE.nrj]:
84 | self.word_id = PERSON_WORD_ID
85 | return TAG_PEOPLE
86 | elif attribute.nature in [NATURE.ns, NATURE.nsf]:
87 | self.word_id = PLACE_WORD_ID
88 | return TAG_PLACE
89 | elif attribute.nature in [NATURE.nz, NATURE.nx]:
90 | self.word_id = PROPER_WORD_ID
91 | return TAG_PROPER
92 | elif attribute.nature in [
93 | NATURE.nt,
94 | NATURE.ntc,
95 | NATURE.ntcf,
96 | NATURE.ntcb,
97 | NATURE.ntch,
98 | NATURE.nto,
99 | NATURE.ntu,
100 | NATURE.nts,
101 | NATURE.nth,
102 | NATURE.nit]:
103 | self.word_id = PLACE_WORD_ID
104 | return TAG_GROUP
105 | elif attribute.nature in [NATURE.m, NATURE.mq]:
106 | self.word_id = NUMBER_WORD_ID
107 | return TAG_NUMBER
108 | elif attribute.nature == NATURE.x:
109 | self.word_id = CLUSTER_WORD_ID
110 | return TAG_CLUSTER
111 | elif attribute.nature in [NATURE.t]:
112 | self.word_id = TIME_WORD_ID
113 | return TAG_TIME
114 | return real_word
115 |
116 |
117 | def atom_seg(text, begin, end):
118 | node_list = []
119 | offset = begin
120 | pre_type = get(text[offset])
121 | offset += 1
122 | while offset < end:
123 | cur_type = get(text[offset])
124 | if cur_type != pre_type:
125 |             # handle floating point numbers like "3.14"
126 | if text[offset] == '.' and pre_type == CT_NUM:
127 | offset += 1
128 | while offset < end:
129 | cur_type = get(text[offset])
130 | if cur_type != CT_NUM:
131 | break
132 | else:
133 | offset += 1
134 | node_list.append(AtomNode(text[begin:offset], pre_type))
135 | begin = offset
136 | pre_type = cur_type
137 | offset += 1
138 |
139 | if offset == end:
140 | node_list.append(AtomNode(text[begin:offset], pre_type))
141 |
142 | return node_list
143 |
144 |
145 | def combine_by_custom_dict(vertexs, dat=CustomDict().trie):
146 |     # merge adjacent vertexs that match an entry in the custom dictionary
147 | for i in range(len(vertexs)):
148 | state = 1
149 | if vertexs[i] is None:
150 | continue
151 | state = dat.transition(vertexs[i].real_word, state)
152 | value = None
153 | if state > 0:
154 | start = i
155 | to = i + 1
156 |             end = -1
157 | for to in range(to, len(vertexs)):
158 | state = dat.transition(vertexs[to].real_word, state)
159 | if state < 0:
160 | break
161 | output = dat.output(state)
162 | if output is not None:
163 | value = output
164 | end = to + 1
165 |
166 | if value is not None:
167 | word = ""
168 | for j in range(start, end):
169 | word += vertexs[j].real_word
170 | vertexs[j] = None
171 | vertexs[i] = Vertex(real_word=word, attribute=value)
172 |
173 |     # todo: consider supporting a dynamic user dictionary
174 | return [v for v in vertexs if v is not None]
175 |
176 |
177 |
178 | def dump_vertexs(vertexs):
179 | logger.info("=" * 30)
180 | for i, v in enumerate(vertexs):
181 | logger.info("[%d] %s %s %s" % (i, v.real_word, v.word, v.nature))
182 |
183 | class WordNet:
184 | def __init__(self, text=None, vertexs=None):
185 | self.vertexs = [[] for i in range(len(text) + 2)]
186 | self.size = 2
187 | if vertexs is not None:
188 | i = 1
189 | for v in vertexs[1:-1]:
190 | v.vertex_from = None
191 |                 self.vertexs[i] = [v]
192 | i += v.real_word.__len__()
193 | self.vertexs[0] = [vertexs[0]]
194 | self.vertexs[-1] = [vertexs[-1]]
195 | else:
196 | self.vertexs[0] = [new_tag_vertex(TAG_BIGIN)]
197 | self.vertexs[-1] = [new_tag_vertex(TAG_END)]
198 | pass
199 |
200 | def get_first(self, line):
201 | if self.vertexs[line].__len__() > 0:
202 | return self.vertexs[line][0]
203 | else:
204 | return None
205 |
206 | def get(self, line, word_length=None):
207 | if word_length is None:
208 | return self.vertexs[line]
209 | for v in self.vertexs[line]:
210 | if len(v.real_word) == word_length:
211 | return v
212 | return None
213 |
214 | def add(self, line, vertex):
215 | for v in self.vertexs[line]:
216 | if v.real_word.__len__() == vertex.real_word.__len__():
217 | return
218 | if self.vertexs[line].__len__() == 0:
219 | self.vertexs[line] = [vertex]
220 | else:
221 | self.vertexs[line].append(vertex)
222 | self.size += 1
223 |
224 | def insert(self, line, vertex, word_net):
225 | self.add(line, vertex)
226 |         # ensure the word net stays connected
227 | for l in range(line - 1, 1, -1):
228 | if self.get(l, 1) is None:
229 | first = word_net.get_first(l)
230 | if first is None:
231 | return
232 | self.vertexs[l].append(copy.deepcopy(first))
233 | self.size += 1
234 |                 if len(self.vertexs[l]) > 1:
235 | break
236 | else:
237 | break
238 | l = line + len(vertex.real_word)
239 | if len(self.get(l)) == 0:
240 | target_line = word_net.get(l)
241 | if target_line is None or len(target_line) == 0:
242 | return
243 | self.vertexs[l] = copy.deepcopy(target_line)
244 | self.size += len(self.vertexs[l])
245 |
246 | for l in range(l, len(self.vertexs)):
247 | if self.get(l).__len__() == 0:
248 | first = word_net.get_first(l)
249 | if first is None:
250 | break
251 | self.vertexs[l].append(copy.deepcopy(first))
252 | self.size += 1
253 | if self.vertexs[l].__len__() > 1:
254 | break
255 | else:
256 | break
257 |
258 | def add_atoms(self, line, atom_list):
259 | offset = 0
260 | for atom_node in atom_list:
261 | word = atom_node.word
262 | nature = NATURE.n
263 | if atom_node.pos in [CT_INDEX, CT_NUM]:
264 | nature = NATURE.m
265 | word = TAG_NUMBER
266 | elif atom_node.pos in [CT_DELIMITER]:
267 | nature = NATURE.w
268 | elif atom_node.pos in [CT_LETTER, CT_SINGLE]:
269 | nature = NATURE.nx
270 | word = TAG_CLUSTER
271 | self.add(line + offset, Vertex(word=word,
272 | real_word=atom_node.word,
273 | attribute=Attribute([str(nature), '1']),
274 | word_id=-1
275 | ))
276 |
277 | def __len__(self):
278 | return len(self.vertexs)
279 |
280 | def __unicode__(self):
281 | sb = []
282 | sb.append("=" * 30)
283 | for i, vl in enumerate(self.vertexs):
284 | sb.append(u"[%d]:[%s]" % (i, u",".join([v.real_word for v in vl])))
285 | sb.append("=" * 30)
286 | return u"\n".join(sb)
287 |
288 | def gen_word_net(text, word_net, dat=CoreDict().trie):
289 |     searcher = dat.buildcoredictsearcher(text)
290 |     while searcher.next():
291 |         word_net.add(searcher.begin + 1, Vertex(real_word=searcher.key,
292 |                                                 attribute=searcher.value,
293 |                                                 word_id=searcher.index))
294 |     # fill empty lines with atomic nodes; a while loop is needed here
295 |     # because a for-range would ignore the manual increments below
296 |     i = 0
297 |     while i < len(word_net.vertexs):
298 |         if len(word_net.vertexs[i]) == 0:
299 |             j = i + 1
300 |             for j in range(i + 1, len(word_net.vertexs) - 1):
301 |                 if len(word_net.vertexs[j]) != 0:
302 |                     break
303 |             word_net.add_atoms(i, atom_seg(text, i - 1, j - 1))
304 |             i = j
305 |         else:
306 |             i += len(word_net.vertexs[i][-1].real_word)
307 |
308 |
309 | def new_tag_vertex(tag):
310 |     word_id, attribute = CoreDict().trie.get(tag)
311 |     if word_id > 0:
312 |         return Vertex(chr(32), attribute=attribute, word=tag, word_id=word_id)
313 |     else:
314 |         logger.error(u"failed to load %s from the core dictionary", tag)
315 |         import sys
316 |         sys.exit(-1)
317 |
--------------------------------------------------------------------------------
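The row convention used throughout wordnet.py is worth spelling out: a WordNet over text of length n keeps n + 2 rows, rows 0 and -1 hold the begin/end sentinel vertexs, and a word starting at character i of the text lives in row i + 1 (hence the searcher.begin + 1 in gen_word_net). A dictionary-free sketch of the layout with hypothetical words:

    # coding=utf-8
    text = u"abcd"
    rows = [[] for _ in range(len(text) + 2)]  # rows 0 and -1 are sentinels
    rows[0].append(u"<begin>")
    rows[-1].append(u"<end>")

    # a word covering text[i:i + n] is stored in row i + 1
    for begin, word in [(0, u"ab"), (0, u"abc"), (2, u"cd")]:
        rows[begin + 1].append(word)

    for i, row in enumerate(rows):
        print u"[%d]: [%s]" % (i, u",".join(row))
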
/yaya/utility/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
--------------------------------------------------------------------------------
/yaya/utility/bytearray.py:
--------------------------------------------------------------------------------
1 | from io import FileIO
2 |
3 | __author__ = 'tony'
4 | import struct
5 |
6 |
7 | class ByteArray:
8 | @staticmethod
9 | def load_from_file(filename):
10 | f = FileIO(filename, 'rb')
11 | data = f.readall()
12 | return ByteArray(data)
13 |
14 | def __init__(self, data):
15 | self.data = data
16 | self.offset = 0
17 |
18 | def has_more(self):
19 | return self.offset < len(self.data)
20 |
21 | def next_ushort(self):
22 |         data = struct.unpack_from('!H', self.data, self.offset)  # unsigned; '!h' would go negative above 0x7FFF
23 | self.offset += 2
24 | return data[0]
25 |
26 | def next_uchar(self):
27 | data = struct.unpack_from('!B', self.data, self.offset)
28 | self.offset += 1
29 | return data[0]
30 |
--------------------------------------------------------------------------------
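ByteArray mirrors the big-endian layout of the bundled data files; a chartype record, for instance, is two unsigned shorts (a code point range) plus one unsigned char (the type). A round-trip sketch that fabricates a single record with struct (the values are arbitrary, chosen so the high range shows why '!H' rather than '!h' is required):

    import struct
    from yaya.utility.bytearray import ByteArray

    # one fabricated record: code points 0x4E00..0x9FA5 -> type 7
    ba = ByteArray(struct.pack('!HHB', 0x4E00, 0x9FA5, 7))
    while ba.has_more():
        begin = ba.next_ushort()
        end = ba.next_ushort()
        t = ba.next_uchar()
        print begin, end, t   # -> 19968 40869 7 ('!h' would read 0x9FA5 as negative)
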
/yaya/utility/chartype.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import time
3 | from yaya import config
4 | from yaya.const import logger
5 | from yaya.utility.bytearray import ByteArray
6 |
7 | __author__ = 'tony'
8 |
9 | CT_SINGLE = 5  # single-byte character
10 | CT_DELIMITER = CT_SINGLE + 1  # delimiter: "!,.?()[]{}+= etc.
11 | CT_CHINESE = CT_SINGLE + 2  # Chinese character
12 | CT_LETTER = CT_SINGLE + 3  # letter
13 | CT_NUM = CT_SINGLE + 4  # digit
14 | CT_INDEX = CT_SINGLE + 5  # ordinal / index character
15 | CT_OTHER = CT_SINGLE + 12  # everything else
16 |
17 | char_type = [CT_OTHER] * 65536  # type code per BMP code point, overwritten from the data file
18 |
19 |
20 | def __init__():
21 |     logger.info("loading char type table %s", config.CHAR_TYPE_PATH)
22 | start = time.time()
23 | byte_array = ByteArray.load_from_file(config.CHAR_TYPE_PATH)
24 | if byte_array is None:
25 | import sys
26 |         logger.error("failed to load char type table: " + config.CHAR_TYPE_PATH)
27 | sys.exit(-1)
28 | else:
29 | while byte_array.has_more():
30 | b = byte_array.next_ushort()
31 | e = byte_array.next_ushort()
32 | t = byte_array.next_uchar()
33 | for i in range(b, e + 1):
34 | char_type[i] = t
35 |     logger.info("char type table loaded in %s s", (time.time() - start))
36 |
37 |
38 | def get(c):
39 | if type(c) is not int:
40 | return char_type[ord(c)]
41 | else:
42 | return char_type[c]
43 |
44 |
45 | __init__()
46 |
--------------------------------------------------------------------------------
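Once the module has self-initialized, get() accepts either a single character or an integer code point. A quick sketch, assuming the chartype data file loaded successfully (the expected values depend on that file's contents):

    # coding=utf-8
    from yaya.utility import chartype

    print chartype.get(u'中') == chartype.CT_CHINESE   # expected: True
    print chartype.get(ord(u'7')) == chartype.CT_NUM   # expected: True
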
/yaya/utility/persistence.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
--------------------------------------------------------------------------------
/yaya/utility/singleton.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tony'
2 |
3 |
4 | def singleton(class_):
5 | instances = {}
6 |
7 | def get_instance(*args, **kwargs):
8 | if class_ not in instances:
9 | instances[class_] = class_(*args, **kwargs)
10 | return instances[class_]
11 |
12 | return get_instance
13 |
--------------------------------------------------------------------------------
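The decorator replaces the class name with a factory that caches one instance, which is presumably how dictionary wrappers such as CoreDict() can be constructed freely without reloading their data. A self-contained sketch:

    from yaya.utility.singleton import singleton

    @singleton
    class Counter(object):
        def __init__(self):
            self.n = 0

    a = Counter()
    b = Counter()
    a.n += 1
    print b.n      # -> 1, a and b are the same object
    print a is b   # -> True

Note that get_instance ignores its arguments once an instance exists, so constructor parameters only take effect on the very first call.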