├── .gitignore
├── .vscode
    ├── launch.json
    └── tasks.json
├── LICENSE
├── README.md
├── doc
    ├── advance.md
    └── grammer.md
├── rules
    ├── calparser
    ├── cnext
    ├── learn
    └── xmlparser
├── src
    ├── __init__.py
    ├── tngraph.py
    ├── tnnlp.py
    └── tnpy.py
└── test
    ├── chs.txt
    ├── learn.py
    └── sample.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | #log files
 7 | *.log
 8 | *.html
 9 | 
10 | # C extensions
11 | *.so
12 | 
13 | 
14 | # Distribution / packaging
15 | .Python
16 | env/
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | 
32 | # PyInstaller
33 | #  Usually these files are written by a python script from a template
34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *,cover
51 | .hypothesis/
52 | 
53 | # Translations
54 | *.mo
55 | *.pot
56 | 
57 | # Django stuff:
58 | *.log
59 | 
60 | # Sphinx documentation
61 | docs/_build/
62 | 
63 | # PyBuilder
64 | target/
65 | 
66 | #Ipython Notebook
67 | .ipynb_checkpoints
68 | 


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"version": "0.2.0",
 3 | 	"configurations": [
 4 | 		
 5 | 		{
 6 |     "name": "Python",
 7 |     "type": "python",
 8 |     "request": "launch",
 9 |     "stopOnEntry": true,
10 |     "program": "${file}",
11 |     "pythonPath": "D:/Anaconda3/python.exe",
12 |     "debugOptions": [
13 |         "WaitOnAbnormalExit",
14 |         "WaitOnNormalExit",
15 |         "RedirectOutput"
16 |     ]
17 | }
18 | 		
19 | 	]
20 | }


--------------------------------------------------------------------------------
/.vscode/tasks.json:
--------------------------------------------------------------------------------
1 | {
2 | 	// See http://go.microsoft.com/fwlink/?LinkId=733558
3 | 	// for the documentation about the tasks.json format
4 | 	"version": "0.1.0",
5 | 	"command": "echo",
6 | 	"isShellCommand": true,
7 | 	"args": ["Hello World"],
8 | 	"showOutput": "always"
9 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | tn是desert(沙漠之鹰)和tan共同开发的一种用于匹配，转写和抽取文本的语言（DSL）。并为其开发和优化了专用的编译器。基于递归下降方法和正则表达式，能解析自然文本并转换为树和字典，识别时间，地址，数量等复杂序列模式。
  2 | github地址：https://github.com/ferventdesert/tnpy
  3 | 
  4 | 语法介绍
  5 | 
  6 | ## 0.设计理由
  7 | 
  8 |  字符串分析和处理几乎是每个程序员必备的工作，简单到分割类似"1,2,3,4"这样的字符串，稍微复杂一些如字符串匹配，再复杂如编译和分析SQL语法。字符串几乎具有无穷的表达能力，解决字符串问题，就解决了计算机90%的问题。
  9 | 
 10 |   虽然字符串处理如此深入人心，但当分割字符时，本来都是按照逗号分割的，突然出现分号，程序就可能出错。再如日期处理，每个程序员肯定都对各种奇怪诡异的时间表达方式感到头疼，处理起来非常费时。这些功能，几乎只能以硬编码实现。它们是与外界交互的最底层模块，然而却如此脆弱。
 11 | 
 12 | >* 如何将“一百二十三”转换为数字？
 13 | >* 如何将“2013年12月14日”识别为时间并转换为时间类型？
 15 | >* 如何分析一个XML或JSON文件？
 16 | 
 17 | 正则表达式虽提供了强大的匹配功能，成为必备的工具，但它有不少局限，我们扩展了正则表达式引擎，使之能力大大增强。
 18 | 在线演示：http://www.desertlambda.com:81/extracttext.html
 19 | 
 20 | ## 1. 如何学习?
 21 | 
 22 | 基本上程序员都读过“30分钟学会正则表达式”这篇文章吧？最后没几个人能在30分钟内就读完它。不过相信我，TN引擎只需要15分钟就可以学会。
 23 | 
 24 | 详细的语法说明在这里：
 25 | 
 26 | [tn基本语法][1]
 27 | 
 28 | [使用tn构造自然语言计算器][2]
 29 | 
 30 | [tn实现的xml解析器][3]
 31 | 
 32 | TN可以实现文本的匹配，转写和信息抽取，可以理解为模板引擎的逆运算。简单的操作用正则表达式更方便，但不少问题是正则无法解决的。这时就需要使用TN了。
 33 | 
 34 | TN的解释器有Python,C#和C三种版本。C#版本已经不再维护。使用C#或Java等语言的，建议使用IronPython或Jython进行跨语言编译。
 35 | 
 36 | tnpy是tn的Python解释器，Python良好的可读性让代码写起来非常方便，代码不超过1000行，单文件，无第三方库依赖。推荐使用Python3。
 37 | 
 38 | tn是解释型语言，需要编写规则文件，并使用tnpy加载，再对文本进行处理。
 39 | 
 40 | ## 1. 基础的匹配和替换：
 41 | 
 42 | 首先我们先编写一个最简单的规则文件learn，内容如下：
 43 | 
 44 | ```
 45 | #%Order% 1
 46 | hello= ("你好");
 47 | ```
 48 | 接着，执行下面的python代码：
 49 | ```
 50 | from src.tnpy import RegexCore
 51 | core = RegexCore('../rules/learn')
 52 | matchs=core.Match('领导你好！老婆你好');
 53 | for m in matchs:
 54 |     print('match',m.mstr, 'pos:',m.pos)
 55 | ```
 56 | 引入tnpy命名空间，之后从learn规则文件初始化引擎，匹配该文本:
 57 | 
 58 | ```
 59 | success load tn rules:../rules/learn
 60 | match 你好 pos: 2
 61 | match 你好 pos: 7
 62 | ```
 63 | 
 64 | 上面输出了文本的匹配结果和位置。当然这一点正则也能做到。
 65 | 
 66 | 如果我们匹配的是`领导你好，老婆您好`，并想把所有的`你好`和`您好`，都转写为`hello`。
 67 | 
 68 | 为此我们添加hello2和hello3两个子规则：
 69 | 
 70 | ```
 71 | hello2= $(hello)| ("您好");
 72 | #%Order% 1
 73 | hello3= $(hello2) : (//:/hello/);
 74 | ```
 75 | 
 76 | `hello2`引用了刚才的`hello`规则，同时添加了`“您好”`。
 77 | 
 78 | hello3是主规则，负责将`hello2`匹配的内容都转写为`hello`
 79 | 
 80 | `（$代表引用一条规则，|表示将几个规则并列排列，匹配最长的那个规则，:代表转写。）`
 81 | 
 82 | 执行下面的代码：
 83 | 
 84 | ```
 85 | print(core.Rewrite('领导你好！老婆您好'));
 86 | ```
 87 | 
 88 | 结果为：
 89 | ```
 90 | 领导hello！老婆hello
 91 | ```
 92 | 
 93 | 如果我们想替换顺序，把“你好”放在前面呢？可以这样写：
 94 | 
 95 | ```
 96 | people= ("老婆") | ("领导");
 97 | #%Order% 1
 98 | reorder= $(people) $(hello3) : $2 $1;
 99 | ```
100 | 
101 | 先用`people`定义如何描述`老婆，领导`，然后用reorder来修改顺序， 注意reorder是个**顺序结构**，people匹配老婆和领导，hello3匹配您好/你好，并将其转换为`hello`。 `$2和$1`修改了转写顺序，执行Rewrite后输出:
102 | 
103 | ```
104 | hello领导！hello老婆
105 | ```
106 | 
107 | 我们把类似`$(name1) $(name2)`的结构，称为顺序表达式，把`$(name1) | $(name2) `称为或表达式。
108 | 如果将刚才所有的规则绘制成图，则是下面的样子：
109 | 
110 | ![foo.png-34.5kB][4]
111 | 
112 | ## 2. 正则表达式
113 | 
114 | 仅仅使用文本，表现力太差了。我们引入正则表达式来完成，正则表达式需要放在(//)中，注意和文本("")的区别。
115 | 
116 | 如果要进行转写，则标注为`(/match/:/rewrite/)`; 下面的表达式将所有的长空白符转换为一个空白符:
117 | 
118 | ```
119 | byte_det_space = (/ */://);
120 | ```
121 | 
122 | 下面将所有字母转换为空白：
123 | 
124 | ```
125 | low_letter_to_null = (/[a-z]/ ://);
126 | #或者下面:
127 | low_letter= (/[a-z]/);
128 | translate= $(low_letter) : ("");
129 | ```
130 | 
131 | 觉得没有挑战？我们接着看下面的。
132 | 
133 | ### 3. 复杂组合：中文数字转阿拉伯数字
134 | 
135 | 二十三如何转换为23？这种用普通的编程会比较困难。我们尝试用TN解决，会发现一点都不难。
136 | 先定义汉字的一二三到九转换为1-9，你肯定会写出这样的规则：
137 | 
138 | ```
139 | #定义0-9
140 | int_1 = ("一" : "1");
141 | int_0 =("零" : "0");
142 | int_2  = ("二" : "2") | ("两" : "2");
143 | int_3_9 = ("三" : "3") | ("四" : "4") | ("五" : "5") | ("六" : "6") | ("七" : "7") | ("八" : "8") | ("九" : "9");
144 | int_1_9 = $(int_1) | $(int_2) | $(int_3_9) | (/\d/);
145 | int_0_9 = $(int_0) | $(int_1_9);
146 | int_del_0 = (/零/ : /0/) |  (// : /0/);
147 | int_0_9_null = $(int_del_0) |  $(int_0_9);
148 | ```
149 | 
150 | 之所以要把0,1,2分开写，是因为这些数有特殊情况，如两和二都代表2，需要在后面特殊处理。
151 | 上面的`int_0_9_null`规则，就可以把`五七零二`转写为`5702`。但没法处理`二十三`这样的情况。
152 | 
153 | 再定义下面的规则，这样`一十三`可以转写为`13`
154 | 
155 | ```
156 | int_del_0 = (/零/ : /0/) |  (// : /0/);
157 | int_0_9_null = $(int_del_0) |  $(int_0_9);
158 | #定义10，十
159 | int_1_decades = (/十/ : /1/) | (/一十/ : /1/);
160 | ```
161 | 
162 | 再加上下面的规则，int_1_9_decades定义了十位数如何转写，而int_10_99定义了从十到九十九的转写规则。
163 | 
164 | ```
165 | int_10_99 = $(int_1_9_decades) $(int_0_9_null)  | (/[1-9][0-9]/) ;
166 | int_1_99 = $(int_1_9) | $(int_10_99) ;
167 | int_01_99 =  $(int_1_9) | $(int_10_99) | (/\d{1,2}/);
168 | 
169 | #%Order% 3
170 | int_0_99 =  $(int_0) | $(int_1_9) | $(int_10_99);
171 | ```
172 | 
173 | 看看下面的例子：
174 | `print({r:core.Rewrite(r) for r in ['十','三十七','一十三','68']});`
175 | 运行结果:
176 | `{'一十三': '13', '68': '68', '十': '10', '三十七': '37'}`
177 | 是不是感到很神奇？三十七是如何被转写为37的？
178 | 
179 | 仔细看规则，规则自底向上构造成了一棵规则树，int_0_99是整棵树的根节点。结构如下图：
180 | ![foo.png-132.1kB][5]
181 | 下面的log文件给出了匹配过程:
182 | 
183 | ```
184 | int_0_99,Table,Raw  =三十七
185 |   int_0,String,Raw  =三十七
186 |   int_0,String,NG
187 |   int_1_9,Table,Raw  =三十七
188 |     int_1,String,Raw  =三十七
189 |     int_1,String,NG
190 |     int_2,Table,Raw  =三十七
191 |       int_2_merge,Regex,Raw  =三十七
192 |       int_2_merge,Regex,NG
193 |     int_2,Table,NG
194 |     int_3_9,Table,Raw  =三十七
195 |       int_3_9_merge,Regex,Raw  =三十七
196 |       int_3_9_merge,Regex,Match=三
197 |     int_3_9,Table,Match=三
198 |     int_1_9_3,Regex,Raw  =三十七
199 |     int_1_9_3,Regex,NG
200 |   int_1_9,Table,Match=三
201 |   int_10_99,Table,Raw  =三十七
202 |     int_10_99_0,Sequence,Raw  =三十七
203 |       int_1_9_decades,Table,Raw  =三十七
204 |         int_1_decades,Table,Raw  =三十七
205 |           int_1_decades_0,Regex,Raw  =三十七
206 |           int_1_decades_0,Regex,Match=十
207 |           int_1_decades_1,Regex,Raw  =三十七
208 |           int_1_decades_1,Regex,NG
209 |         int_1_decades,Table,Match=十
210 |         int_1_9_decades_1,Sequence,Raw  =三十七
211 |           int_1_9,Table,Raw  =三十七
212 |           int_1_9,Table,Buff =三
213 |           unknown,Regex,Raw  =十七
214 |           unknown,Regex,Match=十
215 |         int_1_9_decades_1,Sequence,Match=三十
216 |       int_1_9_decades,Table,Match=三十
217 |       int_0_9_null,Table,Raw  =七
218 |         int_del_0,Table,Raw  =七
219 |           int_del_0_0,Regex,Raw  =七
220 |           int_del_0_0,Regex,NG
221 |           int_del_0_1,Regex,Raw  =七
222 |           int_del_0_1,Regex,Match=
223 |         int_del_0,Table,Match=
224 |         int_0_9,Table,Raw  =七
225 |           int_0,String,Raw  =七
226 |           int_0,String,NG
227 |           int_1_9,Table,Raw  =七
228 |             int_1,String,Raw  =七
229 |             int_1,String,NG
230 |             int_2,Table,Raw  =七
231 |               int_2_merge,Regex,Raw  =七
232 |               int_2_merge,Regex,NG
233 |             int_2,Table,NG
234 |             int_3_9,Table,Raw  =七
235 |               int_3_9_merge,Regex,Raw  =七
236 |               int_3_9_merge,Regex,Match=七
237 |             int_3_9,Table,Match=七
238 |             int_1_9_3,Regex,Raw  =七
239 |             int_1_9_3,Regex,NG
240 |           int_1_9,Table,Match=七
241 |         int_0_9,Table,Match=七
242 |       int_0_9_null,Table,Match=七
243 |     int_10_99_0,Sequence,Match=三十七
244 |     int_10_99_1,Regex,Raw  =三十七
245 |     int_10_99_1,Regex,NG
246 |   int_10_99,Table,Match=三十七
247 | int_0_99,Table,Match=三十七
248 | ```
249 | 
250 | 引擎从文本的左向右，沿着规则树寻找最长的文本，如果在一个顺序表达式上的任何一步失败，那么整个顺序表达式被抛弃。或表达式会遍历每个子表达式，直到发现最长的那个，返回结果。具体的匹配原理，以及优化，会在专门的文章中介绍。
251 | 
252 | ## 4. 由规则构造更复杂的规则
253 | 
254 | 自然而然的，知道怎么定义三十七，就可以定义五百三十七，那不过是`int_1_9_hundreds+int_0_99`（这个已经定义过了）。
255 | 
256 | ```
257 | int_1_9_hundreds = $(int_1_9) ("百" : "");
258 | int_100_999 =   $(int_1_9_hundreds) ("" : "00") |  $(int_1_9_hundreds) $(int_10_99);
259 | int_1_999 = $(int_1_99) | $(int_100_999);
260 | ```
261 | 
262 | `int_1_999`可以处理类似五百三十七这样的问题！
263 | 
264 | 进而，我们可以处理几千，几万，这个延伸到万以后，就可以自然而然地衍生出亿，万亿的表达。
265 | 
266 | 如何处理负数？这还不简单！
267 | 
268 | ```
269 | signed_symbol0 = ("正" : "") | ("负" : "-") | ("正负" : "±") | ("\+" : "+") | ("\-" : "-") | ("±" : "±") ; 
270 | signed_symbol = $(signed_symbol0) | $(null_2_null);
271 | ```
272 | 
273 | 接下来，我们默认正整数为`integer_int`，那么，整数（包含正负）就是：
274 | 
275 | `integer_signed = $(signed_symbol) $(integer_int)`
276 | 
277 | ## 5. 属性提取
278 | 沿着刚才的路，我们自然而然地能定义分数，但仅仅是转写还不够，遇到三分之一，我们不仅要将其处理为1/3，还要计算出它的值，这就涉及到属性抽取。也就是把信息从文本中提取为字典。
279 | 
280 | 分数，不过是`整数+分之+整数`，可以定义成下面的形式：
281 | 
282 | ```
283 | fraction_cnv_slash = ("分之" : "/");
284 | fraction2 = ("/" : "/");
285 | percent_transform= ("%" : "100") | ("‰" : "1000");
286 | #%Type% DOUBLE
287 | #%Property%  Denominator,,Numerator| Numerator ,, Denominator | Denominator ,, Numerator 
288 | #%Order% 101
289 | fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1
290 |     | $(integer_int) $(fraction2) $(integer_int)
291 |     | $(pure_decimal) ("" : "/") $(percent_transform);
292 | ```
293 | 
294 | 这个有点复杂，但容我慢慢讲解。分数有三种情况，如刚才的`三分之一`，或是`1/3`，或是`30%`。分别对应上面`fraction`规则的三个子规则。仔细地看上面的规则，不难理解。
295 | 
296 | 值得注意的是Property这个标签，该标签定义了如何抽取信息。也是用竖线分隔，每个名称对应下面的一个子规则，为空的直接跳过。那么“十三分之二十四”中，“十三”就对应Denominator，而“二十四”对应Numerator。来测试一下：
297 | 
298 | `print(core.Extract('十三分之二十四',entities=[core.Entities['fraction']]))`
299 | 
300 | 我们用Extract函数来抽取文本，返回的是一个由字典组成的列表，entities是可选参数，我们限制只用fraction规则来匹配，获得输出：
301 | 
302 | ```
303 | [{'Numerator': '24', '#rewrite': '24/13', '#type': 'fraction',
304 | '#match': '十三分之二十四', 'Denominator': '13', '#pos': 3}]
305 | ```
306 | 
307 | 是不是很赞？
308 | 
309 | ### 6.嵌入Python脚本
310 | 
311 | 有一种需求还没谈到，将所有的大写字母转换为小写字母，你可能会想定义26个字符串规则，并用或表达式来拼接起来吧？这样太费事了。我们可以直接这样：
312 | 
313 | `low_to_up_letter =  (/[A-Z]/) : "str.lower(mt)";`
314 | 
315 | `[A-Z]`匹配了所有的大写字母，将匹配结果送到后半段的转写，内置的解释器会执行那段python代码，将其转换为小写，mt代表前面表达式的匹配串，rt代表转写串。好在`[A-Z]`不执行转写，可以认为`mt==rt`.
316 | 
317 | 这是在转写过程中嵌入python的例子，还能在匹配时嵌入脚本：
318 | 
319 | `foo = "findsecret" : "print(mt)";`
320 | 
321 | 前面的findsecret函数负责在字符串中找到“神秘文本”，后面的转写代码打印出来，并将原始的字符返回…
322 | 
323 | ## 7. 你在15分钟内读完了么？
324 | 
325 | 我相信你没有，因为读懂那个匹配规则的日志文件，就需要最少五分钟，但如果你有编译原理和正则基础的话，还是能很快理解的。而从零开发这个引擎，到反复优化和完善，花了一年之久。
326 | 
327 | 定义了各种数字之后，我们就能很快地定义时间，日期，电话号码，地址…而你看到的只是TN语言的冰山一角。
328 | 
329 |  - 它能够分析文本的模式，解析诸如ABCABC这样的序列，从而发现这是一个重复模式。
330 | 
331 |  - 不仅能够顺序匹配，还能逆向，甚至乱序匹配，这就能够抽取类似“学校的校训”这样的问题。
332 | 
333 |  - 规则可以调用自身，配合脚本，因此能够实现递归下降解析。例如30行代码实现xml解析，或20行规则实现自然语言计算器。
334 | 
335 |  - 规则可以嵌入脚本，甚至动态生成代码，因此，甚至在理论上，TN能够自己编译自己。
336 | 
337 |  - TN还能做一个简单的SQL解释器，或是中文英文的简单互相翻译的工具。
338 | 
339 | 是不是已经激动地颤抖了？唯一限制你能力的就是你的想象力。本博客将会进一步发布一系列有关tn的内容，包括高级语法，
340 | tn优化等。
341 | 
342 | 感兴趣的可以联系作者
343 | 
344 | 
345 |   [1]: http://www.cnblogs.com/buptzym/p/5355827.html
346 |   [2]: http://www.cnblogs.com/buptzym/p/5361121.html
347 |   [3]: http://www.cnblogs.com/buptzym/p/5355920.html
348 |   [4]: http://static.zybuluo.com/buptzym/ksl5ggrfcn1psmdf2f81i8wg/foo.png
349 |   [5]: http://static.zybuluo.com/buptzym/itwhlmz8ua2h3jgbqdq5z48g/foo.png
350 | 


--------------------------------------------------------------------------------
/doc/advance.md:
--------------------------------------------------------------------------------
  1 | 标签（空格分隔）： 未分类
  2 | 
  3 | ---
  4 | 
  5 | ## 高级操作
  6 | ### 1. 脚本表达式
  7 | 
  8 | 用双引号包含的脚本被称为脚本表达式，目前支持嵌入Python。 脚本表达式只能在顺序表达式中使用。代码可以在三个位置存在:
  9 | |位置|功能|例子|
 10 | |--|--|--|
 11 | |匹配(match)|在字符串中匹配字符| match(m.mstr)|
 12 | |转写(rewrite)|对匹配完成的串转写| str.lower(m.mstr)|
 13 | |条件(condition)|判断转写条件是否满足||
 14 | 
 15 | 由于tn本身所带的匹配和转写功能一般足够使用，所以脚本在匹配和转写中只是作为补充，而条件是最需要嵌入脚本的。
 16 | 
 17 | 由于脚本表达式用双引号表示，为了避免语法解析出现错误，因此在Python代码中需要用单引号来表示字符串。
 18 | 
 19 | **例1**
 20 | `rule= $(rule0) $(rule1) $(rule2) : $(rewrite1) $(rewrite2) "m.mstr+'haha'"`
 21 | 
 22 | rewrite1负责转写rule0，rewrite2转写rule1, 后面的脚本表达式转写rule2：
 23 | 
 24 | m代表rule2所匹配的结果。这个结果称为MatchResult(可参考tnpy源代码)，它有如下属性：
 25 | ```
 26 | ot       #原始输入字符串
 27 | m.mstr   #匹配串
 28 | m.rstr   #转写串
 29 | m.pos    #匹配得到的位置
 30 | ```
 31 | 
 32 | **例2**
 33 | 
 34 | `rule= $(rule0) $(rule1) $(rule2) : "m[0].mstr+m[1].mstr+m[2].mstr";`
 35 | 
 36 | 以上脚本，将三个规则的匹配字符串加起来返回。
 37 | 
 38 | 转写部分只有一个规则时，该规则需要转写匹配部分里的全部内容，形参为m[0],m[1]...，就像这个例子描述的样子。
 39 | 
 40 | 但如果转写部分有多个规则，则转写部分的规则数量必须和匹配部分的规则数量一致，一一对应：
 41 | 
 42 | `rule= $(rule0) $(rule1) $(rule2) : "m.mstr" "m.mstr" "m.mstr";`
 43 | 
 44 | 此时，三个脚本表达式分别承载前面的三个顺序规则。由于对应的只有一个规则，所以m等价于m[0]。
 45 | 
 46 | **例3**
 47 | 
 48 | `low_to_up_letter =   $(low_letter) : "unicode.upper(m.mstr)";`
 49 | 
 50 | `$(low_letter)`匹配了小写字母，后面的表达式将前面表达式匹配后的结果转换为大写，并返回。
 51 | 
 52 | **例4**
 53 | 
 54 | `unit_electric = $(integer_decimal) $(unit_tabl_electric) :: "abs(e('unit_kywd_electric').pos-m[0].pos)<33"`
 55 | 
 56 | 这个例子稍微复杂一些，例如识别30m到底是30米还是30兆字节，就取决于文本附近有没有相应的关键字。
 57 | 
 58 | `unit_kywd_electric`规则定义如下：
 59 | 
 60 | `unit_kywd_electric = ("速度") | ("网速") | ("电脑") | ("导体")...;#省略一部分`
 61 | 
 62 | 上面的脚本，`$(integer_decimal)`匹配30, `$(unit_tabl_electric)`匹配m，
 63 | 
 64 | e函数在原始字符串中匹配`unit_kywd_electric`实体，之后判断这个实体在字符串中的位置与m[0]的位置之差的绝对值是否小于33，用来确定这是否是信息计量单位。
 65 | 
 66 | 上面的表达式有些复杂，同时，当e函数匹配失败返回None，那么程序就会报错，因此可以修改为
 67 | `dist('unit_kywd_electric',0)<33`
 68 | 
 69 | dist是tnpy里内置的一个函数：
 70 | 
 71 | ```
 72 | def dist(name, i=0):
 73 |     header = e(name)
 74 |     if header is None:
 75 |         return int_max
 76 |     return abs(header.pos - m[i].pos)
 77 | ```
 78 | 
 79 | tn脚本不建议（也不能够）写入多行python代码，因此为了安全和方便，可以自行定制函数来方便匹配和转写，tnpy会将这些函数嵌入到引擎当中，成为闭包函数，例如：
 80 | 
 81 | `#%Script% extends`
 82 | 
 83 | 这样就导入了extends.py库
 84 | 
 85 | ### 2. 使用纯Python编写规则
 86 | 前面提到，之所以为tn定义一套特别的语法，是为了方便能够跨语言实现解析。这种TN语法能够用正则表达式方便的进行词法分析和语法分析，具体细节可参考tnpy源代码。
 87 | 
 88 | 但是，我们也可以使用纯Python来编写规则，这样有很多好处，可以内嵌其他实体类型，进一步扩展语言的功能。也能借助现成的Python编译器，及时发现未引用的规则。
 89 | 
 90 | 规则需要先引入实体：
 91 | ```
 92 | from tnpy import StringEntity as SE, RegexEntity as RE, TableEntity as TE, SequenceEntity as SQE, RepeatEntity as RPE
 93 | ```
 94 | 接下来我们就能够定义不同的规则了：
 95 | ```
 96 | build = SE('成立于', '建成了');
 97 | splitkw0 = RE('^|[,\.。，和\r\n]');
 98 | quotekw = TE([RE('校训'), RE('育人精神')]);
 99 | quote0 = RE('"([^"]+)"', '$1');
100 | anything = RE('.*');
101 | ```
102 | 得益于Python非常fancy的语法，build实际上是`("成立于":"建成了")`, quotekw则是两个正则的或表达式。
103 | 
104 | 下面定义了一个顺序表达式，是不是可读性也很强呢？
105 | 
106 | `quote1 = SQE([quotekw, anything, quote0, anything, splitkw0])`
107 | 
108 | python版本的规则和tn规则也能相互引用，tn规则可以直接引用py规则，而py规则想要引用tn规则，则需要通过REF按名称引用：
109 | 
110 | `quote_example= SQE([REF('quote')],[rewriterule])`
111 | 
112 | 我简直深深地爱上了Python。
113 | 
114 | ### 3. 结合NLP和词性
115 | 
116 | 原始的tnpy，为了保证代码的纯粹性，没有加入这些功能，如果我们希望匹配
117 | 
118 | `**名词**确实是**形容词**`
119 | 
120 | 这样的表达，难道要把所有的名词和形容词都列进去吗？这显然是不必要的。**tnnlp**模块就是解决这个问题的。tnnlp已经添加入tnpy核心库中了。
121 | 
122 | 使用时也很简单：
123 | 
124 | `from tnnlp import NEREntity as NE,WordEntity as WE;`
125 | 
126 | 于是，“地名”建成于“时间”，就能用下面的表达式来解决：
127 | ```
128 | time2 = SQE([NE('nt'), build, 'date_fix'], rewriteOrders=[2, 1, 0]);
129 | ```
130 | 
131 | `rewriteorders=[2,1,0]`等价于tn规则里的```$3,$2,$1```.
132 | 
133 | 其中，`NE`代表一个实体，`nt`为地名;类似地，`n`是名词，`ad`是形容词。  NLP使用了结巴分词作为分词和词性标注的方法。
134 | 
135 | ### 4. 使用词库
136 | 
137 | 如果我们想匹配“程序员”是伟大的职业这样的表达，那么就需要把程序员或是某种工种的所有表达全部列出来。这个工作量太大了。
138 | 
139 | 同样，描述“好”的形容词也有很多，都列出来也会浪费大量的时间。解决这个问题的办法，就是使用词库。
140 | 
141 | tnnlp使用了哈工大标注的一份语料库：
142 | 
143 | ```
144 | Aa01A07= 者 手 匠 客 主 子 家 夫 翁 汉 员 分子 鬼 货 棍 徒
145 | Aa01A08= 每人 各人 每位
146 | Aa01A09= 该人 此人
147 | Aa01B01= 人民 民 国民 公民 平民 黎民 庶 庶民 老百姓 苍生 生灵 生人 布衣 白丁 赤子 氓 群氓 黔首 黎民百姓 庶人 百姓 全民 全员 萌
148 | Aa01B02= 群众 大众 公众 民众 万众 众生 千夫
149 | Aa01B03# 良民 顺民
150 | Aa01B04# 遗民 贱民 流民 游民 顽民 刁民 愚民 不法分子 孑遗
151 | Aa01C01= 众人 人人 人们
152 | Aa01C02= 人丛 人群 人海 人流 人潮
153 | Aa01C03= 大家 大伙儿 大家伙儿 大伙 一班人 众家 各户
154 | ```
155 | 890KB的词库，定义了大概几十万个词，并使用树结构来将词义索引起来，例如，所有A开头的都是和人物有关的，后面的标注进一步做了分类。
156 | 
157 | 因此，你可以使用下面的表达，来描述Ae06节点下的所有词汇：
158 | ```
159 | word= WE('Ae06');
160 | rewrite= RE('.+','$1是一种伟大的职业');
161 | wordme= SQE([word],[rewrite])
162 | ```
163 | 
164 | 一旦遇到`Ae06`分支下的词，就会自动将其转换为xxx是一种伟大的职业。
165 | 
166 | 这也是写纯Python规则的好处，可以方便地定制类型，扩展核心引擎的功能。
167 | 
168 | ### 5. 乱序匹配
169 | 
170 | 以提取校训为例，校训一般来说有以下几种表达：
171 | 
172 | 语句1：`北京邮电大学的校训是“厚德博学，敬业乐群”。`
173 | 
174 | 语句2：`“学为人师，行为世范”是北师大启功先生提出的校训。“为学生着想”….`
175 | 
176 | 如果用正则提取离校训最近的双引号的内容，可能会出错，因为前后可能还有其他双引号标注的内容，如上面的“为学生着想”。
177 | 
178 | 因此，想抽取校训主要有三个特征：**校训关键字**，**双引号**和**标点符号**。
179 | ```
180 | quote1 = SQE([quotekw, anything, quote0, anything, splitkw0], matchorders=[5, 1, 2, 1, 4]);
181 | quote2 = SQE([splitkw0,anything,quote0, anything, quotekw ], matchorders=[4, 1, 3, 1, 5]);
182 | quote = TE([quote1,quote2])
183 | ```
184 | (其他规则都已经在上面定义过了)
185 | 
186 | quote规则描述了两种类型`quote1`和`quote2`, 对quote1来说，要匹配语句1，匹配按照优先次序`5,1,2,1,4`，先匹配quotekw,找到了`校训`两字，再找分隔符，找到了句号。此时就把整个句子夹逼到了。
187 | 
188 | `校训是“厚德博学，敬业乐群”`，再匹配`quote0`,把实际的双引号中的校训提取出来。最终两个anything匹配`是`和`null`。
189 | 
190 | 对quote2来说，匹配语句2，quotekw匹配了`校训`,splitkw0匹配了句首,quote0匹配了`“学为人师，行为世范”`,anything匹配了夹逼后剩下的部分。
191 | 
192 | 乱序匹配本质上，是**通过定义匹配顺序，人为地通过`围栏`分割句子，将句子分割成树，然后在子节点上再进行匹配**，这就解决了顺序表达式难以解决的问题。
193 | 
194 | 其实，这里依旧有问题没能解决。如果我们想匹配ABC这三个字母的全排列，如CBA,CAB... 总共有6种方法，总不至于手工编写所有的匹配模式吧？这还只是三种，数量更多之后，手工编写就变得不可行了。那如何实现高效匹配呢？**此事我依旧没有思路**。
195 | 
196 | 
197 | ##6.总结
198 | 本文介绍了tn的高级语法，本质上tn是可以被任意改造和扩展的。因此不应当拘泥于本身提供的文法，而是按照自己的需求自行定制。之后会介绍tn的性能优化，用于模式匹配的技巧和实现原理。


--------------------------------------------------------------------------------
/doc/grammer.md:
--------------------------------------------------------------------------------
  1 | > tn是desert和tan共同开发的一种用于匹配，转写和抽取文本的语言。解释器使用Python实现，代码不超过1000行。
  2 | 
  3 | 本文主要介绍tn的基本语法。高级内容可以参考其他篇章。使用这样的语法，是为了实现语言无关，从而方便地编写不同语言的解释器。
  4 | 
  5 | ##基本语法
  6 | 引擎可以由一组规则构成，规则也可以被其他规则所组合。首先介绍最基本的元规则 。 
  7 | 
  8 | ###1. 字符串StringEntity
  9 | ```Form1: ("Matched string")
 10 | Form2: ("Matched string" : "Rewritten string")
 11 | ```
 12 | Form1是一种省略表达，即Rewritten==Matched
 13 | 样例:
 14 | ```("0" : "零") # 将 "0" 转写成 "零"
 15 | ("" : " ") # 在指定的地方插入一个空格
 16 | ("kg" : "kilogram") # 将 "kg" 或 "Kg" 扩展成 "kilogram" 
 17 | ```
 18 | ###2. 正则表达式RegexEntity
 19 | ```
 20 | Form1: (/Matched expression/)
 21 | Form2: (/Matched expression/ : /Rewritten expression/)
 22 | ```
 23 | 
 24 | 样例:
 25 | ```
 26 | (/\s+/ : / /) #将一串连续的空格与换行符合并为一个空格
 27 | (/(\d+)\s?(-|~)\s?(\d+)/ : /$1 to $3/) #将 "15~20 dollars" 改写成 "15 to 20 dollars"
 28 | ```
 29 | 
 30 | 将Matched匹配到的字符串替换成Rewritten所表示的字符串。这里的正则表达式符合Perl正则规范。Form1只能作为匹配规则而不能作为转写规则，如果Rewritten为空，则只匹配不转写。Rewritten并不是真正的正则表达式，它仅支持普通字符串与`$1, $2, ..., $99`这样的引用，其中`$n`表示Matched expression匹配到的第n个Entity。 
 31 | 
 32 | ###3. 脚本表达式 ScriptEntity
 33 | 可以在文法中嵌入脚本，具体的语法规则由引擎所决定，目前可以嵌入Python。（详情可参考高级语法） 
 34 | 
 35 | -------
 36 | 其他各类表达式，都是由这三类表达式进行组合得到的。它们的并（或操作），连接和差操作，构成了以下三类复合实体。这三种操作与正则表达式的三类基本操作一致。
 37 | 表达式需要被其他表达式引用时，就需要为其命名，例如：
 38 | `entity= (/\s+/ : / /) ;`
 39 | 这样就表达了一个名称为entity的正则表达式。名称与c语言的变量命名规则一致。中间由=连接。最后由分号结束。
 40 | 
 41 | 当引用其他表达式时，可以用$(RuleName)表达。
 42 | 
 43 | 
 44 | --------------------------------------------------------------------------------
 45 | 
 46 | ###4. 或表达式 TableEntity
 47 | `Form: Table_name =Entity1 | Entity2 | …`
 48 | 样例:
 49 | ```
 50 | digit_0_to_9 = ("0" : "nol") | ("1" : "satu") | ("2" : "dua") | ("3" : "tiga") | ("4" : "empat") | ("5" : "lima") | ("6" : "enam") | ("7" : "tujuh") | ("8" : "delapan") | ("9" : "sembilan"); #印尼语数字 0~9 的Map 表
 51 | integer_int_extend = $(integer_int) | ("百" : "100") | ("千" : "1000") | ("万" : "10000") | ("亿" : "100000000");
 52 | ```
 53 | 
 54 | integer_int_extend规则就是由integer_int和其他四个StringEntity构成的。
 55 | 或表达式中间的分隔符有两种，竖线|和斜杠/。 以竖线分割的实体是平级的，会对每一个子表达式进行匹配，找出离字符串起始位置最近且匹配到的字符串最长的那个子表达式。而以斜杠分割的实体，被看做一组(Group)，一旦匹配，就不会匹配之后的表达式。可以在表达式中指定多个组合平级实体。
 56 | 看下面的例子：
 57 | `grouptest= (/CD/) | (/ABC/) / (/AB/) | (/ABCD/);`
 58 | 该规则分成了两组，在匹配ABCD时，前一组已经匹配了ABC,因此就不会继续向后匹配到ABCD。因此该规则最终匹配的结果是ABC. 
 59 | 
 60 | ###5. 序列表达式 SequenceEntity
 61 | 序列表达式描述了表达式的连接。序列从左到右依次匹配，一旦出现不能匹配的情况，则整个序列匹配失败。注意，序列匹配的字符串必须是相邻的。
 62 | 
 63 | ```
 64 | integer_0_to_99 = $(integer_0_to_9) | $(integer_teens)
 65 | | $(integer_decades) $(del_0)
 66 | | $(integer_decades) $(ins_space) $(integer_1_to_9) $(ins_space);
 67 | ```
 68 | 
 69 | 这个表达式实际上是一个TableEntity，后两个子表达式是SequenceEntity。该表达式可以转写0~99范围内的整数。
 70 | 
 71 | 匹配211时它首先用第一个integer_0_to_9能匹配到 '2'，再用第二个integer_teens能匹配到 "11"，再用第三个表达式匹配失败，再用第四个Sequence能匹配到 "21"，最终选择离起点最近且匹配到的字符串最长的那一个进行转写：
 72 | `211 ：twenty one`
 73 | 序列表达式可以完成转写和顺序调整。例如：
 74 | `fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1 `
 75 | 
 76 | 三分之一转写为1/3，integer_int_extend可以匹配‘三’, fraction_cnv_slash可匹配 '分之' , integer_int可匹配'一'。 $3 $2 $1 对其顺序进行了重排。
 77 | 
 78 | ###6.重复表达式RepeatEntity
 79 | ```
 80 | Form1: Repetition_name = $(an_entity)+;
 81 | Form2: Repetition_name = $(an_entity){m,n};
 82 | ```
 83 | 由一条需要重复的规则、要重复的次数以及结尾的分号组成。需要重复的规则有且仅有一条。所以不能写成
 84 | `error_example= $(an_entity0) $(an_entity){m,n}; `
 85 | 
 86 | m到n次，m是≥0的整数，n是≥0的整数或-1，为-1时表示不限制重复次数。 
 87 | 
 88 | 这与正则表达式的规则基本一致。 
 89 | 
 90 |  
 91 | ###7.差集表达式DiffEntity
 92 | ```
 93 | Form1: Difference_name = $(Universe) - $(complement);
 94 | Form2: Difference_name = $(Universe) - $(complement1) - $(complement2) - …; 
 95 | ```
 96 | 由一组Complement以及结尾的分号组成。有且仅有一个Universe，后面用减号可以跟多个表达式。
 97 | 当Universe表达式能匹配且其他complement不能匹配时成立。例如：
 98 | ```
 99 | integer_1_to_9 = $(integer_0_to_9) - ("0" : "nol"); # 整数1~9
100 | integer_2_to_999 = $(integer_0_to_999) - $(digit_1) - $(digit_0); # 整数2~999 
101 | ```
102 | 
103 | --------------------------------------------------------------------------------
104 | 
105 | ###8. 元标签
106 | 可以为表达式增加标签，控制表达式的属性和功能。也可以引入规则等。 
107 | 
108 | ####文件级元标签：
109 | 文件级元标签，不需要贴在任何规则之上。
110 | `#%Include% Rules/cnext`
111 | 增加一个名称为cnext的外置文件。本文件中的规则即可引用该文件中的规则。支持双向引用。 
112 | 
113 | `#%Script% extends`
114 | 增加一个名称为extends.py的外置Python脚本。该标签适合在嵌入Python代码时使用。嵌入的代码可以执行外置脚本中定义的函数。引擎会在内部执行import(extends)函数。因此extends.py需要放置在规则文件同一级目录中。
115 | 
116 | ####规则级元标签:
117 | 规则级元标签需要放在规则文本行之上，如：
118 | ```
119 | #%Type% INT
120 | #%Order% 180
121 | int_0_4= $(int_0) | $(int_1) | $(int_2) | ("三" : "3") | ("四" : "4") ;
122 | ```
123 | 上面的两个标签意思分别为：
124 | 将int_0_4的类型标记为INT
125 | 将int_0_4的匹配优先级定义为180. 数字越大，优先级越低。 
126 | 
127 | 不是所有的规则都是有效规则，有些规则只是被其他规则引用。只有加上#%Order%标签的才是有效规则。规则可以手动编写优先级。也可以省略之后的数字，引擎会自动根据引用结构来制定优先级，被引用层级越高的优先级越高。 
128 | 
129 | `#%Parameter% `为规则赋值
130 | 这部分取决于引擎的设计，将在《高级话题》中描述。 
131 | 
132 | ####属性级元标签
133 | 在信息抽取时，属性元标签非常重要，它指定了引擎如何将文本转换为字典。
134 | 
135 | **案例1**：
136 | ```
137 | #%Property% Denominator,,Numerator| Numerator ,, Denominator | Denominator ,, Numerator
138 | fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1 |
139 | 
140 | $(integer_int) $(fraction2) $(integer_int) |
141 | 
142 | $(pure_decimal) ("" : "/") $(percent_transform);
143 | ```
144 | 属性标签为fraction的每一个引用实体增加了属性。 按照 '|' 分组，Denominator赋给integer_int_extend, Numerator赋给integer_int. 分别代表分母和分子。
145 | 
146 | **案例2**：
147 | 
148 | 当抽取类似JSON或XML的文本时，抽取的字典需要以键值对的形式标注，如下例子： 
149 | ```
150 | #%Property% ,$key,,$value
151 | properties =$(space) $(name) $(equal) $(property) $(space);
152 | ```
153 | 则在抽取时，会以name为键，property为value, 插入抽取的字典中。 
154 | 
155 | 
156 | --------------------------------------------------------------------------------
157 | 
158 | ##9.注意事项
159 | 
160 | ###注释
161 | 除了符合元标签格式的文本，以 # 开始的一行内容被认为是注释行被忽略。暂不支持在一行内容的中间或后面加注释，也不支持在某一规则的多行内容的中间插入一行注释。 
162 | 
163 | ###换行
164 | 当Rule内容特别长时可以直接换行，中间插入的换行符/空格/制表符会被忽略，但不支持在中间插入注释行。 
165 | 
166 | ###结束符
167 | 所有Rules都要以分号结束。 
168 | 
169 | ###交叉引用
170 | 规则可以支持交叉引用，甚至可以引用自身，但被引用的表达式需要存在，否则会引发错误。引用时，需要保证文法不是左递归的，否则将会陷入死循环。 
171 | 
172 | ###编码
173 | 由于文本处理引擎经常处理多国语言，因而要求使用UTF-8编码(no BOM)。


--------------------------------------------------------------------------------
/rules/calparser:
--------------------------------------------------------------------------------
 1 | #计算引擎
 2 | #尝试解决 三点五乘以八点三的功能
 3 | 
 4 | #%Include% Rules/cnext
 5 | add  = (/加上?|\+|＋/ : /+/) ;
 6 | sub  = (/减去?|\-|－/ : /-/);
 7 | mul = (/乘以?|\*|×/ : /*/);
 8 | div = (/除以?|/|÷/ : ///); 
 9 | pow2 = (/的?平方/ : /**2/);
10 | pow3 = (/的?立方/ : /**3/);
11 | pown=  (/的?/ : //) $(digit) (/次方/ : /**/) : $3 $2 $1; 
12 | 
13 | 
14 | result= (/的?结果/);
15 | addresult0= (/的?和/);
16 | subresult0= (/的?差/);
17 | addresult = $(result) $(addresult0);
18 | subresult = $(result) $(subresult0);
19 | addsub0=  $(add) | $(sub) ; 
20 | logic0 =$(or) | $(and)  ;
21 | divpow0 = $(mul) | $(div);
22 | equalcheck = $(bigger) |$(less) | $(noequal);
23 | 
24 | operator= $(addsub0) | $(equalcheck) | $(logic0);
25 | 
26 | divpow = $(digit) $(divpow0) $(digit) ;
27 | powx= $(pow2) | $(pow3) | $(pown);
28 | pow = $(digit) $(powx);	
29 | 
30 | #functions
31 | print = (/打印/ : /print/);
32 | send = (/发送/ : /send/);
33 | functions = $(print) | $(send);
34 | function = $(functions) $(noterminator) : "invoke(m[0].rstr,m[1].rstr)";
35 | 
36 | 
37 | addsub= $(not) $(noterminator)
38 | 	| $(noterminator) $(operator) $(noterminator);
39 | 	
40 | terminator   = $(digit) | $(ifelse) | $(pow) | $(divpow) | $(function);
41 | 
42 | 
43 | 
44 | #暂时无法分析 3加5的和乘以3，因为会造成循环递归，从左向右推导不可行
45 | #	| $(noterminator) $(add) $(noterminator)  $(addresult)
46 | #	| $(noterminator) $(sub) $(noterminator)  $(subresult);
47 | 
48 | #%Order% 28
49 | noterminator = $(terminator) : "eval(m.rstr)" |  $(addsub) : "eval(m.rstr)";
50 | 
51 | 
52 | or = (/或/ : / or /);
53 | and = (/且/ : / and /);
54 | not = (/不是/ : / not /);
55 | equal = (/等于|=/ : /=/);
56 | bigger = (/大于|>/ : />/);
57 | less = (/小于|</ :/</);
58 | noequal = (/不等于/: /!=/);
59 | 
60 | ifelse =  (/如果/) $(noterminator) (/,那么/) $(noterminator) (/,否则/) $(noterminator) : "check(m[1].rstr,m[3].rstr,m[5].rstr)";


--------------------------------------------------------------------------------
/rules/cnext:
--------------------------------------------------------------------------------
  1 | #中文语言规则系统
  2 | #赵一鸣
  3 | #转写不是主要任务，主要是识别类型，并尽可能地转换为标准形式
  4 | #可以通过拓展规则，找出字符串中的信息树
  5 | #例如：
  6 | #1.  ￥230元 转写为 230 ，同时确认其为价格
  7 | #2.  二零一四年八月  时间  2014/08   Year:2014 Month:08  
  8 | 
  9 | #不同的数据类型，可以通过额外的XML配置文件，定义其处理策略。
 10 | #例如: 当五分之三识别为分数之后，可以送入分数处理模块，得到其0.6的数值
 11 | 
 12 | 
 13 | ##%Script% extends
 14 | 
 15 | #================#
 16 | #   基础规则转写 #
 17 | #================#
 18 | 
 19 | byte_anything2null = (/.*/ : //);
 20 | byte_anything2space = (/.*/ : / /);
 21 | byte_ins_space = ("" : " ");
 22 | null_2_null = ("" : "");
 23 | byte_det_space = (/ */://);
 24 | byte_det_pot = (/ /:/ /);
 25 | byte_det_one_space = (/ ?/://);
 26 | 
 27 | byte_cnv_to = (/[ \t]*[-~][ \t]*/ : / to /);
 28 | #中文字符
 29 | chs=(/[\u4e00-\u9fbb]/);
 30 | chs_multi = (/[\u4e00-\u9fbb]+/);
 31 | Punctuations = (/[,.?!\(\)\[\]]/);
 32 | head_space = (/\b/) : $1;
 33 | back_space =  (/\b|$/) : $1;
 34 | #将全角符号转换为半角符号
 35 | 
 36 | low_letter = (/[a-z]/);
 37 | 
 38 | up_letter =  (/[A-Z]/);
 39 | ###%Order% 274
 40 | low_to_up_letter =   $(low_letter) : "unicode.upper(m.mstr)";
 41 | en_letter =  $(low_letter) | $(up_letter);
 42 | 
 43 | en_letters =  $(en_letter)+;
 44 | 
 45 | Ordinal_s_no_replace =("." : ".") | ("th" : "th");
 46 | 
 47 | 
 48 | 
 49 | int_1 = ("一" : "1");
 50 | int_0 =("零" : "0");
 51 | int_del_0 = (/零/ : /0/) |  (// : /0/);
 52 | int_0_null = $(int_0) ; 
 53 | #%Type% INT
 54 | int_2  = ("二" : "2") | ("两" : "2");
 55 | 
 56 | int_2_5 = $(int_2) | ("三" : "3") | ("四" : "4") | ("五" : "5");
 57 | #%Type% INT
 58 | int_0_4= $(int_0) | $(int_1) | $(int_2) | ("三" : "3") | ("四" : "4") ;
 59 | #%Type% INT
 60 | int_2_9 = $(int_2_5) |  ("六" : "6") | ("七" : "7") | ("八" : "8") | ("九" : "9");
 61 | #%Type% INT
 62 | int_1_9 = $(int_1) | $(int_2_9) | (/[1-9]/);
 63 | #%Type% INT
 64 | int_0_9 =  $(int_0) | $(int_1_9)| (/[0-9]/);
 65 | 
 66 | #可有可无的0-9，用于诸如四十这样的表达
 67 | #%Type% INT
 68 | int_0_9_null = $(int_del_0) |  $(int_1_9) | $(int_0);
 69 | int_rep = $(int_0_9)+;
 70 | int_rep0 = $(int_0)+;
 71 | int_rep0_null = ("" : "") | $(int_rep0);
 72 | 
 73 | signed_symbol0 = ("正" : "") | ("负" : "-") | ("正负" : "±") | ("\+" : "+") | ("\-" : "-") | ("±" : "±") ; 
 74 | signed_symbol = $(signed_symbol0) | $(null_2_null);
 75 | #%Type% INT
 76 | int_1_decades = (/十/ : /1/) | (/一十/ : /1/);
 77 | #%Type% INT
 78 | int_0_10 =  $(int_0) | $(int_1_9)| $(int_1_decades) | (/[0-9]|(10)/);
 79 | #%Type% INT
 80 | int_00_10 =  $(int_0)  $(int_1_9)| $(int_0_10);
 81 | #%Type% INT
 82 | int_1_2_decades = $(int_1_decades) | (/二/ : /2/) (/十/ : //);
 83 | #%Type% INT
 84 | int_1_5_decades = $(int_1_decades) | $(int_2_5) (/十/ : //);
 85 | #%Type% INT
 86 | int_1_9_decades = $(int_1_decades) | $(int_1_9) (/十/ : //);
 87 | #%Type% INT
 88 | int_10_99 = $(int_1_9_decades) $(int_0_9_null)  | (/[1-9][0-9]/) ;
 89 | #%Type% INT
 90 | int_10_59 = $(int_1_5_decades) $(int_0_9_null) | (/[1-5][0-9]/) ;
 91 | #%Type% INT
 92 | int_1_99 = $(int_1_9) | $(int_10_99) ;
 93 | #%Type% INT
 94 | int_01_99 =  $(int_1_9) | $(int_10_99) | (/\d{1,2}/);
 95 | #%Type% INT
 96 | int_0_99 =  $(int_0) | $(int_01_99) | (/\d{1,2}/) ;
 97 | #为了适应24进制
 98 | #%Type% INT
 99 | int_0_23 =  $(int_0)  
100 | 	| ("" : "0")  $(int_1_9) 
101 | 	| $(int_1_decades) $(int_1_9) 
102 | 	| (/二/ : /2/) | $(int_0_4);
103 | #为了适应60进制
104 | #%Type% INT
105 | int_0_60 =  $(int_0) 
106 | 	| $(int_10_59)
107 | 	| ("" : "0")  $(int_1_9)
108 | 	| (/([0-5][0-9])|(60)/);
109 | int_1_9_hundreds = $(int_1_9) ("百" : "");
110 | int_000_099 = $(int_del_0) $(int_del_0) $(int_0_9) | $(int_del_0) $(int_10_99) | $(int_del_0) $(int_del_0) $(int_del_0);
111 | int_100_999 =   $(int_1_9_hundreds) ("" : "00") |  $(int_1_9_hundreds) $(int_10_99);
112 | int_1_999 = $(int_1_99) | $(int_100_999);
113 | #%Type% INT
114 | int_0_999 =   $(int_0) | $(int_1_999);
115 | 
116 | int_1_9_thousands = $(int_1_9) ("千" : "");
117 | 
118 | int_1000_9999 = $(int_1_9_thousands) $(int_000_099) | $(int_1_9_thousands) $(int_100_999);
119 | #%Type% INT
120 | int_1_9999  = $(int_1000_9999) | $(int_1_999) | (/\d{1,4}/);
121 | #%Type% INT
122 | int_0_9999  =   $(int_1000_9999) | $(int_0_999) | (/\d{1,4}/);
123 | 
124 | int_wan =  ("万" : "");
125 | #%Type% INT
126 | int_5_8bit = $(int_1_9999) $(int_wan) $(int_0_9999) |  $(int_1_9999) $(int_wan)  (// : /0000/) ;
127 | 
128 | int_ins_yi =  (/个?亿/ : //);
129 | #%Type% INT
130 | int_9_12bit=  $(int_1_9999) $(int_ins_yi) $(int_5_8bit) | $(int_1_9999) $(int_ins_yi) (// : /00000000/);
131 | #%Type% INT
132 | pure_int = (/\d{1,}/);
133 | 
134 | #%Type% INT
135 | integer_int = $(int_0)
136 | 	| $(int_1_9999)
137 | 	| $(int_5_8bit)
138 | 	| $(int_9_12bit)
139 | 	| $(pure_int);
140 | #%Type% INT
141 | #%Order% 140
142 | integer_signed = $(signed_symbol) $(integer_int)
143 | ;
144 | 
145 | 
146 | 
147 | #================#
148 | #    decimals    #
149 | #================#
150 |  
151 |  
152 | pure_decimal= (/0\.\d+/)
153 | 	| (/(?:\d+,?)*\d+\.\d+/);
154 | decimal = $(pure_decimal)
155 | 	| $(integer_int) ("点" : ".") $(int_rep)
156 | ;
157 | 
158 | pure_digit = $(pure_decimal) | $(pure_int);
159 | #%Type% DOUBLE
160 | #%Order% 100
161 | decimal_signed = $(signed_symbol) $(decimal) | $(decimal);
162 | 
163 | 
164 | 
165 | 
166 | 
167 | #================#
168 | #   fractions    #
169 | integer_int_extend = $(integer_int) | ("百" : "100") | ("千" : "1000") | ("万" : "10000") | ("亿" : "100000000");
170 | #================#
171 | fraction_cnv_slash = ("分之" : "/");
172 | fraction2 = ("/" : "/");
173 | percent_transform= ("%" : "100") | ("‰" : "1000");
174 | #%Type% DOUBLE
175 | #%Property%  Denominator,,Numerator| Numerator ,, Denominator | Denominator ,, Numerator 
176 | #%Order% 101
177 | fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1
178 | 	| $(integer_int) $(fraction2) $(integer_int)
179 | 	| $(pure_decimal) ("" : "/") $(percent_transform);
180 | 
181 | fraction_signed = $(signed_symbol) $(fraction) | $(fraction);
182 | 
183 | integer_decimal = $(integer_int) | $(decimal);
184 | digit = $(integer_int) 
185 | 	| $(decimal)
186 | 	| $(fraction);
187 | 
188 | digit_signed = $(signed_symbol) $(digit);
189 | 	
190 | range_keywords = (/到|至|\-|~/ : /~/);
191 | 
192 | range_larger = ("大于") | (">");
193 | range_less = ("小于") | ("<");
194 | range_larger_result= (/.+/:/>/);
195 | range_less_result= (/.+/ : /</);
196 | range_back = (/以上/ : />/) | (/以下/ : /</) ;
197 |  
198 | range2= $(range_larger): $(range_larger_result) | $(range_less) : $(range_less_result);
199 | 
200 | 
201 | 
202 | 
203 | 
204 | #================#
205 | #       time     #
206 | #================#
207 | 
208 | date_from= ("从" : "") | (// : //);
209 | 
210 | time_kywd_nrml = (/\b(AM|PM|时|秒|凌晨|早晨|早上|上午|中午|下午|傍晚|晚上|深夜|午夜|时间|时区|时差|时钟|闹钟|闹铃|手表|开始|开会|开幕|截至|截止|为止|结束|闭幕|加班到|加班至|首班车|末班车|发车|时刻表|航班|准点|晚点|到点|现在是)\b/);
211 | time_special = ("半" : "30") | ("一刻" : "15") | ("三刻" : "45"); 
212 | time_cnv_dian = ("点" : ":") | (":" :":");
213 | 
214 | 
215 | time_fen =   ("分" : ":") | ("分钟" : ":") | ("min" : ":") | ("Min" : ":") | (":" : ":");
216 | #下面代表3点05 这种省略分钟的说法
217 | time_fen2 = $(time_fen) | ("" : "");  
218 |  
219 | 　
220 | time_mm_r = (/\b(?:00?)\b/ : //) | $(int_0_60) (/:?/ : /分/);
221 | time_miao = ("秒" : "") | $(null_2_null) ;
222 |  
223 | 
224 |  
225 | #%Property%  Hour,,Minute | Hour,,Minute |　Hour,,Minute | Hour,,Minute,,Second | Hour
226 | #%Order% 30
227 | time_fix = $(int_0_99) $(time_cnv_dian) $(int_00_10) $(time_fen2)
228 | 	| $(int_0_99) $(time_cnv_dian) $(int_10_59) $(time_fen2)
229 | 	| $(int_0_99) $(time_cnv_dian) $(time_special) 
230 | 	| $(int_0_99) $(time_cnv_dian) $(int_0_60) $(time_fen) $(int_0_60) $(time_miao)
231 | 	| $(int_0_99) $(time_cnv_dian) (// : /00/)
232 | ;
233 | 
234 | time_span =	 $(int_0_60) $(time_fen) $(int_0_60) $(time_miao) ("" : "00:") : $5 $1 $2 $3 $4
235 | 	| $(int_0_60) $(time_fen2) ("" : "00:") ("" : ":00") : $3 $1 $2 $4
236 | 	| $(int_0_60) ("秒" : "") ("" : "00:") ("" : ":00") : $3 $1 $2 $4;
237 | 
238 | 
239 | 
240 | #%Property% ,Start,,End
241 | #%Order% 28
242 | time_range =  $(date_from) $(time_fix) $(range_keywords) $(time_fix);
243 | 
244 | #================#
245 | #       date     #
246 | #================#
247 | 
248 | date_ri = ("日" : "") | ("号" : "");
249 | date_ri_null =  $(date_ri) | $(null_2_null);
250 | date_DD = $(int_01_99) $(date_ri);
251 | date_DD_null=  $(date_DD) | $(int_01_99);
252 | date_cnv_yue = (/[-/.]/ : ///);
253 | date_yue = ("月": "");
254 | date_yue2 = ("月": "/");
255 | date_yue3 = $(date_yue2) | $(date_cnv_yue); 
256 |  
257 | date_cnv_nian = (/[-/.]/ : ///);
258 | date_nian = ("年" : "");
259 | date_nian2 = ("年" : "/");
260 | date_nian3 = $(date_nian2) | $(date_cnv_nian);
261 | 
262 | date_YYYY = (/[12]\d\d\d/) : $(int_0_9999);
263 | date_YYYY_Restrict = (/\b(?:19\d\d|20\d\d)\b/) : $(int_1000_9999);
264 | 
265 | date_keywords = ("AD") | ("BC") | ("年") | ("月") | ("前年") | ("去年") | ("昨年") | ("今年") | ("明年") | ("后年") | ("上月") | ("本月") | ("下月") | ("个月") | ("昨日") | ("今日") | ("明日") | ("前天") | ("昨天") | ("今天") | ("明天") | ("后天") | ("日期") | ("日子") | ("星期") | ("周一") | ("周二") | ("周三") | ("周四") | ("周五") | ("周六") | ("周日") | ("周末") | ("节假日") | ("工作日") | ("纪念日") | ("公元") | ("年度") | ("财年") | ("季度") | ("赛季") | ("节日") | ("生日") | ("假日") | ("元旦") | ("情人节") | ("妇女节") | ("愚人节") | ("植树节") | ("消费者权益日") | ("劳动节") | ("青年节") | ("儿童节") | ("建军节") | ("教师节") | ("国庆") | ("圣诞") | ("春节") | ("元宵") | ("清明") | ("端午") | ("七夕") | ("中秋") | ("重阳");
266 | # YYYY年  YYYY-YYYY年  YYYY年-YYYY年  YYYY年MM月  YYYY年MM-MM月  YYYY年MM月-MM月  YYYY年MM月-YYYY年MM月  YYYY年MM月DD日  YYYY年MM月DD-DD日  YYYY年MM月DD日-DD日  YYYY年MM月DD日-MM月DD日  YYYY年MM月DD日-YYYY年MM月DD日  MM月DD日  MM月DD-DD日  MM月DD日-DD日  MM月DD日-MM月DD日  MM月-MM月  MM-MM月  MM月  DD日-DD日  DD-DD日  DD日 #
267 | #%Type% DATETIME
268 | #%Property% Year | Year,,Month | Year,,Month,,Day | Month,,Day
269 | #%Order% 32
270 | date_fix = $(date_YYYY) $(date_nian)
271 | 	| $(date_YYYY) $(date_nian3) $(int_01_99) $(date_yue3)
272 | 	| $(date_YYYY) $(date_nian3) $(int_01_99) $(date_yue3) $(date_DD_null)
273 | 	| $(int_01_99) $(date_yue3) $(date_DD)
274 | 	| $(date_DD)
275 | ;
276 | #注意，对于元属性，对空格严格要求，中间只能空一格
277 | #%Property% ,Start,,End
278 | #%Order% 26
279 | date_range = $(date_from) $(date_fix) $(range_keywords) $(date_fix);
280 | 
281 | #================#
282 | #    时间长度    #
283 | #================#
284 | #时间长度和范围的概念有所不同，比如三个小时和6点到9点，是相似但不同的
285 | time_unit_trans = (/年|years/ : /Year/) | (/(个?月)|(months)/: /Month/) | (/天|日|days/ : /day/) | (/个?小时|hours/ : /hour/) | (/分钟|minutes|min/ : /minute/) | (/刻钟/: /min15/) | (/秒|seconds|sec/: /second/);
286 | 
287 | #%Property% Value,Unit
288 | #%Order% 54
289 | time_length = $(integer_signed) $(time_unit_trans); 
290 | 
291 | 
292 | #以下是各种单位
293 | #数值型单位需要识别其类型，同时将单位和数值提取出来，单位使用英语标准(如nm)输出，之后送入单位处理模块
294 | 
295 | #基本的量级单位
296 | #毫,纳,微,分,厘
297 | 
298 | unit_amount_mini= ("m") |  ("n") | ("μ") | ("d") | ("c");
299 | 
300 | unit_amount_mini_trans= ("M" : "m")  | ("C" : "c");
301 | # Chinese sub-unit prefixes, rewritten to the same symbols as unit_amount_mini: milli=m, deci=d, centi=c, micro=μ, nano=n.
302 | unit_amount_mini_chs= ("毫" : "m")  | ("分" : "d")  | ("厘" : "c")  | ("微" : "μ") | ("纳" : "n") ;
303 | #千,与下方分开，因为千米是常用表达，但没有兆米这样的说法
304 | unit_amount_large1 = ("K");
305 | unit_amount_large3 = ("k" : "K") ;
306 | unit_amount_large1_chs = ("千" : "K");
307 | #兆，吉，T
308 | 
309 | unit_amount_large2 =  ("M") | ("G") | ("T");
310 | unit_amount_large2_trans = ("m" : "M") | ("g" : "G");
311 | unit_amount_large2_chs =("兆" : "M") | ("吉" : "G") ;
312 | unit_amount_chs3 = $(unit_amount_mini_chs) | $(unit_amount_large1_chs);
313 | unit_amount_large_eng = $(unit_amount_large1) | $(unit_amount_large2);
314 | 
315 | unit_amount_chs4 = $(unit_amount_large1_chs) | $(unit_amount_large2_chs);
316 | unit_amount_eng_all = $(unit_amount_mini) | $(unit_amount_large_eng) | $(unit_amount_large2_trans) ;
317 | unit_amount_chs_all = $(unit_amount_mini_chs) | $(unit_amount_large1_chs) | $(unit_amount_large2_chs) ;
318 | #专用于表现存储量的，在信息系统中，这种表达很常见
319 | unit_memory = $(unit_amount_large_eng) | $(unit_amount_large2_chs);
320 | #平方,立方
321 | unit_keyword_pow = ("平方" : "");
322 | unit_keyword_cubic = ("立方": "");
323 | #----------#
324 | #  Length  #
325 | #----------#
326 | 
327 | unit_length_eng=  ("m" : "m")  | ("M" : "m");
328 | 
329 | unit_length_chs= ("米" : "m") | ("公分" : "cm");
330 | unit_tabl_length_1 =  $(unit_amount_mini) $(unit_length_eng) 
331 | 	| $(unit_amount_mini_chs) $(unit_length_eng) 
332 | 	| $(unit_amount_large1_chs) $(unit_length_chs)
333 | 	| $(unit_length_eng)  
334 | 	| $(unit_length_chs);
335 | 
336 | unit_tabl_mile = ("英里" : "mi") | ("海里" : "nmi") | ("公里" : "Km");
337 | unit_tabl_length = $(unit_tabl_length_1) | $(unit_tabl_mile);
338 | unit_kywd_length =(/长|宽|高|厚|深|里程|距离|海拔|速度|尺寸|幅|米|寸|尺|码/);
339 | unit_kywd_mile = (/海里|船|舰|海|空/);
340 | 
341 | #%Property% Value,Unit
342 | #%Order% 27
343 | unit_length = $(digit) $(unit_tabl_length)
344 | ;
345 | 
346 | #----------#
347 | #   Area   #
348 | #----------#
349 | unit_tabl_area_0 = ("m2" : "m2");
350 | 
351 | unit_area_speacial=  ("亩" : "acre") | ("公顷" : "ha") | ("平方公里" : "Km2");
352 | unit_tabl_area = $(unit_keyword_pow) $(unit_amount_chs3)  ("米" : "m2")
353 | 	| $(unit_amount_large_eng) |  $(unit_tabl_area_0)
354 | 	| $(unit_area_speacial);
355 | 
356 |  
357 | unit_kywd_area = ("面积") | ("土地") | ("英亩") | ("公顷");
358 | #%Property% Value,Unit
359 | #%Order% 28
360 | unit_area = $(digit) $(unit_tabl_area)
361 | ;
362 | 
363 | 
364 | 
365 | 
366 | #----------#
367 | #  Volume  #
368 | #----------#
369 | unit_tabl_volume_0 = ("m3" : "m3");
370 | unit_volume_specal = ("cc" : "cm3");
371 | unit_tabl_volum = $(unit_keyword_cubic) $(unit_amount_chs3)  ("米" : "m3")
372 | 	| $(unit_amount_large_eng) | $(unit_tabl_volume_0)
373 | 	| $(unit_volume_specal);
374 | 
375 |  
376 | 
377 | unit_kywd_volume = ("体积") | ("容积");
378 | #%Property% Value,Unit
379 | #%Order% 29
380 | unit_volume = $(digit) $(unit_tabl_volum);
381 | #	| $(digit) $(unit_tabl_volume) : : "CMP( DIST(A,33,$(unit_kywd_volume)) ≤ 33 )"
382 | 
383 | 
384 | #----------#
385 | #  Weight  #
386 | #----------#
387 | unit_weight_eng = ("g": "g") | ("G" : "G");
388 | unit_weight_chs = ("克": "g");
389 | 
390 | unit_weight_specal = ("t" : "t") | ("吨" : "t") | ("磅" : "pound") | ("公斤" : "Kg");
391 | unit_tabl_weight = $(unit_amount_mini) $(unit_weight_eng) 
392 | 	| $(unit_amount_mini_chs) $(unit_weight_chs) 
393 | 	| $(unit_amount_large1_chs) $(unit_weight_chs)
394 | 	| $(unit_weight_eng)  
395 | 	| $(unit_weight_specal)
396 | 	| $(unit_weight_chs);
397 | 
398 | unit_kywd_weight = ("质量") | ("重") | ("克");
399 | #%Property% Value,Unit
400 | #%Order% 95
401 | unit_weight = $(digit) $(unit_tabl_weight);
402 | #	| $(digit) $(unit_tabl_volume) : : "CMP( DIST(A,33,$(unit_kywd_volume)) ≤ 33 )"
403 | 
404 | 
405 | #----------#
406 | # Electric #
407 | #----------#
408 | 
409 | 
410 | unit_electric_eng = ("Ω") | ("W") | ("A") | ("V") | ("Hz") | ("F") | ("C") | ("H") | ("J");  
411 | 
412 | unit_electric_chs=  ("欧姆" : "Ω") | ("欧" : "Ω" ) | ("瓦" : "W") | ("安培" : "A") | ("安" : "A") | ("伏" : "V") | ("伏特" : "V") | ("法" : "F") | ("法拉" : "F") | ("库伦" :"C") | ("亨" : "H") | ("焦" :"J") | ("焦耳": "J") ;
413 | 
414 | unit_tabl_electric = $(unit_amount_eng_all) $(unit_electric_eng) 
415 | 	| $(unit_amount_chs_all) $(unit_electric_chs) 
416 | 	| $(unit_electric_eng)  
417 | 	| $(unit_electric_chs);
418 | 
419 | unit_kywd_electric = ("电") | ("交流") | ("直流") | ("导体") | ("功率") | ("负载") | ("阻抗") | ("线圈") | ("磁场") | ("回路") | ("滤波") | ("欧姆") | ("瓦特") | ("伏特") | ("焦耳") | ("毫安") |  ("安培") | ("赫兹");
420 | #%Property% Value,Unit
421 | #%Order% 1 
422 | unit_electric = $(integer_decimal) $(unit_tabl_electric) :: "dist('unit_kywd_electric')<33";
423 | 
424 | 
425 | 
426 | #----------#
427 | # Telecom  #
428 | #----------#
429 | 
430 | unit_telecom_eng = ("B" : "B") | ("b" : "b") | ("Byte" : "B") | ("bit" : "b") | ("byte" : "B");
431 | # Chinese data-size units, rewritten like unit_telecom_eng: bit -> b, byte -> B.
432 | unit_telecom_chs = ("比特" :  "b") | ("字节" : "B");
433 | 
434 | unit_kywd_telecom = ("CDMA") | ("GPRS") | ("GSM") | ("SCDMA") | ("WCDMA") | ("网络") | ("网速") | ("上网") | ("带宽") | ("容量") | ("比特") | ("字节") | ("内存") | ("硬盘") | ("闪存") | ("手机") | ("电脑") | ("平板") | ("套餐") | ("数据") | ("流量") | ("数码") | ("数字") | ("电信") | ("联通") | ("移动") | ("网通");
435 | 
436 |  
437 | unit_telecom_unit =  $(unit_amount_large_eng) $(unit_telecom_eng) 
438 | 	| $(unit_amount_chs4) $(unit_telecom_chs) 
439 | 	| $(unit_telecom_eng) 
440 | 	| $(unit_memory)
441 | 	| $(unit_telecom_chs);
442 | #%Property% Value,Unit
443 | #%Order% 32
444 | unit_telecom =  $(integer_decimal) $(unit_telecom_unit);
445 | 
446 | #	| $(unit_digit_comb) $(unit_tabl_telecom_1) : : "CMP( DIST(A,33,$(unit_kywd_telecom)) ≤ 33 )";
447 | 
448 | 
449 | #----------#
450 | # 布尔型   #
451 | #----------#
452 | 
453 | bool_true  = ("真") | ("是") | ("True") | ("true") | ("对") | ("Right") | ("正确");
454 | bool_true_result = (/.+/ : /true/);
455 | 
456 | bool_false = ("假") | ("否") | ("False") | ("false") | ("错") | ("Wrong") | ("错误");
457 | bool_false_result = (/.+/: /false/);
458 | bool_final =$(bool_true) : $(bool_true_result) |  $(bool_false) : $(bool_false_result);
459 |  
460 | #%Type% BOOL
461 | #%Order% 71
462 | bool_check = $(head_space) $(bool_final)  $(back_space);
463 | 
464 | #----------#
465 | #  身份证  #
466 | #----------#
467 | #%Type% 身份证信息转换
468 | #%Order% 19
469 | chsIDCard= (/\d{17}(\d|X|x)/); 
470 | 
471 | #----------#
472 | #  Email   #
473 | #----------#
474 | #%Order% 20
475 | #%Property% Accound,,ServiceName
476 | email_address= (/[a-zA-Z0-9]{3,20}/) (/@/) (/[a-zA-Z0-9]{2,20}/)(/(\.com)|(\.COM)/); 
477 | 
478 | #----------#
479 | #  Signal  #
480 | #----------#
481 | unit_signal_chs = ("赫兹" : "Hz") | ("分贝" : "dB") ;
482 | unit_signal_eng = ( "Hz" ) | ( "dB" );
483 | 
484 | unit_kywd_signal = ("CPU") | ("信号") | ("能量") | ("分贝") | ("赫兹") | ("中波") | ("周期") | ("频率") | ("声") | ("短波") | ("耳") | ("听觉") | ("长波") | ("电力") | ("电压") | ("电波") | ("电磁波") | ("声波");
485 | 
486 | unit_signal_unit =  $(unit_amount_large_eng) $(unit_signal_eng) 
487 | 	| $(unit_amount_chs4) $(unit_signal_chs) 
488 | 	| $(unit_signal_chs)  
489 | 	| $(unit_signal_eng);
490 | #%Property% Value,Unit
491 | #%Order% 33
492 | unit_signal =  $(integer_decimal) $(unit_signal_unit) ;
493 | 
494 | 
495 | #unit_signal = $(unit_digit_comb) $(unit_tabl_signal) : : "CMP( DIST(A,33,$(unit_kywd_signal)) ≤ 33 )";
496 | 
497 | 
498 | 
499 | #-----------------------#
500 | #  Atmospheric,Dynamics  #
501 | #------------------------#
502 | 
503 | unit_dynamic_chs = ("帕斯卡" : "Pa" ) | ("牛" : "N" ) | ("牛顿" : "N") | ("摄氏度" : "C" ) | ("华氏度" : "F" )  ;
504 | 
505 | unit_dynamic_eng = ( "Pa" ) | ( "N" ) | ("°F" : "F") | ("°C" : "C")| ("°F") | ("°C");
506 | 
507 | unit_dynamic_signal =("帕斯卡") | ("压力") | ("应力") | ("气压") | ("天气") | ("气象") | ("牛") | ("力") | ("质量") | ("速度") | ("重力") | ("摄氏度") | ("华氏度") | ("温度") | ("天气") | ("热") | ("凉") | ("冷") | ("寒") | ("融点") | ("沸点") | ("色温度") | ("水") | ("蒸发");
508 | # Dynamics/temperature units: an optional magnitude prefix followed by a unit from the unit_dynamic_chs / unit_dynamic_eng tables.
509 | unit_dynamic_unit =  $(unit_amount_large_eng) $(unit_dynamic_eng) 
510 | 	| $(unit_amount_chs4) $(unit_dynamic_chs) 
511 | 	| $(unit_dynamic_chs)  
512 | 	| $(unit_dynamic_eng);
513 | 
514 | 
515 | #%Property% Value,Unit
516 | #%Order% 34
517 | unit_dynamic =  $(integer_decimal) $(unit_dynamic_unit);
518 | 
519 | #--------------#
520 | # Currency     #
521 | #--------------#
522 | #("美元") 
523 | unit_currency_chs = ("元" : "Yuan") 
524 |  | ("欧元" : "Euro") 
525 |  | ("英镑" : "Pound" ) 
526 |  | ("美元" : "Dollar");
527 | unit_currency_eng = ("￥" : "Yuan");
528 | #%Property% Value,Unit|Unit,Value 
529 | #%Order% 35
530 | unit_currency =  $(integer_decimal)  $(unit_currency_chs)
531 | 	| $(unit_currency_eng) $(integer_decimal) :$2 $1;
532 | 	
533 | 	
534 | #unit_temperature = $(unit_digit_temperature) ("°" : "度") : : "CMP( DIST(A,33,$(unit_kywd_temperature)) ≤ 33 )"
535 | #	| $(unit_digit_temperature) $(byte_cnv_to) $(unit_digit_temperature) ("°" : "度") : : "CMP( #DIST(A,33,$(unit_kywd_temperature)) ≤ 33 )"
536 | #	| $(unit_digit_temperature) $(unit_tabl_temperature)
537 | #	| $(unit_digit_temperature) $(byte_cnv_to) $(unit_digit_temperature) $(unit_tabl_temperature)
538 | #;
539 | 
540 | 
541 | all_unit =$(unit_area) | $(unit_currency) | $(unit_dynamic) | $(unit_electric) | $(unit_length);
542 | 
543 | all_unit_value = $(all_unit) | $(digit_signed);
544 | 
545 | 
546 | 
547 | #%Property% Value,Direction | From,,To | Direction,Value
548 | #%Order% 102
549 | digit_range = $(all_unit_value) $(range_back) : $2 $1
550 | 	| $(digit_signed) $(range_keywords) $(all_unit_value) 
551 | 	| $(range2) $(all_unit_value);
552 | 	
553 | #%Type% URL
554 | #%Order% 123	
555 | url = (/(http|https|ftp)://[!-~]+/) 
556 | 	| (/(www|www2)\.[!-~]+/) 
557 | 	| (/[!-~]+\.(com|net|edu|gov|org|mil|int|cc|cn|de|eu|fr|jp|hk|kp|ru|tw|uk|us|htm|html|jsp|asp|php)(/[!-~]+)?/);
558 | 
559 |  
560 | 
561 | #================#
562 | #     address    #
563 | #================#
564 | 
565 | addr_province = ("省") | ("自治区") | ("特别行政区") ;
566 | #市  作为独立存在;
567 | addr_city = ("市");
568 | addr_domain = ("县")| ("区");
569 | addr_country = ("乡") | ("镇");
570 | 
571 | #村  作为独立存在;
572 | 
573 | #因为地址没有特别的分割标点符号，所以用纯tn的做法，是一直向前推进，直到发现省或者市这样的关键词。
574 | #更好的做法，应该是分词模型吧
575 | except_keywords = $(addr_province) | $(addr_city) | $(addr_domain) | $(addr_country);
576 | addr_except_keywords= $(chs) - $(except_keywords);
577 | addr_part_name = $(addr_except_keywords){2,-1};
578 | 
579 | addr_kywd_nrml = ("地址") | ("小区") | ("大厦") | ("物业") | ("业主") | ("住在") | ("家住") | ("家在") | ("家是") | ("宾馆") | ("酒店") | ("宿舍") | ("公寓") | ("房间") | ("房号") | ("路") | ("楼") | ("座") | ("栋") | ("幢") | ("层") | ("室");
580 | 
581 | 
582 | 
583 | 
584 | #%Type% 地址
585 | #%Property% Province,,City,,Domain
586 | #%##Order% 15
587 | address = $(addr_part_name) $(addr_province) $(addr_part_name) $(addr_city) $(addr_part_name) $(addr_domain);
588 | 
589 | 
590 | #================#
591 | #    telecom     #
592 | #================#
593 | 
# Context keywords related to telephony: device words, call and reply verbs, service words, carriers.
telecom_kywd_nrml =("动感地带") | ("TEL") | ("℡") | ("FAX") | ("电话") | ("座机") | ("手机") | ("传真") | ("详回") | ("回") | ("回复") | ("询") | ("详询") | ("致电") | ("回电") | ("电询") | ("编辑") | ("编辑短信") | ("发送") | ("发短信") |("发送短信") | ("总机") | ("分机") | ("热线") | ("专线") | ("拨打") | ("拨") | ("加拨") | ("转接") | ("客服") | ("火警") | ("报警") | ("市话") | ("长话") | ("国内长途") | ("国际长途") | ("漫游") | ("查号台") | ("移动") | ("联通") | ("电信");
#---------------------------#
#    Mobile Phone Number    #
# 1Yxxxxxxxxx       Y=[3-9] #
# 1Yx xxxx xxxx     Y=[3-9] #
# 1Yxx xxx xxxx     Y=[3-9] #
# 1Yxx xxxx xxx     Y=[3-9] #
#---------------------------#
# Separator between digit groups: a dash with optional surrounding spaces, or a single space; rewritten to nothing.
telecom_dash = (/ ?- ?| / : //);
# Mobile number in one of the four layouts listed in the box above.
#%Type% 移动电话
telecom_mobile_no =  (/1[3-9]\d/) (/\d{4}/) (/\d{4}\b/)
	| (/1[3-9]\d\b/) $(telecom_dash) (/\b\d{4}\b/) $(telecom_dash) (/\b\d{4}\b/)
	| (/1[3-9]\d{2}\b/) $(telecom_dash) (/\b\d{3}\b/) $(telecom_dash) (/\b\d{4}\b/)
	| (/1[3-9]\d{2}\b/) $(telecom_dash) (/\b\d{4}\b/) $(telecom_dash) (/\b\d{3}\b/)
;
# Landline number, e.g. 0350-22222222: a 3-4 digit area code plus a 7-9 digit local number,
# separated by a space (kept) or by telecom_dash (removed), or the local number on its own.
#%Type% 固定电话
#%Property% AreaCode,,PhoneCode | AreaCode,,PhoneCode| PhoneCode
telecom_phone = (/\d{3,4}/) (/ /: / /) (/\d{7,9}/)
	| (/\d{3,4}/) $(telecom_dash) (/\d{7,9}/)
	| (/\d{7,9}/);
# Top-level telephone rule: a landline or mobile number preceded by head_space (defined elsewhere).
#%Type% 电话号码转换
#%Order% 35
telecom= $(head_space) $(telecom_phone) | $(head_space) $(telecom_mobile_no);
618 | 
619 | 
620 | 
621 | 
# IPv4 address in dotted-decimal form: three parts each followed by a dot, then a fourth part.
# Each part pattern covers the values 0-255.
#%Type% IP地址
#%Order% 45
ip_part= (/((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?)/);
625 | 
626 | 
627 | 　
628 | #---------------------------#
629 | #      Vehicle Related      #
630 | #---------------------------#
631 | 
# Context keywords related to buses and public transport.
num_kywd_bus = ("乘坐") | ("换乘") | ("车站") | ("车牌") | ("站台") | ("公共交通") | ("公交") | ("公车") | ("巴士") | ("路车") | ("号线") | ("地铁") | ("轻轨") | ("电车") | ("交通") | ("路况") | ("堵车") | ("红灯") | ("绿灯") | ("一卡通") | ("公交卡") | ("无人售票") | ("站点") | ("站牌") | ("到站") | ("下车") | ("上车") | ("下一站") | ("起点站") | ("终点站") | ("首班车") | ("末班车");

# Default bus-route pattern: a 1-4 digit number followed by the route suffix character.
# NOTE(review): with Unicode-aware regexes the trailing word boundary may not hold between a digit and a CJK character - verify this rule still matches.
num_bus_dflt = (/\b\d{1,4}\b/) ("路") ;
635 | 


--------------------------------------------------------------------------------
/rules/learn:
--------------------------------------------------------------------------------
# A basic getting-started tutorial.

# 1. Match every occurrence of the greeting.

hello= ("你好");

# A rule may reference another rule and add alternatives.
hello2= $(hello)| ("您好");




# Keep the greeting and append the text hello after it (an empty match rewritten to hello).
hello3= $(hello2) : (// : /hello/);

people= ("老婆") | ("领导");

# Swap the two matched parts in the output: second item first, then the first.
#%Order% 1
reorder= $(people) $(hello3) : $2 $1;

# Define the digits 0-9 (Chinese numerals are rewritten to Arabic digits).
int_1 = ("一" : "1");
int_0 =("零" : "0");
int_2  = ("二" : "2") | ("两" : "2");
int_3_9 = ("三" : "3") | ("四" : "4") | ("五" : "5") | ("六" : "6") | ("七" : "7") | ("八" : "8") | ("九" : "9");
int_1_9 = $(int_1) | $(int_2) | $(int_3_9) | (/\d/);
int_0_9 = $(int_0) | $(int_1_9);
# Either an explicit zero or nothing at all; both are rewritten to 0.
int_del_0 = (/零/ : /0/) |  (// : /0/);
int_0_9_null = $(int_del_0) |  $(int_0_9);
# Define the tens digit of 10: a bare ten or one-ten, both rewritten to 1.
int_1_decades = (/十/ : /1/) | (/一十/ : /1/);

# Define tens such as twenty => 20: a digit followed by the tens character, which is dropped.
int_1_9_decades = $(int_1_decades) | $(int_1_9) (/十/ : //);
int_10_99 = $(int_1_9_decades) $(int_0_9_null)  | (/[1-9][0-9]/) ;
int_1_99 = $(int_1_9) | $(int_10_99) ;
int_01_99 =  $(int_1_9) | $(int_10_99) | (/\d{1,2}/);

#%Order% 3
int_0_99 =  $(int_0) | $(int_1_9) | $(int_10_99);


--------------------------------------------------------------------------------
/rules/xmlparser:
--------------------------------------------------------------------------------
# TN rules that parse a simple XML document.


# Lexical pieces: equals sign, statement end, whitespace and the characters allowed in values.
equal = (/\s?=\s?/);
equal0 = (/=/);
end= (/\s?;/);
shortspace =(/ /);
shortspace0 = (/\s?/);
space= (/[\s\r\n]*/);
values= (/[#\sA-Za-z0-9;\*,_:%/\.\+\-\(\)]+/);
# The following variant is used for comment bodies; it does not allow the dash character.
values0 = (/[#\sA-Za-z0-9;\*,_:%/\.\+\(\)]+/);
# A double-quoted attribute value.
property= (/"/) $(values) (/"/);
name= (/[\w|\-:]+/);

# XML comment delimiters and the comment rule itself.
noteleft= (/<!--/);
noteright = (/-->/);
#%Property% ,,note
note =$(space) $(noteleft) $(values0) $(noteright) $(space);


# One attribute: name = quoted value.
#%Property% ,$key,,$value
properties =$(space) $(name)  $(equal) $(property) $(space);

manyproperties0 =$(properties)*;

manyproperties =$(manyproperties0) | $(space);
leftbracket = (/</);
endleftbracket = (/<//);
rightbracket= (/>/);
endrightbracket = (//>/);


# Element content: nested elements or plain text. This refers to xmls, which is defined further down.
#%Property% children|text
child = $(xmls) $(shortspace0) | $(values) $(shortspace0);

# An element with content and a closing tag, a self-closing element, or a comment.
#%Order% 2
#%Property% ,,name,property,,child |,,name,property
xml = $(space) $(leftbracket) $(name) $(manyproperties) $(rightbracket)
$(child) $(space)  $(endleftbracket) $(name)  $(rightbracket) $(space)
 |  $(space) $(leftbracket) $(name) $(manyproperties) $(space)  $(endrightbracket)  $(space)
 |  $(note);

# One or more consecutive elements.
xmls= $(xml)+;
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ferventdesert/tnpy/b6cc5fe9599890c2bfdc10c6e608439f2555afb2/src/__init__.py


--------------------------------------------------------------------------------
/src/tngraph.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'zhaoyiming-laptop'
 2 | import pygraphviz as pgv
 3 | import os;
 4 | from src.tnpy import StringEntity as SE, RegexEntity as RE, TableEntity as TE, SequenceEntity as SQE, RepeatEntity as RPE, \
 5 |     EntityBase
 6 | os.environ["PATH"]= r'D:\Program Files\graphviz-2.38\release\bin;'+os.environ["PATH"];
 7 | # strict (no parallel edges)
 8 | # digraph
 9 | # with attribute rankdir set to 'LR'
10 | 
def addNode(A,entity,nodes):
    """Add ``entity`` and every entity reachable from it to graph ``A``.

    A            -- graph object exposing ``add_node`` and ``add_edge``.
    entity       -- root entity; its ``str()`` is used as the node name.
    nodes        -- dict mapping ``id(entity)`` to entities already expanded;
                    updated in place and shared across the whole traversal.

    Sequence entities contribute their match and rewrite children, table
    entities contribute their alternatives; any other entity is a leaf.
    """
    nodeid = id(entity)
    if nodeid in nodes:
        # Already expanded.  Returning here is what keeps recursive grammars
        # (a rule that reaches itself through its children) from recursing
        # forever; the caller still adds its own edge to this node.
        return
    name = str(entity)
    A.add_node(name)
    # Storing the entity keeps it alive, so its id() cannot be reused.
    nodes[nodeid] = entity

    if isinstance(entity, SQE):
        child_groups = (entity.MatchEntities, entity.RewriteEntities)
    elif isinstance(entity, TE):
        child_groups = (entity.Tables,)
    else:
        child_groups = ()

    for group in child_groups:
        for child in group:
            addNode(A, child, nodes)
            A.add_edge(name, str(child))
def buildGraph(tn,entityname):
    """Draw the entity graph rooted at ``tn.Entities[entityname]``.

    Prints the DOT source to standard output, then writes ``foo.dot`` and a
    rendered ``foo.png`` into the current working directory.
    """
    graph = pgv.AGraph(directed=True, strict=True)
    root = tn.Entities[entityname]
    addNode(graph, root, {})
    graph.graph_attr['epsilon'] = '0.001'
    print(graph.string())  # DOT source to standard output
    graph.write('foo.dot')
    graph.layout('dot')  # lay the graph out with the dot engine
    graph.draw('foo.png')  # render the laid-out graph to an image file
43 | 
44 | 


--------------------------------------------------------------------------------
/src/tnnlp.py:
--------------------------------------------------------------------------------
 1 | # encoding: UTF-8
 2 | 
 3 | from src.tnpy import StringEntity as SE, RegexEntity as RE, TableEntity as TE, SequenceEntity as SQE, RepeatEntity as RPE, \
 4 |     EntityBase
 5 | 
 6 | import jieba.posseg as pseg
 7 | 
 8 | 
 9 | wordlib={};
10 | 
11 | 
def initwordlib(path):
    """Load the word library at ``path`` into the module-level ``wordlib``.

    The file is UTF-8 text with one entry per line: a key followed by its
    words, separated by single spaces.  Each entry is stored as
    ``wordlib[key] = [word, ...]``; blank lines are skipped.
    """
    # io.open takes ``encoding=`` on both Python 2 and Python 3.  The previous
    # ``open(path, 'r', 'utf-8')`` passed the encoding as the buffering
    # argument, which raises TypeError.
    import io
    with io.open(path, 'r', encoding='utf-8') as read:
        for line in read:
            ws = line.strip().split(' ')
            name = ws[0]
            if not name:
                continue
            # Drop empty fragments produced by repeated spaces.
            wordlib[name] = [w.strip() for w in ws[1:] if w.strip()]
20 | 
21 | 
22 | 
class NEREntity(EntityBase):
    """Entity that matches a single jieba-segmented word by part-of-speech tag.

    pos    -- a tag string, a list of tags, or None to accept the first word.
    maxlen -- how many characters after ``start`` are segmented; -1 means
              everything up to ``end``.
    """

    def __init__(self, pos=None, maxlen=-1):
        super(NEREntity, self).__init__()
        if isinstance(pos, str):
            self.Pos = [pos]
        elif isinstance(pos, list):
            self.Pos = pos
        else:
            self.Pos = None

        self.Len = maxlen

    def RewriteItem(self, input):
        """Return ``input`` unchanged; this entity never rewrites."""
        return input

    def MatchItem(self, input, start, end, muststart, mode=None):
        """Find the first word in ``input[start:end]`` whose tag is accepted.

        Returns a ``MatchResult`` on success.  On failure returns an int, as
        the other entities do: ``int_max`` when no word qualifies, or the
        position of the qualifying word when ``muststart`` is set and that
        word does not begin at ``start``.
        """
        # The module only imports selected names from src.tnpy, so the names
        # needed here are imported locally instead of through an unbound
        # ``tnpy`` module reference.
        from src.tnpy import MatchResult, int_max

        self.LogIn(input, start, end)
        if end is None:
            end = len(input)
        if self.Len == -1:
            window = input[start:end]
        else:
            # Never segment past the caller's end bound.
            window = input[start:min(end, start + self.Len)]

        pos = start
        sword = None
        for word, flag in pseg.cut(window):
            if self.Pos is None or flag in self.Pos:
                sword = word
                break
            pos += len(word)

        if sword is None:
            self.LogOut(None)
            return int_max
        if muststart == True and pos != start:
            self.LogOut(None)
            return pos

        self.LogOut(sword)
        m = MatchResult(self, sword, pos)
        m.rstr = sword
        return m
60 | 
61 | 
62 | 
63 | 
class WordEntity(EntityBase):
    """Entity that matches any word stored in ``wordlib`` under a key prefix.

    name -- prefix; every ``wordlib`` key starting with it contributes its
            words to the alternation built by ``RebuildEntity``.
    """

    def __init__(self, name=None ):
        super(WordEntity, self).__init__()
        self.Word = name
        # Load the default library lazily the first time an entity is created.
        if not wordlib:
            initwordlib('libs/wordlib.txt')

    def RebuildEntity(self):
        """Build the regex entity that does the actual matching."""
        if not wordlib:
            print('please init word lib')
        words = [w
                 for key in wordlib if key.startswith(self.Word)
                 for w in wordlib[key]]
        # NOTE(review): the words are joined without regex escaping, so a
        # word containing regex metacharacters is read as a pattern - confirm
        # that this is intended for the word library.
        self.Re = RE('|'.join(words))
        self.Re.RebuildEntity()
        self.Re.Core = self.Core

    def RewriteItem(self, input):
        """Return ``input`` unchanged; this entity never rewrites."""
        return input

    def MatchItem(self, input, start, end, muststart, mode=None):
        """Delegate matching to the regex entity built by ``RebuildEntity``.

        The parameters follow the order used by ``EntityBase.MatchItem``.
        Positional callers are unaffected, because the arguments were already
        forwarded in this order under swapped names.
        """
        return self.Re.MatchItem(input, start, end, muststart, mode)


--------------------------------------------------------------------------------
/src/tnpy.py:
--------------------------------------------------------------------------------
   1 | ﻿# coding=utf-8
   2 | import re
   3 | 
   4 | import itertools;
   5 | 
   6 | int_max = 9999999;
   7 | 
   8 | def findany(iteral, func):
   9 |     for r in iteral:
  10 |         if func(r):
  11 |             return r;
  12 |     return None;
  13 | 
  14 | def getindex(iteral, func):
  15 |     for r in range(len(iteral)):
  16 |         if func(iteral[r]):
  17 |             return r;
  18 |     return -1;
  19 | 
  20 | 
  21 | def __GetPublicRoute(m):
  22 |     from collections import deque
  23 |     d = deque()
  24 |     route = []
  25 |     route.append(m.Entity.Order)
  26 |     d.append(m)
  27 |     while True:
  28 |         if len(d) == 0:
  29 |             break
  30 |         m = d.popleft()
  31 |         route.append(m.MatchIndex)
  32 |         m = m.Children
  33 |         while m is not None:
  34 |             d.append(m)
  35 |             m = m.NextMatch
  36 |     return route
  37 | 
  38 | 
  39 | class MatchResult(object):
  40 |     def __init__(self, entity, match, start, children=None, rstr=None):
  41 |         super(MatchResult, self).__init__()
  42 |         self.Order = 0
  43 |         self.MatchIndex = 0
  44 |         self.PropertyName = ""
  45 |         self.Children = children
  46 |         self.Entity = entity
  47 |         self.mstr = match
  48 |         self.rstr = rstr;
  49 |         self.pos = start
  50 |         self.IsShouldRewrite = None;
  51 |         self.CanSplit = False;
  52 | 
  53 |     def GetShouldRewrite(self):
  54 |         if self.IsShouldRewrite != None:
  55 |             return self.IsShouldRewrite;
  56 |         if self.Children is None:
  57 |             if self.Entity is None:
  58 |                 return False;
  59 |             if isinstance(self.Entity, ScriptEntity) == False and self.Entity.Rewrite is None:
  60 |                 return False;
  61 |             else:
  62 |                 return True;
  63 |         else:
  64 |             r = False;
  65 |             order = 0;
  66 |             ms = self.Children;
  67 |             for m in ms:
  68 |                 if order != m.Order:  # order diff must be rewrite
  69 |                     r = True;
  70 |                     break;
  71 |                 order += 1;
  72 |                 r |= m.GetShouldRewrite();
  73 |             self.IsShouldRewrite = r;
  74 |             return r;
  75 | 
  76 |     def RewriteItem(self):
  77 |         if self.rstr is not None:
  78 |             return self.rstr;
  79 |         if not self.IsShouldRewrite:
  80 |             self.rstr = self.mstr;
  81 | 
  82 |         if self.Children is None:
  83 |             self.rstr = self.Entity.RewriteItem(self.mstr)
  84 |             return self.rstr;
  85 | 
  86 |         match = self.Children[:];
  87 |         match = sorted(match, key=lambda m: m.Order);
  88 |         frstr = "";
  89 |         for m in match:
  90 |             frstr += m.RewriteItem();
  91 |         self.rstr = frstr;
  92 |         if isinstance(self.Entity, ScriptEntity):
  93 |             self.rstr = self.Entity.RewriteItem(self.Children);
  94 |         return self.rstr
  95 | 
  96 |     def __str__(self):
  97 |         return self.mstr;
  98 | 
  99 |     def ExtractDocument(self, document, mode=0):
 100 |         childDoc = {};
 101 |         if isinstance(self.Entity, RepeatEntity):
 102 |             childDoc0 = [];
 103 |             for m in self.Children:
 104 |                 m.ExtractDocument(childDoc0, 1)
 105 |             if len(self.PropertyName) != 0 and len(childDoc0) > 0:
 106 |                 document['$' + self.PropertyName] = childDoc0;
 107 |         elif self.Children is not None:
 108 |             for m in self.Children:
 109 |                 m.ExtractDocument(childDoc)
 110 |         if mode == 0:
 111 |             if len(self.PropertyName) != 0:
 112 |                 if self.PropertyName == '$value':
 113 |                     document[document['$key']] = self.RewriteItem();
 114 |                     del document['$key']
 115 |                 else:
 116 |                     if RegexCore.ExtractDictEnabled:
 117 |                         if len(childDoc) != 0:
 118 |                             document['$' + self.PropertyName] = childDoc
 119 |                     document[self.PropertyName] = self.RewriteItem();
 120 |             else:
 121 |                 if len(childDoc) != 0:
 122 |                     for r in childDoc:
 123 |                         document[r] = childDoc[r]
 124 |         else:
 125 |             document.append(childDoc);
 126 | 
 127 | 
 128 | class EntityBase(object):
 129 |     def __init__(self):
 130 |         self.Script = None
 131 |         self.Order = 0
 132 |         self.Name = ""
 133 |         self.Rule = ""
 134 |         self.Type = ""
 135 |         self.Core = None
 136 |         self.Start = False;
 137 | 
 138 |     def RewriteItem(self, input):
 139 |         m = self.MatchItem(input, 0, None, True);
 140 |         return m.RewriteItem();
 141 | 
 142 |     def RebuildEntity(self):
 143 |         pass;
 144 | 
 145 |     def SetValues(self, values):
 146 |         if isinstance(values, dict):
 147 |             value = values.get("Order", None);
 148 |             if value is not None:
 149 |                 self.Order = int(value);
 150 |             value = values.get("Type", None);
 151 |             if value is not None:
 152 |                 self.Type = value;
 153 |             value = values.get("Parameter", None);
 154 |             if value is None:
 155 |                 return;
 156 |             if value.find('|') >= 0:
 157 |                 return;
 158 |             va = value.split(',');
 159 |             for v in va:
 160 |                 vs = v.split('=');
 161 |                 if len(vs) != 2:
 162 |                     continue;
 163 |                 key, value = vs[0].strip(), vs[1].strip();
 164 |                 value = eval(value);
 165 |                 setattr(self, vs[0].strip(), value);
 166 | 
 167 |     def EvalScript(self, m, ot=None):
 168 |         if self.Script == u'':
 169 |             return True
 170 |         if ot is None:
 171 |             ot = m[0].mstr;
 172 |         core = self.Core
 173 | 
 174 |         def check(condition, result, elsework=None):
 175 |             if eval(condition):
 176 |                 r = eval(result);
 177 |                 return r;
 178 |             elif elsework is not None:
 179 |                 r = eval(elsework);
 180 |                 return r;
 181 | 
 182 |         def invoke(func, para):
 183 |             return eval(func)(para);
 184 | 
 185 |         def e(entityname):
 186 |             entity = self.Core.Entities[entityname]
 187 |             header = None
 188 |             header = entity.MatchItem(ot, 0, True, header)
 189 |             if not IsFail(header):
 190 |                 header = MatchResult(entity, None, -100)
 191 |                 return header
 192 |             return None
 193 | 
 194 |         def dist(name, i=0):
 195 |             header = e(name)
 196 |             if header is None:
 197 |                 return int_max;
 198 |             return abs(header.pos - m[i].pos)
 199 | 
 200 |         result = eval(self.Script)
 201 |         return result
 202 | 
 203 |     def LogIn(self, input, start, end=None):
 204 |         if self.Core.LogFile is None:
 205 |             return
 206 |         if self.Core.LogFile.name.find('htm') < 0:
 207 |             if end is not None:
 208 |                 end = start + 200;
 209 |             input = input[start: end].replace('\n', '<\\n>').replace('\r', '<\\r>');
 210 |             self.Core.LogFile.write(' ' * self.Core.matchLevel * 2)
 211 |             self.Core.LogFile.write('%s,Raw  =%s\r' % (str(self), input))
 212 |         else:
 213 |             self.Core.LogFile.write('<p>' + '&nbsp;' * self.Core.matchLevel * 4)
 214 |             self.Core.LogFile.write('%s,Raw= <font color="#FF0000">%s</font></p>\r' % (str(self), input))
 215 |         self.Core.matchLevel += 1
 216 | 
 217 |     def LogOut(self, match, buffered=False):
 218 |         if self.Core.LogFile is None:
 219 |             return
 220 |         self.Core.matchLevel -= 1
 221 |         if self.Core.LogFile.name.find('htm') < 0:
 222 |             self.Core.LogFile.write(' ' * self.Core.matchLevel * 2)
 223 |             if match is not None:
 224 |                 match = match[:200].replace('\n', '<\\n>').replace('\r', '<\\r>');
 225 |                 self.Core.LogFile.write('%s,%s=%s\r' % (str(self), ('Buff ' if buffered else 'Match'), match))
 226 |             else:
 227 |                 self.Core.LogFile.write('%s,NG\r' % str(self))
 228 |         else:
 229 | 
 230 |             self.Core.LogFile.write('<p>' + '&nbsp;' * self.Core.matchLevel * 4)
 231 |             if match != None:
 232 |                 self.Core.LogFile.write('%s,<b>OK</b>,Raw= <font color="#FF0000">%s</font></p>\r' % (str(self), match))
 233 |             else:
 234 |                 self.Core.LogFile.write('%s,<font color="#FF0000"><b>NG</b></font></p>\r' % str(self))
 235 | 
 236 |     def MatchItem(self, input, start, end, muststart, mode=None):
 237 |         return None;
 238 | 
 239 |     def GetName(self):
 240 |         name = self.Name if self.Name != "" else "unknown"
 241 |         return "%s,%s" % (name, findany(re.split("[,.']", str(type(self))),lambda d:d.find('Entity')>0).replace("Entity", ""))
 242 | 
 243 |     def __str__(self):
 244 |         return self.GetName()
 245 | 
 246 | 
 247 | class StringEntity(EntityBase):
 248 |     def __init__(self, match="", rewrite=None, condition=''):
 249 |         super(StringEntity, self).__init__()
 250 |         self.Match = match
 251 |         self.Rewrite = rewrite
 252 |         self.Condition = condition
 253 | 
 254 |     def RewriteItem(self, input):
 255 |         if None == self.Rewrite:
 256 |             return input
 257 |         return input.replace(self.Match, self.Rewrite);
 258 | 
 259 |     def SetValues(self, values):
 260 |         super(StringEntity, self).SetValues(values);
 261 |         if isinstance(values, dict):
 262 |             return;
 263 |         self.Match = values[0]
 264 |         if len(values) > 1:
 265 |             self.Rewrite = values[1]
 266 | 
 267 |     def MatchItem(self, input, start, end, muststart, mode=None):
 268 |         self.LogIn(input, start)
 269 |         if end is None:
 270 |             end = int_max;
 271 |         pos = input.find(self.Match, start, end)
 272 |         if pos < 0 or (muststart == True and pos != start):
 273 |             self.LogOut(None,False)
 274 |             return int_max if pos < 0 else pos;
 275 | 
 276 |         self.LogOut(self.Match)
 277 |         m = MatchResult(self, self.Match, pos)
 278 |         m.rstr = self.Match if self.Rewrite is None else self.Rewrite;
 279 |         return m;
 280 | 
 281 | 
class RepeatEntity(EntityBase):
    """Entity that matches another entity repeatedly (``*``, ``+``, ``?``, ``{m,n}``)."""

    def __init__(self, entity=None, least=1, most=1, equal=False):
        super(RepeatEntity, self).__init__()
        self.Least = least
        self.Most = most
        self.Entity = entity
        self.Equal = equal;

    # Splits a ``{least,most}`` quantifier on its braces and comma.
    __splitre = re.compile('[,{}]');

    def RebuildEntity(self):
        """Resolve an entity given by name and hand it this entity's core."""
        if isinstance(self.Entity, str):
            self.Entity = self.Core.Entities[self.Entity];
        self.Entity.Core = self.Core;
   

    def SetValues(self, values):
        """Set ``Least``/``Most`` from a quantifier string in ``values[0]``.

        Dict values are handled by the base class only.  A ``Most`` of -1 is
        replaced by 99999.
        """
        super(RepeatEntity, self).SetValues(values);
        if isinstance(values, dict):
            return
        cal = values[0];
        if cal == '*':
            self.Least = 0
            self.Most = -1
        elif cal == '+':
            self.Least = 1
            self.Most = -1
        elif cal == '?':
            self.Least = 0
            self.Most = 1
        elif cal.startswith('{'):
            # Both bounds are read, so the text must have the form {least,most}.
            sp = self.__splitre.split(cal);
            self.Least = int(sp[1])
            self.Most = int(sp[2])
        if self.Most == -1:
            self.Most = 99999;

    # NOTE(review): this signature has no ``end`` parameter, unlike
    # EntityBase.MatchItem and the other entities in this file, which take
    # (input, start, end, muststart, mode) - confirm how callers invoke it.
    def MatchItem(self, input, start, muststart, mode=None):
        """Match the inner entity repeatedly, starting at ``start``.

        Returns a ``MatchResult`` whose children are the accepted repetitions,
        or an int (the current ``start``) when fewer than ``Least``
        repetitions were accepted.
        """
        self.LogIn(input, start)
        right = 0  # number of repetitions accepted so far
        start = start
        lresult = None  # previously accepted repetition
        isStop = False;
        isReset = False;
        bestResults = [];

        omax = -1;  # the inner repeat's original Most, saved on the first retry
        while right < self.Most:
            # NOTE(review): ``muststart`` is passed in the slot that other
            # entities' MatchItem treat as ``end`` - confirm.
            result = self.Entity.MatchItem(input, start, muststart, None)
            if not IsFail(result):
                if right == 0:
                    # The first repetition fixes where the run begins.
                    start = result.pos
                    bestResults.append(result);
                else:
                    if self.Equal:
                        # Equal mode: each repetition must be adjacent to the
                        # previous one and have the same matched text.
                        if result.pos != start or lresult.mstr != result.mstr:
                            if not isinstance(self.Entity, RepeatEntity):
                                isStop = True;
                            else:
                                # Inner repeat: adjust its Most (starting from
                                # its Least, then growing by one) and restart
                                # counting from position 0.
                                if omax == -1:
                                    omax = self.Entity.Most;
                                    self.Entity.Most = self.Entity.Least;
                                else:
                                    self.Entity.Most += 1;
                                if self.Entity.Most >= omax:
                                    isStop = True;
                                right = 0;
                                start = 0;
                                isReset = True;
                        else:
                            bestResults.append(result)

                    elif result.pos != start:
                        # A gap before the next match ends the run.
                        isStop = True;
                    else:
                        bestResults.append(result)
                if isStop:
                    break
                if not isReset:
                    lresult = result;
                    # Continue right after the repetition just accepted.
                    start = result.pos + len(result.mstr);
                    lresult.Order = right;
                    right += 1
                isReset = False;

            else:
                break;
        if right < self.Least:
            self.LogOut(None,False)
            return start;
        pos = start
        # NOTE(review): input[start:start] is always the empty string and
        # ``pos`` is the position after the last repetition - confirm whether
        # the full matched span was intended.
        matchResultString = input[start:start]
        if bestResults == []:  # the ? or * case: zero repetitions are allowed
            # ``bestResult`` is not used after this point.
            bestResult = MatchResult(None, '', 0);
            bestResult.rstr = '';
        p = MatchResult(self, matchResultString, pos, bestResults)
        self.LogOut(matchResultString)
        return p;
 380 | 
 381 | 
 382 | class DiffEntity(EntityBase):
 383 |     def __init__(self, universe=None, complements=None):
 384 |         super(DiffEntity, self).__init__()
 385 |         self.Universe = universe
 386 |         self.Complements = complements if complements is not None else [];
 387 | 
 388 |     def RebuildEntity(self):
 389 |         if isinstance(self.Universe, str):
 390 |             self.Universe = self.Core.Entities[self.Universe];
 391 |         for r in range(len(self.Complements)):
 392 |             if isinstance(self.Complements[r], str):
 393 |                 self.Complements[r] = self.Core.Entities[self.Complements[r]];
 394 | 
 395 |     def MatchItem(self, input, start, end, muststart, mode=None):
 396 |         self.LogIn(input, start)
 397 |         unresult = self.Universe.MatchItem(input, start, end, muststart, None)
 398 |         if IsFail(unresult):
 399 |             self.LogOut(None)
 400 |             return unresult;
 401 |         matchResult = None
 402 |         if len(self.Complements) != 0:
 403 |             for en in self.Complements:
 404 |                 matchResult = en.MatchItem(unresult.mstr, 0, None, True, matchResult)
 405 |                 if IsFail(matchResult):
 406 |                     self.LogOut(None)
 407 |                     return unresult.pos;
 408 |         p = MatchResult(self, unresult.mstr, unresult.pos, [unresult])
 409 |         self.LogOut(unresult.mstr)
 410 |         return p;
 411 | 
 412 | 
 413 | class RegexEntity(EntityBase):
 414 |     def __init__(self, match="", rewrite=None):
 415 |         super(RegexEntity, self).__init__()
 416 |         self.Match = match
 417 |         self.Rewrite = rewrite
 418 |         self.regex = None
 419 |         self.merge = False;
 420 |         self.IsMatchMax = True;
 421 |         if self.Match != "":
 422 |             self.RebuildEntity();
 423 | 
 424 |     def RewriteItem(self, input):
 425 |         if self.Rewrite is None:
 426 |             return input
 427 |         m = self.regex.search(input);
 428 |         return self.__Replace(m, self.Rewrite)
 429 | 
 430 |     def RebuildEntity(self):
 431 |         if self.regex is None:
 432 |             try:
 433 |                 self.regex = re.compile(self.Match)
 434 |             except:
 435 |                 print("Regex Format error %s" % (self.Match));
 436 | 
 437 |     def SetValues(self, values):
 438 |         super(RegexEntity, self).SetValues(values);
 439 |         if isinstance(values, dict):
 440 |             return;
 441 |         self.Match = values[0]
 442 |         if len(values) > 1:
 443 |             if isinstance(values[1], str):
 444 |                 self.Rewrite = values[1]
 445 |             else:
 446 |                 self.merge = True;
 447 |                 self.maps = values[1];
 448 | 
 449 |         try:
 450 |             self.regex = re.compile(self.Match)
 451 |         except:
 452 |             print("Regex Format error %s" % (self.Match));
 453 | 
 454 |     def __Replace(self, m, string):
 455 |         if (m.lastindex != None):
 456 |             c = m.lastindex + 1;
 457 |         else:
 458 |             c = 1;
 459 |         for index in range(c):
 460 |             string = string.replace(u'$' + str(index), m.group(index)).replace('\\n', '\n');
 461 |         return string;
 462 | 
 463 |     def MatchItem(self, input, start, end, muststart, mode=None):
 464 |         self.LogIn(input, start)
 465 |         if end is None:
 466 |             if muststart:
 467 |                 m = self.regex.match(input, start);
 468 |             else:
 469 |                 m = self.regex.search(input, start);
 470 |         else:
 471 |             if muststart:
 472 |                 m = self.regex.match(input, start, end);
 473 |             else:
 474 |                 m = self.regex.search(input, start, end)
 475 |         if m is None or (muststart == True and m.start() != start):
 476 |             self.LogOut(None)
 477 |             return int_max if m is None else m.start();
 478 | 
 479 |         p = MatchResult(self, m.group(), m.start())
 480 |         if self.merge:
 481 | 
 482 |             p.rstr = self.maps[p.mstr].RewriteItem(p.mstr);
 483 |         elif self.Rewrite is None:
 484 |             p.rstr = p.mstr;
 485 |         else:
 486 |             p.rstr = self.__Replace(m, self.Rewrite);
 487 |         self.LogOut(m.group())
 488 |         return p;
 489 | 
 490 | 
class ScriptEntity(EntityBase):
    """Entity whose behaviour is a Python expression stored in ``Script``.

    SECURITY: the script is run with ``eval``; only load trusted rule files.
    """

    def __init__(self, script=""):
        super(ScriptEntity, self).__init__()
        self.Script = script

    def SetValues(self, values):
        """Take the script text from ``values[0]`` when ``values`` is a list."""
        super(ScriptEntity, self).SetValues(values);
        if isinstance(values, list):
            self.Script = values[0]

    def RewriteItem(self, match):
        """Evaluate the script against ``match`` and return the result as a string."""
        return str(self.EvalScript(match));

    def MatchItem(self, input, start, end, muststart, mode=None):
        """Evaluate the script and return its value as the match outcome."""
        # ``core`` and the parameters are locals that the evaluated script can
        # see, so none of them may be renamed.
        core = self.Core;
        return eval(self.Script);

    def MatchItem2(self, origin, rewritetarget, isrewrite=False):
        """Match against the text of ``rewritetarget``.

        With ``isrewrite`` set the whole text is taken as the match at
        position 0.  Otherwise the script result is searched for in the text,
        and None is returned when there is no result or it is not found.
        """
        input = rewritetarget.mstr;
        self.LogIn(input, rewritetarget.pos)
        if isrewrite:
            r = input;
            pos = 0;
        else:
            # NOTE(review): EvalScript as defined on EntityBase in this file
            # accepts (m, ot) only; this call passes three arguments - confirm.
            r = self.EvalScript(None, origin, input);
            if r is None:
                return None;
            pos = input.find(r);
            if pos < 0:
                return None;
        p = MatchResult(self, r, pos, rewritetarget)
        self.LogOut(r)
        return p;
 524 | 
 525 | 
 526 | def IsFail(x):
 527 |     if isinstance(x, int):
 528 |         return True;
 529 |     return False;
 530 | 
 531 | 
 532 | class TableEntity(EntityBase):
 533 |     def __init__(self, tables=None, groups=None):
 534 |         super(TableEntity, self).__init__()
 535 | 
 536 |         self.Tables = tables if tables is not None else [];
 537 |         self.Properties = {}
 538 | 
 539 |         self.Group = groups if groups is not None else [];
 540 |         self.IsMatchMax = True;
 541 | 
 542 |     def ReplaceEscapeChar(self, s):
 543 |         for c in list('+-*().'):
 544 |             s = s.replace(str(c), '\\' + str(c));
 545 |         return s;
 546 | 
 547 |     def RebuildEntity(self):
 548 |         for r in range(len(self.Tables)):
 549 |             if isinstance(self.Tables[r], str):
 550 |                 self.Tables[r] = self.Core.Entities[self.Tables[r]];
 551 |             if isinstance(self.Tables[r], SequenceEntity):
 552 |                 self.Tables[r].RebuildEntity();
 553 |             self.Tables[r].Core = self.Core;
 554 | 
 555 |         if not RegexCore.AutoMerge:
 556 |             return;
 557 | 
 558 |         seqs = [m for m in self.Tables if isinstance(m, StringEntity)];
 559 |         if len(seqs) < 2:
 560 |             return;
 561 |         ms = {};
 562 |         rex = RegexEntity();
 563 |         rex.Name = self.Name + "_merge";
 564 |         rex.Core = self.Core;
 565 |         match = "";
 566 | 
 567 |         for r in seqs:
 568 |             m = self.ReplaceEscapeChar(r.Match);
 569 |             ms[r.Match] = r;
 570 |             match += m + '|';
 571 |         match = match[:-1];
 572 | 
 573 |         rex.SetValues([match, ms]);
 574 |         for r in seqs:
 575 |             self.Tables.remove(r);
 576 |         self.Tables.append(rex);
 577 |         for t in self.Tables:
 578 |             t.Core = self.Core;
 579 |         return
 580 | 
 581 |     def SetValues(self, values):
 582 |         super(TableEntity, self).SetValues(values);
 583 |         if isinstance(values, list):
 584 |             return;
 585 |         value = values.get("Property", None);
 586 |         if value is not None:
 587 |             items = [x.strip() for x in value.split('|')]
 588 |             for i in range(min(len(items), len(self.Tables))):
 589 |                 str2 = [x.strip() for x in items[i].split(',')]
 590 |                 self.Properties[i] = str2
 591 | 
    def MatchItem(self, input, start, end, muststart, mode=None):
        """Try the alternatives in ``Tables`` and return the best match.

        The best match is the one with the smallest ``pos``; among matches
        at the same ``pos`` the longest wins when ``IsMatchMax`` is True,
        otherwise the shortest.

        :param input: text being matched.
        :param start: position handed to each alternative as its start.
        :param end: end bound handed to each alternative (may be None).
        :param muststart: forwarded to each alternative's ``MatchItem``.
        :param mode: optional object with ``MatchIndex`` and ``Children``;
            when ``MatchIndex`` is not -1 only that alternative is tried.
        :return: on success a new ``MatchResult`` owned by this table that
            wraps the winning result and carries its table index in
            ``MatchIndex``; otherwise ``rpos``, the smallest failure value
            reported by the alternatives (``int_max`` if none reported one).
        """
        self.LogIn(input, start)
        bestLen = -1
        bestSeqID = -1
        bestStart = int_max;
        rpos = bestStart;
        submode = None;
        # Shared scan/result cache for the current input.
        dictbuf = self.Core.Entities.SeqBuff
        bestMatchResult = None
        total = len(self.Tables)
        for seqid in range(total):
            # Group boundary: stop once an earlier alternative has already
            # matched exactly at ``start``.
            if seqid in self.Group and bestSeqID != -1 and bestStart == start:
                break;
            entity = self.Tables[seqid]
            # A mode with a fixed index restricts the search to that
            # alternative and supplies the mode for the nested match.
            if mode is not None and mode.MatchIndex != -1:
                if seqid != mode.MatchIndex:
                    continue
                submode = mode.Children;
            # Ask the cache first; None means "nothing cached, match now".
            seqValue = dictbuf.GetMatch(entity, input, start, end);
            if seqValue == -1:
                continue
            if seqValue is not None:
                theader = seqValue;
            else:
                theader = entity.MatchItem(input, start, end, muststart, submode)
                if IsFail(theader):
                    rpos = min(theader, rpos);
                    if not muststart:
                        dictbuf.AddScan(entity, start, start);
                    # NOTE(review): submode is only assigned while
                    # mode.MatchIndex != -1, so this branch looks unreachable
                    # unless the nested MatchItem call changes
                    # mode.MatchIndex - confirm the intended condition.
                    if submode is not None and mode.MatchIndex == -1:
                        mode.MatchIndex = -1;
                        submode = None;
                else:
                    # Remember both the scanned range and the result.
                    dictbuf.AddScan(entity, start, theader.pos);
                    dictbuf.AddEntity(entity, theader)
            # A cached value can be a failure value as well.
            if IsFail(theader):
                continue
            spos, slen = theader.pos, len(theader.mstr);
            # Earlier start wins; ties are decided by length.
            if (spos < bestStart or (
                            spos == bestStart and (slen > bestLen if self.IsMatchMax == True else slen < bestLen))):
                bestLen = slen
                bestStart = spos
                bestSeqID = seqid
                bestMatchResult = theader;

        if bestMatchResult is not None:
            if bestMatchResult.Children is not None:
                match = bestMatchResult.Children
            else:
                match = [bestMatchResult];

            # Properties is keyed by table index; SetValues fills keys
            # 0..n-1, which is what this length test relies on.
            if len(self.Properties) > bestSeqID:
                index = 0;
                # Name the leading results after the configured properties.
                for element in self.Properties[bestSeqID]:
                    match[index].PropertyName = element
                    index += 1;
                    if index >= len(match):
                        break
            bestMatchResult = MatchResult(self, bestMatchResult.mstr, bestMatchResult.pos, [bestMatchResult]);
            bestMatchResult.MatchIndex = bestSeqID;
            self.LogOut(bestMatchResult.mstr)
            return bestMatchResult;
        self.LogOut(None)
        return rpos;
 656 | 
 657 | 
 658 | def AddArea(sb, start, end):
 659 |     l = len(sb);
 660 |     insp = 0;
 661 |     if l == 0:
 662 |         sb.append(start);
 663 |         sb.append(end);
 664 |     else:
 665 |         left = 0;
 666 |         right = l - 2;
 667 |         while left <= right:
 668 |             mid = ((left + right) >> 2) << 1;
 669 |             if sb[mid] < start:
 670 |                 left = mid + 2
 671 |                 if right < left:
 672 |                     if l <= left:
 673 |                         sb.append(start)
 674 |                         sb.append(end)
 675 |                         insp = l
 676 |                     else:
 677 |                         sb.insert(left, start)
 678 |                         sb.insert(left + 1, end)
 679 |                         insp = left;
 680 |                         break
 681 |             elif sb[mid] > start:
 682 |                 right = mid - 2
 683 |                 if right < left:
 684 |                     sb.insert(left, start)
 685 |                     sb.insert(left + 1, end)
 686 |                     insp = left;
 687 |                     break
 688 |             else:
 689 |                 if sb[mid + 1] < end:
 690 |                     sb[mid + 1] = end
 691 |                 break
 692 | 
 693 |     l = len(sb);
 694 |     if insp > 2:
 695 |         i = insp - 2;
 696 |     else:
 697 |         i = 0;
 698 |     while i < l:
 699 |         pi = i - 2
 700 |         if pi < 0 or sb[pi + 1] < sb[i] - 1:
 701 |             pi += 2;
 702 | 
 703 |         else:
 704 |             if sb[pi + 1] <= sb[i + 1]:
 705 |                 sb[pi + 1] = sb[i + 1];
 706 |             del sb[pi + 2:pi + 4];
 707 |             l -= 2;
 708 |             i -= 2;
 709 |         i += 2;
 710 |     return sb;
 711 | 
 712 | 
 713 | class BuffHelper(object):
 714 |     def __init__(self, slen):
 715 |         self.scanbuf = {};
 716 |         self.entitybuf = {};
 717 |         self.slen = slen;
 718 |         self.extractedarea = [];
 719 | 
 720 |     def BinarySearchIndex(self, arr, v):
 721 |         l = len(arr);
 722 |         if l == 0:
 723 |             return 0;
 724 |         left = 0;
 725 |         right = l - 1;
 726 |         while left <= right:
 727 |             mid = (left + right) >> 1;
 728 |             if arr[mid] > v:
 729 |                 right = mid - 1;
 730 |             elif arr[mid] < v:
 731 |                 left = mid + 1;
 732 |             else:
 733 |                 break;
 734 |         if v < arr[mid]:
 735 |             return mid;
 736 |         elif v == arr[mid]:
 737 |             if mid % 2 == 0:
 738 |                 return mid + 1;
 739 |             else:
 740 |                 return mid;
 741 |         return mid + 1;
 742 | 
 743 |     def AddEntity(self, entity, matchResult):
 744 |         entityid = id(entity);
 745 |         sb = self.entitybuf.get(entityid, None);
 746 |         if sb is None:
 747 |             sb = [];
 748 |             self.entitybuf[entityid] = sb;
 749 |         lo = 0
 750 |         hi = len(sb)
 751 |         while lo < hi:
 752 |             mid = (lo + hi) // 2
 753 |             if matchResult.pos < sb[mid].pos:
 754 |                 hi = mid
 755 |             else:
 756 |                 lo = mid + 1
 757 |         sb.insert(lo, matchResult)
 758 | 
 759 |     def AddScan(self, entity, start, end=None):
 760 | 
 761 |         if entity != 0:
 762 |             entityid = id(entity);
 763 |             sb = self.scanbuf.get(entityid, None);
 764 |         else:
 765 |             sb = self.extractedarea;
 766 |         if sb is None:
 767 |             sb = [];
 768 |             self.scanbuf[entityid] = sb;
 769 |         if end is None:
 770 |             end = self.slen;
 771 |         AddArea(sb, start, end);
 772 | 
 773 |     def IsInExtractArea(self, pos):
 774 |         i = self.BinarySearchIndex(self.extractedarea, pos);
 775 |         if i % 2 == 0:
 776 |             return pos;
 777 |         return self.extractedarea[i];
 778 | 
 779 |     def GetMatch(self, entity, input, start, end):
 780 | 
 781 |         entityid = id(entity);
 782 |         sb = self.scanbuf.get(entityid, None);
 783 |         if sb is None:
 784 |             return None;
 785 |         i = self.BinarySearchIndex(sb, start);
 786 |         if i % 2 == 0:
 787 |             return None;
 788 |         start = start;
 789 |         end = sb[i]
 790 |         eb = self.entitybuf.get(entityid, None);
 791 |         if eb is None:
 792 |             entity.LogIn(input, start, end)
 793 |             entity.LogOut(None,True);
 794 |             return end;
 795 |         hi = len(eb)
 796 |         lo = 0;
 797 |         while lo < hi:
 798 |             mid = (lo + hi) // 2
 799 |             if start < eb[mid].pos:
 800 |                 hi = mid
 801 |             elif start == eb[mid].pos:
 802 |                 lo = mid;
 803 |                 break;
 804 |             else:
 805 |                 lo = mid + 1
 806 |         if lo >= len(eb):
 807 |             entity.LogIn(input, start, end)
 808 |             entity.LogOut(None);
 809 |             return None;
 810 |         if eb[lo].pos <= end:
 811 |             entity.LogIn(input, start, end)
 812 |             entity.LogOut(eb[lo].mstr, True)
 813 |             return eb[lo];
 814 |         return None;
 815 | 
 816 | 
 817 | class TreeNode(object):
 818 |     def __init__(self):
 819 |         self.Left = None;
 820 |         self.Right = None;
 821 |         self.Root = None;
 822 |         self.Match = None;
 823 |         self.Rewrite = None;
 824 |         self.Index = 0;
 825 |         self.Order = 0
 826 | 
 827 |     def GetLeft(self):
 828 |         tree = self;
 829 |         while tree.Left is not None:
 830 |             tree = tree.Left;
 831 |         return tree;
 832 | 
 833 |     def GetRight(self):
 834 |         tree = self;
 835 |         while tree.Right is not None:
 836 |             tree = tree.Right;
 837 |         return tree;
 838 | 
 839 |     def InOrderTravel(self, node, func):
 840 |         if node is None:
 841 |             return;
 842 |         self.InOrderTravel(node.Left, func);
 843 |         func(node);
 844 |         self.InOrderTravel(node.Right, func)
 845 | 
 846 | 
 847 | def IsSameValue(arr, l, r):
 848 |     if r < l + 2:
 849 |         return False;
 850 |     for i in range(l + 1, r):
 851 |         if arr[i] != arr[l]:
 852 |             return False;
 853 |     return True;
 854 | 
 855 | 
 856 | def GetMaxIndex(arr, l, r):
 857 |     max_value = -100;
 858 |     max_index = -1;
 859 |     for i in range(l, r):
 860 |         if arr[i] > max_value:
 861 |             max_index = i;
 862 |             max_value = arr[i];
 863 |     return max_index;
 864 | 
 865 | 
 866 | class SequenceEntity(EntityBase):
 867 |     def __init__(self, matchEntities=None, rewriteEntities=None, matchorders=None, rewriteOrders=None, condition=None):
 868 |         super(SequenceEntity, self).__init__()
 869 |         self.DirectReplace = "直接替换"
 870 |         self.MatchEntities = matchEntities if matchEntities is not None else [];
 871 |         self.RewriteEntities = rewriteEntities if rewriteEntities is not None else [];
 872 |         self.RewriteOrders = rewriteOrders if rewriteOrders is not None else [];
 873 |         self.MatchOrders = matchorders if matchorders is not None else None;
 874 |         self.Property = []
 875 |         self.Condition = condition;
 876 |         self.Root = None;
 877 | 
 878 |     def SetValues(self, values):
 879 |         super(SequenceEntity, self).SetValues(values);
 880 |         if isinstance(values, list):
 881 |             return;
 882 |         value = values.get("Property", None);
 883 |         if value is not None:
 884 |             self.Property = [x.strip() for x in value.split(',')]
 885 | 
 886 |     def BuildMatchTree(self, l, r):
 887 |         if l > r or l >= len(self.MatchOrders):
 888 |             return None;
 889 |         if r == l:
 890 |             tree = TreeNode();
 891 |             tree.Match = self.MatchEntities[l];
 892 |             return tree;
 893 |         if IsSameValue(self.MatchOrders, l, r):
 894 |             tb = TableEntity();
 895 |             tb.Core = self.Core;
 896 |             for item in itertools.combinations(self.MatchEntities[l:r], r - l + 1):
 897 |                 se = SequenceEntity(item);
 898 |                 se.Core = core;
 899 |                 tb.Tables.append(se);
 900 |             tree = TreeNode();
 901 |             tree.Match = tb;
 902 |             return tree;
 903 |         max_index = GetMaxIndex(self.MatchOrders, l, r)
 904 |         tree = TreeNode();
 905 |         tree.Order = self.MatchOrders[max_index];
 906 |         tree.Index = max_index;
 907 |         tree.Match = self.MatchEntities[max_index];
 908 |         if max_index < len(self.RewriteEntities):
 909 |             tree.Rewrite = self.RewriteEntities[self.RewriteOrders[max_index]]
 910 |         tree.Left = self.BuildMatchTree(l, max_index - 1);
 911 |         if tree.Left is not None:
 912 |             tree.Left.Root = tree;
 913 |         tree.Right = self.BuildMatchTree(max_index + 1, r);
 914 |         if tree.Right is not None:
 915 |             tree.Right.Root = tree;
 916 |         return tree;
 917 | 
 918 |     def RebuildEntity(self):
 919 |         for r in range(len(self.MatchEntities)):
 920 |             if isinstance(self.MatchEntities[r], str):
 921 |                 self.MatchEntities[r] = self.Core.Entities[self.MatchEntities[r]];
 922 |             self.MatchEntities[r].Core = self.Core;
 923 |         for r in range(len(self.RewriteEntities)):
 924 |             if isinstance(self.RewriteEntities[r], str):
 925 |                 self.RewriteEntities[r] = self.Core.Entities[self.RewriteEntities[r]];
 926 |             self.RewriteEntities[r].Core = self.Core;
 927 |         if self.MatchOrders is None:
 928 |             self.MatchOrders = [i for i in range(len(self.MatchEntities), 0, -1)];
 929 | 
 930 |         self.Tree = self.BuildMatchTree(0, len(self.MatchOrders));
 931 | 
 932 |     def TreeNodeMatch(self, treenode, input, start, end, finalmatchScript, muststart=False):
 933 |         dictbuf = self.Core.Entities.SeqBuff
 934 |         matchEntity = treenode.Match;
 935 |         matchResult = dictbuf.GetMatch(matchEntity, input, start, end);
 936 |         fail = False;
 937 |         if matchResult is None:
 938 |             matchResult = matchEntity.MatchItem(input, start, end, treenode.Left is None and muststart)
 939 |             if not IsFail(matchResult):
 940 |                 dictbuf.AddScan(matchEntity, start, matchResult.pos);
 941 |                 dictbuf.AddEntity(matchEntity, matchResult);
 942 |         if not IsFail(matchResult):
 943 |             if treenode.Right is None and end is not None and matchResult.pos + len(matchResult.mstr) != end:
 944 |                 fail = True;
 945 |             if not finalmatchScript:
 946 |                 rewriteEntity = treenode.Rewrite;
 947 |             if rewriteEntity is not None and rewriteEntity.Name != self.DirectReplace:
 948 |                 if isinstance(rewriteEntity, ScriptEntity):
 949 |                     matchResult = rewriteEntity.MatchItem2(input, matchResult, True)
 950 |                     if matchResult is None:
 951 |                         fail = True;
 952 |                 else:
 953 |                     matchResult.rstr = rewriteEntity.RewriteItem(matchResult.mstr)
 954 |         if not fail and not IsFail(matchResult):
 955 |             mleft = matchResult.pos;
 956 |             mright = mleft + len(matchResult.mstr);
 957 |             runtree = TreeNode();
 958 |             runtree.Match = matchResult;
 959 |             if treenode.Left is not None:
 960 |                 left = self.TreeNodeMatch(treenode.Left, input, start, matchResult.pos, finalmatchScript);
 961 |                 if IsFail(left):
 962 |                     fail = True;
 963 |                 elif muststart == True and left.GetLeft().Match.pos != start:
 964 |                     fail = True;
 965 |                 else:
 966 |                     rm = left.GetRight().Match;
 967 |                     if rm.pos + len(rm.mstr) != mleft:
 968 |                         fail = True;
 969 |                 runtree.Left = left;
 970 |             if not fail and treenode.Right is not None:
 971 |                 right = self.TreeNodeMatch(treenode.Right, input, matchResult.pos + len(matchResult.mstr), end,
 972 |                                            finalmatchScript);
 973 |                 if IsFail(right):
 974 |                     fail = True;
 975 |                 elif right.GetLeft().Match.pos != mright:
 976 |                     fail = True;
 977 |                 elif end is not None:
 978 |                     rright = right.GetRight().Match;
 979 |                     rpos = rright.pos + len(rright.mstr);
 980 |                     if rpos != end:
 981 |                         fail = True;
 982 | 
 983 |                 runtree.Right = right;
 984 |         if IsFail(matchResult):
 985 |             return matchResult;
 986 |         elif fail:
 987 |             return matchResult.pos + len(matchResult.mstr);
 988 |         return runtree;
 989 | 
 990 |     def MatchItem(self, input, start, end, muststart, mode=None):
 991 |         self.LogIn(input, start)
 992 |         finalmatchScript = False;
 993 |         if len(self.MatchEntities) > 1 and len(self.RewriteEntities) == 1 and isinstance(self.RewriteEntities[0],
 994 |                                                                                          ScriptEntity):
 995 |             finalmatchScript = True;
 996 |         treeResult = self.TreeNodeMatch(self.Tree, input, start, end, finalmatchScript, muststart);
 997 |         if IsFail(treeResult):
 998 |             self.LogOut(None)
 999 |             return treeResult;
1000 |         matchResults = [];
1001 |         treeResult.InOrderTravel(treeResult, lambda m: matchResults.append(m.Match));
1002 |         for i in range(len(matchResults)):
1003 |             if i < len(self.Property):
1004 |                 matchResults[i].PropertyName = self.Property[i]
1005 |             if i < len(self.RewriteOrders):
1006 |                 matchResults[i].Order = self.RewriteOrders[i]
1007 |             else:
1008 |                 matchResults[i].Order = i
1009 |         if self.Condition is not None and self.Condition.EvalScript(matchResults, input) == False:
1010 |             self.LogOut(None)
1011 |             return start;
1012 |         start = matchResults[0].pos;
1013 |         sum = 0;
1014 |         for i in range(0, len(matchResults)):
1015 |             sum += len(matchResults[i].mstr);
1016 |         mstring = input[start:start + sum];
1017 |         if finalmatchScript:
1018 |             script = self.RewriteEntities[0];
1019 |             p = MatchResult(script, mstring, start, matchResults);
1020 |             return p;
1021 |         if len(matchResults) > 1:
1022 |             p = MatchResult(self, mstring, start, matchResults)
1023 |         else:
1024 |             p = matchResults[0];
1025 |         self.LogOut(mstring,False)
1026 |         return p;
1027 | 
1028 | 
class Entities(object):
    """Registry of the entities known to one ``RegexCore``.

    ``AllEntities`` holds every registered entity, ``EntityNames`` indexes
    them by name, ``ValidEntities`` holds the ones passed to ``appendids``
    and ``EntityIds`` indexes those by ``Order``.
    """

    def __init__(self):
        super(Entities, self).__init__()
        self.AllEntities = []
        self.ValidEntities = []
        self.EntityNames = {}
        self.EntityIds = {}
        # NOTE(review): other code reads a ``SeqBuff`` attribute from this
        # object; it is not created here, so it must be assigned elsewhere.

    def appendids(self, entity):
        """Add ``entity`` to ``ValidEntities`` and, unless its ``Order`` is
        -1, index it by ``Order``."""
        if -1 != entity.Order:
            self.EntityIds[entity.Order] = entity
        self.ValidEntities.append(entity)

    def append(self, entity):
        """Register ``entity``; it is indexed by name when it has one."""
        if entity.Name is not None:
            self.EntityNames[entity.Name] = entity
        self.AllEntities.append(entity)

    def __getitem__(self, item):
        """Return the entity called ``item``.

        The name index is consulted first; entities that were renamed after
        registration are still found by a scan over ``AllEntities``.

        :return: the entity, or None (after printing a message) when no
            entity has that name.
        """
        if item in self.EntityNames:
            return self.EntityNames[item]
        for entity in self.AllEntities:
            if entity.Name == item:
                return entity
        # Only complain once both lookups have failed.
        print("Entity name %s can not be found!" % (item,))
        return None
1056 | 
1057 | 
class Token:
    """Kinds of lexical tokens produced while reading a rule."""
    NAME = 0
    ENTITY = 1
    MINUS = 2
    COLON = 3
    END = 4
    BAR = 5
    REPEAT = 6
    Script = 7
1060 | 
1061 | 
class RegexToken:
    """One entry of the rule lexer table."""

    def __init__(self, regex, token, count, type=None):
        """
        :param regex: compiled pattern recognising the token.
        :param token: the ``Token`` kind produced on a match.
        :param count: the lexer accepts a match only when it has at least
            ``count - 1`` groups.
        :param type: entity class instantiated for the token, if any.
        """
        self.Regex, self.Token = regex, token
        self.Count, self.EntityType = count, type
1068 | 
1069 | 
class TokenItem:
    """A token read from a rule, together with what the lexer captured."""

    def __init__(self, token):
        """:param token: the ``Token`` kind of this item."""
        self.Token = token
        # Filled by the lexer: matched text, created/referenced entity and
        # the captured groups.
        self.Rule = self.Entity = None
        self.Values = list()
1076 | 
1077 | 
class RegexCore(object):
    """Rule engine: turns rule text into entities and keeps their registry."""
    # Read by MatchEntity: when a match guided by a mode fails at position 0
    # and this flag is set, the match is retried without the mode.
    AutoModeStudy = True
    # NOTE(review): not read in the part of the file reviewed here.
    ExtractDictEnabled = False
    # NOTE(review): presumably the destination of the match log - confirm.
    LogFile = None
    # NOTE(review): presumably the current nesting depth of logged matches -
    # confirm against LogIn / LogOut.
    matchLevel = 0
    # Read by TableEntity.RebuildEntity: when set, literal alternatives of a
    # table are merged into a single regular-expression entity.
    AutoMerge = True
    # NOTE(review): not read in the part of the file reviewed here.
    MatchAllEntity = False;
1085 | 
1086 |     def __init__(self, rule=None):
1087 |         super(RegexCore, self).__init__()
1088 |         self.Entities = None
1089 |         self.__entity_name = re.compile(r"^(\w+)\s*=\s*")
1090 |         self.__entity_reexp = re.compile(r"^\(/((?:.(?!/\)))*?)/\s*:\s*/((?:(?!\(/).)*?)/\)\s*")
1091 |         self.__entity_string = re.compile("^\(\"((?:(?!\"\)).)*?)\"\s*:\s*\"(.*?)\"(?:\s*:\s*\"(.*?)\")?\s*\)\s*")
1092 |         self.__numbeRegex = re.compile(r"[0-9]+")
1093 |         self.__r_bar = re.compile(r"^\s*\|\s*")
1094 |         self.__r_bar2 = re.compile(r"^\s*/\s*")
1095 |         self.__r_colon = re.compile(r"^\s*:\s*")
1096 |         self.__r_conds = re.compile(r'^\s*\"([^"]*)\"')
1097 |         self.__r_entity = re.compile("^\$\((\w+)\)\s*")
1098 |         self.__r_minus = re.compile(r"^\s*-\s*")
1099 |         self.__r_order = re.compile(r"^\$0*(\d+)\s*")
1100 |         self.__r_reexp = re.compile(r"^\(/((?:.(?!/\s*:\s*/))*?)/\)\s*")
1101 |         self.__r_repeat = re.compile(r"^\s*([*?+]|{(\d+),(-1|\d+)})\s*")
1102 |         self.__r_semicolon = re.compile(r"^\s*;")
1103 |         self.__r_string = re.compile(r"^\(\s?\"(.*?)\"\s?\)\s*")
1104 |         self.tnFileName = None
1105 |         self.Entities = Entities()
1106 |         self.Entities.AllEntities = []
1107 |         self.Entities.ValidEntities = []
1108 |         if rule is not None:
1109 |             self.InitTNRule(rule);
1110 | 
1111 |     def InitPyRule(self, pyrule):
1112 |         for r in pyrule.__dict__:
1113 |             s = getattr(pyrule, r);
1114 |             if isinstance(s, EntityBase):
1115 |                 s.Core = self;
1116 |                 s.Name = r;
1117 |                 self.Entities.append(s);
1118 |                 if s.Order != 0:
1119 |                     self.Entities.appendids(s);
1120 |         for entity in self.Entities.AllEntities:
1121 |             entity.RebuildEntity();
1122 |     def WriteHTMLHeader(file):
1123 |         file.write('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><title>{title}}</title></head><body>''');
1124 | 
1125 |     def WriteHTMLEnd(file):
1126 |         file.write('''</body></html>''');
1127 | 
1128 | 
1129 |     def ToHTML(self, newfile):
1130 |         file_object = open(self.tnFileName, 'r', 'utf-8')
1131 |         newfile = open(newfile, 'w', 'utf-8')
1132 |         text = file_object.read()
1133 |         texts = text.split('\n')
1134 |         r_entity = re.compile(r"\$\((\w+)\)\s*")
1135 |         self.WriteHTMLHeader(newfile)
1136 |         for t in texts:
1137 |             m = self.__entity_name.match(t)
1138 |             if m != None:
1139 |                 mt = m.group(1)
1140 |                 t = t.replace(mt, '<a name="%s"><b>%s</b></a>' % (mt, mt), 1)
1141 |             m = r_entity.findall(t)
1142 |             if len(m) > 0:
1143 |                 for mt in m:
1144 |                     t = t.replace(mt, '<a href="#%s">%s</a>' % (mt, mt))
1145 |             if t.startswith("#"):
1146 |                 newfile.write('<p><font color="#909090">%s</font></p>\n' % t)
1147 |             else:
1148 |                 newfile.write('<p>%s</p>\n' % t)
1149 | 
1150 |         self.WriteHTMLEnd(newfile)
1151 |         newfile.close()
1152 |         file_object.close()
1153 | 
1154 |     def InitRuleText(self, text, addtoOrder=True):
1155 |         propertyregex = re.compile("#%(\w+)%\s(.+)")
1156 |         tokenRegex = [RegexToken(self.__entity_name, Token.NAME, 2),
1157 |                       RegexToken(self.__entity_reexp, Token.ENTITY, 3, RegexEntity),
1158 |                       RegexToken(self.__entity_string, Token.ENTITY, 3, StringEntity),
1159 |                       RegexToken(self.__r_reexp, Token.ENTITY, 1, RegexEntity),
1160 |                       RegexToken(self.__r_order, Token.ENTITY, 1),
1161 |                       RegexToken(self.__r_entity, Token.ENTITY, 2),
1162 |                       RegexToken(self.__r_string, Token.ENTITY, 2, StringEntity),
1163 |                       RegexToken(self.__r_minus, Token.MINUS, 2),
1164 |                       RegexToken(self.__r_colon, Token.COLON, 1),
1165 |                       RegexToken(self.__r_repeat, Token.REPEAT, 2),
1166 |                       RegexToken(self.__r_bar, Token.BAR, 1),
1167 |                       RegexToken(self.__r_bar2, Token.BAR, 1),
1168 |                       RegexToken(self.__r_semicolon, Token.END, 1),
1169 |                       RegexToken(self.__r_conds, Token.ENTITY, 1, ScriptEntity),
1170 |                       ]
1171 |         sb = ""
1172 |         realRules = []
1173 |         rules = [x.strip() for x in text.split('\n')]  # PreProcessing
1174 |         for rule in rules:
1175 |             if propertyregex.match(rule):
1176 |                 realRules.append(rule.strip())
1177 |                 continue
1178 |             if rule.startswith(u"#"):
1179 |                 sb = ""
1180 |                 continue
1181 |             if rule.endswith(';'):
1182 |                 sb += rule
1183 |                 realRules.append(sb.strip())
1184 |                 sb = ""
1185 |                 continue
1186 |             else:
1187 |                 sb += rule
1188 |         properties = {};
1189 |         for rule in realRules:
1190 |             m = propertyregex.match(rule);
1191 |             if m is not None:
1192 |                 if m.lastindex is not None and m.lastindex == 2:
1193 |                     name, value = m.group(1), m.group(2);
1194 |                     if name == "Script":
1195 |                         item = __import__(value)
1196 |                         setattr(self, value, item)
1197 |                     elif name == "Include":
1198 |                         value = value.split(' ');
1199 |                         isadd = False;
1200 |                         if len(value) > 1:
1201 |                             isadd = value[1] == "True";
1202 |                         self.InitTNRule(value[0], isadd);
1203 |                     else:
1204 |                         properties[m.group(1)] = m.group(2);
1205 |                 continue
1206 | 
1207 |             tokenItems = []  # Lexical Analyse
1208 |             while True:
1209 |                 if len(rule) == 0:  break
1210 |                 canmatch = False;
1211 |                 for token in tokenRegex:
1212 |                     mat = token.Regex.match(rule)
1213 |                     if mat is None: continue
1214 | 
1215 |                     mcount = mat.lastindex if mat.lastindex is not None else 1;
1216 |                     if mcount < token.Count - 1:
1217 |                         continue
1218 |                     canmatch = True;
1219 |                     tokenItem = TokenItem(token.Token)
1220 |                     for r in range(mcount):
1221 |                         tokenItem.Values.append(mat.string if mat.lastindex is None else mat.group(r + 1))
1222 |                     tokenItem.Rule = mat.group(0)
1223 | 
1224 |                     e = None
1225 |                     if token.EntityType is not None:
1226 |                         e = token.EntityType()
1227 |                         e.Core = self
1228 |                         e.SetValues(tokenItem.Values)
1229 |                     elif token.Regex == self.__r_entity:
1230 |                         e = tokenItem.Values[0]
1231 |                     if e is not None:
1232 |                         tokenItem.Entity = e
1233 |                     rule = rule[len(tokenItem.Rule):]
1234 |                     tokenItems.append(tokenItem)
1235 |                     break
1236 |                 if not canmatch:
1237 |                     print("rule format error%s" % (rule));
1238 |                     return;
1239 | 
1240 |             if Token.NAME != tokenItems[0].Token:  # Grammer Analyse
1241 |                 print("name must be the first")
1242 |             if tokenItems[-1].Token != Token.END:
1243 |                 print("Rule must be ended by ;")
1244 | 
1245 |             if findany(tokenItems, lambda r: r.Token == Token.BAR):
1246 |                 entity = TableEntity()
1247 |                 entity.Core = self;
1248 | 
1249 |                 lastid = 0
1250 |                 for id in range(1, len(tokenItems)):
1251 |                     if tokenItems[id].Token == Token.BAR or tokenItems[id].Token == Token.END:
1252 |                         tentity = self.__GetNonTableEntity(tokenItems[lastid + 1:id], isOnlyOne=False);
1253 |                         if isinstance(tentity, EntityBase) and tentity.Name == "":
1254 |                             tentity.Name = "%s_%d" % (tokenItems[0].Values[0], len(entity.Tables));
1255 |                         entity.Tables.append(tentity);
1256 |                         lastid = id
1257 |                         if tokenItems[id].Rule.find("/") == 0:
1258 |                             entity.Group.append(len(entity.Tables))
1259 |             else:
1260 |                 entity = self.__GetNonTableEntity(tokenItems[1:-1], isOnlyOne=True)
1261 |             entity.Name = tokenItems[0].Values[0]
1262 | 
1263 |             entity.SetValues(properties);
1264 |             if entity.Order != 0 and addtoOrder:
1265 |                 self.Entities.appendids(entity);
1266 |             properties = {};
1267 |             self.Entities.append(entity)
1268 |         # rebuild reference
1269 |         for entity in self.Entities.AllEntities:
1270 |             entity.RebuildEntity();
1271 | 
1272 |         self.Entities.ValidEntities = sorted(self.Entities.ValidEntities, key=lambda x: x.Order)
1273 | 
1274 |     def __GetNonTableEntity(self, tokenItems, isOnlyOne):
1275 |         repeat = getindex(tokenItems, lambda r: r.Token == Token.REPEAT)
1276 |         if repeat < 0:
1277 |             pass
1278 |         elif repeat != 1:
1279 |             raise "repeat format error"
1280 |         else:
1281 |             entity = RepeatEntity()
1282 |             entity.Core = self
1283 |             entity.Entity = tokenItems[0].Entity
1284 |             entity.SetValues(tokenItems[1].Values)
1285 |             return entity
1286 |         minus = getindex(tokenItems, lambda r: r.Token == Token.MINUS)
1287 |         if minus < 0:
1288 |             pass
1289 |         elif minus != 1:
1290 |             raise "diff format error"
1291 |         else:
1292 |             entity = DiffEntity()
1293 |             entity.Core = self
1294 |             entity.Universe = tokenItems[0].Entity
1295 |             id = 1
1296 |             while id < len(tokenItems):
1297 |                 tokenItem = tokenItems[id]
1298 |                 if tokenItem.Token == Token.END:
1299 |                     return entity
1300 |                 if tokenItem.Token == Token.MINUS:
1301 |                     entity.Complements.append(tokenItems[id + 1].Entity)
1302 |                 id += 1
1303 |             return entity
1304 |         if len(tokenItems) == 1:
1305 |             if not isOnlyOne:
1306 |                 return tokenItems[0].Entity
1307 |             if isinstance(tokenItems[0].Entity, EntityBase) and tokenItems[0].Entity.Name == "":
1308 |                 return tokenItems[0].Entity
1309 |         entity = SequenceEntity()
1310 |         entity.Core = self;
1311 |         state = 0
1312 |         for id in range(len(tokenItems)):
1313 |             tokenItem = tokenItems[id]
1314 |             if tokenItem.Token == Token.END:
1315 |                 return entity
1316 |             if tokenItem.Token == Token.COLON:
1317 |                 state += 1
1318 |                 continue
1319 |             if state == 0:
1320 |                 entity.MatchEntities.append(tokenItem.Entity)
1321 |             elif state == 1:
1322 |                 if tokenItem.Entity is None:
1323 |                     entity.RewriteOrders.append(int(tokenItem.Rule.replace("$", "")) - 1)
1324 |                 else:
1325 |                     entity.RewriteEntities.append(tokenItem.Entity)
1326 |                     entity.RewriteOrders.append(len(entity.RewriteOrders))
1327 |             else:
1328 |                 entity.Condition = tokenItem.Entity
1329 |         return entity
1330 | 
1331 |     def InitTNRule(self, myfile, addtoOrder=True):
1332 |         self.tnFileName = myfile
1333 |         file_object = open(myfile, 'r', encoding='utf-8')
1334 |         texts = file_object.read()
1335 |         print("success load tn rules:%s" % (myfile))
1336 |         self.InitRuleText(texts, addtoOrder)
1337 |         file_object.close()
1338 | 
1339 |     def MatchEntity(self, entity, input, mode=None):
1340 |         startPos = 0
1341 |         matchResults = [];
1342 |         inputlen = len(input)
1343 |         while (1):
1344 |             if startPos >= inputlen:
1345 |                 break
1346 |             matchResult = entity.MatchItem(input, startPos, None, entity.Start, mode)
1347 |             if IsFail(matchResult):
1348 |                 if mode is not None and startPos == 0 and RegexCore.AutoModeStudy:
1349 |                     matchResult = entity.MatchItem(input, startPos, entity.Start)
1350 |                     if matchResult is not None:
1351 |                         self.__GetPublicTree(mode, matchResult)
1352 |                     else:
1353 |                         startPos = matchResult;
1354 |                 break
1355 | 
1356 |             startPos = matchResult.pos + len(matchResult.mstr);
1357 |             matchResults.append(matchResult);
1358 |         return matchResults;
1359 | 
1360 |     def RewriteEntity(self, entity, input, mode=None):
1361 |         matchResults = self.MatchEntity(entity, input, mode);
1362 |         if len(matchResults) == 0:
1363 |             return input, False;
1364 |         else:
1365 |             pos = 0;
1366 |             rewrite = "";
1367 |             for m in matchResults:
1368 |                 m.GetShouldRewrite();
1369 |                 m.RewriteItem()
1370 |                 rewrite += input[pos:m.pos] + m.rstr;
1371 |                 pos = m.pos + len(m.mstr);
1372 |             rewrite += input[pos:];
1373 |             return rewrite, True;
1374 | 
1375 |     def __GetPublicTree(self, item1, item2):
1376 |         if item1 is None:
1377 |             return item2
1378 |         stack1 = []
1379 |         stack2 = []
1380 |         stack1.append(item1)
1381 |         stack2.append(item2)
1382 |         while len(stack1) > 0:
1383 |             m1 = stack1.pop()
1384 |             m2 = stack2.pop()
1385 |             if m1.MatchIndex != m2.MatchIndex:
1386 |                 m1.MatchIndex = -1
1387 |                 continue
1388 |             if isinstance(m1.Children, EntityBase):
1389 |                 continue
1390 |             m1 = m1.Children
1391 |             m2 = m2.Children
1392 |             while m1 != None:
1393 |                 stack1.append(m1)
1394 |                 stack2.append(m2)
1395 |                 m1 = m1.NextMatch
1396 |                 m2 = m2.NextMatch
1397 |         return item1
1398 | 
1399 |     def CompileString(self, input, modes):
1400 | 
1401 |         startPos = 0
1402 |         while (1):
1403 |             if startPos >= len(input):
1404 |                 break
1405 |             modeindex = -1;
1406 |             matchResult = None;
1407 |             issuccess = False;
1408 |             if modes is not None:
1409 |                 for index in range(0, len(modes)):
1410 |                     mode = modes[index];
1411 |                     matchResult = mode.Entity.MatchItem(input, startPos, entity.Start, mode);
1412 |                     if matchResult is not None:
1413 |                         modeindex = index;
1414 |                         issuccess = True;
1415 |                         break;
1416 |             if not issuccess:
1417 |                 for entity in self.Entities.ValidEntities:
1418 |                     matchResult = entity.MatchItem(input, startPos, None, entity.Start)
1419 |                     if matchResult is not None:
1420 |                         if modes is None:
1421 |                             modes = [];
1422 |                         modes.append(matchResult);
1423 |                         break;
1424 |             if matchResult is None:
1425 |                 return modes;
1426 |             if modes is not None and modeindex != -1:
1427 |                 modes[modeindex] = self.__GetPublicTree(matchResult, modes[modeindex]);
1428 |             startPos += len(matchResult.mstr);
1429 |         return modes
1430 | 
1431 |     def Compile(self, texts):
1432 |         modes = None;
1433 |         for text in texts:
1434 |             modes = self.CompileString(text, modes);
1435 |         return modes;
1436 | 
1437 |     def Rewrite(self, rawinput, mode=None):
1438 | 
1439 |         if mode is not None:
1440 |             self.Entities.SeqBuff = BuffHelper(len(rawinput));
1441 |             return self.RewriteEntity(mode.Entity, rawinput, mode)
1442 |         else:
1443 |             self.Entities.SeqBuff = BuffHelper(len(rawinput));
1444 |             for entity in self.Entities.ValidEntities:
1445 |                 rewrite, succ = self.RewriteEntity(entity, rawinput, None)
1446 |                 if RegexCore.MatchAllEntity == False and succ == True:
1447 |                     return rewrite;
1448 |                 if rewrite != rawinput:
1449 |                     rawinput = rewrite;
1450 |                     self.Entities.SeqBuff = BuffHelper(len(rawinput));
1451 |             return rewrite
1452 |     def Match(self, rawinput, mode=None):
1453 | 
1454 |         if mode is not None:
1455 |             self.Entities.SeqBuff = BuffHelper(len(rawinput));
1456 |             return self.MatchEntity(mode.Entity, rawinput, mode)
1457 |         else:
1458 |             self.Entities.SeqBuff = BuffHelper(len(rawinput));
1459 |             for entity in self.Entities.ValidEntities:
1460 |                 match = self.MatchEntity(entity, rawinput, None)
1461 |                 if not RegexCore.MatchAllEntity:
1462 |                     return match;
1463 |             return None;
1464 | 
1465 |     def __MatchResult2Doc__(self, matchResult):
1466 |         docu = {};
1467 |         matchResult.RewriteItem();
1468 |         matchResult.ExtractDocument(docu, 0);
1469 |         docu['#type'] = matchResult.Entity.Name;
1470 |         docu['#pos'] = matchResult.pos;
1471 |         docu['#match'] = matchResult.mstr;
1472 |         docu['#rewrite'] = matchResult.rstr;
1473 |         return docu
1474 | 
1475 |     def ExtractEntity(self, entity, input, mode=None):
1476 |         start = 0
1477 |         docs = [];
1478 |         buffhelper = self.Entities.SeqBuff;
1479 |         inputlen = len(input)
1480 |         while (1):
1481 |             if start >= inputlen:
1482 |                 break;
1483 |             start = buffhelper.IsInExtractArea(start);
1484 |             matchResult = buffhelper.GetMatch(entity, input, start, None)
1485 | 
1486 |             if matchResult is None:
1487 |                 matchResult = entity.MatchItem(input, start, None, entity.Start, mode)
1488 | 
1489 |             if IsFail(matchResult):
1490 |                 if matchResult == start:
1491 |                     start = matchResult + 1;
1492 |                 else:
1493 |                     start = matchResult;
1494 |                 continue;
1495 | 
1496 |             p = buffhelper.IsInExtractArea(matchResult.pos);
1497 |             buffhelper.AddEntity(entity, matchResult)
1498 |             start = matchResult.pos + len(matchResult.mstr);
1499 |             if len(matchResult.mstr) == 0:
1500 |                 start += 1;
1501 |             if p == matchResult.pos:
1502 |                 docu = self.__MatchResult2Doc__(matchResult);
1503 |                 docs.append(docu);
1504 |                 buffhelper.AddScan(0, matchResult.pos, start);
1505 |         return docs;
1506 | 
1507 |     def Extract(self, input, modes=None,entities=None):
1508 |         if entities is None:
1509 |             entities=self.Entities.ValidEntities;
1510 |         self.Entities.SeqBuff = BuffHelper(len(input));
1511 |         docs = [];
1512 |         succ = False;
1513 |         if modes is not None:
1514 |             for mode in modes:
1515 |                 entity = mode.Entity;
1516 |                 mdocs = self.ExtractEntity(entity, input, mode)
1517 |                 for doc in mdocs:
1518 |                     docs.append(doc);
1519 |                 succ = True;
1520 |                 break;
1521 |         if not succ:
1522 |             for entity in entities:
1523 |                 mdocs = self.ExtractEntity(entity, input)
1524 |                 for doc in mdocs:
1525 |                     docs.append(doc);
1526 |         return docs;
1527 | 
1528 | 
1529 | 
1530 | 


--------------------------------------------------------------------------------
/test/chs.txt:
--------------------------------------------------------------------------------
 1 | 中国的面积有960万平方公里
 2 | 一百安培
 3 | 硬盘的容量是80GB
 4 | 1999年12月31日
 5 | 12点25分18秒
 6 | 80公分以上
 7 | 八点三十
 8 | 一百二十
 9 | 10点24分20秒
10 | 12点三十四分十五秒
11 | buptzym@qq.com
12 | zhaoyiming@qq.com
13 | 136-0377-0086
14 | 0341-8453235
15 | 152601196705082542
16 | 43072119880818492X
17 | 270元
18 | 270万元
19 | 424194253
20 | 三万2千
21 | 7月8号
22 | 三月1日
23 | 5月20日
24 | 2013.12.13
25 | 从三月1日到5月20日
26 | 从2005年到2010年
27 | 2013.12.13-2015.7.5
28 | 2014年七月
29 | 三点15分
30 | 八点一刻
31 | 凌晨4点十五分
32 | 十二点三十七分
33 | 12:30:04
34 | 12:30
35 | 20MB
36 | 五十兆
37 | 30欧姆
38 | 40Ω
39 | 一百二十分贝
40 | 硬盘的容量是80GB
41 | 八吨的大象
42 | 游泳池有500m宽
43 | http://news.163.com/special/bra_pc/
44 | 北京市东城区沙滩后街59号13排
45 | 西安市莲湖区桃园一坊简易2号楼12号
46 | 山西省忻州市忻府区
47 | 北京市海淀区
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/test/learn.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | from src.tnpy import RegexCore
 4 | core = RegexCore('../rules/learn')
 5 | import src.tngraph as graph
 6 | graph.buildGraph(core,'int_0_99');
 7 | exit()
 8 | RegexCore.LogFile = open("learn.log", 'w')
 9 | RegexCore.LogFile.truncate()
10 | #matchs=core.Match('领导你好！老婆你好');
11 | #for m in matchs:
12 | #    print('match',m.mstr, 'pos:',m.pos)
13 | 
14 | 
15 | print(core.Rewrite('领导你好！老婆您好'));
16 | 
17 | print({r:core.Rewrite(r) for r in ['十','三十七','一十三','68']});
18 | 
19 | RegexCore.LogFile.flush()
20 | RegexCore.LogFile.close()
21 | 


--------------------------------------------------------------------------------
/test/sample.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import sys
 4 | sys.path.append("../src")
 5 | from tnpy import RegexCore
 6 | import json;
 7 | #import tngraph as graph
 8 | 
 9 | core = RegexCore('../rules/cnext')
10 | #graph.buildGraph(core,'time_fix');
11 | #exit()
12 | #RegexCore.LogFile = open("info.html", 'w')
13 | #RegexCore.LogFile.truncate()
14 | 
15 | print(core.Extract('十三分之二十四',entities=[core.Entities['fraction']]))
16 | read = open('chs.txt', 'r', encoding='utf-8')
17 | lines = [x for x in read.readlines()]
18 | 
19 | 
20 | for line in lines:
21 |         r = core.Extract(line)
22 |         js = json.dumps(r, indent=2, ensure_ascii=False);
23 |         print(js);
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | #RegexCore.LogFile.flush()
31 | #RegexCore.LogFile.close()


--------------------------------------------------------------------------------