├── .gitignore ├── .vscode ├── launch.json └── tasks.json ├── LICENSE ├── README.md ├── doc ├── advance.md └── grammer.md ├── rules ├── calparser ├── cnext ├── learn └── xmlparser ├── src ├── __init__.py ├── tngraph.py ├── tnnlp.py └── tnpy.py └── test ├── chs.txt ├── learn.py └── sample.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | #log files 7 | *.log 8 | *.html 9 | 10 | # C extensions 11 | *.so 12 | 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | 60 | # Sphinx documentation 61 | docs/_build/ 62 | 63 | # PyBuilder 64 | target/ 65 | 66 | #Ipython Notebook 67 | .ipynb_checkpoints 68 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | 5 | { 6 | "name": "Python", 7 | "type": "python", 8 | "request": "launch", 9 | "stopOnEntry": true, 10 | "program": "${file}", 11 | "pythonPath": "D:/Anaconda3/python.exe", 12 | "debugOptions": [ 13 | "WaitOnAbnormalExit", 14 | "WaitOnNormalExit", 15 | 
"RedirectOutput" 16 | ] 17 | } 18 | 19 | ] 20 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See http://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "0.1.0", 5 | "command": "echo", 6 | "isShellCommand": true, 7 | "args": ["Hello World"], 8 | "showOutput": "always" 9 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tn是desert(沙漠之鹰)和tan共同开发的一种用于匹配,转写和抽取文本的语言(DSL)。并为其开发和优化了专用的编译器。基于递归下降方法和正则表达式,能解析自然文本并转换为树和字典,识别时间,地址,数量等复杂序列模式。 2 | github地址:https://github.com/ferventdesert/tnpy 3 | 4 | 语法介绍 5 | 6 | ## 0.设计理由 7 | 8 | 字符串分析和处理几乎是每个程序员必备的工作,简单到分割类似"1,2,3,4"这样的字符串,稍微复杂一些如字符串匹配,再复杂如编译和分析SQL语法。字符串几乎具有无穷的表达能力,解决字符串问题,就解决了计算机90%的问题。 9 | 10 | 虽然字符串处理如此深入人心,但当分割字符时,本来都是按照逗号分割的,突然出现分号,程序就可能出错。再如日期处理,每个程序员肯定都对各种奇怪诡异的时间表达方式感到头疼,处理起来非常费时。这些功能,几乎只能以硬编码实现。它们是与外界交互的最底层模块,然而却如此脆弱。 11 | 12 | >* 如何将“一百二十三”转换为数字? 13 | >* 如何将“2013年12月14日”识别为时间并转换为时 14 | 间类型? 15 | >* 如何分析一个XML或JSON文件? 16 | 17 | 正则表达式虽提供了强大的匹配功能,成为必备的工具,但它有不少局限,我们扩展了正则表达式引擎,使之能力大大增强。 18 | 在线演示:http://www.desertlambda.com:81/extracttext.html 19 | 20 | ## 1. 如何学习? 
21 | 22 | 基本上程序员都读过“30分钟学会正则表达式”这篇文章吧?最后没几个人能在30分钟内就读完它。不过相信我,TN引擎只需要15分钟就可以学会。 23 | 24 | 详细的语法说明在这里: 25 | 26 | [tn基本语法][1] 27 | 28 | [使用tn构造自然语言计算器][2] 29 | 30 | [tn实现的xml解析器][3] 31 | 32 | TN可以实现文本的匹配,转写和信息抽取,可以理解为模板引擎的逆运算。简单的操作用正则表达式更方便,但不少问题是正则无法解决的。这时就需要使用TN了。 33 | 34 | TN的解释器有Python,C#和C三种版本。C#版本已经不再维护。使用C#或Java等语言的,建议使用IronPython或Jython进行跨语言编译。 35 | 36 | tnpy是tn的Python解释器,Python良好的可读性让代码写起来非常方便,代码不超过1000行,单文件,无第三方库依赖。推荐使用Python3。 37 | 38 | tn是解释型语言,需要编写规则文件,并使用tnpy加载,再对文本进行处理。 39 | 40 | ## 1. 基础的匹配和替换: 41 | 42 | 首先我们先编写一个最简单的规则文件learn,内容如下: 43 | 44 | ``` 45 | #%Order% 1 46 | hello= ("你好"); 47 | ``` 48 | 接着,执行下面的python代码: 49 | ``` 50 | from src.tnpy import RegexCore 51 | core = RegexCore('../rules/learn') 52 | matchs=core.Match('领导你好!老婆你好'); 53 | for m in matchs: 54 | print('match',m.mstr, 'pos:',m.pos) 55 | ``` 56 | 引入tnpy命名空间,之后从learn规则文件初始化引擎,匹配该文本: 57 | 58 | ``` 59 | success load tn rules:../rules/learn 60 | match 你好 pos: 2 61 | match 你好 pos: 7 62 | ``` 63 | 64 | 上面输出了文本的匹配结果和位置。当然这一点正则也能做到。 65 | 66 | 如果我们匹配的是`领导你好,老婆您好`,并想把所有的`你好`和`您好`,都转写为`hello`。 67 | 68 | 为此我们添加hello2和hello3两个子规则: 69 | 70 | ``` 71 | hello2= $(hello)| ("您好"); 72 | #%Order% 1 73 | hello3= $(hello2) : (//:/hello/); 74 | ``` 75 | 76 | `hello2`引用了刚才的`hello`规则,同时添加了`“您好”`。 77 | 78 | hello3是主规则,负责将将`hello2`匹配的内容都转写为`hello` 79 | 80 | `($代表引用一条规则,|表示将几个规则并列排列,匹配最长的那个规则,:代表转写。)` 81 | 82 | 执行下面的代码:` 83 | 84 | ``` 85 | print(core.Rewrite('领导你好!老婆您好')); 86 | ``` 87 | 88 | 结果为: 89 | ``` 90 | 领导hello!老婆hello 91 | ``` 92 | 93 | 如果我们想替换顺序,把“你好”放在前面呢?可以这样写: 94 | 95 | ``` 96 | people= ("老婆") | ("领导"); 97 | #%Order% 1 98 | reorder= $(people) $(hello3) : $2 $1; 99 | ``` 100 | 101 | 先用`people`定义如何描述`老婆,领导`,然后用reorder来修改顺序, 注意reorder是个**顺序结构**,people匹配老婆和领导,hello3匹配您好/你好,并将其转换为`hello`。 `$2和$1`修改了转写顺序,执行Rewrite后输出: 102 | 103 | ``` 104 | hello领导!hello老婆 105 | ``` 106 | 107 | 我们把类似`$(name1) $(name2)`的结构,称为顺序表达式,把`$(name1) | $(name2) `称为或表达式。 108 | 如果将刚才所有的规则绘制成图,则是下面的样子: 109 | 110 | ![foo.png-34.5kB][4] 111 | 
112 | ## 2. 正则表达式 113 | 114 | 仅仅使用文本,表现力太差了。我们引入正则表达式来完成,正则表达式需要放在(//)中,注意和文本("")的区别。 115 | 116 | 如果要进行转写,则标注为`(/match/:/rewrite/)`; 下面的表达式将所有的长空白符转换为一个空白符: 117 | 118 | ``` 119 | byte_det_space = (/ */://); 120 | ``` 121 | 122 | 下面将所有字母转换为空白: 123 | 124 | ``` 125 | low_letter_to_null = (/[a-z]/ ://); 126 | #或者下面: 127 | low_letter= (/[a-z]/); 128 | translate= $(low_letter) : (""); 129 | ``` 130 | 131 | 觉得没有挑战?我们接着看下面的。 132 | 133 | ### 3. 复杂组合:中文数字转阿拉伯数字 134 | 135 | 二十三如何转换为23?这种用普通的编程会比较困难。我们尝试用TN解决,会发现一点都不难。 136 | 先定义汉字的一二三到九转换为1-9,你肯定会写出这样的规则: 137 | 138 | ``` 139 | #定义0-9 140 | int_1 = ("一" : "1"); 141 | int_0 =("零" : "0"); 142 | int_2 = ("二" : "2") | ("两" : "2"); 143 | int_3_9 = ("三" : "3") | ("四" : "4") | ("五" : "5") | ("六" : "6") | ("七" : "7") | ("八" : "8") | ("九" : "9"); 144 | int_1_9 = $(int_1) | $(int_2) | $(int_3_9) | (/\d/); 145 | int_0_9 = $(int_0) | $(int_1_9); 146 | int_del_0 = (/零/ : /0/) | (// : /0/); 147 | int_0_9_null = $(int_del_0) | $(int_0_9); 148 | ``` 149 | 150 | 之所以要把0,1,2分开写,是因为这些数有特殊情况,如两和二都代表2,需要在后面特殊处理。 151 | 上面的`int_0_9_null`规则,就可以把`五七零二`转写为`5702`。但没法处理`二十三`这样的情况。 152 | 153 | 再定义下面的规则,这样`一十三`可以转写为`13` 154 | 155 | ``` 156 | int_del_0 = (/零/ : /0/) | (// : /0/); 157 | int_0_9_null = $(int_del_0) | $(int_0_9); 158 | #定义10,十 159 | int_1_decades = (/十/ : /1/) | (/一十/ : /1/); 160 | ``` 161 | 162 | 再加上下面的规则,int_1_9_decades定义了十位数如何转写,而int_10_99定义了从十到九十九的转写规则。 163 | 164 | ``` 165 | int_10_99 = $(int_1_9_decades) $(int_0_9_null) | (/[1-9][0-9]/) ; 166 | int_1_99 = $(int_1_9) | $(int_10_99) ; 167 | int_01_99 = $(int_1_9) | $(int_10_99) | (/\d{1,2}/); 168 | 169 | #%Order% 3 170 | int_0_99 = $(int_0) | $(int_1_9) | $(int_10_99); 171 | ``` 172 | 173 | 看看下面的例子: 174 | `print({r:core.Rewrite(r) for r in ['十','三十七','一十三','68']});` 175 | 运行结果: 176 | `{'一十三': '13', '68': '68', '十': '10', '三十七': '37'}` 177 | 是不是感到很神奇?三十七是如何被转写为37的? 
178 | 179 | 仔细看规则,规则自底向上构造成了一棵规则树,in_0_99是整棵树的根节点。结构如下图: 180 | ![foo.png-132.1kB][5] 181 | 下面的log文件给出了匹配过程: 182 | 183 | ``` 184 | int_0_99,Table,Raw =三十七 185 | int_0,String,Raw =三十七 186 | int_0,String,NG 187 | int_1_9,Table,Raw =三十七 188 | int_1,String,Raw =三十七 189 | int_1,String,NG 190 | int_2,Table,Raw =三十七 191 | int_2_merge,Regex,Raw =三十七 192 | int_2_merge,Regex,NG 193 | int_2,Table,NG 194 | int_3_9,Table,Raw =三十七 195 | int_3_9_merge,Regex,Raw =三十七 196 | int_3_9_merge,Regex,Match=三 197 | int_3_9,Table,Match=三 198 | int_1_9_3,Regex,Raw =三十七 199 | int_1_9_3,Regex,NG 200 | int_1_9,Table,Match=三 201 | int_10_99,Table,Raw =三十七 202 | int_10_99_0,Sequence,Raw =三十七 203 | int_1_9_decades,Table,Raw =三十七 204 | int_1_decades,Table,Raw =三十七 205 | int_1_decades_0,Regex,Raw =三十七 206 | int_1_decades_0,Regex,Match=十 207 | int_1_decades_1,Regex,Raw =三十七 208 | int_1_decades_1,Regex,NG 209 | int_1_decades,Table,Match=十 210 | int_1_9_decades_1,Sequence,Raw =三十七 211 | int_1_9,Table,Raw =三十七 212 | int_1_9,Table,Buff =三 213 | unknown,Regex,Raw =十七 214 | unknown,Regex,Match=十 215 | int_1_9_decades_1,Sequence,Match=三十 216 | int_1_9_decades,Table,Match=三十 217 | int_0_9_null,Table,Raw =七 218 | int_del_0,Table,Raw =七 219 | int_del_0_0,Regex,Raw =七 220 | int_del_0_0,Regex,NG 221 | int_del_0_1,Regex,Raw =七 222 | int_del_0_1,Regex,Match= 223 | int_del_0,Table,Match= 224 | int_0_9,Table,Raw =七 225 | int_0,String,Raw =七 226 | int_0,String,NG 227 | int_1_9,Table,Raw =七 228 | int_1,String,Raw =七 229 | int_1,String,NG 230 | int_2,Table,Raw =七 231 | int_2_merge,Regex,Raw =七 232 | int_2_merge,Regex,NG 233 | int_2,Table,NG 234 | int_3_9,Table,Raw =七 235 | int_3_9_merge,Regex,Raw =七 236 | int_3_9_merge,Regex,Match=七 237 | int_3_9,Table,Match=七 238 | int_1_9_3,Regex,Raw =七 239 | int_1_9_3,Regex,NG 240 | int_1_9,Table,Match=七 241 | int_0_9,Table,Match=七 242 | int_0_9_null,Table,Match=七 243 | int_10_99_0,Sequence,Match=三十七 244 | int_10_99_1,Regex,Raw =三十七 245 | int_10_99_1,Regex,NG 246 | 
int_10_99,Table,Match=三十七 247 | int_0_99,Table,Match=三十七 248 | ``` 249 | 250 | 引擎从文本的左向右,沿着规则树寻找最长的文本,如果在一个顺序表达式上的任何一步失败,那么整个顺序表达式被抛弃。或表达式会遍历每个子表达式,直到发现最长的那个,返回结果。具体的匹配原理,以及优化,会在专门的文章中介绍。 251 | 252 | ## 4. 由规则构造更复杂的规则 253 | 254 | 自然而然的,知道怎么定义三十七,就可以定义五百三十七,那不过是`int_1_9_hundreds+int_0_99`(这个已经定义过了)。 255 | 256 | ``` 257 | int_1_9_hundreds = $(int_1_9) ("百" : ""); 258 | int_100_999 = $(int_1_9_hundreds) ("" : "00") | $(int_1_9_hundreds) $(int_10_99); 259 | int_1_999 = $(int_1_99) | $(int_100_999); 260 | ``` 261 | 262 | `int_1_999`可以处理类似五百三十七这样的问题! 263 | 264 | 进而,我们可以处理几千,几万,这个延伸到万以后,就可以自然而然地衍生出亿,万亿的表达。 265 | 266 | 如何处理负数?这还不简单! 267 | 268 | ``` 269 | signed_symbol0 = ("正" : "") | ("负" : "-") | ("正负" : "±") | ("\+" : "+") | ("\-" : "-") | ("±" : "±") ; 270 | signed_symbol = $(signed_symbol0) | $(null_2_null); 271 | ``` 272 | 273 | 接下来,我们默认正整数为`integer_int`,那么,整数(包含正负)就是: 274 | 275 | `integer_signed = $(signed_symbol) $(integer_int)` 276 | 277 | ## 5. 属性提取 278 | 沿着刚才的路,我们自然而然地能定义分数,但仅仅是转写还不够,遇到三分之一,我们不仅要将其处理为1/3,还要计算出它的值,这就涉及到属性抽取。也就是把信息从文本中提取为字典。 279 | 280 | 分数,不过是`整数+分之+整数`,可以定义成下面的形式: 281 | 282 | ``` 283 | fraction_cnv_slash = ("分之" : "/"); 284 | fraction2 = ("/" : "/"); 285 | percent_transform= ("%" : "100") | ("‰" : "1000"); 286 | #%Type% DOUBLE 287 | #%Property% Denominator,,Numerator| Numerator ,, Denominator | Denominator ,, Numerator 288 | #%Order% 101 289 | fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1 290 | | $(integer_int) $(fraction2) $(integer_int) 291 | | $(pure_decimal) ("" : "/") $(percent_transform); 292 | ``` 293 | 294 | 这个有点复杂,但容我慢慢讲解。分数有三种情况,如刚才的`三分之一`,或是`1/3`,或是`30%`。分别对应上面`fraction`规则的三个子规则。仔细地看上面的规则,不难理解。 295 | 296 | 值得注意的是Property这个标签,该标签定义了如何抽取信息。也是用竖线分隔,每个名称对应下面的一个子规则,为空的直接跳过。那么“十三分之二十四”中,“十三”就对应Denominator, 而“二十四”对应Numerator。来测试一下: 297 | 298 | `print(core.Extract('十三分之二十四',entities=[core.Entities['fraction']]))` 299 | 300 | 我们用Extract函数来抽取文本,返回的是一个由字典组成的列表,entities是可选参数,我们限制只用fraction规则来匹配,获得输出: 301 | 302 | ``` 303 
| [{'Numerator': '24', '#rewrite': '24/13', '#type': 'fraction', 304 | '#match': '十三分之二十四', 'Denominator': '13', '#pos': 3}] 305 | ``` 306 | 307 | 是不是很赞? 308 | 309 | ### 6.嵌入Python脚本 310 | 311 | 有一种需求还没谈到,将所有的大写字母转换为小写字母,你可能会想定义26个字符串规则,并用或表达式来拼接起来吧?这样太费事了。我们可以直接这样: 312 | 313 | `low_to_up_letter = (/[A-Z]/) : "str.lower(mt)";` 314 | 315 | `[A-Z]`匹配了所有的大写字母,将匹配结果送到后半段的转写,内置的解释器会执行那段python代码,将其转换为小写,mt代表前面表达式的匹配串,rt代表转写串。好在`[A-Z]`不执行转写,可以认为`mt==rt`. 316 | 317 | 这是在转写过程中嵌入python的例子,还能在匹配时嵌入转写: 318 | 319 | `foo = "findsecret" : "print(mt)"`; 320 | 321 | 前面的findsecret函数负责在字符串中找到“神秘文本”,后面的转写代码打印出来,并将原始的字符返回… 322 | 323 | ## 6. 你在15分钟内读完了么? 324 | 325 | 我相信你没有,因为读懂那个匹配规则的日志文件,就需要最少五分钟,但如果你有编译原理和正则基础的话,还是能很快理解的。而从零开发这个引擎,到反复优化和完善,花了一年之久。 326 | 327 | 定义了各种数字之后,我们就能很快地定义时间,日期,电话号码,地址…而你看到的只是TN语言的冰山一角。 328 | 329 | - 它能够分析文本的模式,解析诸如ABCABC这样的序列,从而发现这是一个重复模式。 330 | 331 | - 不仅能够顺序匹配,还能逆向,甚至乱序匹配,这就能够抽取类似“学校的校训”这样的问题。 332 | 333 | - 规则可以调用自身,配合脚本,因此能够实现递归下降解析。例如30行代码实现xml解析,或20行规则实现自然语言计算器。 334 | 335 | - 规则可以嵌入脚本,甚至动态生成代码,因此,甚至在理论上,TN能够自己编译自己。 336 | 337 | - TN还能做一个简单的SQL解释器,或是中文英文的简单互相翻译的工具。 338 | 339 | 是不是已经激动地颤抖了?唯一限制你能力的就是你的想象力。本博客将会进一步发布一系列有关tn的内容,包括高级语法, 340 | tn优化等。 341 | 342 | 感兴趣的可以联系作者 343 | 344 | 345 | [1]: http://www.cnblogs.com/buptzym/p/5355827.html 346 | [2]: http://www.cnblogs.com/buptzym/p/5361121.html 347 | [3]: http://www.cnblogs.com/buptzym/p/5355920.html 348 | [4]: http://static.zybuluo.com/buptzym/ksl5ggrfcn1psmdf2f81i8wg/foo.png 349 | [5]: http://static.zybuluo.com/buptzym/itwhlmz8ua2h3jgbqdq5z48g/foo.png 350 | -------------------------------------------------------------------------------- /doc/advance.md: -------------------------------------------------------------------------------- 1 | 标签(空格分隔): 未分类 2 | 3 | --- 4 | 5 | ##高级操作 6 | ###1.脚本表达式 7 | 8 | 用双引号包含的脚本被称为脚本表达式,目前支持嵌入Python。 脚本表达式只能在顺序表达式中使用。代码可以在三个位置存在: 9 | |位置|功能|例子| 10 | |--|--|--| 11 | |匹配(match)|在字符串中匹配字符| match(m.mstr)| 12 | |转写(rewrite)|对匹配完成的串转写| str.lower(m.mstr)| 13 | 
|条件(condition)|判断转写条件是否满足|| 14 | 15 | 由于tn本身所带的匹配和转写功能一般足够使用,所以脚本在匹配和转写中只是作为补充,而条件是最需要嵌入脚本的。 16 | 17 | 由于脚本表达式用双引号表示,为了避免语法解析出现错误,因此在Python代码中需要用单引号来表示字符串。 18 | 19 | **例1** 20 | `rule= $(rule0) $(rule1) $(rule2) : $(rewrite1) $(rewrite2) "m.mstr+'haha'"` 21 | 22 | rewrite1负责转写rule0,rewrite2转写rule1, 后面的脚本表达式转写rule2: 23 | 24 | m代表rule2所匹配的结果。这个结果称为MatchResult(可参考tnpy源代码),它有如下属性: 25 | ``` 26 | ot #原始输入字符串 27 | m.mstr #匹配串 28 | m.rstr #转写串 29 | m.pos #匹配得到的位置 30 | ``` 31 | 32 | **例2** 33 | 34 | `rule= $(rule0) $(rule1) $(rule2) : "m[0].mstr+m[1].mstr+m[2].mstr";` 35 | 36 | 以上脚本,将三个规则的匹配字符串加起来返回。 37 | 38 | 转写部分只有一个规则时,该规则需要转写匹配部分里的全部内容,形参为m[0],m[1]...,就像这个例子描述的样子。 39 | 40 | 但如果转写部分有多个规则,则转写部分的规则数量必须和匹配部分的规则数量一致,一一对应: 41 | 42 | `rule= $(rule0) $(rule1) $(rule2) : "m.mstr" "m.mstr" "m.mstr";` 43 | 44 | 此时,三个脚本表达式分别承载前面的三个顺序规则。由于对应的只有一个规则,所以m等价于m[0]。 45 | 46 | **例3** 47 | 48 | `low_to_up_letter = $(low_letter) : "unicode.upper(m.mstr)";` 49 | 50 | `$(low_letter)`匹配了小写字母,后面的表达式将前面表达式匹配后的结果转换为大写,并返回。 51 | 52 | **例4** 53 | 54 | `unit_electric = $(integer_decimal) $(unit_tabl_electric) :: "abs(e(unit_kywd_electric),m[0])<33"` 55 | 56 | 这个例子稍微复杂一些,例如识别30m到底是30米还是30兆字节,就取决于文本附近有没有相应的关键字。 57 | 58 | `unit_kywd_electric`规则定义如下: 59 | 60 | `unit_kywd_electric = ("速度") | ("网速") | ("电脑") | ("导体")...;#省略一部分` 61 | 62 | 上面的脚本,`$(integer_decimal)`匹配30, `$(unit_tabl_electric)`匹配m, 63 | 64 | e函数在原始字符串中匹配`unit_kywd_electric`实体,之后判断这个实体在字符串的位置和m[0]的位置的绝对值是否小于33, 用来确定这是否是信息计量单位。 65 | 66 | 上面的表达式有些复杂,同时,当e函数匹配失败返回None,那么程序就会报错,因此可以修改为 67 | `dist('unit_kywd_electric',0)<33` 68 | 69 | dist是tnpy里内置的一个函数: 70 | 71 | ``` 72 | def dist(name, i=0): 73 | header = e(name) 74 | if header is None: 75 | return int_max; 76 | return abs(header.pos - m[i].pos) 77 | ``` 78 | 79 | tn脚本不建议(也不能够)写入多行python代码,因此为了安全和方便,可以自行定制函数来方便匹配和转写,tnpy会将这些函数嵌入到引擎当中,成为闭包函数,例如: 80 | 81 | `#%Script% extends` 82 | 83 | 这样就导入了extends.py库 84 | 85 | ###2. 
使用纯Python编写规则 86 | 前面提到,之所以为tn定义一套特别的语法,是为了方便能够跨语言实现解析。这种TN语法能够用正则表达式方便地进行词法分析和语法分析,具体细节可参考tnpy源代码。 87 | 88 | 但是,我们也可以使用纯Python来编写规则,这样有很多好处,可以内嵌其他实体类型,进一步扩展语言的功能。也能借助现成的Python编译器,及时发现未引用的规则。 89 | 90 | 规则需要先引入实体: 91 | ``` 92 | from tnpy import StringEntity as SE, RegexEntity as RE, TableEntity as TE, SequenceEntity as SQE, RepeatEntity as RPE 93 | ``` 94 | 接下来我们就能够定义不同的规则了: 95 | ``` 96 | build = SE('成立于', '建成了'); 97 | splitkw0 = RE('^|[,\.。,和\r\n]'); 98 | quotekw = TE([RE('校训'), RE('育人精神')]); 99 | quote0 = RE('"([^"]+)"', '$1'); 100 | anything = RE('.*'); 101 | ``` 102 | 得益于Python非常fancy的语法,build实际上是`("成立于":"建成了")`, quotekw则是两个正则的或表达式。 103 | 104 | 下面定义了一个顺序表达式,是不是可读性也很强呢? 105 | 106 | `quote1 = SQE([quotekw, anything, quote0, anything, splitkw0])` 107 | 108 | python版本的规则和tn规则也能相互引用,tn规则可以直接引用py规则,而py规则想要引用,则需要 109 | 110 | `quote_example= SQE([REF('quote')],[rewriterule])` 111 | 112 | 我简直深深地爱上了Python。 113 | 114 | ##3. 结合NLP和词性 115 | 116 | 原始的tnpy,为了保证代码的纯粹性,没有加入这些功能,如果我们希望匹配 117 | 118 | `**名词**确实是**形容词**` 119 | 120 | 这样的表达,难道要把所有的名词和形容词都列进去吗?这显然是不必要的。**tnnlp**模块就是解决这个问题的。tnnlp已经添加入tnpy核心库中了。 121 | 122 | 使用时也很简单: 123 | 124 | `from tnnlp import NEREntity as NE,WordEntity as WE;` 125 | 126 | 于是,”地名”建成于”时间”,就能用下面的表达式来解决: 127 | ``` 128 | time2 = SQE([NE('nt'), build, 'date_fix'], rewriteOrders=[2, 1, 0]); 129 | ``` 130 | 131 | `rewriteOrders=[2,1,0]`等价于tn规则里的```$3,$2,$1```. 
132 | 133 | 其中,`NE`代表一个实体,`nt`为地名;类似地,`n`是名词,`ad`是形容词。 NLP使用了结巴分词作为分词和词性标注的方法。 134 | 135 | ###4.使用词库 136 | 137 | 如果我们想匹配“程序员”是伟大的职业这样的表达,那么就需要把程序员或是某种工种的所有表达全部列出来。这个工作量太大了。 138 | 139 | 同样,描述“好”的形容词也有很多,都列出来也会浪费大量的时间。解决这个问题的办法,就是使用词库。 140 | 141 | tnnlp使用了哈工大标注的一份语料库: 142 | 143 | ``` 144 | Aa01A07= 者 手 匠 客 主 子 家 夫 翁 汉 员 分子 鬼 货 棍 徒 145 | Aa01A08= 每人 各人 每位 146 | Aa01A09= 该人 此人 147 | Aa01B01= 人民 民 国民 公民 平民 黎民 庶 庶民 老百姓 苍生 生灵 生人 布衣 白丁 赤子 氓 群氓 黔首 黎民百姓 庶人 百姓 全民 全员 萌 148 | Aa01B02= 群众 大众 公众 民众 万众 众生 千夫 149 | Aa01B03# 良民 顺民 150 | Aa01B04# 遗民 贱民 流民 游民 顽民 刁民 愚民 不法分子 孑遗 151 | Aa01C01= 众人 人人 人们 152 | Aa01C02= 人丛 人群 人海 人流 人潮 153 | Aa01C03= 大家 大伙儿 大家伙儿 大伙 一班人 众家 各户 154 | ``` 155 | 890KB的词库,定义了大概几十万个词,并使用树结构来将词义索引起来,例如,所有A开头的都是和人物有关的,后面的标注进一步做了分类。 156 | 157 | 因此,你可以使用下面的表达,来描述Ae06节点下的所有词汇: 158 | ``` 159 | word= WE('Ae06'); 160 | rewrite= RE('.+','$1是一种伟大的职业'); 161 | wordme= SQE([word],[rewrite]) 162 | ``` 163 | 164 | 一旦遇到`Ae06`分支下的词,就会自动将其转换为xxx是一种伟大的职业。 165 | 166 | 这也是写纯Python规则的好处,可以方便地定制类型,扩展核心引擎的功能。 167 | 168 | ###5. 
乱序匹配 169 | 170 | 以提取校训为例,校训一般来说有以下几种表达: 171 | 172 | 语句1:`北京邮电大学的校训是“厚德博学,敬业乐群”。` 173 | 174 | 语句2:`“学为人师,行为世范”是北师大启功先生提出的校训。“为学生着想”….` 175 | 176 | 如果用正则提取离校训最近的双引号的内容,可能会出错,因为前后可能还有其他双引号标注的内容,如上面的“为学生着想”。 177 | 178 | 因此,想抽取校训主要有三个特征:**校训关键字**,**双引号**和**标点符号**。 179 | ``` 180 | quote1 = SQE([quotekw, anything, quote0, anything, splitkw0], matchorders=[5, 1, 2, 1, 4]); 181 | quote2 = SQE([splitkw0,anything,quote0, anything, quotekw ], matchorders=[4, 1, 3, 1, 5]); 182 | quote = TE([quote1,quote2]) 183 | ``` 184 | (其他规则都已经在上面定义过了) 185 | 186 | quote规则描述了两种类型`quote1`和`quote2`, 对quote1来说,要匹配语句1,匹配按照优先次序`5,1,2,1,4`,先匹配quotekw,找到了`校训`两字,再找分隔符,找到了句号。此时就把整个句子夹逼到了。 187 | 188 | `校训是“厚德博学,敬业乐群”`,再匹配`quote0`,把实际的双引号中的校训提取出来。最终两个anything匹配`是`和`null`。 189 | 190 | 对quote2来说,匹配语句2,quotekw匹配了`校训`,splitkw0匹配了句首,quote0匹配了`"学为人师,行为世范"`,anything匹配了夹逼后剩下的部分。 191 | 192 | 乱序匹配本质上,是**通过定义匹配顺序,人为地通过`围栏`分割句子,将句子分割成树,然后在子节点上再进行匹配**,这就解决了顺序表达式难以解决的问题。 193 | 194 | 其实,这里依旧有问题没能解决。如果我们想匹配ABC这三个字母的全排列,如CBA,CAB... 总共有6种方法,总不至于手工编写所有的匹配模式吧?这还只是三种,数量更多之后,手工编写就变得不可行了。那如何实现高效匹配呢?**此事我依旧没有思路**。 195 | 196 | 197 | ##6.总结 198 | 本文介绍了tn的高级语法,本质上tn是可以被任意改造和扩展的。因此不应当拘泥于本身提供的文法,而是按照自己的需求自行定制。之后会介绍tn的性能优化,用于模式匹配的技巧和实现原理。 -------------------------------------------------------------------------------- /doc/grammer.md: -------------------------------------------------------------------------------- 1 | > tn是desert和tan共同开发的一种用于匹配,转写和抽取文本的语言。解释器使用Python实现,代码不超过1000行。 2 | 3 | 本文主要介绍tn的基本语法。高级内容可以参考其他篇章。使用这样的语法,是为了实现语言无关,从而方便地编写不同语言的解释器。 4 | 5 | ##基本语法 6 | 引擎可以由一组规则构成,规则也可以被其他规则所组合。首先介绍最基本的元规则 。 7 | 8 | ###1. 字符串StringEntity 9 | ```Form1: ("Matched string") 10 | Form2: ("Matched string" : "Rewritten string") 11 | ``` 12 | Form1是一种省略表达,即Rewritten==Matched 13 | 样例: 14 | ```("0" : "零") # 将 "0" 转写成 "零" 15 | ("" : " ") # 在指定的地方插入一个空格 16 | ("kg" : "kilogram") # 将 "kg" 或 "Kg" 扩展成 "kilogram" 17 | ``` 18 | ###2. 
正则表达式RegexEntity 19 | ``` 20 | Form1: (/Matched expression/) 21 | Form2: (/Matched expression/ : /Rewritten expression/) 22 | ``` 23 | 24 | 样例: 25 | ``` 26 | (/\s+/ : / /) \#将一串连续的空格与换行符合并为一个空格 27 | (/(\d+)\s?(-|~)\s?(\d+)/ : /$1 to $3/) #将 "15~20 dollars" 改写成 "15 to 20 dollars" 28 | ``` 29 | 30 | 将用Matched匹配到字符串替换成Rewritten所表示的字符串。这里的正则表达式符合Perl正则规范。Form1只能作为匹配规则而不能作为转写规则,如果Rewritten为空,则只匹配不转写。Rewritten并不是真正的正则表达式,它仅支持普通字符串与`$1, $2, ..., $99,$n` 表示Matched expression匹配到的第n个Entity。 31 | 32 | ###3. 脚本表达式 ScriptEntity 33 | 可以在文法中嵌入脚本,具体的语法规则由引擎所决定,目前可以嵌入Python。(详情可参考高级语法) 34 | 35 | ------- 36 | 其他各类表达式,都是由这三类表达式进行组合得到的。它们的并(或操作),连接和差操作,构成了以下三类复合实体。这三种操作与正则表达式的三类基本操作一致。 37 | 表达式需要被其他表达式引用时,就需要为其命名,例如: 38 | `entity= (/\s+/ : / /) ;` 39 | 这样就表达了一个名称为entity的字符串表达式。名称与c语言的变量命名规则一致。中间由=连接。最后由分号结束。 40 | 41 | 当引用其他表达式时,可以用$(RuleName)表达。 42 | 43 | 44 | -------------------------------------------------------------------------------- 45 | 46 | ###4. 或表达式 TableEntity 47 | `Form: Table_name =Entity1 | Entity2 | …` 48 | 样例: 49 | ``` 50 | digit_0_to_9 = ("0" : "nol") | ("1" : "satu") | ("2" : "dua") | ("3" : "tiga") | ("4" : "empat") | ("5" : "lima") | ("6" : "enam") | ("7" : "tujuh") | ("8" : "delapan") | ("9" : "sembilan"); #印尼语数字 0~9 的Map 表 51 | integer_int_extend = $(integer_int) | ("百" : "100") | ("千" : "1000") | ("万" : "10000") | ("亿" : "100000000"); 52 | ``` 53 | 54 | integer_int_extend规则就是由integer_int和其他四个StringEntity构成的。 55 | 或表达式中间的分隔符有两种,竖线|和斜杠/。 以竖线分割的实体是平级的,会对每一个子表达式进行匹配,找出离字符串起始位置最近且匹配到的字符串最长的那个子表达式。而以斜杠分割的实体,被看做一组(Group),一旦匹配,就不会匹配之后的表达式。可以在表达式中指定多个组合平级实体。 56 | 看下面的例子: 57 | `grouptest= (/CD/) | (/ABC/) / (/AB/) | (/ABCD/);` 58 | 该规则分成了两组,在匹配ABCD时,前一组已经匹配了ABC,因此就不会继续向后匹配到ABCD。因此该规则最终匹配的结果是ABC. 59 | 60 | ###5. 
序列表达式 SequenceEntity 61 | 序列表达式描述了表达式的连接。序列从左到右依次匹配,一旦出现不能匹配的情况,则整个序列匹配失败。注意,序列匹配的字符串必须是相邻的。 62 | 63 | ``` 64 | integer_0_to_99 = $(integer_0_to_9) | $(integer_teens) 65 | | $(integer_decades) $(del_0) 66 | | $(integer_decades) $(ins_space) $(integer_1_to_9) $(ins_space); 67 | ``` 68 | 69 | 这个表达式实际上是一个TableEntity,后两个子表达式是SequenceEntity。该表达式可以转写0~99范围内的整数。 70 | 71 | 匹配211时它首先用第一个integer_0_to_9能匹配到 '2',再用第二个integer_teens能匹配到 "11",再用第三个表达式匹配失败,再用第四个Sequence能匹配到 "21",最终选择离起点最近且匹配到的字符串最长的那一个进行转写: 72 | `211 :twenty one` 73 | 序列表达式可以完成转写和顺序调整。例如: 74 | `fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1 ` 75 | 76 | 三分之一转写为1/3,integer_int_extend可以匹配‘三’, fraction_cnv_slash可匹配 '分之' , integer_int可匹配'一'。 $3 $2 $1 对其顺序进行了重排。 77 | 78 | ###6.重复表达式RepeatEntity 79 | ``` 80 | Form1: Repetition_name = $(an_entity)+; 81 | Form2: Repetition_name = $(an_entity){m,n}; 82 | ``` 83 | 由一条需要重复的规则、要重复的次数以及结尾的分号组成。需要重复的规则有且仅有一条。所以不能写成 84 | `error_example= $(an_entity0) $(an_entity){m,n}; ` 85 | 86 | m到n次,m是≥0的整数,n是≥0的整数或-1,为-1时表示不限制重复次数。 87 | 88 | 这与正则表达式的规则基本一致。 89 | 90 | 91 | ###7.差集表达式DiffEntity 92 | ``` 93 | Form1: Difference_name = $(Universe) - $(complement); 94 | Form2: Difference_name = $(Universe) - $(complement1) - $(complement2) - …; 95 | ``` 96 | 由一组Complement以及结尾的分号组成。有且仅有一个Universe,后面用减号可以跟多个表达式。 97 | 当Universe表达式能匹配且其他complement不能匹配时成立。例如: 98 | ``` 99 | integer_1_to_9 = $(integer_0_to_9) - ("0" : "nol"); # 整数1~9 100 | integer_2_to_999 = $(integer_0_to_999) - $(digit_1) - $(digit_0); # 整数2~999 101 | ``` 102 | 103 | -------------------------------------------------------------------------------- 104 | 105 | ###8. 
元标签 106 | 可以为表达式增加标签,控制表达式的属性和功能。也可以引入规则等。 107 | 108 | ####文件级元标签: 109 | 文件级元标签,不需要贴在任何规则之上。 110 | `#%Include% Rules/cnext` 111 | 增加一个名称为cnext的外置文件。本文件中的规则即可引用该文件中的规则。支持双向引用。 112 | 113 | `#%Script% extends` 114 | 增加一个名称为extends.py的外置Python脚本。该标签适合在嵌入Python代码时使用。嵌入的代码可以执行外置脚本中定义的函数。引擎会在内部执行import(extends)函数。因此extends.py需要放置在规则文件同一级目录中。 115 | 116 | ####规则级元标签: 117 | 规则级元标签需要放在规则文本行之上,如: 118 | ``` 119 | #%Type% INT 120 | #%Order% 180 121 | int_0_4= $(int_0) | $(int_1) | $(int_2) | ("三" : "3") | ("四" : "4") ; 122 | ``` 123 | 上面的两个标签意思分别为: 124 | 将int_0_4的类型标记为INT 125 | 将int_0_4的匹配优先级定义为180. 数字越大,优先级越低。 126 | 127 | 不是所有的规则都是有效规则,有些规则只是被其他规则引用。只有加上#%Order%标签的才是有效规则。规则可以手动编写优先级。也可以省略之后的数字,引擎会自动根据引用结构来制定优先级,被引用层级越高的优先级越高。 128 | 129 | `#%Parameter% `为规则赋值 130 | 这部分取决于引擎的设计,将在《高级话题》中描述。 131 | 132 | ####属性级元标签 133 | 在信息抽取时,属性元标签非常重要,它指定了引擎如何将文本转换为字典。 134 | 135 | **案例1**: 136 | ``` 137 | #%Property% Denominator,,Numerator| Numerator ,, Denominator | Denominator ,, Numerator 138 | fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1 | 139 | 140 | $(integer_int) $(fraction2) $(integer_int) | 141 | 142 | $(pure_decimal) ("" : "/") $(percent_transform); 143 | ``` 144 | 属性标签为fraction的每一个引用实体增加了属性。 按照 '|' 分组,Denominator赋给integer_int_extend, Numerator赋给integer_int. 
分别代表分子和分母。 145 | 146 | **案例2**: 147 | 148 | 当抽取类似JSON或XML的文本时,抽取的字典需要以键值对的形式标注,如下例子: 149 | ``` 150 | #%Property% ,$key,,$value 151 | properties =$(space) $(name) $(equal) $(property) $(space); 152 | ``` 153 | 则在抽取时,会以name为键,property为value, 插入抽取的字典中。 154 | 155 | 156 | -------------------------------------------------------------------------------- 157 | 158 | ##9.注意事项 159 | 160 | ###注释 161 | 除了符合元标签格式的文本,以 # 开始的一行内容被认为是注释行被忽略。暂不支持在一行内容的中间或后面加注释,也不支持在某一规则的多行内容的中间插入一行注释。 162 | 163 | ###换行 164 | 当Rule内容特别长时可以直接换行,中间插入的换行符/空格/制表符会被忽略,但不支持在中间插入注释行。 165 | 166 | ###结束符 167 | 所有Rules都要以分号结束。 168 | 169 | ###交叉引用 170 | 规则可以支持交叉引用,甚至可以引用自身,但被引用的表达式需要存在,否则会引发错误。引用时,需要保证文法不是左递归的,否则将会陷入死循环。 171 | 172 | ###编码 173 | 由于文本处理引擎经常处理多国语言,因而要求使用UTF-8编码(no BOM)。 -------------------------------------------------------------------------------- /rules/calparser: -------------------------------------------------------------------------------- 1 | #计算引擎 2 | #尝试解决 三点五乘以八点三的功能 3 | 4 | #%Include% Rules/cnext 5 | add = (/加上?|\+|+/ : /+/) ; 6 | sub = (/减去?|\-|-/ : /-/); 7 | mul = (/乘以?|\*|×/ : /*/); 8 | div = (/除以?|/|÷/ : ///); 9 | pow2 = (/的?平方/ : /**2/); 10 | pow3 = (/的?立方/ : /**3/); 11 | pown= (/的?/ : //) $(digit) (/次方/ : /**/) : $3 $2 $1; 12 | 13 | 14 | result= (/的?结果/); 15 | addresult0= (/的?和/); 16 | subresult0= (/的?差/); 17 | addresult = $(result) $(addresult0); 18 | subresult = $(result) $(subresult0); 19 | addsub0= $(add) | $(sub) ; 20 | logic0 =$(or) | $(and) ; 21 | divpow0 = $(mul) | $(div); 22 | equalcheck = $(bigger) |$(less) | $(noequal); 23 | 24 | operator= $(addsub0) | $(equalcheck) | $(logic0); 25 | 26 | divpow = $(digit) $(divpow0) $(digit) ; 27 | powx= $(pow2) | $(pow3) | $(pown); 28 | pow = $(digit) $(powx); 29 | 30 | #functions 31 | print = (/打印/ : /print/); 32 | send = (/发送/ : /send/); 33 | functions = $(print) | $(send); 34 | function = $(functions) $(noterminator) : "invoke(m[0].rstr,m[1].rstr)"; 35 | 36 | 37 | addsub= $(not) $(noterminator) 38 | | $(noterminator) $(operator) 
$(noterminator); 39 | 40 | terminator = $(digit) | $(ifelse) | $(pow) | $(divpow) | $(function); 41 | 42 | 43 | 44 | #暂时无法分析 3加5的和乘以3,因为会造成循环递归,从左向右推导不可行 45 | # | $(noterminator) $(add) $(noterminator) $(addresult) 46 | # | $(noterminator) $(sub) $(noterminator) $(subresult); 47 | 48 | #%Order% 28 49 | noterminator = $(terminator) : "eval(m.rstr)" | $(addsub) : "eval(m.rstr)"; 50 | 51 | 52 | or = (/或/ : / or /); 53 | and = (/且/ : / and /); 54 | not = (/不是/ : / not /); 55 | equal = (/等于|=/ : /=/); 56 | bigger = (/大于|>/ : />/); 57 | less = (/小于| :/); 58 | noequal = (/不等于/: /!=/); 59 | 60 | ifelse = (/如果/) $(noterminator) (/,那么/) $(noterminator) (/,否则/) $(noterminator) : "check(m[1].rstr,m[3].rstr,m[5].rstr)"; -------------------------------------------------------------------------------- /rules/cnext: -------------------------------------------------------------------------------- 1 | #中文语言规则系统 2 | #赵一鸣 3 | #转写不是主要任务,主要是识别类型,并尽可能地转换为标准形式 4 | #可以通过拓展规则,找出字符串中的信息树 5 | #例如: 6 | #1. ¥230元 转写为 230 ,同时确认其为价格 7 | #2. 
二零一四年八月 时间 2014/08 Year:2014 Month:08 8 | 9 | #不同的数据类型,可以通过额外的XML配置文件,定义其处理策略。 10 | #例如: 当五分之三识别为分数之后,可以送入分数处理模块,得到其0.6的数值 11 | 12 | 13 | ##%Script% extends 14 | 15 | #================# 16 | # 基础规则转写 # 17 | #================# 18 | 19 | byte_anything2null = (/.*/ : //); 20 | byte_anything2space = (/.*/ : / /); 21 | byte_ins_space = ("" : " "); 22 | null_2_null = ("" : ""); 23 | byte_det_space = (/ */://); 24 | byte_det_pot = (/ /:/ /); 25 | byte_det_one_space = (/ ?/://); 26 | 27 | byte_cnv_to = (/[ \t]*[-~][ \t]*/ : / to /); 28 | #中文字符 29 | chs=(/[\u4e00-\u9fbb]/); 30 | chs_multi = (/[\u4e00-\u9fbb]+/); 31 | Punctuations = (/[,.?!\(\)\[\]]/); 32 | head_space = (/\b/) : $1; 33 | back_space = (/\b|$/) : $1; 34 | #将全角符号转换为半角符号 35 | 36 | low_letter = (/[a-z]/); 37 | 38 | up_letter = (/[A-Z]/); 39 | ###%Order% 274 40 | low_to_up_letter = $(low_letter) : "unicode.upper(m.mstr)"; 41 | en_letter = $(low_letter) | $(up_letter); 42 | 43 | en_letters = $(en_letter)+; 44 | 45 | Ordinal_s_no_replace =("." 
: ".") | ("th" : "th"); 46 | 47 | 48 | 49 | int_1 = ("一" : "1"); 50 | int_0 =("零" : "0"); 51 | int_del_0 = (/零/ : /0/) | (// : /0/); 52 | int_0_null = $(int_0) ; 53 | #%Type% INT 54 | int_2 = ("二" : "2") | ("两" : "2"); 55 | 56 | int_2_5 = $(int_2) | ("三" : "3") | ("四" : "4") | ("五" : "5"); 57 | #%Type% INT 58 | int_0_4= $(int_0) | $(int_1) | $(int_2) | ("三" : "3") | ("四" : "4") ; 59 | #%Type% INT 60 | int_2_9 = $(int_2_5) | ("六" : "6") | ("七" : "7") | ("八" : "8") | ("九" : "9"); 61 | #%Type% INT 62 | int_1_9 = $(int_1) | $(int_2_9) | (/[1-9]/); 63 | #%Type% INT 64 | int_0_9 = $(int_0) | $(int_1_9)| (/[0-9]/); 65 | 66 | #可有可无的0-9,用于诸如四十这样的表达 67 | #%Type% INT 68 | int_0_9_null = $(int_del_0) | $(int_1_9) | $(int_0); 69 | int_rep = $(int_0_9)+; 70 | int_rep0 = $(int_0)+; 71 | int_rep0_null = ("" : "") | $(int_rep0); 72 | 73 | signed_symbol0 = ("正" : "") | ("负" : "-") | ("正负" : "±") | ("\+" : "+") | ("\-" : "-") | ("±" : "±") ; 74 | signed_symbol = $(signed_symbol0) | $(null_2_null); 75 | #%Type% INT 76 | int_1_decades = (/十/ : /1/) | (/一十/ : /1/); 77 | #%Type% INT 78 | int_0_10 = $(int_0) | $(int_1_9)| $(int_1_decades) | (/[0-9]|(10)/); 79 | #%Type% INT 80 | int_00_10 = $(int_0) $(int_1_9)| $(int_0_10); 81 | #%Type% INT 82 | int_1_2_decades = $(int_1_decades) | (/二/ : /2/) (/十/ : //); 83 | #%Type% INT 84 | int_1_5_decades = $(int_1_decades) | $(int_2_5) (/十/ : //); 85 | #%Type% INT 86 | int_1_9_decades = $(int_1_decades) | $(int_1_9) (/十/ : //); 87 | #%Type% INT 88 | int_10_99 = $(int_1_9_decades) $(int_0_9_null) | (/[1-9][0-9]/) ; 89 | #%Type% INT 90 | int_10_59 = $(int_1_5_decades) $(int_0_9_null) | (/[1-5][0-9]/) ; 91 | #%Type% INT 92 | int_1_99 = $(int_1_9) | $(int_10_99) ; 93 | #%Type% INT 94 | int_01_99 = $(int_1_9) | $(int_10_99) | (/\d{1,2}/); 95 | #%Type% INT 96 | int_0_99 = $(int_0) | $(int_01_99) | (/\d{1,2}/) ; 97 | #为了适应24进制 98 | #%Type% INT 99 | int_0_23 = $(int_0) 100 | | ("" : "0") $(int_1_9) 101 | | $(int_1_decades) $(int_1_9) 102 | | (/二/ : /2/) | 
$(int_0_4); 103 | #为了适应60进制 104 | #%Type% INT 105 | int_0_60 = $(int_0) 106 | | $(int_10_59) 107 | | ("" : "0") $(int_1_9) 108 | | (/([0-5][0-9])|(60)/); 109 | int_1_9_hundreds = $(int_1_9) ("百" : ""); 110 | int_000_099 = $(int_del_0) $(int_del_0) $(int_0_9) | $(int_del_0) $(int_10_99) | $(int_del_0) $(int_del_0) $(int_del_0); 111 | int_100_999 = $(int_1_9_hundreds) ("" : "00") | $(int_1_9_hundreds) $(int_10_99); 112 | int_1_999 = $(int_1_99) | $(int_100_999); 113 | #%Type% INT 114 | int_0_999 = $(int_0) | $(int_1_999); 115 | 116 | int_1_9_thousands = $(int_1_9) ("千" : ""); 117 | 118 | int_1000_9999 = $(int_1_9_thousands) $(int_000_099) | $(int_1_9_thousands) $(int_100_999); 119 | #%Type% INT 120 | int_1_9999 = $(int_1000_9999) | $(int_1_999) | (/\d{1,4}/); 121 | #%Type% INT 122 | int_0_9999 = $(int_1000_9999) | $(int_0_999) | (/\d{1,4}/); 123 | 124 | int_wan = ("万" : ""); 125 | #%Type% INT 126 | int_5_8bit = $(int_1_9999) $(int_wan) $(int_0_9999) | $(int_1_9999) $(int_wan) (// : /0000/) ; 127 | 128 | int_ins_yi = (/个?亿/ : //); 129 | #%Type% INT 130 | int_9_12bit= $(int_1_9999) $(int_ins_yi) $(int_5_8bit) | $(int_1_9999) $(int_ins_yi) (// : /00000000/); 131 | #%Type% INT 132 | pure_int = (/\d{1,}/); 133 | 134 | #%Type% INT 135 | integer_int = $(int_0) 136 | | $(int_1_9999) 137 | | $(int_5_8bit) 138 | | $(int_9_12bit) 139 | | $(pure_int); 140 | #%Type% INT 141 | #%Order% 140 142 | integer_signed = $(signed_symbol) $(integer_int) 143 | ; 144 | 145 | 146 | 147 | #================# 148 | # decimals # 149 | #================# 150 | 151 | 152 | pure_decimal= (/0\.\d+/) 153 | | (/(?:\d+,?)*\d+\.\d+/); 154 | decimal = $(pure_decimal) 155 | | $(integer_int) ("点" : ".") $(int_rep) 156 | ; 157 | 158 | pure_digit = $(pure_decimal) | $(pure_int); 159 | #%Type% DOUBLE 160 | #%Order% 100 161 | decimal_signed = $(signed_symbol) $(decimal) | $(decimal); 162 | 163 | 164 | 165 | 166 | 167 | #================# 168 | # fractions # 169 | integer_int_extend = $(integer_int) | ("百" : 
"100") | ("千" : "1000") | ("万" : "10000") | ("亿" : "100000000"); 170 | #================# 171 | fraction_cnv_slash = ("分之" : "/"); 172 | fraction2 = ("/" : "/"); 173 | percent_transform= ("%" : "100") | ("‰" : "1000"); 174 | #%Type% DOUBLE 175 | #%Property% Denominator,,Numerator| Numerator ,, Denominator | Denominator ,, Numerator 176 | #%Order% 101 177 | fraction = $(integer_int_extend) $(fraction_cnv_slash) $(integer_int) : $3 $2 $1 178 | | $(integer_int) $(fraction2) $(integer_int) 179 | | $(pure_decimal) ("" : "/") $(percent_transform); 180 | 181 | fraction_signed = $(signed_symbol) $(fraction) | $(fraction); 182 | 183 | integer_decimal = $(integer_int) | $(decimal); 184 | digit = $(integer_int) 185 | | $(decimal) 186 | | $(fraction); 187 | 188 | digit_signed = $(signed_symbol) $(digit); 189 | 190 | range_keywords = (/到|至|\-|~/ : /~/); 191 | 192 | range_larger = ("大于") | (">"); 193 | range_less = ("小于") | ("<"); 194 | range_larger_result= (/.+/:/>/); 195 | range_less_result= (/.+/ : /); 196 | range_back = (/以上/ : />/) | (/以下/ : /) ; 197 | 198 | range2= $(range_larger): $(range_larger_result) | $(range_less) : $(range_less_result); 199 | 200 | 201 | 202 | 203 | 204 | #================# 205 | # time # 206 | #================# 207 | 208 | date_from= ("从" : "") | (// : //); 209 | 210 | time_kywd_nrml = (/\b(AM|PM|时|秒|凌晨|早晨|早上|上午|中午|下午|傍晚|晚上|深夜|午夜|时间|时区|时差|时钟|闹钟|闹铃|手表|开始|开会|开幕|截至|截止|为止|结束|闭幕|加班到|加班至|首班车|末班车|发车|时刻表|航班|准点|晚点|到点|现在是)\b/); 211 | time_special = ("半" : "30") | ("一刻" : "15") | ("三刻" : "45"); 212 | time_cnv_dian = ("点" : ":") | (":" :":"); 213 | 214 | 215 | time_fen = ("分" : ":") | ("分钟" : ":") | ("min" : ":") | ("Min" : ":") | (":" : ":"); 216 | #下面代表3点05 这种省略分钟的说法 217 | time_fen2 = $(time_fen) | ("" : ""); 218 | 219 | 220 | time_mm_r = (/\b(?:00?)\b/ : //) | $(int_0_60) (/:?/ : /分/); 221 | time_miao = ("秒" : "") | $(null_2_null) ; 222 | 223 | 224 | 225 | #%Property% Hour,,Minute | Hour,,Minute | Hour,,Minute | Hour,,Minute,,Second | Hour 226 | #%Order% 
30 227 | time_fix = $(int_0_99) $(time_cnv_dian) $(int_00_10) $(time_fen2) 228 | | $(int_0_99) $(time_cnv_dian) $(int_10_59) $(time_fen2) 229 | | $(int_0_99) $(time_cnv_dian) $(time_special) 230 | | $(int_0_99) $(time_cnv_dian) $(int_0_60) $(time_fen) $(int_0_60) $(time_miao) 231 | | $(int_0_99) $(time_cnv_dian) (// : /00/) 232 | ; 233 | 234 | time_span = $(int_0_60) $(time_fen) $(int_0_60) $(time_miao) ("" : "00:") : $5 $1 $2 $3 $4 235 | | $(int_0_60) $(time_fen2) ("" : "00:") ("" : ":00") : $3 $1 $2 $4 236 | | $(int_0_60) ("秒" : "") ("" : "00:") ("" : ":00") : $3 $1 $2 $4; 237 | 238 | 239 | 240 | #%Property% ,Start,,End 241 | #%Order% 28 242 | time_range = $(date_from) $(time_fix) $(range_keywords) $(time_fix); 243 | 244 | #================# 245 | # date # 246 | #================# 247 | 248 | date_ri = ("日" : "") | ("号" : ""); 249 | date_ri_null = $(date_ri) $(null_2_null); 250 | date_DD = $(int_01_99) $(date_ri); 251 | date_DD_null= $(date_DD) | $(int_01_99); 252 | date_cnv_yue = (/[-/.]/ : ///); 253 | date_yue = ("月": ""); 254 | date_yue2 = ("月": "/"); 255 | date_yue3 = $(date_yue2) | $(date_cnv_yue); 256 | 257 | date_cnv_nian = (/[-/.]/ : ///); 258 | date_nian = ("年" : ""); 259 | date_nian2 = ("年" : "/"); 260 | date_nian3 = $(date_nian2) | $(date_cnv_nian); 261 | 262 | date_YYYY = (/[12]\d\d\d/) : $(int_0_9999); 263 | date_YYYY_Restrict = (/\b(?:19\d\d|20\d\d)\b/) : $(int_1000_9999); 264 | 265 | date_keywords = ("AD") | ("BC") | ("年") | ("月") | ("前年") | ("去年") | ("昨年") | ("今年") | ("明年") | ("后年") | ("上月") | ("本月") | ("下月") | ("个月") | ("昨日") | ("今日") | ("明日") | ("前天") | ("昨天") | ("今天") | ("明天") | ("后天") | ("日期") | ("日子") | ("星期") | ("周一") | ("周二") | ("周三") | ("周四") | ("周五") | ("周六") | ("周日") | ("周末") | ("节假日") | ("工作日") | ("纪念日") | ("公元") | ("年度") | ("财年") | ("季度") | ("赛季") | ("节日") | ("生日") | ("假日") | ("元旦") | ("情人节") | ("妇女节") | ("愚人节") | ("植树节") | ("消费者权益日") | ("劳动节") | ("青年节") | ("儿童节") | ("建军节") | ("教师节") | ("国庆") | ("圣诞") | ("春节") | ("元宵") | ("清明") | 
# Chinese small-magnitude prefixes, rewritten to the same symbols that
# unit_amount_mini accepts: milli = m, centi = c, deci = d, micro = μ, nano = n.
# Previously micro was rewritten to "m" (colliding with milli), deci was
# rewritten to "c", and centi was missing altogether.
unit_amount_mini_chs= ("毫" : "m") | ("厘" : "c") | ("分" : "d") | ("微" : "μ") | ("纳" : "n") ;
$(unit_amount_large1_chs); 313 | unit_amount_large_eng = $(unit_amount_large1) | $(unit_amount_large2); 314 | 315 | unit_amount_chs4 = $(unit_amount_large1_chs) | $(unit_amount_large2_chs); 316 | unit_amount_eng_all = $(unit_amount_mini) | $(unit_amount_large_eng) | $(unit_amount_large2_trans) ; 317 | unit_amount_chs_all = $(unit_amount_mini_chs) | $(unit_amount_large1_chs) | $(unit_amount_large2_chs) ; 318 | #专用于表现存储量的,在信息系统中,这种表达很常见 319 | unit_memory = $(unit_amount_large_eng) | $(unit_amount_large2_chs); 320 | #平方,立方 321 | unit_keyword_pow = ("平方" : ""); 322 | unit_keyword_cubic = ("立方": ""); 323 | #----------# 324 | # Length # 325 | #----------# 326 | 327 | unit_length_eng= ("m" : "m") | ("M" : "m"); 328 | 329 | unit_length_chs= ("米" : "m") | ("公分" : "cm"); 330 | unit_tabl_length_1 = $(unit_amount_mini) $(unit_length_eng) 331 | | $(unit_amount_mini_chs) $(unit_length_eng) 332 | | $(unit_amount_large1_chs) $(unit_length_chs) 333 | | $(unit_length_eng) 334 | | $(unit_length_chs); 335 | 336 | unit_tabl_mile = ("英里" : "mi") | ("海里" : "nmi") | ("公里" : "Km"); 337 | unit_tabl_length = $(unit_tabl_length_1) | $(unit_tabl_mile); 338 | unit_kywd_length =(/长|宽|高|厚|深|里程|距离|海拔|速度|尺寸|幅|米|寸|尺|码/); 339 | unit_kywd_mile = (/海里|船|舰|海|空/); 340 | 341 | #%Property% Value,Unit 342 | #%Order% 27 343 | unit_length = $(digit) $(unit_tabl_length) 344 | ; 345 | 346 | #----------# 347 | # Area # 348 | #----------# 349 | unit_tabl_area_0 = ("m2" : "m2"); 350 | 351 | unit_area_speacial= ("亩" : "acre") | ("公顷" : "ha") | ("平方公里" : "Km2"); 352 | unit_tabl_area = $(unit_keyword_pow) $(unit_amount_chs3) ("米" : "m2") 353 | | $(unit_amount_large_eng) | $(unit_tabl_area_0) 354 | | $(unit_area_speacial); 355 | 356 | 357 | unit_kywd_area = ("面积") | ("土地") | ("英亩") | ("公顷"); 358 | #%Property% Value,Unit 359 | #%Order% 28 360 | unit_area = $(digit) $(unit_tabl_area) 361 | ; 362 | 363 | 364 | 365 | 366 | #----------# 367 | # Volume # 368 | #----------# 369 | unit_tabl_volume_0 = ("m3" : "m3"); 370 | 
# Volume unit table. The first alternative is the cubic keyword followed by an
# optional-magnitude prefix and the metre character; it must be rewritten to
# "m3" (it used to emit "m2", copied from the area table).
unit_tabl_volum = $(unit_keyword_cubic) $(unit_amount_chs3) ("米" : "m3")
| $(unit_amount_large_eng) | $(unit_tabl_volume_0)
| $(unit_volume_specal);
# Chinese data-size units. Kept consistent with unit_telecom_eng, where
# "bit" is rewritten to "b" and "Byte" to "B": the word for bit must not be
# rewritten to the byte symbol.
unit_telecom_chs = ("比特" : "b") | ("字节" : "B");
# Pressure / force / temperature units. These two rules used to reference the
# signal tables (unit_signal_eng, unit_signal_chs, unit_signal_unit), which left
# unit_dynamic_eng and unit_dynamic_chs unused and made unit_dynamic an exact
# duplicate of unit_signal. They now use the dynamic tables.
unit_dynamic_unit = $(unit_amount_large_eng) $(unit_dynamic_eng)
| $(unit_amount_chs4) $(unit_dynamic_chs)
| $(unit_dynamic_chs)
| $(unit_dynamic_eng);


#%Property% Value,Unit
#%Order% 34
unit_dynamic = $(integer_decimal) $(unit_dynamic_unit);
| 534 | #unit_temperature = $(unit_digit_temperature) ("°" : "度") : : "CMP( DIST(A,33,$(unit_kywd_temperature)) ≤ 33 )" 535 | # | $(unit_digit_temperature) $(byte_cnv_to) $(unit_digit_temperature) ("°" : "度") : : "CMP( #DIST(A,33,$(unit_kywd_temperature)) ≤ 33 )" 536 | # | $(unit_digit_temperature) $(unit_tabl_temperature) 537 | # | $(unit_digit_temperature) $(byte_cnv_to) $(unit_digit_temperature) $(unit_tabl_temperature) 538 | #; 539 | 540 | 541 | all_unit =$(unit_area) | $(unit_currency) | $(unit_dynamic) | $(unit_electric) | $(unit_length); 542 | 543 | all_unit_value = $(all_unit) | $(digit_signed); 544 | 545 | 546 | 547 | #%Property% Value,Direction | From,,To | Direction,Value 548 | #%Order% 102 549 | digit_range = $(all_unit_value) $(range_back) : $2 $1 550 | | $(digit_signed) $(range_keywords) $(all_unit_value) 551 | | $(range2) $(all_unit_value); 552 | 553 | #%Type% URL 554 | #%Order% 123 555 | url = (/(http|https|ftp)://[!-~]+/) 556 | | (/(www|www2)\.[!-~]+/) 557 | | (/[!-~]+\.(com|net|edu|gov|org|mil|int|cc|cn|de|eu|fr|jp|hk|kp|ru|tw|uk|us|htm|html|jsp|asp|php)(/[!-~]+)?/); 558 | 559 | 560 | 561 | #================# 562 | # address # 563 | #================# 564 | 565 | addr_province = ("省") | ("自治区") | ("特别行政区") ; 566 | #市 作为独立存在; 567 | addr_city = ("市"); 568 | addr_domain = ("县")| ("区"); 569 | addr_country = ("乡") | ("镇"); 570 | 571 | #村 作为独立存在; 572 | 573 | #因为地址没有特别的分割标点符号,所以用纯tn的做法,是一直向前推进,直到发现省或者市这样的关键词。 574 | #更好的做法,应该是分词模型吧 575 | except_keywords = $(addr_province) | $(addr_city) | $(addr_domain) | $(addr_country); 576 | addr_except_keywords= $(chs) - $(except_keywords); 577 | addr_part_name = $(addr_except_keywords){2,-1}; 578 | 579 | addr_kywd_nrml = ("地址") | ("小区") | ("大厦") | ("物业") | ("业主") | ("住在") | ("家住") | ("家在") | ("家是") | ("宾馆") | ("酒店") | ("宿舍") | ("公寓") | ("房间") | ("房号") | ("路") | ("楼") | ("座") | ("栋") | ("幢") | ("层") | ("室"); 580 | 581 | 582 | 583 | 584 | #%Type% 地址 585 | #%Property% Province,,City,,Domain 586 | #%##Order% 15 587 | 
address = $(addr_part_name) $(addr_province) $(addr_part_name) $(addr_city) $(addr_part_name) $(addr_domain); 588 | 589 | 590 | #================# 591 | # telecom # 592 | #================# 593 | 594 | telecom_kywd_nrml =("动感地带") | ("TEL") | ("℡") | ("FAX") | ("电话") | ("座机") | ("手机") | ("传真") | ("详回") | ("回") | ("回复") | ("询") | ("详询") | ("致电") | ("回电") | ("电询") | ("编辑") | ("编辑短信") | ("发送") | ("发短信") |("发送短信") | ("总机") | ("分机") | ("热线") | ("专线") | ("拨打") | ("拨") | ("加拨") | ("转接") | ("客服") | ("火警") | ("报警") | ("市话") | ("长话") | ("国内长途") | ("国际长途") | ("漫游") | ("查号台") | ("移动") | ("联通") | ("电信"); 595 | #---------------------------# 596 | # Mobile Phone Number # 597 | # 1Yxxxxxxxxx Y=[3-9] # 598 | # 1Yx xxxx xxxx Y=[3-9] # 599 | # 1Yxx xxx xxxx Y=[3-9] # 600 | # 1Yxx xxxx xxx Y=[3-9] # 601 | #---------------------------# 602 | telecom_dash = (/ ?- ?| / : //); 603 | #%Type% 移动电话 604 | telecom_mobile_no = (/1[3-9]\d/) (/\d{4}/) (/\d{4}\b/) 605 | | (/1[3-9]\d\b/) $(telecom_dash) (/\b\d{4}\b/) $(telecom_dash) (/\b\d{4}\b/) 606 | | (/1[3-9]\d{2}\b/) $(telecom_dash) (/\b\d{3}\b/) $(telecom_dash) (/\b\d{4}\b/) 607 | | (/1[3-9]\d{2}\b/) $(telecom_dash) (/\b\d{4}\b/) $(telecom_dash) (/\b\d{3}\b/) 608 | ; 609 | #固定电话号码 0350-22222222 [7-9位] 610 | #%Type% 固定电话 611 | #%Property% AreaCode,,PhoneCode | AreaCode,,PhoneCode| PhoneCode 612 | telecom_phone = (/\d{3,4}/) (/ /: / /) (/\d{7,9}/) 613 | | (/\d{3,4}/) $(telecom_dash) (/\d{7,9}/) 614 | | (/\d{7,9}/); 615 | #%Type% 电话号码转换 616 | #%Order% 35 617 | telecom= $(head_space) $(telecom_phone) | $(head_space) $(telecom_mobile_no); 618 | 619 | 620 | 621 | 622 | #%Type% IP地址 623 | #%Order% 45 624 | ip_part= (/((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?)/); 625 | 626 | 627 | 628 | #---------------------------# 629 | # Vehicle Related # 630 | #---------------------------# 631 | 632 | num_kywd_bus = ("乘坐") | ("换乘") | ("车站") | ("车牌") | ("站台") | ("公共交通") | ("公交") | ("公车") | ("巴士") | ("路车") | ("号线") | ("地铁") | ("轻轨") | ("电车") | 
("交通") | ("路况") | ("堵车") | ("红灯") | ("绿灯") | ("一卡通") | ("公交卡") | ("无人售票") | ("站点") | ("站牌") | ("到站") | ("下车") | ("上车") | ("下一站") | ("起点站") | ("终点站") | ("首班车") | ("末班车"); 633 | 634 | num_bus_dflt = (/\b\d{1,4}\b/) ("路") ; 635 | -------------------------------------------------------------------------------- /rules/learn: -------------------------------------------------------------------------------- 1 | #这是一个基本的入门教程 2 | 3 | #1.将所有的“你好”匹配出来 4 | 5 | hello= ("你好"); 6 | 7 | hello2= $(hello)| ("您好"); 8 | 9 | 10 | 11 | 12 | hello3= $(hello2) : (// : /hello/); 13 | 14 | people= ("老婆") | ("领导"); 15 | 16 | #%Order% 1 17 | reorder= $(people) $(hello3) : $2 $1; 18 | 19 | #定义0-9 20 | int_1 = ("一" : "1"); 21 | int_0 =("零" : "0"); 22 | int_2 = ("二" : "2") | ("两" : "2"); 23 | int_3_9 = ("三" : "3") | ("四" : "4") | ("五" : "5") | ("六" : "6") | ("七" : "7") | ("八" : "8") | ("九" : "9"); 24 | int_1_9 = $(int_1) | $(int_2) | $(int_3_9) | (/\d/); 25 | int_0_9 = $(int_0) | $(int_1_9); 26 | int_del_0 = (/零/ : /0/) | (// : /0/); 27 | int_0_9_null = $(int_del_0) | $(int_0_9); 28 | #定义10,十 29 | int_1_decades = (/十/ : /1/) | (/一十/ : /1/); 30 | 31 | #定义二十=>20, 32 | int_1_9_decades = $(int_1_decades) | $(int_1_9) (/十/ : //); 33 | int_10_99 = $(int_1_9_decades) $(int_0_9_null) | (/[1-9][0-9]/) ; 34 | int_1_99 = $(int_1_9) | $(int_10_99) ; 35 | int_01_99 = $(int_1_9) | $(int_10_99) | (/\d{1,2}/); 36 | 37 | #%Order% 3 38 | int_0_99 = $(int_0) | $(int_1_9) | $(int_10_99); -------------------------------------------------------------------------------- /rules/xmlparser: -------------------------------------------------------------------------------- 1 | #this TN-XML parser, let's take it 2 | 3 | 4 | equal = (/\s?=\s?/); 5 | equal0 = (/=/); 6 | end= (/\s?;/); 7 | shortspace =(/ /); 8 | shortspace0 = (/\s?/); 9 | space= (/[\s\r\n]*/); 10 | values= (/[#\sA-Za-z0-9;\*,_:%/\.\+\-\(\)]+/); 11 | #下面这个为注释使用,不使用- 12 | values0 = (/[#\sA-Za-z0-9;\*,_:%/\.\+\(\)]+/); 13 | property= (/"/) $(values) (/"/); 14 | 
name= (/[\w|\-:]+/); 15 | 16 | noteleft= (//); 18 | #%Property% ,,note 19 | note =$(space) $(noteleft) $(values0) $(noteright) $(space); 20 | 21 | 22 | #%Property% ,$key,,$value 23 | properties =$(space) $(name) $(equal) $(property) $(space); 24 | 25 | manyproperties0 =$(properties)*; 26 | 27 | manyproperties =$(manyproperties0) | $(space); 28 | leftbracket = (/); 29 | endleftbracket = (//); 30 | rightbracket= (/>/); 31 | endrightbracket = (//>/); 32 | 33 | 34 | #%Property% children|text 35 | child = $(xmls) $(shortspace0) | $(values) $(shortspace0); 36 | 37 | #%Order% 2 38 | #%Property% ,,name,property,,child |,,name,property 39 | xml = $(space) $(leftbracket) $(name) $(manyproperties) $(rightbracket) 40 | $(child) $(space) $(endleftbracket) $(name) $(rightbracket) $(space) 41 | | $(space) $(leftbracket) $(name) $(manyproperties) $(space) $(endrightbracket) $(space) 42 | | $(note); 43 | 44 | xmls= $(xml)+; 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ferventdesert/tnpy/b6cc5fe9599890c2bfdc10c6e608439f2555afb2/src/__init__.py -------------------------------------------------------------------------------- /src/tngraph.py: -------------------------------------------------------------------------------- 1 | __author__ = 'zhaoyiming-laptop' 2 | import pygraphviz as pgv 3 | import os; 4 | from src.tnpy import StringEntity as SE, RegexEntity as RE, TableEntity as TE, SequenceEntity as SQE, RepeatEntity as RPE, \ 5 | EntityBase 6 | os.environ["PATH"]= r'D:\Program Files\graphviz-2.38\release\bin;'+os.environ["PATH"]; 7 | # strict (no parallel edges) 8 | # digraph 9 | # with attribute rankdir set to 'LR' 10 | 11 | def addNode(A,entity,nodes): 12 | nodeid= id(entity); 13 | name= str(entity); 14 | if nodeid not in nodes: 15 | A.add_node(name); 16 | nodes[nodeid]=entity; 17 | if 
# Maps a word-class name to the list of words loaded for it by initwordlib().
wordlib = {}


def initwordlib(path):
    """Load the word-library file at *path* into the module-level ``wordlib``.

    Every non-blank line is split on single spaces: the first field is the
    class name and the remaining fields (whitespace-stripped) are its words.
    A later line with the same name replaces the earlier entry.

    Args:
        path: path of a UTF-8 encoded text file.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The third positional parameter of open() is ``buffering`` (an int), so
    # the previous ``open(path, 'r', 'utf-8')`` raised TypeError before reading
    # anything.  The encoding has to be passed by keyword, and the ``with``
    # block guarantees the handle is closed.
    with open(path, 'r', encoding='utf-8') as read:
        for line in read:
            ws = line.split(' ')
            # Strip the name too: a line holding only a name would otherwise
            # be stored with its trailing newline as part of the key.
            name = ws[0].strip()
            if not name:
                continue  # blank line
            wordlib[name] = [w.strip() for w in ws[1:]]
class WordEntity(EntityBase):
    """Entity matching any word of the word-library classes selected by a prefix.

    ``RebuildEntity`` collects the words of every ``wordlib`` entry whose name
    starts with ``self.Word`` and delegates matching to a regex entity built
    from their alternation.
    """

    def __init__(self, name=None):
        super(WordEntity, self).__init__()
        self.Word = name
        # Load the default library the first time it is needed.
        if not wordlib:
            initwordlib('libs/wordlib.txt')

    def RebuildEntity(self):
        """Build ``self.Re`` from the word-library classes prefixed by ``self.Word``."""
        # ``wordlib`` is a dict and never None, so the old ``is None`` guard
        # could not fire; warn when the library is actually empty instead.
        if not wordlib:
            print('please init word lib')
        words = []
        for r in wordlib:
            if r.startswith(self.Word):
                words.extend(wordlib[r])

        # The module only imports names *from* src.tnpy, so ``tnpy.RegexEntity``
        # was an unresolved name; RegexEntity is in scope under the alias RE.
        # NOTE(review): words are joined verbatim, so regex metacharacters in
        # the library are interpreted as regex syntax - confirm this is wanted.
        self.Re = RE('|'.join(words))
        self.Re.RebuildEntity()
        self.Re.Core = self.Core

    def RewriteItem(self, input):
        """Return *input* unchanged (this entity never rewrites)."""
        return input

    def MatchItem(self, input, start, muststart, end, mode=None):
        """Delegate matching to the regex entity built by ``RebuildEntity``.

        NOTE(review): the third and fourth parameter names are swapped
        relative to the ``(input, start, end, muststart, mode)`` order used by
        the other MatchItem implementations.  The values are forwarded
        positionally in the order received, so positional callers are
        unaffected; the signature is kept as-is for compatibility.
        """
        return self.Re.MatchItem(input, start, muststart, end, mode)
route.append(m.MatchIndex) 32 | m = m.Children 33 | while m is not None: 34 | d.append(m) 35 | m = m.NextMatch 36 | return route 37 | 38 | 39 | class MatchResult(object): 40 | def __init__(self, entity, match, start, children=None, rstr=None): 41 | super(MatchResult, self).__init__() 42 | self.Order = 0 43 | self.MatchIndex = 0 44 | self.PropertyName = "" 45 | self.Children = children 46 | self.Entity = entity 47 | self.mstr = match 48 | self.rstr = rstr; 49 | self.pos = start 50 | self.IsShouldRewrite = None; 51 | self.CanSplit = False; 52 | 53 | def GetShouldRewrite(self): 54 | if self.IsShouldRewrite != None: 55 | return self.IsShouldRewrite; 56 | if self.Children is None: 57 | if self.Entity is None: 58 | return False; 59 | if isinstance(self.Entity, ScriptEntity) == False and self.Entity.Rewrite is None: 60 | return False; 61 | else: 62 | return True; 63 | else: 64 | r = False; 65 | order = 0; 66 | ms = self.Children; 67 | for m in ms: 68 | if order != m.Order: # order diff must be rewrite 69 | r = True; 70 | break; 71 | order += 1; 72 | r |= m.GetShouldRewrite(); 73 | self.IsShouldRewrite = r; 74 | return r; 75 | 76 | def RewriteItem(self): 77 | if self.rstr is not None: 78 | return self.rstr; 79 | if not self.IsShouldRewrite: 80 | self.rstr = self.mstr; 81 | 82 | if self.Children is None: 83 | self.rstr = self.Entity.RewriteItem(self.mstr) 84 | return self.rstr; 85 | 86 | match = self.Children[:]; 87 | match = sorted(match, key=lambda m: m.Order); 88 | frstr = ""; 89 | for m in match: 90 | frstr += m.RewriteItem(); 91 | self.rstr = frstr; 92 | if isinstance(self.Entity, ScriptEntity): 93 | self.rstr = self.Entity.RewriteItem(self.Children); 94 | return self.rstr 95 | 96 | def __str__(self): 97 | return self.mstr; 98 | 99 | def ExtractDocument(self, document, mode=0): 100 | childDoc = {}; 101 | if isinstance(self.Entity, RepeatEntity): 102 | childDoc0 = []; 103 | for m in self.Children: 104 | m.ExtractDocument(childDoc0, 1) 105 | if len(self.PropertyName) != 
0 and len(childDoc0) > 0: 106 | document['$' + self.PropertyName] = childDoc0; 107 | elif self.Children is not None: 108 | for m in self.Children: 109 | m.ExtractDocument(childDoc) 110 | if mode == 0: 111 | if len(self.PropertyName) != 0: 112 | if self.PropertyName == '$value': 113 | document[document['$key']] = self.RewriteItem(); 114 | del document['$key'] 115 | else: 116 | if RegexCore.ExtractDictEnabled: 117 | if len(childDoc) != 0: 118 | document['$' + self.PropertyName] = childDoc 119 | document[self.PropertyName] = self.RewriteItem(); 120 | else: 121 | if len(childDoc) != 0: 122 | for r in childDoc: 123 | document[r] = childDoc[r] 124 | else: 125 | document.append(childDoc); 126 | 127 | 128 | class EntityBase(object): 129 | def __init__(self): 130 | self.Script = None 131 | self.Order = 0 132 | self.Name = "" 133 | self.Rule = "" 134 | self.Type = "" 135 | self.Core = None 136 | self.Start = False; 137 | 138 | def RewriteItem(self, input): 139 | m = self.MatchItem(input, 0, None, True); 140 | return m.RewriteItem(); 141 | 142 | def RebuildEntity(self): 143 | pass; 144 | 145 | def SetValues(self, values): 146 | if isinstance(values, dict): 147 | value = values.get("Order", None); 148 | if value is not None: 149 | self.Order = int(value); 150 | value = values.get("Type", None); 151 | if value is not None: 152 | self.Type = value; 153 | value = values.get("Parameter", None); 154 | if value is None: 155 | return; 156 | if value.find('|') >= 0: 157 | return; 158 | va = value.split(','); 159 | for v in va: 160 | vs = v.split('='); 161 | if len(vs) != 2: 162 | continue; 163 | key, value = vs[0].strip(), vs[1].strip(); 164 | value = eval(value); 165 | setattr(self, vs[0].strip(), value); 166 | 167 | def EvalScript(self, m, ot=None): 168 | if self.Script == u'': 169 | return True 170 | if ot is None: 171 | ot = m[0].mstr; 172 | core = self.Core 173 | 174 | def check(condition, result, elsework=None): 175 | if eval(condition): 176 | r = eval(result); 177 | return r; 
178 | elif elsework is not None: 179 | r = eval(elsework); 180 | return r; 181 | 182 | def invoke(func, para): 183 | return eval(func)(para); 184 | 185 | def e(entityname): 186 | entity = self.Core.Entities[entityname] 187 | header = None 188 | header = entity.MatchItem(ot, 0, True, header) 189 | if not IsFail(header): 190 | header = MatchResult(entity, None, -100) 191 | return header 192 | return None 193 | 194 | def dist(name, i=0): 195 | header = e(name) 196 | if header is None: 197 | return int_max; 198 | return abs(header.pos - m[i].pos) 199 | 200 | result = eval(self.Script) 201 | return result 202 | 203 | def LogIn(self, input, start, end=None): 204 | if self.Core.LogFile is None: 205 | return 206 | if self.Core.LogFile.name.find('htm') < 0: 207 | if end is not None: 208 | end = start + 200; 209 | input = input[start: end].replace('\n', '<\\n>').replace('\r', '<\\r>'); 210 | self.Core.LogFile.write(' ' * self.Core.matchLevel * 2) 211 | self.Core.LogFile.write('%s,Raw =%s\r' % (str(self), input)) 212 | else: 213 | self.Core.LogFile.write('
' + ' ' * self.Core.matchLevel * 4) 214 | self.Core.LogFile.write('%s,Raw= %s
\r' % (str(self), input)) 215 | self.Core.matchLevel += 1 216 | 217 | def LogOut(self, match, buffered=False): 218 | if self.Core.LogFile is None: 219 | return 220 | self.Core.matchLevel -= 1 221 | if self.Core.LogFile.name.find('htm') < 0: 222 | self.Core.LogFile.write(' ' * self.Core.matchLevel * 2) 223 | if match is not None: 224 | match = match[:200].replace('\n', '<\\n>').replace('\r', '<\\r>'); 225 | self.Core.LogFile.write('%s,%s=%s\r' % (str(self), ('Buff ' if buffered else 'Match'), match)) 226 | else: 227 | self.Core.LogFile.write('%s,NG\r' % str(self)) 228 | else: 229 | 230 | self.Core.LogFile.write('' + ' ' * self.Core.matchLevel * 4) 231 | if match != None: 232 | self.Core.LogFile.write('%s,OK,Raw= %s
class StringEntity(EntityBase):
    """Entity that matches a fixed literal string and can rewrite it."""

    def __init__(self, match="", rewrite=None, condition=''):
        super(StringEntity, self).__init__()
        self.Match, self.Rewrite, self.Condition = match, rewrite, condition

    def RewriteItem(self, input):
        """Replace every occurrence of ``Match`` in *input* by ``Rewrite``, if one is set."""
        replacement = self.Rewrite
        return input if replacement is None else input.replace(self.Match, replacement)

    def SetValues(self, values):
        """Apply the base-class settings, then take match/rewrite from a list-like *values*."""
        super(StringEntity, self).SetValues(values)
        if not isinstance(values, dict):
            self.Match = values[0]
            if len(values) > 1:
                self.Rewrite = values[1]

    def MatchItem(self, input, start, end, muststart, mode=None):
        """Find ``Match`` in ``input[start:end]``.

        Returns a MatchResult on success.  On failure returns an int:
        ``int_max`` when the literal does not occur at all, otherwise the
        position at which it was found although a match at *start* was
        required.
        """
        self.LogIn(input, start)
        limit = int_max if end is None else end
        found = input.find(self.Match, start, limit)
        if found < 0:
            self.LogOut(None, False)
            return int_max
        if muststart == True and found != start:
            self.LogOut(None, False)
            return found

        self.LogOut(self.Match)
        result = MatchResult(self, self.Match, found)
        result.rstr = self.Rewrite if self.Rewrite is not None else self.Match
        return result
self.Core.Entities[self.Entity]; 295 | self.Entity.Core = self.Core; 296 | 297 | 298 | def SetValues(self, values): 299 | super(RepeatEntity, self).SetValues(values); 300 | if isinstance(values, dict): 301 | return 302 | cal = values[0]; 303 | if cal == '*': 304 | self.Least = 0 305 | self.Most = -1 306 | elif cal == '+': 307 | self.Least = 1 308 | self.Most = -1 309 | elif cal == '?': 310 | self.Least = 0 311 | self.Most = 1 312 | elif cal.startswith('{'): 313 | sp = self.__splitre.split(cal); 314 | self.Least = int(sp[1]) 315 | self.Most = int(sp[2]) 316 | if self.Most == -1: 317 | self.Most = 99999; 318 | 319 | def MatchItem(self, input, start, muststart, mode=None): 320 | self.LogIn(input, start) 321 | right = 0 322 | start = start 323 | lresult = None 324 | isStop = False; 325 | isReset = False; 326 | bestResults = []; 327 | 328 | omax = -1; 329 | while right < self.Most: 330 | result = self.Entity.MatchItem(input, start, muststart, None) 331 | if not IsFail(result): 332 | if right == 0: 333 | start = result.pos 334 | bestResults.append(result); 335 | else: 336 | if self.Equal: 337 | if result.pos != start or lresult.mstr != result.mstr: 338 | if not isinstance(self.Entity, RepeatEntity): 339 | isStop = True; 340 | else: 341 | if omax == -1: 342 | omax = self.Entity.Most; 343 | self.Entity.Most = self.Entity.Least; 344 | else: 345 | self.Entity.Most += 1; 346 | if self.Entity.Most >= omax: 347 | isStop = True; 348 | right = 0; 349 | start = 0; 350 | isReset = True; 351 | else: 352 | bestResults.append(result) 353 | 354 | elif result.pos != start: 355 | isStop = True; 356 | else: 357 | bestResults.append(result) 358 | if isStop: 359 | break 360 | if not isReset: 361 | lresult = result; 362 | start = result.pos + len(result.mstr); 363 | lresult.Order = right; 364 | right += 1 365 | isReset = False; 366 | 367 | else: 368 | break; 369 | if right < self.Least: 370 | self.LogOut(None,False) 371 | return start; 372 | pos = start 373 | matchResultString = 
class DiffEntity(EntityBase):
    """Entity that matches ``Universe`` and then checks the matched text against ``Complements``."""

    def __init__(self, universe=None, complements=None):
        super(DiffEntity, self).__init__()
        # Entity (or entity name, resolved later by RebuildEntity) matched first.
        self.Universe = universe
        # Entities (or entity names) applied to the text matched by Universe.
        self.Complements = complements if complements is not None else [];

    def RebuildEntity(self):
        """Replace entity names held as strings by the entities registered in ``Core.Entities``."""
        if isinstance(self.Universe, str):
            self.Universe = self.Core.Entities[self.Universe];
        for r in range(len(self.Complements)):
            if isinstance(self.Complements[r], str):
                self.Complements[r] = self.Core.Entities[self.Complements[r]];

    def MatchItem(self, input, start, end, muststart, mode=None):
        """Match ``Universe`` on *input*, then run every complement on the matched text.

        Returns a MatchResult wrapping the universe match, or an int failure
        value: the universe's own failure value, or the universe match
        position when a complement check fails.
        """
        self.LogIn(input, start)
        unresult = self.Universe.MatchItem(input, start, end, muststart, None)
        if IsFail(unresult):
            self.LogOut(None)
            return unresult;
        matchResult = None
        if len(self.Complements) != 0:
            for en in self.Complements:
                # Each complement is anchored at offset 0 of the universe's
                # matched text; the previous complement's result is passed in
                # the ``mode`` slot.
                matchResult = en.MatchItem(unresult.mstr, 0, None, True, matchResult)
                # NOTE(review): as written, a complement that FAILS to match
                # rejects the universe match - confirm this is the intended
                # polarity for a "difference" rule.
                if IsFail(matchResult):
                    self.LogOut(None)
                    return unresult.pos;
        p = MatchResult(self, unresult.mstr, unresult.pos, [unresult])
        self.LogOut(unresult.mstr)
        return p;
self.regex is None: 432 | try: 433 | self.regex = re.compile(self.Match) 434 | except: 435 | print("Regex Format error %s" % (self.Match)); 436 | 437 | def SetValues(self, values): 438 | super(RegexEntity, self).SetValues(values); 439 | if isinstance(values, dict): 440 | return; 441 | self.Match = values[0] 442 | if len(values) > 1: 443 | if isinstance(values[1], str): 444 | self.Rewrite = values[1] 445 | else: 446 | self.merge = True; 447 | self.maps = values[1]; 448 | 449 | try: 450 | self.regex = re.compile(self.Match) 451 | except: 452 | print("Regex Format error %s" % (self.Match)); 453 | 454 | def __Replace(self, m, string): 455 | if (m.lastindex != None): 456 | c = m.lastindex + 1; 457 | else: 458 | c = 1; 459 | for index in range(c): 460 | string = string.replace(u'$' + str(index), m.group(index)).replace('\\n', '\n'); 461 | return string; 462 | 463 | def MatchItem(self, input, start, end, muststart, mode=None): 464 | self.LogIn(input, start) 465 | if end is None: 466 | if muststart: 467 | m = self.regex.match(input, start); 468 | else: 469 | m = self.regex.search(input, start); 470 | else: 471 | if muststart: 472 | m = self.regex.match(input, start, end); 473 | else: 474 | m = self.regex.search(input, start, end) 475 | if m is None or (muststart == True and m.start() != start): 476 | self.LogOut(None) 477 | return int_max if m is None else m.start(); 478 | 479 | p = MatchResult(self, m.group(), m.start()) 480 | if self.merge: 481 | 482 | p.rstr = self.maps[p.mstr].RewriteItem(p.mstr); 483 | elif self.Rewrite is None: 484 | p.rstr = p.mstr; 485 | else: 486 | p.rstr = self.__Replace(m, self.Rewrite); 487 | self.LogOut(m.group()) 488 | return p; 489 | 490 | 491 | class ScriptEntity(EntityBase): 492 | def __init__(self, script=""): 493 | super(ScriptEntity, self).__init__() 494 | self.Script = script 495 | 496 | def SetValues(self, values): 497 | super(ScriptEntity, self).SetValues(values); 498 | if isinstance(values, list): 499 | self.Script = values[0] 500 | 
def IsFail(x):
    """Return True when *x* is an int, i.e. a matcher's failure value rather than a MatchResult."""
    return isinstance(x, int)
rex.SetValues([match, ms]); 574 | for r in seqs: 575 | self.Tables.remove(r); 576 | self.Tables.append(rex); 577 | for t in self.Tables: 578 | t.Core = self.Core; 579 | return 580 | 581 | def SetValues(self, values): 582 | super(TableEntity, self).SetValues(values); 583 | if isinstance(values, list): 584 | return; 585 | value = values.get("Property", None); 586 | if value is not None: 587 | items = [x.strip() for x in value.split('|')] 588 | for i in range(min(len(items), len(self.Tables))): 589 | str2 = [x.strip() for x in items[i].split(',')] 590 | self.Properties[i] = str2 591 | 592 | def MatchItem(self, input, start, end, muststart, mode=None): 593 | self.LogIn(input, start) 594 | bestLen = -1 595 | bestSeqID = -1 596 | bestStart = int_max; 597 | rpos = bestStart; 598 | submode = None; 599 | dictbuf = self.Core.Entities.SeqBuff 600 | bestMatchResult = None 601 | total = len(self.Tables) 602 | for seqid in range(total): 603 | if seqid in self.Group and bestSeqID != -1 and bestStart == start: 604 | break; 605 | entity = self.Tables[seqid] 606 | if mode is not None and mode.MatchIndex != -1: 607 | if seqid != mode.MatchIndex: 608 | continue 609 | submode = mode.Children; 610 | seqValue = dictbuf.GetMatch(entity, input, start, end); 611 | if seqValue == -1: 612 | continue 613 | if seqValue is not None: 614 | theader = seqValue; 615 | else: 616 | theader = entity.MatchItem(input, start, end, muststart, submode) 617 | if IsFail(theader): 618 | rpos = min(theader, rpos); 619 | if not muststart: 620 | dictbuf.AddScan(entity, start, start); 621 | if submode is not None and mode.MatchIndex == -1: 622 | mode.MatchIndex = -1; 623 | submode = None; 624 | else: 625 | dictbuf.AddScan(entity, start, theader.pos); 626 | dictbuf.AddEntity(entity, theader) 627 | if IsFail(theader): 628 | continue 629 | spos, slen = theader.pos, len(theader.mstr); 630 | if (spos < bestStart or ( 631 | spos == bestStart and (slen > bestLen if self.IsMatchMax == True else slen < bestLen))): 632 | 
def AddArea(sb, start, end):
    """Insert the interval ``[start, end]`` into *sb* in place and merge neighbours.

    *sb* is a flat list of interval bounds ``[s0, e0, s1, e1, ...]``; the
    binary search below assumes the pairs are ordered by their start value.

    Returns:
        The same list object *sb*.
    """
    l = len(sb);
    # Index at which a new pair was inserted; stays 0 when nothing was inserted.
    insp = 0;
    if l == 0:
        sb.append(start);
        sb.append(end);
    else:
        # Binary search over the even (start) slots only.
        left = 0;
        right = l - 2;
        while left <= right:
            # Midpoint rounded down to an even index so it lands on a start slot.
            mid = ((left + right) >> 2) << 1;
            if sb[mid] < start:
                left = mid + 2
                if right < left:
                    # Search exhausted: the new pair goes right after ``mid``.
                    if l <= left:
                        sb.append(start)
                        sb.append(end)
                        insp = l
                    else:
                        sb.insert(left, start)
                        sb.insert(left + 1, end)
                        insp = left;
                    break
            elif sb[mid] > start:
                right = mid - 2
                if right < left:
                    # Search exhausted: the new pair goes in front of ``mid``.
                    sb.insert(left, start)
                    sb.insert(left + 1, end)
                    insp = left;
                    break
            else:
                # An interval with the same start exists: only extend its end.
                if sb[mid + 1] < end:
                    sb[mid + 1] = end
                break

    # Merge pass, starting one pair before the insertion point.
    l = len(sb);
    if insp > 2:
        i = insp - 2;
    else:
        i = 0;
    while i < l:
        pi = i - 2
        # Keep pair i when it has no predecessor or starts more than one past
        # the predecessor's end (the ``pi`` update here has no further effect).
        if pi < 0 or sb[pi + 1] < sb[i] - 1:
            pi += 2;

        else:
            # Overlapping or adjacent: fold pair i into its predecessor.
            if sb[pi + 1] <= sb[i + 1]:
                sb[pi + 1] = sb[i + 1];
            del sb[pi + 2:pi + 4];
            l -= 2;
            # Step back so the enlarged predecessor is compared with the next pair.
            i -= 2;
        i += 2;
    return sb;
self.slen = slen; 718 | self.extractedarea = []; 719 | 720 | def BinarySearchIndex(self, arr, v): 721 | l = len(arr); 722 | if l == 0: 723 | return 0; 724 | left = 0; 725 | right = l - 1; 726 | while left <= right: 727 | mid = (left + right) >> 1; 728 | if arr[mid] > v: 729 | right = mid - 1; 730 | elif arr[mid] < v: 731 | left = mid + 1; 732 | else: 733 | break; 734 | if v < arr[mid]: 735 | return mid; 736 | elif v == arr[mid]: 737 | if mid % 2 == 0: 738 | return mid + 1; 739 | else: 740 | return mid; 741 | return mid + 1; 742 | 743 | def AddEntity(self, entity, matchResult): 744 | entityid = id(entity); 745 | sb = self.entitybuf.get(entityid, None); 746 | if sb is None: 747 | sb = []; 748 | self.entitybuf[entityid] = sb; 749 | lo = 0 750 | hi = len(sb) 751 | while lo < hi: 752 | mid = (lo + hi) // 2 753 | if matchResult.pos < sb[mid].pos: 754 | hi = mid 755 | else: 756 | lo = mid + 1 757 | sb.insert(lo, matchResult) 758 | 759 | def AddScan(self, entity, start, end=None): 760 | 761 | if entity != 0: 762 | entityid = id(entity); 763 | sb = self.scanbuf.get(entityid, None); 764 | else: 765 | sb = self.extractedarea; 766 | if sb is None: 767 | sb = []; 768 | self.scanbuf[entityid] = sb; 769 | if end is None: 770 | end = self.slen; 771 | AddArea(sb, start, end); 772 | 773 | def IsInExtractArea(self, pos): 774 | i = self.BinarySearchIndex(self.extractedarea, pos); 775 | if i % 2 == 0: 776 | return pos; 777 | return self.extractedarea[i]; 778 | 779 | def GetMatch(self, entity, input, start, end): 780 | 781 | entityid = id(entity); 782 | sb = self.scanbuf.get(entityid, None); 783 | if sb is None: 784 | return None; 785 | i = self.BinarySearchIndex(sb, start); 786 | if i % 2 == 0: 787 | return None; 788 | start = start; 789 | end = sb[i] 790 | eb = self.entitybuf.get(entityid, None); 791 | if eb is None: 792 | entity.LogIn(input, start, end) 793 | entity.LogOut(None,True); 794 | return end; 795 | hi = len(eb) 796 | lo = 0; 797 | while lo < hi: 798 | mid = (lo + hi) // 2 
class TreeNode(object):
    """Node of a binary tree holding a match entity (or match result) per node."""

    def __init__(self):
        self.Left = None      # left child, or None
        self.Right = None     # right child, or None
        self.Root = None      # parent node, or None
        self.Match = None     # payload attached to this node
        self.Rewrite = None   # rewrite payload attached to this node, if any
        self.Index = 0
        self.Order = 0

    def GetLeft(self):
        """Return the left-most node of the subtree rooted at this node."""
        tree = self
        while tree.Left is not None:
            tree = tree.Left
        return tree

    def GetRight(self):
        """Return the right-most node of the subtree rooted at this node."""
        tree = self
        while tree.Right is not None:
            tree = tree.Right
        return tree

    def InOrderTravel(self, node, func):
        """Call ``func(n)`` for every node under *node*: left subtree, node, right subtree."""
        if node is None:
            return
        self.InOrderTravel(node.Left, func)
        func(node)
        self.InOrderTravel(node.Right, func)


def IsSameValue(arr, l, r):
    """Return True when ``arr[l:r]`` has at least two items and all are equal."""
    if r < l + 2:
        return False
    return all(arr[i] == arr[l] for i in range(l + 1, r))


def GetMaxIndex(arr, l, r):
    """Return the index of the first largest value in ``arr[l:r]``, or -1 if the range is empty.

    The previous implementation seeded the running maximum with the magic
    value -100, so any range whose values were all <= -100 returned -1 as if
    it were empty.  The running maximum is now seeded from the data itself;
    results are unchanged for every input with a value above -100.
    """
    max_index = -1
    for i in range(l, r):
        if max_index == -1 or arr[i] > arr[max_index]:
            max_index = i
    return max_index
= matchorders if matchorders is not None else None; 874 | self.Property = [] 875 | self.Condition = condition; 876 | self.Root = None; 877 | 878 | def SetValues(self, values): 879 | super(SequenceEntity, self).SetValues(values); 880 | if isinstance(values, list): 881 | return; 882 | value = values.get("Property", None); 883 | if value is not None: 884 | self.Property = [x.strip() for x in value.split(',')] 885 | 886 | def BuildMatchTree(self, l, r): 887 | if l > r or l >= len(self.MatchOrders): 888 | return None; 889 | if r == l: 890 | tree = TreeNode(); 891 | tree.Match = self.MatchEntities[l]; 892 | return tree; 893 | if IsSameValue(self.MatchOrders, l, r): 894 | tb = TableEntity(); 895 | tb.Core = self.Core; 896 | for item in itertools.combinations(self.MatchEntities[l:r], r - l + 1): 897 | se = SequenceEntity(item); 898 | se.Core = core; 899 | tb.Tables.append(se); 900 | tree = TreeNode(); 901 | tree.Match = tb; 902 | return tree; 903 | max_index = GetMaxIndex(self.MatchOrders, l, r) 904 | tree = TreeNode(); 905 | tree.Order = self.MatchOrders[max_index]; 906 | tree.Index = max_index; 907 | tree.Match = self.MatchEntities[max_index]; 908 | if max_index < len(self.RewriteEntities): 909 | tree.Rewrite = self.RewriteEntities[self.RewriteOrders[max_index]] 910 | tree.Left = self.BuildMatchTree(l, max_index - 1); 911 | if tree.Left is not None: 912 | tree.Left.Root = tree; 913 | tree.Right = self.BuildMatchTree(max_index + 1, r); 914 | if tree.Right is not None: 915 | tree.Right.Root = tree; 916 | return tree; 917 | 918 | def RebuildEntity(self): 919 | for r in range(len(self.MatchEntities)): 920 | if isinstance(self.MatchEntities[r], str): 921 | self.MatchEntities[r] = self.Core.Entities[self.MatchEntities[r]]; 922 | self.MatchEntities[r].Core = self.Core; 923 | for r in range(len(self.RewriteEntities)): 924 | if isinstance(self.RewriteEntities[r], str): 925 | self.RewriteEntities[r] = self.Core.Entities[self.RewriteEntities[r]]; 926 | self.RewriteEntities[r].Core 
= self.Core; 927 | if self.MatchOrders is None: 928 | self.MatchOrders = [i for i in range(len(self.MatchEntities), 0, -1)]; 929 | 930 | self.Tree = self.BuildMatchTree(0, len(self.MatchOrders)); 931 | 932 | def TreeNodeMatch(self, treenode, input, start, end, finalmatchScript, muststart=False): 933 | dictbuf = self.Core.Entities.SeqBuff 934 | matchEntity = treenode.Match; 935 | matchResult = dictbuf.GetMatch(matchEntity, input, start, end); 936 | fail = False; 937 | if matchResult is None: 938 | matchResult = matchEntity.MatchItem(input, start, end, treenode.Left is None and muststart) 939 | if not IsFail(matchResult): 940 | dictbuf.AddScan(matchEntity, start, matchResult.pos); 941 | dictbuf.AddEntity(matchEntity, matchResult); 942 | if not IsFail(matchResult): 943 | if treenode.Right is None and end is not None and matchResult.pos + len(matchResult.mstr) != end: 944 | fail = True; 945 | if not finalmatchScript: 946 | rewriteEntity = treenode.Rewrite; 947 | if rewriteEntity is not None and rewriteEntity.Name != self.DirectReplace: 948 | if isinstance(rewriteEntity, ScriptEntity): 949 | matchResult = rewriteEntity.MatchItem2(input, matchResult, True) 950 | if matchResult is None: 951 | fail = True; 952 | else: 953 | matchResult.rstr = rewriteEntity.RewriteItem(matchResult.mstr) 954 | if not fail and not IsFail(matchResult): 955 | mleft = matchResult.pos; 956 | mright = mleft + len(matchResult.mstr); 957 | runtree = TreeNode(); 958 | runtree.Match = matchResult; 959 | if treenode.Left is not None: 960 | left = self.TreeNodeMatch(treenode.Left, input, start, matchResult.pos, finalmatchScript); 961 | if IsFail(left): 962 | fail = True; 963 | elif muststart == True and left.GetLeft().Match.pos != start: 964 | fail = True; 965 | else: 966 | rm = left.GetRight().Match; 967 | if rm.pos + len(rm.mstr) != mleft: 968 | fail = True; 969 | runtree.Left = left; 970 | if not fail and treenode.Right is not None: 971 | right = self.TreeNodeMatch(treenode.Right, input, 
matchResult.pos + len(matchResult.mstr), end, 972 | finalmatchScript); 973 | if IsFail(right): 974 | fail = True; 975 | elif right.GetLeft().Match.pos != mright: 976 | fail = True; 977 | elif end is not None: 978 | rright = right.GetRight().Match; 979 | rpos = rright.pos + len(rright.mstr); 980 | if rpos != end: 981 | fail = True; 982 | 983 | runtree.Right = right; 984 | if IsFail(matchResult): 985 | return matchResult; 986 | elif fail: 987 | return matchResult.pos + len(matchResult.mstr); 988 | return runtree; 989 | 990 | def MatchItem(self, input, start, end, muststart, mode=None): 991 | self.LogIn(input, start) 992 | finalmatchScript = False; 993 | if len(self.MatchEntities) > 1 and len(self.RewriteEntities) == 1 and isinstance(self.RewriteEntities[0], 994 | ScriptEntity): 995 | finalmatchScript = True; 996 | treeResult = self.TreeNodeMatch(self.Tree, input, start, end, finalmatchScript, muststart); 997 | if IsFail(treeResult): 998 | self.LogOut(None) 999 | return treeResult; 1000 | matchResults = []; 1001 | treeResult.InOrderTravel(treeResult, lambda m: matchResults.append(m.Match)); 1002 | for i in range(len(matchResults)): 1003 | if i < len(self.Property): 1004 | matchResults[i].PropertyName = self.Property[i] 1005 | if i < len(self.RewriteOrders): 1006 | matchResults[i].Order = self.RewriteOrders[i] 1007 | else: 1008 | matchResults[i].Order = i 1009 | if self.Condition is not None and self.Condition.EvalScript(matchResults, input) == False: 1010 | self.LogOut(None) 1011 | return start; 1012 | start = matchResults[0].pos; 1013 | sum = 0; 1014 | for i in range(0, len(matchResults)): 1015 | sum += len(matchResults[i].mstr); 1016 | mstring = input[start:start + sum]; 1017 | if finalmatchScript: 1018 | script = self.RewriteEntities[0]; 1019 | p = MatchResult(script, mstring, start, matchResults); 1020 | return p; 1021 | if len(matchResults) > 1: 1022 | p = MatchResult(self, mstring, start, matchResults) 1023 | else: 1024 | p = matchResults[0]; 1025 | 
class Entities(object):
    """Registry of rule entities, indexed by name and by order."""

    def __init__(self):
        super(Entities, self).__init__()
        self.AllEntities = []    # every entity passed to append()
        self.ValidEntities = []  # entities registered through appendids()
        self.EntityNames = {}    # Name -> entity, for entities named at append time
        self.EntityIds = {}      # Order -> entity
        # Per-input match buffer; RegexCore assigns a fresh one before each
        # Rewrite/Match/Extract run.
        self.SeqBuff = None

    def appendids(self, entity):
        """Register *entity* under its ``Order`` unless that order is -1."""
        # NOTE(review): the source indentation was lost; both statements are
        # assumed to sit under the guard -- confirm against the real file.
        if -1 != entity.Order:
            self.EntityIds[entity.Order] = entity
            self.ValidEntities.append(entity)

    def append(self, entity):
        """Add *entity* to the registry, indexing it by name when it has one."""
        if entity.Name is not None:
            self.EntityNames[entity.Name] = entity
        self.AllEntities.append(entity)

    def __getitem__(self, item):
        """Return the entity named *item*, or None when there is none.

        The name index is consulted first; entities whose ``Name`` was set
        after ``append`` are found by a linear scan. The "can not be found"
        message is printed only when both lookups miss.
        """
        if item in self.EntityNames:
            return self.EntityNames[item]
        for entity in self.AllEntities:
            if entity.Name == item:
                return entity
        print("Entity name %s can not be found!" % (item))
        return None


class Token:
    """Integer codes for the token kinds produced by the rule lexer."""
    (NAME, ENTITY, MINUS, COLON, END, BAR, REPEAT, Script) = range(8)


class RegexToken:
    """Pairs a compiled lexer regex with the token kind it produces."""

    def __init__(self, regex, token, count, type=None):
        self.Regex = regex
        self.Token = token
        self.Count = count
        # Entity class to instantiate for this token, or None.
        self.EntityType = type


class TokenItem:
    """One lexed token: its kind, matched text, captured values and entity."""

    def __init__(self, token):
        self.Rule = None
        self.Token = token
        self.Entity = None
        self.Values = []
def InitPyRule(self, pyrule):
    """Register the entities declared as attributes of *pyrule*.

    Every attribute of *pyrule* that is an ``EntityBase`` is bound to this
    core, named after the attribute and added to the registry; entities with
    a non-zero ``Order`` are also registered by order. Afterwards every
    registered entity is asked to rebuild itself.
    """
    for attr_name in pyrule.__dict__:
        candidate = getattr(pyrule, attr_name)
        if not isinstance(candidate, EntityBase):
            continue
        candidate.Core = self
        candidate.Name = attr_name
        self.Entities.append(candidate)
        if candidate.Order != 0:
            self.Entities.appendids(candidate)
    for registered in self.Entities.AllEntities:
        registered.RebuildEntity()
\n' % t) 1147 | else: 1148 | newfile.write('%s
\n' % t) 1149 | 1150 | self.WriteHTMLEnd(newfile) 1151 | newfile.close() 1152 | file_object.close() 1153 | 1154 | def InitRuleText(self, text, addtoOrder=True): 1155 | propertyregex = re.compile("#%(\w+)%\s(.+)") 1156 | tokenRegex = [RegexToken(self.__entity_name, Token.NAME, 2), 1157 | RegexToken(self.__entity_reexp, Token.ENTITY, 3, RegexEntity), 1158 | RegexToken(self.__entity_string, Token.ENTITY, 3, StringEntity), 1159 | RegexToken(self.__r_reexp, Token.ENTITY, 1, RegexEntity), 1160 | RegexToken(self.__r_order, Token.ENTITY, 1), 1161 | RegexToken(self.__r_entity, Token.ENTITY, 2), 1162 | RegexToken(self.__r_string, Token.ENTITY, 2, StringEntity), 1163 | RegexToken(self.__r_minus, Token.MINUS, 2), 1164 | RegexToken(self.__r_colon, Token.COLON, 1), 1165 | RegexToken(self.__r_repeat, Token.REPEAT, 2), 1166 | RegexToken(self.__r_bar, Token.BAR, 1), 1167 | RegexToken(self.__r_bar2, Token.BAR, 1), 1168 | RegexToken(self.__r_semicolon, Token.END, 1), 1169 | RegexToken(self.__r_conds, Token.ENTITY, 1, ScriptEntity), 1170 | ] 1171 | sb = "" 1172 | realRules = [] 1173 | rules = [x.strip() for x in text.split('\n')] # PreProcessing 1174 | for rule in rules: 1175 | if propertyregex.match(rule): 1176 | realRules.append(rule.strip()) 1177 | continue 1178 | if rule.startswith(u"#"): 1179 | sb = "" 1180 | continue 1181 | if rule.endswith(';'): 1182 | sb += rule 1183 | realRules.append(sb.strip()) 1184 | sb = "" 1185 | continue 1186 | else: 1187 | sb += rule 1188 | properties = {}; 1189 | for rule in realRules: 1190 | m = propertyregex.match(rule); 1191 | if m is not None: 1192 | if m.lastindex is not None and m.lastindex == 2: 1193 | name, value = m.group(1), m.group(2); 1194 | if name == "Script": 1195 | item = __import__(value) 1196 | setattr(self, value, item) 1197 | elif name == "Include": 1198 | value = value.split(' '); 1199 | isadd = False; 1200 | if len(value) > 1: 1201 | isadd = value[1] == "True"; 1202 | self.InitTNRule(value[0], isadd); 1203 | else: 1204 | 
properties[m.group(1)] = m.group(2); 1205 | continue 1206 | 1207 | tokenItems = [] # Lexical Analyse 1208 | while True: 1209 | if len(rule) == 0: break 1210 | canmatch = False; 1211 | for token in tokenRegex: 1212 | mat = token.Regex.match(rule) 1213 | if mat is None: continue 1214 | 1215 | mcount = mat.lastindex if mat.lastindex is not None else 1; 1216 | if mcount < token.Count - 1: 1217 | continue 1218 | canmatch = True; 1219 | tokenItem = TokenItem(token.Token) 1220 | for r in range(mcount): 1221 | tokenItem.Values.append(mat.string if mat.lastindex is None else mat.group(r + 1)) 1222 | tokenItem.Rule = mat.group(0) 1223 | 1224 | e = None 1225 | if token.EntityType is not None: 1226 | e = token.EntityType() 1227 | e.Core = self 1228 | e.SetValues(tokenItem.Values) 1229 | elif token.Regex == self.__r_entity: 1230 | e = tokenItem.Values[0] 1231 | if e is not None: 1232 | tokenItem.Entity = e 1233 | rule = rule[len(tokenItem.Rule):] 1234 | tokenItems.append(tokenItem) 1235 | break 1236 | if not canmatch: 1237 | print("rule format error%s" % (rule)); 1238 | return; 1239 | 1240 | if Token.NAME != tokenItems[0].Token: # Grammer Analyse 1241 | print("name must be the first") 1242 | if tokenItems[-1].Token != Token.END: 1243 | print("Rule must be ended by ;") 1244 | 1245 | if findany(tokenItems, lambda r: r.Token == Token.BAR): 1246 | entity = TableEntity() 1247 | entity.Core = self; 1248 | 1249 | lastid = 0 1250 | for id in range(1, len(tokenItems)): 1251 | if tokenItems[id].Token == Token.BAR or tokenItems[id].Token == Token.END: 1252 | tentity = self.__GetNonTableEntity(tokenItems[lastid + 1:id], isOnlyOne=False); 1253 | if isinstance(tentity, EntityBase) and tentity.Name == "": 1254 | tentity.Name = "%s_%d" % (tokenItems[0].Values[0], len(entity.Tables)); 1255 | entity.Tables.append(tentity); 1256 | lastid = id 1257 | if tokenItems[id].Rule.find("/") == 0: 1258 | entity.Group.append(len(entity.Tables)) 1259 | else: 1260 | entity = 
def __GetNonTableEntity(self, tokenItems, isOnlyOne):
    """Build the entity described by one bar-free run of tokens.

    Three shapes are recognised, in this order:

    * ``entity REPEAT``  -> ``RepeatEntity``
    * ``entity - entity [- entity ...]`` -> ``DiffEntity``
    * anything else -> ``SequenceEntity`` with ``match : rewrite : condition``
      sections separated by colon tokens. A single token is returned as its
      own entity when *isOnlyOne* is false, or when it is an ``EntityBase``
      whose ``Name`` is empty.

    Args:
        tokenItems: lexed ``TokenItem`` objects for this run.
        isOnlyOne: whether the run is the whole rule body rather than one
            alternative of a table.

    Returns:
        The constructed entity (or the token's own entity, see above).

    Raises:
        ValueError: if a repeat or minus token is present but is not the
            second token, or if a minus token has nothing after it.
    """
    repeat = getindex(tokenItems, lambda r: r.Token == Token.REPEAT)
    if repeat >= 0:
        if repeat != 1:
            # Was `raise "..."`, which is itself a TypeError in Python 3.
            raise ValueError("repeat format error")
        entity = RepeatEntity()
        entity.Core = self
        entity.Entity = tokenItems[0].Entity
        entity.SetValues(tokenItems[1].Values)
        return entity

    minus = getindex(tokenItems, lambda r: r.Token == Token.MINUS)
    if minus >= 0:
        if minus != 1:
            raise ValueError("diff format error")
        entity = DiffEntity()
        entity.Core = self
        entity.Universe = tokenItems[0].Entity
        for index in range(1, len(tokenItems)):
            tokenItem = tokenItems[index]
            if tokenItem.Token == Token.END:
                return entity
            if tokenItem.Token == Token.MINUS:
                if index + 1 >= len(tokenItems):
                    # A dangling '-' used to surface as a bare IndexError.
                    raise ValueError("diff format error")
                entity.Complements.append(tokenItems[index + 1].Entity)
        return entity

    if len(tokenItems) == 1:
        only = tokenItems[0].Entity
        if not isOnlyOne:
            return only
        if isinstance(only, EntityBase) and only.Name == "":
            return only

    entity = SequenceEntity()
    entity.Core = self
    state = 0  # 0: match section, 1: rewrite section, 2+: condition
    for tokenItem in tokenItems:
        if tokenItem.Token == Token.END:
            return entity
        if tokenItem.Token == Token.COLON:
            state += 1
            continue
        if state == 0:
            entity.MatchEntities.append(tokenItem.Entity)
        elif state == 1:
            if tokenItem.Entity is None:
                # A token without an entity is a `$N` order reference
                # (1-based in the rule text, stored 0-based).
                entity.RewriteOrders.append(int(tokenItem.Rule.replace("$", "")) - 1)
            else:
                entity.RewriteEntities.append(tokenItem.Entity)
                entity.RewriteOrders.append(len(entity.RewriteOrders))
        else:
            entity.Condition = tokenItem.Entity
    return entity
def CompileString(self, input, modes):
    """Scan *input* left to right and fold its matches into *modes*.

    At each position the known modes are tried first; when none of them
    matches, the valid entities are tried in order and the first result is
    appended to *modes* as a new mode. A mode that matches again is replaced
    by the tree it has in common with the new result.

    Args:
        input: text to scan.
        modes: list of previously learned match results, or None.

    Returns:
        The (possibly newly created) list of modes; the incoming *modes*
        value is returned unchanged when nothing matches at the current
        position.
    """
    startPos = 0
    while startPos < len(input):
        modeindex = -1
        matchResult = None
        if modes is not None:
            for index, mode in enumerate(modes):
                modeEntity = mode.Entity
                # Argument order is (input, start, end, muststart, mode). The
                # previous call omitted `end` and read `entity.Start` before
                # `entity` was bound.
                matchResult = modeEntity.MatchItem(input, startPos, None, modeEntity.Start, mode)
                # NOTE(review): other callers in this file test failures with
                # IsFail(); this method only treats None as "no match" --
                # confirm which protocol the entities follow here.
                if matchResult is not None:
                    modeindex = index
                    break
        if modeindex == -1:
            for entity in self.Entities.ValidEntities:
                matchResult = entity.MatchItem(input, startPos, None, entity.Start)
                if matchResult is not None:
                    if modes is None:
                        modes = []
                    modes.append(matchResult)
                    break
        if matchResult is None:
            return modes
        if modeindex != -1:
            modes[modeindex] = self.__GetPublicTree(matchResult, modes[modeindex])
        # Always advance, so a zero-length match cannot loop forever.
        startPos += len(matchResult.mstr) or 1
    return modes
def __MatchResult2Doc__(self, matchResult):
    """Turn *matchResult* into a plain dict document.

    The result is rewritten first, then asked to extract its own fields into
    the document; finally the ``#type``, ``#pos``, ``#match`` and
    ``#rewrite`` bookkeeping keys are set from the result.
    """
    matchResult.RewriteItem()
    document = {}
    matchResult.ExtractDocument(document, 0)
    document.update({
        '#type': matchResult.Entity.Name,
        '#pos': matchResult.pos,
        '#match': matchResult.mstr,
        '#rewrite': matchResult.rstr,
    })
    return document
matchResult.pos: 1502 | docu = self.__MatchResult2Doc__(matchResult); 1503 | docs.append(docu); 1504 | buffhelper.AddScan(0, matchResult.pos, start); 1505 | return docs; 1506 | 1507 | def Extract(self, input, modes=None,entities=None): 1508 | if entities is None: 1509 | entities=self.Entities.ValidEntities; 1510 | self.Entities.SeqBuff = BuffHelper(len(input)); 1511 | docs = []; 1512 | succ = False; 1513 | if modes is not None: 1514 | for mode in modes: 1515 | entity = mode.Entity; 1516 | mdocs = self.ExtractEntity(entity, input, mode) 1517 | for doc in mdocs: 1518 | docs.append(doc); 1519 | succ = True; 1520 | break; 1521 | if not succ: 1522 | for entity in entities: 1523 | mdocs = self.ExtractEntity(entity, input) 1524 | for doc in mdocs: 1525 | docs.append(doc); 1526 | return docs; 1527 | 1528 | 1529 | 1530 | -------------------------------------------------------------------------------- /test/chs.txt: -------------------------------------------------------------------------------- 1 | 中国的面积有960万平方公里 2 | 一百安培 3 | 硬盘的容量是80GB 4 | 1999年12月31日 5 | 12点25分18秒 6 | 80公分以上 7 | 八点三十 8 | 一百二十 9 | 10点24分20秒 10 | 12点三十四分十五秒 11 | buptzym@qq.com 12 | zhaoyiming@qq.com 13 | 136-0377-0086 14 | 0341-8453235 15 | 152601196705082542 16 | 43072119880818492X 17 | 270元 18 | 270万元 19 | 424194253 20 | 三万2千 21 | 7月8号 22 | 三月1日 23 | 5月20日 24 | 2013.12.13 25 | 从三月1日到5月20日 26 | 从2005年到2010年 27 | 2013.12.13-2015.7.5 28 | 2014年七月 29 | 三点15分 30 | 八点一刻 31 | 凌晨4点十五分 32 | 十二点三十七分 33 | 12:30:04 34 | 12:30 35 | 20MB 36 | 五十兆 37 | 30欧姆 38 | 40Ω 39 | 一百二十分贝 40 | 硬盘的容量是80GB 41 | 八吨的大象 42 | 游泳池有500m宽 43 | http://news.163.com/special/bra_pc/ 44 | 北京市东城区沙滩后街59号13排 45 | 西安市莲湖区桃园一坊简易2号楼12号 46 | 山西省忻州市忻府区 47 | 北京市海淀区 48 | 49 | 50 | -------------------------------------------------------------------------------- /test/learn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from src.tnpy import RegexCore 4 | core = RegexCore('../rules/learn') 5 | 
import src.tngraph as graph 6 | graph.buildGraph(core,'int_0_99'); 7 | exit() 8 | RegexCore.LogFile = open("learn.log", 'w') 9 | RegexCore.LogFile.truncate() 10 | #matchs=core.Match('领导你好!老婆你好'); 11 | #for m in matchs: 12 | # print('match',m.mstr, 'pos:',m.pos) 13 | 14 | 15 | print(core.Rewrite('领导你好!老婆您好')); 16 | 17 | print({r:core.Rewrite(r) for r in ['十','三十七','一十三','68']}); 18 | 19 | RegexCore.LogFile.flush() 20 | RegexCore.LogFile.close() 21 | -------------------------------------------------------------------------------- /test/sample.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import sys 4 | sys.path.append("../src") 5 | from tnpy import RegexCore 6 | import json; 7 | #import tngraph as graph 8 | 9 | core = RegexCore('../rules/cnext') 10 | #graph.buildGraph(core,'time_fix'); 11 | #exit() 12 | #RegexCore.LogFile = open("info.html", 'w') 13 | #RegexCore.LogFile.truncate() 14 | 15 | print(core.Extract('十三分之二十四',entities=[core.Entities['fraction']])) 16 | read = open('chs.txt', 'r', encoding='utf-8') 17 | lines = [x for x in read.readlines()] 18 | 19 | 20 | for line in lines: 21 | r = core.Extract(line) 22 | js = json.dumps(r, indent=2, ensure_ascii=False); 23 | print(js); 24 | 25 | 26 | 27 | 28 | 29 | 30 | #RegexCore.LogFile.flush() 31 | #RegexCore.LogFile.close() --------------------------------------------------------------------------------