├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── core ├── Cargo.toml └── src │ ├── builder.rs │ ├── helpers.rs │ ├── lib.rs │ ├── pattern.rs │ ├── range.rs │ ├── rule.rs │ └── stash.rs ├── ml ├── Cargo.toml └── src │ └── lib.rs ├── src ├── lib.rs ├── macros.rs └── train.rs └── update_version.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock 7 | Cargo.lock 8 | target 9 | Cargo.lock 10 | 11 | *.rustfmt 12 | tmp 13 | temp 14 | .idea/ 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - stable 4 | - beta 5 | - nightly 6 | matrix: 7 | allow_failures: 8 | - rust: nightly 9 | script: 10 | - cargo test --all --verbose -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustling" 3 | version = "0.9.1" 4 | authors = ["hdlj ", "Mathieu Poumeyrol "] 5 | edition = "2018" 6 | 7 | [workspace] 8 | members = ["core", "ml"] 9 | 10 | [dependencies] 11 | rustling-core = { path = "core" } 12 | rustling-ml = { path = "ml" } 13 | failure = "0.1" 14 | fnv = "1.0" 15 | serde = { version = "1.0", features = ["derive"] } 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ## License 2 | 3 | Licensed under either of 4 | * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or 
http://www.apache.org/licenses/LICENSE-2.0) 5 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 6 | at your option. 7 | 8 | ### Contribution 9 | 10 | Unless you explicitly state otherwise, any contribution intentionally submitted 11 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall 12 | be dual licensed as above, without any additional terms or conditions. 13 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rustling 2 | [![Build Status](https://travis-ci.org/snipsco/rustling.svg?branch=master)](https://travis-ci.org/snipsco/rustling) 3 | 4 | 5 | Rust port of https://github.com/facebookincubator/duckling 6 | 7 | # License 8 | 9 | ## Apache 2.0/MIT 10 | 11 | All original work licensed under either of 12 | * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 13 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 14 | at your option. 15 | 16 | ## Contribution 17 | 18 | Unless you explicitly state otherwise, any contribution intentionally submitted 19 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall 20 | be dual licensed as above, without any additional terms or conditions. 
21 | -------------------------------------------------------------------------------- /core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustling-core" 3 | version = "0.9.1" 4 | authors = ["hdlj ", "Mathieu Poumeyrol "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | regex = "1.0" 9 | smallvec = "0.6" 10 | failure = "0.1" 11 | string-interner = "0.7" 12 | serde = { version = "1.0", features = ["derive"] } 13 | -------------------------------------------------------------------------------- /core/src/builder.rs: -------------------------------------------------------------------------------- 1 | use crate::helpers::BoundariesChecker; 2 | use crate::rule::{ 3 | Rule, Rule1, Rule2, Rule3, Rule4, Rule5, Rule6, RuleProductionArg, RuleResult, TerminalRule, 4 | }; 5 | use crate::{ 6 | cell, pattern, CoreResult, NodePayload, Pattern, RuleSet, StashIndexable, Sym, SymbolTable, 7 | TerminalPattern, 8 | }; 9 | 10 | pub struct RuleSetBuilder { 11 | symbols: cell::RefCell, 12 | composition_rules: cell::RefCell>>>, 13 | terminal_rules: cell::RefCell>>>, 14 | word_boundaries: BoundariesChecker, 15 | match_boundaries: BoundariesChecker, 16 | } 17 | 18 | impl RuleSetBuilder { 19 | pub fn new( 20 | word_boundaries: BoundariesChecker, 21 | match_boundaries: BoundariesChecker, 22 | ) -> RuleSetBuilder { 23 | RuleSetBuilder { 24 | symbols: cell::RefCell::new(SymbolTable::default()), 25 | composition_rules: cell::RefCell::new(vec![]), 26 | terminal_rules: cell::RefCell::new(vec![]), 27 | word_boundaries, 28 | match_boundaries, 29 | } 30 | } 31 | } 32 | 33 | impl RuleSetBuilder { 34 | pub fn sym(&self, val: S) -> Sym 35 | where 36 | S: Into + AsRef, 37 | { 38 | self.symbols.borrow_mut().sym(val) 39 | } 40 | 41 | pub fn rule_1(&self, sym: S, pa: PA, production: F) 42 | where 43 | S: Into + AsRef, 44 | V: NodePayload + 'static, 45 | StashValue: StashIndexable + From + 'static, 46 | F: for<'a> 
Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + 'static + Send + Sync, 47 | PA: Pattern + 'static, 48 | { 49 | let sym = self.sym(sym); 50 | self.composition_rules 51 | .borrow_mut() 52 | .push(Box::new(Rule1::new(sym, pa, production))) 53 | } 54 | 55 | pub fn rule_1_terminal(&self, sym: S, pa: PA, production: F) 56 | where 57 | S: Into + AsRef, 58 | V: NodePayload + 'static, 59 | StashValue: StashIndexable + From + 'static, 60 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + 'static + Send + Sync, 61 | PA: TerminalPattern + 'static, 62 | { 63 | let sym = self.sym(sym); 64 | self.terminal_rules 65 | .borrow_mut() 66 | .push(Box::new(Rule1::new(sym, pa, production))) 67 | } 68 | 69 | pub fn rule_2(&self, sym: S, pa: PA, pb: PB, production: F) 70 | where 71 | S: Into + AsRef, 72 | V: NodePayload + 'static, 73 | StashValue: StashIndexable + From + 'static, 74 | F: for<'a> Fn( 75 | &RuleProductionArg<'a, PA::M>, 76 | &RuleProductionArg<'a, PB::M>, 77 | ) -> RuleResult 78 | + 'static 79 | + Send 80 | + Sync, 81 | PA: Pattern + 'static, 82 | PB: Pattern + 'static, 83 | { 84 | let sym = self.sym(sym); 85 | self.composition_rules 86 | .borrow_mut() 87 | .push(Box::new(Rule2::new(sym, (pa, pb), production))) 88 | } 89 | 90 | pub fn rule_2_terminal(&self, sym: S, pa: PA, pb: PB, production: F) 91 | where 92 | S: Into + AsRef, 93 | V: NodePayload + 'static, 94 | StashValue: StashIndexable + From + 'static, 95 | F: for<'a> Fn( 96 | &RuleProductionArg<'a, PA::M>, 97 | &RuleProductionArg<'a, PB::M>, 98 | ) -> RuleResult 99 | + 'static 100 | + Send 101 | + Sync, 102 | PA: TerminalPattern + 'static, 103 | PB: TerminalPattern + 'static, 104 | { 105 | let sym = self.sym(sym); 106 | self.terminal_rules 107 | .borrow_mut() 108 | .push(Box::new(Rule2::new(sym, (pa, pb), production))) 109 | } 110 | 111 | pub fn rule_3(&self, sym: S, pa: PA, pb: PB, pc: PC, production: F) 112 | where 113 | S: Into + AsRef, 114 | V: NodePayload + 'static, 115 | StashValue: StashIndexable + 
From + 'static, 116 | F: for<'a> Fn( 117 | &RuleProductionArg<'a, PA::M>, 118 | &RuleProductionArg<'a, PB::M>, 119 | &RuleProductionArg<'a, PC::M>, 120 | ) -> RuleResult 121 | + 'static 122 | + Send 123 | + Sync, 124 | PA: Pattern + 'static, 125 | PB: Pattern + 'static, 126 | PC: Pattern + 'static, 127 | { 128 | let sym = self.sym(sym); 129 | self.composition_rules 130 | .borrow_mut() 131 | .push(Box::new(Rule3::new(sym, (pa, pb, pc), production))) 132 | } 133 | 134 | pub fn rule_3_terminal( 135 | &self, 136 | sym: S, 137 | pa: PA, 138 | pb: PB, 139 | pc: PC, 140 | production: F, 141 | ) where 142 | S: Into + AsRef, 143 | V: NodePayload + 'static, 144 | StashValue: StashIndexable + From + 'static, 145 | F: for<'a> Fn( 146 | &RuleProductionArg<'a, PA::M>, 147 | &RuleProductionArg<'a, PB::M>, 148 | &RuleProductionArg<'a, PC::M>, 149 | ) -> RuleResult 150 | + 'static 151 | + Send 152 | + Sync, 153 | PA: TerminalPattern + 'static, 154 | PB: TerminalPattern + 'static, 155 | PC: TerminalPattern + 'static, 156 | { 157 | let sym = self.sym(sym); 158 | self.composition_rules 159 | .borrow_mut() 160 | .push(Box::new(Rule3::new(sym, (pa, pb, pc), production))) 161 | } 162 | 163 | pub fn rule_4( 164 | &self, 165 | sym: S, 166 | pa: PA, 167 | pb: PB, 168 | pc: PC, 169 | pd: PD, 170 | production: F, 171 | ) where 172 | S: Into + AsRef, 173 | V: NodePayload + 'static, 174 | StashValue: StashIndexable + From + 'static, 175 | F: for<'a> Fn( 176 | &RuleProductionArg<'a, PA::M>, 177 | &RuleProductionArg<'a, PB::M>, 178 | &RuleProductionArg<'a, PC::M>, 179 | &RuleProductionArg<'a, PD::M>, 180 | ) -> RuleResult 181 | + 'static 182 | + Send 183 | + Sync, 184 | PA: Pattern + 'static, 185 | PB: Pattern + 'static, 186 | PC: Pattern + 'static, 187 | PD: Pattern + 'static, 188 | { 189 | let sym = self.sym(sym); 190 | self.composition_rules 191 | .borrow_mut() 192 | .push(Box::new(Rule4::new(sym, (pa, pb, pc, pd), production))) 193 | } 194 | 195 | pub fn rule_4_terminal( 196 | &self, 197 | 
sym: S, 198 | pa: PA, 199 | pb: PB, 200 | pc: PC, 201 | pd: PD, 202 | production: F, 203 | ) where 204 | S: Into + AsRef, 205 | V: NodePayload + 'static, 206 | StashValue: StashIndexable + From + 'static, 207 | F: for<'a> Fn( 208 | &RuleProductionArg<'a, PA::M>, 209 | &RuleProductionArg<'a, PB::M>, 210 | &RuleProductionArg<'a, PC::M>, 211 | &RuleProductionArg<'a, PD::M>, 212 | ) -> RuleResult 213 | + 'static 214 | + Send 215 | + Sync, 216 | PA: TerminalPattern + 'static, 217 | PB: TerminalPattern + 'static, 218 | PC: TerminalPattern + 'static, 219 | PD: TerminalPattern + 'static, 220 | { 221 | let sym = self.sym(sym); 222 | self.composition_rules 223 | .borrow_mut() 224 | .push(Box::new(Rule4::new(sym, (pa, pb, pc, pd), production))) 225 | } 226 | 227 | pub fn rule_5( 228 | &self, 229 | sym: S, 230 | pa: PA, 231 | pb: PB, 232 | pc: PC, 233 | pd: PD, 234 | pe: PE, 235 | production: F, 236 | ) where 237 | S: Into + AsRef, 238 | V: NodePayload + 'static, 239 | StashValue: StashIndexable + From + 'static, 240 | F: for<'a> Fn( 241 | &RuleProductionArg<'a, PA::M>, 242 | &RuleProductionArg<'a, PB::M>, 243 | &RuleProductionArg<'a, PC::M>, 244 | &RuleProductionArg<'a, PD::M>, 245 | &RuleProductionArg<'a, PE::M>, 246 | ) -> RuleResult 247 | + 'static 248 | + Send 249 | + Sync, 250 | PA: Pattern + 'static, 251 | PB: Pattern + 'static, 252 | PC: Pattern + 'static, 253 | PD: Pattern + 'static, 254 | PE: Pattern + 'static, 255 | { 256 | let sym = self.sym(sym); 257 | self.composition_rules 258 | .borrow_mut() 259 | .push(Box::new(Rule5::new(sym, (pa, pb, pc, pd, pe), production))) 260 | } 261 | 262 | pub fn rule_5_terminal( 263 | &self, 264 | sym: S, 265 | pa: PA, 266 | pb: PB, 267 | pc: PC, 268 | pd: PD, 269 | pe: PE, 270 | production: F, 271 | ) where 272 | S: Into + AsRef, 273 | V: NodePayload + 'static, 274 | StashValue: StashIndexable + From + 'static, 275 | F: for<'a> Fn( 276 | &RuleProductionArg<'a, PA::M>, 277 | &RuleProductionArg<'a, PB::M>, 278 | &RuleProductionArg<'a, 
PC::M>, 279 | &RuleProductionArg<'a, PD::M>, 280 | &RuleProductionArg<'a, PE::M>, 281 | ) -> RuleResult 282 | + 'static 283 | + Send 284 | + Sync, 285 | PA: TerminalPattern + 'static, 286 | PB: TerminalPattern + 'static, 287 | PC: TerminalPattern + 'static, 288 | PD: TerminalPattern + 'static, 289 | PE: TerminalPattern + 'static, 290 | { 291 | let sym = self.sym(sym); 292 | self.composition_rules 293 | .borrow_mut() 294 | .push(Box::new(Rule5::new(sym, (pa, pb, pc, pd, pe), production))) 295 | } 296 | 297 | pub fn rule_6( 298 | &self, 299 | sym: S, 300 | pa: PA, 301 | pb: PB, 302 | pc: PC, 303 | pd: PD, 304 | pe: PE, 305 | pf: PF, 306 | production: F, 307 | ) where 308 | S: Into + AsRef, 309 | V: NodePayload + 'static, 310 | StashValue: StashIndexable + From + 'static, 311 | F: for<'a> Fn( 312 | &RuleProductionArg<'a, PA::M>, 313 | &RuleProductionArg<'a, PB::M>, 314 | &RuleProductionArg<'a, PC::M>, 315 | &RuleProductionArg<'a, PD::M>, 316 | &RuleProductionArg<'a, PE::M>, 317 | &RuleProductionArg<'a, PF::M>, 318 | ) -> RuleResult 319 | + 'static 320 | + Send 321 | + Sync, 322 | PA: Pattern + 'static, 323 | PB: Pattern + 'static, 324 | PC: Pattern + 'static, 325 | PD: Pattern + 'static, 326 | PE: Pattern + 'static, 327 | PF: Pattern + 'static, 328 | { 329 | let sym = self.sym(sym); 330 | self.composition_rules 331 | .borrow_mut() 332 | .push(Box::new(Rule6::new( 333 | sym, 334 | (pa, pb, pc, pd, pe, pf), 335 | production, 336 | ))) 337 | } 338 | 339 | pub fn rule_6_terminal( 340 | &self, 341 | sym: S, 342 | pa: PA, 343 | pb: PB, 344 | pc: PC, 345 | pd: PD, 346 | pe: PE, 347 | pf: PF, 348 | production: F, 349 | ) where 350 | S: Into + AsRef, 351 | V: NodePayload + 'static, 352 | StashValue: StashIndexable + From + 'static, 353 | F: for<'a> Fn( 354 | &RuleProductionArg<'a, PA::M>, 355 | &RuleProductionArg<'a, PB::M>, 356 | &RuleProductionArg<'a, PC::M>, 357 | &RuleProductionArg<'a, PD::M>, 358 | &RuleProductionArg<'a, PE::M>, 359 | &RuleProductionArg<'a, PF::M>, 360 | 
) -> RuleResult 361 | + 'static 362 | + Send 363 | + Sync, 364 | PA: TerminalPattern + 'static, 365 | PB: TerminalPattern + 'static, 366 | PC: TerminalPattern + 'static, 367 | PD: TerminalPattern + 'static, 368 | PE: TerminalPattern + 'static, 369 | PF: TerminalPattern + 'static, 370 | { 371 | let sym = self.sym(sym); 372 | self.composition_rules 373 | .borrow_mut() 374 | .push(Box::new(Rule6::new( 375 | sym, 376 | (pa, pb, pc, pd, pe, pf), 377 | production, 378 | ))) 379 | } 380 | 381 | pub fn reg(&self, regex: &str) -> CoreResult> { 382 | Ok(pattern::TextPattern::new( 383 | ::regex::Regex::new(regex)?, 384 | self.sym(regex), 385 | self.word_boundaries.clone(), 386 | )) 387 | } 388 | 389 | pub fn reg_neg_lh( 390 | &self, 391 | regex: &str, 392 | neg_lh: &str, 393 | ) -> CoreResult> { 394 | Ok(pattern::TextNegLHPattern::new( 395 | ::regex::Regex::new(regex)?, 396 | ::regex::Regex::new(neg_lh)?, 397 | self.sym(format!("{}(?:{})", regex, neg_lh)), 398 | self.word_boundaries.clone(), 399 | )) 400 | } 401 | 402 | pub fn build(self) -> RuleSet { 403 | RuleSet { 404 | symbols: self.symbols.into_inner(), 405 | terminal_rules: self.terminal_rules.into_inner(), 406 | composition_rules: self.composition_rules.into_inner(), 407 | match_boundaries: self.match_boundaries, 408 | } 409 | } 410 | } 411 | -------------------------------------------------------------------------------- /core/src/helpers.rs: -------------------------------------------------------------------------------- 1 | use crate::range::Range; 2 | 3 | #[derive(Copy, Clone, Debug, PartialEq)] 4 | enum BoundariesClass { 5 | AlphanumericWord { option: ValidBoundariesOption }, 6 | AlphabeticWord { option: ValidBoundariesOption }, 7 | Detailed { option: ValidBoundariesOption }, 8 | NoClass, 9 | } 10 | 11 | impl BoundariesClass { 12 | fn apply_left(&self, sentence: &str, range: &Range) -> bool { 13 | match self { 14 | &BoundariesClass::AlphanumericWord { option } => { 15 | left_valid_boundaries(sentence, range, 
&option, &alphanumeric_class) 16 | } 17 | &BoundariesClass::AlphabeticWord { option } => { 18 | left_valid_boundaries(sentence, range, &option, &alphabetic_class) 19 | } 20 | &BoundariesClass::Detailed { option } => { 21 | left_valid_boundaries(sentence, range, &option, &detailed_class) 22 | } 23 | &BoundariesClass::NoClass => true, 24 | } 25 | } 26 | fn apply_right(&self, sentence: &str, range: &Range) -> bool { 27 | match self { 28 | &BoundariesClass::AlphanumericWord { option } => { 29 | right_valid_boundaries(sentence, range, &option, &alphanumeric_class) 30 | } 31 | &BoundariesClass::AlphabeticWord { option } => { 32 | right_valid_boundaries(sentence, range, &option, &alphabetic_class) 33 | } 34 | &BoundariesClass::Detailed { option } => { 35 | right_valid_boundaries(sentence, range, &option, &detailed_class) 36 | } 37 | &BoundariesClass::NoClass => true, 38 | } 39 | } 40 | } 41 | 42 | #[derive(Clone, Debug, PartialEq)] 43 | pub struct BoundariesChecker(Vec); 44 | 45 | impl BoundariesChecker { 46 | pub fn check(&self, sentence: &str, range: Range) -> bool { 47 | self.0.iter().any(|c| c.apply_left(sentence, &range)) 48 | && self.0.iter().any(|c| c.apply_right(sentence, &range)) 49 | } 50 | 51 | pub fn separated_alphanumeric_word() -> BoundariesChecker { 52 | BoundariesChecker(vec![BoundariesClass::AlphanumericWord { 53 | option: ValidBoundariesOption::OnCharClassChange, 54 | }]) 55 | } 56 | 57 | pub fn detailed() -> BoundariesChecker { 58 | BoundariesChecker(vec![BoundariesClass::Detailed { 59 | option: ValidBoundariesOption::OnCharClassChange, 60 | }]) 61 | } 62 | 63 | pub fn composed_word_or_detailed() -> BoundariesChecker { 64 | BoundariesChecker(vec![ 65 | BoundariesClass::AlphabeticWord { 66 | option: ValidBoundariesOption::OnSameCharClass, 67 | }, 68 | BoundariesClass::Detailed { 69 | option: ValidBoundariesOption::OnCharClassChange, 70 | }, 71 | ]) 72 | } 73 | 74 | pub fn no_check() -> BoundariesChecker { 75 | 
BoundariesChecker(vec![BoundariesClass::NoClass]) 76 | } 77 | } 78 | 79 | #[derive(Copy, Clone, Debug, PartialEq)] 80 | enum ValidBoundariesOption { 81 | OnCharClassChange, 82 | OnSameCharClass, 83 | } 84 | 85 | fn alphabetic_class(c: char) -> char { 86 | if c.is_alphabetic() { 87 | 'A' 88 | } else { 89 | 'O' 90 | } 91 | } 92 | 93 | fn alphanumeric_class(c: char) -> char { 94 | if c.is_alphanumeric() { 95 | 'A' 96 | } else { 97 | c 98 | } 99 | } 100 | 101 | fn detailed_class(c: char) -> char { 102 | if c.is_uppercase() { 103 | 'u' 104 | } else if c.is_lowercase() { 105 | 'l' 106 | } else if c.is_digit(10) { 107 | 'd' 108 | } else { 109 | c 110 | } 111 | } 112 | 113 | fn right_valid_boundaries( 114 | sentence: &str, 115 | range: &Range, 116 | option: &ValidBoundariesOption, 117 | char_class: &CharClass, 118 | ) -> bool 119 | where 120 | CharClass: Fn(char) -> char, 121 | { 122 | let last_mine = sentence[range.0..range.1] 123 | .chars() 124 | .next_back() 125 | .map(char_class); //Some(c) 126 | let first_after = sentence[range.1..].chars().next().map(char_class); // Option(c) 127 | 128 | match option { 129 | &ValidBoundariesOption::OnCharClassChange => last_mine != first_after, 130 | &ValidBoundariesOption::OnSameCharClass => first_after == None || last_mine == first_after, 131 | } 132 | } 133 | 134 | fn left_valid_boundaries( 135 | sentence: &str, 136 | range: &Range, 137 | option: &ValidBoundariesOption, 138 | char_class: &CharClass, 139 | ) -> bool 140 | where 141 | CharClass: Fn(char) -> char, 142 | { 143 | let first_mine = sentence[range.0..range.1].chars().next().map(char_class); // Some(c) 144 | let last_before = sentence[..range.0].chars().next_back().map(char_class); // Option(c) 145 | 146 | match option { 147 | &ValidBoundariesOption::OnCharClassChange => first_mine != last_before, 148 | &ValidBoundariesOption::OnSameCharClass => first_mine == None || first_mine == last_before, 149 | } 150 | } 151 | 152 | #[cfg(test)] 153 | mod tests { 154 | use super::*; 
155 | 156 | #[test] 157 | fn test_valid_boundaries_alphanumeric() { 158 | let checker = BoundariesChecker::separated_alphanumeric_word(); 159 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 160 | assert_eq!(false, checker.check("abc def ret", Range(2, 8))); // "c def r" 161 | assert_eq!(false, checker.check("abc def123 ret", Range(4, 7))); // "def" 162 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 163 | assert_eq!(false, checker.check("def123 ret", Range(0, 3))); // "def" 164 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 165 | assert_eq!(false, checker.check("ret 123def", Range(7, 10))); // "def" 166 | assert_eq!(false, checker.check("aéc def ret", Range(3, 9))); // "c def r" 167 | assert_eq!(false, checker.check("aec def rét", Range(2, 8))); // "c def r" 168 | assert_eq!(false, checker.check("aec déf ret", Range(2, 9))); // "c déf r" 169 | assert_eq!(false, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 170 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 171 | } 172 | 173 | #[test] 174 | fn test_valid_boundaries_composed_word_or_detailed() { 175 | let checker = BoundariesChecker::composed_word_or_detailed(); 176 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 177 | assert_eq!(true, checker.check("abc def ret", Range(2, 8))); // "c def r" 178 | assert_eq!(true, checker.check("abc def123 ret", Range(4, 7))); // "def" 179 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 180 | assert_eq!(true, checker.check("def123 ret", Range(0, 3))); // "def" 181 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 182 | assert_eq!(true, checker.check("ret 123def", Range(7, 10))); // "def" 183 | assert_eq!(true, checker.check("aéc def ret", Range(3, 9))); // "c def r" 184 | assert_eq!(true, checker.check("aec def rét", Range(2, 8))); // "c def r" 185 | assert_eq!(true, checker.check("aec déf ret", Range(2, 9))); 
// "c déf r" 186 | assert_eq!(true, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 187 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 188 | } 189 | 190 | #[test] 191 | fn test_valid_boundaries_detailed() { 192 | let checker = BoundariesChecker::detailed(); 193 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 194 | assert_eq!(false, checker.check("abc def ret", Range(2, 8))); // "c def r" 195 | assert_eq!(true, checker.check("abc def123 ret", Range(4, 7))); // "def" 196 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 197 | assert_eq!(true, checker.check("def123 ret", Range(0, 3))); // "def" 198 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 199 | assert_eq!(true, checker.check("ret 123def", Range(7, 10))); // "def" 200 | assert_eq!(false, checker.check("aéc def ret", Range(3, 9))); // "c def r" 201 | assert_eq!(false, checker.check("aec def rét", Range(2, 8))); // "c def r" 202 | assert_eq!(false, checker.check("aec déf ret", Range(2, 9))); // "c déf r" 203 | assert_eq!(false, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 204 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 205 | } 206 | 207 | #[test] 208 | fn test_valid_boundaries_no_check() { 209 | let checker = BoundariesChecker::no_check(); 210 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 211 | assert_eq!(true, checker.check("abc def ret", Range(2, 8))); // "c def r" 212 | assert_eq!(true, checker.check("abc def123 ret", Range(4, 7))); // "def" 213 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 214 | assert_eq!(true, checker.check("def123 ret", Range(0, 3))); // "def" 215 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 216 | assert_eq!(true, checker.check("ret 123def", Range(7, 10))); // "def" 217 | assert_eq!(true, checker.check("aéc def ret", Range(3, 9))); // "c def r" 218 | assert_eq!(true, 
checker.check("aec def rét", Range(2, 8))); // "c def r" 219 | assert_eq!(true, checker.check("aec déf ret", Range(2, 9))); // "c déf r" 220 | assert_eq!(true, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 221 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /core/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate failure; 3 | pub extern crate regex; 4 | extern crate smallvec; 5 | extern crate string_interner; 6 | 7 | mod builder; 8 | mod helpers; 9 | pub mod pattern; 10 | mod range; 11 | pub mod rule; 12 | mod stash; 13 | 14 | pub use builder::RuleSetBuilder; 15 | pub use helpers::BoundariesChecker; 16 | use pattern::Pattern; 17 | use pattern::TerminalPattern; 18 | pub use range::Range; 19 | use rule::Rule; 20 | use rule::TerminalRule; 21 | pub use rule::{RuleError, RuleResult}; 22 | use serde::{Deserialize, Serialize}; 23 | use smallvec::SmallVec; 24 | use stash::Stash; 25 | pub use stash::{InnerStashIndexable, StashIndexable}; 26 | use std::collections::HashSet; 27 | use std::fmt::Debug; 28 | use std::{cell, rc}; 29 | use string_interner::StringInterner; 30 | 31 | pub type CoreResult = Result; 32 | 33 | pub trait AttemptFrom: Sized { 34 | fn attempt_from(v: V) -> Option; 35 | } 36 | 37 | pub trait AttemptInto: Sized { 38 | fn attempt_into(self) -> Option; 39 | } 40 | 41 | impl AttemptInto for S 42 | where 43 | S: Clone, 44 | T: AttemptFrom, 45 | { 46 | fn attempt_into(self) -> Option { 47 | T::attempt_from(self) 48 | } 49 | } 50 | 51 | pub trait NodePayload: Clone { 52 | type Payload: Clone + PartialEq + Debug; 53 | fn extract_payload(&self) -> Option; 54 | } 55 | 56 | pub type ChildrenNodes = SmallVec<[rc::Rc>; 2]>; 57 | 58 | #[derive(Copy, Ord, Eq, Clone, PartialEq, PartialOrd, Debug, Hash, Serialize, Deserialize)] 59 | pub struct Sym(usize); 60 | 
61 | impl string_interner::Symbol for Sym { 62 | fn from_usize(val: usize) -> Self { 63 | Sym(val) 64 | } 65 | 66 | fn to_usize(self) -> usize { 67 | self.0 68 | } 69 | } 70 | 71 | impl From for Sym { 72 | fn from(it: usize) -> Sym { 73 | Sym(it) 74 | } 75 | } 76 | impl From for usize { 77 | fn from(it: Sym) -> usize { 78 | it.0 79 | } 80 | } 81 | 82 | pub struct SymbolTable(StringInterner); 83 | 84 | impl Default for SymbolTable { 85 | fn default() -> SymbolTable { 86 | SymbolTable(string_interner::StringInterner::new()) 87 | } 88 | } 89 | 90 | impl SymbolTable { 91 | pub fn sym(&mut self, val: T) -> Sym 92 | where 93 | T: Into + AsRef, 94 | { 95 | self.0.get_or_intern(val) 96 | } 97 | } 98 | 99 | #[derive(Debug, PartialEq, Clone, Hash, Eq, Copy)] 100 | pub enum ParsingStatus { 101 | Continue, 102 | Exit, 103 | } 104 | 105 | impl ParsingStatus { 106 | pub fn is_exit(&self) -> bool { 107 | match self { 108 | &ParsingStatus::Exit => true, 109 | _ => false, 110 | } 111 | } 112 | 113 | pub fn is_continue(&self) -> bool { 114 | match self { 115 | &ParsingStatus::Continue => true, 116 | _ => false, 117 | } 118 | } 119 | } 120 | 121 | #[derive(Debug, PartialEq, Clone, Hash, Eq)] 122 | pub struct Node { 123 | pub rule_sym: Sym, 124 | pub byte_range: Range, 125 | pub payload: Option, 126 | pub children: ChildrenNodes, 127 | } 128 | 129 | impl Node { 130 | fn new( 131 | sym: Sym, 132 | byte_range: Range, 133 | payload: Option, 134 | children: ChildrenNodes, 135 | ) -> rc::Rc> { 136 | rc::Rc::new(Node { 137 | rule_sym: sym, 138 | byte_range, 139 | payload, 140 | children, 141 | }) 142 | } 143 | 144 | pub fn height(&self) -> usize { 145 | 1 + self.children.iter().map(|c| c.height()).max().unwrap_or(0) 146 | } 147 | 148 | pub fn num_nodes(&self) -> usize { 149 | let num_children: usize = self.children.iter().map(|c| c.num_nodes()).sum(); 150 | num_children + 1 151 | } 152 | 153 | pub fn all_syms(&self) -> HashSet<&Sym> { 154 | let mut hash_set = HashSet::new(); 155 | 
hash_set.insert(&self.rule_sym); 156 | for child in self.children.iter() { 157 | for sym in child.all_syms().into_iter() { 158 | hash_set.insert(sym); 159 | } 160 | } 161 | hash_set 162 | } 163 | } 164 | 165 | #[derive(Debug, PartialEq, Clone)] 166 | pub struct ParsedNode { 167 | pub root_node: rc::Rc>, 168 | pub value: V, 169 | } 170 | 171 | impl ParsedNode { 172 | fn new( 173 | sym: Sym, 174 | v: V, 175 | r: Range, 176 | payload: Option, 177 | children: ChildrenNodes, 178 | ) -> ParsedNode { 179 | ParsedNode { 180 | root_node: Node::new(sym, r, payload, children), 181 | value: v, 182 | } 183 | } 184 | } 185 | 186 | pub struct RuleSet { 187 | symbols: SymbolTable, 188 | composition_rules: Vec>>, 189 | terminal_rules: Vec>>, 190 | match_boundaries: BoundariesChecker, 191 | } 192 | 193 | impl RuleSet { 194 | fn apply_terminal_rules( 195 | &self, 196 | stash: &mut Stash, 197 | sentence: &str, 198 | ) -> CoreResult<()> { 199 | let mut produced_nodes = vec![]; 200 | for rule in &self.terminal_rules { 201 | produced_nodes.extend(rule.apply(stash, sentence)?.nodes); 202 | } 203 | stash.extend(produced_nodes); 204 | Ok(()) 205 | } 206 | 207 | fn apply_composition_rules( 208 | &self, 209 | stash: &mut Stash, 210 | sentence: &str, 211 | rules_mask_status: &mut Vec, 212 | ) -> CoreResult<()> { 213 | let mut produced_nodes = vec![]; 214 | for (idx, rule) in self.composition_rules.iter().enumerate() { 215 | if rules_mask_status[idx].is_continue() { 216 | let output = rule.apply(stash, sentence)?; 217 | rules_mask_status[idx] = output.status; 218 | produced_nodes.extend(output.nodes); 219 | } 220 | } 221 | stash.extend(produced_nodes); 222 | Ok(()) 223 | } 224 | 225 | pub fn apply_all(&self, sentence: &str) -> CoreResult>> { 226 | let iterations_max = 10; 227 | let max_stash_size = 600; 228 | let mut stash = Stash::default(); 229 | 230 | self.apply_terminal_rules(&mut stash, sentence)?; 231 | let mut previous_stash_size = stash.len(); 232 | 233 | let mut rules_mask_status = 
vec![ParsingStatus::Continue; self.composition_rules.len()]; 234 | 235 | for _ in 0..iterations_max { 236 | self.apply_composition_rules(&mut stash, sentence, &mut rules_mask_status)?; 237 | if stash.len() <= previous_stash_size || stash.len() > max_stash_size { 238 | break; 239 | } 240 | previous_stash_size = stash.len(); 241 | } 242 | Ok(stash 243 | .into_iter() 244 | .filter(|pn| { 245 | self.match_boundaries 246 | .check(sentence, pn.root_node.byte_range) 247 | }) 248 | .collect()) 249 | } 250 | 251 | pub fn resolve_sym(&self, sym: &Sym) -> Option<&str> { 252 | self.symbols.0.resolve(*sym) 253 | } 254 | 255 | pub fn all_syms(&self) -> Vec { 256 | self.symbols.0.iter().map(|s| s.0).collect() 257 | } 258 | 259 | pub fn rules_syms(&self) -> Vec { 260 | self.composition_rules 261 | .iter() 262 | .map(|r| r.rule_sym()) 263 | .chain(self.terminal_rules.iter().map(|r| r.rule_sym())) 264 | .collect() 265 | } 266 | } 267 | 268 | #[derive(Copy, Clone, Debug, PartialEq)] 269 | pub struct SendSyncPhantomData(::std::marker::PhantomData); 270 | unsafe impl Send for SendSyncPhantomData {} 271 | unsafe impl Sync for SendSyncPhantomData {} 272 | impl SendSyncPhantomData { 273 | pub fn new() -> SendSyncPhantomData { 274 | SendSyncPhantomData(::std::marker::PhantomData) 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /core/src/pattern.rs: -------------------------------------------------------------------------------- 1 | use crate::helpers::BoundariesChecker; 2 | use crate::range::Range; 3 | use crate::{ 4 | AttemptFrom, CoreResult, InnerStashIndexable, Node, NodePayload, ParsedNode, ParsingStatus, 5 | SendSyncPhantomData, Stash, StashIndexable, Sym, 6 | }; 7 | use smallvec::SmallVec; 8 | use std::rc; 9 | use std::slice::Iter; 10 | use std::vec::IntoIter; 11 | 12 | pub trait Match: Clone { 13 | type NV: Clone; 14 | fn byte_range(&self) -> Range; 15 | fn to_node(&self) -> rc::Rc>; 16 | } 17 | 18 | impl Match for ParsedNode { 
19 | type NV = V::Payload; 20 | fn byte_range(&self) -> Range { 21 | self.root_node.byte_range 22 | } 23 | 24 | fn to_node(&self) -> rc::Rc> { 25 | self.root_node.clone() 26 | } 27 | } 28 | 29 | #[derive(Clone, Debug, PartialEq)] 30 | pub struct Text { 31 | pub groups: SmallVec<[Range; 4]>, 32 | pub byte_range: Range, 33 | pattern_sym: Sym, 34 | _phantom: SendSyncPhantomData, 35 | } 36 | 37 | impl Text { 38 | pub fn new(groups: SmallVec<[Range; 4]>, byte_range: Range, pattern_sym: Sym) -> Text { 39 | Text { 40 | groups, 41 | byte_range, 42 | pattern_sym, 43 | _phantom: SendSyncPhantomData::new(), 44 | } 45 | } 46 | } 47 | 48 | impl Match for Text { 49 | type NV = V::Payload; 50 | fn byte_range(&self) -> Range { 51 | self.byte_range 52 | } 53 | 54 | fn to_node(&self) -> rc::Rc> { 55 | rc::Rc::new(Node { 56 | rule_sym: self.pattern_sym, 57 | byte_range: self.byte_range(), 58 | payload: None, 59 | children: SmallVec::new(), 60 | }) 61 | } 62 | } 63 | 64 | pub struct PredicateMatches { 65 | pub matches: Vec, 66 | pub status: ParsingStatus, 67 | } 68 | 69 | impl PredicateMatches { 70 | pub fn with_status(status: ParsingStatus) -> PredicateMatches { 71 | PredicateMatches { 72 | matches: vec![], 73 | status, 74 | } 75 | } 76 | 77 | pub fn continue_with(matches: Vec) -> PredicateMatches { 78 | PredicateMatches { 79 | matches, 80 | status: ParsingStatus::Continue, 81 | } 82 | } 83 | 84 | pub fn exit_if_empty(self) -> PredicateMatches { 85 | if self.matches.len() == 0 { 86 | PredicateMatches::with_status(ParsingStatus::Exit) 87 | } else { 88 | self 89 | } 90 | } 91 | 92 | pub fn push(&mut self, match_: M) { 93 | self.matches.push(match_) 94 | } 95 | 96 | pub fn is_empty(&self) -> bool { 97 | self.matches.is_empty() 98 | } 99 | 100 | pub fn len(&self) -> usize { 101 | self.matches.len() 102 | } 103 | 104 | pub fn iter(&self) -> Iter { 105 | self.matches.iter() 106 | } 107 | 108 | pub fn into_iter(self) -> IntoIter { 109 | self.matches.into_iter() 110 | } 111 | } 112 | 113 | 
pub trait Pattern: Send + Sync { 114 | type M: Match; 115 | fn predicate( 116 | &self, 117 | stash: &Stash, 118 | sentence: &str, 119 | ) -> CoreResult>; 120 | } 121 | 122 | pub trait TerminalPattern: 123 | Pattern> 124 | { 125 | } 126 | 127 | pub struct TextPattern { 128 | pattern: ::regex::Regex, 129 | pattern_sym: Sym, 130 | boundaries_checker: BoundariesChecker, 131 | _phantom: SendSyncPhantomData, 132 | } 133 | 134 | impl TextPattern { 135 | pub fn new( 136 | regex: ::regex::Regex, 137 | sym: Sym, 138 | boundaries_checker: BoundariesChecker, 139 | ) -> TextPattern { 140 | TextPattern { 141 | pattern: regex, 142 | pattern_sym: sym, 143 | boundaries_checker, 144 | _phantom: SendSyncPhantomData::new(), 145 | } 146 | } 147 | } 148 | 149 | impl Pattern for TextPattern { 150 | type M = Text; 151 | fn predicate( 152 | &self, 153 | _stash: &Stash, 154 | sentence: &str, 155 | ) -> CoreResult> { 156 | let mut results = PredicateMatches::with_status(ParsingStatus::Continue); 157 | for cap in self.pattern.captures_iter(&sentence) { 158 | let full = cap.get(0).ok_or_else(|| { 159 | format_err!( 160 | "No capture for regexp {} in rule {:?} for sentence: {}", 161 | self.pattern, 162 | self.pattern_sym, 163 | sentence 164 | ) 165 | })?; 166 | let full_range = Range(full.start(), full.end()); 167 | if !self.boundaries_checker.check(sentence, full_range) { 168 | continue; 169 | } 170 | let mut groups = SmallVec::new(); 171 | for (ix, group) in cap.iter().enumerate() { 172 | let group = group.ok_or_else(|| { 173 | format_err!( 174 | "No capture for regexp {} in rule {:?}, group number {} in \ 175 | capture: {}", 176 | self.pattern, 177 | self.pattern_sym, 178 | ix, 179 | full.as_str() 180 | ) 181 | })?; 182 | let range = Range(group.start(), group.end()); 183 | groups.push(range); 184 | } 185 | results.push(Text { 186 | groups, 187 | byte_range: full_range, 188 | pattern_sym: self.pattern_sym, 189 | _phantom: SendSyncPhantomData::new(), 190 | }) 191 | } 192 | 193 | 
Ok(results.exit_if_empty()) 194 | } 195 | } 196 | 197 | impl TerminalPattern 198 | for TextPattern 199 | { 200 | } 201 | 202 | pub struct TextNegLHPattern { 203 | pattern: ::regex::Regex, 204 | neg_look_ahead: ::regex::Regex, 205 | boundaries_checker: BoundariesChecker, 206 | pattern_sym: Sym, 207 | _phantom: SendSyncPhantomData, 208 | } 209 | 210 | impl TextNegLHPattern { 211 | pub fn new( 212 | pattern: ::regex::Regex, 213 | neg_look_ahead: ::regex::Regex, 214 | pattern_sym: Sym, 215 | boundaries_checker: BoundariesChecker, 216 | ) -> TextNegLHPattern { 217 | TextNegLHPattern { 218 | pattern, 219 | neg_look_ahead, 220 | pattern_sym, 221 | boundaries_checker, 222 | _phantom: SendSyncPhantomData::new(), 223 | } 224 | } 225 | } 226 | 227 | impl Pattern 228 | for TextNegLHPattern 229 | { 230 | type M = Text; 231 | fn predicate( 232 | &self, 233 | _stash: &Stash, 234 | sentence: &str, 235 | ) -> CoreResult>> { 236 | let mut results = PredicateMatches::with_status(ParsingStatus::Continue); 237 | for cap in self.pattern.captures_iter(&sentence) { 238 | let full = cap.get(0).ok_or_else(|| { 239 | format_err!( 240 | "No capture for regexp {} in rule {:?} for sentence: {}", 241 | self.pattern, 242 | self.pattern_sym, 243 | sentence 244 | ) 245 | })?; 246 | let full_range = Range(full.start(), full.end()); 247 | if !self.boundaries_checker.check(sentence, full_range) { 248 | continue; 249 | } 250 | if let Some(mat) = self.neg_look_ahead.find(&sentence[full.end()..]) { 251 | if mat.start() == 0 { 252 | continue; 253 | } 254 | } 255 | let mut groups = SmallVec::new(); 256 | for (ix, group) in cap.iter().enumerate() { 257 | let group = group.ok_or_else(|| { 258 | format_err!( 259 | "No capture for regexp {} in rule {:?}, group number {} in \ 260 | capture: {}", 261 | self.pattern, 262 | self.pattern_sym, 263 | ix, 264 | full.as_str() 265 | ) 266 | })?; 267 | let range = Range(group.start(), group.end()); 268 | groups.push(range); 269 | } 270 | results.push(Text { 271 | groups, 
272 | byte_range: full_range, 273 | pattern_sym: self.pattern_sym, 274 | _phantom: SendSyncPhantomData::new(), 275 | }) 276 | } 277 | 278 | Ok(results.exit_if_empty()) 279 | } 280 | } 281 | 282 | impl TerminalPattern 283 | for TextNegLHPattern 284 | { 285 | } 286 | 287 | pub type AnyNodePattern = FilterNodePattern; 288 | 289 | pub struct FilterNodePattern 290 | where 291 | V: NodePayload + InnerStashIndexable, 292 | { 293 | predicates: Vec bool + Send + Sync>>, 294 | _phantom: SendSyncPhantomData, 295 | } 296 | 297 | impl AnyNodePattern { 298 | pub fn new() -> AnyNodePattern { 299 | FilterNodePattern { 300 | predicates: vec![], 301 | _phantom: SendSyncPhantomData::new(), 302 | } 303 | } 304 | } 305 | 306 | impl FilterNodePattern 307 | where 308 | V: NodePayload + InnerStashIndexable, 309 | { 310 | pub fn filter(predicates: Vec bool + Sync + Send>>) -> FilterNodePattern { 311 | FilterNodePattern { 312 | predicates, 313 | _phantom: SendSyncPhantomData::new(), 314 | } 315 | } 316 | } 317 | 318 | impl Pattern for FilterNodePattern 319 | where 320 | StashValue: NodePayload + StashIndexable, 321 | V: NodePayload 322 | + InnerStashIndexable 323 | + AttemptFrom, 324 | { 325 | type M = ParsedNode; 326 | fn predicate( 327 | &self, 328 | stash: &Stash, 329 | _sentence: &str, 330 | ) -> CoreResult>> { 331 | Ok(PredicateMatches::continue_with(stash.filter(|v| { 332 | self.predicates.iter().all(|predicate| (predicate)(&v)) 333 | }))) 334 | } 335 | } 336 | 337 | #[cfg(test)] 338 | mod tests { 339 | use super::*; 340 | 341 | macro_rules! 
svec4 { 342 | ($($item:expr),*) => { { 343 | let mut v = ::smallvec::SmallVec::<[_;4]>::new(); 344 | $( v.push($item); )* 345 | v 346 | } 347 | } 348 | } 349 | 350 | #[test] 351 | fn test_regex_separated_string() { 352 | let stash = Stash::default(); 353 | let checker = BoundariesChecker::detailed(); 354 | let pat: TextPattern = 355 | TextPattern::new(::regex::Regex::new("a+").unwrap(), Sym(0), checker); 356 | assert_eq!( 357 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 358 | pat.predicate(&stash, "aaa").unwrap().matches 359 | ); 360 | assert_eq!( 361 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 362 | pat.predicate(&stash, "aaa bbb").unwrap().matches 363 | ); 364 | assert_eq!( 365 | vec![Text::new(svec4!(Range(4, 7)), Range(4, 7), Sym(0))], 366 | pat.predicate(&stash, "bbb aaa").unwrap().matches 367 | ); 368 | assert_eq!( 369 | Vec::>::new(), 370 | pat.predicate(&stash, "baaa").unwrap().matches 371 | ); 372 | assert_eq!( 373 | Vec::>::new(), 374 | pat.predicate(&stash, "aaab").unwrap().matches 375 | ); 376 | assert_eq!( 377 | Vec::>::new(), 378 | pat.predicate(&stash, "aaaé").unwrap().matches 379 | ); 380 | assert_eq!( 381 | Vec::>::new(), 382 | pat.predicate(&stash, "éaaa").unwrap().matches 383 | ); 384 | assert_eq!( 385 | vec![Text::new(svec4!(Range(1, 4)), Range(1, 4), Sym(0))], 386 | pat.predicate(&stash, "1aaa").unwrap().matches 387 | ); 388 | assert_eq!( 389 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 390 | pat.predicate(&stash, "aaa1").unwrap().matches 391 | ); 392 | assert_eq!( 393 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 394 | pat.predicate(&stash, "aaa-toto").unwrap().matches 395 | ); 396 | } 397 | } 398 | -------------------------------------------------------------------------------- /core/src/range.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::{Ordering, PartialOrd}; 2 | 3 | /// Represent a semi-inclusive range of position, in 
/// bytes, in the matched sentence.
///
/// `Range(start, end)` denotes the half-open byte span `[start, end)` of a
/// match inside the input sentence.
#[derive(PartialEq, Clone, Debug, Copy, Hash, Eq)]
pub struct Range(pub usize, pub usize);

impl Range {
    /// True when the two ranges strictly overlap without either containing
    /// the other (containment/equality are ordered by `partial_cmp` and are
    /// therefore not "intersections" here).
    ///
    /// Ranges are half-open, so merely touching ranges such as `[0, 3)` and
    /// `[3, 5)` do NOT intersect. This uses strict `>` where the original
    /// used `>=`: with `>=`, touching ranges were simultaneously reported as
    /// intersecting here and as disjoint by `is_disjoint`, a contradiction
    /// for semi-inclusive ranges.
    pub fn intersects(&self, other: &Self) -> bool {
        // `partial_cmp` is `None` exactly when neither range contains the
        // other and they are not equal; strict `>` excludes mere adjacency.
        self.partial_cmp(other).is_none() && (self.1 > other.0 && other.1 > self.0)
    }

    /// Converts this byte range into the equivalent character range of `string`.
    pub fn char_range(&self, string: &str) -> Range {
        Range(
            convert_char_index(string, self.0),
            convert_char_index(string, self.1),
        )
    }

    /// Converts this character range into the equivalent byte range of `string`.
    pub fn byte_range(&self, string: &str) -> Range {
        Range(
            convert_byte_index(string, self.0),
            convert_byte_index(string, self.1),
        )
    }

    /// Length of the range in bytes (`end - start`).
    pub fn len(&self) -> usize {
        self.1 - self.0
    }

    /// True when the ranges share no position; touching ranges are disjoint.
    pub fn is_disjoint(&self, other: &Self) -> bool {
        self.0 >= other.1 || other.0 >= self.1
    }
}

impl PartialOrd for Range {
    /// Partial order by inclusion: a range is `Greater` than any range it
    /// contains, `Less` than any range containing it, `Equal` to itself, and
    /// unordered (`None`) in every other case.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        if self == other {
            Some(Ordering::Equal)
        } else if self.0 <= other.0 && other.1 <= self.1 {
            Some(Ordering::Greater)
        } else if other.0 <= self.0 && self.1 <= other.1 {
            Some(Ordering::Less)
        } else {
            None
        }
    }
}

/// Converts a byte index of `string` into the matching character index.
/// An index past the end of `string` saturates to `string.chars().count()`;
/// an index falling inside a multi-byte character maps to the next character.
pub fn convert_char_index(string: &str, byte_index: usize) -> usize {
    if string.is_empty() {
        return 0;
    }
    let mut acc = 0;
    let mut last_char_index = 0;
    for (char_index, char) in string.chars().enumerate() {
        if byte_index <= acc {
            return char_index;
        }
        acc += char.len_utf8();
        last_char_index = char_index;
    }
    last_char_index + 1
}

/// Converts a character index of `string` into the matching byte index.
/// An index past the end of `string` saturates to `string.len()`.
pub fn convert_byte_index(string: &str, char_index: usize) -> usize {
    let mut result = 0;
    for (current_char_index, char) in string.chars().enumerate() {
        if current_char_index == char_index {
            return result;
        }
        result += char.len_utf8()
    }
    result
}
// --------------------------------------------------------------------------------
// /core/src/rule.rs:
-------------------------------------------------------------------------------- 1 | use crate::pattern::*; 2 | use crate::stash::Stash; 3 | use crate::{ 4 | CoreResult, NodePayload, ParsedNode, ParsingStatus, Range, SendSyncPhantomData, StashIndexable, 5 | Sym, 6 | }; 7 | use smallvec::SmallVec; 8 | 9 | #[derive(Debug, Fail)] 10 | pub enum RuleError { 11 | #[fail(display = "invalid rule")] 12 | Invalid, 13 | } 14 | 15 | pub type RuleResult = Result; 16 | 17 | macro_rules! svec { 18 | ($($item:expr),*) => { { 19 | let mut v =SmallVec::new(); 20 | $( v.push($item); )* 21 | v 22 | } 23 | } 24 | } 25 | 26 | #[derive(Debug)] 27 | pub struct RuleProductionArg<'a, M: Match + 'a> { 28 | sentence: &'a str, 29 | match_: &'a M, 30 | } 31 | 32 | impl<'a, M: Match> RuleProductionArg<'a, M> { 33 | pub fn new(sentence: &'a str, match_: &'a M) -> RuleProductionArg<'a, M> { 34 | RuleProductionArg { sentence, match_ } 35 | } 36 | } 37 | 38 | impl<'a, V: NodePayload> RuleProductionArg<'a, Text> { 39 | pub fn group(&self, ix: usize) -> &'a str { 40 | let g = self.match_.groups[ix]; 41 | &self.sentence[g.0..g.1] 42 | } 43 | 44 | pub fn num_groups(&self) -> usize { 45 | self.match_.groups.len() 46 | } 47 | } 48 | 49 | impl<'a, V: NodePayload> RuleProductionArg<'a, ParsedNode> { 50 | pub fn value(&self) -> &V { 51 | &self.match_.value 52 | } 53 | } 54 | 55 | fn adjacent(a: &A, b: &B, sentence: &str) -> bool { 56 | a.byte_range().1 <= b.byte_range().0 57 | && sentence[a.byte_range().1..b.byte_range().0] 58 | .chars() 59 | .all(|c| c.is_whitespace()) 60 | } 61 | 62 | #[derive(Debug, Clone)] 63 | pub struct RuleOutput { 64 | pub nodes: ParsedNodes, 65 | pub status: ParsingStatus, 66 | } 67 | 68 | impl RuleOutput { 69 | fn exit() -> RuleOutput { 70 | RuleOutput { 71 | nodes: ParsedNodes::new(), 72 | status: ParsingStatus::Exit, 73 | } 74 | } 75 | 76 | fn continue_with(nodes: ParsedNodes) -> RuleOutput { 77 | RuleOutput { 78 | nodes, 79 | status: ParsingStatus::Continue, 80 | } 81 | } 82 | } 
83 | 84 | type ParsedNodes = SmallVec<[ParsedNode; 1]>; 85 | 86 | pub trait Rule: Send + Sync { 87 | fn rule_sym(&self) -> Sym; 88 | fn apply( 89 | &self, 90 | stash: &Stash, 91 | sentence: &str, 92 | ) -> CoreResult>; 93 | } 94 | 95 | pub trait TerminalRule: Rule {} 96 | 97 | pub struct Rule1 98 | where 99 | V: NodePayload, 100 | StashValue: NodePayload + StashIndexable + From, 101 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult, 102 | PA: Pattern, 103 | { 104 | sym: Sym, 105 | pattern: PA, 106 | production: F, 107 | _phantom: SendSyncPhantomData<(V, StashValue)>, 108 | } 109 | 110 | impl Rule for Rule1 111 | where 112 | V: NodePayload, 113 | StashValue: NodePayload + StashIndexable + From, 114 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + Send + Sync, 115 | PA: Pattern, 116 | { 117 | fn rule_sym(&self) -> Sym { 118 | self.sym 119 | } 120 | 121 | fn apply( 122 | &self, 123 | stash: &Stash, 124 | sentence: &str, 125 | ) -> CoreResult> { 126 | let matches = self.matches(&stash, sentence)?; 127 | 128 | if matches.status.is_exit() { 129 | return Ok(RuleOutput::exit()); 130 | } 131 | 132 | let nodes: CoreResult<_> = matches 133 | .iter() 134 | .filter_map(|sub| { 135 | let nodes = svec![sub.to_node()]; 136 | if stash.iter().all(|old_node| { 137 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 138 | }) { 139 | match (self.production)(&RuleProductionArg::new(sentence, sub)) { 140 | Ok(v) => { 141 | let payload = v.extract_payload(); 142 | Some(Ok(ParsedNode::new( 143 | self.sym, 144 | v.into(), 145 | sub.byte_range(), 146 | payload, 147 | nodes, 148 | ))) 149 | } 150 | Err(e) => match e.downcast::() { 151 | Ok(RuleError::Invalid) => None, 152 | Err(e) => Some(Err(e)), 153 | }, 154 | } 155 | } else { 156 | None 157 | } 158 | }) 159 | .collect(); 160 | Ok(RuleOutput::continue_with(nodes?)) 161 | } 162 | } 163 | 164 | impl TerminalRule for Rule1 165 | where 166 | V: NodePayload, 167 | StashValue: NodePayload + 
StashIndexable + From, 168 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + Send + Sync, 169 | PA: TerminalPattern, 170 | { 171 | } 172 | 173 | impl Rule1 174 | where 175 | V: NodePayload, 176 | StashValue: NodePayload + StashIndexable + From, 177 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult, 178 | PA: Pattern, 179 | { 180 | pub fn new(sym: Sym, pat: PA, prod: F) -> Rule1 { 181 | Rule1 { 182 | sym, 183 | pattern: pat, 184 | production: prod, 185 | _phantom: SendSyncPhantomData::new(), 186 | } 187 | } 188 | 189 | fn matches( 190 | &self, 191 | stash: &Stash, 192 | sentence: &str, 193 | ) -> CoreResult> { 194 | self.pattern.predicate(stash, sentence) 195 | } 196 | } 197 | 198 | pub struct Rule2 199 | where 200 | V: NodePayload, 201 | StashValue: NodePayload + StashIndexable + From, 202 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult, 203 | PA: Pattern, 204 | PB: Pattern, 205 | { 206 | sym: Sym, 207 | pattern: (PA, PB), 208 | production: F, 209 | _phantom: SendSyncPhantomData<(V, StashValue)>, 210 | } 211 | 212 | impl Rule for Rule2 213 | where 214 | V: NodePayload, 215 | StashValue: NodePayload + StashIndexable + From, 216 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult 217 | + Send 218 | + Sync, 219 | PA: Pattern, 220 | PB: Pattern, 221 | { 222 | fn rule_sym(&self) -> Sym { 223 | self.sym 224 | } 225 | 226 | fn apply( 227 | &self, 228 | stash: &Stash, 229 | sentence: &str, 230 | ) -> CoreResult> { 231 | let matches = self.matches(&stash, sentence)?; 232 | 233 | if matches.status.is_exit() { 234 | return Ok(RuleOutput::exit()); 235 | } 236 | 237 | let nodes: CoreResult<_> = matches 238 | .iter() 239 | .filter_map(|sub| { 240 | let nodes = svec![sub.0.to_node(), sub.1.to_node()]; 241 | if stash.iter().all(|old_node| { 242 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 243 | }) { 244 | let byte_range = 
Range(sub.0.byte_range().0, sub.1.byte_range().1); 245 | match (self.production)( 246 | &RuleProductionArg::new(sentence, &sub.0), 247 | &RuleProductionArg::new(sentence, &sub.1), 248 | ) { 249 | Ok(v) => { 250 | let payload = v.extract_payload(); 251 | Some(Ok(ParsedNode::new( 252 | self.sym, 253 | v.into(), 254 | byte_range, 255 | payload, 256 | nodes, 257 | ))) 258 | } 259 | Err(e) => match e.downcast::() { 260 | Ok(RuleError::Invalid) => None, 261 | Err(e) => Some(Err(e)), 262 | }, 263 | } 264 | } else { 265 | None 266 | } 267 | }) 268 | .collect(); 269 | Ok(RuleOutput::continue_with(nodes?)) 270 | } 271 | } 272 | 273 | impl TerminalRule for Rule2 274 | where 275 | V: NodePayload, 276 | StashValue: NodePayload + StashIndexable + From, 277 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult 278 | + Send 279 | + Sync, 280 | PA: TerminalPattern, 281 | PB: TerminalPattern, 282 | { 283 | } 284 | 285 | impl Rule2 286 | where 287 | V: NodePayload, 288 | StashValue: NodePayload + StashIndexable + From, 289 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult 290 | + Send 291 | + Sync, 292 | PA: Pattern, 293 | PB: Pattern, 294 | { 295 | pub fn new(sym: Sym, pat: (PA, PB), prod: F) -> Rule2 { 296 | Rule2 { 297 | sym, 298 | pattern: pat, 299 | production: prod, 300 | _phantom: SendSyncPhantomData::new(), 301 | } 302 | } 303 | 304 | fn matches( 305 | &self, 306 | stash: &Stash, 307 | sentence: &str, 308 | ) -> CoreResult> { 309 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 310 | if matches_0.is_empty() { 311 | return Ok(PredicateMatches::with_status(matches_0.status)); 312 | } 313 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 314 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 315 | for m0 in matches_0.iter() { 316 | for m1 in matches_1.iter() { 317 | if adjacent(m0, m1, sentence) { 318 | result.push((m0.clone(), m1.clone())) 319 | } 320 
| } 321 | } 322 | Ok(result) 323 | } 324 | } 325 | 326 | pub struct Rule3 327 | where 328 | V: NodePayload, 329 | StashValue: NodePayload + StashIndexable + From, 330 | F: for<'a> Fn( 331 | &RuleProductionArg<'a, PA::M>, 332 | &RuleProductionArg<'a, PB::M>, 333 | &RuleProductionArg<'a, PC::M>, 334 | ) -> RuleResult 335 | + Send 336 | + Sync, 337 | PA: Pattern, 338 | PB: Pattern, 339 | PC: Pattern, 340 | { 341 | sym: Sym, 342 | pattern: (PA, PB, PC), 343 | production: F, 344 | _phantom: SendSyncPhantomData<(V, StashValue)>, 345 | } 346 | 347 | impl Rule for Rule3 348 | where 349 | V: NodePayload, 350 | StashValue: NodePayload + StashIndexable + From, 351 | F: for<'a> Fn( 352 | &RuleProductionArg<'a, PA::M>, 353 | &RuleProductionArg<'a, PB::M>, 354 | &RuleProductionArg<'a, PC::M>, 355 | ) -> RuleResult 356 | + Send 357 | + Sync, 358 | PA: Pattern, 359 | PB: Pattern, 360 | PC: Pattern, 361 | { 362 | fn rule_sym(&self) -> Sym { 363 | self.sym 364 | } 365 | 366 | fn apply( 367 | &self, 368 | stash: &Stash, 369 | sentence: &str, 370 | ) -> CoreResult> { 371 | let matches = self.matches(&stash, sentence)?; 372 | 373 | if matches.status.is_exit() { 374 | return Ok(RuleOutput::exit()); 375 | } 376 | 377 | let nodes: CoreResult<_> = matches 378 | .iter() 379 | .filter_map(|sub| { 380 | let nodes = svec!(sub.0.to_node(), sub.1.to_node(), sub.2.to_node()); 381 | if stash.iter().all(|old_node| { 382 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 383 | }) { 384 | let byte_range = Range(sub.0.byte_range().0, sub.2.byte_range().1); 385 | match (self.production)( 386 | &RuleProductionArg::new(sentence, &sub.0), 387 | &RuleProductionArg::new(sentence, &sub.1), 388 | &RuleProductionArg::new(sentence, &sub.2), 389 | ) { 390 | Ok(v) => { 391 | let payload = v.extract_payload(); 392 | Some(Ok(ParsedNode::new( 393 | self.sym, 394 | v.clone().into(), 395 | byte_range, 396 | payload, 397 | nodes, 398 | ))) 399 | } 400 | Err(e) => match e.downcast::() { 
401 | Ok(RuleError::Invalid) => None, 402 | Err(e) => Some(Err(e)), 403 | }, 404 | } 405 | } else { 406 | None 407 | } 408 | }) 409 | .collect(); 410 | Ok(RuleOutput::continue_with(nodes?)) 411 | } 412 | } 413 | 414 | impl Rule3 415 | where 416 | V: NodePayload, 417 | StashValue: NodePayload + StashIndexable + From, 418 | F: for<'a> Fn( 419 | &RuleProductionArg<'a, PA::M>, 420 | &RuleProductionArg<'a, PB::M>, 421 | &RuleProductionArg<'a, PC::M>, 422 | ) -> RuleResult 423 | + Send 424 | + Sync, 425 | PA: Pattern, 426 | PB: Pattern, 427 | PC: Pattern, 428 | { 429 | pub fn new(sym: Sym, pat: (PA, PB, PC), prod: F) -> Rule3 { 430 | Rule3 { 431 | sym, 432 | pattern: pat, 433 | production: prod, 434 | _phantom: SendSyncPhantomData::new(), 435 | } 436 | } 437 | 438 | fn matches( 439 | &self, 440 | stash: &Stash, 441 | sentence: &str, 442 | ) -> CoreResult> { 443 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 444 | if matches_0.is_empty() { 445 | return Ok(PredicateMatches::with_status(matches_0.status)); 446 | } 447 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 448 | if matches_1.is_empty() { 449 | return Ok(PredicateMatches::with_status(matches_1.status)); 450 | } 451 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 452 | if matches_2.is_empty() { 453 | return Ok(PredicateMatches::with_status(matches_2.status)); 454 | } 455 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 456 | for m0 in matches_0.iter() { 457 | for m1 in matches_1.iter() { 458 | if adjacent(m0, m1, sentence) { 459 | for m2 in matches_2.iter() { 460 | if adjacent(m1, m2, sentence) { 461 | result.push((m0.clone(), m1.clone(), m2.clone())) 462 | } 463 | } 464 | } 465 | } 466 | } 467 | Ok(result) 468 | } 469 | } 470 | 471 | pub struct Rule4 472 | where 473 | V: NodePayload, 474 | StashValue: NodePayload + StashIndexable + From, 475 | F: for<'a> Fn( 476 | &RuleProductionArg<'a, PA::M>, 477 | &RuleProductionArg<'a, PB::M>, 478 | 
&RuleProductionArg<'a, PC::M>, 479 | &RuleProductionArg<'a, PD::M>, 480 | ) -> RuleResult 481 | + Send 482 | + Sync, 483 | PA: Pattern, 484 | PB: Pattern, 485 | PC: Pattern, 486 | PD: Pattern, 487 | { 488 | sym: Sym, 489 | pattern: (PA, PB, PC, PD), 490 | production: F, 491 | _phantom: SendSyncPhantomData<(V, StashValue)>, 492 | } 493 | 494 | impl Rule for Rule4 495 | where 496 | V: NodePayload, 497 | StashValue: NodePayload + StashIndexable + From, 498 | F: for<'a> Fn( 499 | &RuleProductionArg<'a, PA::M>, 500 | &RuleProductionArg<'a, PB::M>, 501 | &RuleProductionArg<'a, PC::M>, 502 | &RuleProductionArg<'a, PD::M>, 503 | ) -> RuleResult 504 | + Send 505 | + Sync, 506 | PA: Pattern, 507 | PB: Pattern, 508 | PC: Pattern, 509 | PD: Pattern, 510 | { 511 | fn rule_sym(&self) -> Sym { 512 | self.sym 513 | } 514 | 515 | fn apply( 516 | &self, 517 | stash: &Stash, 518 | sentence: &str, 519 | ) -> CoreResult> { 520 | let matches = self.matches(&stash, sentence)?; 521 | 522 | if matches.status.is_exit() { 523 | return Ok(RuleOutput::exit()); 524 | } 525 | 526 | let nodes: CoreResult<_> = matches 527 | .iter() 528 | .filter_map(|sub| { 529 | let nodes = svec!( 530 | sub.0.to_node(), 531 | sub.1.to_node(), 532 | sub.2.to_node(), 533 | sub.3.to_node() 534 | ); 535 | if stash.iter().all(|old_node| { 536 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 537 | }) { 538 | let byte_range = Range(sub.0.byte_range().0, sub.3.byte_range().1); 539 | match (self.production)( 540 | &RuleProductionArg::new(sentence, &sub.0), 541 | &RuleProductionArg::new(sentence, &sub.1), 542 | &RuleProductionArg::new(sentence, &sub.2), 543 | &RuleProductionArg::new(sentence, &sub.3), 544 | ) { 545 | Ok(v) => { 546 | let payload = v.extract_payload(); 547 | Some(Ok(ParsedNode::new( 548 | self.sym, 549 | v.clone().into(), 550 | byte_range, 551 | payload, 552 | nodes, 553 | ))) 554 | } 555 | Err(e) => match e.downcast::() { 556 | Ok(RuleError::Invalid) => None, 557 | Err(e) 
=> Some(Err(e)), 558 | }, 559 | } 560 | } else { 561 | None 562 | } 563 | }) 564 | .collect(); 565 | 566 | Ok(RuleOutput::continue_with(nodes?)) 567 | } 568 | } 569 | 570 | impl Rule4 571 | where 572 | V: NodePayload, 573 | StashValue: NodePayload + StashIndexable + From, 574 | F: for<'a> Fn( 575 | &RuleProductionArg<'a, PA::M>, 576 | &RuleProductionArg<'a, PB::M>, 577 | &RuleProductionArg<'a, PC::M>, 578 | &RuleProductionArg<'a, PD::M>, 579 | ) -> RuleResult 580 | + Send 581 | + Sync, 582 | PA: Pattern, 583 | PB: Pattern, 584 | PC: Pattern, 585 | PD: Pattern, 586 | { 587 | pub fn new( 588 | sym: Sym, 589 | pat: (PA, PB, PC, PD), 590 | prod: F, 591 | ) -> Rule4 { 592 | Rule4 { 593 | sym, 594 | pattern: pat, 595 | production: prod, 596 | _phantom: SendSyncPhantomData::new(), 597 | } 598 | } 599 | 600 | fn matches( 601 | &self, 602 | stash: &Stash, 603 | sentence: &str, 604 | ) -> CoreResult> { 605 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 606 | if matches_0.is_empty() { 607 | return Ok(PredicateMatches::with_status(matches_0.status)); 608 | } 609 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 610 | if matches_1.is_empty() { 611 | return Ok(PredicateMatches::with_status(matches_1.status)); 612 | } 613 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 614 | if matches_2.is_empty() { 615 | return Ok(PredicateMatches::with_status(matches_2.status)); 616 | } 617 | let matches_3 = self.pattern.3.predicate(stash, sentence)?; 618 | if matches_3.is_empty() { 619 | return Ok(PredicateMatches::with_status(matches_3.status)); 620 | } 621 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 622 | for m0 in matches_0.iter() { 623 | for m1 in matches_1.iter() { 624 | if adjacent(m0, m1, sentence) { 625 | for m2 in matches_2.iter() { 626 | if adjacent(m1, m2, sentence) { 627 | for m3 in matches_3.iter() { 628 | if adjacent(m2, m3, sentence) { 629 | result.push((m0.clone(), m1.clone(), m2.clone(), m3.clone())) 630 
| } 631 | } 632 | } 633 | } 634 | } 635 | } 636 | } 637 | Ok(result) 638 | } 639 | } 640 | 641 | pub struct Rule5 642 | where 643 | V: NodePayload, 644 | StashValue: NodePayload + StashIndexable + From, 645 | F: for<'a> Fn( 646 | &RuleProductionArg<'a, PA::M>, 647 | &RuleProductionArg<'a, PB::M>, 648 | &RuleProductionArg<'a, PC::M>, 649 | &RuleProductionArg<'a, PD::M>, 650 | &RuleProductionArg<'a, PE::M>, 651 | ) -> RuleResult 652 | + Send 653 | + Sync, 654 | PA: Pattern, 655 | PB: Pattern, 656 | PC: Pattern, 657 | PD: Pattern, 658 | PE: Pattern, 659 | { 660 | sym: Sym, 661 | pattern: (PA, PB, PC, PD, PE), 662 | production: F, 663 | _phantom: SendSyncPhantomData<(V, StashValue)>, 664 | } 665 | 666 | impl Rule 667 | for Rule5 668 | where 669 | V: NodePayload, 670 | StashValue: NodePayload + StashIndexable + From, 671 | F: for<'a> Fn( 672 | &RuleProductionArg<'a, PA::M>, 673 | &RuleProductionArg<'a, PB::M>, 674 | &RuleProductionArg<'a, PC::M>, 675 | &RuleProductionArg<'a, PD::M>, 676 | &RuleProductionArg<'a, PE::M>, 677 | ) -> RuleResult 678 | + Send 679 | + Sync, 680 | PA: Pattern, 681 | PB: Pattern, 682 | PC: Pattern, 683 | PD: Pattern, 684 | PE: Pattern, 685 | { 686 | fn rule_sym(&self) -> Sym { 687 | self.sym 688 | } 689 | 690 | fn apply( 691 | &self, 692 | stash: &Stash, 693 | sentence: &str, 694 | ) -> CoreResult> { 695 | let matches = self.matches(&stash, sentence)?; 696 | 697 | if matches.status.is_exit() { 698 | return Ok(RuleOutput::exit()); 699 | } 700 | 701 | let nodes: CoreResult<_> = matches 702 | .iter() 703 | .filter_map(|sub| { 704 | let nodes = svec!( 705 | sub.0.to_node(), 706 | sub.1.to_node(), 707 | sub.2.to_node(), 708 | sub.3.to_node(), 709 | sub.4.to_node() 710 | ); 711 | if stash.iter().all(|old_node| { 712 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 713 | }) { 714 | let byte_range = Range(sub.0.byte_range().0, sub.4.byte_range().1); 715 | match (self.production)( 716 | &RuleProductionArg::new(sentence, 
&sub.0), 717 | &RuleProductionArg::new(sentence, &sub.1), 718 | &RuleProductionArg::new(sentence, &sub.2), 719 | &RuleProductionArg::new(sentence, &sub.3), 720 | &RuleProductionArg::new(sentence, &sub.4), 721 | ) { 722 | Ok(v) => { 723 | let payload = v.extract_payload(); 724 | Some(Ok(ParsedNode::new( 725 | self.sym, 726 | v.into(), 727 | byte_range, 728 | payload, 729 | nodes, 730 | ))) 731 | } 732 | Err(e) => match e.downcast::() { 733 | Ok(RuleError::Invalid) => None, 734 | Err(e) => Some(Err(e)), 735 | }, 736 | } 737 | } else { 738 | None 739 | } 740 | }) 741 | .collect(); 742 | Ok(RuleOutput::continue_with(nodes?)) 743 | } 744 | } 745 | 746 | impl Rule5 747 | where 748 | V: NodePayload, 749 | StashValue: NodePayload + StashIndexable + From, 750 | F: for<'a> Fn( 751 | &RuleProductionArg<'a, PA::M>, 752 | &RuleProductionArg<'a, PB::M>, 753 | &RuleProductionArg<'a, PC::M>, 754 | &RuleProductionArg<'a, PD::M>, 755 | &RuleProductionArg<'a, PE::M>, 756 | ) -> RuleResult 757 | + Send 758 | + Sync, 759 | PA: Pattern, 760 | PB: Pattern, 761 | PC: Pattern, 762 | PD: Pattern, 763 | PE: Pattern, 764 | { 765 | pub fn new( 766 | sym: Sym, 767 | pat: (PA, PB, PC, PD, PE), 768 | prod: F, 769 | ) -> Rule5 { 770 | Rule5 { 771 | sym, 772 | pattern: pat, 773 | production: prod, 774 | _phantom: SendSyncPhantomData::new(), 775 | } 776 | } 777 | 778 | fn matches( 779 | &self, 780 | stash: &Stash, 781 | sentence: &str, 782 | ) -> CoreResult> { 783 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 784 | if matches_0.is_empty() { 785 | return Ok(PredicateMatches::with_status(matches_0.status)); 786 | } 787 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 788 | if matches_1.is_empty() { 789 | return Ok(PredicateMatches::with_status(matches_1.status)); 790 | } 791 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 792 | if matches_2.is_empty() { 793 | return Ok(PredicateMatches::with_status(matches_2.status)); 794 | } 795 | let matches_3 = 
self.pattern.3.predicate(stash, sentence)?; 796 | if matches_3.is_empty() { 797 | return Ok(PredicateMatches::with_status(matches_3.status));; 798 | } 799 | let matches_4 = self.pattern.4.predicate(stash, sentence)?; 800 | if matches_4.is_empty() { 801 | return Ok(PredicateMatches::with_status(matches_4.status)); 802 | } 803 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 804 | for m0 in matches_0.iter() { 805 | for m1 in matches_1.iter() { 806 | if adjacent(m0, m1, sentence) { 807 | for m2 in matches_2.iter() { 808 | if adjacent(m1, m2, sentence) { 809 | for m3 in matches_3.iter() { 810 | if adjacent(m2, m3, sentence) { 811 | for m4 in matches_4.iter() { 812 | if adjacent(m3, m4, sentence) { 813 | result.push(( 814 | m0.clone(), 815 | m1.clone(), 816 | m2.clone(), 817 | m3.clone(), 818 | m4.clone(), 819 | )) 820 | } 821 | } 822 | } 823 | } 824 | } 825 | } 826 | } 827 | } 828 | } 829 | Ok(result) 830 | } 831 | } 832 | 833 | pub struct Rule6 834 | where 835 | V: NodePayload, 836 | StashValue: NodePayload + StashIndexable + From, 837 | F: for<'a> Fn( 838 | &RuleProductionArg<'a, PA::M>, 839 | &RuleProductionArg<'a, PB::M>, 840 | &RuleProductionArg<'a, PC::M>, 841 | &RuleProductionArg<'a, PD::M>, 842 | &RuleProductionArg<'a, PE::M>, 843 | &RuleProductionArg<'a, PF::M>, 844 | ) -> RuleResult 845 | + Send 846 | + Sync, 847 | PA: Pattern, 848 | PB: Pattern, 849 | PC: Pattern, 850 | PD: Pattern, 851 | PE: Pattern, 852 | PF: Pattern, 853 | { 854 | sym: Sym, 855 | pattern: (PA, PB, PC, PD, PE, PF), 856 | production: F, 857 | _phantom: SendSyncPhantomData<(V, StashValue)>, 858 | } 859 | 860 | impl Rule 861 | for Rule6 862 | where 863 | V: NodePayload, 864 | StashValue: NodePayload + StashIndexable + From, 865 | F: for<'a> Fn( 866 | &RuleProductionArg<'a, PA::M>, 867 | &RuleProductionArg<'a, PB::M>, 868 | &RuleProductionArg<'a, PC::M>, 869 | &RuleProductionArg<'a, PD::M>, 870 | &RuleProductionArg<'a, PE::M>, 871 | &RuleProductionArg<'a, PF::M>, 872 
| ) -> RuleResult 873 | + Send 874 | + Sync, 875 | PA: Pattern, 876 | PB: Pattern, 877 | PC: Pattern, 878 | PD: Pattern, 879 | PE: Pattern, 880 | PF: Pattern, 881 | { 882 | fn rule_sym(&self) -> Sym { 883 | self.sym 884 | } 885 | 886 | fn apply( 887 | &self, 888 | stash: &Stash, 889 | sentence: &str, 890 | ) -> CoreResult> { 891 | let matches = self.matches(&stash, sentence)?; 892 | 893 | if matches.status.is_exit() { 894 | return Ok(RuleOutput::exit()); 895 | } 896 | 897 | let nodes: CoreResult<_> = matches 898 | .iter() 899 | .filter_map(|sub| { 900 | let nodes = svec!( 901 | sub.0.to_node(), 902 | sub.1.to_node(), 903 | sub.2.to_node(), 904 | sub.3.to_node(), 905 | sub.4.to_node(), 906 | sub.5.to_node() 907 | ); 908 | if stash.iter().all(|old_node| { 909 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 910 | }) { 911 | let byte_range = Range(sub.0.byte_range().0, sub.5.byte_range().1); 912 | match (self.production)( 913 | &RuleProductionArg::new(sentence, &sub.0), 914 | &RuleProductionArg::new(sentence, &sub.1), 915 | &RuleProductionArg::new(sentence, &sub.2), 916 | &RuleProductionArg::new(sentence, &sub.3), 917 | &RuleProductionArg::new(sentence, &sub.4), 918 | &RuleProductionArg::new(sentence, &sub.5), 919 | ) { 920 | Ok(v) => { 921 | let payload = v.extract_payload(); 922 | Some(Ok(ParsedNode::new( 923 | self.sym, 924 | v.clone().into(), 925 | byte_range, 926 | payload, 927 | nodes, 928 | ))) 929 | } 930 | Err(e) => match e.downcast::() { 931 | Ok(RuleError::Invalid) => None, 932 | Err(e) => Some(Err(e)), 933 | }, 934 | } 935 | } else { 936 | None 937 | } 938 | }) 939 | .collect(); 940 | Ok(RuleOutput::continue_with(nodes?)) 941 | } 942 | } 943 | 944 | impl Rule6 945 | where 946 | V: NodePayload, 947 | StashValue: NodePayload + StashIndexable + From, 948 | F: for<'a> Fn( 949 | &RuleProductionArg<'a, PA::M>, 950 | &RuleProductionArg<'a, PB::M>, 951 | &RuleProductionArg<'a, PC::M>, 952 | &RuleProductionArg<'a, PD::M>, 953 | 
&RuleProductionArg<'a, PE::M>, 954 | &RuleProductionArg<'a, PF::M>, 955 | ) -> RuleResult 956 | + Send 957 | + Sync, 958 | PA: Pattern, 959 | PB: Pattern, 960 | PC: Pattern, 961 | PD: Pattern, 962 | PE: Pattern, 963 | PF: Pattern, 964 | { 965 | pub fn new( 966 | sym: Sym, 967 | pat: (PA, PB, PC, PD, PE, PF), 968 | prod: F, 969 | ) -> Rule6 { 970 | Rule6 { 971 | sym, 972 | pattern: pat, 973 | production: prod, 974 | _phantom: SendSyncPhantomData::new(), 975 | } 976 | } 977 | 978 | fn matches( 979 | &self, 980 | stash: &Stash, 981 | sentence: &str, 982 | ) -> CoreResult> { 983 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 984 | if matches_0.is_empty() { 985 | return Ok(PredicateMatches::with_status(matches_0.status)); 986 | } 987 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 988 | if matches_1.is_empty() { 989 | return Ok(PredicateMatches::with_status(matches_1.status)); 990 | } 991 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 992 | if matches_2.is_empty() { 993 | return Ok(PredicateMatches::with_status(matches_2.status)); 994 | } 995 | let matches_3 = self.pattern.3.predicate(stash, sentence)?; 996 | if matches_3.is_empty() { 997 | return Ok(PredicateMatches::with_status(matches_3.status)); 998 | } 999 | let matches_4 = self.pattern.4.predicate(stash, sentence)?; 1000 | if matches_4.is_empty() { 1001 | return Ok(PredicateMatches::with_status(matches_4.status)); 1002 | } 1003 | let matches_5 = self.pattern.5.predicate(stash, sentence)?; 1004 | if matches_5.is_empty() { 1005 | return Ok(PredicateMatches::with_status(matches_5.status)); 1006 | } 1007 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 1008 | for m0 in matches_0.iter() { 1009 | for m1 in matches_1.iter() { 1010 | if adjacent(m0, m1, sentence) { 1011 | for m2 in matches_2.iter() { 1012 | if adjacent(m1, m2, sentence) { 1013 | for m3 in matches_3.iter() { 1014 | if adjacent(m2, m3, sentence) { 1015 | for m4 in matches_4.iter() { 1016 
| if adjacent(m3, m4, sentence) { 1017 | for m5 in matches_5.iter() { 1018 | if adjacent(m4, m5, sentence) { 1019 | result.push(( 1020 | m0.clone(), 1021 | m1.clone(), 1022 | m2.clone(), 1023 | m3.clone(), 1024 | m4.clone(), 1025 | m5.clone(), 1026 | )) 1027 | } 1028 | } 1029 | } 1030 | } 1031 | } 1032 | } 1033 | } 1034 | } 1035 | } 1036 | } 1037 | } 1038 | Ok(result) 1039 | } 1040 | } 1041 | 1042 | #[cfg(test)] 1043 | #[allow(unused_mut)] 1044 | mod tests { 1045 | use crate::helpers::BoundariesChecker; 1046 | use crate::pattern::{FilterNodePattern, Text}; 1047 | use crate::rule::*; 1048 | use crate::stash::Stash; 1049 | use crate::{ 1050 | AttemptFrom, InnerStashIndexable, Node, NodePayload, ParsedNode, Range, StashIndexable, 1051 | SymbolTable, 1052 | }; 1053 | use regex::Regex; 1054 | use smallvec::SmallVec; 1055 | 1056 | macro_rules! svec { 1057 | ($($item:expr),*) => { { 1058 | let mut v = SmallVec::new(); 1059 | $( v.push($item); )* 1060 | v 1061 | } 1062 | } 1063 | } 1064 | 1065 | macro_rules! svec4 { 1066 | ($($item:expr),*) => { { 1067 | let mut v =SmallVec::<[_;4]>::new(); 1068 | $( v.push($item); )* 1069 | v 1070 | } 1071 | } 1072 | } 1073 | 1074 | impl AttemptFrom for usize { 1075 | fn attempt_from(v: usize) -> Option { 1076 | Some(v) 1077 | } 1078 | } 1079 | 1080 | impl NodePayload for usize { 1081 | type Payload = usize; 1082 | fn extract_payload(&self) -> Option { 1083 | Some(*self) 1084 | } 1085 | } 1086 | 1087 | impl StashIndexable for usize { 1088 | type Index = usize; 1089 | fn index(&self) -> usize { 1090 | 0 1091 | } 1092 | } 1093 | 1094 | impl InnerStashIndexable for usize { 1095 | type Index = usize; 1096 | fn index() -> usize { 1097 | 0 1098 | } 1099 | } 1100 | 1101 | macro_rules! 
reg { 1102 | ($st:expr, $typ:ty, $pattern:expr) => { 1103 | $crate::pattern::TextPattern::<$typ>::new( 1104 | Regex::new($pattern).unwrap(), 1105 | $st.sym($pattern), 1106 | BoundariesChecker::separated_alphanumeric_word(), 1107 | ) 1108 | }; 1109 | } 1110 | 1111 | #[test] 1112 | fn test_integer_numeric_en_rule() { 1113 | let mut st = SymbolTable::default(); 1114 | let ten = st.sym("ten"); 1115 | let rule = Rule1::new(ten, reg!(st, usize, "ten"), |_| Ok(10usize)); 1116 | assert_eq!( 1117 | vec![Text::new(svec![Range(8, 11)], Range(8, 11), ten)], 1118 | rule.matches(&Stash::default(), "foobar: ten") 1119 | .unwrap() 1120 | .matches 1121 | ); 1122 | assert_eq!( 1123 | vec![ 1124 | Text::new(svec![Range(8, 11)], Range(8, 11), ten), 1125 | Text::new(svec![Range(12, 15)], Range(12, 15), ten) 1126 | ], 1127 | rule.matches(&Stash::default(), "foobar: ten ten") 1128 | .unwrap() 1129 | .matches 1130 | ); 1131 | assert_eq!( 1132 | svec4![ 1133 | ParsedNode::new( 1134 | ten, 1135 | 10usize, 1136 | Range(8, 11), 1137 | Some(10usize), 1138 | svec![Node::new(ten, Range(8, 11), None, svec![])] 1139 | ), 1140 | ParsedNode::new( 1141 | ten, 1142 | 10usize, 1143 | Range(12, 15), 1144 | Some(10usize), 1145 | svec![Node::new(ten, Range(12, 15), None, svec![])] 1146 | ) 1147 | ], 1148 | rule.apply(&Stash::default(), "foobar: ten ten") 1149 | .unwrap() 1150 | .nodes 1151 | ) 1152 | } 1153 | 1154 | #[test] 1155 | fn test_integer_numeric_compo_en_rule() { 1156 | let mut st = SymbolTable::default(); 1157 | let rule_consec = Rule2::new( 1158 | st.sym("2 consecutive ints"), 1159 | ( 1160 | AnyNodePattern::::new(), 1161 | FilterNodePattern::::filter(vec![Box::new(|integer: &usize| { 1162 | *integer == 10 1163 | })]), 1164 | ), 1165 | |a, b| Ok(a.value() + b.value()), 1166 | ); 1167 | let mut stash = Stash::default(); 1168 | stash.push(ParsedNode::new( 1169 | st.sym("ten"), 1170 | 10, 1171 | Range(8, 11), 1172 | None, 1173 | svec![], 1174 | )); 1175 | stash.push(ParsedNode::new( 1176 | 
st.sym("ten"), 1177 | 10, 1178 | Range(12, 15), 1179 | None, 1180 | svec![], 1181 | )); 1182 | 1183 | assert_eq!( 1184 | vec![(stash.values()[0].clone(), stash.values()[1].clone())], 1185 | rule_consec 1186 | .matches(&stash, "foobar: ten ten") 1187 | .unwrap() 1188 | .matches 1189 | ); 1190 | assert_eq!( 1191 | svec4![ParsedNode::new( 1192 | st.sym("2 consecutive ints"), 1193 | 20, 1194 | Range(8, 15), 1195 | Some(20), 1196 | svec![ 1197 | stash.values()[0].root_node.clone(), 1198 | stash.values()[1].root_node.clone() 1199 | ] 1200 | )], 1201 | rule_consec.apply(&stash, "foobar: ten ten").unwrap().nodes 1202 | ); 1203 | } 1204 | 1205 | #[test] 1206 | fn test_integer_numeric_int_rule() { 1207 | use std::str::FromStr; 1208 | let mut st = SymbolTable::default(); 1209 | let rule_int = Rule1::new(st.sym("int"), reg!(st, usize, "\\d+"), |a| { 1210 | Ok(usize::from_str(&*a.group(0))?) 1211 | }); 1212 | assert_eq!( 1213 | svec4![ParsedNode::new( 1214 | st.sym("int"), 1215 | 42, 1216 | Range(8, 10), 1217 | Some(42), 1218 | svec![Node::new(st.sym("\\d+"), Range(8, 10), None, svec![])] 1219 | )], 1220 | rule_int 1221 | .apply(&Stash::default(), "foobar: 42") 1222 | .unwrap() 1223 | .nodes 1224 | ); 1225 | } 1226 | 1227 | } 1228 | -------------------------------------------------------------------------------- /core/src/stash.rs: -------------------------------------------------------------------------------- 1 | use crate::pattern::Match; 2 | use crate::{AttemptFrom, NodePayload, ParsedNode}; 3 | use std::collections::HashMap; 4 | use std::hash::Hash; 5 | use std::slice::Iter; 6 | use std::vec::IntoIter; 7 | 8 | pub trait StashIndexable { 9 | type Index: Hash + Eq; 10 | fn index(&self) -> Self::Index; 11 | } 12 | 13 | pub trait InnerStashIndexable { 14 | type Index: Hash + Eq; 15 | fn index() -> Self::Index; 16 | } 17 | 18 | pub struct Stash { 19 | values: Vec>, 20 | index: HashMap>, 21 | } 22 | 23 | impl Default for Stash { 24 | fn default() -> Stash { 25 | Stash { 26 | 
values: vec![], 27 | index: HashMap::new(), 28 | } 29 | } 30 | } 31 | 32 | impl Stash { 33 | pub fn extend(&mut self, nodes: Vec>) { 34 | for node in nodes.into_iter() { 35 | self.push(node); 36 | } 37 | } 38 | pub fn push(&mut self, node: ParsedNode) { 39 | let node_position = self.values.len(); 40 | let node_index = node.value.index(); 41 | self.values.push(node); 42 | self.index 43 | .entry(node_index) 44 | .or_insert(vec![]) 45 | .push(node_position); 46 | } 47 | 48 | pub fn filter(&self, predicate: F) -> Vec> 49 | where 50 | V: InnerStashIndexable 51 | + NodePayload 52 | + AttemptFrom, 53 | F: Fn(&V) -> bool, 54 | { 55 | self.index 56 | .get(&V::index()) 57 | .map(|nodes| { 58 | nodes 59 | .iter() 60 | .filter_map(|position| { 61 | let ref node = self.values[*position]; 62 | if let Some(v) = V::attempt_from(node.value.clone()) { 63 | if (predicate)(&v) { 64 | Some(ParsedNode::new( 65 | node.root_node.rule_sym, 66 | v, 67 | node.byte_range(), 68 | node.root_node.payload.clone(), 69 | node.root_node.children.clone(), 70 | )) 71 | } else { 72 | None 73 | } 74 | } else { 75 | None 76 | } 77 | }) 78 | .collect() 79 | }) 80 | .unwrap_or(vec![]) 81 | } 82 | 83 | pub fn iter(&self) -> Iter> { 84 | self.values.iter() 85 | } 86 | pub fn into_iter(self) -> IntoIter> { 87 | self.values.into_iter() 88 | } 89 | 90 | pub fn len(&self) -> usize { 91 | self.values.len() 92 | } 93 | 94 | #[cfg(test)] 95 | pub fn values(&self) -> &Vec> { 96 | self.values.as_ref() 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /ml/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustling-ml" 3 | version = "0.9.1" 4 | authors = ["hdlj ", "Mathieu Poumeyrol "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | failure = "0.1" 9 | fnv = "1.0" 10 | serde = { version = "1.0", features = ["derive"] } 11 | -------------------------------------------------------------------------------- 
/ml/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate failure; 3 | extern crate fnv; 4 | 5 | use fnv::{FnvHashMap, FnvHashSet}; 6 | use std::fmt::Debug; 7 | use std::hash; 8 | use serde::{Deserialize, Serialize}; 9 | 10 | pub type MLResult = Result; 11 | 12 | pub trait ClassifierId: Eq + hash::Hash + Clone + Debug {} 13 | pub trait ClassId: Eq + hash::Hash + Clone + Debug {} 14 | pub trait Feature: Eq + hash::Hash + Clone + Debug {} 15 | 16 | pub struct Input { 17 | pub classifier_id: Id, 18 | pub features: Vec, 19 | pub children: Vec>, 20 | } 21 | 22 | #[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] 23 | pub struct Model { 24 | pub classifiers: FnvHashMap>, 25 | } 26 | 27 | #[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] 28 | pub struct Classifier { 29 | pub classes: FnvHashMap>, 30 | } 31 | 32 | #[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] 33 | pub struct ClassInfo { 34 | pub example_count: usize, 35 | pub unk_probalog: f32, 36 | pub class_probalog: f32, 37 | pub feat_probalog: FnvHashMap, 38 | } 39 | 40 | impl Model { 41 | pub fn classify(&self, input: &Input, target: &Class) -> MLResult { 42 | let classifier = if let Some(classifier) = self.classifiers.get(&input.classifier_id) { 43 | classifier 44 | } else { 45 | return Ok(0.0); 46 | }; 47 | 48 | let mut bag_of_features: FnvHashMap = FnvHashMap::default(); 49 | for feat in &input.features { 50 | let counter = bag_of_features.entry(feat.clone()).or_insert(0); 51 | *counter += 1; 52 | } 53 | 54 | let mut probalog = classifier 55 | .scores(&bag_of_features) 56 | .iter() 57 | .find(|item| &item.0 == target) 58 | .map(|item| item.1) 59 | .unwrap_or(::std::f32::NEG_INFINITY); 60 | for child in &input.children { 61 | probalog += self.classify(&child, target)?; 62 | } 63 | Ok(probalog) 64 | } 65 | } 66 | 67 | impl Classifier { 68 | // max(log(π(Prob(feat|class)^count)*Prob(class))) = 69 | // 
max(sum(logprob(feat|class)*count + logprob(class)) 70 | 71 | pub fn scores(&self, bag_of_features: &FnvHashMap) -> Vec<(Id, f32)> { 72 | let mut scores: Vec<_> = self 73 | .classes 74 | .iter() 75 | .map(|(cid, cinfo)| { 76 | let probalog: f32 = bag_of_features 77 | .iter() 78 | .map(|(feat, count)| { 79 | *count as f32 * cinfo.feat_probalog.get(feat).unwrap_or(&cinfo.unk_probalog) 80 | }) 81 | .sum(); 82 | (cid.clone(), probalog + cinfo.class_probalog) 83 | }) 84 | .collect(); 85 | let normlog = f32::ln(scores.iter().map(|p| f32::exp(p.1)).sum::()); 86 | for s in scores.iter_mut() { 87 | s.1 -= normlog 88 | } 89 | scores 90 | } 91 | 92 | pub fn classify(&self, bag_of_features: &FnvHashMap) -> MLResult<(Id, f32)> { 93 | self.scores(bag_of_features) 94 | .into_iter() 95 | .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(::std::cmp::Ordering::Equal)) 96 | .ok_or(format_err!("no classes in classifier")) 97 | } 98 | 99 | pub fn train(examples: &Vec<(FnvHashMap, Id)>) -> Classifier { 100 | let mut classes: FnvHashMap)> = FnvHashMap::default(); 101 | let total_examples = examples.len(); 102 | let mut all_features = FnvHashSet::default(); 103 | for &(ref features, ref class) in examples { 104 | let mut data = classes 105 | .entry(class.clone()) 106 | .or_insert_with(|| (0, FnvHashMap::default())); 107 | data.0 += 1; 108 | for (feat, count) in features { 109 | all_features.insert(feat.clone()); 110 | *data.1.entry(feat.clone()).or_insert(0) += *count; 111 | } 112 | } 113 | let total_features = all_features.len(); 114 | let class_infos = classes 115 | .into_iter() 116 | .map(|(k, v)| { 117 | let smooth_denom: f32 = (total_features + v.1.values().sum::()) as f32; 118 | let feat_probalog = 119 | v.1.into_iter() 120 | .map(|(k, v)| (k, f32::ln((v as f32 + 1 as f32) / smooth_denom))) 121 | .collect(); 122 | ( 123 | k, 124 | ClassInfo { 125 | example_count: v.0, 126 | class_probalog: f32::ln(v.0 as f32 / total_examples as f32), 127 | unk_probalog: f32::ln(1.0 / smooth_denom), 128 
| feat_probalog: feat_probalog, 129 | }, 130 | ) 131 | }) 132 | .collect(); 133 | Classifier { 134 | classes: class_infos, 135 | } 136 | } 137 | } 138 | 139 | #[cfg(test)] 140 | mod tests { 141 | use super::*; 142 | use fnv::FnvHashMap; 143 | 144 | macro_rules! hmap( 145 | { } => { FnvHashMap::default() }; 146 | { $($key:expr => $value:expr),+} => { 147 | { 148 | let mut m = FnvHashMap::default(); 149 | $( m.insert($key, $value); )* 150 | m 151 | } 152 | }; 153 | ($($k:expr => $v:expr),+,) => { hmap!($($k => $v),+) } 154 | ); 155 | 156 | #[derive(Eq, PartialEq, Debug, Hash, Clone)] 157 | enum Species { 158 | Cat, 159 | Dog, 160 | Human, 161 | } 162 | impl ClassId for Species {} 163 | 164 | #[derive(Eq, PartialEq, Debug, Hash, Clone)] 165 | enum Friend { 166 | Cat, 167 | Dog, 168 | Human, 169 | Fish, 170 | } 171 | impl Feature for Friend {} 172 | 173 | impl ClassifierId for &'static str {} 174 | 175 | fn mammals_classifier() -> Classifier { 176 | Classifier { 177 | classes: hmap!( 178 | Species::Cat => ClassInfo { 179 | class_probalog: -1.0986123, 180 | unk_probalog: -2.3978953, 181 | example_count: 4, 182 | feat_probalog: hmap!( 183 | Friend::Cat => -1.0116009, 184 | Friend::Human => -1.704748, 185 | Friend::Fish => -1.0116009, 186 | ) 187 | }, 188 | Species::Dog => ClassInfo { 189 | class_probalog: -1.0986123, 190 | unk_probalog: -2.3978953, 191 | example_count: 4, 192 | feat_probalog: hmap!( 193 | Friend::Cat => -1.704748, 194 | Friend::Dog => -1.0116009, 195 | Friend::Human => -1.0116009, 196 | ) 197 | }, 198 | Species::Human => ClassInfo { 199 | class_probalog: -1.0986123, 200 | unk_probalog: -2.7725887, 201 | example_count: 4, 202 | feat_probalog: hmap!( 203 | Friend::Cat => -1.3862944, 204 | Friend::Dog => -1.3862944, 205 | Friend::Human => -1.3862944, 206 | Friend::Fish => -1.3862944, 207 | ) 208 | } 209 | ), 210 | } 211 | } 212 | 213 | #[test] 214 | fn test_train() { 215 | let examples = vec![ 216 | ( 217 | hmap!(Friend::Dog => 1, Friend::Human => 1, 
Friend::Cat => 1), 218 | Species::Dog, 219 | ), 220 | (hmap!(Friend::Dog => 1), Species::Dog), 221 | (hmap!(Friend::Dog => 1, Friend::Human => 1), Species::Dog), 222 | (hmap!(Friend::Human => 1), Species::Dog), 223 | (hmap!(Friend::Fish => 1, Friend::Cat => 1), Species::Cat), 224 | (hmap!(Friend::Cat => 1), Species::Cat), 225 | (hmap!(Friend::Fish => 1), Species::Cat), 226 | ( 227 | hmap!(Friend::Human => 1, Friend::Fish => 1, Friend::Cat => 1), 228 | Species::Cat, 229 | ), 230 | ( 231 | hmap!(Friend::Human => 1, Friend::Fish => 1, Friend::Cat => 1, Friend::Dog => 1), 232 | Species::Human, 233 | ), 234 | ( 235 | hmap!(Friend::Fish => 1, Friend::Cat => 1, Friend::Dog => 1), 236 | Species::Human, 237 | ), 238 | ( 239 | hmap!(Friend::Human => 1, Friend::Fish => 1, Friend::Dog => 1), 240 | Species::Human, 241 | ), 242 | (hmap!(Friend::Human => 1, Friend::Cat => 1), Species::Human), 243 | ]; 244 | let classifier = Classifier::train(&examples); 245 | assert_eq!(mammals_classifier(), classifier); 246 | } 247 | 248 | #[test] 249 | fn test_classify_norm() { 250 | let classifier = mammals_classifier(); 251 | let probable_cat = hmap!(Friend::Fish => 1, Friend::Cat => 1); 252 | let norm = classifier 253 | .scores(&probable_cat) 254 | .iter() 255 | .map(|pair| pair.1) 256 | .map(f32::exp) 257 | .sum::(); 258 | assert!(norm > 0.9999 && norm < 1.0001); 259 | } 260 | 261 | #[test] 262 | fn test_classify() { 263 | let classifier = mammals_classifier(); 264 | let probable_cat = hmap!(Friend::Fish => 1, Friend::Cat => 1); 265 | assert_eq!(Species::Cat, classifier.classify(&probable_cat).unwrap().0); 266 | 267 | let probable_dog = hmap!(Friend::Human => 1, Friend::Dog => 1); 268 | assert_eq!(Species::Dog, classifier.classify(&probable_dog).unwrap().0); 269 | 270 | let probable_human = 271 | hmap!(Friend::Dog => 1, Friend::Cat => 1, Friend::Human => 1, Friend::Fish => 1); 272 | assert_eq!( 273 | Species::Human, 274 | classifier.classify(&probable_human).unwrap().0 275 | ); 276 | } 277 
| 278 | #[test] 279 | fn test_model() { 280 | let model = Model { 281 | classifiers: hmap!( 282 | "mammals" => mammals_classifier(), 283 | "void" => Classifier { classes: hmap!() }, 284 | ), 285 | }; 286 | let input_dog = Input { 287 | classifier_id: "mammals", 288 | children: vec![], 289 | features: vec![Friend::Human, Friend::Dog], 290 | }; 291 | assert!(model.classify(&input_dog, &Species::Dog).unwrap() > -0.5); 292 | assert!(model.classify(&input_dog, &Species::Cat).unwrap() < -0.5); 293 | let input_dog = Input { 294 | classifier_id: "mammals", 295 | children: vec![input_dog], 296 | features: vec![Friend::Human, Friend::Dog], 297 | }; 298 | let dog_dog = model.classify(&input_dog, &Species::Dog).unwrap(); 299 | assert!(dog_dog > -1.0, "probalog: {:?}", dog_dog); 300 | assert!(dog_dog < 0.5, "probalog: {:?}", dog_dog); 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate failure; 3 | extern crate fnv; 4 | extern crate rustling_core; 5 | extern crate rustling_ml; 6 | 7 | pub use rustling_core::regex; 8 | pub use rustling_core::{ 9 | AttemptFrom, AttemptInto, BoundariesChecker, InnerStashIndexable, Node, NodePayload, 10 | ParsedNode, Range, RuleSet, RuleSetBuilder, StashIndexable, Sym, 11 | }; 12 | pub use rustling_core::{RuleError, RuleResult}; 13 | pub use rustling_ml::{ClassId, Classifier, ClassifierId, Feature, Input, Model}; 14 | use serde::{Deserialize, Serialize}; 15 | use std::collections::HashSet; 16 | pub use train::{Check, Example}; 17 | 18 | #[macro_use] 19 | pub mod macros; 20 | pub mod train; 21 | 22 | pub mod core { 23 | pub use rustling_core::pattern::{ 24 | AnyNodePattern, FilterNodePattern, TextNegLHPattern, TextPattern, 25 | }; 26 | pub use rustling_core::rule::{Rule1, Rule2, Rule3, Rule4, Rule5, Rule6}; 27 | } 28 | 29 | pub type RustlingResult = Result; 30 | 31 | 
#[derive(Debug, Hash, Clone, Eq, PartialEq, Serialize, Deserialize)] 32 | pub struct RuleId(pub Sym); 33 | impl ClassifierId for RuleId {} 34 | 35 | #[derive(Debug, Hash, Clone, Eq, PartialEq, Serialize, Deserialize)] 36 | pub struct Truth(pub bool); 37 | impl ClassId for Truth {} 38 | 39 | pub trait Value: NodePayload { 40 | type Kind: PartialEq; 41 | fn kind(&self) -> Self::Kind; 42 | fn latent(&self) -> bool; 43 | } 44 | 45 | /// Match holder for the Parser. 46 | #[derive(Debug, Clone, PartialEq)] 47 | pub struct ParserMatch { 48 | /// Range in bytes of matched area 49 | pub byte_range: Range, 50 | /// Range in char of matched area 51 | pub char_range: Range, 52 | /// Parsing tree height 53 | pub parsing_tree_height: usize, 54 | /// Number of nodes in the parsing tree 55 | pub parsing_tree_num_nodes: usize, 56 | /// Actual value built from the text. 57 | pub value: V, 58 | /// Logarithmic probability of the match after machine-learned model 59 | /// evaluation. 60 | pub probalog: f32, 61 | pub latent: bool, 62 | } 63 | 64 | pub trait MaxElementTagger { 65 | type O; 66 | fn tag(&self, candidates: Vec<(ParsedNode, ParserMatch)>) -> Vec>; 67 | } 68 | 69 | pub trait FeatureExtractor { 70 | fn for_parsed_node(&self, node: &ParsedNode) -> Input; 71 | fn for_node(&self, node: &Node) -> Input; 72 | } 73 | 74 | #[derive(Debug, Clone, PartialEq)] 75 | pub struct ParsingAnalysis<'a> { 76 | /// Coverage of rules used during the analysis 77 | pub rules_coverage: f32, 78 | /// Coverage of text pattern used during the analysis 79 | pub text_pattern_coverage: f32, 80 | /// Coverage of example with only one output 81 | pub examples_coverage: f32, 82 | /// Rules' names which were not used during the analysis 83 | pub unused_rules: Vec<&'a str>, 84 | /// Text patterns's names which were not used during the analysis 85 | pub unused_text_pattern: Vec<&'a str>, 86 | /// IFailed examples with the position of the example and the number of output found. 
An example is a success if and only if one output is found during the parsing 87 | pub failed_examples: Vec<(usize, usize)>, 88 | } 89 | 90 | #[derive(Debug, Clone)] 91 | pub struct Candidate { 92 | pub node: ParsedNode, 93 | pub match_: ParserMatch, 94 | pub tagged: bool, 95 | } 96 | 97 | pub struct Parser 98 | where 99 | V: Value + StashIndexable, 100 | Feat: Feature, 101 | Extractor: FeatureExtractor, 102 | { 103 | rules: RuleSet, 104 | model: Model, 105 | extractor: Extractor, 106 | } 107 | 108 | impl Parser 109 | where 110 | V: Value + ::std::fmt::Debug + StashIndexable, 111 | RuleId: ClassifierId, 112 | Feat: Feature, 113 | Extractor: FeatureExtractor, 114 | { 115 | pub fn new( 116 | rules: RuleSet, 117 | model: Model, 118 | extractor: Extractor, 119 | ) -> Parser { 120 | Parser { 121 | rules, 122 | model, 123 | extractor, 124 | } 125 | } 126 | 127 | fn raw_candidates(&self, input: &str) -> RustlingResult, ParserMatch)>> { 128 | self.rules 129 | .apply_all(input)? 130 | .into_iter() 131 | .map(|p| { 132 | let features: Input = self.extractor.for_parsed_node(&p); 133 | let probalog = self.model.classify(&features, &Truth(true))?; 134 | let pm = ParserMatch { 135 | byte_range: p.root_node.byte_range, 136 | char_range: p.root_node.byte_range.char_range(input), 137 | value: p.value.clone().into(), 138 | parsing_tree_height: p.root_node.height(), 139 | parsing_tree_num_nodes: p.root_node.num_nodes(), 140 | probalog, 141 | latent: p.value.latent(), 142 | }; 143 | Ok((p, pm)) 144 | }) 145 | .collect() 146 | } 147 | 148 | pub fn candidates>( 149 | &self, 150 | input: &str, 151 | tagger: &Tagger, 152 | ) -> RustlingResult>> { 153 | Ok(tagger.tag(self.raw_candidates(input)?)) 154 | } 155 | 156 | pub fn parse>( 157 | &self, 158 | input: &str, 159 | tagger: &Tagger, 160 | ) -> RustlingResult>> { 161 | Ok(self 162 | .candidates(input, tagger)? 
163 | .into_iter() 164 | .filter_map(|c| if c.tagged { Some(c.match_) } else { None }) 165 | .collect()) 166 | } 167 | 168 | pub fn analyse>( 169 | &self, 170 | examples: Vec<&str>, 171 | tagger: &Tagger, 172 | ) -> RustlingResult { 173 | let all_syms = self.rules.all_syms().into_iter().collect::>(); 174 | let rules_syms = self.rules.rules_syms().into_iter().collect::>(); 175 | let text_pattern_syms: HashSet<_> = all_syms.difference(&rules_syms).map(|s| *s).collect(); 176 | 177 | let mut used_syms = HashSet::new(); 178 | let mut failed_examples = vec![]; 179 | 180 | for (idx, example) in examples.iter().enumerate() { 181 | let outputs = self 182 | .candidates(example, tagger)? 183 | .into_iter() 184 | .filter(|c| c.tagged) 185 | .collect::>(); 186 | 187 | if outputs.len() != 1 { 188 | failed_examples.push((idx, outputs.len())); 189 | } else { 190 | for sym in outputs[0].node.root_node.all_syms().into_iter() { 191 | used_syms.insert(*sym); 192 | } 193 | } 194 | } 195 | let unused_rules: Vec<_> = rules_syms 196 | .difference(&used_syms) 197 | .filter_map(|s| self.resolve_sym(&s)) 198 | .collect(); 199 | 200 | let unused_text_pattern: Vec<_> = text_pattern_syms 201 | .difference(&used_syms) 202 | .filter_map(|s| self.resolve_sym(&s)) 203 | .collect(); 204 | 205 | Ok(ParsingAnalysis { 206 | rules_coverage: 1.0 - (unused_rules.len() as f32 / rules_syms.len() as f32), 207 | text_pattern_coverage: 1.0 208 | - (unused_text_pattern.len() as f32 / text_pattern_syms.len() as f32), 209 | examples_coverage: 1.0 - (failed_examples.len() as f32 / examples.len() as f32), 210 | unused_rules, 211 | unused_text_pattern, 212 | failed_examples, 213 | }) 214 | } 215 | 216 | pub fn num_rules(&self) -> usize { 217 | self.rules 218 | .rules_syms() 219 | .into_iter() 220 | .collect::>() 221 | .len() 222 | } 223 | 224 | pub fn num_text_patterns(&self) -> usize { 225 | let all_syms = self.rules.all_syms().into_iter().collect::>(); 226 | let rules_syms = 
self.rules.rules_syms().into_iter().collect::>(); 227 | let text_pattern_syms: HashSet<_> = all_syms.difference(&rules_syms).map(|s| *s).collect(); 228 | text_pattern_syms.len() 229 | } 230 | 231 | pub fn resolve_sym(&self, sym: &Sym) -> Option<&str> { 232 | self.rules.resolve_sym(sym) 233 | } 234 | } 235 | 236 | #[cfg(test)] 237 | mod tests { 238 | use super::*; 239 | use fnv::FnvHashMap; 240 | use std::str::FromStr; 241 | 242 | #[derive(Copy, Clone, Debug, PartialEq)] 243 | pub struct MyPayload; 244 | 245 | #[derive(Copy, Clone, Debug, PartialEq, Default)] 246 | pub struct Int(usize); 247 | 248 | impl StashIndexable for Int { 249 | type Index = MyValueKind; 250 | fn index(&self) -> Self::Index { 251 | MyValueKind::UI 252 | } 253 | } 254 | 255 | #[derive(Copy, Clone, Debug, PartialEq, Default)] 256 | pub struct F32(f32); 257 | 258 | impl AttemptFrom for Int { 259 | fn attempt_from(v: Int) -> Option { 260 | Some(v) 261 | } 262 | } 263 | 264 | fn rules() -> RuleSet { 265 | let b = RuleSetBuilder::new( 266 | BoundariesChecker::detailed(), 267 | BoundariesChecker::separated_alphanumeric_word(), 268 | ); 269 | b.rule_1( 270 | "integer (numeric)", 271 | b.reg(r#"(\d{1,18})"#).unwrap(), 272 | |text_match| Ok(Int(text_match.group(0).parse::()?)), 273 | ); 274 | b.rule_1("integer (thousand)", b.reg("thousands?").unwrap(), |_| { 275 | Ok(Int(1000)) 276 | }); 277 | b.rule_2( 278 | "number thousands", 279 | dim!(Int, vec![Box::new(|a: &Int| a.0 > 1 && a.0 < 99)]), 280 | dim!(Int, vec![Box::new(|a: &Int| a.0 == 1000)]), 281 | |a, _| Ok(Int(a.value().0 * 1000)), 282 | ); 283 | b.build() 284 | } 285 | 286 | #[test] 287 | fn test_rule_set_application_all() { 288 | let rule_set = rules(); 289 | let output_stash = rule_set.apply_all("foobar: 12 thousands").unwrap(); 290 | assert_eq!(3, output_stash.len()); 291 | let values: Vec<_> = output_stash.iter().map(|pn| pn.value).collect(); 292 | assert_eq!(vec![Int(12), Int(1000), Int(12000)], values); 293 | } 294 | 295 | #[test] 296 | fn 
test_integer_numeric_infix_rule() { 297 | let b = RuleSetBuilder::new( 298 | BoundariesChecker::detailed(), 299 | BoundariesChecker::separated_alphanumeric_word(), 300 | ); 301 | b.rule_1("int", b.reg("\\d+").unwrap(), |a| { 302 | Ok(Int(usize::from_str(&*a.group(0))?)) 303 | }); 304 | b.rule_3( 305 | "add", 306 | dim!(Int), 307 | b.reg("\\+").unwrap(), 308 | dim!(Int), 309 | |a, _, b| Ok(Int(a.value().0 + b.value().0)), 310 | ); 311 | b.rule_3( 312 | "mul", 313 | dim!(Int), 314 | b.reg("\\*").unwrap(), 315 | dim!(Int), 316 | |a, _, b| Ok(Int(a.value().0 * b.value().0)), 317 | ); 318 | let rs = b.build(); 319 | let results = rs.apply_all("foo: 12 + 42, 12* 42").unwrap(); 320 | let values: Vec<_> = results.iter().map(|pn| pn.value).collect(); 321 | assert_eq!( 322 | vec![Int(12), Int(42), Int(12), Int(42), Int(54), Int(504)], 323 | values 324 | ); 325 | } 326 | 327 | rustling_value! { 328 | #[doc="an union"] 329 | #[derive(Clone,PartialEq,Debug)] 330 | MyValue MyValueKind { 331 | UI(Int), 332 | FP(F32), 333 | } 334 | 335 | fn latent(v: &MyValue) -> bool { 336 | false 337 | } 338 | 339 | fn extract_payload(v: &MyValue) -> Option { 340 | None 341 | } 342 | } 343 | 344 | #[derive(Debug, Hash, Clone, Eq, PartialEq, Serialize, Deserialize)] 345 | struct TestFeat; 346 | 347 | impl Feature for TestFeat {} 348 | 349 | struct TestFeatExtractor(); 350 | 351 | impl FeatureExtractor for TestFeatExtractor { 352 | fn for_parsed_node(&self, node: &ParsedNode) -> Input { 353 | self.for_node(&node.root_node) 354 | } 355 | fn for_node(&self, node: &Node) -> Input { 356 | Input { 357 | classifier_id: RuleId(node.rule_sym), 358 | children: vec![], 359 | features: vec![], 360 | } 361 | } 362 | } 363 | 364 | struct TestMaxElementTagger; 365 | 366 | impl MaxElementTagger for TestMaxElementTagger { 367 | type O = MyValue; 368 | fn tag( 369 | &self, 370 | candidates: Vec<(ParsedNode, ParserMatch)>, 371 | ) -> Vec> { 372 | let mut candidates = candidates; 373 | candidates.sort_by(|a, b| 
a.1.byte_range.len().cmp(&b.1.byte_range.len())); 374 | candidates 375 | .into_iter() 376 | .rev() 377 | .enumerate() 378 | .map(|(idx, c)| Candidate { 379 | node: c.0, 380 | match_: c.1, 381 | tagged: idx == 0, 382 | }) 383 | .collect() 384 | } 385 | } 386 | 387 | fn rules_with_enum_value() -> RuleSet { 388 | let b = RuleSetBuilder::new( 389 | BoundariesChecker::detailed(), 390 | BoundariesChecker::separated_alphanumeric_word(), 391 | ); 392 | b.rule_1("int", b.reg("\\d+").unwrap(), |a| { 393 | Ok(Int(usize::from_str(&*a.group(0))?)) 394 | }); 395 | b.rule_1("fp", b.reg("\\d+\\.\\d+").unwrap(), |a| { 396 | Ok(F32(f32::from_str(&*a.group(0))?)) 397 | }); 398 | b.rule_3( 399 | "pow", 400 | dim!(F32), 401 | b.reg("\\^").unwrap(), 402 | dim!(Int), 403 | |a, _, b| Ok(F32(a.value().0.powi(b.value().0 as i32))), 404 | ); 405 | b.build() 406 | } 407 | 408 | fn parser() -> Parser { 409 | Parser { 410 | rules: rules_with_enum_value(), 411 | model: Model { 412 | classifiers: FnvHashMap::default(), 413 | }, 414 | extractor: TestFeatExtractor(), 415 | } 416 | } 417 | 418 | #[test] 419 | fn test_with_enum_value() { 420 | let rule_set = rules_with_enum_value(); 421 | let results = rule_set.apply_all("foo: 1.5^2").unwrap(); 422 | let values: Vec<_> = results.into_iter().map(|pn| pn.value).collect(); 423 | assert_eq!( 424 | vec![ 425 | MyValue::UI(Int(1)), 426 | MyValue::UI(Int(5)), 427 | MyValue::UI(Int(2)), 428 | MyValue::FP(F32(1.5)), 429 | MyValue::FP(F32(2.25)) 430 | ], 431 | values 432 | ); 433 | } 434 | 435 | #[test] 436 | fn test_parsing_analysis() { 437 | let parser = parser(); 438 | assert_eq!( 439 | ParsingAnalysis { 440 | rules_coverage: 0.6666666, 441 | text_pattern_coverage: 0.6666666, 442 | examples_coverage: 0.5, 443 | unused_rules: vec!["pow"], 444 | unused_text_pattern: vec!["\\^"], 445 | failed_examples: vec![(0, 0), (1, 0),], 446 | }, 447 | parser 448 | .analyse( 449 | vec![ 450 | "example that should fail", 451 | "another one", 452 | "foo: 1.5", 453 | "foo: 2" 
454 | ], 455 | &TestMaxElementTagger 456 | ) 457 | .unwrap() 458 | ); 459 | assert_eq!( 460 | ParsingAnalysis { 461 | rules_coverage: 1.0, 462 | text_pattern_coverage: 1.0, 463 | examples_coverage: 0.6666666, 464 | unused_rules: vec![], 465 | unused_text_pattern: vec![], 466 | failed_examples: vec![(0, 0)], 467 | }, 468 | parser 469 | .analyse( 470 | vec!["example that should fail", "foo: 1.5^2", "foo: 2"], 471 | &TestMaxElementTagger 472 | ) 473 | .unwrap() 474 | ); 475 | } 476 | } 477 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | macro_rules! variant_converters { 3 | ($name:ident, $varname:ident, $varty:ty) => { 4 | impl From<$varty> for $name { 5 | fn from(v: $varty) -> $name { 6 | $name::$varname(v) 7 | } 8 | } 9 | 10 | impl $crate::AttemptFrom<$name> for $varty { 11 | fn attempt_from(v: $name) -> Option<$varty> { 12 | if let $name::$varname(value) = v { 13 | Some(value) 14 | } else { 15 | None 16 | } 17 | } 18 | } 19 | } 20 | } 21 | 22 | #[macro_export] 23 | macro_rules! enum_kind { 24 | ($kindname:ident, [$($varname:ident),*]) => { 25 | #[derive(Debug,Copy,Clone,PartialEq, Hash, Eq)] 26 | pub enum $kindname { 27 | $( $varname ),* 28 | } 29 | 30 | impl ::std::str::FromStr for $kindname { 31 | type Err=String; 32 | fn from_str(s: &str) -> ::std::result::Result<$kindname, Self::Err> { 33 | match s { 34 | $( 35 | stringify!($varname) => Ok($kindname::$varname), 36 | )* 37 | _ => Err(format!("{} is not a known {}", s, stringify!($kindname))) 38 | } 39 | } 40 | } 41 | 42 | impl ::std::string::ToString for $kindname { 43 | fn to_string(&self) -> String { 44 | match self { 45 | $( 46 | &$kindname::$varname => stringify!($varname).to_string(), 47 | )* 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | #[macro_export] 55 | macro_rules! 
rustling_value { 56 | ( #[$doc:meta] #[$derive:meta] $name:ident $kindname:ident { $($varname:ident($varty:ty)),*, } fn latent($v1:ident: &$t1:ty) -> bool { $( $body1:tt )* } fn extract_payload($v2:ident: &$t2:ty) -> Option<$payload:ty> { $( $body2:tt )* } ) => { 57 | #[$doc] #[$derive] 58 | pub enum $name { 59 | $( $varname($varty) ),* 60 | } 61 | 62 | enum_kind!($kindname, [ 63 | $( $varname ),* 64 | ]); 65 | 66 | impl Value for $name { 67 | type Kind = $kindname; 68 | fn kind(&self) -> Self::Kind { 69 | match self { 70 | $( 71 | &$name::$varname(_) => $kindname::$varname, 72 | )* 73 | } 74 | } 75 | 76 | fn latent(&self) -> bool { 77 | #[allow(unused_variables)] 78 | fn i($v1: &$t1) -> bool { 79 | $( $body1 )* 80 | } 81 | i(&self) 82 | } 83 | } 84 | 85 | impl StashIndexable for $name { 86 | type Index = $kindname; 87 | fn index(&self) -> Self::Index { 88 | match self { 89 | $( 90 | &$name::$varname(_) => $kindname::$varname, 91 | )* 92 | } 93 | } 94 | } 95 | 96 | impl NodePayload for $name { 97 | type Payload = $payload; 98 | fn extract_payload(&self) -> Option { 99 | #[allow(unused_variables)] 100 | fn i($v2: &$t2) -> Option<$payload> { 101 | $( $body2 )* 102 | } 103 | i(&self) 104 | } 105 | } 106 | 107 | $( 108 | variant_converters!($name, $varname, $varty); 109 | 110 | impl NodePayload for $varty { 111 | type Payload = $payload; 112 | fn extract_payload(&self) -> Option { 113 | $name::from(self.clone()).extract_payload() 114 | } 115 | } 116 | 117 | impl InnerStashIndexable for $varty { 118 | type Index = $kindname; 119 | fn index() -> Self::Index { 120 | $kindname::$varname 121 | } 122 | } 123 | )* 124 | } 125 | } 126 | 127 | #[macro_export] 128 | macro_rules! 
dim { 129 | ($typ:ty) => ( $crate::core::AnyNodePattern::<$typ>::new() ); 130 | ($typ:ty, $predicates:expr) => ( $crate::core::FilterNodePattern::<$typ>::filter($predicates) ); 131 | } 132 | -------------------------------------------------------------------------------- /src/train.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | Classifier, Feature, FeatureExtractor, Model, Node, ParsedNode, Range, RuleId, RuleSet, 3 | RustlingResult, StashIndexable, Truth, Value, 4 | }; 5 | use fnv::FnvHashMap; 6 | use fnv::FnvHashSet; 7 | use std::cmp::Eq; 8 | use std::fmt::Debug; 9 | use std::hash::Hash; 10 | 11 | #[derive(Debug)] 12 | pub struct Example { 13 | pub text: &'static str, 14 | pub predicate: Box>, 15 | } 16 | 17 | impl Example { 18 | pub fn new(text: &'static str, predicate: Box>) -> Example { 19 | Example { text, predicate } 20 | } 21 | } 22 | 23 | pub trait Check: Debug { 24 | fn check(&self, value: &ParsedNode) -> bool; 25 | } 26 | 27 | pub fn train( 28 | rules: &RuleSet, 29 | examples: Vec>, 30 | feature_extractor: E, 31 | ) -> RustlingResult> 32 | where 33 | V: Value + Debug + StashIndexable, 34 | V::Payload: Debug + Eq + Hash, 35 | F: Feature, 36 | E: FeatureExtractor, 37 | { 38 | let mut classified_ex: FnvHashMap, Truth)>> = 39 | FnvHashMap::default(); 40 | for ex in examples.iter() { 41 | let stash = rules.apply_all(&ex.text.to_lowercase()).unwrap(); 42 | 43 | // - keep only full-range parsed nodes 44 | // - partition them according to the example check value 45 | let (positive_parsed_nodes, negative_parse_nodes) = stash 46 | .into_iter() 47 | .filter(|candidate| candidate.root_node.byte_range == Range(0, ex.text.len())) 48 | .partition::, _>(|candidate| ex.predicate.check(&candidate)); 49 | // - example sanity check 50 | if positive_parsed_nodes.is_empty() { 51 | Err(format_err!("example: {:?} matched no rule", ex.text))? 
52 | } 53 | 54 | // - expand parse nodes to nodes, according to the partition 55 | let mut negative_nodes = FnvHashSet::default(); 56 | let mut positive_nodes = FnvHashSet::default(); 57 | 58 | fn add_to_set( 59 | nodes: &mut FnvHashSet>, 60 | node: &Node, 61 | ) { 62 | nodes.insert(node.clone()); 63 | for child in &node.children { 64 | add_to_set(nodes, child); 65 | } 66 | } 67 | 68 | for parsed_node in positive_parsed_nodes { 69 | add_to_set(&mut positive_nodes, &parsed_node.root_node); 70 | } 71 | for parsed_node in negative_parse_nodes { 72 | add_to_set(&mut negative_nodes, &parsed_node.root_node); 73 | } 74 | 75 | // - ignore negative nodes if there is a matching positive node 76 | for pos in &positive_nodes { 77 | negative_nodes.remove(pos); 78 | } 79 | // - put node counted features, with truth value in the trainable hashmaps 80 | for (nodes, truth) in vec![(positive_nodes, true), (negative_nodes, false)].into_iter() { 81 | for n in nodes.into_iter() { 82 | let mut counted_features = FnvHashMap::default(); 83 | for f in feature_extractor.for_node(&n).features { 84 | *counted_features.entry(f).or_insert(0) += 1; 85 | } 86 | classified_ex 87 | .entry(RuleId(n.rule_sym)) 88 | .or_insert(vec![]) 89 | .push((counted_features, Truth(truth))); 90 | } 91 | } 92 | } 93 | // - train the classifiers 94 | let classifiers = classified_ex 95 | .into_iter() 96 | .map(|(id, examples)| (id, Classifier::train(&examples))) 97 | .collect(); 98 | Ok(Model { classifiers }) 99 | } 100 | -------------------------------------------------------------------------------- /update_version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | set -e 5 | 6 | NEW_VERSION=$1 7 | 8 | if [ -z $NEW_VERSION ] 9 | then 10 | echo "Usage: $0 NEW_VERSION" 11 | exit 1 12 | fi 13 | 14 | perl -p -i -e "s/^version = \".*\"\$/version = \"$NEW_VERSION\"/g" Cargo.toml 15 | perl -p -i -e "s/^version = \".*\"\$/version = \"$NEW_VERSION\"/g" 
*/Cargo.toml --------------------------------------------------------------------------------