├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── core ├── Cargo.toml └── src │ ├── builder.rs │ ├── helpers.rs │ ├── lib.rs │ ├── pattern.rs │ ├── range.rs │ ├── rule.rs │ └── stash.rs ├── ml ├── Cargo.toml └── src │ └── lib.rs ├── src ├── lib.rs ├── macros.rs └── train.rs └── update_version.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock 7 | Cargo.lock 8 | target 9 | Cargo.lock 10 | 11 | *.rustfmt 12 | tmp 13 | temp 14 | .idea/ 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - stable 4 | - beta 5 | - nightly 6 | matrix: 7 | allow_failures: 8 | - rust: nightly 9 | script: 10 | - cargo test --all --verbose -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustling" 3 | version = "0.9.1" 4 | authors = ["hdlj ", "Mathieu Poumeyrol "] 5 | edition = "2018" 6 | 7 | [workspace] 8 | members = ["core", "ml"] 9 | 10 | [dependencies] 11 | rustling-core = { path = "core" } 12 | rustling-ml = { path = "ml" } 13 | failure = "0.1" 14 | fnv = "1.0" 15 | serde = { version = "1.0", features = ["derive"] } 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ## License 2 | 3 | Licensed under either of 4 | * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or 
http://www.apache.org/licenses/LICENSE-2.0) 5 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 6 | at your option. 7 | 8 | ### Contribution 9 | 10 | Unless you explicitly state otherwise, any contribution intentionally submitted 11 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall 12 | be dual licensed as above, without any additional terms or conditions. 13 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rustling 2 | [![Build Status](https://travis-ci.org/snipsco/rustling.svg?branch=master)](https://travis-ci.org/snipsco/rustling) 3 | 4 | 5 | Rust port of https://github.com/facebookincubator/duckling 6 | 7 | # License 8 | 9 | ## Apache 2.0/MIT 10 | 11 | All original work licensed under either of 12 | * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 13 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 14 | at your option. 15 | 16 | ## Contribution 17 | 18 | Unless you explicitly state otherwise, any contribution intentionally submitted 19 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall 20 | be dual licensed as above, without any additional terms or conditions. 
21 | -------------------------------------------------------------------------------- /core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustling-core" 3 | version = "0.9.1" 4 | authors = ["hdlj ", "Mathieu Poumeyrol "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | regex = "1.0" 9 | smallvec = "0.6" 10 | failure = "0.1" 11 | string-interner = "0.7" 12 | serde = { version = "1.0", features = ["derive"] } 13 | -------------------------------------------------------------------------------- /core/src/builder.rs: -------------------------------------------------------------------------------- 1 | use crate::helpers::BoundariesChecker; 2 | use crate::rule::{ 3 | Rule, Rule1, Rule2, Rule3, Rule4, Rule5, Rule6, RuleProductionArg, RuleResult, TerminalRule, 4 | }; 5 | use crate::{ 6 | cell, pattern, CoreResult, NodePayload, Pattern, RuleSet, StashIndexable, Sym, SymbolTable, 7 | TerminalPattern, 8 | }; 9 | 10 | pub struct RuleSetBuilder { 11 | symbols: cell::RefCell, 12 | composition_rules: cell::RefCell>>>, 13 | terminal_rules: cell::RefCell>>>, 14 | word_boundaries: BoundariesChecker, 15 | match_boundaries: BoundariesChecker, 16 | } 17 | 18 | impl RuleSetBuilder { 19 | pub fn new( 20 | word_boundaries: BoundariesChecker, 21 | match_boundaries: BoundariesChecker, 22 | ) -> RuleSetBuilder { 23 | RuleSetBuilder { 24 | symbols: cell::RefCell::new(SymbolTable::default()), 25 | composition_rules: cell::RefCell::new(vec![]), 26 | terminal_rules: cell::RefCell::new(vec![]), 27 | word_boundaries, 28 | match_boundaries, 29 | } 30 | } 31 | } 32 | 33 | impl RuleSetBuilder { 34 | pub fn sym(&self, val: S) -> Sym 35 | where 36 | S: Into + AsRef, 37 | { 38 | self.symbols.borrow_mut().sym(val) 39 | } 40 | 41 | pub fn rule_1(&self, sym: S, pa: PA, production: F) 42 | where 43 | S: Into + AsRef, 44 | V: NodePayload + 'static, 45 | StashValue: StashIndexable + From + 'static, 46 | F: for<'a> 
Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + 'static + Send + Sync, 47 | PA: Pattern + 'static, 48 | { 49 | let sym = self.sym(sym); 50 | self.composition_rules 51 | .borrow_mut() 52 | .push(Box::new(Rule1::new(sym, pa, production))) 53 | } 54 | 55 | pub fn rule_1_terminal(&self, sym: S, pa: PA, production: F) 56 | where 57 | S: Into + AsRef, 58 | V: NodePayload + 'static, 59 | StashValue: StashIndexable + From + 'static, 60 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + 'static + Send + Sync, 61 | PA: TerminalPattern + 'static, 62 | { 63 | let sym = self.sym(sym); 64 | self.terminal_rules 65 | .borrow_mut() 66 | .push(Box::new(Rule1::new(sym, pa, production))) 67 | } 68 | 69 | pub fn rule_2(&self, sym: S, pa: PA, pb: PB, production: F) 70 | where 71 | S: Into + AsRef, 72 | V: NodePayload + 'static, 73 | StashValue: StashIndexable + From + 'static, 74 | F: for<'a> Fn( 75 | &RuleProductionArg<'a, PA::M>, 76 | &RuleProductionArg<'a, PB::M>, 77 | ) -> RuleResult 78 | + 'static 79 | + Send 80 | + Sync, 81 | PA: Pattern + 'static, 82 | PB: Pattern + 'static, 83 | { 84 | let sym = self.sym(sym); 85 | self.composition_rules 86 | .borrow_mut() 87 | .push(Box::new(Rule2::new(sym, (pa, pb), production))) 88 | } 89 | 90 | pub fn rule_2_terminal(&self, sym: S, pa: PA, pb: PB, production: F) 91 | where 92 | S: Into + AsRef, 93 | V: NodePayload + 'static, 94 | StashValue: StashIndexable + From + 'static, 95 | F: for<'a> Fn( 96 | &RuleProductionArg<'a, PA::M>, 97 | &RuleProductionArg<'a, PB::M>, 98 | ) -> RuleResult 99 | + 'static 100 | + Send 101 | + Sync, 102 | PA: TerminalPattern + 'static, 103 | PB: TerminalPattern + 'static, 104 | { 105 | let sym = self.sym(sym); 106 | self.terminal_rules 107 | .borrow_mut() 108 | .push(Box::new(Rule2::new(sym, (pa, pb), production))) 109 | } 110 | 111 | pub fn rule_3(&self, sym: S, pa: PA, pb: PB, pc: PC, production: F) 112 | where 113 | S: Into + AsRef, 114 | V: NodePayload + 'static, 115 | StashValue: StashIndexable + 
From + 'static, 116 | F: for<'a> Fn( 117 | &RuleProductionArg<'a, PA::M>, 118 | &RuleProductionArg<'a, PB::M>, 119 | &RuleProductionArg<'a, PC::M>, 120 | ) -> RuleResult 121 | + 'static 122 | + Send 123 | + Sync, 124 | PA: Pattern + 'static, 125 | PB: Pattern + 'static, 126 | PC: Pattern + 'static, 127 | { 128 | let sym = self.sym(sym); 129 | self.composition_rules 130 | .borrow_mut() 131 | .push(Box::new(Rule3::new(sym, (pa, pb, pc), production))) 132 | } 133 | 134 | pub fn rule_3_terminal( 135 | &self, 136 | sym: S, 137 | pa: PA, 138 | pb: PB, 139 | pc: PC, 140 | production: F, 141 | ) where 142 | S: Into + AsRef, 143 | V: NodePayload + 'static, 144 | StashValue: StashIndexable + From + 'static, 145 | F: for<'a> Fn( 146 | &RuleProductionArg<'a, PA::M>, 147 | &RuleProductionArg<'a, PB::M>, 148 | &RuleProductionArg<'a, PC::M>, 149 | ) -> RuleResult 150 | + 'static 151 | + Send 152 | + Sync, 153 | PA: TerminalPattern + 'static, 154 | PB: TerminalPattern + 'static, 155 | PC: TerminalPattern + 'static, 156 | { 157 | let sym = self.sym(sym); 158 | self.composition_rules 159 | .borrow_mut() 160 | .push(Box::new(Rule3::new(sym, (pa, pb, pc), production))) 161 | } 162 | 163 | pub fn rule_4( 164 | &self, 165 | sym: S, 166 | pa: PA, 167 | pb: PB, 168 | pc: PC, 169 | pd: PD, 170 | production: F, 171 | ) where 172 | S: Into + AsRef, 173 | V: NodePayload + 'static, 174 | StashValue: StashIndexable + From + 'static, 175 | F: for<'a> Fn( 176 | &RuleProductionArg<'a, PA::M>, 177 | &RuleProductionArg<'a, PB::M>, 178 | &RuleProductionArg<'a, PC::M>, 179 | &RuleProductionArg<'a, PD::M>, 180 | ) -> RuleResult 181 | + 'static 182 | + Send 183 | + Sync, 184 | PA: Pattern + 'static, 185 | PB: Pattern + 'static, 186 | PC: Pattern + 'static, 187 | PD: Pattern + 'static, 188 | { 189 | let sym = self.sym(sym); 190 | self.composition_rules 191 | .borrow_mut() 192 | .push(Box::new(Rule4::new(sym, (pa, pb, pc, pd), production))) 193 | } 194 | 195 | pub fn rule_4_terminal( 196 | &self, 197 | 
sym: S, 198 | pa: PA, 199 | pb: PB, 200 | pc: PC, 201 | pd: PD, 202 | production: F, 203 | ) where 204 | S: Into + AsRef, 205 | V: NodePayload + 'static, 206 | StashValue: StashIndexable + From + 'static, 207 | F: for<'a> Fn( 208 | &RuleProductionArg<'a, PA::M>, 209 | &RuleProductionArg<'a, PB::M>, 210 | &RuleProductionArg<'a, PC::M>, 211 | &RuleProductionArg<'a, PD::M>, 212 | ) -> RuleResult 213 | + 'static 214 | + Send 215 | + Sync, 216 | PA: TerminalPattern + 'static, 217 | PB: TerminalPattern + 'static, 218 | PC: TerminalPattern + 'static, 219 | PD: TerminalPattern + 'static, 220 | { 221 | let sym = self.sym(sym); 222 | self.composition_rules 223 | .borrow_mut() 224 | .push(Box::new(Rule4::new(sym, (pa, pb, pc, pd), production))) 225 | } 226 | 227 | pub fn rule_5( 228 | &self, 229 | sym: S, 230 | pa: PA, 231 | pb: PB, 232 | pc: PC, 233 | pd: PD, 234 | pe: PE, 235 | production: F, 236 | ) where 237 | S: Into + AsRef, 238 | V: NodePayload + 'static, 239 | StashValue: StashIndexable + From + 'static, 240 | F: for<'a> Fn( 241 | &RuleProductionArg<'a, PA::M>, 242 | &RuleProductionArg<'a, PB::M>, 243 | &RuleProductionArg<'a, PC::M>, 244 | &RuleProductionArg<'a, PD::M>, 245 | &RuleProductionArg<'a, PE::M>, 246 | ) -> RuleResult 247 | + 'static 248 | + Send 249 | + Sync, 250 | PA: Pattern + 'static, 251 | PB: Pattern + 'static, 252 | PC: Pattern + 'static, 253 | PD: Pattern + 'static, 254 | PE: Pattern + 'static, 255 | { 256 | let sym = self.sym(sym); 257 | self.composition_rules 258 | .borrow_mut() 259 | .push(Box::new(Rule5::new(sym, (pa, pb, pc, pd, pe), production))) 260 | } 261 | 262 | pub fn rule_5_terminal( 263 | &self, 264 | sym: S, 265 | pa: PA, 266 | pb: PB, 267 | pc: PC, 268 | pd: PD, 269 | pe: PE, 270 | production: F, 271 | ) where 272 | S: Into + AsRef, 273 | V: NodePayload + 'static, 274 | StashValue: StashIndexable + From + 'static, 275 | F: for<'a> Fn( 276 | &RuleProductionArg<'a, PA::M>, 277 | &RuleProductionArg<'a, PB::M>, 278 | &RuleProductionArg<'a, 
PC::M>, 279 | &RuleProductionArg<'a, PD::M>, 280 | &RuleProductionArg<'a, PE::M>, 281 | ) -> RuleResult 282 | + 'static 283 | + Send 284 | + Sync, 285 | PA: TerminalPattern + 'static, 286 | PB: TerminalPattern + 'static, 287 | PC: TerminalPattern + 'static, 288 | PD: TerminalPattern + 'static, 289 | PE: TerminalPattern + 'static, 290 | { 291 | let sym = self.sym(sym); 292 | self.composition_rules 293 | .borrow_mut() 294 | .push(Box::new(Rule5::new(sym, (pa, pb, pc, pd, pe), production))) 295 | } 296 | 297 | pub fn rule_6( 298 | &self, 299 | sym: S, 300 | pa: PA, 301 | pb: PB, 302 | pc: PC, 303 | pd: PD, 304 | pe: PE, 305 | pf: PF, 306 | production: F, 307 | ) where 308 | S: Into + AsRef, 309 | V: NodePayload + 'static, 310 | StashValue: StashIndexable + From + 'static, 311 | F: for<'a> Fn( 312 | &RuleProductionArg<'a, PA::M>, 313 | &RuleProductionArg<'a, PB::M>, 314 | &RuleProductionArg<'a, PC::M>, 315 | &RuleProductionArg<'a, PD::M>, 316 | &RuleProductionArg<'a, PE::M>, 317 | &RuleProductionArg<'a, PF::M>, 318 | ) -> RuleResult 319 | + 'static 320 | + Send 321 | + Sync, 322 | PA: Pattern + 'static, 323 | PB: Pattern + 'static, 324 | PC: Pattern + 'static, 325 | PD: Pattern + 'static, 326 | PE: Pattern + 'static, 327 | PF: Pattern + 'static, 328 | { 329 | let sym = self.sym(sym); 330 | self.composition_rules 331 | .borrow_mut() 332 | .push(Box::new(Rule6::new( 333 | sym, 334 | (pa, pb, pc, pd, pe, pf), 335 | production, 336 | ))) 337 | } 338 | 339 | pub fn rule_6_terminal( 340 | &self, 341 | sym: S, 342 | pa: PA, 343 | pb: PB, 344 | pc: PC, 345 | pd: PD, 346 | pe: PE, 347 | pf: PF, 348 | production: F, 349 | ) where 350 | S: Into + AsRef, 351 | V: NodePayload + 'static, 352 | StashValue: StashIndexable + From + 'static, 353 | F: for<'a> Fn( 354 | &RuleProductionArg<'a, PA::M>, 355 | &RuleProductionArg<'a, PB::M>, 356 | &RuleProductionArg<'a, PC::M>, 357 | &RuleProductionArg<'a, PD::M>, 358 | &RuleProductionArg<'a, PE::M>, 359 | &RuleProductionArg<'a, PF::M>, 360 | 
) -> RuleResult 361 | + 'static 362 | + Send 363 | + Sync, 364 | PA: TerminalPattern + 'static, 365 | PB: TerminalPattern + 'static, 366 | PC: TerminalPattern + 'static, 367 | PD: TerminalPattern + 'static, 368 | PE: TerminalPattern + 'static, 369 | PF: TerminalPattern + 'static, 370 | { 371 | let sym = self.sym(sym); 372 | self.composition_rules 373 | .borrow_mut() 374 | .push(Box::new(Rule6::new( 375 | sym, 376 | (pa, pb, pc, pd, pe, pf), 377 | production, 378 | ))) 379 | } 380 | 381 | pub fn reg(&self, regex: &str) -> CoreResult> { 382 | Ok(pattern::TextPattern::new( 383 | ::regex::Regex::new(regex)?, 384 | self.sym(regex), 385 | self.word_boundaries.clone(), 386 | )) 387 | } 388 | 389 | pub fn reg_neg_lh( 390 | &self, 391 | regex: &str, 392 | neg_lh: &str, 393 | ) -> CoreResult> { 394 | Ok(pattern::TextNegLHPattern::new( 395 | ::regex::Regex::new(regex)?, 396 | ::regex::Regex::new(neg_lh)?, 397 | self.sym(format!("{}(?:{})", regex, neg_lh)), 398 | self.word_boundaries.clone(), 399 | )) 400 | } 401 | 402 | pub fn build(self) -> RuleSet { 403 | RuleSet { 404 | symbols: self.symbols.into_inner(), 405 | terminal_rules: self.terminal_rules.into_inner(), 406 | composition_rules: self.composition_rules.into_inner(), 407 | match_boundaries: self.match_boundaries, 408 | } 409 | } 410 | } 411 | -------------------------------------------------------------------------------- /core/src/helpers.rs: -------------------------------------------------------------------------------- 1 | use crate::range::Range; 2 | 3 | #[derive(Copy, Clone, Debug, PartialEq)] 4 | enum BoundariesClass { 5 | AlphanumericWord { option: ValidBoundariesOption }, 6 | AlphabeticWord { option: ValidBoundariesOption }, 7 | Detailed { option: ValidBoundariesOption }, 8 | NoClass, 9 | } 10 | 11 | impl BoundariesClass { 12 | fn apply_left(&self, sentence: &str, range: &Range) -> bool { 13 | match self { 14 | &BoundariesClass::AlphanumericWord { option } => { 15 | left_valid_boundaries(sentence, range, 
&option, &alphanumeric_class) 16 | } 17 | &BoundariesClass::AlphabeticWord { option } => { 18 | left_valid_boundaries(sentence, range, &option, &alphabetic_class) 19 | } 20 | &BoundariesClass::Detailed { option } => { 21 | left_valid_boundaries(sentence, range, &option, &detailed_class) 22 | } 23 | &BoundariesClass::NoClass => true, 24 | } 25 | } 26 | fn apply_right(&self, sentence: &str, range: &Range) -> bool { 27 | match self { 28 | &BoundariesClass::AlphanumericWord { option } => { 29 | right_valid_boundaries(sentence, range, &option, &alphanumeric_class) 30 | } 31 | &BoundariesClass::AlphabeticWord { option } => { 32 | right_valid_boundaries(sentence, range, &option, &alphabetic_class) 33 | } 34 | &BoundariesClass::Detailed { option } => { 35 | right_valid_boundaries(sentence, range, &option, &detailed_class) 36 | } 37 | &BoundariesClass::NoClass => true, 38 | } 39 | } 40 | } 41 | 42 | #[derive(Clone, Debug, PartialEq)] 43 | pub struct BoundariesChecker(Vec); 44 | 45 | impl BoundariesChecker { 46 | pub fn check(&self, sentence: &str, range: Range) -> bool { 47 | self.0.iter().any(|c| c.apply_left(sentence, &range)) 48 | && self.0.iter().any(|c| c.apply_right(sentence, &range)) 49 | } 50 | 51 | pub fn separated_alphanumeric_word() -> BoundariesChecker { 52 | BoundariesChecker(vec![BoundariesClass::AlphanumericWord { 53 | option: ValidBoundariesOption::OnCharClassChange, 54 | }]) 55 | } 56 | 57 | pub fn detailed() -> BoundariesChecker { 58 | BoundariesChecker(vec![BoundariesClass::Detailed { 59 | option: ValidBoundariesOption::OnCharClassChange, 60 | }]) 61 | } 62 | 63 | pub fn composed_word_or_detailed() -> BoundariesChecker { 64 | BoundariesChecker(vec![ 65 | BoundariesClass::AlphabeticWord { 66 | option: ValidBoundariesOption::OnSameCharClass, 67 | }, 68 | BoundariesClass::Detailed { 69 | option: ValidBoundariesOption::OnCharClassChange, 70 | }, 71 | ]) 72 | } 73 | 74 | pub fn no_check() -> BoundariesChecker { 75 | 
BoundariesChecker(vec![BoundariesClass::NoClass]) 76 | } 77 | } 78 | 79 | #[derive(Copy, Clone, Debug, PartialEq)] 80 | enum ValidBoundariesOption { 81 | OnCharClassChange, 82 | OnSameCharClass, 83 | } 84 | 85 | fn alphabetic_class(c: char) -> char { 86 | if c.is_alphabetic() { 87 | 'A' 88 | } else { 89 | 'O' 90 | } 91 | } 92 | 93 | fn alphanumeric_class(c: char) -> char { 94 | if c.is_alphanumeric() { 95 | 'A' 96 | } else { 97 | c 98 | } 99 | } 100 | 101 | fn detailed_class(c: char) -> char { 102 | if c.is_uppercase() { 103 | 'u' 104 | } else if c.is_lowercase() { 105 | 'l' 106 | } else if c.is_digit(10) { 107 | 'd' 108 | } else { 109 | c 110 | } 111 | } 112 | 113 | fn right_valid_boundaries( 114 | sentence: &str, 115 | range: &Range, 116 | option: &ValidBoundariesOption, 117 | char_class: &CharClass, 118 | ) -> bool 119 | where 120 | CharClass: Fn(char) -> char, 121 | { 122 | let last_mine = sentence[range.0..range.1] 123 | .chars() 124 | .next_back() 125 | .map(char_class); //Some(c) 126 | let first_after = sentence[range.1..].chars().next().map(char_class); // Option(c) 127 | 128 | match option { 129 | &ValidBoundariesOption::OnCharClassChange => last_mine != first_after, 130 | &ValidBoundariesOption::OnSameCharClass => first_after == None || last_mine == first_after, 131 | } 132 | } 133 | 134 | fn left_valid_boundaries( 135 | sentence: &str, 136 | range: &Range, 137 | option: &ValidBoundariesOption, 138 | char_class: &CharClass, 139 | ) -> bool 140 | where 141 | CharClass: Fn(char) -> char, 142 | { 143 | let first_mine = sentence[range.0..range.1].chars().next().map(char_class); // Some(c) 144 | let last_before = sentence[..range.0].chars().next_back().map(char_class); // Option(c) 145 | 146 | match option { 147 | &ValidBoundariesOption::OnCharClassChange => first_mine != last_before, 148 | &ValidBoundariesOption::OnSameCharClass => first_mine == None || first_mine == last_before, 149 | } 150 | } 151 | 152 | #[cfg(test)] 153 | mod tests { 154 | use super::*; 
155 | 156 | #[test] 157 | fn test_valid_boundaries_alphanumeric() { 158 | let checker = BoundariesChecker::separated_alphanumeric_word(); 159 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 160 | assert_eq!(false, checker.check("abc def ret", Range(2, 8))); // "c def r" 161 | assert_eq!(false, checker.check("abc def123 ret", Range(4, 7))); // "def" 162 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 163 | assert_eq!(false, checker.check("def123 ret", Range(0, 3))); // "def" 164 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 165 | assert_eq!(false, checker.check("ret 123def", Range(7, 10))); // "def" 166 | assert_eq!(false, checker.check("aéc def ret", Range(3, 9))); // "c def r" 167 | assert_eq!(false, checker.check("aec def rét", Range(2, 8))); // "c def r" 168 | assert_eq!(false, checker.check("aec déf ret", Range(2, 9))); // "c déf r" 169 | assert_eq!(false, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 170 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 171 | } 172 | 173 | #[test] 174 | fn test_valid_boundaries_composed_word_or_detailed() { 175 | let checker = BoundariesChecker::composed_word_or_detailed(); 176 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 177 | assert_eq!(true, checker.check("abc def ret", Range(2, 8))); // "c def r" 178 | assert_eq!(true, checker.check("abc def123 ret", Range(4, 7))); // "def" 179 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 180 | assert_eq!(true, checker.check("def123 ret", Range(0, 3))); // "def" 181 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 182 | assert_eq!(true, checker.check("ret 123def", Range(7, 10))); // "def" 183 | assert_eq!(true, checker.check("aéc def ret", Range(3, 9))); // "c def r" 184 | assert_eq!(true, checker.check("aec def rét", Range(2, 8))); // "c def r" 185 | assert_eq!(true, checker.check("aec déf ret", Range(2, 9))); 
// "c déf r" 186 | assert_eq!(true, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 187 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 188 | } 189 | 190 | #[test] 191 | fn test_valid_boundaries_detailed() { 192 | let checker = BoundariesChecker::detailed(); 193 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 194 | assert_eq!(false, checker.check("abc def ret", Range(2, 8))); // "c def r" 195 | assert_eq!(true, checker.check("abc def123 ret", Range(4, 7))); // "def" 196 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 197 | assert_eq!(true, checker.check("def123 ret", Range(0, 3))); // "def" 198 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 199 | assert_eq!(true, checker.check("ret 123def", Range(7, 10))); // "def" 200 | assert_eq!(false, checker.check("aéc def ret", Range(3, 9))); // "c def r" 201 | assert_eq!(false, checker.check("aec def rét", Range(2, 8))); // "c def r" 202 | assert_eq!(false, checker.check("aec déf ret", Range(2, 9))); // "c déf r" 203 | assert_eq!(false, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 204 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 205 | } 206 | 207 | #[test] 208 | fn test_valid_boundaries_no_check() { 209 | let checker = BoundariesChecker::no_check(); 210 | assert_eq!(true, checker.check("abc def ret", Range(4, 7))); // "def" 211 | assert_eq!(true, checker.check("abc def ret", Range(2, 8))); // "c def r" 212 | assert_eq!(true, checker.check("abc def123 ret", Range(4, 7))); // "def" 213 | assert_eq!(true, checker.check("def123 ret", Range(0, 6))); // "def123" 214 | assert_eq!(true, checker.check("def123 ret", Range(0, 3))); // "def" 215 | assert_eq!(true, checker.check("ret def", Range(4, 7))); // "def" 216 | assert_eq!(true, checker.check("ret 123def", Range(7, 10))); // "def" 217 | assert_eq!(true, checker.check("aéc def ret", Range(3, 9))); // "c def r" 218 | assert_eq!(true, 
checker.check("aec def rét", Range(2, 8))); // "c def r" 219 | assert_eq!(true, checker.check("aec déf ret", Range(2, 9))); // "c déf r" 220 | assert_eq!(true, checker.check("aeç def ret", Range(2, 9))); // "ç def r" 221 | assert_eq!(true, checker.check("aeç def ret", Range(4, 8))); // " def " 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /core/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate failure; 3 | pub extern crate regex; 4 | extern crate smallvec; 5 | extern crate string_interner; 6 | 7 | mod builder; 8 | mod helpers; 9 | pub mod pattern; 10 | mod range; 11 | pub mod rule; 12 | mod stash; 13 | 14 | pub use builder::RuleSetBuilder; 15 | pub use helpers::BoundariesChecker; 16 | use pattern::Pattern; 17 | use pattern::TerminalPattern; 18 | pub use range::Range; 19 | use rule::Rule; 20 | use rule::TerminalRule; 21 | pub use rule::{RuleError, RuleResult}; 22 | use serde::{Deserialize, Serialize}; 23 | use smallvec::SmallVec; 24 | use stash::Stash; 25 | pub use stash::{InnerStashIndexable, StashIndexable}; 26 | use std::collections::HashSet; 27 | use std::fmt::Debug; 28 | use std::{cell, rc}; 29 | use string_interner::StringInterner; 30 | 31 | pub type CoreResult = Result; 32 | 33 | pub trait AttemptFrom: Sized { 34 | fn attempt_from(v: V) -> Option; 35 | } 36 | 37 | pub trait AttemptInto: Sized { 38 | fn attempt_into(self) -> Option; 39 | } 40 | 41 | impl AttemptInto for S 42 | where 43 | S: Clone, 44 | T: AttemptFrom, 45 | { 46 | fn attempt_into(self) -> Option { 47 | T::attempt_from(self) 48 | } 49 | } 50 | 51 | pub trait NodePayload: Clone { 52 | type Payload: Clone + PartialEq + Debug; 53 | fn extract_payload(&self) -> Option; 54 | } 55 | 56 | pub type ChildrenNodes = SmallVec<[rc::Rc>; 2]>; 57 | 58 | #[derive(Copy, Ord, Eq, Clone, PartialEq, PartialOrd, Debug, Hash, Serialize, Deserialize)] 59 | pub struct Sym(usize); 60 | 
61 | impl string_interner::Symbol for Sym { 62 | fn from_usize(val: usize) -> Self { 63 | Sym(val) 64 | } 65 | 66 | fn to_usize(self) -> usize { 67 | self.0 68 | } 69 | } 70 | 71 | impl From for Sym { 72 | fn from(it: usize) -> Sym { 73 | Sym(it) 74 | } 75 | } 76 | impl From for usize { 77 | fn from(it: Sym) -> usize { 78 | it.0 79 | } 80 | } 81 | 82 | pub struct SymbolTable(StringInterner); 83 | 84 | impl Default for SymbolTable { 85 | fn default() -> SymbolTable { 86 | SymbolTable(string_interner::StringInterner::new()) 87 | } 88 | } 89 | 90 | impl SymbolTable { 91 | pub fn sym(&mut self, val: T) -> Sym 92 | where 93 | T: Into + AsRef, 94 | { 95 | self.0.get_or_intern(val) 96 | } 97 | } 98 | 99 | #[derive(Debug, PartialEq, Clone, Hash, Eq, Copy)] 100 | pub enum ParsingStatus { 101 | Continue, 102 | Exit, 103 | } 104 | 105 | impl ParsingStatus { 106 | pub fn is_exit(&self) -> bool { 107 | match self { 108 | &ParsingStatus::Exit => true, 109 | _ => false, 110 | } 111 | } 112 | 113 | pub fn is_continue(&self) -> bool { 114 | match self { 115 | &ParsingStatus::Continue => true, 116 | _ => false, 117 | } 118 | } 119 | } 120 | 121 | #[derive(Debug, PartialEq, Clone, Hash, Eq)] 122 | pub struct Node { 123 | pub rule_sym: Sym, 124 | pub byte_range: Range, 125 | pub payload: Option, 126 | pub children: ChildrenNodes, 127 | } 128 | 129 | impl Node { 130 | fn new( 131 | sym: Sym, 132 | byte_range: Range, 133 | payload: Option, 134 | children: ChildrenNodes, 135 | ) -> rc::Rc> { 136 | rc::Rc::new(Node { 137 | rule_sym: sym, 138 | byte_range, 139 | payload, 140 | children, 141 | }) 142 | } 143 | 144 | pub fn height(&self) -> usize { 145 | 1 + self.children.iter().map(|c| c.height()).max().unwrap_or(0) 146 | } 147 | 148 | pub fn num_nodes(&self) -> usize { 149 | let num_children: usize = self.children.iter().map(|c| c.num_nodes()).sum(); 150 | num_children + 1 151 | } 152 | 153 | pub fn all_syms(&self) -> HashSet<&Sym> { 154 | let mut hash_set = HashSet::new(); 155 | 
hash_set.insert(&self.rule_sym); 156 | for child in self.children.iter() { 157 | for sym in child.all_syms().into_iter() { 158 | hash_set.insert(sym); 159 | } 160 | } 161 | hash_set 162 | } 163 | } 164 | 165 | #[derive(Debug, PartialEq, Clone)] 166 | pub struct ParsedNode { 167 | pub root_node: rc::Rc>, 168 | pub value: V, 169 | } 170 | 171 | impl ParsedNode { 172 | fn new( 173 | sym: Sym, 174 | v: V, 175 | r: Range, 176 | payload: Option, 177 | children: ChildrenNodes, 178 | ) -> ParsedNode { 179 | ParsedNode { 180 | root_node: Node::new(sym, r, payload, children), 181 | value: v, 182 | } 183 | } 184 | } 185 | 186 | pub struct RuleSet { 187 | symbols: SymbolTable, 188 | composition_rules: Vec>>, 189 | terminal_rules: Vec>>, 190 | match_boundaries: BoundariesChecker, 191 | } 192 | 193 | impl RuleSet { 194 | fn apply_terminal_rules( 195 | &self, 196 | stash: &mut Stash, 197 | sentence: &str, 198 | ) -> CoreResult<()> { 199 | let mut produced_nodes = vec![]; 200 | for rule in &self.terminal_rules { 201 | produced_nodes.extend(rule.apply(stash, sentence)?.nodes); 202 | } 203 | stash.extend(produced_nodes); 204 | Ok(()) 205 | } 206 | 207 | fn apply_composition_rules( 208 | &self, 209 | stash: &mut Stash, 210 | sentence: &str, 211 | rules_mask_status: &mut Vec, 212 | ) -> CoreResult<()> { 213 | let mut produced_nodes = vec![]; 214 | for (idx, rule) in self.composition_rules.iter().enumerate() { 215 | if rules_mask_status[idx].is_continue() { 216 | let output = rule.apply(stash, sentence)?; 217 | rules_mask_status[idx] = output.status; 218 | produced_nodes.extend(output.nodes); 219 | } 220 | } 221 | stash.extend(produced_nodes); 222 | Ok(()) 223 | } 224 | 225 | pub fn apply_all(&self, sentence: &str) -> CoreResult>> { 226 | let iterations_max = 10; 227 | let max_stash_size = 600; 228 | let mut stash = Stash::default(); 229 | 230 | self.apply_terminal_rules(&mut stash, sentence)?; 231 | let mut previous_stash_size = stash.len(); 232 | 233 | let mut rules_mask_status = 
vec![ParsingStatus::Continue; self.composition_rules.len()]; 234 | 235 | for _ in 0..iterations_max { 236 | self.apply_composition_rules(&mut stash, sentence, &mut rules_mask_status)?; 237 | if stash.len() <= previous_stash_size || stash.len() > max_stash_size { 238 | break; 239 | } 240 | previous_stash_size = stash.len(); 241 | } 242 | Ok(stash 243 | .into_iter() 244 | .filter(|pn| { 245 | self.match_boundaries 246 | .check(sentence, pn.root_node.byte_range) 247 | }) 248 | .collect()) 249 | } 250 | 251 | pub fn resolve_sym(&self, sym: &Sym) -> Option<&str> { 252 | self.symbols.0.resolve(*sym) 253 | } 254 | 255 | pub fn all_syms(&self) -> Vec { 256 | self.symbols.0.iter().map(|s| s.0).collect() 257 | } 258 | 259 | pub fn rules_syms(&self) -> Vec { 260 | self.composition_rules 261 | .iter() 262 | .map(|r| r.rule_sym()) 263 | .chain(self.terminal_rules.iter().map(|r| r.rule_sym())) 264 | .collect() 265 | } 266 | } 267 | 268 | #[derive(Copy, Clone, Debug, PartialEq)] 269 | pub struct SendSyncPhantomData(::std::marker::PhantomData); 270 | unsafe impl Send for SendSyncPhantomData {} 271 | unsafe impl Sync for SendSyncPhantomData {} 272 | impl SendSyncPhantomData { 273 | pub fn new() -> SendSyncPhantomData { 274 | SendSyncPhantomData(::std::marker::PhantomData) 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /core/src/pattern.rs: -------------------------------------------------------------------------------- 1 | use crate::helpers::BoundariesChecker; 2 | use crate::range::Range; 3 | use crate::{ 4 | AttemptFrom, CoreResult, InnerStashIndexable, Node, NodePayload, ParsedNode, ParsingStatus, 5 | SendSyncPhantomData, Stash, StashIndexable, Sym, 6 | }; 7 | use smallvec::SmallVec; 8 | use std::rc; 9 | use std::slice::Iter; 10 | use std::vec::IntoIter; 11 | 12 | pub trait Match: Clone { 13 | type NV: Clone; 14 | fn byte_range(&self) -> Range; 15 | fn to_node(&self) -> rc::Rc>; 16 | } 17 | 18 | impl Match for ParsedNode { 
19 | type NV = V::Payload; 20 | fn byte_range(&self) -> Range { 21 | self.root_node.byte_range 22 | } 23 | 24 | fn to_node(&self) -> rc::Rc> { 25 | self.root_node.clone() 26 | } 27 | } 28 | 29 | #[derive(Clone, Debug, PartialEq)] 30 | pub struct Text { 31 | pub groups: SmallVec<[Range; 4]>, 32 | pub byte_range: Range, 33 | pattern_sym: Sym, 34 | _phantom: SendSyncPhantomData, 35 | } 36 | 37 | impl Text { 38 | pub fn new(groups: SmallVec<[Range; 4]>, byte_range: Range, pattern_sym: Sym) -> Text { 39 | Text { 40 | groups, 41 | byte_range, 42 | pattern_sym, 43 | _phantom: SendSyncPhantomData::new(), 44 | } 45 | } 46 | } 47 | 48 | impl Match for Text { 49 | type NV = V::Payload; 50 | fn byte_range(&self) -> Range { 51 | self.byte_range 52 | } 53 | 54 | fn to_node(&self) -> rc::Rc> { 55 | rc::Rc::new(Node { 56 | rule_sym: self.pattern_sym, 57 | byte_range: self.byte_range(), 58 | payload: None, 59 | children: SmallVec::new(), 60 | }) 61 | } 62 | } 63 | 64 | pub struct PredicateMatches { 65 | pub matches: Vec, 66 | pub status: ParsingStatus, 67 | } 68 | 69 | impl PredicateMatches { 70 | pub fn with_status(status: ParsingStatus) -> PredicateMatches { 71 | PredicateMatches { 72 | matches: vec![], 73 | status, 74 | } 75 | } 76 | 77 | pub fn continue_with(matches: Vec) -> PredicateMatches { 78 | PredicateMatches { 79 | matches, 80 | status: ParsingStatus::Continue, 81 | } 82 | } 83 | 84 | pub fn exit_if_empty(self) -> PredicateMatches { 85 | if self.matches.len() == 0 { 86 | PredicateMatches::with_status(ParsingStatus::Exit) 87 | } else { 88 | self 89 | } 90 | } 91 | 92 | pub fn push(&mut self, match_: M) { 93 | self.matches.push(match_) 94 | } 95 | 96 | pub fn is_empty(&self) -> bool { 97 | self.matches.is_empty() 98 | } 99 | 100 | pub fn len(&self) -> usize { 101 | self.matches.len() 102 | } 103 | 104 | pub fn iter(&self) -> Iter { 105 | self.matches.iter() 106 | } 107 | 108 | pub fn into_iter(self) -> IntoIter { 109 | self.matches.into_iter() 110 | } 111 | } 112 | 113 | 
pub trait Pattern: Send + Sync { 114 | type M: Match; 115 | fn predicate( 116 | &self, 117 | stash: &Stash, 118 | sentence: &str, 119 | ) -> CoreResult>; 120 | } 121 | 122 | pub trait TerminalPattern: 123 | Pattern> 124 | { 125 | } 126 | 127 | pub struct TextPattern { 128 | pattern: ::regex::Regex, 129 | pattern_sym: Sym, 130 | boundaries_checker: BoundariesChecker, 131 | _phantom: SendSyncPhantomData, 132 | } 133 | 134 | impl TextPattern { 135 | pub fn new( 136 | regex: ::regex::Regex, 137 | sym: Sym, 138 | boundaries_checker: BoundariesChecker, 139 | ) -> TextPattern { 140 | TextPattern { 141 | pattern: regex, 142 | pattern_sym: sym, 143 | boundaries_checker, 144 | _phantom: SendSyncPhantomData::new(), 145 | } 146 | } 147 | } 148 | 149 | impl Pattern for TextPattern { 150 | type M = Text; 151 | fn predicate( 152 | &self, 153 | _stash: &Stash, 154 | sentence: &str, 155 | ) -> CoreResult> { 156 | let mut results = PredicateMatches::with_status(ParsingStatus::Continue); 157 | for cap in self.pattern.captures_iter(&sentence) { 158 | let full = cap.get(0).ok_or_else(|| { 159 | format_err!( 160 | "No capture for regexp {} in rule {:?} for sentence: {}", 161 | self.pattern, 162 | self.pattern_sym, 163 | sentence 164 | ) 165 | })?; 166 | let full_range = Range(full.start(), full.end()); 167 | if !self.boundaries_checker.check(sentence, full_range) { 168 | continue; 169 | } 170 | let mut groups = SmallVec::new(); 171 | for (ix, group) in cap.iter().enumerate() { 172 | let group = group.ok_or_else(|| { 173 | format_err!( 174 | "No capture for regexp {} in rule {:?}, group number {} in \ 175 | capture: {}", 176 | self.pattern, 177 | self.pattern_sym, 178 | ix, 179 | full.as_str() 180 | ) 181 | })?; 182 | let range = Range(group.start(), group.end()); 183 | groups.push(range); 184 | } 185 | results.push(Text { 186 | groups, 187 | byte_range: full_range, 188 | pattern_sym: self.pattern_sym, 189 | _phantom: SendSyncPhantomData::new(), 190 | }) 191 | } 192 | 193 | 
Ok(results.exit_if_empty()) 194 | } 195 | } 196 | 197 | impl TerminalPattern 198 | for TextPattern 199 | { 200 | } 201 | 202 | pub struct TextNegLHPattern { 203 | pattern: ::regex::Regex, 204 | neg_look_ahead: ::regex::Regex, 205 | boundaries_checker: BoundariesChecker, 206 | pattern_sym: Sym, 207 | _phantom: SendSyncPhantomData, 208 | } 209 | 210 | impl TextNegLHPattern { 211 | pub fn new( 212 | pattern: ::regex::Regex, 213 | neg_look_ahead: ::regex::Regex, 214 | pattern_sym: Sym, 215 | boundaries_checker: BoundariesChecker, 216 | ) -> TextNegLHPattern { 217 | TextNegLHPattern { 218 | pattern, 219 | neg_look_ahead, 220 | pattern_sym, 221 | boundaries_checker, 222 | _phantom: SendSyncPhantomData::new(), 223 | } 224 | } 225 | } 226 | 227 | impl Pattern 228 | for TextNegLHPattern 229 | { 230 | type M = Text; 231 | fn predicate( 232 | &self, 233 | _stash: &Stash, 234 | sentence: &str, 235 | ) -> CoreResult>> { 236 | let mut results = PredicateMatches::with_status(ParsingStatus::Continue); 237 | for cap in self.pattern.captures_iter(&sentence) { 238 | let full = cap.get(0).ok_or_else(|| { 239 | format_err!( 240 | "No capture for regexp {} in rule {:?} for sentence: {}", 241 | self.pattern, 242 | self.pattern_sym, 243 | sentence 244 | ) 245 | })?; 246 | let full_range = Range(full.start(), full.end()); 247 | if !self.boundaries_checker.check(sentence, full_range) { 248 | continue; 249 | } 250 | if let Some(mat) = self.neg_look_ahead.find(&sentence[full.end()..]) { 251 | if mat.start() == 0 { 252 | continue; 253 | } 254 | } 255 | let mut groups = SmallVec::new(); 256 | for (ix, group) in cap.iter().enumerate() { 257 | let group = group.ok_or_else(|| { 258 | format_err!( 259 | "No capture for regexp {} in rule {:?}, group number {} in \ 260 | capture: {}", 261 | self.pattern, 262 | self.pattern_sym, 263 | ix, 264 | full.as_str() 265 | ) 266 | })?; 267 | let range = Range(group.start(), group.end()); 268 | groups.push(range); 269 | } 270 | results.push(Text { 271 | groups, 
272 | byte_range: full_range, 273 | pattern_sym: self.pattern_sym, 274 | _phantom: SendSyncPhantomData::new(), 275 | }) 276 | } 277 | 278 | Ok(results.exit_if_empty()) 279 | } 280 | } 281 | 282 | impl TerminalPattern 283 | for TextNegLHPattern 284 | { 285 | } 286 | 287 | pub type AnyNodePattern = FilterNodePattern; 288 | 289 | pub struct FilterNodePattern 290 | where 291 | V: NodePayload + InnerStashIndexable, 292 | { 293 | predicates: Vec bool + Send + Sync>>, 294 | _phantom: SendSyncPhantomData, 295 | } 296 | 297 | impl AnyNodePattern { 298 | pub fn new() -> AnyNodePattern { 299 | FilterNodePattern { 300 | predicates: vec![], 301 | _phantom: SendSyncPhantomData::new(), 302 | } 303 | } 304 | } 305 | 306 | impl FilterNodePattern 307 | where 308 | V: NodePayload + InnerStashIndexable, 309 | { 310 | pub fn filter(predicates: Vec bool + Sync + Send>>) -> FilterNodePattern { 311 | FilterNodePattern { 312 | predicates, 313 | _phantom: SendSyncPhantomData::new(), 314 | } 315 | } 316 | } 317 | 318 | impl Pattern for FilterNodePattern 319 | where 320 | StashValue: NodePayload + StashIndexable, 321 | V: NodePayload 322 | + InnerStashIndexable 323 | + AttemptFrom, 324 | { 325 | type M = ParsedNode; 326 | fn predicate( 327 | &self, 328 | stash: &Stash, 329 | _sentence: &str, 330 | ) -> CoreResult>> { 331 | Ok(PredicateMatches::continue_with(stash.filter(|v| { 332 | self.predicates.iter().all(|predicate| (predicate)(&v)) 333 | }))) 334 | } 335 | } 336 | 337 | #[cfg(test)] 338 | mod tests { 339 | use super::*; 340 | 341 | macro_rules! 
svec4 { 342 | ($($item:expr),*) => { { 343 | let mut v = ::smallvec::SmallVec::<[_;4]>::new(); 344 | $( v.push($item); )* 345 | v 346 | } 347 | } 348 | } 349 | 350 | #[test] 351 | fn test_regex_separated_string() { 352 | let stash = Stash::default(); 353 | let checker = BoundariesChecker::detailed(); 354 | let pat: TextPattern = 355 | TextPattern::new(::regex::Regex::new("a+").unwrap(), Sym(0), checker); 356 | assert_eq!( 357 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 358 | pat.predicate(&stash, "aaa").unwrap().matches 359 | ); 360 | assert_eq!( 361 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 362 | pat.predicate(&stash, "aaa bbb").unwrap().matches 363 | ); 364 | assert_eq!( 365 | vec![Text::new(svec4!(Range(4, 7)), Range(4, 7), Sym(0))], 366 | pat.predicate(&stash, "bbb aaa").unwrap().matches 367 | ); 368 | assert_eq!( 369 | Vec::>::new(), 370 | pat.predicate(&stash, "baaa").unwrap().matches 371 | ); 372 | assert_eq!( 373 | Vec::>::new(), 374 | pat.predicate(&stash, "aaab").unwrap().matches 375 | ); 376 | assert_eq!( 377 | Vec::>::new(), 378 | pat.predicate(&stash, "aaaé").unwrap().matches 379 | ); 380 | assert_eq!( 381 | Vec::>::new(), 382 | pat.predicate(&stash, "éaaa").unwrap().matches 383 | ); 384 | assert_eq!( 385 | vec![Text::new(svec4!(Range(1, 4)), Range(1, 4), Sym(0))], 386 | pat.predicate(&stash, "1aaa").unwrap().matches 387 | ); 388 | assert_eq!( 389 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 390 | pat.predicate(&stash, "aaa1").unwrap().matches 391 | ); 392 | assert_eq!( 393 | vec![Text::new(svec4!(Range(0, 3)), Range(0, 3), Sym(0))], 394 | pat.predicate(&stash, "aaa-toto").unwrap().matches 395 | ); 396 | } 397 | } 398 | -------------------------------------------------------------------------------- /core/src/range.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::{Ordering, PartialOrd}; 2 | 3 | /// Represent a semi-inclusive range of position, in 
/// bytes, in the matched sentence.
///
/// `Range(start, end)` denotes the half-open byte span `[start, end)` of a
/// match inside the input sentence.
#[derive(PartialEq, Clone, Debug, Copy, Hash, Eq)]
pub struct Range(pub usize, pub usize);

impl Range {
    /// True when the two ranges strictly overlap without either containing
    /// the other (containment/equality are ordered by `partial_cmp` and are
    /// therefore not "intersections" here).
    ///
    /// Ranges are half-open, so merely touching ranges such as `[0, 3)` and
    /// `[3, 5)` do NOT intersect. This uses strict `>` where the original
    /// used `>=`: with `>=`, touching ranges were simultaneously reported as
    /// intersecting here and as disjoint by `is_disjoint`, a contradiction
    /// for semi-inclusive ranges.
    pub fn intersects(&self, other: &Self) -> bool {
        // `partial_cmp` is `None` exactly when neither range contains the
        // other and they are not equal; strict `>` excludes mere adjacency.
        self.partial_cmp(other).is_none() && (self.1 > other.0 && other.1 > self.0)
    }

    /// Converts this byte range into the equivalent character range of `string`.
    pub fn char_range(&self, string: &str) -> Range {
        Range(
            convert_char_index(string, self.0),
            convert_char_index(string, self.1),
        )
    }

    /// Converts this character range into the equivalent byte range of `string`.
    pub fn byte_range(&self, string: &str) -> Range {
        Range(
            convert_byte_index(string, self.0),
            convert_byte_index(string, self.1),
        )
    }

    /// Length of the range in bytes (`end - start`).
    pub fn len(&self) -> usize {
        self.1 - self.0
    }

    /// True when the ranges share no position; touching ranges are disjoint.
    pub fn is_disjoint(&self, other: &Self) -> bool {
        self.0 >= other.1 || other.0 >= self.1
    }
}

impl PartialOrd for Range {
    /// Partial order by inclusion: a range is `Greater` than any range it
    /// contains, `Less` than any range containing it, `Equal` to itself, and
    /// unordered (`None`) in every other case.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        if self == other {
            Some(Ordering::Equal)
        } else if self.0 <= other.0 && other.1 <= self.1 {
            Some(Ordering::Greater)
        } else if other.0 <= self.0 && self.1 <= other.1 {
            Some(Ordering::Less)
        } else {
            None
        }
    }
}

/// Converts a byte index of `string` into the matching character index.
/// An index past the end of `string` saturates to `string.chars().count()`;
/// an index falling inside a multi-byte character maps to the next character.
pub fn convert_char_index(string: &str, byte_index: usize) -> usize {
    if string.is_empty() {
        return 0;
    }
    let mut acc = 0;
    let mut last_char_index = 0;
    for (char_index, char) in string.chars().enumerate() {
        if byte_index <= acc {
            return char_index;
        }
        acc += char.len_utf8();
        last_char_index = char_index;
    }
    last_char_index + 1
}

/// Converts a character index of `string` into the matching byte index.
/// An index past the end of `string` saturates to `string.len()`.
pub fn convert_byte_index(string: &str, char_index: usize) -> usize {
    let mut result = 0;
    for (current_char_index, char) in string.chars().enumerate() {
        if current_char_index == char_index {
            return result;
        }
        result += char.len_utf8()
    }
    result
}
// --------------------------------------------------------------------------------
// /core/src/rule.rs:
-------------------------------------------------------------------------------- 1 | use crate::pattern::*; 2 | use crate::stash::Stash; 3 | use crate::{ 4 | CoreResult, NodePayload, ParsedNode, ParsingStatus, Range, SendSyncPhantomData, StashIndexable, 5 | Sym, 6 | }; 7 | use smallvec::SmallVec; 8 | 9 | #[derive(Debug, Fail)] 10 | pub enum RuleError { 11 | #[fail(display = "invalid rule")] 12 | Invalid, 13 | } 14 | 15 | pub type RuleResult = Result; 16 | 17 | macro_rules! svec { 18 | ($($item:expr),*) => { { 19 | let mut v =SmallVec::new(); 20 | $( v.push($item); )* 21 | v 22 | } 23 | } 24 | } 25 | 26 | #[derive(Debug)] 27 | pub struct RuleProductionArg<'a, M: Match + 'a> { 28 | sentence: &'a str, 29 | match_: &'a M, 30 | } 31 | 32 | impl<'a, M: Match> RuleProductionArg<'a, M> { 33 | pub fn new(sentence: &'a str, match_: &'a M) -> RuleProductionArg<'a, M> { 34 | RuleProductionArg { sentence, match_ } 35 | } 36 | } 37 | 38 | impl<'a, V: NodePayload> RuleProductionArg<'a, Text> { 39 | pub fn group(&self, ix: usize) -> &'a str { 40 | let g = self.match_.groups[ix]; 41 | &self.sentence[g.0..g.1] 42 | } 43 | 44 | pub fn num_groups(&self) -> usize { 45 | self.match_.groups.len() 46 | } 47 | } 48 | 49 | impl<'a, V: NodePayload> RuleProductionArg<'a, ParsedNode> { 50 | pub fn value(&self) -> &V { 51 | &self.match_.value 52 | } 53 | } 54 | 55 | fn adjacent(a: &A, b: &B, sentence: &str) -> bool { 56 | a.byte_range().1 <= b.byte_range().0 57 | && sentence[a.byte_range().1..b.byte_range().0] 58 | .chars() 59 | .all(|c| c.is_whitespace()) 60 | } 61 | 62 | #[derive(Debug, Clone)] 63 | pub struct RuleOutput { 64 | pub nodes: ParsedNodes, 65 | pub status: ParsingStatus, 66 | } 67 | 68 | impl RuleOutput { 69 | fn exit() -> RuleOutput { 70 | RuleOutput { 71 | nodes: ParsedNodes::new(), 72 | status: ParsingStatus::Exit, 73 | } 74 | } 75 | 76 | fn continue_with(nodes: ParsedNodes) -> RuleOutput { 77 | RuleOutput { 78 | nodes, 79 | status: ParsingStatus::Continue, 80 | } 81 | } 82 | } 
83 | 84 | type ParsedNodes = SmallVec<[ParsedNode; 1]>; 85 | 86 | pub trait Rule: Send + Sync { 87 | fn rule_sym(&self) -> Sym; 88 | fn apply( 89 | &self, 90 | stash: &Stash, 91 | sentence: &str, 92 | ) -> CoreResult>; 93 | } 94 | 95 | pub trait TerminalRule: Rule {} 96 | 97 | pub struct Rule1 98 | where 99 | V: NodePayload, 100 | StashValue: NodePayload + StashIndexable + From, 101 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult, 102 | PA: Pattern, 103 | { 104 | sym: Sym, 105 | pattern: PA, 106 | production: F, 107 | _phantom: SendSyncPhantomData<(V, StashValue)>, 108 | } 109 | 110 | impl Rule for Rule1 111 | where 112 | V: NodePayload, 113 | StashValue: NodePayload + StashIndexable + From, 114 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + Send + Sync, 115 | PA: Pattern, 116 | { 117 | fn rule_sym(&self) -> Sym { 118 | self.sym 119 | } 120 | 121 | fn apply( 122 | &self, 123 | stash: &Stash, 124 | sentence: &str, 125 | ) -> CoreResult> { 126 | let matches = self.matches(&stash, sentence)?; 127 | 128 | if matches.status.is_exit() { 129 | return Ok(RuleOutput::exit()); 130 | } 131 | 132 | let nodes: CoreResult<_> = matches 133 | .iter() 134 | .filter_map(|sub| { 135 | let nodes = svec![sub.to_node()]; 136 | if stash.iter().all(|old_node| { 137 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 138 | }) { 139 | match (self.production)(&RuleProductionArg::new(sentence, sub)) { 140 | Ok(v) => { 141 | let payload = v.extract_payload(); 142 | Some(Ok(ParsedNode::new( 143 | self.sym, 144 | v.into(), 145 | sub.byte_range(), 146 | payload, 147 | nodes, 148 | ))) 149 | } 150 | Err(e) => match e.downcast::() { 151 | Ok(RuleError::Invalid) => None, 152 | Err(e) => Some(Err(e)), 153 | }, 154 | } 155 | } else { 156 | None 157 | } 158 | }) 159 | .collect(); 160 | Ok(RuleOutput::continue_with(nodes?)) 161 | } 162 | } 163 | 164 | impl TerminalRule for Rule1 165 | where 166 | V: NodePayload, 167 | StashValue: NodePayload + 
StashIndexable + From, 168 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult + Send + Sync, 169 | PA: TerminalPattern, 170 | { 171 | } 172 | 173 | impl Rule1 174 | where 175 | V: NodePayload, 176 | StashValue: NodePayload + StashIndexable + From, 177 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>) -> RuleResult, 178 | PA: Pattern, 179 | { 180 | pub fn new(sym: Sym, pat: PA, prod: F) -> Rule1 { 181 | Rule1 { 182 | sym, 183 | pattern: pat, 184 | production: prod, 185 | _phantom: SendSyncPhantomData::new(), 186 | } 187 | } 188 | 189 | fn matches( 190 | &self, 191 | stash: &Stash, 192 | sentence: &str, 193 | ) -> CoreResult> { 194 | self.pattern.predicate(stash, sentence) 195 | } 196 | } 197 | 198 | pub struct Rule2 199 | where 200 | V: NodePayload, 201 | StashValue: NodePayload + StashIndexable + From, 202 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult, 203 | PA: Pattern, 204 | PB: Pattern, 205 | { 206 | sym: Sym, 207 | pattern: (PA, PB), 208 | production: F, 209 | _phantom: SendSyncPhantomData<(V, StashValue)>, 210 | } 211 | 212 | impl Rule for Rule2 213 | where 214 | V: NodePayload, 215 | StashValue: NodePayload + StashIndexable + From, 216 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult 217 | + Send 218 | + Sync, 219 | PA: Pattern, 220 | PB: Pattern, 221 | { 222 | fn rule_sym(&self) -> Sym { 223 | self.sym 224 | } 225 | 226 | fn apply( 227 | &self, 228 | stash: &Stash, 229 | sentence: &str, 230 | ) -> CoreResult> { 231 | let matches = self.matches(&stash, sentence)?; 232 | 233 | if matches.status.is_exit() { 234 | return Ok(RuleOutput::exit()); 235 | } 236 | 237 | let nodes: CoreResult<_> = matches 238 | .iter() 239 | .filter_map(|sub| { 240 | let nodes = svec![sub.0.to_node(), sub.1.to_node()]; 241 | if stash.iter().all(|old_node| { 242 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 243 | }) { 244 | let byte_range = 
Range(sub.0.byte_range().0, sub.1.byte_range().1); 245 | match (self.production)( 246 | &RuleProductionArg::new(sentence, &sub.0), 247 | &RuleProductionArg::new(sentence, &sub.1), 248 | ) { 249 | Ok(v) => { 250 | let payload = v.extract_payload(); 251 | Some(Ok(ParsedNode::new( 252 | self.sym, 253 | v.into(), 254 | byte_range, 255 | payload, 256 | nodes, 257 | ))) 258 | } 259 | Err(e) => match e.downcast::() { 260 | Ok(RuleError::Invalid) => None, 261 | Err(e) => Some(Err(e)), 262 | }, 263 | } 264 | } else { 265 | None 266 | } 267 | }) 268 | .collect(); 269 | Ok(RuleOutput::continue_with(nodes?)) 270 | } 271 | } 272 | 273 | impl TerminalRule for Rule2 274 | where 275 | V: NodePayload, 276 | StashValue: NodePayload + StashIndexable + From, 277 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult 278 | + Send 279 | + Sync, 280 | PA: TerminalPattern, 281 | PB: TerminalPattern, 282 | { 283 | } 284 | 285 | impl Rule2 286 | where 287 | V: NodePayload, 288 | StashValue: NodePayload + StashIndexable + From, 289 | F: for<'a> Fn(&RuleProductionArg<'a, PA::M>, &RuleProductionArg<'a, PB::M>) -> RuleResult 290 | + Send 291 | + Sync, 292 | PA: Pattern, 293 | PB: Pattern, 294 | { 295 | pub fn new(sym: Sym, pat: (PA, PB), prod: F) -> Rule2 { 296 | Rule2 { 297 | sym, 298 | pattern: pat, 299 | production: prod, 300 | _phantom: SendSyncPhantomData::new(), 301 | } 302 | } 303 | 304 | fn matches( 305 | &self, 306 | stash: &Stash, 307 | sentence: &str, 308 | ) -> CoreResult> { 309 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 310 | if matches_0.is_empty() { 311 | return Ok(PredicateMatches::with_status(matches_0.status)); 312 | } 313 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 314 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 315 | for m0 in matches_0.iter() { 316 | for m1 in matches_1.iter() { 317 | if adjacent(m0, m1, sentence) { 318 | result.push((m0.clone(), m1.clone())) 319 | } 320 
| } 321 | } 322 | Ok(result) 323 | } 324 | } 325 | 326 | pub struct Rule3 327 | where 328 | V: NodePayload, 329 | StashValue: NodePayload + StashIndexable + From, 330 | F: for<'a> Fn( 331 | &RuleProductionArg<'a, PA::M>, 332 | &RuleProductionArg<'a, PB::M>, 333 | &RuleProductionArg<'a, PC::M>, 334 | ) -> RuleResult 335 | + Send 336 | + Sync, 337 | PA: Pattern, 338 | PB: Pattern, 339 | PC: Pattern, 340 | { 341 | sym: Sym, 342 | pattern: (PA, PB, PC), 343 | production: F, 344 | _phantom: SendSyncPhantomData<(V, StashValue)>, 345 | } 346 | 347 | impl Rule for Rule3 348 | where 349 | V: NodePayload, 350 | StashValue: NodePayload + StashIndexable + From, 351 | F: for<'a> Fn( 352 | &RuleProductionArg<'a, PA::M>, 353 | &RuleProductionArg<'a, PB::M>, 354 | &RuleProductionArg<'a, PC::M>, 355 | ) -> RuleResult 356 | + Send 357 | + Sync, 358 | PA: Pattern, 359 | PB: Pattern, 360 | PC: Pattern, 361 | { 362 | fn rule_sym(&self) -> Sym { 363 | self.sym 364 | } 365 | 366 | fn apply( 367 | &self, 368 | stash: &Stash, 369 | sentence: &str, 370 | ) -> CoreResult> { 371 | let matches = self.matches(&stash, sentence)?; 372 | 373 | if matches.status.is_exit() { 374 | return Ok(RuleOutput::exit()); 375 | } 376 | 377 | let nodes: CoreResult<_> = matches 378 | .iter() 379 | .filter_map(|sub| { 380 | let nodes = svec!(sub.0.to_node(), sub.1.to_node(), sub.2.to_node()); 381 | if stash.iter().all(|old_node| { 382 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 383 | }) { 384 | let byte_range = Range(sub.0.byte_range().0, sub.2.byte_range().1); 385 | match (self.production)( 386 | &RuleProductionArg::new(sentence, &sub.0), 387 | &RuleProductionArg::new(sentence, &sub.1), 388 | &RuleProductionArg::new(sentence, &sub.2), 389 | ) { 390 | Ok(v) => { 391 | let payload = v.extract_payload(); 392 | Some(Ok(ParsedNode::new( 393 | self.sym, 394 | v.clone().into(), 395 | byte_range, 396 | payload, 397 | nodes, 398 | ))) 399 | } 400 | Err(e) => match e.downcast::() { 
401 | Ok(RuleError::Invalid) => None, 402 | Err(e) => Some(Err(e)), 403 | }, 404 | } 405 | } else { 406 | None 407 | } 408 | }) 409 | .collect(); 410 | Ok(RuleOutput::continue_with(nodes?)) 411 | } 412 | } 413 | 414 | impl Rule3 415 | where 416 | V: NodePayload, 417 | StashValue: NodePayload + StashIndexable + From, 418 | F: for<'a> Fn( 419 | &RuleProductionArg<'a, PA::M>, 420 | &RuleProductionArg<'a, PB::M>, 421 | &RuleProductionArg<'a, PC::M>, 422 | ) -> RuleResult 423 | + Send 424 | + Sync, 425 | PA: Pattern, 426 | PB: Pattern, 427 | PC: Pattern, 428 | { 429 | pub fn new(sym: Sym, pat: (PA, PB, PC), prod: F) -> Rule3 { 430 | Rule3 { 431 | sym, 432 | pattern: pat, 433 | production: prod, 434 | _phantom: SendSyncPhantomData::new(), 435 | } 436 | } 437 | 438 | fn matches( 439 | &self, 440 | stash: &Stash, 441 | sentence: &str, 442 | ) -> CoreResult> { 443 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 444 | if matches_0.is_empty() { 445 | return Ok(PredicateMatches::with_status(matches_0.status)); 446 | } 447 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 448 | if matches_1.is_empty() { 449 | return Ok(PredicateMatches::with_status(matches_1.status)); 450 | } 451 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 452 | if matches_2.is_empty() { 453 | return Ok(PredicateMatches::with_status(matches_2.status)); 454 | } 455 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 456 | for m0 in matches_0.iter() { 457 | for m1 in matches_1.iter() { 458 | if adjacent(m0, m1, sentence) { 459 | for m2 in matches_2.iter() { 460 | if adjacent(m1, m2, sentence) { 461 | result.push((m0.clone(), m1.clone(), m2.clone())) 462 | } 463 | } 464 | } 465 | } 466 | } 467 | Ok(result) 468 | } 469 | } 470 | 471 | pub struct Rule4 472 | where 473 | V: NodePayload, 474 | StashValue: NodePayload + StashIndexable + From, 475 | F: for<'a> Fn( 476 | &RuleProductionArg<'a, PA::M>, 477 | &RuleProductionArg<'a, PB::M>, 478 | 
&RuleProductionArg<'a, PC::M>, 479 | &RuleProductionArg<'a, PD::M>, 480 | ) -> RuleResult 481 | + Send 482 | + Sync, 483 | PA: Pattern, 484 | PB: Pattern, 485 | PC: Pattern, 486 | PD: Pattern, 487 | { 488 | sym: Sym, 489 | pattern: (PA, PB, PC, PD), 490 | production: F, 491 | _phantom: SendSyncPhantomData<(V, StashValue)>, 492 | } 493 | 494 | impl Rule for Rule4 495 | where 496 | V: NodePayload, 497 | StashValue: NodePayload + StashIndexable + From, 498 | F: for<'a> Fn( 499 | &RuleProductionArg<'a, PA::M>, 500 | &RuleProductionArg<'a, PB::M>, 501 | &RuleProductionArg<'a, PC::M>, 502 | &RuleProductionArg<'a, PD::M>, 503 | ) -> RuleResult 504 | + Send 505 | + Sync, 506 | PA: Pattern, 507 | PB: Pattern, 508 | PC: Pattern, 509 | PD: Pattern, 510 | { 511 | fn rule_sym(&self) -> Sym { 512 | self.sym 513 | } 514 | 515 | fn apply( 516 | &self, 517 | stash: &Stash, 518 | sentence: &str, 519 | ) -> CoreResult> { 520 | let matches = self.matches(&stash, sentence)?; 521 | 522 | if matches.status.is_exit() { 523 | return Ok(RuleOutput::exit()); 524 | } 525 | 526 | let nodes: CoreResult<_> = matches 527 | .iter() 528 | .filter_map(|sub| { 529 | let nodes = svec!( 530 | sub.0.to_node(), 531 | sub.1.to_node(), 532 | sub.2.to_node(), 533 | sub.3.to_node() 534 | ); 535 | if stash.iter().all(|old_node| { 536 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 537 | }) { 538 | let byte_range = Range(sub.0.byte_range().0, sub.3.byte_range().1); 539 | match (self.production)( 540 | &RuleProductionArg::new(sentence, &sub.0), 541 | &RuleProductionArg::new(sentence, &sub.1), 542 | &RuleProductionArg::new(sentence, &sub.2), 543 | &RuleProductionArg::new(sentence, &sub.3), 544 | ) { 545 | Ok(v) => { 546 | let payload = v.extract_payload(); 547 | Some(Ok(ParsedNode::new( 548 | self.sym, 549 | v.clone().into(), 550 | byte_range, 551 | payload, 552 | nodes, 553 | ))) 554 | } 555 | Err(e) => match e.downcast::() { 556 | Ok(RuleError::Invalid) => None, 557 | Err(e) 
=> Some(Err(e)), 558 | }, 559 | } 560 | } else { 561 | None 562 | } 563 | }) 564 | .collect(); 565 | 566 | Ok(RuleOutput::continue_with(nodes?)) 567 | } 568 | } 569 | 570 | impl Rule4 571 | where 572 | V: NodePayload, 573 | StashValue: NodePayload + StashIndexable + From, 574 | F: for<'a> Fn( 575 | &RuleProductionArg<'a, PA::M>, 576 | &RuleProductionArg<'a, PB::M>, 577 | &RuleProductionArg<'a, PC::M>, 578 | &RuleProductionArg<'a, PD::M>, 579 | ) -> RuleResult 580 | + Send 581 | + Sync, 582 | PA: Pattern, 583 | PB: Pattern, 584 | PC: Pattern, 585 | PD: Pattern, 586 | { 587 | pub fn new( 588 | sym: Sym, 589 | pat: (PA, PB, PC, PD), 590 | prod: F, 591 | ) -> Rule4 { 592 | Rule4 { 593 | sym, 594 | pattern: pat, 595 | production: prod, 596 | _phantom: SendSyncPhantomData::new(), 597 | } 598 | } 599 | 600 | fn matches( 601 | &self, 602 | stash: &Stash, 603 | sentence: &str, 604 | ) -> CoreResult> { 605 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 606 | if matches_0.is_empty() { 607 | return Ok(PredicateMatches::with_status(matches_0.status)); 608 | } 609 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 610 | if matches_1.is_empty() { 611 | return Ok(PredicateMatches::with_status(matches_1.status)); 612 | } 613 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 614 | if matches_2.is_empty() { 615 | return Ok(PredicateMatches::with_status(matches_2.status)); 616 | } 617 | let matches_3 = self.pattern.3.predicate(stash, sentence)?; 618 | if matches_3.is_empty() { 619 | return Ok(PredicateMatches::with_status(matches_3.status)); 620 | } 621 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 622 | for m0 in matches_0.iter() { 623 | for m1 in matches_1.iter() { 624 | if adjacent(m0, m1, sentence) { 625 | for m2 in matches_2.iter() { 626 | if adjacent(m1, m2, sentence) { 627 | for m3 in matches_3.iter() { 628 | if adjacent(m2, m3, sentence) { 629 | result.push((m0.clone(), m1.clone(), m2.clone(), m3.clone())) 630 
| } 631 | } 632 | } 633 | } 634 | } 635 | } 636 | } 637 | Ok(result) 638 | } 639 | } 640 | 641 | pub struct Rule5 642 | where 643 | V: NodePayload, 644 | StashValue: NodePayload + StashIndexable + From, 645 | F: for<'a> Fn( 646 | &RuleProductionArg<'a, PA::M>, 647 | &RuleProductionArg<'a, PB::M>, 648 | &RuleProductionArg<'a, PC::M>, 649 | &RuleProductionArg<'a, PD::M>, 650 | &RuleProductionArg<'a, PE::M>, 651 | ) -> RuleResult 652 | + Send 653 | + Sync, 654 | PA: Pattern, 655 | PB: Pattern, 656 | PC: Pattern, 657 | PD: Pattern, 658 | PE: Pattern, 659 | { 660 | sym: Sym, 661 | pattern: (PA, PB, PC, PD, PE), 662 | production: F, 663 | _phantom: SendSyncPhantomData<(V, StashValue)>, 664 | } 665 | 666 | impl Rule 667 | for Rule5 668 | where 669 | V: NodePayload, 670 | StashValue: NodePayload + StashIndexable + From, 671 | F: for<'a> Fn( 672 | &RuleProductionArg<'a, PA::M>, 673 | &RuleProductionArg<'a, PB::M>, 674 | &RuleProductionArg<'a, PC::M>, 675 | &RuleProductionArg<'a, PD::M>, 676 | &RuleProductionArg<'a, PE::M>, 677 | ) -> RuleResult 678 | + Send 679 | + Sync, 680 | PA: Pattern, 681 | PB: Pattern, 682 | PC: Pattern, 683 | PD: Pattern, 684 | PE: Pattern, 685 | { 686 | fn rule_sym(&self) -> Sym { 687 | self.sym 688 | } 689 | 690 | fn apply( 691 | &self, 692 | stash: &Stash, 693 | sentence: &str, 694 | ) -> CoreResult> { 695 | let matches = self.matches(&stash, sentence)?; 696 | 697 | if matches.status.is_exit() { 698 | return Ok(RuleOutput::exit()); 699 | } 700 | 701 | let nodes: CoreResult<_> = matches 702 | .iter() 703 | .filter_map(|sub| { 704 | let nodes = svec!( 705 | sub.0.to_node(), 706 | sub.1.to_node(), 707 | sub.2.to_node(), 708 | sub.3.to_node(), 709 | sub.4.to_node() 710 | ); 711 | if stash.iter().all(|old_node| { 712 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 713 | }) { 714 | let byte_range = Range(sub.0.byte_range().0, sub.4.byte_range().1); 715 | match (self.production)( 716 | &RuleProductionArg::new(sentence, 
&sub.0), 717 | &RuleProductionArg::new(sentence, &sub.1), 718 | &RuleProductionArg::new(sentence, &sub.2), 719 | &RuleProductionArg::new(sentence, &sub.3), 720 | &RuleProductionArg::new(sentence, &sub.4), 721 | ) { 722 | Ok(v) => { 723 | let payload = v.extract_payload(); 724 | Some(Ok(ParsedNode::new( 725 | self.sym, 726 | v.into(), 727 | byte_range, 728 | payload, 729 | nodes, 730 | ))) 731 | } 732 | Err(e) => match e.downcast::() { 733 | Ok(RuleError::Invalid) => None, 734 | Err(e) => Some(Err(e)), 735 | }, 736 | } 737 | } else { 738 | None 739 | } 740 | }) 741 | .collect(); 742 | Ok(RuleOutput::continue_with(nodes?)) 743 | } 744 | } 745 | 746 | impl Rule5 747 | where 748 | V: NodePayload, 749 | StashValue: NodePayload + StashIndexable + From, 750 | F: for<'a> Fn( 751 | &RuleProductionArg<'a, PA::M>, 752 | &RuleProductionArg<'a, PB::M>, 753 | &RuleProductionArg<'a, PC::M>, 754 | &RuleProductionArg<'a, PD::M>, 755 | &RuleProductionArg<'a, PE::M>, 756 | ) -> RuleResult 757 | + Send 758 | + Sync, 759 | PA: Pattern, 760 | PB: Pattern, 761 | PC: Pattern, 762 | PD: Pattern, 763 | PE: Pattern, 764 | { 765 | pub fn new( 766 | sym: Sym, 767 | pat: (PA, PB, PC, PD, PE), 768 | prod: F, 769 | ) -> Rule5 { 770 | Rule5 { 771 | sym, 772 | pattern: pat, 773 | production: prod, 774 | _phantom: SendSyncPhantomData::new(), 775 | } 776 | } 777 | 778 | fn matches( 779 | &self, 780 | stash: &Stash, 781 | sentence: &str, 782 | ) -> CoreResult> { 783 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 784 | if matches_0.is_empty() { 785 | return Ok(PredicateMatches::with_status(matches_0.status)); 786 | } 787 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 788 | if matches_1.is_empty() { 789 | return Ok(PredicateMatches::with_status(matches_1.status)); 790 | } 791 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 792 | if matches_2.is_empty() { 793 | return Ok(PredicateMatches::with_status(matches_2.status)); 794 | } 795 | let matches_3 = 
self.pattern.3.predicate(stash, sentence)?; 796 | if matches_3.is_empty() { 797 | return Ok(PredicateMatches::with_status(matches_3.status));; 798 | } 799 | let matches_4 = self.pattern.4.predicate(stash, sentence)?; 800 | if matches_4.is_empty() { 801 | return Ok(PredicateMatches::with_status(matches_4.status)); 802 | } 803 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 804 | for m0 in matches_0.iter() { 805 | for m1 in matches_1.iter() { 806 | if adjacent(m0, m1, sentence) { 807 | for m2 in matches_2.iter() { 808 | if adjacent(m1, m2, sentence) { 809 | for m3 in matches_3.iter() { 810 | if adjacent(m2, m3, sentence) { 811 | for m4 in matches_4.iter() { 812 | if adjacent(m3, m4, sentence) { 813 | result.push(( 814 | m0.clone(), 815 | m1.clone(), 816 | m2.clone(), 817 | m3.clone(), 818 | m4.clone(), 819 | )) 820 | } 821 | } 822 | } 823 | } 824 | } 825 | } 826 | } 827 | } 828 | } 829 | Ok(result) 830 | } 831 | } 832 | 833 | pub struct Rule6 834 | where 835 | V: NodePayload, 836 | StashValue: NodePayload + StashIndexable + From, 837 | F: for<'a> Fn( 838 | &RuleProductionArg<'a, PA::M>, 839 | &RuleProductionArg<'a, PB::M>, 840 | &RuleProductionArg<'a, PC::M>, 841 | &RuleProductionArg<'a, PD::M>, 842 | &RuleProductionArg<'a, PE::M>, 843 | &RuleProductionArg<'a, PF::M>, 844 | ) -> RuleResult 845 | + Send 846 | + Sync, 847 | PA: Pattern, 848 | PB: Pattern, 849 | PC: Pattern, 850 | PD: Pattern, 851 | PE: Pattern, 852 | PF: Pattern, 853 | { 854 | sym: Sym, 855 | pattern: (PA, PB, PC, PD, PE, PF), 856 | production: F, 857 | _phantom: SendSyncPhantomData<(V, StashValue)>, 858 | } 859 | 860 | impl Rule 861 | for Rule6 862 | where 863 | V: NodePayload, 864 | StashValue: NodePayload + StashIndexable + From, 865 | F: for<'a> Fn( 866 | &RuleProductionArg<'a, PA::M>, 867 | &RuleProductionArg<'a, PB::M>, 868 | &RuleProductionArg<'a, PC::M>, 869 | &RuleProductionArg<'a, PD::M>, 870 | &RuleProductionArg<'a, PE::M>, 871 | &RuleProductionArg<'a, PF::M>, 872 
| ) -> RuleResult 873 | + Send 874 | + Sync, 875 | PA: Pattern, 876 | PB: Pattern, 877 | PC: Pattern, 878 | PD: Pattern, 879 | PE: Pattern, 880 | PF: Pattern, 881 | { 882 | fn rule_sym(&self) -> Sym { 883 | self.sym 884 | } 885 | 886 | fn apply( 887 | &self, 888 | stash: &Stash, 889 | sentence: &str, 890 | ) -> CoreResult> { 891 | let matches = self.matches(&stash, sentence)?; 892 | 893 | if matches.status.is_exit() { 894 | return Ok(RuleOutput::exit()); 895 | } 896 | 897 | let nodes: CoreResult<_> = matches 898 | .iter() 899 | .filter_map(|sub| { 900 | let nodes = svec!( 901 | sub.0.to_node(), 902 | sub.1.to_node(), 903 | sub.2.to_node(), 904 | sub.3.to_node(), 905 | sub.4.to_node(), 906 | sub.5.to_node() 907 | ); 908 | if stash.iter().all(|old_node| { 909 | old_node.root_node.children != nodes || old_node.root_node.rule_sym != self.sym 910 | }) { 911 | let byte_range = Range(sub.0.byte_range().0, sub.5.byte_range().1); 912 | match (self.production)( 913 | &RuleProductionArg::new(sentence, &sub.0), 914 | &RuleProductionArg::new(sentence, &sub.1), 915 | &RuleProductionArg::new(sentence, &sub.2), 916 | &RuleProductionArg::new(sentence, &sub.3), 917 | &RuleProductionArg::new(sentence, &sub.4), 918 | &RuleProductionArg::new(sentence, &sub.5), 919 | ) { 920 | Ok(v) => { 921 | let payload = v.extract_payload(); 922 | Some(Ok(ParsedNode::new( 923 | self.sym, 924 | v.clone().into(), 925 | byte_range, 926 | payload, 927 | nodes, 928 | ))) 929 | } 930 | Err(e) => match e.downcast::() { 931 | Ok(RuleError::Invalid) => None, 932 | Err(e) => Some(Err(e)), 933 | }, 934 | } 935 | } else { 936 | None 937 | } 938 | }) 939 | .collect(); 940 | Ok(RuleOutput::continue_with(nodes?)) 941 | } 942 | } 943 | 944 | impl Rule6 945 | where 946 | V: NodePayload, 947 | StashValue: NodePayload + StashIndexable + From, 948 | F: for<'a> Fn( 949 | &RuleProductionArg<'a, PA::M>, 950 | &RuleProductionArg<'a, PB::M>, 951 | &RuleProductionArg<'a, PC::M>, 952 | &RuleProductionArg<'a, PD::M>, 953 | 
&RuleProductionArg<'a, PE::M>, 954 | &RuleProductionArg<'a, PF::M>, 955 | ) -> RuleResult 956 | + Send 957 | + Sync, 958 | PA: Pattern, 959 | PB: Pattern, 960 | PC: Pattern, 961 | PD: Pattern, 962 | PE: Pattern, 963 | PF: Pattern, 964 | { 965 | pub fn new( 966 | sym: Sym, 967 | pat: (PA, PB, PC, PD, PE, PF), 968 | prod: F, 969 | ) -> Rule6 { 970 | Rule6 { 971 | sym, 972 | pattern: pat, 973 | production: prod, 974 | _phantom: SendSyncPhantomData::new(), 975 | } 976 | } 977 | 978 | fn matches( 979 | &self, 980 | stash: &Stash, 981 | sentence: &str, 982 | ) -> CoreResult> { 983 | let matches_0 = self.pattern.0.predicate(stash, sentence)?; 984 | if matches_0.is_empty() { 985 | return Ok(PredicateMatches::with_status(matches_0.status)); 986 | } 987 | let matches_1 = self.pattern.1.predicate(stash, sentence)?; 988 | if matches_1.is_empty() { 989 | return Ok(PredicateMatches::with_status(matches_1.status)); 990 | } 991 | let matches_2 = self.pattern.2.predicate(stash, sentence)?; 992 | if matches_2.is_empty() { 993 | return Ok(PredicateMatches::with_status(matches_2.status)); 994 | } 995 | let matches_3 = self.pattern.3.predicate(stash, sentence)?; 996 | if matches_3.is_empty() { 997 | return Ok(PredicateMatches::with_status(matches_3.status)); 998 | } 999 | let matches_4 = self.pattern.4.predicate(stash, sentence)?; 1000 | if matches_4.is_empty() { 1001 | return Ok(PredicateMatches::with_status(matches_4.status)); 1002 | } 1003 | let matches_5 = self.pattern.5.predicate(stash, sentence)?; 1004 | if matches_5.is_empty() { 1005 | return Ok(PredicateMatches::with_status(matches_5.status)); 1006 | } 1007 | let mut result = PredicateMatches::with_status(ParsingStatus::Continue); 1008 | for m0 in matches_0.iter() { 1009 | for m1 in matches_1.iter() { 1010 | if adjacent(m0, m1, sentence) { 1011 | for m2 in matches_2.iter() { 1012 | if adjacent(m1, m2, sentence) { 1013 | for m3 in matches_3.iter() { 1014 | if adjacent(m2, m3, sentence) { 1015 | for m4 in matches_4.iter() { 1016 
| if adjacent(m3, m4, sentence) { 1017 | for m5 in matches_5.iter() { 1018 | if adjacent(m4, m5, sentence) { 1019 | result.push(( 1020 | m0.clone(), 1021 | m1.clone(), 1022 | m2.clone(), 1023 | m3.clone(), 1024 | m4.clone(), 1025 | m5.clone(), 1026 | )) 1027 | } 1028 | } 1029 | } 1030 | } 1031 | } 1032 | } 1033 | } 1034 | } 1035 | } 1036 | } 1037 | } 1038 | Ok(result) 1039 | } 1040 | } 1041 | 1042 | #[cfg(test)] 1043 | #[allow(unused_mut)] 1044 | mod tests { 1045 | use crate::helpers::BoundariesChecker; 1046 | use crate::pattern::{FilterNodePattern, Text}; 1047 | use crate::rule::*; 1048 | use crate::stash::Stash; 1049 | use crate::{ 1050 | AttemptFrom, InnerStashIndexable, Node, NodePayload, ParsedNode, Range, StashIndexable, 1051 | SymbolTable, 1052 | }; 1053 | use regex::Regex; 1054 | use smallvec::SmallVec; 1055 | 1056 | macro_rules! svec { 1057 | ($($item:expr),*) => { { 1058 | let mut v = SmallVec::new(); 1059 | $( v.push($item); )* 1060 | v 1061 | } 1062 | } 1063 | } 1064 | 1065 | macro_rules! svec4 { 1066 | ($($item:expr),*) => { { 1067 | let mut v =SmallVec::<[_;4]>::new(); 1068 | $( v.push($item); )* 1069 | v 1070 | } 1071 | } 1072 | } 1073 | 1074 | impl AttemptFrom for usize { 1075 | fn attempt_from(v: usize) -> Option { 1076 | Some(v) 1077 | } 1078 | } 1079 | 1080 | impl NodePayload for usize { 1081 | type Payload = usize; 1082 | fn extract_payload(&self) -> Option { 1083 | Some(*self) 1084 | } 1085 | } 1086 | 1087 | impl StashIndexable for usize { 1088 | type Index = usize; 1089 | fn index(&self) -> usize { 1090 | 0 1091 | } 1092 | } 1093 | 1094 | impl InnerStashIndexable for usize { 1095 | type Index = usize; 1096 | fn index() -> usize { 1097 | 0 1098 | } 1099 | } 1100 | 1101 | macro_rules! 
reg { 1102 | ($st:expr, $typ:ty, $pattern:expr) => { 1103 | $crate::pattern::TextPattern::<$typ>::new( 1104 | Regex::new($pattern).unwrap(), 1105 | $st.sym($pattern), 1106 | BoundariesChecker::separated_alphanumeric_word(), 1107 | ) 1108 | }; 1109 | } 1110 | 1111 | #[test] 1112 | fn test_integer_numeric_en_rule() { 1113 | let mut st = SymbolTable::default(); 1114 | let ten = st.sym("ten"); 1115 | let rule = Rule1::new(ten, reg!(st, usize, "ten"), |_| Ok(10usize)); 1116 | assert_eq!( 1117 | vec![Text::new(svec![Range(8, 11)], Range(8, 11), ten)], 1118 | rule.matches(&Stash::default(), "foobar: ten") 1119 | .unwrap() 1120 | .matches 1121 | ); 1122 | assert_eq!( 1123 | vec![ 1124 | Text::new(svec![Range(8, 11)], Range(8, 11), ten), 1125 | Text::new(svec![Range(12, 15)], Range(12, 15), ten) 1126 | ], 1127 | rule.matches(&Stash::default(), "foobar: ten ten") 1128 | .unwrap() 1129 | .matches 1130 | ); 1131 | assert_eq!( 1132 | svec4![ 1133 | ParsedNode::new( 1134 | ten, 1135 | 10usize, 1136 | Range(8, 11), 1137 | Some(10usize), 1138 | svec![Node::new(ten, Range(8, 11), None, svec![])] 1139 | ), 1140 | ParsedNode::new( 1141 | ten, 1142 | 10usize, 1143 | Range(12, 15), 1144 | Some(10usize), 1145 | svec![Node::new(ten, Range(12, 15), None, svec![])] 1146 | ) 1147 | ], 1148 | rule.apply(&Stash::default(), "foobar: ten ten") 1149 | .unwrap() 1150 | .nodes 1151 | ) 1152 | } 1153 | 1154 | #[test] 1155 | fn test_integer_numeric_compo_en_rule() { 1156 | let mut st = SymbolTable::default(); 1157 | let rule_consec = Rule2::new( 1158 | st.sym("2 consecutive ints"), 1159 | ( 1160 | AnyNodePattern::::new(), 1161 | FilterNodePattern::::filter(vec![Box::new(|integer: &usize| { 1162 | *integer == 10 1163 | })]), 1164 | ), 1165 | |a, b| Ok(a.value() + b.value()), 1166 | ); 1167 | let mut stash = Stash::default(); 1168 | stash.push(ParsedNode::new( 1169 | st.sym("ten"), 1170 | 10, 1171 | Range(8, 11), 1172 | None, 1173 | svec![], 1174 | )); 1175 | stash.push(ParsedNode::new( 1176 | 
st.sym("ten"), 1177 | 10, 1178 | Range(12, 15), 1179 | None, 1180 | svec![], 1181 | )); 1182 | 1183 | assert_eq!( 1184 | vec![(stash.values()[0].clone(), stash.values()[1].clone())], 1185 | rule_consec 1186 | .matches(&stash, "foobar: ten ten") 1187 | .unwrap() 1188 | .matches 1189 | ); 1190 | assert_eq!( 1191 | svec4![ParsedNode::new( 1192 | st.sym("2 consecutive ints"), 1193 | 20, 1194 | Range(8, 15), 1195 | Some(20), 1196 | svec![ 1197 | stash.values()[0].root_node.clone(), 1198 | stash.values()[1].root_node.clone() 1199 | ] 1200 | )], 1201 | rule_consec.apply(&stash, "foobar: ten ten").unwrap().nodes 1202 | ); 1203 | } 1204 | 1205 | #[test] 1206 | fn test_integer_numeric_int_rule() { 1207 | use std::str::FromStr; 1208 | let mut st = SymbolTable::default(); 1209 | let rule_int = Rule1::new(st.sym("int"), reg!(st, usize, "\\d+"), |a| { 1210 | Ok(usize::from_str(&*a.group(0))?) 1211 | }); 1212 | assert_eq!( 1213 | svec4![ParsedNode::new( 1214 | st.sym("int"), 1215 | 42, 1216 | Range(8, 10), 1217 | Some(42), 1218 | svec![Node::new(st.sym("\\d+"), Range(8, 10), None, svec![])] 1219 | )], 1220 | rule_int 1221 | .apply(&Stash::default(), "foobar: 42") 1222 | .unwrap() 1223 | .nodes 1224 | ); 1225 | } 1226 | 1227 | } 1228 | -------------------------------------------------------------------------------- /core/src/stash.rs: -------------------------------------------------------------------------------- 1 | use crate::pattern::Match; 2 | use crate::{AttemptFrom, NodePayload, ParsedNode}; 3 | use std::collections::HashMap; 4 | use std::hash::Hash; 5 | use std::slice::Iter; 6 | use std::vec::IntoIter; 7 | 8 | pub trait StashIndexable { 9 | type Index: Hash + Eq; 10 | fn index(&self) -> Self::Index; 11 | } 12 | 13 | pub trait InnerStashIndexable { 14 | type Index: Hash + Eq; 15 | fn index() -> Self::Index; 16 | } 17 | 18 | pub struct Stash { 19 | values: Vec>, 20 | index: HashMap>, 21 | } 22 | 23 | impl Default for Stash { 24 | fn default() -> Stash { 25 | Stash { 26 | 
values: vec![], 27 | index: HashMap::new(), 28 | } 29 | } 30 | } 31 | 32 | impl Stash { 33 | pub fn extend(&mut self, nodes: Vec>) { 34 | for node in nodes.into_iter() { 35 | self.push(node); 36 | } 37 | } 38 | pub fn push(&mut self, node: ParsedNode) { 39 | let node_position = self.values.len(); 40 | let node_index = node.value.index(); 41 | self.values.push(node); 42 | self.index 43 | .entry(node_index) 44 | .or_insert(vec![]) 45 | .push(node_position); 46 | } 47 | 48 | pub fn filter(&self, predicate: F) -> Vec> 49 | where 50 | V: InnerStashIndexable 51 | + NodePayload 52 | + AttemptFrom, 53 | F: Fn(&V) -> bool, 54 | { 55 | self.index 56 | .get(&V::index()) 57 | .map(|nodes| { 58 | nodes 59 | .iter() 60 | .filter_map(|position| { 61 | let ref node = self.values[*position]; 62 | if let Some(v) = V::attempt_from(node.value.clone()) { 63 | if (predicate)(&v) { 64 | Some(ParsedNode::new( 65 | node.root_node.rule_sym, 66 | v, 67 | node.byte_range(), 68 | node.root_node.payload.clone(), 69 | node.root_node.children.clone(), 70 | )) 71 | } else { 72 | None 73 | } 74 | } else { 75 | None 76 | } 77 | }) 78 | .collect() 79 | }) 80 | .unwrap_or(vec![]) 81 | } 82 | 83 | pub fn iter(&self) -> Iter> { 84 | self.values.iter() 85 | } 86 | pub fn into_iter(self) -> IntoIter> { 87 | self.values.into_iter() 88 | } 89 | 90 | pub fn len(&self) -> usize { 91 | self.values.len() 92 | } 93 | 94 | #[cfg(test)] 95 | pub fn values(&self) -> &Vec> { 96 | self.values.as_ref() 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /ml/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustling-ml" 3 | version = "0.9.1" 4 | authors = ["hdlj ", "Mathieu Poumeyrol "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | failure = "0.1" 9 | fnv = "1.0" 10 | serde = { version = "1.0", features = ["derive"] } 11 | -------------------------------------------------------------------------------- 
/ml/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate failure; 3 | extern crate fnv; 4 | 5 | use fnv::{FnvHashMap, FnvHashSet}; 6 | use std::fmt::Debug; 7 | use std::hash; 8 | use serde::{Deserialize, Serialize}; 9 | 10 | pub type MLResult = Result; 11 | 12 | pub trait ClassifierId: Eq + hash::Hash + Clone + Debug {} 13 | pub trait ClassId: Eq + hash::Hash + Clone + Debug {} 14 | pub trait Feature: Eq + hash::Hash + Clone + Debug {} 15 | 16 | pub struct Input { 17 | pub classifier_id: Id, 18 | pub features: Vec, 19 | pub children: Vec>, 20 | } 21 | 22 | #[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] 23 | pub struct Model { 24 | pub classifiers: FnvHashMap>, 25 | } 26 | 27 | #[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] 28 | pub struct Classifier { 29 | pub classes: FnvHashMap>, 30 | } 31 | 32 | #[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] 33 | pub struct ClassInfo { 34 | pub example_count: usize, 35 | pub unk_probalog: f32, 36 | pub class_probalog: f32, 37 | pub feat_probalog: FnvHashMap, 38 | } 39 | 40 | impl Model { 41 | pub fn classify(&self, input: &Input, target: &Class) -> MLResult { 42 | let classifier = if let Some(classifier) = self.classifiers.get(&input.classifier_id) { 43 | classifier 44 | } else { 45 | return Ok(0.0); 46 | }; 47 | 48 | let mut bag_of_features: FnvHashMap = FnvHashMap::default(); 49 | for feat in &input.features { 50 | let counter = bag_of_features.entry(feat.clone()).or_insert(0); 51 | *counter += 1; 52 | } 53 | 54 | let mut probalog = classifier 55 | .scores(&bag_of_features) 56 | .iter() 57 | .find(|item| &item.0 == target) 58 | .map(|item| item.1) 59 | .unwrap_or(::std::f32::NEG_INFINITY); 60 | for child in &input.children { 61 | probalog += self.classify(&child, target)?; 62 | } 63 | Ok(probalog) 64 | } 65 | } 66 | 67 | impl Classifier { 68 | // max(log(π(Prob(feat|class)^count)*Prob(class))) = 69 | // 
max(sum(logprob(feat|class)*count + logprob(class)) 70 | 71 | pub fn scores(&self, bag_of_features: &FnvHashMap) -> Vec<(Id, f32)> { 72 | let mut scores: Vec<_> = self 73 | .classes 74 | .iter() 75 | .map(|(cid, cinfo)| { 76 | let probalog: f32 = bag_of_features 77 | .iter() 78 | .map(|(feat, count)| { 79 | *count as f32 * cinfo.feat_probalog.get(feat).unwrap_or(&cinfo.unk_probalog) 80 | }) 81 | .sum(); 82 | (cid.clone(), probalog + cinfo.class_probalog) 83 | }) 84 | .collect(); 85 | let normlog = f32::ln(scores.iter().map(|p| f32::exp(p.1)).sum::()); 86 | for s in scores.iter_mut() { 87 | s.1 -= normlog 88 | } 89 | scores 90 | } 91 | 92 | pub fn classify(&self, bag_of_features: &FnvHashMap) -> MLResult<(Id, f32)> { 93 | self.scores(bag_of_features) 94 | .into_iter() 95 | .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(::std::cmp::Ordering::Equal)) 96 | .ok_or(format_err!("no classes in classifier")) 97 | } 98 | 99 | pub fn train(examples: &Vec<(FnvHashMap, Id)>) -> Classifier { 100 | let mut classes: FnvHashMap)> = FnvHashMap::default(); 101 | let total_examples = examples.len(); 102 | let mut all_features = FnvHashSet::default(); 103 | for &(ref features, ref class) in examples { 104 | let mut data = classes 105 | .entry(class.clone()) 106 | .or_insert_with(|| (0, FnvHashMap::default())); 107 | data.0 += 1; 108 | for (feat, count) in features { 109 | all_features.insert(feat.clone()); 110 | *data.1.entry(feat.clone()).or_insert(0) += *count; 111 | } 112 | } 113 | let total_features = all_features.len(); 114 | let class_infos = classes 115 | .into_iter() 116 | .map(|(k, v)| { 117 | let smooth_denom: f32 = (total_features + v.1.values().sum::()) as f32; 118 | let feat_probalog = 119 | v.1.into_iter() 120 | .map(|(k, v)| (k, f32::ln((v as f32 + 1 as f32) / smooth_denom))) 121 | .collect(); 122 | ( 123 | k, 124 | ClassInfo { 125 | example_count: v.0, 126 | class_probalog: f32::ln(v.0 as f32 / total_examples as f32), 127 | unk_probalog: f32::ln(1.0 / smooth_denom), 128 
| feat_probalog: feat_probalog, 129 | }, 130 | ) 131 | }) 132 | .collect(); 133 | Classifier { 134 | classes: class_infos, 135 | } 136 | } 137 | } 138 | 139 | #[cfg(test)] 140 | mod tests { 141 | use super::*; 142 | use fnv::FnvHashMap; 143 | 144 | macro_rules! hmap( 145 | { } => { FnvHashMap::default() }; 146 | { $($key:expr => $value:expr),+} => { 147 | { 148 | let mut m = FnvHashMap::default(); 149 | $( m.insert($key, $value); )* 150 | m 151 | } 152 | }; 153 | ($($k:expr => $v:expr),+,) => { hmap!($($k => $v),+) } 154 | ); 155 | 156 | #[derive(Eq, PartialEq, Debug, Hash, Clone)] 157 | enum Species { 158 | Cat, 159 | Dog, 160 | Human, 161 | } 162 | impl ClassId for Species {} 163 | 164 | #[derive(Eq, PartialEq, Debug, Hash, Clone)] 165 | enum Friend { 166 | Cat, 167 | Dog, 168 | Human, 169 | Fish, 170 | } 171 | impl Feature for Friend {} 172 | 173 | impl ClassifierId for &'static str {} 174 | 175 | fn mammals_classifier() -> Classifier { 176 | Classifier { 177 | classes: hmap!( 178 | Species::Cat => ClassInfo { 179 | class_probalog: -1.0986123, 180 | unk_probalog: -2.3978953, 181 | example_count: 4, 182 | feat_probalog: hmap!( 183 | Friend::Cat => -1.0116009, 184 | Friend::Human => -1.704748, 185 | Friend::Fish => -1.0116009, 186 | ) 187 | }, 188 | Species::Dog => ClassInfo { 189 | class_probalog: -1.0986123, 190 | unk_probalog: -2.3978953, 191 | example_count: 4, 192 | feat_probalog: hmap!( 193 | Friend::Cat => -1.704748, 194 | Friend::Dog => -1.0116009, 195 | Friend::Human => -1.0116009, 196 | ) 197 | }, 198 | Species::Human => ClassInfo { 199 | class_probalog: -1.0986123, 200 | unk_probalog: -2.7725887, 201 | example_count: 4, 202 | feat_probalog: hmap!( 203 | Friend::Cat => -1.3862944, 204 | Friend::Dog => -1.3862944, 205 | Friend::Human => -1.3862944, 206 | Friend::Fish => -1.3862944, 207 | ) 208 | } 209 | ), 210 | } 211 | } 212 | 213 | #[test] 214 | fn test_train() { 215 | let examples = vec![ 216 | ( 217 | hmap!(Friend::Dog => 1, Friend::Human => 1, 
Friend::Cat => 1), 218 | Species::Dog, 219 | ), 220 | (hmap!(Friend::Dog => 1), Species::Dog), 221 | (hmap!(Friend::Dog => 1, Friend::Human => 1), Species::Dog), 222 | (hmap!(Friend::Human => 1), Species::Dog), 223 | (hmap!(Friend::Fish => 1, Friend::Cat => 1), Species::Cat), 224 | (hmap!(Friend::Cat => 1), Species::Cat), 225 | (hmap!(Friend::Fish => 1), Species::Cat), 226 | ( 227 | hmap!(Friend::Human => 1, Friend::Fish => 1, Friend::Cat => 1), 228 | Species::Cat, 229 | ), 230 | ( 231 | hmap!(Friend::Human => 1, Friend::Fish => 1, Friend::Cat => 1, Friend::Dog => 1), 232 | Species::Human, 233 | ), 234 | ( 235 | hmap!(Friend::Fish => 1, Friend::Cat => 1, Friend::Dog => 1), 236 | Species::Human, 237 | ), 238 | ( 239 | hmap!(Friend::Human => 1, Friend::Fish => 1, Friend::Dog => 1), 240 | Species::Human, 241 | ), 242 | (hmap!(Friend::Human => 1, Friend::Cat => 1), Species::Human), 243 | ]; 244 | let classifier = Classifier::train(&examples); 245 | assert_eq!(mammals_classifier(), classifier); 246 | } 247 | 248 | #[test] 249 | fn test_classify_norm() { 250 | let classifier = mammals_classifier(); 251 | let probable_cat = hmap!(Friend::Fish => 1, Friend::Cat => 1); 252 | let norm = classifier 253 | .scores(&probable_cat) 254 | .iter() 255 | .map(|pair| pair.1) 256 | .map(f32::exp) 257 | .sum::(); 258 | assert!(norm > 0.9999 && norm < 1.0001); 259 | } 260 | 261 | #[test] 262 | fn test_classify() { 263 | let classifier = mammals_classifier(); 264 | let probable_cat = hmap!(Friend::Fish => 1, Friend::Cat => 1); 265 | assert_eq!(Species::Cat, classifier.classify(&probable_cat).unwrap().0); 266 | 267 | let probable_dog = hmap!(Friend::Human => 1, Friend::Dog => 1); 268 | assert_eq!(Species::Dog, classifier.classify(&probable_dog).unwrap().0); 269 | 270 | let probable_human = 271 | hmap!(Friend::Dog => 1, Friend::Cat => 1, Friend::Human => 1, Friend::Fish => 1); 272 | assert_eq!( 273 | Species::Human, 274 | classifier.classify(&probable_human).unwrap().0 275 | ); 276 | } 277 
| 278 | #[test] 279 | fn test_model() { 280 | let model = Model { 281 | classifiers: hmap!( 282 | "mammals" => mammals_classifier(), 283 | "void" => Classifier { classes: hmap!() }, 284 | ), 285 | }; 286 | let input_dog = Input { 287 | classifier_id: "mammals", 288 | children: vec![], 289 | features: vec![Friend::Human, Friend::Dog], 290 | }; 291 | assert!(model.classify(&input_dog, &Species::Dog).unwrap() > -0.5); 292 | assert!(model.classify(&input_dog, &Species::Cat).unwrap() < -0.5); 293 | let input_dog = Input { 294 | classifier_id: "mammals", 295 | children: vec![input_dog], 296 | features: vec![Friend::Human, Friend::Dog], 297 | }; 298 | let dog_dog = model.classify(&input_dog, &Species::Dog).unwrap(); 299 | assert!(dog_dog > -1.0, "probalog: {:?}", dog_dog); 300 | assert!(dog_dog < 0.5, "probalog: {:?}", dog_dog); 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate failure; 3 | extern crate fnv; 4 | extern crate rustling_core; 5 | extern crate rustling_ml; 6 | 7 | pub use rustling_core::regex; 8 | pub use rustling_core::{ 9 | AttemptFrom, AttemptInto, BoundariesChecker, InnerStashIndexable, Node, NodePayload, 10 | ParsedNode, Range, RuleSet, RuleSetBuilder, StashIndexable, Sym, 11 | }; 12 | pub use rustling_core::{RuleError, RuleResult}; 13 | pub use rustling_ml::{ClassId, Classifier, ClassifierId, Feature, Input, Model}; 14 | use serde::{Deserialize, Serialize}; 15 | use std::collections::HashSet; 16 | pub use train::{Check, Example}; 17 | 18 | #[macro_use] 19 | pub mod macros; 20 | pub mod train; 21 | 22 | pub mod core { 23 | pub use rustling_core::pattern::{ 24 | AnyNodePattern, FilterNodePattern, TextNegLHPattern, TextPattern, 25 | }; 26 | pub use rustling_core::rule::{Rule1, Rule2, Rule3, Rule4, Rule5, Rule6}; 27 | } 28 | 29 | pub type RustlingResult = Result; 30 | 31 | 
#[derive(Debug, Hash, Clone, Eq, PartialEq, Serialize, Deserialize)] 32 | pub struct RuleId(pub Sym); 33 | impl ClassifierId for RuleId {} 34 | 35 | #[derive(Debug, Hash, Clone, Eq, PartialEq, Serialize, Deserialize)] 36 | pub struct Truth(pub bool); 37 | impl ClassId for Truth {} 38 | 39 | pub trait Value: NodePayload { 40 | type Kind: PartialEq; 41 | fn kind(&self) -> Self::Kind; 42 | fn latent(&self) -> bool; 43 | } 44 | 45 | /// Match holder for the Parser. 46 | #[derive(Debug, Clone, PartialEq)] 47 | pub struct ParserMatch { 48 | /// Range in bytes of matched area 49 | pub byte_range: Range, 50 | /// Range in char of matched area 51 | pub char_range: Range, 52 | /// Parsing tree height 53 | pub parsing_tree_height: usize, 54 | /// Number of nodes in the parsing tree 55 | pub parsing_tree_num_nodes: usize, 56 | /// Actual value built from the text. 57 | pub value: V, 58 | /// Logarithmic probability of the match after machine-learned model 59 | /// evaluation. 60 | pub probalog: f32, 61 | pub latent: bool, 62 | } 63 | 64 | pub trait MaxElementTagger { 65 | type O; 66 | fn tag(&self, candidates: Vec<(ParsedNode, ParserMatch)>) -> Vec>; 67 | } 68 | 69 | pub trait FeatureExtractor { 70 | fn for_parsed_node(&self, node: &ParsedNode) -> Input; 71 | fn for_node(&self, node: &Node) -> Input; 72 | } 73 | 74 | #[derive(Debug, Clone, PartialEq)] 75 | pub struct ParsingAnalysis<'a> { 76 | /// Coverage of rules used during the analysis 77 | pub rules_coverage: f32, 78 | /// Coverage of text pattern used during the analysis 79 | pub text_pattern_coverage: f32, 80 | /// Coverage of example with only one output 81 | pub examples_coverage: f32, 82 | /// Rules' names which were not used during the analysis 83 | pub unused_rules: Vec<&'a str>, 84 | /// Text patterns's names which were not used during the analysis 85 | pub unused_text_pattern: Vec<&'a str>, 86 | /// IFailed examples with the position of the example and the number of output found. 
An example is a success if and only if one output is found during the parsing 87 | pub failed_examples: Vec<(usize, usize)>, 88 | } 89 | 90 | #[derive(Debug, Clone)] 91 | pub struct Candidate { 92 | pub node: ParsedNode, 93 | pub match_: ParserMatch, 94 | pub tagged: bool, 95 | } 96 | 97 | pub struct Parser 98 | where 99 | V: Value + StashIndexable, 100 | Feat: Feature, 101 | Extractor: FeatureExtractor, 102 | { 103 | rules: RuleSet, 104 | model: Model, 105 | extractor: Extractor, 106 | } 107 | 108 | impl Parser 109 | where 110 | V: Value + ::std::fmt::Debug + StashIndexable, 111 | RuleId: ClassifierId, 112 | Feat: Feature, 113 | Extractor: FeatureExtractor, 114 | { 115 | pub fn new( 116 | rules: RuleSet, 117 | model: Model, 118 | extractor: Extractor, 119 | ) -> Parser { 120 | Parser { 121 | rules, 122 | model, 123 | extractor, 124 | } 125 | } 126 | 127 | fn raw_candidates(&self, input: &str) -> RustlingResult, ParserMatch)>> { 128 | self.rules 129 | .apply_all(input)? 130 | .into_iter() 131 | .map(|p| { 132 | let features: Input = self.extractor.for_parsed_node(&p); 133 | let probalog = self.model.classify(&features, &Truth(true))?; 134 | let pm = ParserMatch { 135 | byte_range: p.root_node.byte_range, 136 | char_range: p.root_node.byte_range.char_range(input), 137 | value: p.value.clone().into(), 138 | parsing_tree_height: p.root_node.height(), 139 | parsing_tree_num_nodes: p.root_node.num_nodes(), 140 | probalog, 141 | latent: p.value.latent(), 142 | }; 143 | Ok((p, pm)) 144 | }) 145 | .collect() 146 | } 147 | 148 | pub fn candidates>( 149 | &self, 150 | input: &str, 151 | tagger: &Tagger, 152 | ) -> RustlingResult>> { 153 | Ok(tagger.tag(self.raw_candidates(input)?)) 154 | } 155 | 156 | pub fn parse>( 157 | &self, 158 | input: &str, 159 | tagger: &Tagger, 160 | ) -> RustlingResult>> { 161 | Ok(self 162 | .candidates(input, tagger)? 
163 | .into_iter() 164 | .filter_map(|c| if c.tagged { Some(c.match_) } else { None }) 165 | .collect()) 166 | } 167 | 168 | pub fn analyse>( 169 | &self, 170 | examples: Vec<&str>, 171 | tagger: &Tagger, 172 | ) -> RustlingResult { 173 | let all_syms = self.rules.all_syms().into_iter().collect::>(); 174 | let rules_syms = self.rules.rules_syms().into_iter().collect::>(); 175 | let text_pattern_syms: HashSet<_> = all_syms.difference(&rules_syms).map(|s| *s).collect(); 176 | 177 | let mut used_syms = HashSet::new(); 178 | let mut failed_examples = vec![]; 179 | 180 | for (idx, example) in examples.iter().enumerate() { 181 | let outputs = self 182 | .candidates(example, tagger)? 183 | .into_iter() 184 | .filter(|c| c.tagged) 185 | .collect::>(); 186 | 187 | if outputs.len() != 1 { 188 | failed_examples.push((idx, outputs.len())); 189 | } else { 190 | for sym in outputs[0].node.root_node.all_syms().into_iter() { 191 | used_syms.insert(*sym); 192 | } 193 | } 194 | } 195 | let unused_rules: Vec<_> = rules_syms 196 | .difference(&used_syms) 197 | .filter_map(|s| self.resolve_sym(&s)) 198 | .collect(); 199 | 200 | let unused_text_pattern: Vec<_> = text_pattern_syms 201 | .difference(&used_syms) 202 | .filter_map(|s| self.resolve_sym(&s)) 203 | .collect(); 204 | 205 | Ok(ParsingAnalysis { 206 | rules_coverage: 1.0 - (unused_rules.len() as f32 / rules_syms.len() as f32), 207 | text_pattern_coverage: 1.0 208 | - (unused_text_pattern.len() as f32 / text_pattern_syms.len() as f32), 209 | examples_coverage: 1.0 - (failed_examples.len() as f32 / examples.len() as f32), 210 | unused_rules, 211 | unused_text_pattern, 212 | failed_examples, 213 | }) 214 | } 215 | 216 | pub fn num_rules(&self) -> usize { 217 | self.rules 218 | .rules_syms() 219 | .into_iter() 220 | .collect::>() 221 | .len() 222 | } 223 | 224 | pub fn num_text_patterns(&self) -> usize { 225 | let all_syms = self.rules.all_syms().into_iter().collect::>(); 226 | let rules_syms = 
self.rules.rules_syms().into_iter().collect::>(); 227 | let text_pattern_syms: HashSet<_> = all_syms.difference(&rules_syms).map(|s| *s).collect(); 228 | text_pattern_syms.len() 229 | } 230 | 231 | pub fn resolve_sym(&self, sym: &Sym) -> Option<&str> { 232 | self.rules.resolve_sym(sym) 233 | } 234 | } 235 | 236 | #[cfg(test)] 237 | mod tests { 238 | use super::*; 239 | use fnv::FnvHashMap; 240 | use std::str::FromStr; 241 | 242 | #[derive(Copy, Clone, Debug, PartialEq)] 243 | pub struct MyPayload; 244 | 245 | #[derive(Copy, Clone, Debug, PartialEq, Default)] 246 | pub struct Int(usize); 247 | 248 | impl StashIndexable for Int { 249 | type Index = MyValueKind; 250 | fn index(&self) -> Self::Index { 251 | MyValueKind::UI 252 | } 253 | } 254 | 255 | #[derive(Copy, Clone, Debug, PartialEq, Default)] 256 | pub struct F32(f32); 257 | 258 | impl AttemptFrom for Int { 259 | fn attempt_from(v: Int) -> Option { 260 | Some(v) 261 | } 262 | } 263 | 264 | fn rules() -> RuleSet { 265 | let b = RuleSetBuilder::new( 266 | BoundariesChecker::detailed(), 267 | BoundariesChecker::separated_alphanumeric_word(), 268 | ); 269 | b.rule_1( 270 | "integer (numeric)", 271 | b.reg(r#"(\d{1,18})"#).unwrap(), 272 | |text_match| Ok(Int(text_match.group(0).parse::()?)), 273 | ); 274 | b.rule_1("integer (thousand)", b.reg("thousands?").unwrap(), |_| { 275 | Ok(Int(1000)) 276 | }); 277 | b.rule_2( 278 | "number thousands", 279 | dim!(Int, vec![Box::new(|a: &Int| a.0 > 1 && a.0 < 99)]), 280 | dim!(Int, vec![Box::new(|a: &Int| a.0 == 1000)]), 281 | |a, _| Ok(Int(a.value().0 * 1000)), 282 | ); 283 | b.build() 284 | } 285 | 286 | #[test] 287 | fn test_rule_set_application_all() { 288 | let rule_set = rules(); 289 | let output_stash = rule_set.apply_all("foobar: 12 thousands").unwrap(); 290 | assert_eq!(3, output_stash.len()); 291 | let values: Vec<_> = output_stash.iter().map(|pn| pn.value).collect(); 292 | assert_eq!(vec![Int(12), Int(1000), Int(12000)], values); 293 | } 294 | 295 | #[test] 296 | fn 
test_integer_numeric_infix_rule() { 297 | let b = RuleSetBuilder::new( 298 | BoundariesChecker::detailed(), 299 | BoundariesChecker::separated_alphanumeric_word(), 300 | ); 301 | b.rule_1("int", b.reg("\\d+").unwrap(), |a| { 302 | Ok(Int(usize::from_str(&*a.group(0))?)) 303 | }); 304 | b.rule_3( 305 | "add", 306 | dim!(Int), 307 | b.reg("\\+").unwrap(), 308 | dim!(Int), 309 | |a, _, b| Ok(Int(a.value().0 + b.value().0)), 310 | ); 311 | b.rule_3( 312 | "mul", 313 | dim!(Int), 314 | b.reg("\\*").unwrap(), 315 | dim!(Int), 316 | |a, _, b| Ok(Int(a.value().0 * b.value().0)), 317 | ); 318 | let rs = b.build(); 319 | let results = rs.apply_all("foo: 12 + 42, 12* 42").unwrap(); 320 | let values: Vec<_> = results.iter().map(|pn| pn.value).collect(); 321 | assert_eq!( 322 | vec![Int(12), Int(42), Int(12), Int(42), Int(54), Int(504)], 323 | values 324 | ); 325 | } 326 | 327 | rustling_value! { 328 | #[doc="an union"] 329 | #[derive(Clone,PartialEq,Debug)] 330 | MyValue MyValueKind { 331 | UI(Int), 332 | FP(F32), 333 | } 334 | 335 | fn latent(v: &MyValue) -> bool { 336 | false 337 | } 338 | 339 | fn extract_payload(v: &MyValue) -> Option { 340 | None 341 | } 342 | } 343 | 344 | #[derive(Debug, Hash, Clone, Eq, PartialEq, Serialize, Deserialize)] 345 | struct TestFeat; 346 | 347 | impl Feature for TestFeat {} 348 | 349 | struct TestFeatExtractor(); 350 | 351 | impl FeatureExtractor for TestFeatExtractor { 352 | fn for_parsed_node(&self, node: &ParsedNode) -> Input { 353 | self.for_node(&node.root_node) 354 | } 355 | fn for_node(&self, node: &Node) -> Input { 356 | Input { 357 | classifier_id: RuleId(node.rule_sym), 358 | children: vec![], 359 | features: vec![], 360 | } 361 | } 362 | } 363 | 364 | struct TestMaxElementTagger; 365 | 366 | impl MaxElementTagger for TestMaxElementTagger { 367 | type O = MyValue; 368 | fn tag( 369 | &self, 370 | candidates: Vec<(ParsedNode, ParserMatch)>, 371 | ) -> Vec> { 372 | let mut candidates = candidates; 373 | candidates.sort_by(|a, b| 
a.1.byte_range.len().cmp(&b.1.byte_range.len())); 374 | candidates 375 | .into_iter() 376 | .rev() 377 | .enumerate() 378 | .map(|(idx, c)| Candidate { 379 | node: c.0, 380 | match_: c.1, 381 | tagged: idx == 0, 382 | }) 383 | .collect() 384 | } 385 | } 386 | 387 | fn rules_with_enum_value() -> RuleSet { 388 | let b = RuleSetBuilder::new( 389 | BoundariesChecker::detailed(), 390 | BoundariesChecker::separated_alphanumeric_word(), 391 | ); 392 | b.rule_1("int", b.reg("\\d+").unwrap(), |a| { 393 | Ok(Int(usize::from_str(&*a.group(0))?)) 394 | }); 395 | b.rule_1("fp", b.reg("\\d+\\.\\d+").unwrap(), |a| { 396 | Ok(F32(f32::from_str(&*a.group(0))?)) 397 | }); 398 | b.rule_3( 399 | "pow", 400 | dim!(F32), 401 | b.reg("\\^").unwrap(), 402 | dim!(Int), 403 | |a, _, b| Ok(F32(a.value().0.powi(b.value().0 as i32))), 404 | ); 405 | b.build() 406 | } 407 | 408 | fn parser() -> Parser { 409 | Parser { 410 | rules: rules_with_enum_value(), 411 | model: Model { 412 | classifiers: FnvHashMap::default(), 413 | }, 414 | extractor: TestFeatExtractor(), 415 | } 416 | } 417 | 418 | #[test] 419 | fn test_with_enum_value() { 420 | let rule_set = rules_with_enum_value(); 421 | let results = rule_set.apply_all("foo: 1.5^2").unwrap(); 422 | let values: Vec<_> = results.into_iter().map(|pn| pn.value).collect(); 423 | assert_eq!( 424 | vec![ 425 | MyValue::UI(Int(1)), 426 | MyValue::UI(Int(5)), 427 | MyValue::UI(Int(2)), 428 | MyValue::FP(F32(1.5)), 429 | MyValue::FP(F32(2.25)) 430 | ], 431 | values 432 | ); 433 | } 434 | 435 | #[test] 436 | fn test_parsing_analysis() { 437 | let parser = parser(); 438 | assert_eq!( 439 | ParsingAnalysis { 440 | rules_coverage: 0.6666666, 441 | text_pattern_coverage: 0.6666666, 442 | examples_coverage: 0.5, 443 | unused_rules: vec!["pow"], 444 | unused_text_pattern: vec!["\\^"], 445 | failed_examples: vec![(0, 0), (1, 0),], 446 | }, 447 | parser 448 | .analyse( 449 | vec![ 450 | "example that should fail", 451 | "another one", 452 | "foo: 1.5", 453 | "foo: 2" 
454 | ], 455 | &TestMaxElementTagger 456 | ) 457 | .unwrap() 458 | ); 459 | assert_eq!( 460 | ParsingAnalysis { 461 | rules_coverage: 1.0, 462 | text_pattern_coverage: 1.0, 463 | examples_coverage: 0.6666666, 464 | unused_rules: vec![], 465 | unused_text_pattern: vec![], 466 | failed_examples: vec![(0, 0)], 467 | }, 468 | parser 469 | .analyse( 470 | vec!["example that should fail", "foo: 1.5^2", "foo: 2"], 471 | &TestMaxElementTagger 472 | ) 473 | .unwrap() 474 | ); 475 | } 476 | } 477 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | macro_rules! variant_converters { 3 | ($name:ident, $varname:ident, $varty:ty) => { 4 | impl From<$varty> for $name { 5 | fn from(v: $varty) -> $name { 6 | $name::$varname(v) 7 | } 8 | } 9 | 10 | impl $crate::AttemptFrom<$name> for $varty { 11 | fn attempt_from(v: $name) -> Option<$varty> { 12 | if let $name::$varname(value) = v { 13 | Some(value) 14 | } else { 15 | None 16 | } 17 | } 18 | } 19 | } 20 | } 21 | 22 | #[macro_export] 23 | macro_rules! enum_kind { 24 | ($kindname:ident, [$($varname:ident),*]) => { 25 | #[derive(Debug,Copy,Clone,PartialEq, Hash, Eq)] 26 | pub enum $kindname { 27 | $( $varname ),* 28 | } 29 | 30 | impl ::std::str::FromStr for $kindname { 31 | type Err=String; 32 | fn from_str(s: &str) -> ::std::result::Result<$kindname, Self::Err> { 33 | match s { 34 | $( 35 | stringify!($varname) => Ok($kindname::$varname), 36 | )* 37 | _ => Err(format!("{} is not a known {}", s, stringify!($kindname))) 38 | } 39 | } 40 | } 41 | 42 | impl ::std::string::ToString for $kindname { 43 | fn to_string(&self) -> String { 44 | match self { 45 | $( 46 | &$kindname::$varname => stringify!($varname).to_string(), 47 | )* 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | #[macro_export] 55 | macro_rules! 
rustling_value { 56 | ( #[$doc:meta] #[$derive:meta] $name:ident $kindname:ident { $($varname:ident($varty:ty)),*, } fn latent($v1:ident: &$t1:ty) -> bool { $( $body1:tt )* } fn extract_payload($v2:ident: &$t2:ty) -> Option<$payload:ty> { $( $body2:tt )* } ) => { 57 | #[$doc] #[$derive] 58 | pub enum $name { 59 | $( $varname($varty) ),* 60 | } 61 | 62 | enum_kind!($kindname, [ 63 | $( $varname ),* 64 | ]); 65 | 66 | impl Value for $name { 67 | type Kind = $kindname; 68 | fn kind(&self) -> Self::Kind { 69 | match self { 70 | $( 71 | &$name::$varname(_) => $kindname::$varname, 72 | )* 73 | } 74 | } 75 | 76 | fn latent(&self) -> bool { 77 | #[allow(unused_variables)] 78 | fn i($v1: &$t1) -> bool { 79 | $( $body1 )* 80 | } 81 | i(&self) 82 | } 83 | } 84 | 85 | impl StashIndexable for $name { 86 | type Index = $kindname; 87 | fn index(&self) -> Self::Index { 88 | match self { 89 | $( 90 | &$name::$varname(_) => $kindname::$varname, 91 | )* 92 | } 93 | } 94 | } 95 | 96 | impl NodePayload for $name { 97 | type Payload = $payload; 98 | fn extract_payload(&self) -> Option { 99 | #[allow(unused_variables)] 100 | fn i($v2: &$t2) -> Option<$payload> { 101 | $( $body2 )* 102 | } 103 | i(&self) 104 | } 105 | } 106 | 107 | $( 108 | variant_converters!($name, $varname, $varty); 109 | 110 | impl NodePayload for $varty { 111 | type Payload = $payload; 112 | fn extract_payload(&self) -> Option { 113 | $name::from(self.clone()).extract_payload() 114 | } 115 | } 116 | 117 | impl InnerStashIndexable for $varty { 118 | type Index = $kindname; 119 | fn index() -> Self::Index { 120 | $kindname::$varname 121 | } 122 | } 123 | )* 124 | } 125 | } 126 | 127 | #[macro_export] 128 | macro_rules! 
dim { 129 | ($typ:ty) => ( $crate::core::AnyNodePattern::<$typ>::new() ); 130 | ($typ:ty, $predicates:expr) => ( $crate::core::FilterNodePattern::<$typ>::filter($predicates) ); 131 | } 132 | -------------------------------------------------------------------------------- /src/train.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | Classifier, Feature, FeatureExtractor, Model, Node, ParsedNode, Range, RuleId, RuleSet, 3 | RustlingResult, StashIndexable, Truth, Value, 4 | }; 5 | use fnv::FnvHashMap; 6 | use fnv::FnvHashSet; 7 | use std::cmp::Eq; 8 | use std::fmt::Debug; 9 | use std::hash::Hash; 10 | 11 | #[derive(Debug)] 12 | pub struct Example { 13 | pub text: &'static str, 14 | pub predicate: Box>, 15 | } 16 | 17 | impl Example { 18 | pub fn new(text: &'static str, predicate: Box>) -> Example { 19 | Example { text, predicate } 20 | } 21 | } 22 | 23 | pub trait Check: Debug { 24 | fn check(&self, value: &ParsedNode) -> bool; 25 | } 26 | 27 | pub fn train( 28 | rules: &RuleSet, 29 | examples: Vec>, 30 | feature_extractor: E, 31 | ) -> RustlingResult> 32 | where 33 | V: Value + Debug + StashIndexable, 34 | V::Payload: Debug + Eq + Hash, 35 | F: Feature, 36 | E: FeatureExtractor, 37 | { 38 | let mut classified_ex: FnvHashMap, Truth)>> = 39 | FnvHashMap::default(); 40 | for ex in examples.iter() { 41 | let stash = rules.apply_all(&ex.text.to_lowercase()).unwrap(); 42 | 43 | // - keep only full-range parsed nodes 44 | // - partition them according to the example check value 45 | let (positive_parsed_nodes, negative_parse_nodes) = stash 46 | .into_iter() 47 | .filter(|candidate| candidate.root_node.byte_range == Range(0, ex.text.len())) 48 | .partition::, _>(|candidate| ex.predicate.check(&candidate)); 49 | // - example sanity check 50 | if positive_parsed_nodes.is_empty() { 51 | Err(format_err!("example: {:?} matched no rule", ex.text))? 
52 | } 53 | 54 | // - expand parse nodes to nodes, according to the partition 55 | let mut negative_nodes = FnvHashSet::default(); 56 | let mut positive_nodes = FnvHashSet::default(); 57 | 58 | fn add_to_set( 59 | nodes: &mut FnvHashSet>, 60 | node: &Node, 61 | ) { 62 | nodes.insert(node.clone()); 63 | for child in &node.children { 64 | add_to_set(nodes, child); 65 | } 66 | } 67 | 68 | for parsed_node in positive_parsed_nodes { 69 | add_to_set(&mut positive_nodes, &parsed_node.root_node); 70 | } 71 | for parsed_node in negative_parse_nodes { 72 | add_to_set(&mut negative_nodes, &parsed_node.root_node); 73 | } 74 | 75 | // - ignore negative nodes if there is a matching positive node 76 | for pos in &positive_nodes { 77 | negative_nodes.remove(pos); 78 | } 79 | // - put node counted features, with truth value in the trainable hashmaps 80 | for (nodes, truth) in vec![(positive_nodes, true), (negative_nodes, false)].into_iter() { 81 | for n in nodes.into_iter() { 82 | let mut counted_features = FnvHashMap::default(); 83 | for f in feature_extractor.for_node(&n).features { 84 | *counted_features.entry(f).or_insert(0) += 1; 85 | } 86 | classified_ex 87 | .entry(RuleId(n.rule_sym)) 88 | .or_insert(vec![]) 89 | .push((counted_features, Truth(truth))); 90 | } 91 | } 92 | } 93 | // - train the classifiers 94 | let classifiers = classified_ex 95 | .into_iter() 96 | .map(|(id, examples)| (id, Classifier::train(&examples))) 97 | .collect(); 98 | Ok(Model { classifiers }) 99 | } 100 | -------------------------------------------------------------------------------- /update_version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | set -e 5 | 6 | NEW_VERSION=$1 7 | 8 | if [ -z $NEW_VERSION ] 9 | then 10 | echo "Usage: $0 NEW_VERSION" 11 | exit 1 12 | fi 13 | 14 | perl -p -i -e "s/^version = \".*\"\$/version = \"$NEW_VERSION\"/g" Cargo.toml 15 | perl -p -i -e "s/^version = \".*\"\$/version = \"$NEW_VERSION\"/g" 
*/Cargo.toml --------------------------------------------------------------------------------