├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── CONTRIBUTORS.md
├── LICENSE
├── README.md
├── docs
├── index.html
├── nimdoc.out.css
├── nregex.html
└── nregex
│ ├── common.html
│ ├── dfa.html
│ ├── dfamacro.html
│ ├── dfamatch.html
│ ├── exptransformation.html
│ ├── nfa.html
│ ├── nodematch.html
│ ├── nodetype.html
│ ├── parser.html
│ ├── private
│ ├── common.html
│ ├── dfa.html
│ ├── dfamacro.html
│ ├── dfamatch.html
│ ├── exptransformation.html
│ ├── nfa.html
│ ├── nodematch.html
│ ├── nodetype.html
│ ├── parser.html
│ └── scanner.html
│ └── scanner.html
├── nregex.nimble
├── src
├── nregex.nim
└── nregex
│ └── private
│ ├── common.nim
│ ├── dfa.nim
│ ├── dfamacro.nim
│ ├── dfamatch.nim
│ ├── exptransformation.nim
│ ├── nfa.nim
│ ├── nodematch.nim
│ ├── nodetype.nim
│ ├── parser.nim
│ └── scanner.nim
└── tests
├── nim.cfg
└── tests.nim
/.gitignore:
--------------------------------------------------------------------------------
1 | nimcache/
2 | bin/nregex
3 | bin/nregex.js
4 | tests/tests
5 | tests/tests.js
6 | docs/ugh
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | services:
2 | - docker
3 | env:
4 | - NIM=1.0.4
5 | before_install:
6 | - docker pull nimlang/nim:$NIM
7 | script:
8 | - docker run --rm -v `pwd`:/usr/src/app -w /usr/src/app nimlang/nim:$NIM /bin/bash -c "nimble install -y; nimble test"
9 | notifications:
10 | email:
11 | on_failure: never
12 | on_success: never
13 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | v0.0.3
2 | ==================
3 |
4 | * Added `findAll` API
5 |
6 | v0.0.2
7 | ==================
8 |
9 | * DFA minimization
10 | * Ascii flag
11 | * Added `match(string, Regex)` API
12 |
13 | v0.0.1
14 | ==================
15 |
16 | * Initial release
17 |
--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
1 | List of contributors:
2 |
3 | * timotheecour (Timothee Cour, timothee.cour2@gmail.com)
4 | * data-man (Dmitry Atamanov)
5 | * xmonader (xmonader@gmail.com)
6 | * kaushalmodi (Kaushal Modi)
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Esteban Castro Borsani
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # nregex
2 |
3 | [MIT License](https://raw.githubusercontent.com/nitely/nregex/master/LICENSE)
4 |
5 | This is currently a PoC for a DFA that supports submatches extraction. The match time complexity is linear in length of the text to match. [Read the article](https://nitely.github.io/2020/01/19/a-dfa-for-submatches-extraction.html) if you are interested in the implementation.
6 |
7 | > [!WARNING]
8 | > Please use [nim-regex](https://github.com/nitely/nim-regex) instead of this package for anything serious.
9 |
10 | ## Install
11 |
12 | ```
13 | nimble install nregex
14 | ```
15 |
16 | ## Compatibility
17 |
18 | Nim 1.0.4 or newer
19 |
20 | ## Usage
21 |
22 | ```nim
23 | import pkg/nregex
24 |
25 | var m: RegexMatch
26 | doAssert match("abc", re"abc", m)
27 | doAssert match("ab", re"a(b|c)", m)
28 |
29 | doAssert match("aabcd", re"(aa)bcd", m)
30 | doAssert m.group(0) == @[0 .. 1]
31 | doAssert match("aab", re"((a)*b)", m)
32 | doAssert m.group(0) == @[0 .. 2]
33 | doAssert m.group(1) == @[0 .. 0, 1 .. 1]
34 |
35 | doAssert "abcd".find(re"bc", m)
36 | doAssert "2222".find(re"(22)*", m)
37 | doAssert m.group(0) == @[0 .. 1, 2 .. 3]
38 |
39 | doAssert re"bc" in "abcd"
40 | doAssert re"(23)+" in "112323211"
41 | ```
42 |
43 | ## Docs
44 |
45 | [Read the docs](https://nitely.github.io/nregex/)
46 |
47 | ## Benchmarks
48 |
49 | The following benchmarks show nregex is up to 22 times faster than PCRE. However, when the RE contains capture groups, PCRE is about 4 times faster than nregex.
50 |
51 | | | relative | time/iter | iters/s | regex | text
52 | | --- | --- | --- | --- | --- | ---
53 | CPU | | 294.85ps | 3.39G
54 | PCRE | | 1.10ms | 912.11 | ^\w\*sol\w\*$ | (a\*100000)sol(b\*100000)
55 | nregex | 739.52% | 148.25us | 6.75K
56 | PCRE | | 174.87ns | 5.72M | ^[0-9]+-[0-9]+-[0-9]+$ | 650-253-0001
57 | nregex | 2280.84% | 7.67ns | 130.43M
58 | PCRE | | 179.23ns | 5.58M | ^[0-9]+..+$ | 650-253-0001
59 | nregex | 1447.15% | 12.38ns | 80.74M
60 |
61 | ## Tests
62 |
63 | ```
64 | nimble test
65 | ```
66 |
67 | ## LICENSE
68 |
69 | MIT
70 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Please follow this link.
8 |
9 |
10 |
--------------------------------------------------------------------------------
/docs/nregex/common.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/common
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/common
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Types
102 |
107 |
108 | -
109 | Consts
110 |
117 |
118 | -
119 | Procs
120 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 | RegexError = object of ValueError
147 | -
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 | invalidRune = -1'i32
159 | -
160 |
161 |
162 |
163 |
164 |
165 | lineBreakRune = 10'i32
166 | -
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 | proc toRune(s: string): Rune {...}{.raises: [], tags: [].}
178 | -
179 |
180 |
181 |
182 |
183 |
184 | proc `<=`(x, y: Rune): bool {...}{.raises: [], tags: [].}
185 | -
186 |
187 |
188 |
189 |
190 |
191 | proc cmp(x, y: Rune): int {...}{.raises: [], tags: [].}
192 | -
193 |
194 |
195 |
196 |
197 |
198 | proc `%%`(formatstr: string; a: openArray[string]): string {...}{.noSideEffect, raises: [],
199 | tags: [].}
200 | -
201 |
202 | same as "$#" % ["foo"] but returns empty string on error
203 |
204 |
205 |
206 | proc `%%`(formatstr: string; a: string): string {...}{.raises: [], tags: [].}
207 | -
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 | Made with Nim. Generated: 2020-03-18 12:52:35 UTC
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
--------------------------------------------------------------------------------
/docs/nregex/dfa.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/dfa
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/dfa
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Types
108 |
124 |
125 | -
126 | Consts
127 |
140 |
141 | -
142 | Funcs
143 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
164 |
165 |
166 |
167 |
168 | AlphabetSym = int32
169 | -
170 |
171 |
172 |
173 |
174 |
175 | Closure = HashSet[int16]
176 | -
177 |
178 |
179 |
180 |
181 |
182 | DfaRow = Table[AlphabetSym, int32]
183 | -
184 |
185 |
186 |
187 |
188 |
189 | DfaClosure = Table[AlphabetSym, int32]
190 | -
191 |
192 |
193 |
194 |
195 |
196 | Dfa = object
197 | table*: seq[DfaRow]
198 | cs*: seq[Closure]
199 | closures*: seq[DfaClosure]
200 |
201 | -
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 | symEoe = -1'i32
213 | -
214 |
215 |
216 |
217 |
218 |
219 | symWord = -3'i32
220 | -
221 |
222 |
223 |
224 |
225 |
226 | symDigit = -4'i32
227 | -
228 |
229 |
230 |
231 |
232 |
233 | symAny = -6'i32
234 | -
235 |
236 |
237 |
238 |
239 |
240 | symAnyNl = -7'i32
241 | -
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 | func dfa(nfa: seq[Node]; alphabet: var seq[AlphabetSym]): Dfa {...}{.
253 | raises: [IndexError, KeyError], tags: [].}
254 | -
255 |
256 | Powerset construction
257 |
258 |
259 |
260 | func minimize(dfa: Dfa; alphabet: seq[AlphabetSym]): Dfa {...}{.raises: [KeyError], tags: [].}
261 | -
262 |
263 | Hopcroft
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 | Made with Nim. Generated: 2020-03-18 12:52:36 UTC
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
--------------------------------------------------------------------------------
/docs/nregex/dfamacro.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/dfamacro
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/dfamacro
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Funcs
108 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
128 |
129 |
130 |
131 |
132 | func matchImpl(text: string; regex: static Regex; m: var RegexMatch;
133 | flags: static MatchFlags; start = 0): bool {...}{.inline.}
134 | -
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 | Made with Nim. Generated: 2020-03-18 12:52:36 UTC
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
--------------------------------------------------------------------------------
/docs/nregex/exptransformation.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/exptransformation
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/exptransformation
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Types
108 |
115 |
116 | -
117 | Funcs
118 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
137 |
138 |
139 |
140 |
141 | GroupsCapture = object
142 | count*: int16
143 | names*: OrderedTable[string, int16]
144 |
145 | -
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 | func transformExp(exp: seq[Node]; groups: var GroupsCapture): seq[Node] {...}{.inline,
157 | raises: [RegexError], tags: [].}
158 | -
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 | Made with Nim. Generated: 2020-03-18 12:52:36 UTC
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
--------------------------------------------------------------------------------
/docs/nregex/nfa.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/nfa
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/nfa
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Types
108 |
121 |
122 | -
123 | Funcs
124 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
143 |
174 |
175 |
176 |
177 |
178 | func nfa(exp: seq[Node]; transitions: var Transitions): seq[Node] {...}{.
179 | raises: [IndexError, RegexError], tags: [].}
180 | -
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 | Made with Nim. Generated: 2020-03-18 12:52:36 UTC
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
--------------------------------------------------------------------------------
/docs/nregex/nodematch.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/nodematch
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/nodematch
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Funcs
108 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
133 |
134 |
135 |
136 |
137 | func isWord(r: Rune): bool {...}{.inline, raises: [], tags: [].}
138 | -
139 |
140 |
141 |
142 |
143 |
144 | func match(n: Node; r: Rune; nxt: Rune): bool {...}{.raises: [], tags: [].}
145 | -
146 |
147 | match for Node of assertion kind. Return whether the node matches the current characters or not
148 |
149 |
150 |
151 | func swapCase(r: Rune): Rune {...}{.raises: [], tags: [].}
152 | -
153 |
154 |
155 |
156 |
157 |
158 | func match(n: Node; r: Rune): bool {...}{.raises: [], tags: [].}
159 | -
160 |
161 | match for Node of matchable kind. Return whether the node matches the current character or not
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 | Made with Nim. Generated: 2020-03-18 12:52:36 UTC
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
--------------------------------------------------------------------------------
/docs/nregex/parser.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/parser
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/parser
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Funcs
108 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
127 |
128 |
129 |
130 |
131 | func parse(expression: string): seq[Node] {...}{.raises: [RegexError], tags: [].}
132 | -
133 |
134 | convert a string regex expression into a Node expression
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 | Made with Nim. Generated: 2020-03-18 12:52:36 UTC
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/docs/nregex/private/common.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/private/common
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/private/common
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Types
102 |
107 |
108 | -
109 | Consts
110 |
117 |
118 | -
119 | Procs
120 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 | RegexError = object of ValueError
147 | -
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 | invalidRune = -1'i32
159 | -
160 |
161 |
162 |
163 |
164 |
165 | lineBreakRune = 10'i32
166 | -
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 | proc toRune(s: string): Rune {...}{.raises: [], tags: [].}
178 | -
179 |
180 |
181 |
182 |
183 |
184 | proc `<=`(x, y: Rune): bool {...}{.raises: [], tags: [].}
185 | -
186 |
187 |
188 |
189 |
190 |
191 | proc cmp(x, y: Rune): int {...}{.raises: [], tags: [].}
192 | -
193 |
194 |
195 |
196 |
197 |
198 | proc `%%`(formatstr: string; a: openArray[string]): string {...}{.noSideEffect, raises: [],
199 | tags: [].}
200 | -
201 |
202 | same as "$#" % ["foo"] but returns empty string on error
203 |
204 |
205 |
206 | proc `%%`(formatstr: string; a: string): string {...}{.raises: [], tags: [].}
207 | -
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 | Made with Nim. Generated: 2020-03-18 13:13:57 UTC
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
--------------------------------------------------------------------------------
/docs/nregex/private/dfamacro.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/private/dfamacro
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/private/dfamacro
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Funcs
108 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
128 |
129 |
130 |
131 |
132 | func matchImpl(text: string; regex: static Regex; m: var RegexMatch;
133 | flags: static MatchFlags; start = 0): bool {...}{.inline.}
134 | -
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 | Made with Nim. Generated: 2020-03-18 13:13:58 UTC
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
--------------------------------------------------------------------------------
/docs/nregex/private/exptransformation.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/private/exptransformation
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/private/exptransformation
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Types
108 |
115 |
116 | -
117 | Funcs
118 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
137 |
138 |
139 |
140 |
141 | GroupsCapture = object
142 | count*: int16
143 | names*: OrderedTable[string, int16]
144 |
145 | -
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 | func transformExp(exp: seq[Node]; groups: var GroupsCapture): seq[Node] {...}{.inline,
157 | raises: [RegexError], tags: [].}
158 | -
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 | Made with Nim. Generated: 2020-03-18 13:13:57 UTC
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
--------------------------------------------------------------------------------
/docs/nregex/private/nfa.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/private/nfa
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/private/nfa
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Types
108 |
121 |
122 | -
123 | Funcs
124 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
143 |
174 |
175 |
176 |
177 |
178 | func nfa(exp: seq[Node]; transitions: var Transitions): seq[Node] {...}{.
179 | raises: [IndexError, RegexError], tags: [].}
180 | -
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 | Made with Nim. Generated: 2020-03-18 13:13:57 UTC
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
--------------------------------------------------------------------------------
/docs/nregex/private/nodematch.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/private/nodematch
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/private/nodematch
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Funcs
108 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
133 |
134 |
135 |
136 |
137 | func isWord(r: Rune): bool {...}{.inline, raises: [], tags: [].}
138 | -
139 |
140 |
141 |
142 |
143 |
144 | func match(n: Node; r: Rune; nxt: Rune): bool {...}{.raises: [], tags: [].}
145 | -
146 |
147 | match for Node of assertion kind. Return whether the node matches the current characters or not
148 |
149 |
150 |
151 | func swapCase(r: Rune): Rune {...}{.raises: [], tags: [].}
152 | -
153 |
154 |
155 |
156 |
157 |
158 | func match(n: Node; r: Rune): bool {...}{.raises: [], tags: [].}
159 | -
160 |
161 | match for Node of matchable kind. Return whether the node matches the current character or not
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 | Made with Nim. Generated: 2020-03-18 13:13:58 UTC
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
--------------------------------------------------------------------------------
/docs/nregex/private/parser.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | nregex/private/parser
21 |
22 |
23 |
24 |
25 |
69 |
70 |
71 |
72 |
73 |
74 |
nregex/private/parser
75 |
76 |
77 |
78 |
82 |
Dark Mode
83 |
84 |
88 |
89 | Search:
91 |
92 |
93 | Group by:
94 |
98 |
99 |
100 | -
101 | Imports
102 |
105 |
106 | -
107 | Funcs
108 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
127 |
128 |
129 |
130 |
131 | func parse(expression: string): seq[Node] {...}{.raises: [RegexError], tags: [].}
132 | -
133 |
134 | convert a string regex expression into a Node expression
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 | Made with Nim. Generated: 2020-03-18 13:13:57 UTC
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/nregex.nimble:
--------------------------------------------------------------------------------
1 | # Package
2 |
3 | version = "0.0.4"
4 | author = "Esteban Castro Borsani (@nitely)"
5 | description = "A DFA based regex engine"
6 | license = "MIT"
7 | srcDir = "src"
8 | skipDirs = @["tests", "docs"]
9 |
10 | requires "nim >= 1.0.4"
11 | requires "unicodedb >= 0.7.2"
12 | requires "unicodeplus >= 0.5.0"
13 |
14 | task test, "Test":
15 | exec "nim c -r -o:bin/nregex src/nregex.nim"
16 | exec "nim c -r tests/tests.nim"
17 | exec "nim c -r -d:forceRegexAtRuntime tests/tests.nim"
18 | #when (NimMajor, NimMinor) >= (1, 1):
19 | # exec "nim c -d:runTestAtCT --maxLoopIterationsVM:1000000000 tests/tests.nim"
20 | exec "nim js -r -o:bin/nregex.js --styleCheck:off src/nregex.nim"
21 | exec "nim js -r --styleCheck:off tests/tests.nim"
22 | exec "nim js -r --styleCheck:off -d:forceRegexAtRuntime tests/tests.nim"
23 |
24 | # Test runnable examples
25 | exec "nim doc -o:./docs/ugh/ugh.html ./src/nregex.nim"
26 |
27 | task docs, "Docs":
28 | exec "nim doc --project -o:./docs ./src/nregex.nim"
29 |
--------------------------------------------------------------------------------
/src/nregex/private/common.nim:
--------------------------------------------------------------------------------
1 | import std/unicode
2 | import std/strutils
3 |
type
  RegexError* = object of ValueError
    ## Error type signalling that a pattern
    ## is not a valid regular expression.
    ## It inherits from `ValueError`, so handlers
    ## catching `ValueError` also catch it.
8 |
const
  # Sentinel used as the start and the end of the
  # string. It should be an invalid code point, but
  # as long as it works it keeps things simpler.
  # Alternatives would be opt[Rune], or using int32
  # everywhere and converting Rune to int on demand.
  invalidRune* = (-1).Rune
  # Line feed, spelled out as code point 10 because
  # `\n` is platform specific in Nim and not the
  # actual `\n`.
  lineBreakRune* = 10.Rune
21 |
proc toRune*(s: string): Rune =
  ## Decode and return the rune that starts at byte 0 of `s`.
  runeAt(s, 0)
24 |
proc `<=`*(x, y: Rune): bool =
  ## Signed comparison of the code point values of `x` and `y`.
  result = int(x) <= int(y)
27 |
proc cmp*(x, y: Rune): int =
  ## Three-way comparison of two runes: the result is the
  ## difference of their code point values, so it is negative
  ## when `x` sorts first, zero when both are equal and
  ## positive when `y` sorts first.
  result = int(x) - int(y)
30 |
proc `%%`*(
  formatstr: string,
  a: openArray[string]
): string {.noSideEffect, raises: [].} =
  ## Non-raising variant of ``formatstr % a``:
  ## same as ``"$#" % ["foo"]`` but an empty string
  ## is returned when formatting raises `ValueError`
  result = ""
  try:
    result = formatstr % a
  except ValueError:
    result = ""
41 |
proc `%%`*(formatstr: string, a: string): string =
  ## Single-argument convenience overload: wraps `a`
  ## in a one-element array and formats with `%%`.
  result = formatstr %% [a]
44 |
--------------------------------------------------------------------------------
/src/nregex/private/dfa.nim:
--------------------------------------------------------------------------------
1 | import std/unicode
2 | import std/sets
3 | import std/tables
4 | import std/deques
5 |
6 | import nodematch
7 | import nodetype
8 | import common
9 |
type
  AlphabetSym* = int32
    ## A DFA input symbol. Values above -1 are handled as
    ## code points; negative values are the special `sym*` symbols.
  Closure* = HashSet[int16]
    ## A set of NFA state indices.
  DfaRow* = Table[AlphabetSym, int32]
    ## Transitions of one DFA state: symbol -> target DFA state.
  DfaClosure* = Table[AlphabetSym, int32]
    ## Per-state companion of `DfaRow`: symbol -> index into `Dfa.cs`.
  Dfa* = object
    table*: seq[DfaRow]
      ## one transition row per DFA state
    cs*: seq[Closure]
      ## distinct NFA state sets referenced by `closures`
    closures*: seq[DfaClosure]
      ## one entry per DFA state, parallel to `table`
19 |
const
  # Special alphabet symbols. They are negative so that
  # they can be told apart from code points, which the
  # NFA `delta` recognises by testing `sym > -1`.
  symEoe* = -1'i32    # selects `reEoe` nodes
  symWord* = -3'i32   # selects word nodes (and the "any" nodes)
  symDigit* = -4'i32  # selects digit nodes (and word/"any" nodes)
  symAny* = -6'i32    # selects `reAny` and `reAnyNl` nodes
  symAnyNl* = -7'i32  # selects `reAnyNl` nodes only
26 |
func createAlphabet(nfa: seq[Node]): seq[AlphabetSym] =
  ## Build the list of symbols the DFA is constructed over.
  ## In order: the code points 0 .. 128, the five special
  ## symbols, and then every code point mentioned by a
  ## `reChar`, `reCharCI` (together with its swapped case)
  ## or `reInSet` node that is not in the list yet.
  var seen: HashSet[AlphabetSym]

  template addOnce(cp: untyped) =
    # append the code point unless it was already collected
    let sym = cp.int32
    if sym notin seen:
      result.add(sym)
      seen.incl(sym)

  # speedup ascii matching
  for c in 0 .. 128:
    result.add(c.int32)
    seen.incl(c.int32)
  # special symbols; these are not tracked in `seen`
  for special in [symEoe, symWord, symDigit, symAny, symAnyNl]:
    result.add(special)
  # expression chars
  for n in nfa:
    case n.kind
    of reChar:
      addOnce(n.cp)
    of reCharCI:
      addOnce(n.cp)
      addOnce(n.cp.swapCase())
    of reInSet:
      for cp in n.cps:
        addOnce(cp)
      for rg in n.ranges:
        for cp in rg:
          addOnce(cp)
    else:
      discard
  # every symbol must appear exactly once
  assert result.toHashSet.len == result.len
67 |
func delta(
  nfa: seq[Node],
  states: Closure,
  sym: AlphabetSym
): Closure =
  ## Return the members of `states` whose NFA node accepts `sym`.
  ## A non-negative `sym` is matched as a code point through
  ## `match`; a special symbol selects nodes by kind, and also
  ## `reInSet` nodes holding a shorthand of a selected kind.
  result = initHashSet[int16](2)
  if sym >= 0:
    let r = Rune(sym)
    for st in states:
      if match(nfa[st], r):
        result.incl(st)
    return
  # XXX this will add every sym for reAny, but we should only add symAny
  let accepted = case sym
    of symEoe: {reEoe}
    of symWord: {reAnyNl, reAny, reWord}
    of symDigit: {reAnyNl, reAny, reWord, reDigit}
    of symAny: {reAnyNl, reAny}
    of symAnyNl: {reAnyNl}
    else: {}
  for st in states:
    var hit = nfa[st].kind in accepted
    if not hit and nfa[st].kind == reInSet:
      # a set also accepts the symbol through its shorthands
      for sh in nfa[st].shorthands:
        if sh.kind in accepted:
          hit = true
          break
    if hit:
      result.incl(st)
95 |
func dfa*(
  nfa: seq[Node],
  alphabet: var seq[AlphabetSym]
): Dfa =
  ## Powerset construction.
  ##
  ## `alphabet` is overwritten with the alphabet computed from
  ## `nfa`. In the result, `table` maps each DFA state and symbol
  ## to the next DFA state, while `closures` maps them to the index
  ## in `cs` of the NFA state set returned by `delta` for that step.
  template follow(dst, states) =
    # add the `next` states of every state in `states` to `dst`
    for s in states:
      for sn in nfa[s].next:
        dst.incl(sn)

  alphabet = createAlphabet(nfa)
  # the start DFA state is built from NFA state 0
  var startSet: Closure
  follow(startSet, [0])
  # work queue; addFirst + popLast processes the sets in FIFO order
  var pending = initDeque[Closure]()
  pending.addFirst(startSet)
  # NFA state set -> DFA state id
  var ids = initTable[Closure, int32]()
  ids[startSet] = 0'i32
  var nextId = 1'i32
  result.table.add(initTable[AlphabetSym, int32](2))
  result.closures.add(initTable[AlphabetSym, int32](2))
  var reached = initHashSet[int16]()
  # NFA state set -> its index in result.cs
  var csIndex = initTable[Closure, int32]()
  while pending.len > 0:
    let current = pending.popLast()
    let src = ids[current]
    for sym in alphabet:
      let step = delta(nfa, current, sym)
      if step.len == 0:
        continue
      reached.clear()
      follow(reached, step)
      if reached notin ids:
        # first time this set shows up: allocate a DFA state for it
        ids[reached] = nextId
        inc nextId
        pending.addFirst(reached)
        result.table.add(initTable[AlphabetSym, int32](2))
        result.closures.add(initTable[AlphabetSym, int32](2))
      result.table[src][sym] = ids[reached]
      if step in csIndex:
        result.closures[src][sym] = csIndex[step]
      else:
        let slot = result.cs.len.int32
        result.closures[src][sym] = slot
        csIndex[step] = slot
        result.cs.add(step)
  assert result.table.len == result.closures.len
  assert result.cs.toHashSet.len == result.cs.len
142 |
func minDfaTable(
  dfa: Dfa,
  p: seq[HashSet[int32]]
): Dfa {.inline.} =
  ## Construct DFA from Hopcroft partitions.
  ## Every partition in `p` becomes one state of the result.
  ## This is O(N*A) where N is the number of
  ## DFA states and A the size of the alphabet,
  ## albeit it can be faster since not every state
  ## has a transition on every alphabet symbol
  # map DFA states to min-DFA states; -1 means "not assigned yet"
  var toMin = newSeq[int32](dfa.table.len)
  for slot in toMin.mitems:
    slot = -1
  for pi in 0 .. p.len-1:
    for q in p[pi]:
      assert toMin[q] == -1
      toMin[q] = pi.int32
  # construct min-DFA table
  result.table.setLen(p.len)
  result.closures.setLen(p.len)
  # NFA state set -> its index in result.cs
  var csIndex = initTable[Closure, int32]()
  # per partition: symbol -> union of the member states' sets
  var merged = initTable[AlphabetSym, Closure]()
  for pi in 0 .. p.len-1:
    assert p[pi].len > 0
    result.table[pi] = initTable[AlphabetSym, int32](2)
    result.closures[pi] = initTable[AlphabetSym, int32](2)
    merged.clear()
    for q in p[pi]:
      for sym, target in dfa.table[q].pairs:
        # all members of a partition must agree on the target
        assert sym notin result.table[pi] or
          result.table[pi][sym] == toMin[target]
        result.table[pi][sym] = toMin[target]
        if sym notin merged:
          merged[sym] = initHashSet[int16](2)
        merged[sym].incl(dfa.cs[dfa.closures[q][sym]])
    for sym, states in merged.pairs:
      if states in csIndex:
        result.closures[pi][sym] = csIndex[states]
      else:
        let slot = result.cs.len.int32
        result.closures[pi][sym] = slot
        csIndex[states] = slot
        result.cs.add(states)
  assert result.table.len == result.closures.len
  assert result.cs.toHashSet.len == result.cs.len
187 |
func reverse(dfa: Dfa): Dfa {.inline.} =
  ## Return a DFA whose `table` holds the transitions of
  ## `dfa` reversed: row `q` maps a symbol to every state
  ## that reaches `q` on it. Only `table` is filled.
  let total = dfa.table.len
  result.table.setLen(total)
  for row in result.table.mitems:
    row = initTable[AlphabetSym, int32](2)
  for src in 0 .. total-1:
    for sym, dst in dfa.table[src].pairs:
      # add dup key, for multiple
      # in-transitions of same symbol
      # NOTE(review): `Table.add` looks deprecated in newer
      # Nim releases -- confirm before upgrading
      result.table[dst].add(sym, src.int32)
198 |
func xF(dfa: Dfa): HashSet[int32] {.inline.} =
  ## Return all final states, i.e. the states
  ## that have a transition on `symEoe`.
  ## Fails if there is no such state.
  result = initHashSet[int32](2)
  for state in 0 .. dfa.table.len-1:
    if dfa.table[state].hasKey(symEoe):
      result.incl(state.int32)
  doAssert result.len > 0
206 |
func xQ(dfa: Dfa): HashSet[int32] {.inline.} =
  ## Return the set of all state indices
  ## of `dfa`, one per row of `dfa.table`.
  result = initHashSet[int32](2)
  let total = dfa.table.len.int32
  var state = 0'i32
  while state < total:
    result.incl(state)
    inc state
212 |
func delta(
  dfa: Dfa,
  s: HashSet[int32],
  c: AlphabetSym
): HashSet[int32] {.inline.} =
  ## Return the set of states that can reach
  ## a state of `s` on symbol `c`.
  ## `dfa` is expected to be the reversed dfa.
  result = initHashSet[int32](2)
  for target in s:
    # the reversed table stores one entry per in-transition
    for source in allValues(dfa.table[target], c):
      result.incl(source)
224 |
func canPartition(r, i: HashSet[int32]): bool {.inline.} =
  ## Return true if `i` splits `r`, that is if:
  ## * the intersection of `r` and `i` is not empty,
  ## * and the difference `r - i` is not empty
  ##
  ## The scan of `r` stops as soon as one member inside `i`
  ## and one member outside `i` have been found, instead of
  ## always counting the whole intersection.
  var hit = false   # some member of r is in i
  var miss = false  # some member of r is not in i
  for q in r:
    if q in i:
      hit = true
    else:
      miss = true
    if hit and miss:
      return true
  result = false
233 |
func partition(
  r, i: HashSet[int32]
): (HashSet[int32], HashSet[int32]) {.inline.} =
  ## Split `r` in two sets: the first one holds the elements of `r`
  ## that are also in `i` (the intersection), the second one holds
  ## the remaining elements of `r`
  var inside = initHashSet[int32](2)
  var outside = initHashSet[int32](2)
  for state in r:
    if state notin i:
      outside.incl(state)
    else:
      inside.incl(state)
  result = (inside, outside)
247 |
248 | # without minimize
249 | # 43745 lines compiled; 8.679 sec total; 256.574MiB peakmem
250 | # unoptimized minimize
251 | # 43746 lines compiled; 35.277 sec total; 309.113MiB peakmem;
252 | # removing p[p.find(r)] and (r in w)
253 | # 43746 lines compiled; 32.970 sec total; 319.766MiB peakmem;
254 | # removing two (r - i) intersections of hashSets
255 | # 43756 lines compiled; 16.145 sec total; 308.73MiB peakmem;
256 | # dfa.reverse and init all hashsets to 2, except q-f is 64
257 | # 43779 lines compiled; 12.209 sec total; 309.234MiB peakmem
258 | # optimized dfa table construction
259 | # 43825 lines compiled; 11.985 sec total; 258.664MiB peakmem;
func minimize*(
  dfa: Dfa,
  alphabet: seq[AlphabetSym]
): Dfa =
  ## Hopcroft
  ##
  ## Partition the states of `dfa` by refinement and build the
  ## resulting table through `minDfaTable`. `alphabet` is the list
  ## of symbols used to refine the partition.
  # `r` is the partition block currently being inspected
  template r: untyped {.dirty.} = p[ri]
  let dfaRev = dfa.reverse()
  let f = dfa.xF()
  let q = dfa.xQ()
  # `w` is the work list of splitter sets still to be processed
  var w: seq[HashSet[int32]]
  w.add(f)
  w.add(q - f)
  # `p` is the current partition; it starts as {final, non-final}
  var p: seq[HashSet[int32]]
  p.add(f)
  p.add(q - f)
  while w.len > 0:
    let s = w.pop
    for c in alphabet:  # XXX take alphabet from `for q in s: dfa[q]`
      # states having a transition on `c` into the splitter `s`
      let i = delta(dfaRev, s, c)
      if i.len == 0:
        continue
      # the range is evaluated once, blocks appended
      # below are not visited within this pass
      for ri in 0 .. p.len-1:
        if not canPartition(r, i):
          continue
        # look the block up in the work list before it gets replaced
        let wi = w.find r
        let (r1, r2) = partition(r, i)
        r = r1
        p.add r2
        if wi > -1:
          # the block was pending: both halves must be processed
          w[wi] = r1
          w.add r2
        elif r1.len <= r2.len:
          # otherwise only the smaller half is queued
          w.add r1
        else:
          w.add r2
  assert p.len <= dfa.table.len, "not a min DFA, wtf?"
  # make the initial state the first state
  var ri0 = -1
  for ri, r in p.pairs:
    if 0 in r:
      ri0 = ri
      break
  assert ri0 > -1
  swap p[0], p[ri0]
  result = minDfaTable(dfa, p)
305 |
--------------------------------------------------------------------------------
/src/nregex/private/dfamacro.nim:
--------------------------------------------------------------------------------
1 | import std/sets
2 | import std/tables
3 | import std/unicode
4 | import std/macros
5 |
6 | import pkg/unicodeplus except isUpper, isLower
7 |
8 | import common
9 | import nodetype
10 | import nodematch
11 | import nfa
12 | import dfa
13 | import dfamatch
14 |
macro genClosureTable(
  qt: int32,
  nt: int16,
  regex: static Regex
): untyped =
  ## Generate a nested ``case`` that returns whether the
  ## state `nt` belongs to the closure number `qt`.
  #[
    case qt:  # curr closure
    of 1.int32:
      case nt:  # next state
      of 2.int16:
        return true
      else:
        return false
    else:
      return false
  ]#
  doAssert regex.dfa.cs.len > 0
  let closureCase = newTree(nnkCaseStmt, qt)
  for closureIdx, closure in regex.dfa.cs.pairs:
    #if closure.len == 0:  # ?
    #  continue
    let stateCase = newTree(nnkCaseStmt, nt)
    for state in closure:
      stateCase.add(newTree(nnkOfBranch,
        newLit state,
        quote do:
          return true))
    stateCase.add(newTree(nnkElse,
      quote do:
        return false))
    closureCase.add(newTree(nnkOfBranch,
      newLit closureIdx.int32,
      newStmtList(stateCase)))
  closureCase.add(newTree(nnkElse,
    quote do:
      return false))
  result = newStmtList(closureCase)
  when defined(reDumpMacro):
    echo "==== genClosureTable ===="
    echo repr(result)
58 |
func inClosure(
  qt: int32,
  nt: int16,
  regex: static Regex
): bool =
  ## Return whether state `nt` is part of closure `qt`.
  ## The body is the ``case`` statement generated at
  ## compile-time by `genClosureTable`, every branch of
  ## which returns explicitly.
  genClosureTable(qt, nt, regex)
65 |
macro genSubmatch(
  n, c, qt, cPrev, capt, captx, charIndex, matched, smB, capts: typed,
  regex: static Regex
): untyped =
  ## Generate the code that advances one parallel state `n`.
  ## The result is a ``case n`` with one branch per state that
  ## has out-transitions. Each branch holds, for every next state
  ## `nt`, an ``if`` guarded by ``inClosure(qt, nt, regex)`` and
  ## ``not hasState(smB, nt)`` that adds `nt` to `smB`, running
  ## the transition's ``z`` nodes (captures, assertions, matches)
  ## first when there are any.
  result = newStmtList()
  var caseStmtN: seq[NimNode]
  caseStmtN.add(n)
  for i, t in regex.transitions.all.pairs:
    if t.len == 0:  # end state
      continue
    var branchBodyN: seq[NimNode]
    for nti, nt in t.pairs:
      let ntLit = newLit nt
      var inClosureBranch: seq[NimNode]
      if regex.transitions.allZ[i][nti] == -1'i16:
        # no ``z`` nodes on this transition:
        # just carry the current capture over
        inClosureBranch.add(quote do:
          add(`smB`, (`ntLit`, `capt`)))
      else:
        inClosureBranch.add(quote do:
          `matched` = true
          `captx` = `capt`)
        for z in regex.transitions.z[regex.transitions.allZ[i][nti]]:
          case z.kind
          of groupKind:
            # record a capture boundary chained to the previous one
            let zIdx = newLit z.idx
            inClosureBranch.add(quote do:
              add(`capts`, CaptNode(parent: `captx`, bound: `charIndex`, idx: `zIdx`))
              `captx` = (len(`capts`) - 1).int32)
          of assertionKind:
            # https://github.com/nim-lang/Nim/issues/13266
            #let zLit = newLit z
            inClosureBranch.add(quote do:
              `matched` = `matched` and match(`z`, Rune(`cPrev`), Rune(`c`)))
          of matchTransitionKind:
            #let zLit = newLit z
            inClosureBranch.add(quote do:
              `matched` = `matched` and match(`z`, Rune(`c`)))
          else:
            doAssert false
        # the next state is only kept when every check passed
        inClosureBranch.add(quote do:
          if `matched`:
            add(`smB`, (`ntLit`, `captx`)))
      doAssert inClosureBranch.len > 0
      let inClosureBranchStmt = newStmtList inClosureBranch
      branchBodyN.add(quote do:
        if inClosure(`qt`, `ntLit`, regex) and not hasState(`smB`, `ntLit`):
          `inClosureBranchStmt`)
    doAssert branchBodyN.len > 0
    caseStmtN.add(newTree(nnkOfBranch,
      newLit i.int16,
      newStmtList(
        branchBodyN)))
  # states without a branch (end states) do nothing
  caseStmtN.add(newTree(nnkElse,
    newStmtList(
      newTree(nnkDiscardStmt, newEmptyNode()))))
  result.add(newTree(nnkCaseStmt, caseStmtN))
  when defined(reDumpMacro):
    echo "==== genSubmatch ===="
    echo repr(result)
125 |
template submatch(
  smA, smB, capts, regex, i, qt, cprev, c: untyped
): untyped =
  ## Advance every parallel state of `smA` through the code
  ## generated by `genSubmatch`, collecting the next states
  ## in `smB`. On exit `smA` holds the new states and `smB`
  ## is empty.
  var captx: int32
  var matched = true
  for n, capt in smA.items:
    genSubmatch(
      n, c, qt, cPrev, capt, captx, i, matched, smB, capts, regex)
  swap(smA, smB)
  smB.clear()
136 |
macro genEoeTable(
  matched: bool,
  q, qt: int32,
  regex: static Regex
): untyped =
  ## Generate the end-of-expression table: a ``case q`` with a
  ## branch per state that has a `symEoe` transition, setting
  ## `matched` to ``true`` and `qt` to the closure of that
  ## transition. Any other state sets `matched` to ``false``
  ## and `qt` to ``-1``.
  let stateCase = newTree(nnkCaseStmt, q)
  for state, row in regex.dfa.table.pairs:
    if symEoe notin row:
      continue
    let matchedLit = newLit true
    let closureLit = newLit regex.dfa.closures[state][symEoe]
    stateCase.add(newTree(nnkOfBranch,
      newLit state.int32,
      quote do:
        `matched` = `matchedLit`
        `qt` = `closureLit`))
  # there must be at least one final state
  doAssert stateCase.len > 1
  let unmatchedLit = newLit false
  let noClosureLit = newLit -1'i32
  stateCase.add(newTree(nnkElse,
    quote do:
      `matched` = `unmatchedLit`
      `qt` = `noClosureLit`))
  result = newStmtList(stateCase)
  when defined(reDumpMacro):
    echo "==== genEoeTable ===="
    echo repr(result)
167 |
macro genSymMatchTable(
  q, qt, c: int32,
  regex: static Regex
): untyped =
  ## Generate the symbol (shorthand) transition table: a
  ## ``case q`` where each state that has symbol transitions
  ## gets an ``if``/``elif`` chain testing `c` against each of
  ## them in `syms` order. A hit sets `q` to the target state
  ## and `qt` to the transition's closure; otherwise both are
  ## set to ``-1``.
  result = newStmtList()
  var stateBranches: seq[NimNode]
  for state, row in regex.dfa.table.pairs:
    var elifs: seq[NimNode]
    for sym in syms:
      if sym notin row:
        continue
      # the condition is the only thing that varies per symbol
      var cond: NimNode
      case sym
      of symDigit:
        cond = quote do:
          isDecimal(Rune(`c`))
      of symWord:
        cond = quote do:
          isWord(Rune(`c`))
      of symAny:
        let lineBreakLit = newLit lineBreakRune.int32
        cond = quote do:
          `c` != `lineBreakLit`
      of symAnyNl:
        cond = quote do:
          true
      else:
        doAssert false
        discard
      let targetLit = newLit row[sym]
      let closureLit = newLit regex.dfa.closures[state][sym]
      elifs.add(newTree(nnkElifBranch,
        cond,
        quote do:
          `q` = `targetLit`
          `qt` = `closureLit`))
    if elifs.len > 0:
      let deadLit = newLit -1'i32
      elifs.add(newTree(nnkElse,
        quote do:
          `q` = `deadLit`
          `qt` = `deadLit`))
      stateBranches.add(newTree(nnkOfBranch,
        newLit state.int32,
        newStmtList(
          newTree(nnkIfStmt, elifs))))
  let deadLit = newLit -1'i32
  if stateBranches.len == 0:
    # no state has symbol transitions, nothing can match
    result.add(quote do:
      `q` = `deadLit`
      `qt` = `deadLit`)
  else:
    let stateCase = newTree(nnkCaseStmt, q)
    stateCase.add(stateBranches)
    stateCase.add(newTree(nnkElse,
      quote do:
        `q` = `deadLit`
        `qt` = `deadLit`))
    result.add(stateCase)
  when defined(reDumpMacro):
    echo "==== genSymMatchTable ===="
    echo repr(result)
248 |
macro genTable(
  q, qt, c: int32,
  regex: static Regex
): untyped =
  ## Generate the transition table: a ``case q`` whose branches
  ## are a ``case c`` over the keys of that state's row. A hit
  ## sets `q` to the target state and `qt` to the transition's
  ## closure, a miss sets both to ``-1``. An unknown `q` is
  ## left untouched.
  let stateCase = newTree(nnkCaseStmt, q)
  for state, row in regex.dfa.table.pairs:
    let symCase = newTree(nnkCaseStmt, c)
    for sym, target in row:
      let targetLit = newLit target.int32
      let closureLit = newLit regex.dfa.closures[state][sym]
      symCase.add(newTree(nnkOfBranch,
        newLit sym,
        quote do:
          `q` = `targetLit`
          `qt` = `closureLit`))
    let deadLit = newLit -1'i32
    symCase.add(newTree(nnkElse,
      quote do:
        `q` = `deadLit`
        `qt` = `deadLit`))
    stateCase.add(newTree(nnkOfBranch,
      newLit state.int32,
      newStmtList(symCase)))
  stateCase.add(newTree(nnkElse,
    newStmtList(
      newTree(nnkDiscardStmt, newEmptyNode()))))
  result = newStmtList(stateCase)
  when defined(reDumpMacro):
    echo "==== genTable ===="
    echo repr(result)
284 |
func matchImpl*(
  text: string,
  regex: static Regex,
  m: var RegexMatch,
  flags: static MatchFlags,
  start = 0
): bool {.inline.} =
  ## Match `text` from `start` up to its end against a regex
  ## known at compile-time; the DFA tables are expanded into
  ## ``case`` statements by the ``gen*`` macros. Return whether
  ## the text matched; on success `m` holds the boundaries and,
  ## when captures are tracked, the captures and named groups.
  m.clear()
  result = false
  var
    cPrev = -1'i32
    c: Rune
    q = 0'i32
    qOld {.used.} = q
    qt = q
    i = start
    iPrev = start
  # workaround for VM registry limitation
  const
    zCount = regex.transitions.zCount
    zGroupsCount = regex.groupsCount * 2
    noCaptures = mfNoCaptures in flags
  # workaround for https://github.com/nim-lang/Nim/issues/13252
  const
    reFlags = regex.flags
    # the submatch step is skipped when captures are
    # not wanted and every ``z`` node is a group boundary
    canSkipTransitionsZ = noCaptures and
      zGroupsCount == zCount
    hasTransitionsZ = zCount > 0 and
      not canSkipTransitionsZ
    groupCount {.used.} = regex.groupsCount
    namedGroups {.used.} = regex.namedGroups
  when hasTransitionsZ:
    var
      smA = newSubmatches(regex.transitions.all.len)
      smB = newSubmatches(regex.transitions.all.len)
      capts: Capts
    smA.add((0'i16, -1'i32))
  while i < len(text):
    when reAscii notin reFlags:
      fastRuneAt(text, i, c, true)
      # keep the state, `genTable` overwrites it on a miss
      qOld = q
    else:
      c = Rune(text[i])
      inc i
    genTable(q, qt, c.int32, regex)
    if (q == -1'i32).unlikely:
      # no literal transition; in non-ascii mode try the symbol table
      when reAscii notin reFlags:
        q = qOld
        genSymMatchTable(q, qt, c.int32, regex)
      if (q == -1'i32).unlikely:
        return
    when hasTransitionsZ:
      submatch(smA, smB, capts, regex, iPrev, qt, cPrev, c.int32)
    iPrev = i
    cPrev = c.int32
  # end of text: the current state must accept
  genEoeTable(result, q, qt, regex)
  when hasTransitionsZ:
    if not result:
      return
    # XXX lighter submatchEoe
    submatch(smA, smB, capts, regex, iPrev, qt, cPrev, symEoe)
    # no parallel state survived the end-of-expression transition
    if smA.len == 0:
      result = false
      return
    constructSubmatches(m.captures, capts, smA[0][1], groupCount)
    when namedGroups.len > 0:
      m.namedGroups = namedGroups
  m.boundaries = start .. iPrev-1
353 |
--------------------------------------------------------------------------------
/src/nregex/private/dfamatch.nim:
--------------------------------------------------------------------------------
1 | ## DFA matcher for non-static regexes
2 |
3 | import std/unicode
4 | import std/sets
5 | import std/tables
6 | import std/deques
7 | import std/algorithm
8 |
9 | import pkg/unicodeplus except isUpper, isLower
10 |
11 | import nodematch
12 | import nodetype
13 | import common
14 | import nfa
15 | import dfa
16 |
type
  CaptNode* = object
    ## A capture boundary. Nodes are chained through `parent`
    ## from the latest boundary back to the first one.
    parent*: int32  # index of the previous node in `Capts`, -1 ends the chain
    bound*: int  # text index of the boundary
    idx*: int16  # index of the group in `Captures`
  Capts* = seq[CaptNode]
  # one list of slices per group
  Captures* = seq[seq[Slice[int]]]
24 |
func constructSubmatches*(
  captures: var Captures,
  capts: Capts,
  capt, size: int
) {.inline.} =
  ## Fill `captures` (resized to `size` groups, each one emptied
  ## first) with the slices described by the chain of capture
  ## nodes that starts at `capts[capt]` and follows `parent`
  ## until ``-1``. Nothing else is done when `capts` is empty.
  captures.setLen(size)
  for group in captures.mitems:
    group.setLen(0)
  if capts.len == 0:
    return
  var node = capt
  while node != -1:
    let groupIdx = capts[node].idx
    # start a new pending slice when the group has
    # none yet or its last slice is already complete
    if captures[groupIdx].len == 0 or
        captures[groupIdx][^1].a != -2:
      captures[groupIdx].add(-2 .. -2)
    # the chain is walked backwards, so the
    # end of a slice is seen before its start
    if captures[groupIdx][^1].b == -2:
      captures[groupIdx][^1].b = capts[node].bound-1
    else:
      captures[groupIdx][^1].a = capts[node].bound
    node = capts[node].parent
  # slices were collected last to first
  for group in captures.mitems:
    group.reverse()
49 |
type
  NodeIdx = int16
  CaptIdx = int32
  Submatches* = ref object
    ## Parallel states would be a better name
    ##
    ## A set of (state, capture) pairs with constant
    ## time membership test, insertion and clearing.
    # dense storage, only the first `si` items are in use
    sx: seq[(NodeIdx, CaptIdx)]
    # use custom len because setLen(0) is slower,
    # and {.noInit.} makes no difference
    si: int16
    # `ss[n]` is the position of state `n` within `sx`;
    # stale entries are told apart by `hasState`
    ss: seq[int16]
60 |
func newSubmatches*(size: int): Submatches {.inline.} =
  ## Create an empty set able to index `size` states.
  ## The dense storage starts with room for 8 items.
  result = Submatches(
    sx: newSeq[(NodeIdx, CaptIdx)](8),
    si: 0,
    ss: newSeq[int16](size))
66 |
func `[]`*(sm: Submatches, i: int): (NodeIdx, CaptIdx) {.inline.} =
  ## Return the `i`-th (state, capture) pair in insertion order
  assert i < sm.si
  result = sm.sx[i]
70 |
func hasState*(sm: Submatches, n: int16): bool {.inline.} =
  ## Return whether state `n` is in the set: its recorded
  ## position must be in use and must hold `n` itself
  let slot = sm.ss[n]
  result = slot < sm.si and sm.sx[slot][0] == n
73 |
func add*(sm: var Submatches, item: (NodeIdx, CaptIdx)) {.inline.} =
  ## Append `item`; its state must not be in the set already.
  ## The dense storage doubles in size when it is full.
  let state = item[0]
  assert not sm.hasState(state)
  assert sm.si <= sm.sx.len
  if (sm.si == sm.sx.len).unlikely:
    sm.sx.setLen(sm.sx.len * 2)
  let slot = sm.si
  sm.sx[slot] = item
  sm.ss[state] = slot
  sm.si += 1'i16
82 |
func len*(sm: Submatches): int {.inline.} =
  ## Number of items currently in the set
  result = sm.si
85 |
func clear*(sm: var Submatches) {.inline.} =
  ## Empty the set; only the item counter is
  ## reset, the storage is left untouched
  sm.si = 0'i16
88 |
iterator items*(sm: Submatches): (NodeIdx, CaptIdx) {.inline.} =
  ## Yield the (state, capture) pairs in insertion order.
  ## The length is read once, before iterating.
  let count = sm.len
  var pos = 0
  while pos < count:
    yield sm.sx[pos]
    inc pos
92 |
func submatch(
  smA, smB: var Submatches,
  capts: var Capts,
  transitions: Transitions,
  states: Closure,
  i: int,
  cprev, c: int32
) {.inline.} =
  ## Advance every parallel state of `smA` to its next states
  ## that are in `states`, running the transition's ``z`` nodes
  ## (captures, assertions, matches) on the way. `i` is the
  ## bound recorded for new captures; `cprev` and `c` are the
  ## characters handed to the assertions. On exit `smA` holds
  ## the new states and `smB` the previous ones.
  smB.clear()
  var captx: int32
  var matched = true
  for n, capt in smA.items:
    for nti, nt in transitions.all[n].pairs:
      # first transition reaching a state wins
      if smB.hasState(nt):
        continue
      if nt notin states:
        continue
      # no ``z`` nodes: carry the capture over as is
      if transitions.allZ[n][nti] == -1'i16:
        smB.add((nt, capt))
        continue
      matched = true
      captx = capt
      for z in transitions.z[transitions.allZ[n][nti]]:
        if not matched:
          break
        case z.kind
        of groupKind:
          # record a capture boundary chained to the previous one
          capts.add(CaptNode(
            parent: captx,
            bound: i,
            idx: z.idx))
          captx = (capts.len-1'i32).int32
        of assertionKind:
          matched = match(z, cprev.Rune, c.Rune)
        of matchTransitionKind:
          matched = match(z, c.Rune)
        else:
          assert false
          discard
      if matched:
        smB.add((nt, captx))
  swap(smA, smB)
135 |
type
  RegexFlag* = enum
    reAscii  # characters are read byte by byte instead of decoded as runes
  Regex* = object
    ## a compiled regular expression
    dfa*: Dfa
    transitions*: Transitions
    groupsCount*: int16
    namedGroups*: OrderedTable[string, int16]
    flags*: set[RegexFlag]
  MatchFlag* = enum
    mfShortestMatch  # stop at the first accepting position
    mfLongestMatch  # remember the last accepting position
    mfNoCaptures  # captures are not needed by the caller
  MatchFlags* = set[MatchFlag]
  RegexMatch* = object
    ## result from matching operations
    captures*: Captures
    namedGroups*: OrderedTable[string, int16]
    boundaries*: Slice[int]
156 |
func clear*(m: var RegexMatch) {.inline.} =
  ## Reset `m`: no captures, no named groups
  ## and empty boundaries (``0 .. -1``)
  m.boundaries = 0 .. -1
  if m.namedGroups.len > 0:
    m.namedGroups.clear()
  if m.captures.len > 0:
    m.captures.setLen(0)
163 |
# Order matters, subsets first.
# Symbols are tried in this order and
# the first one that matches is taken
const syms* = [
  symDigit,
  symWord,
  symAny,
  symAnyNl
]
171 |
172 | # Slow match
func symMatch(
  q: var int32,
  c: Rune,
  cSym: var int32,
  regex: Regex
) {.inline.} =
  ## Try the symbol transitions of state `q` against `c`, in
  ## `syms` order. On the first hit `q` becomes the target
  ## state and `cSym` the symbol that matched; when nothing
  ## matches `q` is set to ``-1`` and `cSym` is left as is.
  for sym in syms:
    if sym notin regex.dfa.table[q]:
      continue
    let hit = case sym:
      of symDigit: c.isDecimal()
      of symWord: c.isWord()
      of symAny: c != lineBreakRune
      of symAnyNl: true
      else: false
    if hit:
      q = regex.dfa.table[q][sym]
      cSym = sym
      return
  q = -1'i32
195 |
196 | # Can't return early because of boundaries
template longestMatchEnter(): untyped {.dirty.} =
  ## Record the current position as the longest match so far
  ## when the current state `q` accepts. Expands within
  ## `matchImpl` and uses its locals.
  ##
  ## When ``z`` transitions are tracked, the position is only
  ## recorded if some parallel state survives the
  ## end-of-expression transition; an empty result means an
  ## assertion rejected it, same as in `shortestMatch` and
  ## at the end of `matchImpl`. This keeps `matchedLong`,
  ## `iPrevLong` and `captLong` describing the same position.
  if symEoe in regex.dfa.table[q]:
    if hasTransitionsZ:
      submatch(
        smA, smB, capts, regex.transitions,
        regex.dfa.cs[regex.dfa.closures[q][symEoe]], iPrev, cPrev, c.int32)
      if smA.len > 0:
        matchedLong = true
        iPrevLong = iPrev
        captLong = smA[0][1]
      # restore the states, matching goes on past this position
      swap(smA, smB)
    else:
      matchedLong = true
      iPrevLong = iPrev
208 |
template longestMatchExit(): untyped {.dirty.} =
  ## Leave `matchImpl` returning the longest match recorded
  ## by `longestMatchEnter` (if any): captures are built from
  ## `captLong` and the boundaries end at `iPrevLong`.
  ## Expands within `matchImpl` and uses its locals.
  result = matchedLong
  if hasTransitionsZ:
    constructSubmatches(m.captures, capts, captLong, regex.groupsCount)
    if regex.namedGroups.len > 0:
      m.namedGroups = regex.namedGroups
  m.boundaries = start .. iPrevLong-1
  return
217 |
template shortestMatch(): untyped {.dirty.} =
  ## Leave `matchImpl` returning ``true`` as soon as the
  ## current state `q` accepts. When ``z`` transitions are
  ## tracked, some parallel state must survive the
  ## end-of-expression transition for that; otherwise the
  ## states are restored and matching goes on.
  ## Expands within `matchImpl` and uses its locals.
  if symEoe in regex.dfa.table[q]:
    if not hasTransitionsZ:
      result = true
      return
    submatch(
      smA, smB, capts, regex.transitions,
      regex.dfa.cs[regex.dfa.closures[q][symEoe]], iPrev, cPrev, c.int32)
    if smA.len > 0:
      result = true
      return
    swap(smA, smB)
231 |
func matchImpl*(
  text: string,
  regex: Regex,
  m: var RegexMatch,
  flags: static MatchFlags,
  start = 0
): bool {.inline.} =
  ## Match `text` from `start` against `regex` by walking the
  ## DFA table at run-time. Without match flags the whole text
  ## (from `start`) must match; `mfShortestMatch` returns at the
  ## first accepting position and `mfLongestMatch` returns the
  ## last accepting position seen. On success `m` holds the
  ## boundaries and, when captures are tracked, the captures
  ## and named groups.
  #echo dfa
  m.clear()
  result = false
  let
    asciiMode = reAscii in regex.flags
    # the submatch step is skipped when captures are
    # not wanted and every ``z`` node is a group boundary
    canSkipTransitionsZ = mfNoCaptures in flags and
      regex.groupsCount * 2 == regex.transitions.zCount
    hasTransitionsZ = regex.transitions.zCount > 0 and
      not canSkipTransitionsZ
  var
    smA: Submatches
    smB: Submatches
    capts: Capts
    c: Rune
    cPrev = -1'i32
    cSym: int32
    q = 0'i32
    qnext = 0'i32
    i = start
    iPrev = start
    # Long match
    matchedLong {.used.} = false
    captLong {.used.} = -1
    iPrevLong {.used.} = start
  if hasTransitionsZ:
    smA = newSubmatches(regex.transitions.all.len)
    smB = newSubmatches(regex.transitions.all.len)
    smA.add((0'i16, -1'i32))
  #echo regex.dfa
  while i < len(text):
    if not asciiMode:
      fastRuneAt(text, i, c, true)
    else:
      c = Rune(text[i])
      inc i
    when mfShortestMatch in flags:
      shortestMatch()
    when mfLongestMatch in flags:
      longestMatchEnter()
    cSym = c.int32
    if (c.int32 in regex.dfa.table[q]).likely:
      qnext = regex.dfa.table[q][c.int32]
    else:
      # no literal transition; `qnext` still equals `q` here,
      # so `symMatch` looks the symbols up in the current state
      if not asciiMode:
        symMatch(qnext, c, cSym, regex)
      if qnext == -1 or asciiMode:
        when mfLongestMatch in flags:
          longestMatchExit()
        else:
          return
    if hasTransitionsZ:
      submatch(
        smA, smB, capts, regex.transitions,
        regex.dfa.cs[regex.dfa.closures[q][cSym]], iPrev, cPrev, c.int32)
    iPrev = i
    cPrev = c.int32
    q = qnext
    #echo q
  # end of text: the current state must accept
  result = symEoe in regex.dfa.table[q]
  if not result:
    when mfLongestMatch in flags:
      longestMatchExit()
    return
  if hasTransitionsZ:
    submatch(
      smA, smB, capts, regex.transitions,
      regex.dfa.cs[regex.dfa.closures[q][symEoe]], iPrev, cPrev, -1'i32)
    if smA.len == 0:  # XXX is this possible?
      when mfLongestMatch in flags:
        longestMatchExit()
      result = false
      return
    constructSubmatches(m.captures, capts, smA[0][1], regex.groupsCount)
    if regex.namedGroups.len > 0:
      m.namedGroups = regex.namedGroups
  m.boundaries = start .. iPrev-1
315 |
--------------------------------------------------------------------------------
/src/nregex/private/exptransformation.nim:
--------------------------------------------------------------------------------
1 | import std/unicode
2 | import std/sets
3 | import std/tables
4 | import std/algorithm
5 |
6 | import nodetype
7 | import common
8 | import scanner
9 |
10 | # todo: can not use unicodeplus due to
11 | # https://github.com/nim-lang/Nim/issues/7059
func swapCase(r: Rune): Rune =
  ## Return `r` with its case swapped. A character that is
  ## neither upper nor lower case is returned unchanged
  result = r
  if r.isUpper():
    result = r.toLower()
  elif r.isLower():
    result = r.toUpper()
21 |
func check(cond: bool, msg: string) =
  ## Raise a `RegexError` carrying `msg` unless `cond` holds
  if cond:
    return
  raise newException(RegexError, msg)
25 |
func greediness(expression: seq[Node]): seq[Node] =
  ## apply greediness to an expression:
  ## a repetition (or a ``?``) that is followed by a ``?`` gets
  ## its `isGreedy` field set, and the following ``?`` is consumed
  result = newSeqOfCap[Node](expression.len)
  var scanner = expression.scan()
  for node in scanner.mitems():
    let isRepetition =
      node.kind in repetitionKind or
      node.kind == reZeroOrOne
    if isRepetition and scanner.peek.kind == reZeroOrOne:
      node.isGreedy = true
      discard scanner.next
    result.add(node)
37 |
type
  GroupsCapture* = object
    ## Capturing groups found in an expression
    count*: int16  # number of capturing groups
    names*: OrderedTable[string, int16]  # group name -> group index
42 |
func fillGroups(
  exp: seq[Node],
  groups: var GroupsCapture
): seq[Node] =
  ## Return a copy of `exp` where capturing group starts are
  ## numbered in order of appearance, and every group end
  ## carries the index and capturing mark of its group start.
  ## `groups` is reset and filled with the groups count and
  ## the names of named groups.
  ## Raise `RegexError` on unbalanced groups or too many groups.
  result = exp
  groups.count = 0'i16
  groups.names = initOrderedTable[string, int16](2)
  # positions of the group starts not closed yet
  var openGroups: seq[int]
  for pos, node in result.mpairs:
    case node.kind
    of reGroupStart:
      openGroups.add(pos)
      if node.isCapturing:
        node.idx = groups.count
        inc groups.count
      if node.name.len > 0:
        assert node.isCapturing
        groups.names[node.name] = node.idx
    of reGroupEnd:
      check(
        openGroups.len > 0,
        "Invalid capturing group. " &
        "Found too many closing symbols")
      let opener = openGroups.pop()
      node.isCapturing = result[opener].isCapturing
      node.idx = result[opener].idx
    else:
      discard
    check(
      groups.count < int16.high,
      ("Invalid number of capturing groups, " &
       "the limit is $#") %% $(int16.high - 1))
  check(
    openGroups.len == 0,
    "Invalid capturing group. " &
    "Found too many opening symbols")
80 |
func toAsciiKind(k: NodeKind): NodeKind =
  ## Map a node kind to its ascii-only counterpart;
  ## kinds without one are returned unchanged
  result = case k
    of reWordBoundary: reWordBoundaryAscii
    of reNotWordBoundary: reNotWordBoundaryAscii
    of reWord: reWordAscii
    of reDigit: reDigitAscii
    of reWhiteSpace: reWhiteSpaceAscii
    of reNotAlphaNum: reNotAlphaNumAscii
    of reNotDigit: reNotDigitAscii
    of reNotWhiteSpace: reNotWhiteSpaceAscii
    of reAny: reAnyAscii
    of reAnyNL: reAnyNLAscii
    else: k
105 |
func toggle(f: Flag): Flag =
  ## Return the negated flag of a regular flag,
  ## and the regular flag of a negated flag
  result = case f
    of flagCaseInsensitive: flagNotCaseInsensitive
    of flagNotCaseInsensitive: flagCaseInsensitive
    of flagMultiLine: flagNotMultiLine
    of flagNotMultiLine: flagMultiLine
    of flagAnyMatchNewLine: flagNotAnyMatchNewLine
    of flagNotAnyMatchNewLine: flagAnyMatchNewLine
    of flagUnGreedy: flagNotUnGreedy
    of flagNotUnGreedy: flagUnGreedy
    of flagUnicode: flagNotUnicode
    of flagNotUnicode: flagUnicode
    of flagVerbose: flagNotVerbose
    of flagNotVerbose: flagVerbose
func squash(flags: seq[seq[Flag]]): array[Flag, bool] =
  ## Flatten the flags of nested groups into the set of
  ## flags in effect. Flags are processed in order, from the
  ## outermost group to the innermost one; each flag turns
  ## itself on and its opposite off, so later flags win.
  ## It should be done each time there is a group start/end
  for groupFlags in flags:
    for flag in groupFlags:
      let opposite = flag.toggle()
      result[opposite] = false
      result[flag] = true
144 |
func applyFlag(n: var Node, f: Flag) =
  ## Apply the flag `f` to the node `n` in place. Only flags
  ## that change a node are handled; the remaining ones are
  ## just asserted to be known.
  case f
  of flagAnyMatchNewLine:
    if n.kind == reAny:
      n.kind = reAnyNL
  of flagMultiLine:
    case n.kind
    of reStartSym:
      n.kind = reStartSymML
    of reEndSym:
      n.kind = reEndSymML
    else:
      discard
  of flagCaseInsensitive:
    # characters without a case counterpart are left as is
    if n.kind == reChar and n.cp != n.cp.swapCase():
      n.kind = reCharCI
    # todo: apply recursevely to
    #       shorthands of reInSet/reNotSet (i.e: [:ascii:])
    if n.kind in {reInSet, reNotSet}:
      # iterate over a copy, the set is extended meanwhile
      var cps = initHashSet[Rune]()
      cps.incl(n.cps)
      for cp in cps:
        let cpsc = cp.swapCase()
        if cp != cpsc:
          n.cps.incl(cpsc)
      # the slice is a copy, ranges are appended meanwhile
      for sl in n.ranges[0 .. ^1]:
        let
          cpa = sl.a.swapCase()
          cpb = sl.b.swapCase()
        # only add the range when both of its ends have a counterpart
        if sl.a != cpa and sl.b != cpb:
          n.ranges.add(cpa .. cpb)
  of flagUnGreedy:
    if n.kind in opKind:
      n.isGreedy = not n.isGreedy
  of flagNotUnicode:
    n.kind = n.kind.toAsciiKind()
    if n.kind in {reInSet, reNotSet}:
      for nn in n.shorthands.mitems:
        nn.kind = nn.kind.toAsciiKind()
  else:
    assert f in {
      flagNotAnyMatchNewLine,
      flagNotMultiLine,
      flagNotCaseInsensitive,
      flagNotUnGreedy,
      flagUnicode,
      flagVerbose,
      flagNotVerbose}
193 |
func applyFlags(expression: seq[Node]): seq[Node] =
  ## apply flags to each group
  ##
  ## A stack with the flags of every open group is kept while
  ## scanning; non-group nodes get the squashed flags of the
  ## stack applied. Flag-only groups, i.e: ``(?flags)``, are
  ## removed from the result.
  result = newSeqOfCap[Node](expression.len)
  var flags = newSeq[seq[Flag]]()
  var sc = expression.scan()
  for n in sc.mitems():
    # (?flags)
    # Orphan flags are added to current group
    case n.kind
    of reGroupStart:
      if n.flags.len == 0:
        # push an empty entry so the group end pops its own
        flags.add(@[])
        result.add(n)
        continue
      if sc.peek.kind == reGroupEnd:  # (?flags)
        discard sc.next()
        if flags.len > 0:
          flags[flags.len - 1].add(n.flags)
        else:
          flags.add(n.flags)
        continue  # skip (
      flags.add(n.flags)
    of reGroupEnd:
      discard flags.pop()
    else:
      let ff = flags.squash()
      for f in Flag.low .. Flag.high:
        if ff[f]:
          applyFlag(n, f)
    result.add(n)
224 |
func expandOneRepRange(subExpr: seq[Node], n: Node): seq[Node] =
  ## Return the nodes to append *after* one occurrence of
  ## `subExpr` so that, together, they are equivalent to
  ## `subExpr` followed by the repetition-range `n`
  assert n.kind == reRepRange
  # build a fresh ``?`` operator on every use
  template optionalOp(): Node =
    Node(
      kind: reZeroOrOne,
      cp: "?".toRune,
      isGreedy: n.isGreedy)
  let unbounded = n.max == -1
  if unbounded:  # a{n,} -> aaa*
    result = newSeqOfCap[Node](subExpr.len * (n.min + 1) + 1)
    for _ in 0 ..< n.min:
      result.add(subExpr)
    result.add(Node(
      kind: reZeroOrMore,
      cp: "*".toRune,
      isGreedy: n.isGreedy))
  elif n.min == n.max:  # a{n} -> aaa
    result = newSeqOfCap[Node](subExpr.len * n.max)
    for _ in 0 ..< n.max - 1:
      result.add(subExpr)
  else:  # a{n,m} -> aaa?a?
    assert n.min < n.max
    result = newSeqOfCap[Node](subExpr.len * n.max + n.max - n.min)
    for _ in 0 ..< n.min:
      result.add(subExpr)
    for _ in n.min ..< n.max - 1:
      result.add(optionalOp())
      result.add(subExpr)
    result.add(optionalOp())
256 |
func expandRepRange(expression: seq[Node]): seq[Node] =
  ## expand every repetition range
  ##
  ## The range applies to what precedes it: either a whole
  ## group or a single matchable node. Raise `RegexError`
  ## when there is nothing valid to repeat.
  result = newSeqOfCap[Node](expression.len)
  for n in expression:
    if n.kind != reRepRange:
      result.add(n)
      continue
    # the message is kept verbatim (spelling included),
    # callers may be matching on it
    check(
      result.len > 0,
      "Invalid repeition range, " &
      "nothing to repeat")
    case result[^1].kind
    of reGroupEnd:
      # scan backwards for the start of the group that was
      # just closed, without copying/reversing the output
      var groupLen = 0
      var depth = 0
      for pos in countdown(result.len-1, 0):
        inc groupLen
        case result[pos].kind
        of reGroupEnd:
          inc depth
        of reGroupStart:
          dec depth
        else:
          discard
        if depth == 0:
          break
        doAssert depth >= 0
      doAssert depth == 0
      assert result[result.len-groupLen].kind == reGroupStart
      result.add(result[result.len-groupLen .. result.len-1].expandOneRepRange(n))
    of matchableKind:
      result.add(result[result.len-1 .. result.len-1].expandOneRepRange(n))
    else:
      raise newException(RegexError, (
        "Invalid repetition range, either " &
        "char, shorthand (i.e: \\w), group, or set " &
        "expected before repetition range"))
293 |
func joinAtoms(expression: seq[Node]): seq[Node] =
  ## Put a ``~`` joiner between atoms. An atom is
  ## a piece of expression that would lose
  ## meaning when breaking it up (i.e.: ``a~(b|c)*~d``)
  result = newSeqOfCap[Node](expression.len * 2)
  # greater than zero when an atom precedes the current node
  var pending = 0
  for node in expression:
    case node.kind
    of matchableKind, assertionKind:
      if pending > 0:
        result.add(initJoinerNode())
      pending = 1
    of reGroupStart:
      if pending > 0:
        result.add(initJoinerNode())
      pending = 0
    of reOr:
      pending = 0
    of reGroupEnd,
        reZeroOrMore,
        reOneOrMore,
        reZeroOrOne,
        reRepRange:
      inc pending
    else:
      assert false
    result.add(node)
322 |
type
  Associativity = enum
    ## Operator associativity. Unary ops are
    ## right[-to-left] and binary ops are
    ## left[-to-right]
    asyRight
    asyLeft
  OpsPA = tuple
    ## Precedence and associativity of an operator;
    ## a higher number binds tighter
    precedence: int
    associativity: Associativity
333 |
func opsPA(nk: NodeKind): OpsPA =
  ## return the precedence and
  ## associativity of a given node kind.
  ## Repetitions bind tighter than the
  ## joiner, which binds tighter than ``|``
  assert nk in opKind
  case nk
  of reRepRange, reZeroOrMore, reOneOrMore, reZeroOrOne:
    result = (5, asyRight)
  of reJoiner:
    result = (4, asyLeft)
  of reOr:
    result = (3, asyLeft)
  else:
    assert false
350 |
func hasPrecedence(a: NodeKind, b: NodeKind): bool =
  ## Check ``b`` has precedence over ``a``.
  ## Both ``a`` and ``b`` are expected to
  ## be valid operators. Unary operators such
  ## as: ``*``, ``?`` and ``+`` have right-to-left
  ## associativity. Binary operators
  ## such as: ``|`` (or) and ``~`` (joiner) have
  ## left-to-right associativity
  let incoming = opsPA(b)
  let onStack = opsPA(a)
  case incoming.associativity
  of asyRight:
    result = incoming.precedence <= onStack.precedence
  of asyLeft:
    result = incoming.precedence < onStack.precedence
364 |
func popGreaterThan(ops: var seq[Node], op: Node): seq[Node] =
  ## Pop operators off the top of `ops` for as long as
  ## `hasPrecedence` holds between the top one and `op`,
  ## and return them in popping order. Stop at the
  ## first non-operator
  assert op.kind in opKind
  result = newSeqOfCap[Node](ops.len)
  while ops.len > 0:
    let top = ops[ops.len - 1].kind
    if top notin opKind or not top.hasPrecedence(op.kind):
      break
    result.add(ops.pop())
372 |
func popUntilGroupStart(ops: var seq[Node]): seq[Node] =
  ## Pop nodes off the top of `ops` up to (and including)
  ## the nearest group start, and return them in popping order.
  ## `ops` is expected to contain a group start
  result = newSeqOfCap[Node](ops.len)
  var found = false
  while not found:
    let top = ops.pop()
    found = top.kind == reGroupStart
    result.add(top)
380 |
func rpn(expression: seq[Node]): seq[Node] =
  ## An adaptation of the Shunting-yard algorithm
  ## for producing `Reverse Polish Notation` out of
  ## an expression specified in infix notation.
  ## It supports regex primitives including groups.
  ## The point of doing this is greatly simplifying
  ## the parsing of the regular expression into an NFA.
  ## Suffix notation removes nesting and so it can
  ## be parsed in a linear way instead of recursively
  result = newSeqOfCap[Node](expression.len)
  var opStack: seq[Node]
  for node in expression:
    case node.kind
    of matchableKind, assertionKind:
      result.add(node)
    of reGroupStart:
      opStack.add(node)
    of reGroupEnd:
      result.add(opStack.popUntilGroupStart())
      result.add(node)
    of opKind:
      result.add(opStack.popGreaterThan(node))
      opStack.add(node)
    else:
      assert false
  # flush the remaining operators, top of the stack first
  for pos in countdown(opStack.len - 1, 0):
    result.add(opStack[pos])
409 |
func transformExp*(
  exp: seq[Node],
  groups: var GroupsCapture
): seq[Node] {.inline.} =
  ## Run every expression transformation in order:
  ## ``fillGroups`` (which receives ``groups``),
  ## ``greediness``, ``applyFlags``,
  ## ``expandRepRange``, ``joinAtoms`` and
  ## finally ``rpn``
  let grouped = fillGroups(exp, groups)
  let flagged = applyFlags(greediness(grouped))
  let joined = joinAtoms(expandRepRange(flagged))
  result = rpn(joined)
421 |
--------------------------------------------------------------------------------
/src/nregex/private/nfa.nim:
--------------------------------------------------------------------------------
1 | import std/deques
2 |
3 | import nodetype
4 | import common
5 |
func check(cond: bool, msg: string) =
  ## Raise a ``RegexError`` carrying ``msg``
  ## unless ``cond`` holds
  if cond:
    return
  raise newException(RegexError, msg)
9 |
type
  End = seq[int16]
    ## Indices of all the last
    ## states of a given state.
    ## Keeping them around avoids having
    ## to recurse a state to find its ends,
    ## but they have to be kept up-to-date
17 |
func combine(
  nfa: var seq[Node],
  ends: var seq[End],
  org: int16,
  target: int16
) =
  ## Connect ``org`` to ``target``: every
  ## transition of the ends of ``org`` that
  ## points to an EOE node is redirected to
  ## ``target``. Afterwards the ends of ``org``
  ## are replaced by the ends of ``target``
  for endState in ends[org]:
    for nextState in nfa[endState].next.mitems:
      if nfa[nextState].kind == reEOE:
        nextState = target
  ends[org] = ends[target]
31 |
func update(
  ends: var seq[End],
  ni: int16,
  next: openArray[int16]
) =
  ## Rebuild the ends of state ``ni`` out of
  ## its ``next`` transitions. A transition
  ## to ``0`` (EOE) makes ``ni`` an end of
  ## itself; any other transition contributes
  ## the ends of the state it points to
  ends[ni].setLen(0)
  for target in next:
    if target != 0:
      ends[ni].add(ends[target])
    else:
      ends[ni].add(ni)
48 |
# State index of the EOE node;
# ``eNfa`` adds that node before any other
const eoe = 0'i16
50 |
func eNfa(expression: seq[Node]): seq[Node] =
  ## Thompson's construction.
  ## Build an NFA with e-transitions out of
  ## ``expression``. Operators pop their operands
  ## from a stack of state indices, so the input
  ## is consumed as a postfix sequence.
  ## The EOE node is stored first and a ``reSkip``
  ## node pointing to the remaining state(s)
  ## is appended last.
  ## Raises ``RegexError`` (through ``check``) when
  ## the expression is too long or when ``|``, ``*``,
  ## ``+`` or ``?`` has nothing to operate on
  result = newSeqOfCap[Node](expression.len + 2)
  # index 0 is the EOE node (see ``eoe``)
  result.add(initEOENode())
  var
    # ends of every state, indexed by state
    ends = newSeq[End](expression.len + 1)
    # stack of states still waiting for an operator
    states = newSeq[int16]()
  if expression.len == 0:
    # empty expression: the start node points to EOE
    states.add(eoe)
  for n in expression:
    # work on a copy, its transitions get filled in below
    var n = n
    assert n.next.len == 0
    # state indices are stored as int16
    check(
      result.high < int16.high,
      ("The expression is too long, " &
       "limit is ~$#") %% $int16.high)
    # index ``n`` gets if it is added to the result
    let ni = result.len.int16
    case n.kind
    of matchableKind, assertionKind:
      # operand: a single state going to EOE
      n.next.add(eoe)
      ends.update(ni, [eoe])
      result.add(n)
      states.add(ni)
    of reJoiner:
      # concatenation: the ends of A are wired to B;
      # no node is added for the joiner itself
      let
        stateB = states.pop()
        stateA = states.pop()
      result.combine(ends, stateA, stateB)
      states.add(stateA)
    of reOr:
      check(
        states.len >= 2,
        "Invalid OR conditional, nothing to " &
        "match at right/left side of the condition")
      # alternation: a new state branching to A and B
      let
        stateB = states.pop()
        stateA = states.pop()
      n.next.add([stateA, stateB])
      ends.update(ni, n.next)
      result.add(n)
      states.add(ni)
    of reZeroOrMore:
      check(
        states.len >= 1,
        "Invalid `*` operator, " &
        "nothing to repeat")
      # loop: the new state goes to A or to EOE,
      # and the ends of A come back to the new state
      let stateA = states.pop()
      n.next.add([stateA, eoe])
      ends.update(ni, n.next)
      result.combine(ends, stateA, ni)
      result.add(n)
      # the loop state is the entry point
      states.add(ni)
      # NOTE(review): the two transitions are swapped when
      # ``isGreedy`` is set; presumably their order encodes
      # the matching priority - confirm
      if n.isGreedy:
        swap(result[^1].next[0], result[^1].next[1])
    of reOneOrMore:
      check(
        states.len >= 1,
        "Invalid `+` operator, " &
        "nothing to repeat")
      # same wiring as ``*``, except A stays the entry
      # point, so the loop state is only reached through A
      let stateA = states.pop()
      n.next.add([stateA, eoe])
      ends.update(ni, n.next)
      result.combine(ends, stateA, ni)
      result.add(n)
      states.add(stateA)
      if n.isGreedy:
        swap(result[^1].next[0], result[^1].next[1])
    of reZeroOrOne:
      check(
        states.len >= 1,
        "Invalid `?` operator, " &
        "nothing to make optional")
      # optional: the new state goes to A or to EOE;
      # the ends of A are left untouched (no loop back)
      let stateA = states.pop()
      n.next.add([stateA, eoe])
      ends.update(ni, n.next)
      result.add(n)
      states.add(ni)
      if n.isGreedy:
        swap(result[^1].next[0], result[^1].next[1])
    of reGroupStart:
      # the group start is placed in front of its content
      let stateA = states.pop()
      n.next.add(stateA)
      ends.update(ni, n.next)
      result.add(n)
      states.add(ni)
    of reGroupEnd:
      # the group end is appended to the ends of the
      # state on top of the stack, which stays the entry
      n.next.add(eoe)
      ends.update(ni, n.next)
      let stateA = states.pop()
      result.combine(ends, stateA, ni)
      result.add(n)
      states.add(stateA)
    else:
      assert(false, "Unhandled node: $#" %% $n.kind)
  assert states.len == 1
  # start node; ``eRemoval`` takes the last node as the start
  result.add(Node(
    kind: reSkip,
    cp: "#".toRune,
    next: states))
150 |
type
  # indices of the states for which ``isTransitionZ``
  # holds, collected along one path of a closure
  Zclosure = seq[int16]
  # reached states, each one paired with the
  # ``Zclosure`` of the path leading to it
  TeClosure = seq[(int16, Zclosure)]
154 |
func isTransitionZ(n: Node): bool {.inline.} =
  ## Return whether ``n`` counts as a Z transition:
  ## capturing groups, assertions, every kind
  ## of ``matchTransitionKind`` other than
  ## ``reInSet``, and ``reInSet`` nodes holding
  ## a shorthand that is not one of
  ## ``reAny``, ``reAnyNl``, ``reDigit``, ``reWord``
  case n.kind
  of groupKind:
    result = n.isCapturing
  of reInSet:
    # XXX always false in ascii mode
    result = false
    for shorthand in n.shorthands:
      if shorthand.kind notin {reAny, reAnyNl, reDigit, reWord}:
        result = true
        break
  of assertionKind:
    result = true
  of matchTransitionKind - {reInSet}:
    # XXX false in ascii mode
    result = true
  else:
    result = false
174 |
func countTransitionsZ(nfa: seq[Node]): int =
  ## Count the nodes of ``nfa`` for
  ## which ``isTransitionZ`` holds
  for node in nfa:
    if node.isTransitionZ:
      inc result
179 |
func teClosure(
  result: var TeClosure,
  nfa: seq[Node],
  state: int16,
  visited: var set[int16],
  zTransitions: Zclosure
) =
  ## Walk the transitions of ``nfa`` depth-first,
  ## starting at ``state``, until a matchable or
  ## EOE state is found on every path. Each such
  ## state is appended to ``result`` together with
  ## the Z transitions crossed to reach it
  ## (itself included, if it is one).
  ## States in ``visited`` are skipped
  if state notin visited:
    visited.incl(state)
    var zPath = zTransitions
    if nfa[state].isTransitionZ:
      zPath.add(state)
    if nfa[state].kind notin matchableKind + {reEOE}:
      # keep following the transitions
      for nextState in nfa[state].next:
        teClosure(result, nfa, nextState, visited, zPath)
    else:
      result.add((state, zPath))
198 |
func teClosure(
  result: var TeClosure,
  nfa: seq[Node],
  state: int16
) =
  ## Collect into ``result`` the closure of
  ## every transition of ``state``. All of them
  ## share one visited set and start out
  ## with no Z transitions
  var
    seen: set[int16]
    noZ: Zclosure
  for nextState in nfa[state].next:
    teClosure(result, nfa, nextState, seen, noZ)
208 |
type
  # one list of int16 values per state
  TransitionsAll* = seq[seq[int16]]
  # list of node sequences, one per stored Z closure
  ZclosureStates* = seq[seq[Node]]
  # transition tables filled in by ``eRemoval``
  Transitions* = object
    # ``all[s]`` lists the states reachable from state ``s``
    all*: TransitionsAll
    # parallel to ``all``: index into ``z`` for that
    # transition, or ``-1`` when it has no Z closure.
    # Emptied when ``z`` holds nothing
    allZ*: TransitionsAll
    # the Z closures referenced by ``allZ``
    z*: ZclosureStates
    # number of nodes for which ``isTransitionZ`` holds
    zCount*: int
217 |
218 | # XXX do not add char classes transitions \w, \d, etc in ascii mode
func eRemoval(
  eNfa: seq[Node],
  transitions: var Transitions
): seq[Node] =
  ## Remove e-transitions and return
  ## the remaining states renumbered in the
  ## order they are reached. The state
  ## transitions, submatches and zero matches
  ## are stored into ``transitions``.
  ## Transitions are added in matching order (BFS),
  ## which may help matching performance.
  ## The last node of ``eNfa`` is taken as the start state
  #echo eNfa
  # mutable copy, its transitions get rewritten below
  var eNfa = eNfa
  transitions.all.setLen(eNfa.len)
  transitions.allZ.setLen(eNfa.len)
  # maps an ``eNfa`` index to its new index;
  # ``-1`` means the state was not reached (yet)
  var statesMap = newSeq[int16](eNfa.len)
  for i in 0 .. statesMap.len-1:
    statesMap[i] = -1
  # next free index in the new numbering
  var statePos = 0'i16
  let start = int16(eNfa.len-1)
  statesMap[start] = statePos
  inc statePos
  # scratch buffers reused on every iteration
  var closure: TeClosure
  var zc: seq[Node]
  # FIFO queue of states to process (``addFirst``/``popLast``)
  var qw = initDeque[int16]()
  qw.addFirst(start)
  # states that were ever queued
  var qu: set[int16]
  qu.incl(start)
  while qw.len > 0:
    let qa = qw.popLast()
    closure.setLen(0)
    teClosure(closure, eNfa, qa)
    # replace the transitions of ``qa`` by its closure
    eNfa[qa].next.setLen(0)
    for qb, zclosure in closure.items:
      eNfa[qa].next.add(qb)
      if statesMap[qb] == -1:
        statesMap[qb] = statePos
        inc statePos
      assert statesMap[qa] > -1
      assert statesMap[qb] > -1
      transitions.all[statesMap[qa]].add(statesMap[qb])
      # ``-1`` unless the transition has a Z closure
      transitions.allZ[statesMap[qa]].add(-1'i16)
      zc.setLen(0)
      for z in zclosure:
        zc.add(eNfa[z])
      if zc.len > 0:
        # store the closure and point the transition to it
        transitions.z.add(zc)
        transitions.allZ[statesMap[qa]][^1] = int16(transitions.z.len-1)
      if qb notin qu:
        qu.incl(qb)
        qw.addFirst(qb)
  # drop the entries of the states that were not reached
  transitions.all.setLen(statePos)
  transitions.allZ.setLen(statePos)
  if transitions.z.len == 0:
    transitions.allZ = @[]
  transitions.zCount = eNfa.countTransitionsZ
  # build the renumbered NFA
  result = newSeq[Node](statePos)
  for en, nn in statesMap.pairs:
    if nn == -1:
      continue
    # reached Z-transition states are replaced
    # by a ``reAnyNl`` placeholder node
    result[nn] = if isTransitionZ(eNfa[en]):
        doAssert eNfa[en].kind in matchableKind
        Node(kind: reAnyNl, cp: "#".toRune)
      else:
        eNfa[en]
    # translate the transitions to the new numbering
    result[nn].next.setLen(0)
    for en2 in eNfa[en].next:
      doAssert statesMap[en2] > -1
      result[nn].next.add(statesMap[en2])
286 |
func nfa*(
  exp: seq[Node],
  transitions: var Transitions
): seq[Node] =
  ## Build the e-NFA of ``exp`` and then
  ## remove its e-transitions; the resulting
  ## tables are stored into ``transitions``
  let withETransitions = eNfa(exp)
  result = eRemoval(withETransitions, transitions)
292 |
--------------------------------------------------------------------------------
/src/nregex/private/nodematch.nim:
--------------------------------------------------------------------------------
1 | import std/unicode
2 | import std/sets
3 |
4 | import pkg/unicodedb/properties
5 | import pkg/unicodedb/types
6 | import pkg/unicodeplus
7 |
8 | import nodetype
9 | import common
10 |
func isWord*(r: Rune): bool {.inline.} =
  ## return ``true`` if ``utmWord`` is
  ## among the unicode types of the rune
  result = contains(unicodeTypes(r), utmWord)
13 |
func isWordAscii(r: Rune): bool {.inline.} =
  ## return ``true`` if the given
  ## rune is in ``[A-Za-z0-9_]`` range
  let c = r.int
  result =
    (c >= 'a'.ord and c <= 'z'.ord) or
    (c >= 'A'.ord and c <= 'Z'.ord) or
    (c >= '0'.ord and c <= '9'.ord) or
    c == '_'.ord
25 |
template isWordBoundaryImpl(r, nxt, isWordProc): bool =
  ## ``true`` when exactly one of ``r`` and ``nxt``
  ## is a word character according to ``isWordProc``.
  ## A rune with a negative value never counts as one
  (r.int > -1 and isWordProc(r)) !=
    (nxt.int > -1 and isWordProc(nxt))
29 |
func isWordBoundary(r: Rune, nxt: Rune): bool {.inline.} =
  ## check if there is a word boundary
  ## between ``r`` and ``nxt``
  ## (i.e the end or the start of a word)
  result = isWordBoundaryImpl(r, nxt, isWord)
34 |
func isWordBoundaryAscii(r: Rune, nxt: Rune): bool {.inline.} =
  ## check if there is a word boundary
  ## between ``r`` and ``nxt``.
  ## Only ascii characters count as word characters
  result = isWordBoundaryImpl(r, nxt, isWordAscii)
39 |
func match*(n: Node, r: Rune, nxt: Rune): bool =
  ## match for ``Node`` of assertion kind.
  ## ``r`` and ``nxt`` are the characters
  ## around the position being asserted.
  ## Return whether the node
  ## matches them or not
  case n.kind
  of reStart, reStartSym:
    result = r == invalidRune
  of reStartSymML:
    result = r == invalidRune or r == lineBreakRune
  of reEnd, reEndSym:
    result = nxt == invalidRune
  of reEndSymML:
    result = nxt == invalidRune or nxt == lineBreakRune
  of reWordBoundary:
    result = isWordBoundary(r, nxt)
  of reWordBoundaryAscii:
    result = isWordBoundaryAscii(r, nxt)
  of reNotWordBoundary:
    result = not isWordBoundary(r, nxt)
  of reNotWordBoundaryAscii:
    result = not isWordBoundaryAscii(r, nxt)
  of reLookahead:
    result = n.cp == nxt
  of reNotLookahead:
    result = n.cp != nxt
  of reLookbehind:
    result = n.cp == r
  of reNotLookbehind:
    result = n.cp != r
  else:
    # not an assertion
    assert false
    result = false
74 |
func contains(sr: seq[Slice[Rune]], r: Rune): bool =
  ## return ``true`` if ``r`` falls
  ## within any of the given ranges
  for runeRange in sr:
    if r in runeRange:
      return true
  result = false
81 |
func isWhiteSpace(r: Rune): bool {.inline.} =
  ## return ``true`` if ``utmWhiteSpace`` is
  ## among the unicode types of the rune
  result = contains(unicodeTypes(r), utmWhiteSpace)
84 |
func isWhiteSpaceAscii(r: Rune): bool {.inline.} =
  ## return ``true`` if the given rune is
  ## a space, tab, line feed, vertical tab,
  ## form feed or carriage return
  case r.int
  of '\t'.ord .. '\r'.ord, ' '.ord:
    # ``\t``, ``\L``, ``\v``, ``\f`` and ``\r``
    # are contiguous (9 to 13)
    result = true
  else:
    result = false
96 |
func isDigitAscii(r: Rune): bool {.inline.} =
  ## return ``true`` if the given
  ## rune is in ``[0-9]`` range
  let c = r.int
  result = c >= '0'.ord and c <= '9'.ord
103 |
func isAnyAscii(r: Rune): bool {.inline.} =
  ## return ``true`` if the rune is not a line
  ## break and its value is at most ``int8.high``
  if r == lineBreakRune:
    result = false
  else:
    result = r.int <= int8.high
107 |
func swapCase*(r: Rune): Rune =
  ## return the lower case version of ``r``
  ## if it differs from ``r``, otherwise
  ## return its upper case version
  let lowered = r.toLower()
  if lowered != r:
    lowered
  else:
    r.toUpper()
113 |
func match*(n: Node, r: Rune): bool =
  ## match for ``Node`` of matchable kind.
  ## Return whether the node matches
  ## the character ``r`` or not.
  ## ``r`` must not be ``invalidRune``
  assert r != invalidRune
  case n.kind
  of reEOE:
    result = false
  # any character
  of reAnyNL:
    result = true
  of reAny:
    result = r != lineBreakRune
  of reAnyAscii:
    result = r.isAnyAscii()
  of reAnyNLAscii:
    result = r.isAnyAscii() or r == lineBreakRune
  # shorthands
  of reWord:
    result = r.isWord()
  of reNotAlphaNum:
    result = not r.isWord()
  of reWordAscii:
    result = r.isWordAscii()
  of reNotAlphaNumAscii:
    result = not r.isWordAscii()
  of reDigit:
    result = r.isDecimal()
  of reNotDigit:
    result = not r.isDecimal()
  of reDigitAscii:
    result = r.isDigitAscii()
  of reNotDigitAscii:
    result = not r.isDigitAscii()
  of reWhiteSpace:
    result = r.isWhiteSpace()
  of reNotWhiteSpace:
    result = not r.isWhiteSpace()
  of reWhiteSpaceAscii:
    result = r.isWhiteSpaceAscii()
  of reNotWhiteSpaceAscii:
    result = not r.isWhiteSpaceAscii()
  # unicode categories
  of reUCC:
    result = r.unicodeCategory() in n.cc
  of reNotUCC:
    result = r.unicodeCategory() notin n.cc
  # sets
  of reInSet, reNotSet:
    var found = r in n.cps or r in n.ranges
    if not found:
      for shorthand in n.shorthands:
        if shorthand.match(r):
          found = true
          break
    # ``reInSet`` wants a hit, ``reNotSet`` wants a miss
    result = found == (n.kind == reInSet)
  # single characters
  of reCharCI:
    result = r == n.cp or r == n.cp.swapCase()
  else:
    assert n.kind == reChar
    result = n.cp == r
173 |
--------------------------------------------------------------------------------
/src/nregex/private/nodetype.nim:
--------------------------------------------------------------------------------
1 | import std/unicode
2 | import std/sets
3 |
4 | import pkg/unicodedb/properties
5 |
6 | import common
7 |
8 | type
9 | Flag* = enum
10 | flagCaseInsensitive, # i
11 | flagNotCaseInsensitive, # -i
12 | flagMultiLine, # m
13 | flagNotMultiLine, # -m
14 | flagAnyMatchNewLine, # s
15 | flagNotAnyMatchNewLine, # -s
16 | flagUnGreedy, # U
17 | flagNotUnGreedy, # -U
18 | flagUnicode, # u
19 | flagNotUnicode, # -u
20 | flagVerbose, # x
21 | flagNotVerbose # -x
22 | NodeKind* = enum
23 | reChar,
24 | reCharCi,
25 | reJoiner, # ~
26 | reGroupStart, # (
27 | reGroupEnd, # )
28 | reOr, # |
29 | reZeroOrMore, # *
30 | reOneOrMore, # +
31 | reZeroOrOne, # ?
32 | reRepRange, # {n,m}
33 | reStartSym, # ^
34 | reEndSym, # $
35 | reStartSymML, # ^ multi-line
36 | reEndSymML, # $ multi-line
37 | reStart, # \A
38 | reEnd, # \z
39 | reWordBoundary, # \b
40 | reNotWordBoundary, # \B
41 | reWord, # \w
42 | reDigit, # \d
43 | reWhiteSpace, # \s
44 | reUCC, # \pN or \p{Nn}
45 | reNotAlphaNum, # \W
46 | reNotDigit, # \D
47 | reNotWhiteSpace, # \S
48 | reNotUCC, # \PN or \P{Nn}
49 | reAny, # .
50 | reAnyNl, # . new-line
51 | reWordBoundaryAscii, # \b ascii only
52 | reNotWordBoundaryAscii, # \B ascii only
53 | reWordAscii, # \w ascii only
54 | reDigitAscii, # \d ascii only
55 | reWhiteSpaceAscii, # \s ascii only
56 | reNotAlphaNumAscii, # \W ascii only
57 | reNotDigitAscii, # \D ascii only
58 | reNotWhiteSpaceAscii, # \S ascii only
59 | reAnyAscii, # . ascii only
60 | reAnyNlAscii, # . new-line ascii only
61 | reInSet, # [abc]
62 | reNotSet, # [^abc]
63 | reLookahead, # (?=...)
64 | reLookbehind, # (?<=...)
65 | reNotLookahead, # (?!...)
66 | reNotLookbehind, # (? sc.s.high
40 |
func prev*[T](sc: Scanner[T]): T =
  ## return the item right before the current position
  let idx = sc.pos - 1
  result = sc.s[idx]
43 |
func curr*[T](sc: Scanner[T]): T =
  ## return the item at the current
  ## position without consuming it
  result = sc.s[sc.pos]
46 |
func next*[T](sc: Scanner[T]): T =
  ## return the item at the current position
  ## and move the scanner one item forward
  result = sc.s[sc.pos]
  sc.pos += 1
51 |
func peekImpl[T](sc: Scanner[T], default: T): T {.inline.} =
  ## same as ``curr``, except ``default``
  ## is returned once the position is
  ## past the last item of the data
  if sc.pos <= sc.s.high:
    result = sc.s[sc.pos]
  else:
    result = default
60 |
func peek*(sc: Scanner[Rune]): Rune =
  ## return the current rune, or ``invalidRune``
  ## when the data is fully consumed
  result = sc.peekImpl(invalidRune)
63 |
func peek*(sc: Scanner[Node]): Node =
  ## return the current node, or an EOE
  ## node when the data is fully consumed
  result = sc.peekImpl(initEOENode())
66 |
iterator peek*[T](sc: Scanner[T]): (T, T) =
  ## iterate over the scanner, pairing each
  ## item with what ``peek`` returns right
  ## after the item was produced
  for current in sc:
    let upcoming = sc.peek
    yield (current, upcoming)
70 |
func find*(sc: Scanner[Rune], r: Rune): int =
  ## return the number of chars between the
  ## current position and the first ``r``.
  ## ``-1`` is returned when the char is not found.
  ## The scanner's position is restored before returning
  let startPos = sc.pos
  var consumed = 0
  result = -1
  while not sc.finished:
    if sc.curr == r:
      result = consumed
      break
    discard sc.next()
    inc consumed
  sc.pos = startPos
86 |
--------------------------------------------------------------------------------
/tests/nim.cfg:
--------------------------------------------------------------------------------
1 | --path:"../src/"
2 |
--------------------------------------------------------------------------------