├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTORS.md ├── LICENSE ├── README.md ├── docs ├── index.html ├── nimdoc.out.css ├── nregex.html └── nregex │ ├── common.html │ ├── dfa.html │ ├── dfamacro.html │ ├── dfamatch.html │ ├── exptransformation.html │ ├── nfa.html │ ├── nodematch.html │ ├── nodetype.html │ ├── parser.html │ ├── private │ ├── common.html │ ├── dfa.html │ ├── dfamacro.html │ ├── dfamatch.html │ ├── exptransformation.html │ ├── nfa.html │ ├── nodematch.html │ ├── nodetype.html │ ├── parser.html │ └── scanner.html │ └── scanner.html ├── nregex.nimble ├── src ├── nregex.nim └── nregex │ └── private │ ├── common.nim │ ├── dfa.nim │ ├── dfamacro.nim │ ├── dfamatch.nim │ ├── exptransformation.nim │ ├── nfa.nim │ ├── nodematch.nim │ ├── nodetype.nim │ ├── parser.nim │ └── scanner.nim └── tests ├── nim.cfg └── tests.nim /.gitignore: -------------------------------------------------------------------------------- 1 | nimcache/ 2 | bin/nregex 3 | bin/nregex.js 4 | tests/tests 5 | tests/tests.js 6 | docs/ugh -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | services: 2 | - docker 3 | env: 4 | - NIM=1.0.4 5 | before_install: 6 | - docker pull nimlang/nim:$NIM 7 | script: 8 | - docker run --rm -v `pwd`:/usr/src/app -w /usr/src/app nimlang/nim:$NIM /bin/bash -c "nimble install -y; nimble test" 9 | notifications: 10 | email: 11 | on_failure: never 12 | on_success: never 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v0.0.3 2 | ================== 3 | 4 | * Added `findAll` API 5 | 6 | v0.0.2 7 | ================== 8 | 9 | * DFA minimization 10 | * Ascii flag 11 | * Added `match(string, Regex)` API 12 | 13 | v0.0.1 14 | ================== 15 | 16 | * Initial release 17 | 
-------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | List of contributors: 2 | 3 | * timotheecour (Timothee Cour, timothee.cour2@gmail.com) 4 | * data-man (Dmitry Atamanov) 5 | * xmonader (xmonader@gmail.com) 6 | * kaushalmodi (Kaushal Modi) 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Esteban Castro Borsani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nregex 2 | 3 | [![licence](https://img.shields.io/github/license/nitely/nregex.svg?style=flat-square)](https://raw.githubusercontent.com/nitely/nregex/master/LICENSE) 4 | 5 | This is currently a PoC for a DFA that supports submatches extraction. The match time complexity is linear in the length of the text to match. [Read the article](https://nitely.github.io/2020/01/19/a-dfa-for-submatches-extraction.html) if you are interested in the implementation. 6 | 7 | > [!WARNING] 8 | > Please use [nim-regex](https://github.com/nitely/nim-regex) for anything serious, instead of this package. 9 | 10 | ## Install 11 | 12 | ``` 13 | nimble install nregex 14 | ``` 15 | 16 | ## Compatibility 17 | 18 | Nim 1.0.4+ 19 | 20 | ## Usage 21 | 22 | ```nim 23 | import pkg/nregex 24 | 25 | var m: RegexMatch 26 | doAssert match("abc", re"abc", m) 27 | doAssert match("ab", re"a(b|c)", m) 28 | 29 | doAssert match("aabcd", re"(aa)bcd", m) 30 | doAssert m.group(0) == @[0 .. 1] 31 | doAssert match("aab", re"((a)*b)", m) 32 | doAssert m.group(0) == @[0 .. 2] 33 | doAssert m.group(1) == @[0 .. 0, 1 .. 1] 34 | 35 | doAssert "abcd".find(re"bc", m) 36 | doAssert "2222".find(re"(22)*", m) 37 | doAssert m.group(0) == @[0 .. 1, 2 .. 3] 38 | 39 | doAssert re"bc" in "abcd" 40 | doAssert re"(23)+" in "112323211" 41 | ``` 42 | 43 | ## Docs 44 | 45 | [Read the docs](https://nitely.github.io/nregex/) 46 | 47 | ## Benchmarks 48 | 49 | The following benchmarks show nregex is up to 22 times faster than PCRE. However, when the RE contains capture groups, PCRE is about 4 times faster than nregex. 
50 | 51 | | | relative | time/iter | iters/s | regex | text 52 | | --- | --- | --- | --- | --- | --- 53 | CPU | | 294.85ps | 3.39G 54 | PCRE | | 1.10ms | 912.11 | ^\w\*sol\w\*$ | (a\*100000)sol(b\*100000) 55 | nregex | 739.52% | 148.25us | 6.75K 56 | PCRE | | 174.87ns | 5.72M | ^[0-9]+-[0-9]+-[0-9]+$ | 650-253-0001 57 | nregex | 2280.84% | 7.67ns | 130.43M 58 | PCRE | | 179.23ns | 5.58M | ^[0-9]+..+$ | 650-253-0001 59 | nregex | 1447.15% | 12.38ns | 80.74M 60 | 61 | ## Tests 62 | 63 | ``` 64 | nimble test 65 | ``` 66 | 67 | ## LICENSE 68 | 69 | MIT 70 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |

Please follow this link.

8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/nregex/common.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/common 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/common

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 136 | 137 |
138 |
139 |
140 | 141 |

142 |
143 |

Types

144 |
145 | 146 |
RegexError = object of ValueError
147 |
148 | 149 | 150 | 151 |
152 | 153 |
154 |
155 |

Consts

156 |
157 | 158 |
invalidRune = -1'i32
159 |
160 | 161 | 162 | 163 |
164 | 165 |
lineBreakRune = 10'i32
166 |
167 | 168 | 169 | 170 |
171 | 172 |
173 |
174 |

Procs

175 |
176 | 177 |
proc toRune(s: string): Rune {...}{.raises: [], tags: [].}
178 |
179 | 180 | 181 | 182 |
183 | 184 |
proc `<=`(x, y: Rune): bool {...}{.raises: [], tags: [].}
185 |
186 | 187 | 188 | 189 |
190 | 191 |
proc cmp(x, y: Rune): int {...}{.raises: [], tags: [].}
192 |
193 | 194 | 195 | 196 |
197 | 198 |
proc `%%`(formatstr: string; a: openArray[string]): string {...}{.noSideEffect, raises: [],
199 |     tags: [].}
200 |
201 | 202 | same as "$#" % ["foo"] but returns empty string on error 203 | 204 |
205 | 206 |
proc `%%`(formatstr: string; a: string): string {...}{.raises: [], tags: [].}
207 |
208 | 209 | 210 | 211 |
212 | 213 |
214 | 215 |
216 |
217 | 218 |
219 | 224 |
225 |
226 |
227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /docs/nregex/dfa.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/dfa 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/dfa

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 153 | 154 |
155 |
156 |
157 | 158 |

159 |
160 |

Imports

161 |
162 | nodematch, nodetype, common 163 |
164 |
165 |

Types

166 |
167 | 168 |
AlphabetSym = int32
169 |
170 | 171 | 172 | 173 |
174 | 175 |
Closure = HashSet[int16]
176 |
177 | 178 | 179 | 180 |
181 | 182 |
DfaRow = Table[AlphabetSym, int32]
183 |
184 | 185 | 186 | 187 |
188 | 189 |
DfaClosure = Table[AlphabetSym, int32]
190 |
191 | 192 | 193 | 194 |
195 | 196 |
Dfa = object
197 |   table*: seq[DfaRow]
198 |   cs*: seq[Closure]
199 |   closures*: seq[DfaClosure]
200 | 
201 |
202 | 203 | 204 | 205 |
206 | 207 |
208 |
209 |

Consts

210 |
211 | 212 |
symEoe = -1'i32
213 |
214 | 215 | 216 | 217 |
218 | 219 |
symWord = -3'i32
220 |
221 | 222 | 223 | 224 |
225 | 226 |
symDigit = -4'i32
227 |
228 | 229 | 230 | 231 |
232 | 233 |
symAny = -6'i32
234 |
235 | 236 | 237 | 238 |
239 | 240 |
symAnyNl = -7'i32
241 |
242 | 243 | 244 | 245 |
246 | 247 |
248 |
249 |

Funcs

250 |
251 | 252 |
func dfa(nfa: seq[Node]; alphabet: var seq[AlphabetSym]): Dfa {...}{.
253 |     raises: [IndexError, KeyError], tags: [].}
254 |
255 | 256 | Powerset construction 257 | 258 |
259 | 260 |
func minimize(dfa: Dfa; alphabet: seq[AlphabetSym]): Dfa {...}{.raises: [KeyError], tags: [].}
261 |
262 | 263 | Hopcroft 264 | 265 |
266 | 267 |
268 | 269 |
270 |
271 | 272 |
273 | 278 |
279 |
280 |
281 | 282 | 283 | 284 | -------------------------------------------------------------------------------- /docs/nregex/dfamacro.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/dfamacro 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/dfamacro

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 |
    100 |
  • 101 | Imports 102 |
      103 | 104 |
    105 |
  • 106 |
  • 107 | Funcs 108 | 114 |
  • 115 | 116 |
117 | 118 |
119 |
120 |
121 | 122 |

123 |
124 |

Imports

125 |
126 | common, nodetype, nodematch, nfa, dfa, dfamatch 127 |
128 |
129 |

Funcs

130 |
131 | 132 |
func matchImpl(text: string; regex: static Regex; m: var RegexMatch;
133 |               flags: static MatchFlags; start = 0): bool {...}{.inline.}
134 |
135 | 136 | 137 | 138 |
139 | 140 |
141 | 142 |
143 |
144 | 145 |
146 | 151 |
152 |
153 |
154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /docs/nregex/exptransformation.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/exptransformation 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/exptransformation

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 126 | 127 |
128 |
129 |
130 | 131 |

132 |
133 |

Imports

134 |
135 | nodetype, common, scanner 136 |
137 |
138 |

Types

139 |
140 | 141 |
GroupsCapture = object
142 |   count*: int16
143 |   names*: OrderedTable[string, int16]
144 | 
145 |
146 | 147 | 148 | 149 |
150 | 151 |
152 |
153 |

Funcs

154 |
155 | 156 |
func transformExp(exp: seq[Node]; groups: var GroupsCapture): seq[Node] {...}{.inline,
157 |     raises: [RegexError], tags: [].}
158 |
159 | 160 | 161 | 162 |
163 | 164 |
165 | 166 |
167 |
168 | 169 |
170 | 175 |
176 |
177 |
178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /docs/nregex/nfa.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/nfa 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/nfa

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 132 | 133 |
134 |
135 |
136 | 137 |

138 |
139 |

Imports

140 |
141 | nodetype, common 142 |
143 |
144 |

Types

145 |
146 | 147 |
TransitionsAll = seq[seq[int16]]
148 |
149 | 150 | 151 | 152 |
153 | 154 |
ZclosureStates = seq[seq[Node]]
155 |
156 | 157 | 158 | 159 |
160 | 161 |
Transitions = object
162 |   all*: TransitionsAll
163 |   allZ*: TransitionsAll
164 |   z*: ZclosureStates
165 |   zCount*: int
166 | 
167 |
168 | 169 | 170 | 171 |
172 | 173 |
174 |
175 |

Funcs

176 |
177 | 178 |
func nfa(exp: seq[Node]; transitions: var Transitions): seq[Node] {...}{.
179 |     raises: [IndexError, RegexError], tags: [].}
180 |
181 | 182 | 183 | 184 |
185 | 186 |
187 | 188 |
189 |
190 | 191 |
192 | 197 |
198 |
199 |
200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /docs/nregex/nodematch.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/nodematch 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/nodematch

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 122 | 123 |
124 |
125 |
126 | 127 |

128 |
129 |

Imports

130 |
131 | nodetype, common 132 |
133 |
134 |

Funcs

135 |
136 | 137 |
func isWord(r: Rune): bool {...}{.inline, raises: [], tags: [].}
138 |
139 | 140 | 141 | 142 |
143 | 144 |
func match(n: Node; r: Rune; nxt: Rune): bool {...}{.raises: [], tags: [].}
145 |
146 | 147 | match for Node of assertion kind. Return whether the node matches the current characters or not 148 | 149 |
150 | 151 |
func swapCase(r: Rune): Rune {...}{.raises: [], tags: [].}
152 |
153 | 154 | 155 | 156 |
157 | 158 |
func match(n: Node; r: Rune): bool {...}{.raises: [], tags: [].}
159 |
160 | 161 | match for Node of matchable kind. Return whether the node matches the current character or not 162 | 163 |
164 | 165 |
166 | 167 |
168 |
169 | 170 |
171 | 176 |
177 |
178 |
179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /docs/nregex/parser.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/parser 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/parser

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 |
    100 |
  • 101 | Imports 102 |
      103 | 104 |
    105 |
  • 106 |
  • 107 | Funcs 108 |
      109 |
    • parse
    • 111 | 112 |
    113 |
  • 114 | 115 |
116 | 117 |
118 |
119 |
120 | 121 |

122 |
123 |

Imports

124 |
125 | nodetype, common, scanner 126 |
127 |
128 |

Funcs

129 |
130 | 131 |
func parse(expression: string): seq[Node] {...}{.raises: [RegexError], tags: [].}
132 |
133 | 134 | convert a string regex expression into a Node expression 135 | 136 |
137 | 138 |
139 | 140 |
141 |
142 | 143 |
144 | 149 |
150 |
151 |
152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /docs/nregex/private/common.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/private/common 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/private/common

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 136 | 137 |
138 |
139 |
140 | 141 |

142 |
143 |

Types

144 |
145 | 146 |
RegexError = object of ValueError
147 |
148 | 149 | 150 | 151 |
152 | 153 |
154 |
155 |

Consts

156 |
157 | 158 |
invalidRune = -1'i32
159 |
160 | 161 | 162 | 163 |
164 | 165 |
lineBreakRune = 10'i32
166 |
167 | 168 | 169 | 170 |
171 | 172 |
173 |
174 |

Procs

175 |
176 | 177 |
proc toRune(s: string): Rune {...}{.raises: [], tags: [].}
178 |
179 | 180 | 181 | 182 |
183 | 184 |
proc `<=`(x, y: Rune): bool {...}{.raises: [], tags: [].}
185 |
186 | 187 | 188 | 189 |
190 | 191 |
proc cmp(x, y: Rune): int {...}{.raises: [], tags: [].}
192 |
193 | 194 | 195 | 196 |
197 | 198 |
proc `%%`(formatstr: string; a: openArray[string]): string {...}{.noSideEffect, raises: [],
199 |     tags: [].}
200 |
201 | 202 | same as "$#" % ["foo"] but returns empty string on error 203 | 204 |
205 | 206 |
proc `%%`(formatstr: string; a: string): string {...}{.raises: [], tags: [].}
207 |
208 | 209 | 210 | 211 |
212 | 213 |
214 | 215 |
216 |
217 | 218 |
219 | 224 |
225 |
226 |
227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /docs/nregex/private/dfamacro.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/private/dfamacro 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/private/dfamacro

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 |
    100 |
  • 101 | Imports 102 |
      103 | 104 |
    105 |
  • 106 |
  • 107 | Funcs 108 | 114 |
  • 115 | 116 |
117 | 118 |
119 |
120 |
121 | 122 |

123 |
124 |

Imports

125 |
126 | common, nodetype, nodematch, nfa, dfa, dfamatch 127 |
128 |
129 |

Funcs

130 |
131 | 132 |
func matchImpl(text: string; regex: static Regex; m: var RegexMatch;
133 |               flags: static MatchFlags; start = 0): bool {...}{.inline.}
134 |
135 | 136 | 137 | 138 |
139 | 140 |
141 | 142 |
143 |
144 | 145 |
146 | 151 |
152 |
153 |
154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /docs/nregex/private/exptransformation.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/private/exptransformation 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/private/exptransformation

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 126 | 127 |
128 |
129 |
130 | 131 |

132 |
133 |

Imports

134 |
135 | nodetype, common, scanner 136 |
137 |
138 |

Types

139 |
140 | 141 |
GroupsCapture = object
142 |   count*: int16
143 |   names*: OrderedTable[string, int16]
144 | 
145 |
146 | 147 | 148 | 149 |
150 | 151 |
152 |
153 |

Funcs

154 |
155 | 156 |
func transformExp(exp: seq[Node]; groups: var GroupsCapture): seq[Node] {...}{.inline,
157 |     raises: [RegexError], tags: [].}
158 |
159 | 160 | 161 | 162 |
163 | 164 |
165 | 166 |
167 |
168 | 169 |
170 | 175 |
176 |
177 |
178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /docs/nregex/private/nfa.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/private/nfa 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/private/nfa

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 132 | 133 |
134 |
135 |
136 | 137 |

138 |
139 |

Imports

140 |
141 | nodetype, common 142 |
143 |
144 |

Types

145 |
146 | 147 |
TransitionsAll = seq[seq[int16]]
148 |
149 | 150 | 151 | 152 |
153 | 154 |
ZclosureStates = seq[seq[Node]]
155 |
156 | 157 | 158 | 159 |
160 | 161 |
Transitions = object
162 |   all*: TransitionsAll
163 |   allZ*: TransitionsAll
164 |   z*: ZclosureStates
165 |   zCount*: int
166 | 
167 |
168 | 169 | 170 | 171 |
172 | 173 |
174 |
175 |

Funcs

176 |
177 | 178 |
func nfa(exp: seq[Node]; transitions: var Transitions): seq[Node] {...}{.
179 |     raises: [IndexError, RegexError], tags: [].}
180 |
181 | 182 | 183 | 184 |
185 | 186 |
187 | 188 |
189 |
190 | 191 |
192 | 197 |
198 |
199 |
200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /docs/nregex/private/nodematch.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/private/nodematch 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/private/nodematch

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 | 122 | 123 |
124 |
125 |
126 | 127 |

128 |
129 |

Imports

130 |
131 | nodetype, common 132 |
133 |
134 |

Funcs

135 |
136 | 137 |
func isWord(r: Rune): bool {...}{.inline, raises: [], tags: [].}
138 |
139 | 140 | 141 | 142 |
143 | 144 |
func match(n: Node; r: Rune; nxt: Rune): bool {...}{.raises: [], tags: [].}
145 |
146 | 147 | match for Node of assertion kind. Return whether the node matches the current characters or not 148 | 149 |
150 | 151 |
func swapCase(r: Rune): Rune {...}{.raises: [], tags: [].}
152 |
153 | 154 | 155 | 156 |
157 | 158 |
func match(n: Node; r: Rune): bool {...}{.raises: [], tags: [].}
159 |
160 | 161 | match for Node of matchable kind. Return whether the node matches the current character or not 162 | 163 |
164 | 165 |
166 | 167 |
168 |
169 | 170 |
171 | 176 |
177 |
178 |
179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /docs/nregex/private/parser.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | nregex/private/parser 21 | 22 | 23 | 24 | 25 | 69 | 70 | 71 | 72 |
73 |
74 |

nregex/private/parser

75 |
76 |
77 |
78 | 82 |     Dark Mode 83 |
84 | 88 |
89 | Search: 91 |
92 |
93 | Group by: 94 | 98 |
99 |
    100 |
  • 101 | Imports 102 |
      103 | 104 |
    105 |
  • 106 |
  • 107 | Funcs 108 |
      109 |
    • parse
    • 111 | 112 |
    113 |
  • 114 | 115 |
116 | 117 |
118 |
119 |
120 | 121 |

122 |
123 |

Imports

124 |
125 | nodetype, common, scanner 126 |
127 |
128 |

Funcs

129 |
130 | 131 |
func parse(expression: string): seq[Node] {...}{.raises: [RegexError], tags: [].}
132 |
133 | 134 | convert a string regex expression into a Node expression 135 | 136 |
137 | 138 |
139 | 140 |
141 |
142 | 143 |
144 | 149 |
150 |
151 |
152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /nregex.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.0.4" 4 | author = "Esteban Castro Borsani (@nitely)" 5 | description = "A DFA based regex engine" 6 | license = "MIT" 7 | srcDir = "src" 8 | skipDirs = @["tests", "docs"] 9 | 10 | requires "nim >= 1.0.4" 11 | requires "unicodedb >= 0.7.2" 12 | requires "unicodeplus >= 0.5.0" 13 | 14 | task test, "Test": 15 | exec "nim c -r -o:bin/nregex src/nregex.nim" 16 | exec "nim c -r tests/tests.nim" 17 | exec "nim c -r -d:forceRegexAtRuntime tests/tests.nim" 18 | #when (NimMajor, NimMinor) >= (1, 1): 19 | # exec "nim c -d:runTestAtCT --maxLoopIterationsVM:1000000000 tests/tests.nim" 20 | exec "nim js -r -o:bin/nregex.js --styleCheck:off src/nregex.nim" 21 | exec "nim js -r --styleCheck:off tests/tests.nim" 22 | exec "nim js -r --styleCheck:off -d:forceRegexAtRuntime tests/tests.nim" 23 | 24 | # Test runnable examples 25 | exec "nim doc -o:./docs/ugh/ugh.html ./src/nregex.nim" 26 | 27 | task docs, "Docs": 28 | exec "nim doc --project -o:./docs ./src/nregex.nim" 29 | -------------------------------------------------------------------------------- /src/nregex/private/common.nim: -------------------------------------------------------------------------------- 1 | import std/unicode 2 | import std/strutils 3 | 4 | type 5 | RegexError* = object of ValueError 6 | ## raised when the pattern 7 | ## is not a valid regex 8 | 9 | const 10 | # This is used as start 11 | # and end of string. It should 12 | # be invalid code, but while it 13 | # works it simplifies things a bit. 
14 | # An alternative would be opt[Rune] 15 | # or just using int32 and convert 16 | # Rune to int when needed 17 | invalidRune* = Rune(-1) 18 | # `\n` is platform specific in 19 | # Nim and not the actual `\n` 20 | lineBreakRune* = Rune(10) 21 | 22 | proc toRune*(s: string): Rune = 23 | result = s.runeAt(0) 24 | 25 | proc `<=`*(x, y: Rune): bool = 26 | x.int <= y.int 27 | 28 | proc cmp*(x, y: Rune): int = 29 | x.int - y.int 30 | 31 | proc `%%`*( 32 | formatstr: string, 33 | a: openArray[string] 34 | ): string {.noSideEffect, raises: [].} = 35 | ## same as ``"$#" % ["foo"]`` but 36 | ## returns empty string on error 37 | try: 38 | formatstr % a 39 | except ValueError: 40 | "" 41 | 42 | proc `%%`*(formatstr: string, a: string): string = 43 | formatstr %% [a] 44 | -------------------------------------------------------------------------------- /src/nregex/private/dfa.nim: -------------------------------------------------------------------------------- 1 | import std/unicode 2 | import std/sets 3 | import std/tables 4 | import std/deques 5 | 6 | import nodematch 7 | import nodetype 8 | import common 9 | 10 | type 11 | AlphabetSym* = int32 12 | Closure* = HashSet[int16] 13 | DfaRow* = Table[AlphabetSym, int32] 14 | DfaClosure* = Table[AlphabetSym, int32] 15 | Dfa* = object 16 | table*: seq[DfaRow] 17 | cs*: seq[Closure] 18 | closures*: seq[DfaClosure] 19 | 20 | const 21 | symEoe* = -1'i32 22 | symWord* = -3'i32 23 | symDigit* = -4'i32 24 | symAny* = -6'i32 25 | symAnyNl* = -7'i32 26 | 27 | func createAlphabet(nfa: seq[Node]): seq[AlphabetSym] = 28 | var inAlphabet: HashSet[AlphabetSym] 29 | # speedup ascii matching 30 | for c in 0 .. 
128: 31 | result.add(c.int32) 32 | inAlphabet.incl(c.int32) 33 | # special symbols 34 | result.add(symEoe) 35 | result.add(symWord) 36 | result.add(symDigit) 37 | result.add(symAny) 38 | result.add(symAnyNl) 39 | # expression chars 40 | for n in nfa: 41 | case n.kind 42 | of reChar: 43 | if n.cp.int32 notin inAlphabet: 44 | result.add(n.cp.int32) 45 | inAlphabet.incl(n.cp.int32) 46 | of reCharCI: 47 | if n.cp.int32 notin inAlphabet: 48 | result.add(n.cp.int32) 49 | inAlphabet.incl(n.cp.int32) 50 | let cp2 = n.cp.swapCase() 51 | if cp2.int32 notin inAlphabet: 52 | result.add(cp2.int32) 53 | inAlphabet.incl(cp2.int32) 54 | of reInSet: 55 | for cp in n.cps: 56 | if cp.int32 notin inAlphabet: 57 | result.add(cp.int32) 58 | inAlphabet.incl(cp.int32) 59 | for rg in n.ranges: 60 | for cp in rg: 61 | if cp.int32 notin inAlphabet: 62 | result.add(cp.int32) 63 | inAlphabet.incl(cp.int32) 64 | else: 65 | discard 66 | assert result.toHashSet.len == result.len 67 | 68 | func delta( 69 | nfa: seq[Node], 70 | states: Closure, 71 | sym: AlphabetSym 72 | ): Closure = 73 | result = initHashSet[int16](2) 74 | if sym > -1: 75 | for s in states: 76 | if match(nfa[s], sym.Rune): 77 | result.incl(s) 78 | else: 79 | # XXX this will add every sym for reAny, but we should only add symAny 80 | let kinds = case sym 81 | of symEoe: {reEoe} 82 | of symWord: {reAnyNl, reAny, reWord} 83 | of symDigit: {reAnyNl, reAny, reWord, reDigit} 84 | of symAny: {reAnyNl, reAny} 85 | of symAnyNl: {reAnyNl} 86 | else: {} 87 | for s in states: 88 | if nfa[s].kind in kinds: 89 | result.incl(s) 90 | if nfa[s].kind == reInSet: 91 | for sh in nfa[s].shorthands: 92 | if sh.kind in kinds: 93 | result.incl(s) 94 | break 95 | 96 | func dfa*( 97 | nfa: seq[Node], 98 | alphabet: var seq[AlphabetSym] 99 | ): Dfa = 100 | ## Powerset construction 101 | template closure(result, states) = 102 | for s in states: 103 | for sn in nfa[s].next: 104 | result.incl(sn) 105 | alphabet = createAlphabet(nfa) 106 | let n0 = 0 107 | var 
q0: Closure 108 | closure(q0, [n0]) 109 | var qw = initDeque[Closure]() 110 | qw.addFirst(q0) 111 | var qu = initTable[Closure, int32]() 112 | var quPos = 0'i32 113 | qu[q0] = quPos 114 | inc quPos 115 | result.table.add(initTable[AlphabetSym, int32](2)) 116 | result.closures.add(initTable[AlphabetSym, int32](2)) 117 | var t = initHashSet[int16]() 118 | var csRev = initTable[Closure, int32]() 119 | while qw.len > 0: 120 | let qa = qw.popLast() 121 | for sym in alphabet: 122 | let s = delta(nfa, qa, sym) 123 | if s.len == 0: 124 | continue 125 | t.clear() 126 | closure(t, s) 127 | if t notin qu: 128 | qu[t] = quPos 129 | inc quPos 130 | qw.addFirst(t) 131 | result.table.add(initTable[AlphabetSym, int32](2)) 132 | result.closures.add(initTable[AlphabetSym, int32](2)) 133 | result.table[qu[qa]][sym] = qu[t] 134 | if s in csRev: 135 | result.closures[qu[qa]][sym] = csRev[s] 136 | else: 137 | result.closures[qu[qa]][sym] = result.cs.len.int32 138 | csRev[s] = result.cs.len.int32 139 | result.cs.add(s) 140 | assert result.table.len == result.closures.len 141 | assert result.cs.toHashSet.len == result.cs.len 142 | 143 | func minDfaTable( 144 | dfa: Dfa, 145 | p: seq[HashSet[int32]] 146 | ): Dfa {.inline.} = 147 | ## Construct DFA from Hopcroft partitions. 148 | ## This is O(N*A) where N is the number of 149 | ## DFA states and A the size of the alphabet, 150 | ## albeit it can be faster since not every state 151 | ## has a transition on every alphabet symbol 152 | # map DFA states to min-DFA states 153 | var statesMap = newSeq[int32](dfa.table.len) 154 | for i in 0 .. 
statesMap.len-1: 155 | statesMap[i] = -1 156 | for ri, r in p.pairs: 157 | for q in r: 158 | assert statesMap[q] == -1 159 | statesMap[q] = ri.int32 160 | # construct min-DFA table 161 | result.table.setLen(p.len) 162 | result.closures.setLen(p.len) 163 | var csRev = initTable[Closure, int32]() 164 | var closures = initTable[AlphabetSym, Closure]() 165 | for ri, r in p.pairs: 166 | assert r.len > 0 167 | result.table[ri] = initTable[AlphabetSym, int32](2) 168 | result.closures[ri] = initTable[AlphabetSym, int32](2) 169 | closures.clear() 170 | for q in r: 171 | for c, q2 in dfa.table[q].pairs: 172 | assert c notin result.table[ri] or 173 | result.table[ri][c] == statesMap[q2] 174 | result.table[ri][c] = statesMap[q2] 175 | if c notin closures: 176 | closures[c] = initHashSet[int16](2) 177 | closures[c].incl(dfa.cs[dfa.closures[q][c]]) 178 | for c, closure in closures.pairs: 179 | if closure in csRev: 180 | result.closures[ri][c] = csRev[closure] 181 | else: 182 | result.closures[ri][c] = result.cs.len.int32 183 | csRev[closure] = result.cs.len.int32 184 | result.cs.add(closure) 185 | assert result.table.len == result.closures.len 186 | assert result.cs.toHashSet.len == result.cs.len 187 | 188 | func reverse(dfa: Dfa): Dfa {.inline.} = 189 | ## return reversed dfa table 190 | result.table.setLen(dfa.table.len) 191 | for i in 0 .. dfa.table.len-1: 192 | result.table[i] = initTable[AlphabetSym, int32](2) 193 | for i, t in dfa.table.pairs: 194 | for c, q in t.pairs: 195 | # add dup key, for multiple 196 | # in-transitions of same symbol 197 | result.table[q].add(c, i.int32) 198 | 199 | func xF(dfa: Dfa): HashSet[int32] {.inline.} = 200 | ## return all final states 201 | result = initHashSet[int32](2) 202 | for i, t in dfa.table.pairs: 203 | if symEoe in t: 204 | result.incl(i.int32) 205 | doAssert result.len > 0 206 | 207 | func xQ(dfa: Dfa): HashSet[int32] {.inline.} = 208 | ## return all states 209 | result = initHashSet[int32](2) 210 | for i in 0'i32 .. 
(dfa.table.len-1).int32: 211 | result.incl(i) 212 | 213 | func delta( 214 | dfa: Dfa, 215 | s: HashSet[int32], 216 | c: AlphabetSym 217 | ): HashSet[int32] {.inline.} = 218 | ## return set of states that can reach `s` on `c`, 219 | ## expects the reversed dfa 220 | result = initHashSet[int32](2) 221 | for q in s: 222 | for q2 in dfa.table[q].allValues(c): 223 | result.incl(q2) 224 | 225 | func canPartition(r, i: HashSet[int32]): bool {.inline.} = 226 | ## return true if: 227 | ## * intersection of R and I is not empty, 228 | ## * and the complement of R and I is not empty 229 | var intr = 0 230 | for q in r: 231 | intr += int(q in i) 232 | result = 0 < intr and intr < r.len 233 | 234 | func partition( 235 | r, i: HashSet[int32] 236 | ): (HashSet[int32], HashSet[int32]) {.inline.} = 237 | ## partition r into r1 and r2, such as r1 is the intersection 238 | ## of r and i, and r2 is r - such intersection 239 | result = ( 240 | initHashSet[int32](2), 241 | initHashSet[int32](2)) 242 | for x in r: 243 | if x in i: 244 | result[0].incl(x) 245 | else: 246 | result[1].incl(x) 247 | 248 | # without minimize 249 | # 43745 lines compiled; 8.679 sec total; 256.574MiB peakmem 250 | # unoptimized minimize 251 | # 43746 lines compiled; 35.277 sec total; 309.113MiB peakmem; 252 | # removing p[p.find(r)] and (r in w) 253 | # 43746 lines compiled; 32.970 sec total; 319.766MiB peakmem; 254 | # removing two (r - i) intersections of hashSets 255 | # 43756 lines compiled; 16.145 sec total; 308.73MiB peakmem; 256 | # dfa.reverse and init all hashsets to 2, except q-f is 64 257 | # 43779 lines compiled; 12.209 sec total; 309.234MiB peakmem 258 | # optimized dfa table construction 259 | # 43825 lines compiled; 11.985 sec total; 258.664MiB peakmem; 260 | func minimize*( 261 | dfa: Dfa, 262 | alphabet: seq[AlphabetSym] 263 | ): Dfa = 264 | ## Hopcroft 265 | template r: untyped {.dirty.} = p[ri] 266 | let dfaRev = dfa.reverse() 267 | let f = dfa.xF() 268 | let q = dfa.xQ() 269 | var w: 
seq[HashSet[int32]] 270 | w.add(f) 271 | w.add(q - f) 272 | var p: seq[HashSet[int32]] 273 | p.add(f) 274 | p.add(q - f) 275 | while w.len > 0: 276 | let s = w.pop 277 | for c in alphabet: # XXX take alphabet from `for q in s: dfa[q]` 278 | let i = delta(dfaRev, s, c) 279 | if i.len == 0: 280 | continue 281 | for ri in 0 .. p.len-1: 282 | if not canPartition(r, i): 283 | continue 284 | let wi = w.find r 285 | let (r1, r2) = partition(r, i) 286 | r = r1 287 | p.add r2 288 | if wi > -1: 289 | w[wi] = r1 290 | w.add r2 291 | elif r1.len <= r2.len: 292 | w.add r1 293 | else: 294 | w.add r2 295 | assert p.len <= dfa.table.len, "not a min DFA, wtf?" 296 | # make the initial state the first state 297 | var ri0 = -1 298 | for ri, r in p.pairs: 299 | if 0 in r: 300 | ri0 = ri 301 | break 302 | assert ri0 > -1 303 | swap p[0], p[ri0] 304 | result = minDfaTable(dfa, p) 305 | -------------------------------------------------------------------------------- /src/nregex/private/dfamacro.nim: -------------------------------------------------------------------------------- 1 | import std/sets 2 | import std/tables 3 | import std/unicode 4 | import std/macros 5 | 6 | import pkg/unicodeplus except isUpper, isLower 7 | 8 | import common 9 | import nodetype 10 | import nodematch 11 | import nfa 12 | import dfa 13 | import dfamatch 14 | 15 | macro genClosureTable( 16 | qt: int32, 17 | nt: int16, 18 | regex: static Regex 19 | ): untyped = 20 | #[ 21 | case qt: # curr closure 22 | of 1.int32: 23 | case nt: # next state 24 | of 2.int32: 25 | true 26 | else: 27 | false 28 | else: false 29 | ]# 30 | doAssert regex.dfa.cs.len > 0 31 | result = newStmtList() 32 | var caseStmtQ: seq[NimNode] 33 | caseStmtQ.add(qt) 34 | for i, t2 in regex.dfa.cs.pairs: 35 | #if t2.len == 0: # ? 
36 | # continue 37 | var caseStmtNt: seq[NimNode] 38 | caseStmtNt.add(nt) 39 | for s in t2: 40 | caseStmtNt.add(newTree(nnkOfBranch, 41 | newLit s, 42 | quote do: 43 | return true)) 44 | caseStmtNt.add(newTree(nnkElse, 45 | quote do: 46 | return false)) 47 | caseStmtQ.add(newTree(nnkOfBranch, 48 | newLit i.int32, 49 | newStmtList( 50 | newTree(nnkCaseStmt, caseStmtNt)))) 51 | caseStmtQ.add(newTree(nnkElse, 52 | quote do: 53 | return false)) 54 | result.add(newTree(nnkCaseStmt, caseStmtQ)) 55 | when defined(reDumpMacro): 56 | echo "==== genClosureTable ====" 57 | echo repr(result) 58 | 59 | func inClosure( 60 | qt: int32, 61 | nt: int16, 62 | regex: static Regex 63 | ): bool = 64 | genClosureTable(qt, nt, regex) 65 | 66 | macro genSubmatch( 67 | n, c, qt, cPrev, capt, captx, charIndex, matched, smB, capts: typed, 68 | regex: static Regex 69 | ): untyped = 70 | result = newStmtList() 71 | var caseStmtN: seq[NimNode] 72 | caseStmtN.add(n) 73 | for i, t in regex.transitions.all.pairs: 74 | if t.len == 0: # end state 75 | continue 76 | var branchBodyN: seq[NimNode] 77 | for nti, nt in t.pairs: 78 | let ntLit = newLit nt 79 | var inClosureBranch: seq[NimNode] 80 | if regex.transitions.allZ[i][nti] == -1'i16: 81 | inClosureBranch.add(quote do: 82 | add(`smB`, (`ntLit`, `capt`))) 83 | else: 84 | inClosureBranch.add(quote do: 85 | `matched` = true 86 | `captx` = `capt`) 87 | for z in regex.transitions.z[regex.transitions.allZ[i][nti]]: 88 | case z.kind 89 | of groupKind: 90 | let zIdx = newLit z.idx 91 | inClosureBranch.add(quote do: 92 | add(`capts`, CaptNode(parent: `captx`, bound: `charIndex`, idx: `zIdx`)) 93 | `captx` = (len(`capts`) - 1).int32) 94 | of assertionKind: 95 | # https://github.com/nim-lang/Nim/issues/13266 96 | #let zLit = newLit z 97 | inClosureBranch.add(quote do: 98 | `matched` = `matched` and match(`z`, Rune(`cPrev`), Rune(`c`))) 99 | of matchTransitionKind: 100 | #let zLit = newLit z 101 | inClosureBranch.add(quote do: 102 | `matched` = `matched` and 
match(`z`, Rune(`c`))) 103 | else: 104 | doAssert false 105 | inClosureBranch.add(quote do: 106 | if `matched`: 107 | add(`smB`, (`ntLit`, `captx`))) 108 | doAssert inClosureBranch.len > 0 109 | let inClosureBranchStmt = newStmtList inClosureBranch 110 | branchBodyN.add(quote do: 111 | if inClosure(`qt`, `ntLit`, regex) and not hasState(`smB`, `ntLit`): 112 | `inClosureBranchStmt`) 113 | doAssert branchBodyN.len > 0 114 | caseStmtN.add(newTree(nnkOfBranch, 115 | newLit i.int16, 116 | newStmtList( 117 | branchBodyN))) 118 | caseStmtN.add(newTree(nnkElse, 119 | newStmtList( 120 | newTree(nnkDiscardStmt, newEmptyNode())))) 121 | result.add(newTree(nnkCaseStmt, caseStmtN)) 122 | when defined(reDumpMacro): 123 | echo "==== genSubmatch ====" 124 | echo repr(result) 125 | 126 | template submatch( 127 | smA, smB, capts, regex, i, qt, cprev, c: untyped 128 | ): untyped = 129 | var captx: int32 130 | var matched = true 131 | for n, capt in smA.items: 132 | genSubmatch( 133 | n, c, qt, cPrev, capt, captx, i, matched, smB, capts, regex) 134 | swap(smA, smB) 135 | smB.clear() 136 | 137 | macro genEoeTable( 138 | matched: bool, 139 | q, qt: int32, 140 | regex: static Regex 141 | ): untyped = 142 | ## Generate Eoe table 143 | result = newStmtList() 144 | var caseStmtQ: seq[NimNode] 145 | caseStmtQ.add(q) 146 | for i, t in regex.dfa.table.pairs: 147 | if symEoe in t: 148 | let trueLit = newLit true 149 | let qtLit = newLit regex.dfa.closures[i][symEoe] 150 | caseStmtQ.add(newTree(nnkOfBranch, 151 | newLit i.int32, 152 | quote do: 153 | `matched` = `trueLit` 154 | `qt` = `qtLit`)) 155 | doAssert caseStmtQ.len > 1 156 | let falseLit = newLit false 157 | let qtLit = newLit -1'i32 158 | caseStmtQ.add(newTree(nnkElse, 159 | quote do: 160 | `matched` = `falseLit` 161 | `qt` = `qtLit`)) 162 | result.add( 163 | newTree(nnkCaseStmt, caseStmtQ)) 164 | when defined(reDumpMacro): 165 | echo "==== genEoeTable ====" 166 | echo repr(result) 167 | 168 | macro genSymMatchTable( 169 | q, qt, c: 
int32, 170 | regex: static Regex 171 | ): untyped = 172 | ## Generate symMatch transition table 173 | result = newStmtList() 174 | var caseStmtQ: seq[NimNode] 175 | caseStmtQ.add(q) 176 | var qBranches: seq[NimNode] 177 | for i, t in regex.dfa.table.pairs: 178 | var symIfs: seq[NimNode] 179 | for sym in syms: 180 | if sym notin regex.dfa.table[i]: 181 | continue 182 | case sym: 183 | of symDigit: 184 | let tLit = newLit regex.dfa.table[i][symDigit] 185 | let qtLit = newLit regex.dfa.closures[i][symDigit] 186 | symIfs.add(newTree(nnkElifBranch, 187 | quote do: 188 | isDecimal(Rune(`c`)), 189 | quote do: 190 | `q` = `tLit` 191 | `qt` = `qtLit`)) 192 | of symWord: 193 | let tLit = newLit regex.dfa.table[i][symWord] 194 | let qtLit = newLit regex.dfa.closures[i][symWord] 195 | symIfs.add(newTree(nnkElifBranch, 196 | quote do: 197 | isWord(Rune(`c`)), 198 | quote do: 199 | `q` = `tLit` 200 | `qt` = `qtLit`)) 201 | of symAny: 202 | let lineBreakLit = newLit lineBreakRune.int32 203 | let tLit = newLit regex.dfa.table[i][symAny] 204 | let qtLit = newLit regex.dfa.closures[i][symAny] 205 | symIfs.add(newTree(nnkElifBranch, 206 | quote do: 207 | `c` != `lineBreakLit`, 208 | quote do: 209 | `q` = `tLit` 210 | `qt` = `qtLit`)) 211 | of symAnyNl: 212 | let tLit = newLit regex.dfa.table[i][symAnyNl] 213 | let qtLit = newLit regex.dfa.closures[i][symAnyNl] 214 | symIfs.add(newTree(nnkElifBranch, 215 | quote do: 216 | true, 217 | quote do: 218 | `q` = `tLit` 219 | `qt` = `qtLit`)) 220 | else: 221 | doAssert false 222 | discard 223 | if symIfs.len > 0: 224 | let tLit = newLit -1'i32 225 | symIfs.add(newTree(nnkElse, 226 | quote do: 227 | `q` = `tLit` 228 | `qt` = `tLit`)) 229 | qBranches.add(newTree(nnkOfBranch, 230 | newLit i.int32, 231 | newStmtList( 232 | newTree(nnkIfStmt, symIfs)))) 233 | let tLit = newLit -1'i32 234 | if qBranches.len > 0: 235 | caseStmtQ.add(qBranches) 236 | caseStmtQ.add(newTree(nnkElse, 237 | quote do: 238 | `q` = `tLit` 239 | `qt` = `tLit`)) 240 | 
result.add(newTree(nnkCaseStmt, caseStmtQ)) 241 | else: 242 | result.add(quote do: 243 | `q` = `tLit` 244 | `qt` = `tLit`) 245 | when defined(reDumpMacro): 246 | echo "==== genSymMatchTable ====" 247 | echo repr(result) 248 | 249 | macro genTable( 250 | q, qt, c: int32, 251 | regex: static Regex 252 | ): untyped = 253 | ## Generate transition table 254 | var caseStmtQ: seq[NimNode] 255 | caseStmtQ.add(q) 256 | for i, t in regex.dfa.table.pairs: 257 | var caseStmtC: seq[NimNode] 258 | caseStmtC.add(c) 259 | for c2, t2 in t: 260 | let t2Lit = newLit t2.int32 261 | let qtLit = newLit regex.dfa.closures[i][c2] 262 | caseStmtC.add(newTree(nnkOfBranch, 263 | newLit c2, 264 | quote do: 265 | `q` = `t2Lit` 266 | `qt` = `qtLit`)) 267 | let t2Lit = newLit -1'i32 268 | caseStmtC.add(newTree(nnkElse, 269 | quote do: 270 | `q` = `t2Lit` 271 | `qt` = `t2Lit`)) 272 | caseStmtQ.add(newTree(nnkOfBranch, 273 | newLit i.int32, 274 | newStmtList( 275 | newTree(nnkCaseStmt, caseStmtC)))) 276 | caseStmtQ.add(newTree(nnkElse, 277 | newStmtList( 278 | newTree(nnkDiscardStmt, newEmptyNode())))) 279 | result = newStmtList( 280 | newTree(nnkCaseStmt, caseStmtQ)) 281 | when defined(reDumpMacro): 282 | echo "==== genTable ====" 283 | echo repr(result) 284 | 285 | func matchImpl*( 286 | text: string, 287 | regex: static Regex, 288 | m: var RegexMatch, 289 | flags: static MatchFlags, 290 | start = 0 291 | ): bool {.inline.} = 292 | m.clear() 293 | result = false 294 | var 295 | cPrev = -1'i32 296 | c: Rune 297 | q = 0'i32 298 | qOld {.used.} = q 299 | qt = q 300 | i = start 301 | iPrev = start 302 | # workaround for VM registry limitation 303 | const 304 | zCount = regex.transitions.zCount 305 | zGroupsCount = regex.groupsCount * 2 306 | noCaptures = mfNoCaptures in flags 307 | # workaround for https://github.com/nim-lang/Nim/issues/13252 308 | const 309 | reFlags = regex.flags 310 | canSkipTransitionsZ = noCaptures and 311 | zGroupsCount == zCount 312 | hasTransitionsZ = zCount > 0 and 313 | 
not canSkipTransitionsZ 314 | groupCount {.used.} = regex.groupsCount 315 | namedGroups {.used.} = regex.namedGroups 316 | when hasTransitionsZ: 317 | var 318 | smA = newSubmatches(regex.transitions.all.len) 319 | smB = newSubmatches(regex.transitions.all.len) 320 | capts: Capts 321 | smA.add((0'i16, -1'i32)) 322 | while i < len(text): 323 | when reAscii notin reFlags: 324 | fastRuneAt(text, i, c, true) 325 | qOld = q 326 | else: 327 | c = Rune(text[i]) 328 | inc i 329 | genTable(q, qt, c.int32, regex) 330 | if (q == -1'i32).unlikely: 331 | when reAscii notin reFlags: 332 | q = qOld 333 | genSymMatchTable(q, qt, c.int32, regex) 334 | if (q == -1'i32).unlikely: 335 | return 336 | when hasTransitionsZ: 337 | submatch(smA, smB, capts, regex, iPrev, qt, cPrev, c.int32) 338 | iPrev = i 339 | cPrev = c.int32 340 | genEoeTable(result, q, qt, regex) 341 | when hasTransitionsZ: 342 | if not result: 343 | return 344 | # XXX lighter submatchEoe 345 | submatch(smA, smB, capts, regex, iPrev, qt, cPrev, symEoe) 346 | if smA.len == 0: 347 | result = false 348 | return 349 | constructSubmatches(m.captures, capts, smA[0][1], groupCount) 350 | when namedGroups.len > 0: 351 | m.namedGroups = namedGroups 352 | m.boundaries = start .. 
iPrev-1 353 | -------------------------------------------------------------------------------- /src/nregex/private/dfamatch.nim: -------------------------------------------------------------------------------- 1 | ## DFA matcher for non-static regexes 2 | 3 | import std/unicode 4 | import std/sets 5 | import std/tables 6 | import std/deques 7 | import std/algorithm 8 | 9 | import pkg/unicodeplus except isUpper, isLower 10 | 11 | import nodematch 12 | import nodetype 13 | import common 14 | import nfa 15 | import dfa 16 | 17 | type 18 | CaptNode* = object 19 | parent*: int32 20 | bound*: int 21 | idx*: int16 22 | Capts* = seq[CaptNode] 23 | Captures* = seq[seq[Slice[int]]] 24 | 25 | func constructSubmatches*( 26 | captures: var Captures, 27 | capts: Capts, 28 | capt, size: int 29 | ) {.inline.} = 30 | template currGroup: untyped = captures[capts[capt].idx] 31 | captures.setLen(size) 32 | for i in 0 .. captures.len-1: 33 | captures[i].setLen(0) 34 | if capts.len == 0: 35 | return 36 | var capt = capt 37 | while capt != -1: 38 | if currGroup.len == 0: 39 | currGroup.add(-2 .. -2) 40 | if currGroup[^1].a != -2: 41 | currGroup.add(-2 .. 
-2) 42 | if currGroup[^1].b == -2: 43 | currGroup[^1].b = capts[capt].bound-1 44 | else: 45 | currGroup[^1].a = capts[capt].bound 46 | capt = capts[capt].parent 47 | for c in captures.mitems: 48 | c.reverse() 49 | 50 | type 51 | NodeIdx = int16 52 | CaptIdx = int32 53 | Submatches* = ref object 54 | ## Parallel states would be a better name 55 | sx: seq[(NodeIdx, CaptIdx)] 56 | # use custom len because setLen(0) is slower, 57 | # and {.noInit.} makes no difference 58 | si: int16 59 | ss: seq[int16] 60 | 61 | func newSubmatches*(size: int): Submatches {.inline.} = 62 | result = new Submatches 63 | result.sx = newSeq[(NodeIdx, CaptIdx)](8) 64 | result.ss = newSeq[int16](size) 65 | result.si = 0 66 | 67 | func `[]`*(sm: Submatches, i: int): (NodeIdx, CaptIdx) {.inline.} = 68 | assert i < sm.si 69 | sm.sx[i] 70 | 71 | func hasState*(sm: Submatches, n: int16): bool {.inline.} = 72 | sm.ss[n] < sm.si and sm.sx[sm.ss[n]][0] == n 73 | 74 | func add*(sm: var Submatches, item: (NodeIdx, CaptIdx)) {.inline.} = 75 | assert not sm.hasState(item[0]) 76 | assert sm.si <= sm.sx.len 77 | if (sm.si == sm.sx.len).unlikely: 78 | sm.sx.setLen(sm.sx.len * 2) 79 | sm.sx[sm.si] = item 80 | sm.ss[item[0]] = sm.si 81 | sm.si += 1'i16 82 | 83 | func len*(sm: Submatches): int {.inline.} = 84 | sm.si 85 | 86 | func clear*(sm: var Submatches) {.inline.} = 87 | sm.si = 0 88 | 89 | iterator items*(sm: Submatches): (NodeIdx, CaptIdx) {.inline.} = 90 | for i in 0 .. 
sm.len-1: 91 | yield sm.sx[i] 92 | 93 | func submatch( 94 | smA, smB: var Submatches, 95 | capts: var Capts, 96 | transitions: Transitions, 97 | states: Closure, 98 | i: int, 99 | cprev, c: int32 100 | ) {.inline.} = 101 | smB.clear() 102 | var captx: int32 103 | var matched = true 104 | for n, capt in smA.items: 105 | for nti, nt in transitions.all[n].pairs: 106 | if smB.hasState(nt): 107 | continue 108 | if nt notin states: 109 | continue 110 | if transitions.allZ[n][nti] == -1'i16: 111 | smB.add((nt, capt)) 112 | continue 113 | matched = true 114 | captx = capt 115 | for z in transitions.z[transitions.allZ[n][nti]]: 116 | if not matched: 117 | break 118 | case z.kind 119 | of groupKind: 120 | capts.add(CaptNode( 121 | parent: captx, 122 | bound: i, 123 | idx: z.idx)) 124 | captx = (capts.len-1'i32).int32 125 | of assertionKind: 126 | matched = match(z, cprev.Rune, c.Rune) 127 | of matchTransitionKind: 128 | matched = match(z, c.Rune) 129 | else: 130 | assert false 131 | discard 132 | if matched: 133 | smB.add((nt, captx)) 134 | swap(smA, smB) 135 | 136 | type 137 | RegexFlag* = enum 138 | reAscii 139 | Regex* = object 140 | ## a compiled regular expression 141 | dfa*: Dfa 142 | transitions*: Transitions 143 | groupsCount*: int16 144 | namedGroups*: OrderedTable[string, int16] 145 | flags*: set[RegexFlag] 146 | MatchFlag* = enum 147 | mfShortestMatch 148 | mfLongestMatch 149 | mfNoCaptures 150 | MatchFlags* = set[MatchFlag] 151 | RegexMatch* = object 152 | ## result from matching operations 153 | captures*: Captures 154 | namedGroups*: OrderedTable[string, int16] 155 | boundaries*: Slice[int] 156 | 157 | func clear*(m: var RegexMatch) {.inline.} = 158 | if m.captures.len > 0: 159 | m.captures.setLen(0) 160 | if m.namedGroups.len > 0: 161 | m.namedGroups.clear() 162 | m.boundaries = 0 .. 
-1 163 | 164 | # Order matters, subsets first 165 | const syms* = [ 166 | symDigit, 167 | symWord, 168 | symAny, 169 | symAnyNl 170 | ] 171 | 172 | # Slow match 173 | func symMatch( 174 | q: var int32, 175 | c: Rune, 176 | cSym: var int32, 177 | regex: Regex 178 | ) {.inline.} = 179 | var matched = false 180 | for sym in syms: 181 | if sym notin regex.dfa.table[q]: 182 | continue 183 | matched = case sym: 184 | of symDigit: c.isDecimal() 185 | of symWord: c.isWord() 186 | of symAny: c != lineBreakRune 187 | of symAnyNl: true 188 | else: false 189 | if matched: 190 | q = regex.dfa.table[q][sym] 191 | cSym = sym 192 | break 193 | if not matched: 194 | q = -1'i32 195 | 196 | # Can't return early because of boundaries 197 | template longestMatchEnter(): untyped {.dirty.} = 198 | if symEoe in regex.dfa.table[q]: 199 | matchedLong = true 200 | iPrevLong = iPrev 201 | if hasTransitionsZ: 202 | submatch( 203 | smA, smB, capts, regex.transitions, 204 | regex.dfa.cs[regex.dfa.closures[q][symEoe]], iPrev, cPrev, c.int32) 205 | if smA.len > 0: 206 | captLong = smA[0][1] 207 | swap(smA, smB) 208 | 209 | template longestMatchExit(): untyped {.dirty.} = 210 | result = matchedLong 211 | if hasTransitionsZ: 212 | constructSubmatches(m.captures, capts, captLong, regex.groupsCount) 213 | if regex.namedGroups.len > 0: 214 | m.namedGroups = regex.namedGroups 215 | m.boundaries = start .. 
iPrevLong-1 216 | return 217 | 218 | template shortestMatch(): untyped {.dirty.} = 219 | if symEoe in regex.dfa.table[q]: 220 | if hasTransitionsZ: 221 | submatch( 222 | smA, smB, capts, regex.transitions, 223 | regex.dfa.cs[regex.dfa.closures[q][symEoe]], iPrev, cPrev, c.int32) 224 | if smA.len > 0: 225 | result = true 226 | return 227 | swap(smA, smB) 228 | else: 229 | result = true 230 | return 231 | 232 | func matchImpl*( 233 | text: string, 234 | regex: Regex, 235 | m: var RegexMatch, 236 | flags: static MatchFlags, 237 | start = 0 238 | ): bool {.inline.} = 239 | #echo dfa 240 | m.clear() 241 | result = false 242 | let 243 | asciiMode = reAscii in regex.flags 244 | canSkipTransitionsZ = mfNoCaptures in flags and 245 | regex.groupsCount * 2 == regex.transitions.zCount 246 | hasTransitionsZ = regex.transitions.zCount > 0 and 247 | not canSkipTransitionsZ 248 | var 249 | smA: Submatches 250 | smB: Submatches 251 | capts: Capts 252 | c: Rune 253 | cPrev = -1'i32 254 | cSym: int32 255 | q = 0'i32 256 | qnext = 0'i32 257 | i = start 258 | iPrev = start 259 | # Long match 260 | matchedLong {.used.} = false 261 | captLong {.used.} = -1 262 | iPrevLong {.used.} = start 263 | if hasTransitionsZ: 264 | smA = newSubmatches(regex.transitions.all.len) 265 | smB = newSubmatches(regex.transitions.all.len) 266 | smA.add((0'i16, -1'i32)) 267 | #echo regex.dfa 268 | while i < len(text): 269 | if not asciiMode: 270 | fastRuneAt(text, i, c, true) 271 | else: 272 | c = Rune(text[i]) 273 | inc i 274 | when mfShortestMatch in flags: 275 | shortestMatch() 276 | when mfLongestMatch in flags: 277 | longestMatchEnter() 278 | cSym = c.int32 279 | if (c.int32 in regex.dfa.table[q]).likely: 280 | qnext = regex.dfa.table[q][c.int32] 281 | else: 282 | if not asciiMode: 283 | symMatch(qnext, c, cSym, regex) 284 | if qnext == -1 or asciiMode: 285 | when mfLongestMatch in flags: 286 | longestMatchExit() 287 | else: 288 | return 289 | if hasTransitionsZ: 290 | submatch( 291 | smA, smB, capts, 
regex.transitions, 292 | regex.dfa.cs[regex.dfa.closures[q][cSym]], iPrev, cPrev, c.int32) 293 | iPrev = i 294 | cPrev = c.int32 295 | q = qnext 296 | #echo q 297 | result = symEoe in regex.dfa.table[q] 298 | if not result: 299 | when mfLongestMatch in flags: 300 | longestMatchExit() 301 | return 302 | if hasTransitionsZ: 303 | submatch( 304 | smA, smB, capts, regex.transitions, 305 | regex.dfa.cs[regex.dfa.closures[q][symEoe]], iPrev, cPrev, -1'i32) 306 | if smA.len == 0: # XXX is this possible? 307 | when mfLongestMatch in flags: 308 | longestMatchExit() 309 | result = false 310 | return 311 | constructSubmatches(m.captures, capts, smA[0][1], regex.groupsCount) 312 | if regex.namedGroups.len > 0: 313 | m.namedGroups = regex.namedGroups 314 | m.boundaries = start .. iPrev-1 315 | -------------------------------------------------------------------------------- /src/nregex/private/exptransformation.nim: -------------------------------------------------------------------------------- 1 | import std/unicode 2 | import std/sets 3 | import std/tables 4 | import std/algorithm 5 | 6 | import nodetype 7 | import common 8 | import scanner 9 | 10 | # todo: can not use unicodeplus due to 11 | # https://github.com/nim-lang/Nim/issues/7059 12 | func swapCase(r: Rune): Rune = 13 | # Note a character can be 14 | # non-lower and non-upper 15 | if r.isUpper(): 16 | result = r.toLower() 17 | elif r.isLower(): 18 | result = r.toUpper() 19 | else: 20 | result = r 21 | 22 | func check(cond: bool, msg: string) = 23 | if not cond: 24 | raise newException(RegexError, msg) 25 | 26 | func greediness(expression: seq[Node]): seq[Node] = 27 | ## apply greediness to an expression 28 | result = newSeqOfCap[Node](expression.len) 29 | var sc = expression.scan() 30 | for n in sc.mitems(): 31 | if (n.kind in repetitionKind or 32 | n.kind == reZeroOrOne) and 33 | sc.peek.kind == reZeroOrOne: 34 | n.isGreedy = true 35 | discard sc.next 36 | result.add(n) 37 | 38 | type 39 | GroupsCapture* = object 40 
| count*: int16 41 | names*: OrderedTable[string, int16] 42 | 43 | func fillGroups( 44 | exp: seq[Node], 45 | groups: var GroupsCapture 46 | ): seq[Node] = 47 | ## populate group indices, names and capturing mark. ``groups`` is reset, then filled with the number of capturing groups and the name -> index table. Raises ``RegexError`` on unbalanced group symbols or when there are too many capturing groups 48 | result = exp 49 | groups.names = initOrderedTable[string, int16](2) 50 | groups.count = 0'i16 51 | var gs = newSeq[int]() # stack of indices of the currently open ``reGroupStart`` nodes 52 | for i, n in result.mpairs: 53 | case n.kind 54 | of reGroupStart: 55 | gs.add(i) 56 | if n.isCapturing: 57 | n.idx = groups.count 58 | inc groups.count 59 | if n.name.len > 0: 60 | assert n.isCapturing 61 | groups.names[n.name] = n.idx 62 | of reGroupEnd: 63 | check( 64 | gs.len > 0, 65 | "Invalid capturing group. " & 66 | "Found too many closing symbols") 67 | let start = gs.pop() # index of the matching group start; its capture info is copied to the group end 68 | n.isCapturing = result[start].isCapturing 69 | n.idx = result[start].idx 70 | else: 71 | discard 72 | check( 73 | groups.count < int16.high, 74 | ("Invalid number of capturing groups, " & 75 | "the limit is $#") %% $(int16.high - 1)) 76 | check( 77 | gs.len == 0, 78 | "Invalid capturing group. 
" & 79 | "Found too many opening symbols") 80 | 81 | func toAsciiKind(k: NodeKind): NodeKind = 82 | case k 83 | of reWordBoundary: 84 | reWordBoundaryAscii 85 | of reNotWordBoundary: 86 | reNotWordBoundaryAscii 87 | of reWord: 88 | reWordAscii 89 | of reDigit: 90 | reDigitAscii 91 | of reWhiteSpace: 92 | reWhiteSpaceAscii 93 | of reNotAlphaNum: 94 | reNotAlphaNumAscii 95 | of reNotDigit: 96 | reNotDigitAscii 97 | of reNotWhiteSpace: 98 | reNotWhiteSpaceAscii 99 | of reAny: 100 | reAnyAscii 101 | of reAnyNL: 102 | reAnyNLAscii 103 | else: 104 | k 105 | 106 | func toggle(f: Flag): Flag = 107 | ## toggle regular flag to 108 | ## negated flag and the other way around 109 | case f 110 | of flagCaseInsensitive: 111 | flagNotCaseInsensitive 112 | of flagNotCaseInsensitive: 113 | flagCaseInsensitive 114 | of flagMultiLine: 115 | flagNotMultiLine 116 | of flagNotMultiLine: 117 | flagMultiLine 118 | of flagAnyMatchNewLine: 119 | flagNotAnyMatchNewLine 120 | of flagNotAnyMatchNewLine: 121 | flagAnyMatchNewLine 122 | of flagUnGreedy: 123 | flagNotUnGreedy 124 | of flagNotUnGreedy: 125 | flagUnGreedy 126 | of flagUnicode: 127 | flagNotUnicode 128 | of flagNotUnicode: 129 | flagUnicode 130 | of flagVerbose: 131 | flagNotVerbose 132 | of flagNotVerbose: 133 | flagVerbose 134 | 135 | func squash(flags: seq[seq[Flag]]): array[Flag, bool] = 136 | ## Nested groups may contain flags, 137 | ## this will set/unset those flags 138 | ## in order. 
It should be done each time 139 | ## there is a group start/end 140 | for ff in flags: 141 | for f in ff: 142 | result[f.toggle()] = false # a later flag overrides its opposite 143 | result[f] = true 144 | 145 | func applyFlag(n: var Node, f: Flag) = # rewrite ``n`` in place so that it honours flag ``f`` 146 | case f 147 | of flagAnyMatchNewLine: 148 | if n.kind == reAny: 149 | n.kind = reAnyNL 150 | of flagMultiLine: 151 | case n.kind 152 | of reStartSym: 153 | n.kind = reStartSymML 154 | of reEndSym: 155 | n.kind = reEndSymML 156 | else: 157 | discard 158 | of flagCaseInsensitive: 159 | if n.kind == reChar and n.cp != n.cp.swapCase(): 160 | n.kind = reCharCI 161 | # todo: apply recursively to 162 | # shorthands of reInSet/reNotSet (i.e: [:ascii:]) 163 | if n.kind in {reInSet, reNotSet}: 164 | var cps = initHashSet[Rune]() # snapshot of ``n.cps``, so the set is not modified while it is iterated 165 | cps.incl(n.cps) 166 | for cp in cps: 167 | let cpsc = cp.swapCase() 168 | if cp != cpsc: 169 | n.cps.incl(cpsc) 170 | for sl in n.ranges[0 .. ^1]: # the slice is a copy; ``n.ranges`` grows inside the loop 171 | let 172 | cpa = sl.a.swapCase() 173 | cpb = sl.b.swapCase() 174 | if sl.a != cpa and sl.b != cpb: # only when both bounds have a case counterpart 175 | n.ranges.add(cpa .. 
cpb) 176 | of flagUnGreedy: 177 | if n.kind in opKind: 178 | n.isGreedy = not n.isGreedy 179 | of flagNotUnicode: 180 | n.kind = n.kind.toAsciiKind() 181 | if n.kind in {reInSet, reNotSet}: 182 | for nn in n.shorthands.mitems: 183 | nn.kind = nn.kind.toAsciiKind() 184 | else: 185 | assert f in { 186 | flagNotAnyMatchNewLine, 187 | flagNotMultiLine, 188 | flagNotCaseInsensitive, 189 | flagNotUnGreedy, 190 | flagUnicode, 191 | flagVerbose, 192 | flagNotVerbose} 193 | 194 | func applyFlags(expression: seq[Node]): seq[Node] = 195 | ## apply flags to each group. ``flags`` below is used as a stack: an entry is pushed at a group start and popped at a group end; a ``(?flags)`` group is dropped from the output 196 | result = newSeqOfCap[Node](expression.len) 197 | var flags = newSeq[seq[Flag]]() 198 | var sc = expression.scan() 199 | for n in sc.mitems(): 200 | # (?flags) 201 | # Orphan flags are added to current group 202 | case n.kind 203 | of reGroupStart: 204 | if n.flags.len == 0: 205 | flags.add(@[]) 206 | result.add(n) 207 | continue 208 | if sc.peek.kind == reGroupEnd: # (?flags) 209 | discard sc.next() 210 | if flags.len > 0: 211 | flags[flags.len - 1].add(n.flags) 212 | else: 213 | flags.add(n.flags) 214 | continue # skip ( 215 | flags.add(n.flags) 216 | of reGroupEnd: 217 | discard flags.pop() 218 | else: 219 | let ff = flags.squash() 220 | for f in Flag.low .. Flag.high: 221 | if ff[f]: 222 | applyFlag(n, f) 223 | result.add(n) 224 | 225 | func expandOneRepRange(subExpr: seq[Node], n: Node): seq[Node] = 226 | ## expand a repetition-range expression 227 | ## into the equivalent repeated expression. The caller (``expandRepRange``) appends the returned nodes to an output that already ends with ``subExpr``, so only the additional copies and operators are produced here 228 | assert n.kind == reRepRange 229 | if n.max == -1: # a{n,} -> aaa* 230 | result = newSeqOfCap[Node](subExpr.len * (n.min + 1) + 1) 231 | for _ in 0 ..< n.min: 232 | result.add(subExpr) 233 | result.add(Node( 234 | kind: reZeroOrMore, 235 | cp: "*".toRune, 236 | isGreedy: n.isGreedy)) 237 | elif n.min == n.max: # a{n} -> aaa; NOTE(review): for ``{0}`` nothing is emitted while the caller keeps the original ``a``, so ``a{0}`` appears to behave like ``a`` - confirm 238 | result = newSeqOfCap[Node](subExpr.len * n.max) 239 | for _ in 0 ..< n.max - 1: 240 | result.add(subExpr) 241 | else: # a{n,m} -> aaa?a? 
242 | assert n.min < n.max 243 | result = newSeqOfCap[Node](subExpr.len * n.max + n.max - n.min) 244 | for _ in 0 ..< n.min: 245 | result.add(subExpr) 246 | for _ in n.min ..< n.max - 1: 247 | result.add(Node( 248 | kind: reZeroOrOne, 249 | cp: "?".toRune, 250 | isGreedy: n.isGreedy)) 251 | result.add(subExpr) 252 | result.add(Node( 253 | kind: reZeroOrOne, 254 | cp: "?".toRune, 255 | isGreedy: n.isGreedy)) 256 | 257 | func expandRepRange(expression: seq[Node]): seq[Node] = 258 | ## expand every repetition range into plain repeated nodes. The range applies to the node (or whole group) that precedes it in the output; raises ``RegexError`` when there is nothing, or nothing valid, to repeat 259 | result = newSeqOfCap[Node](expression.len) 260 | var i: int 261 | var gi: int 262 | for n in expression: 263 | if n.kind != reRepRange: 264 | result.add(n) 265 | continue 266 | check( 267 | result.len > 0, 268 | "Invalid repetition range, " & 269 | "nothing to repeat") 270 | case result[^1].kind 271 | of reGroupEnd: 272 | i = 0 273 | gi = 0 274 | for ne in result.reversed: # walk backwards to find the group start matching the trailing group end; ``i`` counts the nodes of that group 275 | inc i 276 | if ne.kind == reGroupEnd: 277 | inc gi 278 | if ne.kind == reGroupStart: 279 | dec gi 280 | if gi == 0: 281 | break 282 | doAssert gi >= 0 283 | doAssert gi == 0 284 | assert result[result.len-i].kind == reGroupStart 285 | result.add(result[result.len-i .. result.len-1].expandOneRepRange(n)) 286 | of matchableKind: 287 | result.add(result[result.len-1 .. result.len-1].expandOneRepRange(n)) 288 | else: 289 | raise newException(RegexError, ( 290 | "Invalid repetition range, either " & 291 | "char, shorthand (i.e: \\w), group, or set " & 292 | "expected before repetition range")) 293 | 294 | func joinAtoms(expression: seq[Node]): seq[Node] = 295 | ## Put a ``~`` joiner between atoms. 
An atom is 296 | ## a piece of expression that would lose 297 | ## meaning when breaking it up (i.e.: ``a~(b|c)*~d``) 298 | result = newSeqOfCap[Node](expression.len * 2) 299 | var atomsCount = 0 # joinable items seen since the last ``(`` or ``|`` 300 | for n in expression: 301 | case n.kind 302 | of matchableKind, assertionKind: 303 | inc atomsCount 304 | if atomsCount > 1: 305 | atomsCount = 1 306 | result.add(initJoinerNode()) 307 | of reGroupStart: 308 | if atomsCount > 0: 309 | result.add(initJoinerNode()) 310 | atomsCount = 0 311 | of reOr: 312 | atomsCount = 0 313 | of reGroupEnd, 314 | reZeroOrMore, 315 | reOneOrMore, 316 | reZeroOrOne, 317 | reRepRange: 318 | inc atomsCount 319 | else: 320 | assert false 321 | result.add(n) 322 | 323 | type 324 | Associativity = enum 325 | ## Operator associativity. Unary ops are 326 | ## right[-to-left] and binary ops are 327 | ## left[-to-right] 328 | asyRight 329 | asyLeft 330 | OpsPA = tuple 331 | precedence: int 332 | associativity: Associativity 333 | 334 | func opsPA(nk: NodeKind): OpsPA = 335 | ## return the precedence and 336 | ## associativity of a given node kind 337 | assert nk in opKind 338 | case nk 339 | of reRepRange, 340 | reZeroOrMore, 341 | reOneOrMore, 342 | reZeroOrOne: 343 | result = (5, asyRight) 344 | of reJoiner: 345 | result = (4, asyLeft) 346 | of reOr: 347 | result = (3, asyLeft) 348 | else: 349 | assert false 350 | 351 | func hasPrecedence(a: NodeKind, b: NodeKind): bool = 352 | ## Check ``a`` has precedence over ``b``: true when the precedence of ``b`` is lower than that of ``a``, or equal to it and ``b`` is right-to-left associative. 353 | ## Both ``a`` and ``b`` are expected to 354 | ## be valid operators. Unary operators such 355 | ## as: ``*``, ``?`` and ``+`` have right-to-left 356 | ## associativity. 
Binary operators 357 | ## such as: ``|`` (or) and ``~`` (joiner) have 358 | ## left-to-right associativity 359 | result = 360 | (opsPA(b).associativity == asyRight and 361 | opsPA(b).precedence <= opsPA(a).precedence) or 362 | (opsPA(b).associativity == asyLeft and 363 | opsPA(b).precedence < opsPA(a).precedence) 364 | 365 | func popGreaterThan(ops: var seq[Node], op: Node): seq[Node] = # pop operators off the end of ``ops`` (last one first) while ``hasPrecedence(top, op)`` holds; stops at the first non-operator 366 | assert op.kind in opKind 367 | result = newSeqOfCap[Node](ops.len) 368 | while (ops.len > 0 and 369 | ops[ops.len - 1].kind in opKind and 370 | ops[ops.len - 1].kind.hasPrecedence(op.kind)): 371 | result.add(ops.pop()) 372 | 373 | func popUntilGroupStart(ops: var seq[Node]): seq[Node] = # pop from ``ops`` up to and including the nearest ``reGroupStart``; the group start is the last item of the result 374 | result = newSeqOfCap[Node](ops.len) 375 | while true: 376 | let op = ops.pop() 377 | result.add(op) 378 | if op.kind == reGroupStart: 379 | break 380 | 381 | func rpn(expression: seq[Node]): seq[Node] = 382 | ## An adaptation of the Shunting-yard algorithm 383 | ## for producing `Reverse Polish Notation` out of 384 | ## an expression specified in infix notation. 385 | ## It supports regex primitives including groups. 386 | ## The point of doing this is greatly simplifying 387 | ## the parsing of the regular expression into an NFA. 388 | ## Suffix notation removes nesting and so it can 389 | ## be parsed in a linear way instead of recursively 390 | result = newSeqOfCap[Node](expression.len) 391 | var ops = newSeq[Node]() 392 | for n in expression: 393 | case n.kind 394 | of matchableKind, assertionKind: 395 | result.add(n) 396 | of reGroupStart: 397 | ops.add(n) 398 | of reGroupEnd: 399 | result.add(ops.popUntilGroupStart()) 400 | result.add(n) 401 | of opKind: 402 | result.add(ops.popGreaterThan(n)) 403 | ops.add(n) 404 | else: 405 | assert false 406 | # reverse ops: flush the remaining operators, last pushed first 407 | for i in 1 .. 
ops.len: 408 | result.add(ops[ops.len - i]) 409 | 410 | func transformExp*( 411 | exp: seq[Node], 412 | groups: var GroupsCapture 413 | ): seq[Node] {.inline.} = 414 | result = exp 415 | .fillGroups(groups) 416 | .greediness 417 | .applyFlags 418 | .expandRepRange 419 | .joinAtoms 420 | .rpn 421 | -------------------------------------------------------------------------------- /src/nregex/private/nfa.nim: -------------------------------------------------------------------------------- 1 | import std/deques 2 | 3 | import nodetype 4 | import common 5 | 6 | func check(cond: bool, msg: string) = 7 | if not cond: 8 | raise newException(RegexError, msg) 9 | 10 | type 11 | End = seq[int16] 12 | ## store all the last 13 | ## states of a given state. 14 | ## Avoids having to recurse 15 | ## a state to find its ends, 16 | ## but have to keep them up-to-date 17 | 18 | func combine( 19 | nfa: var seq[Node], 20 | ends: var seq[End], 21 | org: int16, 22 | target: int16 23 | ) = 24 | ## combine ends of ``org`` 25 | ## with ``target`` 26 | for e in ends[org]: 27 | for i, ni in nfa[e].next.mpairs: 28 | if nfa[ni].kind == reEOE: 29 | ni = target 30 | ends[org] = ends[target] 31 | 32 | func update( 33 | ends: var seq[End], 34 | ni: int16, 35 | next: openArray[int16] 36 | ) = 37 | ## update the ends of Node ``ni`` 38 | ## to point to ends of ``n.outA`` 39 | ## and ``n.outB``. 
If either outA 40 | ## or outB are ``0`` (EOE), 41 | ## the ends will point to itself. (``outA``/``outB`` stand for the entries of ``next``) 42 | ends[ni].setLen(0) 43 | for n in next: 44 | if n == 0: 45 | ends[ni].add(ni) 46 | else: 47 | ends[ni].add(ends[n]) 48 | 49 | const eoe = 0'i16 # index of the EOE node, which ``eNfa`` adds first 50 | 51 | func eNfa(expression: seq[Node]): seq[Node] = 52 | ## Thompson's construction. Consumes ``expression`` (presumably in the postfix order produced by ``rpn``) while keeping the entry states of the pending sub-automata on the ``states`` stack. Raises ``RegexError`` when ``|``, ``*``, ``+`` or ``?`` lacks an operand or the expression is too long 53 | result = newSeqOfCap[Node](expression.len + 2) 54 | result.add(initEOENode()) 55 | var 56 | ends = newSeq[End](expression.len + 1) 57 | states = newSeq[int16]() 58 | if expression.len == 0: 59 | states.add(eoe) 60 | for n in expression: 61 | var n = n 62 | assert n.next.len == 0 63 | check( 64 | result.high < int16.high, 65 | ("The expression is too long, " & 66 | "limit is ~$#") %% $int16.high) 67 | let ni = result.len.int16 68 | case n.kind 69 | of matchableKind, assertionKind: 70 | n.next.add(eoe) 71 | ends.update(ni, [eoe]) 72 | result.add(n) 73 | states.add(ni) 74 | of reJoiner: 75 | let 76 | stateB = states.pop() 77 | stateA = states.pop() 78 | result.combine(ends, stateA, stateB) 79 | states.add(stateA) 80 | of reOr: 81 | check( 82 | states.len >= 2, 83 | "Invalid OR conditional, nothing to " & 84 | "match at right/left side of the condition") 85 | let 86 | stateB = states.pop() 87 | stateA = states.pop() 88 | n.next.add([stateA, stateB]) 89 | ends.update(ni, n.next) 90 | result.add(n) 91 | states.add(ni) 92 | of reZeroOrMore: 93 | check( 94 | states.len >= 1, 95 | "Invalid `*` operator, " & 96 | "nothing to repeat") 97 | let stateA = states.pop() 98 | n.next.add([stateA, eoe]) 99 | ends.update(ni, n.next) 100 | result.combine(ends, stateA, ni) 101 | result.add(n) 102 | states.add(ni) 103 | if n.isGreedy: 104 | swap(result[^1].next[0], result[^1].next[1]) 105 | of reOneOrMore: 106 | check( 107 | states.len >= 1, 108 | "Invalid `+` operator, " & 109 | "nothing to repeat") 110 | let stateA = states.pop() 111 | n.next.add([stateA, eoe]) 112 | ends.update(ni, n.next) 113 | result.combine(ends, stateA, ni) 114 | result.add(n) 115 |
states.add(stateA) 116 | if n.isGreedy: 117 | swap(result[^1].next[0], result[^1].next[1]) 118 | of reZeroOrOne: 119 | check( 120 | states.len >= 1, 121 | "Invalid `?` operator, " & 122 | "nothing to make optional") 123 | let stateA = states.pop() 124 | n.next.add([stateA, eoe]) 125 | ends.update(ni, n.next) 126 | result.add(n) 127 | states.add(ni) 128 | if n.isGreedy: 129 | swap(result[^1].next[0], result[^1].next[1]) 130 | of reGroupStart: 131 | let stateA = states.pop() 132 | n.next.add(stateA) 133 | ends.update(ni, n.next) 134 | result.add(n) 135 | states.add(ni) 136 | of reGroupEnd: 137 | n.next.add(eoe) 138 | ends.update(ni, n.next) 139 | let stateA = states.pop() 140 | result.combine(ends, stateA, ni) 141 | result.add(n) 142 | states.add(stateA) 143 | else: 144 | assert(false, "Unhandled node: $#" %% $n.kind) 145 | assert states.len == 1 146 | result.add(Node( 147 | kind: reSkip, 148 | cp: "#".toRune, 149 | next: states)) 150 | 151 | type 152 | Zclosure = seq[int16] 153 | TeClosure = seq[(int16, Zclosure)] 154 | 155 | func isTransitionZ(n: Node): bool {.inline.} = 156 | result = case n.kind 157 | of groupKind: 158 | n.isCapturing 159 | of reInSet: 160 | # XXX always false in ascii mode 161 | var isZ = false 162 | for s in n.shorthands: 163 | isZ = s.kind notin {reAny, reAnyNl, reDigit, reWord} 164 | if isZ: 165 | break 166 | isZ 167 | of assertionKind: 168 | true 169 | of matchTransitionKind - {reInSet}: 170 | # XXX false in ascii mode 171 | true 172 | else: 173 | false 174 | 175 | func countTransitionsZ(nfa: seq[Node]): int = # number of nodes in ``nfa`` for which ``isTransitionZ`` holds 176 | result = 0 177 | for n in nfa: 178 | result += int(n.isTransitionZ) 179 | 180 | func teClosure( 181 | result: var TeClosure, 182 | nfa: seq[Node], 183 | state: int16, 184 | visited: var set[int16], 185 | zTransitions: Zclosure 186 | ) = # depth-first walk from ``state``: each reached node whose kind is in ``matchableKind + {reEOE}`` is appended to ``result`` together with the ``isTransitionZ`` states crossed on the way; ``visited`` stops a state from being expanded twice 187 | if state in visited: 188 | return 189 | visited.incl(state) 190 | var zTransitionsCurr = zTransitions # per-path copy, so sibling branches do not see this state's addition 191 | if isTransitionZ(nfa[state]): 192 | zTransitionsCurr.add(state) 193 | if
nfa[state].kind in matchableKind + {reEOE}: 194 | result.add((state, zTransitionsCurr)) 195 | return 196 | for s in nfa[state].next: 197 | teClosure(result, nfa, s, visited, zTransitionsCurr) 198 | 199 | func teClosure( 200 | result: var TeClosure, 201 | nfa: seq[Node], 202 | state: int16 203 | ) = 204 | var visited: set[int16] 205 | var zclosure: Zclosure 206 | for s in nfa[state].next: 207 | teClosure(result, nfa, s, visited, zclosure) 208 | 209 | type 210 | TransitionsAll* = seq[seq[int16]] 211 | ZclosureStates* = seq[seq[Node]] 212 | Transitions* = object 213 | all*: TransitionsAll 214 | allZ*: TransitionsAll 215 | z*: ZclosureStates 216 | zCount*: int 217 | 218 | # XXX do not add char classes transitions \w, \d, etc in ascii mode 219 | func eRemoval( 220 | eNfa: seq[Node], 221 | transitions: var Transitions 222 | ): seq[Node] = 223 | ## Remove e-transitions and return 224 | ## remaining state transitions and 225 | ## submatches, and zero matches. 226 | ## Transitions are added in matching order (BFS), 227 | ## which may help matching performance 228 | #echo eNfa 229 | var eNfa = eNfa 230 | transitions.all.setLen(eNfa.len) 231 | transitions.allZ.setLen(eNfa.len) 232 | var statesMap = newSeq[int16](eNfa.len) # e-NFA state index -> index in the returned NFA; -1 while unassigned 233 | for i in 0 .. 
statesMap.len-1: 234 | statesMap[i] = -1 235 | var statePos = 0'i16 236 | let start = int16(eNfa.len-1) 237 | statesMap[start] = statePos 238 | inc statePos 239 | var closure: TeClosure 240 | var zc: seq[Node] 241 | var qw = initDeque[int16]() 242 | qw.addFirst(start) 243 | var qu: set[int16] 244 | qu.incl(start) 245 | while qw.len > 0: 246 | let qa = qw.popLast() 247 | closure.setLen(0) 248 | teClosure(closure, eNfa, qa) 249 | eNfa[qa].next.setLen(0) 250 | for qb, zclosure in closure.items: 251 | eNfa[qa].next.add(qb) 252 | if statesMap[qb] == -1: 253 | statesMap[qb] = statePos 254 | inc statePos 255 | assert statesMap[qa] > -1 256 | assert statesMap[qb] > -1 257 | transitions.all[statesMap[qa]].add(statesMap[qb]) 258 | transitions.allZ[statesMap[qa]].add(-1'i16) 259 | zc.setLen(0) 260 | for z in zclosure: 261 | zc.add(eNfa[z]) 262 | if zc.len > 0: 263 | transitions.z.add(zc) 264 | transitions.allZ[statesMap[qa]][^1] = int16(transitions.z.len-1) 265 | if qb notin qu: 266 | qu.incl(qb) 267 | qw.addFirst(qb) 268 | transitions.all.setLen(statePos) 269 | transitions.allZ.setLen(statePos) 270 | if transitions.z.len == 0: 271 | transitions.allZ = @[] 272 | transitions.zCount = eNfa.countTransitionsZ 273 | result = newSeq[Node](statePos) 274 | for en, nn in statesMap.pairs: 275 | if nn == -1: 276 | continue 277 | result[nn] = if isTransitionZ(eNfa[en]): 278 | doAssert eNfa[en].kind in matchableKind 279 | Node(kind: reAnyNl, cp: "#".toRune) 280 | else: 281 | eNfa[en] 282 | result[nn].next.setLen(0) 283 | for en2 in eNfa[en].next: 284 | doAssert statesMap[en2] > -1 285 | result[nn].next.add(statesMap[en2]) 286 | 287 | func nfa*( 288 | exp: seq[Node], 289 | transitions: var Transitions 290 | ): seq[Node] = 291 | result = exp.eNfa.eRemoval(transitions) 292 | -------------------------------------------------------------------------------- /src/nregex/private/nodematch.nim: -------------------------------------------------------------------------------- 1 | import std/unicode 
2 | import std/sets 3 | 4 | import pkg/unicodedb/properties 5 | import pkg/unicodedb/types 6 | import pkg/unicodeplus 7 | 8 | import nodetype 9 | import common 10 | 11 | func isWord*(r: Rune): bool {.inline.} = 12 | utmWord in unicodeTypes(r) 13 | 14 | func isWordAscii(r: Rune): bool {.inline.} = 15 | ## return ``true`` if the given 16 | ## rune is in ``[A-Za-z0-9_]`` range 17 | case r.int 18 | of 'A'.ord .. 'Z'.ord, 19 | 'a'.ord .. 'z'.ord, 20 | '0'.ord .. '9'.ord, 21 | '_'.ord: 22 | true 23 | else: 24 | false 25 | 26 | template isWordBoundaryImpl(r, nxt, isWordProc): bool = # true when exactly one of ``r``/``nxt`` is a word rune; a negative rune value counts as non-word 27 | (r.int > -1 and isWordProc(r)) xor 28 | (nxt.int > -1 and isWordProc(nxt)) 29 | 30 | func isWordBoundary(r: Rune, nxt: Rune): bool {.inline.} = 31 | ## check if current match 32 | ## is a boundary (i.e the start or the end of a word) 33 | isWordBoundaryImpl(r, nxt, isWord) 34 | 35 | func isWordBoundaryAscii(r: Rune, nxt: Rune): bool {.inline.} = 36 | ## check if current match 37 | ## is a boundary. Match ascii only 38 | isWordBoundaryImpl(r, nxt, isWordAscii) 39 | 40 | func match*(n: Node, r: Rune, nxt: Rune): bool = 41 | ## match for ``Node`` of assertion kind. 
42 | ## Return whether the node matches 43 | ## the current characters or not. NOTE(review): ``r`` looks like the rune before the current position and ``nxt`` the one after it, with ``invalidRune`` at the text limits - confirm against the caller 44 | case n.kind 45 | of reStart, reStartSym: 46 | r == invalidRune 47 | of reEnd, reEndSym: 48 | nxt == invalidRune 49 | of reStartSymML: 50 | (r == invalidRune or 51 | r == lineBreakRune) 52 | of reEndSymML: 53 | (nxt == invalidRune or 54 | nxt == lineBreakRune) 55 | of reWordBoundary: 56 | isWordBoundary(r, nxt) 57 | of reNotWordBoundary: 58 | not isWordBoundary(r, nxt) 59 | of reWordBoundaryAscii: 60 | isWordBoundaryAscii(r, nxt) 61 | of reNotWordBoundaryAscii: 62 | not isWordBoundaryAscii(r, nxt) 63 | of reLookahead: 64 | n.cp == nxt 65 | of reNotLookahead: 66 | n.cp != nxt 67 | of reLookbehind: 68 | n.cp == r 69 | of reNotLookbehind: 70 | n.cp != r 71 | else: 72 | assert false 73 | false 74 | 75 | func contains(sr: seq[Slice[Rune]], r: Rune): bool = # true when ``r`` falls in any of the ranges; this is what ``r in n.ranges`` resolves to 76 | result = false 77 | for sl in sr: 78 | result = r in sl 79 | if result: 80 | break 81 | 82 | func isWhiteSpace(r: Rune): bool {.inline.} = 83 | utmWhiteSpace in unicodeTypes(r) 84 | 85 | func isWhiteSpaceAscii(r: Rune): bool {.inline.} = 86 | case r.int 87 | of ' '.ord, 88 | '\t'.ord, 89 | '\L'.ord, 90 | '\r'.ord, 91 | '\f'.ord, 92 | '\v'.ord: 93 | true 94 | else: 95 | false 96 | 97 | func isDigitAscii(r: Rune): bool {.inline.} = 98 | case r.int 99 | of '0'.ord .. '9'.ord: 100 | true 101 | else: 102 | false 103 | 104 | func isAnyAscii(r: Rune): bool {.inline.} = # rune value <= 127 and different from ``lineBreakRune`` 105 | (r.int <= int8.high and 106 | r != lineBreakRune) 107 | 108 | func swapCase*(r: Rune): Rune = # the lower-case form of ``r`` when it differs from ``r``, otherwise its upper-case form 109 | result = r.toLower() 110 | if result != r: 111 | return 112 | result = r.toUpper() 113 | 114 | func match*(n: Node, r: Rune): bool = 115 | ## match for ``Node`` of matchable kind. 
116 | ## Return whether the node matches 117 | ## the current character or not 118 | assert r != invalidRune 119 | case n.kind 120 | of reEOE: 121 | false 122 | of reWord: 123 | r.isWord() 124 | of reNotAlphaNum: 125 | not r.isWord() 126 | of reDigit: 127 | r.isDecimal() 128 | of reNotDigit: 129 | not r.isDecimal() 130 | of reWhiteSpace: 131 | r.isWhiteSpace() 132 | of reNotWhiteSpace: 133 | not r.isWhiteSpace() 134 | of reInSet, reNotSet: 135 | var matches = ( 136 | r in n.cps or 137 | r in n.ranges) 138 | if not matches: 139 | for nn in n.shorthands: 140 | matches = nn.match(r) 141 | if matches: break 142 | ((matches and n.kind == reInSet) or 143 | (not matches and n.kind == reNotSet)) 144 | of reAny: 145 | r != lineBreakRune 146 | of reAnyNL: 147 | true 148 | of reCharCI: 149 | r == n.cp or r == n.cp.swapCase() 150 | of reWordAscii: 151 | r.isWordAscii() 152 | of reDigitAscii: 153 | r.isDigitAscii() 154 | of reWhiteSpaceAscii: 155 | r.isWhiteSpaceAscii() 156 | of reUCC: 157 | r.unicodeCategory() in n.cc 158 | of reNotAlphaNumAscii: 159 | not r.isWordAscii() 160 | of reNotDigitAscii: 161 | not r.isDigitAscii() 162 | of reNotWhiteSpaceAscii: 163 | not r.isWhiteSpaceAscii() 164 | of reNotUCC: 165 | r.unicodeCategory() notin n.cc 166 | of reAnyAscii: 167 | r.isAnyAscii() 168 | of reAnyNLAscii: 169 | r.isAnyAscii() or r == lineBreakRune 170 | else: 171 | assert n.kind == reChar 172 | n.cp == r 173 | -------------------------------------------------------------------------------- /src/nregex/private/nodetype.nim: -------------------------------------------------------------------------------- 1 | import std/unicode 2 | import std/sets 3 | 4 | import pkg/unicodedb/properties 5 | 6 | import common 7 | 8 | type 9 | Flag* = enum 10 | flagCaseInsensitive, # i 11 | flagNotCaseInsensitive, # -i 12 | flagMultiLine, # m 13 | flagNotMultiLine, # -m 14 | flagAnyMatchNewLine, # s 15 | flagNotAnyMatchNewLine, # -s 16 | flagUnGreedy, # U 17 | flagNotUnGreedy, # -U 18 | flagUnicode, 
# u 19 | flagNotUnicode, # -u 20 | flagVerbose, # x 21 | flagNotVerbose # -x 22 | NodeKind* = enum 23 | reChar, 24 | reCharCi, 25 | reJoiner, # ~ 26 | reGroupStart, # ( 27 | reGroupEnd, # ) 28 | reOr, # | 29 | reZeroOrMore, # * 30 | reOneOrMore, # + 31 | reZeroOrOne, # ? 32 | reRepRange, # {n,m} 33 | reStartSym, # ^ 34 | reEndSym, # $ 35 | reStartSymML, # ^ multi-line 36 | reEndSymML, # $ multi-line 37 | reStart, # \A 38 | reEnd, # \z 39 | reWordBoundary, # \b 40 | reNotWordBoundary, # \B 41 | reWord, # \w 42 | reDigit, # \d 43 | reWhiteSpace, # \s 44 | reUCC, # \pN or \p{Nn} 45 | reNotAlphaNum, # \W 46 | reNotDigit, # \D 47 | reNotWhiteSpace, # \S 48 | reNotUCC, # \PN or \P{Nn} 49 | reAny, # . 50 | reAnyNl, # . new-line 51 | reWordBoundaryAscii, # \b ascii only 52 | reNotWordBoundaryAscii, # \B ascii only 53 | reWordAscii, # \w ascii only 54 | reDigitAscii, # \d ascii only 55 | reWhiteSpaceAscii, # \s ascii only 56 | reNotAlphaNumAscii, # \W ascii only 57 | reNotDigitAscii, # \D ascii only 58 | reNotWhiteSpaceAscii, # \S ascii only 59 | reAnyAscii, # . ascii only 60 | reAnyNlAscii, # . new-line ascii only 61 | reInSet, # [abc] 62 | reNotSet, # [^abc] 63 | reLookahead, # (?=...) 64 | reLookbehind, # (?<=...) 65 | reNotLookahead, # (?!...) 66 | reNotLookbehind, # (? 
sc.s.high 40 | 41 | func prev*[T](sc: Scanner[T]): T = 42 | sc.s[sc.pos - 1] 43 | 44 | func curr*[T](sc: Scanner[T]): T = 45 | sc.s[sc.pos] 46 | 47 | func next*[T](sc: Scanner[T]): T = 48 | ## return current item and consume it 49 | result = sc.s[sc.pos] 50 | inc sc.pos 51 | 52 | func peekImpl[T](sc: Scanner[T], default: T): T {.inline.} = 53 | ## same as ``curr`` except it 54 | ## returns a default/invalid value when 55 | ## the data is fully consumed 56 | if sc.pos > sc.s.high: 57 | default 58 | else: 59 | sc.s[sc.pos] 60 | 61 | func peek*(sc: Scanner[Rune]): Rune = 62 | peekImpl(sc, invalidRune) 63 | 64 | func peek*(sc: Scanner[Node]): Node = 65 | peekImpl(sc, initEOENode()) 66 | 67 | iterator peek*[T](sc: Scanner[T]): (T, T) = 68 | for s in sc: 69 | yield (s, sc.peek) 70 | 71 | func find*(sc: Scanner[Rune], r: Rune): int = 72 | ## return number of consumed chars. 73 | ## The scanner's position is not moved. 74 | ## ``-1`` is returned when char is not found 75 | result = 0 76 | let pos = sc.pos 77 | while true: 78 | if sc.finished: 79 | result = -1 80 | break 81 | if sc.curr == r: 82 | break 83 | discard sc.next() 84 | inc result 85 | sc.pos = pos 86 | -------------------------------------------------------------------------------- /tests/nim.cfg: -------------------------------------------------------------------------------- 1 | --path:"../src/" 2 | --------------------------------------------------------------------------------